{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9836867862969005, "eval_steps": 39, "global_step": 306, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0065252854812398045, "grad_norm": 0.23828125, "learning_rate": 2e-05, "loss": 1.3184, "step": 1 }, { "epoch": 0.0065252854812398045, "eval_loss": 1.3192611932754517, "eval_runtime": 235.1739, "eval_samples_per_second": 1.811, "eval_steps_per_second": 0.906, "step": 1 }, { "epoch": 0.013050570962479609, "grad_norm": 0.2373046875, "learning_rate": 4e-05, "loss": 1.3213, "step": 2 }, { "epoch": 0.01957585644371941, "grad_norm": 0.2373046875, "learning_rate": 6e-05, "loss": 1.3232, "step": 3 }, { "epoch": 0.026101141924959218, "grad_norm": 0.259765625, "learning_rate": 8e-05, "loss": 1.3418, "step": 4 }, { "epoch": 0.03262642740619902, "grad_norm": 0.275390625, "learning_rate": 0.0001, "loss": 1.2712, "step": 5 }, { "epoch": 0.03915171288743882, "grad_norm": 0.2412109375, "learning_rate": 0.00012, "loss": 1.2682, "step": 6 }, { "epoch": 0.04567699836867863, "grad_norm": 0.1015625, "learning_rate": 0.00014, "loss": 1.2672, "step": 7 }, { "epoch": 0.052202283849918436, "grad_norm": 0.44140625, "learning_rate": 0.00016, "loss": 1.2337, "step": 8 }, { "epoch": 0.05872756933115824, "grad_norm": 0.51171875, "learning_rate": 0.00018, "loss": 1.3138, "step": 9 }, { "epoch": 0.06525285481239804, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.238, "step": 10 }, { "epoch": 0.07177814029363784, "grad_norm": 0.138671875, "learning_rate": 0.0001999986383177335, "loss": 1.2244, "step": 11 }, { "epoch": 0.07830342577487764, "grad_norm": 0.07666015625, "learning_rate": 0.0001999945533080175, "loss": 1.1796, "step": 12 }, { "epoch": 0.08482871125611746, "grad_norm": 0.1171875, "learning_rate": 0.0001999877450821018, "loss": 1.1873, "step": 13 }, { "epoch": 0.09135399673735727, "grad_norm": 0.154296875, "learning_rate": 0.0001999782138253991, "loss": 1.1684, "step": 14 }, { "epoch": 0.09787928221859707, "grad_norm": 0.15625, "learning_rate": 0.00019996595979748037, "loss": 1.2122, "step": 15 }, { "epoch": 0.10440456769983687, "grad_norm": 0.12158203125, "learning_rate": 0.00019995098333206742, "loss": 1.1772, "step": 16 }, { "epoch": 0.11092985318107668, "grad_norm": 0.07958984375, "learning_rate": 0.00019993328483702393, "loss": 1.1742, "step": 17 }, { "epoch": 0.11745513866231648, "grad_norm": 0.09326171875, "learning_rate": 0.00019991286479434454, "loss": 1.1791, "step": 18 }, { "epoch": 0.12398042414355628, "grad_norm": 0.11474609375, "learning_rate": 0.00019988972376014142, "loss": 1.1651, "step": 19 }, { "epoch": 0.13050570962479607, "grad_norm": 0.1572265625, "learning_rate": 0.00019986386236462924, "loss": 1.1874, "step": 20 }, { "epoch": 0.1370309951060359, "grad_norm": 0.12255859375, "learning_rate": 0.00019983528131210812, "loss": 1.1371, "step": 21 }, { "epoch": 0.14355628058727568, "grad_norm": 0.08544921875, "learning_rate": 0.00019980398138094428, "loss": 1.1725, "step": 22 }, { "epoch": 0.1500815660685155, "grad_norm": 0.07763671875, "learning_rate": 0.00019976996342354898, "loss": 1.1264, "step": 23 }, { "epoch": 0.1566068515497553, "grad_norm": 0.078125, "learning_rate": 0.00019973322836635518, "loss": 1.091, "step": 24 }, { "epoch": 0.1631321370309951, "grad_norm": 0.09423828125, "learning_rate": 0.00019969377720979237, "loss": 1.1636, "step": 25 }, { "epoch": 0.16965742251223492, "grad_norm": 0.1044921875, "learning_rate": 0.00019965161102825945, "loss": 1.1472, "step": 26 }, { "epoch": 0.1761827079934747, "grad_norm": 0.0751953125, "learning_rate": 0.00019960673097009518, "loss": 1.1212, "step": 27 }, { "epoch": 0.18270799347471453, "grad_norm": 0.072265625, "learning_rate": 0.00019955913825754713, "loss": 1.1466, "step": 28 }, { "epoch": 0.18923327895595432, "grad_norm": 0.0693359375, "learning_rate": 0.0001995088341867384, "loss": 1.1235, "step": 29 }, { "epoch": 0.19575856443719414, "grad_norm": 0.06787109375, "learning_rate": 0.0001994558201276322, "loss": 1.1123, "step": 30 }, { "epoch": 0.20228384991843393, "grad_norm": 0.07177734375, "learning_rate": 0.0001994000975239946, "loss": 1.1459, "step": 31 }, { "epoch": 0.20880913539967375, "grad_norm": 0.07470703125, "learning_rate": 0.00019934166789335525, "loss": 1.1231, "step": 32 }, { "epoch": 0.21533442088091354, "grad_norm": 0.06689453125, "learning_rate": 0.00019928053282696596, "loss": 1.1153, "step": 33 }, { "epoch": 0.22185970636215335, "grad_norm": 0.078125, "learning_rate": 0.00019921669398975745, "loss": 1.0979, "step": 34 }, { "epoch": 0.22838499184339314, "grad_norm": 0.07421875, "learning_rate": 0.000199150153120294, "loss": 1.0667, "step": 35 }, { "epoch": 0.23491027732463296, "grad_norm": 0.06689453125, "learning_rate": 0.00019908091203072598, "loss": 1.1144, "step": 36 }, { "epoch": 0.24143556280587275, "grad_norm": 0.07080078125, "learning_rate": 0.00019900897260674073, "loss": 1.104, "step": 37 }, { "epoch": 0.24796084828711257, "grad_norm": 0.06689453125, "learning_rate": 0.00019893433680751103, "loss": 1.1016, "step": 38 }, { "epoch": 0.2544861337683524, "grad_norm": 0.07275390625, "learning_rate": 0.0001988570066656417, "loss": 1.089, "step": 39 }, { "epoch": 0.2544861337683524, "eval_loss": 1.1131086349487305, "eval_runtime": 236.8764, "eval_samples_per_second": 1.798, "eval_steps_per_second": 0.899, "step": 39 }, { "epoch": 0.26101141924959215, "grad_norm": 0.078125, "learning_rate": 0.00019877698428711442, "loss": 1.1207, "step": 40 }, { "epoch": 0.26753670473083196, "grad_norm": 0.06982421875, "learning_rate": 0.00019869427185123027, "loss": 1.1376, "step": 41 }, { "epoch": 0.2740619902120718, "grad_norm": 0.06982421875, "learning_rate": 0.00019860887161055038, "loss": 1.1, "step": 42 }, { "epoch": 0.2805872756933116, "grad_norm": 0.080078125, "learning_rate": 0.00019852078589083466, "loss": 1.1185, "step": 43 }, { "epoch": 0.28711256117455136, "grad_norm": 0.0732421875, "learning_rate": 0.0001984300170909783, "loss": 1.1282, "step": 44 }, { "epoch": 0.2936378466557912, "grad_norm": 0.0654296875, "learning_rate": 0.00019833656768294662, "loss": 1.1505, "step": 45 }, { "epoch": 0.300163132137031, "grad_norm": 0.06689453125, "learning_rate": 0.0001982404402117077, "loss": 1.1102, "step": 46 }, { "epoch": 0.3066884176182708, "grad_norm": 0.0712890625, "learning_rate": 0.00019814163729516292, "loss": 1.1254, "step": 47 }, { "epoch": 0.3132137030995106, "grad_norm": 0.0673828125, "learning_rate": 0.0001980401616240759, "loss": 1.0934, "step": 48 }, { "epoch": 0.3197389885807504, "grad_norm": 0.06689453125, "learning_rate": 0.00019793601596199912, "loss": 1.0792, "step": 49 }, { "epoch": 0.3262642740619902, "grad_norm": 0.07177734375, "learning_rate": 0.00019782920314519856, "loss": 1.0936, "step": 50 }, { "epoch": 0.33278955954323003, "grad_norm": 0.06884765625, "learning_rate": 0.00019771972608257659, "loss": 1.0636, "step": 51 }, { "epoch": 0.33931484502446985, "grad_norm": 0.0673828125, "learning_rate": 0.00019760758775559274, "loss": 1.0693, "step": 52 }, { "epoch": 0.3458401305057096, "grad_norm": 0.06787109375, "learning_rate": 0.00019749279121818235, "loss": 1.0882, "step": 53 }, { "epoch": 0.3523654159869494, "grad_norm": 0.0703125, "learning_rate": 0.0001973753395966737, "loss": 1.1066, "step": 54 }, { "epoch": 0.35889070146818924, "grad_norm": 0.0712890625, "learning_rate": 0.00019725523608970255, "loss": 1.049, "step": 55 }, { "epoch": 0.36541598694942906, "grad_norm": 0.06982421875, "learning_rate": 0.00019713248396812524, "loss": 1.1083, "step": 56 }, { "epoch": 0.3719412724306688, "grad_norm": 0.072265625, "learning_rate": 0.00019700708657492948, "loss": 1.0854, "step": 57 }, { "epoch": 0.37846655791190864, "grad_norm": 0.07275390625, "learning_rate": 0.0001968790473251434, "loss": 1.0954, "step": 58 }, { "epoch": 0.38499184339314846, "grad_norm": 0.07373046875, "learning_rate": 0.00019674836970574254, "loss": 1.0833, "step": 59 }, { "epoch": 0.3915171288743883, "grad_norm": 0.07275390625, "learning_rate": 0.00019661505727555482, "loss": 1.0945, "step": 60 }, { "epoch": 0.39804241435562804, "grad_norm": 0.07080078125, "learning_rate": 0.0001964791136651637, "loss": 1.07, "step": 61 }, { "epoch": 0.40456769983686786, "grad_norm": 0.07177734375, "learning_rate": 0.00019634054257680923, "loss": 1.0998, "step": 62 }, { "epoch": 0.4110929853181077, "grad_norm": 0.0712890625, "learning_rate": 0.0001961993477842873, "loss": 1.0718, "step": 63 }, { "epoch": 0.4176182707993475, "grad_norm": 0.078125, "learning_rate": 0.0001960555331328468, "loss": 1.0285, "step": 64 }, { "epoch": 0.42414355628058725, "grad_norm": 0.08203125, "learning_rate": 0.00019590910253908494, "loss": 1.0836, "step": 65 }, { "epoch": 0.43066884176182707, "grad_norm": 0.076171875, "learning_rate": 0.0001957600599908406, "loss": 1.0836, "step": 66 }, { "epoch": 0.4371941272430669, "grad_norm": 0.07373046875, "learning_rate": 0.00019560840954708565, "loss": 1.0868, "step": 67 }, { "epoch": 0.4437194127243067, "grad_norm": 0.06884765625, "learning_rate": 0.0001954541553378145, "loss": 1.115, "step": 68 }, { "epoch": 0.45024469820554647, "grad_norm": 0.0703125, "learning_rate": 0.0001952973015639316, "loss": 1.047, "step": 69 }, { "epoch": 0.4567699836867863, "grad_norm": 0.07373046875, "learning_rate": 0.00019513785249713697, "loss": 1.0878, "step": 70 }, { "epoch": 0.4632952691680261, "grad_norm": 0.07275390625, "learning_rate": 0.00019497581247980992, "loss": 1.0547, "step": 71 }, { "epoch": 0.4698205546492659, "grad_norm": 0.0732421875, "learning_rate": 0.00019481118592489086, "loss": 1.0907, "step": 72 }, { "epoch": 0.4763458401305057, "grad_norm": 0.0703125, "learning_rate": 0.00019464397731576094, "loss": 1.053, "step": 73 }, { "epoch": 0.4828711256117455, "grad_norm": 0.07421875, "learning_rate": 0.00019447419120612017, "loss": 1.0768, "step": 74 }, { "epoch": 0.4893964110929853, "grad_norm": 0.0732421875, "learning_rate": 0.00019430183221986325, "loss": 1.0599, "step": 75 }, { "epoch": 0.49592169657422513, "grad_norm": 0.076171875, "learning_rate": 0.00019412690505095365, "loss": 1.0721, "step": 76 }, { "epoch": 0.5024469820554649, "grad_norm": 0.07666015625, "learning_rate": 0.00019394941446329583, "loss": 1.0802, "step": 77 }, { "epoch": 0.5089722675367048, "grad_norm": 0.0771484375, "learning_rate": 0.00019376936529060554, "loss": 1.0475, "step": 78 }, { "epoch": 0.5089722675367048, "eval_loss": 1.0718731880187988, "eval_runtime": 236.7722, "eval_samples_per_second": 1.799, "eval_steps_per_second": 0.9, "step": 78 }, { "epoch": 0.5154975530179445, "grad_norm": 0.072265625, "learning_rate": 0.00019358676243627808, "loss": 1.0706, "step": 79 }, { "epoch": 0.5220228384991843, "grad_norm": 0.07568359375, "learning_rate": 0.0001934016108732548, "loss": 1.0679, "step": 80 }, { "epoch": 0.5285481239804242, "grad_norm": 0.08056640625, "learning_rate": 0.00019321391564388775, "loss": 1.044, "step": 81 }, { "epoch": 0.5350734094616639, "grad_norm": 0.078125, "learning_rate": 0.00019302368185980217, "loss": 1.0878, "step": 82 }, { "epoch": 0.5415986949429038, "grad_norm": 0.08447265625, "learning_rate": 0.00019283091470175754, "loss": 1.0469, "step": 83 }, { "epoch": 0.5481239804241436, "grad_norm": 0.07861328125, "learning_rate": 0.00019263561941950622, "loss": 1.0453, "step": 84 }, { "epoch": 0.5546492659053833, "grad_norm": 0.08154296875, "learning_rate": 0.00019243780133165067, "loss": 1.0677, "step": 85 }, { "epoch": 0.5611745513866232, "grad_norm": 0.08056640625, "learning_rate": 0.00019223746582549853, "loss": 1.0648, "step": 86 }, { "epoch": 0.567699836867863, "grad_norm": 0.083984375, "learning_rate": 0.00019203461835691594, "loss": 1.0891, "step": 87 }, { "epoch": 0.5742251223491027, "grad_norm": 0.08447265625, "learning_rate": 0.00019182926445017893, "loss": 1.0607, "step": 88 }, { "epoch": 0.5807504078303426, "grad_norm": 0.078125, "learning_rate": 0.00019162140969782292, "loss": 1.0241, "step": 89 }, { "epoch": 0.5872756933115824, "grad_norm": 0.08642578125, "learning_rate": 0.00019141105976049053, "loss": 1.0675, "step": 90 }, { "epoch": 0.5938009787928222, "grad_norm": 0.08642578125, "learning_rate": 0.00019119822036677738, "loss": 1.0538, "step": 91 }, { "epoch": 0.600326264274062, "grad_norm": 0.0751953125, "learning_rate": 0.000190982897313076, "loss": 1.0572, "step": 92 }, { "epoch": 0.6068515497553018, "grad_norm": 0.076171875, "learning_rate": 0.00019076509646341806, "loss": 1.0372, "step": 93 }, { "epoch": 0.6133768352365416, "grad_norm": 0.07861328125, "learning_rate": 0.00019054482374931467, "loss": 1.0693, "step": 94 }, { "epoch": 0.6199021207177814, "grad_norm": 0.07861328125, "learning_rate": 0.0001903220851695948, "loss": 1.0726, "step": 95 }, { "epoch": 0.6264274061990212, "grad_norm": 0.0791015625, "learning_rate": 0.0001900968867902419, "loss": 1.0543, "step": 96 }, { "epoch": 0.632952691680261, "grad_norm": 0.08447265625, "learning_rate": 0.00018986923474422884, "loss": 1.0577, "step": 97 }, { "epoch": 0.6394779771615008, "grad_norm": 0.08447265625, "learning_rate": 0.0001896391352313506, "loss": 1.0649, "step": 98 }, { "epoch": 0.6460032626427407, "grad_norm": 0.08056640625, "learning_rate": 0.0001894065945180558, "loss": 1.0828, "step": 99 }, { "epoch": 0.6525285481239804, "grad_norm": 0.07861328125, "learning_rate": 0.0001891716189372757, "loss": 1.0805, "step": 100 }, { "epoch": 0.6590538336052202, "grad_norm": 0.07861328125, "learning_rate": 0.0001889342148882519, "loss": 1.0158, "step": 101 }, { "epoch": 0.6655791190864601, "grad_norm": 0.08056640625, "learning_rate": 0.00018869438883636214, "loss": 1.0686, "step": 102 }, { "epoch": 0.6721044045676998, "grad_norm": 0.07958984375, "learning_rate": 0.000188452147312944, "loss": 1.0498, "step": 103 }, { "epoch": 0.6786296900489397, "grad_norm": 0.07861328125, "learning_rate": 0.0001882074969151172, "loss": 1.0783, "step": 104 }, { "epoch": 0.6851549755301795, "grad_norm": 0.08056640625, "learning_rate": 0.000187960444305604, "loss": 0.9942, "step": 105 }, { "epoch": 0.6916802610114192, "grad_norm": 0.0791015625, "learning_rate": 0.00018771099621254746, "loss": 1.0517, "step": 106 }, { "epoch": 0.6982055464926591, "grad_norm": 0.0810546875, "learning_rate": 0.0001874591594293285, "loss": 1.0619, "step": 107 }, { "epoch": 0.7047308319738989, "grad_norm": 0.08251953125, "learning_rate": 0.00018720494081438078, "loss": 1.0584, "step": 108 }, { "epoch": 0.7112561174551386, "grad_norm": 0.08740234375, "learning_rate": 0.00018694834729100386, "loss": 1.0466, "step": 109 }, { "epoch": 0.7177814029363785, "grad_norm": 0.08349609375, "learning_rate": 0.00018668938584717471, "loss": 1.0391, "step": 110 }, { "epoch": 0.7243066884176182, "grad_norm": 0.08251953125, "learning_rate": 0.00018642806353535754, "loss": 1.0331, "step": 111 }, { "epoch": 0.7308319738988581, "grad_norm": 0.0859375, "learning_rate": 0.00018616438747231148, "loss": 1.0671, "step": 112 }, { "epoch": 0.7373572593800979, "grad_norm": 0.087890625, "learning_rate": 0.00018589836483889687, "loss": 1.0525, "step": 113 }, { "epoch": 0.7438825448613376, "grad_norm": 0.08935546875, "learning_rate": 0.0001856300028798798, "loss": 1.0182, "step": 114 }, { "epoch": 0.7504078303425775, "grad_norm": 0.08349609375, "learning_rate": 0.00018535930890373466, "loss": 1.0566, "step": 115 }, { "epoch": 0.7569331158238173, "grad_norm": 0.080078125, "learning_rate": 0.00018508629028244519, "loss": 1.0316, "step": 116 }, { "epoch": 0.763458401305057, "grad_norm": 0.09033203125, "learning_rate": 0.0001848109544513037, "loss": 1.0362, "step": 117 }, { "epoch": 0.763458401305057, "eval_loss": 1.0525188446044922, "eval_runtime": 236.6432, "eval_samples_per_second": 1.8, "eval_steps_per_second": 0.9, "step": 117 }, { "epoch": 0.7699836867862969, "grad_norm": 0.08544921875, "learning_rate": 0.00018453330890870855, "loss": 1.0141, "step": 118 }, { "epoch": 0.7765089722675367, "grad_norm": 0.08203125, "learning_rate": 0.00018425336121596, "loss": 1.0464, "step": 119 }, { "epoch": 0.7830342577487766, "grad_norm": 0.0830078125, "learning_rate": 0.00018397111899705419, "loss": 1.0278, "step": 120 }, { "epoch": 0.7895595432300163, "grad_norm": 0.080078125, "learning_rate": 0.00018368658993847566, "loss": 1.0439, "step": 121 }, { "epoch": 0.7960848287112561, "grad_norm": 0.08251953125, "learning_rate": 0.0001833997817889878, "loss": 1.053, "step": 122 }, { "epoch": 0.802610114192496, "grad_norm": 0.08642578125, "learning_rate": 0.0001831107023594221, "loss": 1.0447, "step": 123 }, { "epoch": 0.8091353996737357, "grad_norm": 0.08349609375, "learning_rate": 0.00018281935952246518, "loss": 1.0347, "step": 124 }, { "epoch": 0.8156606851549756, "grad_norm": 0.0810546875, "learning_rate": 0.00018252576121244456, "loss": 1.0628, "step": 125 }, { "epoch": 0.8221859706362153, "grad_norm": 0.08056640625, "learning_rate": 0.00018222991542511246, "loss": 1.0112, "step": 126 }, { "epoch": 0.8287112561174551, "grad_norm": 0.08984375, "learning_rate": 0.0001819318302174281, "loss": 1.0657, "step": 127 }, { "epoch": 0.835236541598695, "grad_norm": 0.08837890625, "learning_rate": 0.00018163151370733838, "loss": 1.0936, "step": 128 }, { "epoch": 0.8417618270799347, "grad_norm": 0.0869140625, "learning_rate": 0.00018132897407355657, "loss": 1.0519, "step": 129 }, { "epoch": 0.8482871125611745, "grad_norm": 0.08447265625, "learning_rate": 0.00018102421955533974, "loss": 1.018, "step": 130 }, { "epoch": 0.8548123980424144, "grad_norm": 0.08349609375, "learning_rate": 0.00018071725845226436, "loss": 0.9982, "step": 131 }, { "epoch": 0.8613376835236541, "grad_norm": 0.0966796875, "learning_rate": 0.0001804080991240003, "loss": 1.0522, "step": 132 }, { "epoch": 0.867862969004894, "grad_norm": 0.0830078125, "learning_rate": 0.000180096749990083, "loss": 1.0149, "step": 133 }, { "epoch": 0.8743882544861338, "grad_norm": 0.0810546875, "learning_rate": 0.00017978321952968434, "loss": 1.0096, "step": 134 }, { "epoch": 0.8809135399673735, "grad_norm": 0.09765625, "learning_rate": 0.00017946751628138174, "loss": 1.0551, "step": 135 }, { "epoch": 0.8874388254486134, "grad_norm": 0.083984375, "learning_rate": 0.00017914964884292544, "loss": 1.0384, "step": 136 }, { "epoch": 0.8939641109298532, "grad_norm": 0.09326171875, "learning_rate": 0.0001788296258710045, "loss": 1.0341, "step": 137 }, { "epoch": 0.9004893964110929, "grad_norm": 0.0859375, "learning_rate": 0.0001785074560810111, "loss": 1.0162, "step": 138 }, { "epoch": 0.9070146818923328, "grad_norm": 0.0849609375, "learning_rate": 0.000178183148246803, "loss": 1.0332, "step": 139 }, { "epoch": 0.9135399673735726, "grad_norm": 0.08251953125, "learning_rate": 0.00017785671120046473, "loss": 1.0219, "step": 140 }, { "epoch": 0.9200652528548124, "grad_norm": 0.080078125, "learning_rate": 0.00017752815383206705, "loss": 1.0557, "step": 141 }, { "epoch": 0.9265905383360522, "grad_norm": 0.0849609375, "learning_rate": 0.0001771974850894248, "loss": 1.075, "step": 142 }, { "epoch": 0.933115823817292, "grad_norm": 0.087890625, "learning_rate": 0.0001768647139778532, "loss": 1.0458, "step": 143 }, { "epoch": 0.9396411092985318, "grad_norm": 0.0830078125, "learning_rate": 0.00017652984955992277, "loss": 1.054, "step": 144 }, { "epoch": 0.9461663947797716, "grad_norm": 0.08154296875, "learning_rate": 0.0001761929009552122, "loss": 1.0297, "step": 145 }, { "epoch": 0.9526916802610114, "grad_norm": 0.0849609375, "learning_rate": 0.00017585387734006034, "loss": 1.0299, "step": 146 }, { "epoch": 0.9592169657422512, "grad_norm": 0.0810546875, "learning_rate": 0.00017551278794731607, "loss": 1.0578, "step": 147 }, { "epoch": 0.965742251223491, "grad_norm": 0.0859375, "learning_rate": 0.00017516964206608696, "loss": 1.0362, "step": 148 }, { "epoch": 0.9722675367047309, "grad_norm": 0.09130859375, "learning_rate": 0.00017482444904148617, "loss": 1.0356, "step": 149 }, { "epoch": 0.9787928221859706, "grad_norm": 0.0869140625, "learning_rate": 0.0001744772182743782, "loss": 1.038, "step": 150 }, { "epoch": 0.9853181076672104, "grad_norm": 0.0859375, "learning_rate": 0.00017412795922112253, "loss": 1.0445, "step": 151 }, { "epoch": 0.9918433931484503, "grad_norm": 0.08544921875, "learning_rate": 0.0001737766813933164, "loss": 1.0524, "step": 152 }, { "epoch": 0.99836867862969, "grad_norm": 0.08837890625, "learning_rate": 0.00017342339435753553, "loss": 1.0517, "step": 153 }, { "epoch": 1.0048939641109298, "grad_norm": 0.083984375, "learning_rate": 0.00017306810773507376, "loss": 1.0033, "step": 154 }, { "epoch": 1.0114192495921697, "grad_norm": 0.09033203125, "learning_rate": 0.00017271083120168102, "loss": 1.0477, "step": 155 }, { "epoch": 1.0048939641109298, "grad_norm": 0.13671875, "learning_rate": 0.00017235157448729967, "loss": 1.0619, "step": 156 }, { "epoch": 1.0048939641109298, "eval_loss": 1.0388679504394531, "eval_runtime": 236.7713, "eval_samples_per_second": 1.799, "eval_steps_per_second": 0.9, "step": 156 }, { "epoch": 1.0114192495921697, "grad_norm": 0.08740234375, "learning_rate": 0.0001719903473757996, "loss": 1.0131, "step": 157 }, { "epoch": 1.0179445350734095, "grad_norm": 0.0927734375, "learning_rate": 0.0001716271597047119, "loss": 0.9966, "step": 158 }, { "epoch": 1.0244698205546492, "grad_norm": 0.0947265625, "learning_rate": 0.0001712620213649608, "loss": 0.9817, "step": 159 }, { "epoch": 1.030995106035889, "grad_norm": 0.0888671875, "learning_rate": 0.00017089494230059432, "loss": 1.0036, "step": 160 }, { "epoch": 1.037520391517129, "grad_norm": 0.09716796875, "learning_rate": 0.0001705259325085135, "loss": 1.0318, "step": 161 }, { "epoch": 1.0440456769983686, "grad_norm": 0.09814453125, "learning_rate": 0.00017015500203820022, "loss": 1.0106, "step": 162 }, { "epoch": 1.0505709624796085, "grad_norm": 0.09521484375, "learning_rate": 0.0001697821609914432, "loss": 1.0178, "step": 163 }, { "epoch": 1.0570962479608483, "grad_norm": 0.09033203125, "learning_rate": 0.0001694074195220634, "loss": 0.9909, "step": 164 }, { "epoch": 1.0636215334420882, "grad_norm": 0.08740234375, "learning_rate": 0.000169030787835637, "loss": 0.9916, "step": 165 }, { "epoch": 1.0701468189233279, "grad_norm": 0.09130859375, "learning_rate": 0.00016865227618921788, "loss": 1.0101, "step": 166 }, { "epoch": 1.0766721044045677, "grad_norm": 0.0908203125, "learning_rate": 0.00016827189489105788, "loss": 0.9966, "step": 167 }, { "epoch": 1.0831973898858076, "grad_norm": 0.09521484375, "learning_rate": 0.00016788965430032638, "loss": 1.0048, "step": 168 }, { "epoch": 1.0897226753670473, "grad_norm": 0.09521484375, "learning_rate": 0.00016750556482682805, "loss": 1.0074, "step": 169 }, { "epoch": 1.0962479608482871, "grad_norm": 0.08935546875, "learning_rate": 0.00016711963693071943, "loss": 0.9762, "step": 170 }, { "epoch": 1.102773246329527, "grad_norm": 0.09619140625, "learning_rate": 0.00016673188112222394, "loss": 1.0044, "step": 171 }, { "epoch": 1.1092985318107667, "grad_norm": 0.091796875, "learning_rate": 0.00016634230796134576, "loss": 0.9972, "step": 172 }, { "epoch": 1.1158238172920065, "grad_norm": 0.09228515625, "learning_rate": 0.0001659509280575821, "loss": 1.0308, "step": 173 }, { "epoch": 1.1223491027732464, "grad_norm": 0.0927734375, "learning_rate": 0.0001655577520696346, "loss": 0.996, "step": 174 }, { "epoch": 1.128874388254486, "grad_norm": 0.0947265625, "learning_rate": 0.00016516279070511854, "loss": 0.9671, "step": 175 }, { "epoch": 1.135399673735726, "grad_norm": 0.09228515625, "learning_rate": 0.00016476605472027172, "loss": 1.0121, "step": 176 }, { "epoch": 1.1419249592169658, "grad_norm": 0.0927734375, "learning_rate": 0.00016436755491966115, "loss": 1.0132, "step": 177 }, { "epoch": 1.1484502446982057, "grad_norm": 0.09375, "learning_rate": 0.00016396730215588915, "loss": 1.0146, "step": 178 }, { "epoch": 1.1549755301794453, "grad_norm": 0.09521484375, "learning_rate": 0.0001635653073292975, "loss": 0.9911, "step": 179 }, { "epoch": 1.1615008156606852, "grad_norm": 0.09716796875, "learning_rate": 0.0001631615813876707, "loss": 1.0125, "step": 180 }, { "epoch": 1.1680261011419248, "grad_norm": 0.09521484375, "learning_rate": 0.0001627561353259379, "loss": 1.0162, "step": 181 }, { "epoch": 1.1745513866231647, "grad_norm": 0.09716796875, "learning_rate": 0.00016234898018587337, "loss": 0.995, "step": 182 }, { "epoch": 1.1810766721044046, "grad_norm": 0.09326171875, "learning_rate": 0.00016194012705579572, "loss": 0.9839, "step": 183 }, { "epoch": 1.1876019575856445, "grad_norm": 0.09716796875, "learning_rate": 0.00016152958707026614, "loss": 1.0107, "step": 184 }, { "epoch": 1.1941272430668841, "grad_norm": 0.09326171875, "learning_rate": 0.00016111737140978494, "loss": 0.9715, "step": 185 }, { "epoch": 1.200652528548124, "grad_norm": 0.09326171875, "learning_rate": 0.00016070349130048724, "loss": 0.9751, "step": 186 }, { "epoch": 1.2071778140293639, "grad_norm": 0.09375, "learning_rate": 0.00016028795801383718, "loss": 0.9845, "step": 187 }, { "epoch": 1.2137030995106035, "grad_norm": 0.10302734375, "learning_rate": 0.0001598707828663209, "loss": 0.9604, "step": 188 }, { "epoch": 1.2202283849918434, "grad_norm": 0.09765625, "learning_rate": 0.00015945197721913833, "loss": 0.9715, "step": 189 }, { "epoch": 1.2267536704730833, "grad_norm": 0.0966796875, "learning_rate": 0.00015903155247789404, "loss": 1.0156, "step": 190 }, { "epoch": 1.233278955954323, "grad_norm": 0.09814453125, "learning_rate": 0.00015860952009228625, "loss": 1.0044, "step": 191 }, { "epoch": 1.2398042414355628, "grad_norm": 0.099609375, "learning_rate": 0.0001581858915557953, "loss": 0.9849, "step": 192 }, { "epoch": 1.2463295269168027, "grad_norm": 0.09619140625, "learning_rate": 0.0001577606784053705, "loss": 1.0007, "step": 193 }, { "epoch": 1.2528548123980423, "grad_norm": 0.0947265625, "learning_rate": 0.00015733389222111592, "loss": 0.9682, "step": 194 }, { "epoch": 1.2593800978792822, "grad_norm": 0.0966796875, "learning_rate": 0.00015690554462597522, "loss": 1.0165, "step": 195 }, { "epoch": 1.2593800978792822, "eval_loss": 1.0322120189666748, "eval_runtime": 236.6694, "eval_samples_per_second": 1.8, "eval_steps_per_second": 0.9, "step": 195 }, { "epoch": 1.265905383360522, "grad_norm": 0.1005859375, "learning_rate": 0.00015647564728541485, "loss": 1.0376, "step": 196 }, { "epoch": 1.272430668841762, "grad_norm": 0.09619140625, "learning_rate": 0.0001560442119071065, "loss": 1.0066, "step": 197 }, { "epoch": 1.2789559543230016, "grad_norm": 0.095703125, "learning_rate": 0.00015561125024060826, "loss": 0.9872, "step": 198 }, { "epoch": 1.2854812398042414, "grad_norm": 0.0966796875, "learning_rate": 0.0001551767740770446, "loss": 1.0079, "step": 199 }, { "epoch": 1.2920065252854813, "grad_norm": 0.09765625, "learning_rate": 0.00015474079524878525, "loss": 0.9865, "step": 200 }, { "epoch": 1.298531810766721, "grad_norm": 0.09423828125, "learning_rate": 0.000154303325629123, "loss": 0.9747, "step": 201 }, { "epoch": 1.3050570962479608, "grad_norm": 0.09814453125, "learning_rate": 0.0001538643771319503, "loss": 1.0119, "step": 202 }, { "epoch": 1.3115823817292007, "grad_norm": 0.095703125, "learning_rate": 0.00015342396171143488, "loss": 0.9815, "step": 203 }, { "epoch": 1.3181076672104406, "grad_norm": 0.09912109375, "learning_rate": 0.00015298209136169403, "loss": 0.9772, "step": 204 }, { "epoch": 1.3246329526916802, "grad_norm": 0.095703125, "learning_rate": 0.00015253877811646817, "loss": 0.9804, "step": 205 }, { "epoch": 1.3311582381729201, "grad_norm": 0.09716796875, "learning_rate": 0.00015209403404879303, "loss": 0.9457, "step": 206 }, { "epoch": 1.3376835236541598, "grad_norm": 0.1005859375, "learning_rate": 0.0001516478712706708, "loss": 0.9951, "step": 207 }, { "epoch": 1.3442088091353996, "grad_norm": 0.09521484375, "learning_rate": 0.00015120030193274027, "loss": 1.0085, "step": 208 }, { "epoch": 1.3507340946166395, "grad_norm": 0.09619140625, "learning_rate": 0.00015075133822394613, "loss": 1.0105, "step": 209 }, { "epoch": 1.3572593800978794, "grad_norm": 0.1025390625, "learning_rate": 0.00015030099237120674, "loss": 1.005, "step": 210 }, { "epoch": 1.363784665579119, "grad_norm": 0.0986328125, "learning_rate": 0.00014984927663908137, "loss": 0.9703, "step": 211 }, { "epoch": 1.370309951060359, "grad_norm": 0.0947265625, "learning_rate": 0.00014939620332943604, "loss": 0.9747, "step": 212 }, { "epoch": 1.3768352365415986, "grad_norm": 0.09716796875, "learning_rate": 0.00014894178478110857, "loss": 0.9785, "step": 213 }, { "epoch": 1.3833605220228384, "grad_norm": 0.09814453125, "learning_rate": 0.00014848603336957251, "loss": 1.0329, "step": 214 }, { "epoch": 1.3898858075040783, "grad_norm": 0.09423828125, "learning_rate": 0.00014802896150660022, "loss": 0.9429, "step": 215 }, { "epoch": 1.3964110929853182, "grad_norm": 0.099609375, "learning_rate": 0.00014757058163992464, "loss": 0.9578, "step": 216 }, { "epoch": 1.4029363784665578, "grad_norm": 0.1025390625, "learning_rate": 0.00014711090625290057, "loss": 1.0216, "step": 217 }, { "epoch": 1.4094616639477977, "grad_norm": 0.10986328125, "learning_rate": 0.0001466499478641644, "loss": 0.9792, "step": 218 }, { "epoch": 1.4159869494290376, "grad_norm": 0.095703125, "learning_rate": 0.00014618771902729342, "loss": 1.0001, "step": 219 }, { "epoch": 1.4225122349102772, "grad_norm": 0.1044921875, "learning_rate": 0.00014572423233046386, "loss": 1.0057, "step": 220 }, { "epoch": 1.429037520391517, "grad_norm": 0.09912109375, "learning_rate": 0.000145259500396108, "loss": 0.9698, "step": 221 }, { "epoch": 1.435562805872757, "grad_norm": 0.09814453125, "learning_rate": 0.00014479353588057052, "loss": 0.9533, "step": 222 }, { "epoch": 1.4420880913539968, "grad_norm": 0.103515625, "learning_rate": 0.00014432635147376376, "loss": 0.9686, "step": 223 }, { "epoch": 1.4486133768352365, "grad_norm": 0.0986328125, "learning_rate": 0.00014385795989882221, "loss": 1.0278, "step": 224 }, { "epoch": 1.4551386623164764, "grad_norm": 0.103515625, "learning_rate": 0.00014338837391175582, "loss": 1.0013, "step": 225 }, { "epoch": 1.461663947797716, "grad_norm": 0.10400390625, "learning_rate": 0.00014291760630110288, "loss": 1.0053, "step": 226 }, { "epoch": 1.468189233278956, "grad_norm": 0.10302734375, "learning_rate": 0.00014244566988758152, "loss": 1.0216, "step": 227 }, { "epoch": 1.4747145187601958, "grad_norm": 0.09912109375, "learning_rate": 0.0001419725775237406, "loss": 0.992, "step": 228 }, { "epoch": 1.4812398042414356, "grad_norm": 0.099609375, "learning_rate": 0.00014149834209360986, "loss": 1.0053, "step": 229 }, { "epoch": 1.4877650897226753, "grad_norm": 0.103515625, "learning_rate": 0.0001410229765123487, "loss": 0.9948, "step": 230 }, { "epoch": 1.4942903752039152, "grad_norm": 0.09912109375, "learning_rate": 0.00014054649372589482, "loss": 0.9717, "step": 231 }, { "epoch": 1.5008156606851548, "grad_norm": 0.0986328125, "learning_rate": 0.00014006890671061143, "loss": 0.9805, "step": 232 }, { "epoch": 1.5073409461663947, "grad_norm": 0.10400390625, "learning_rate": 0.00013959022847293391, "loss": 0.9826, "step": 233 }, { "epoch": 1.5138662316476346, "grad_norm": 0.09765625, "learning_rate": 0.0001391104720490156, "loss": 0.9394, "step": 234 }, { "epoch": 1.5138662316476346, "eval_loss": 1.0245814323425293, "eval_runtime": 236.6376, "eval_samples_per_second": 1.8, "eval_steps_per_second": 0.9, "step": 234 }, { "epoch": 1.5203915171288744, "grad_norm": 0.10107421875, "learning_rate": 0.0001386296505043728, "loss": 0.9857, "step": 235 }, { "epoch": 1.5269168026101143, "grad_norm": 0.099609375, "learning_rate": 0.000138147776933529, "loss": 0.997, "step": 236 }, { "epoch": 1.533442088091354, "grad_norm": 0.107421875, "learning_rate": 0.00013766486445965795, "loss": 0.9927, "step": 237 }, { "epoch": 1.5399673735725938, "grad_norm": 0.095703125, "learning_rate": 0.00013718092623422686, "loss": 0.9745, "step": 238 }, { "epoch": 1.5464926590538335, "grad_norm": 0.1005859375, "learning_rate": 0.00013669597543663762, "loss": 0.9842, "step": 239 }, { "epoch": 1.5530179445350734, "grad_norm": 0.099609375, "learning_rate": 0.00013621002527386834, "loss": 1.0102, "step": 240 }, { "epoch": 1.5595432300163132, "grad_norm": 0.1015625, "learning_rate": 0.0001357230889801133, "loss": 0.9389, "step": 241 }, { "epoch": 1.566068515497553, "grad_norm": 0.099609375, "learning_rate": 0.00013523517981642286, "loss": 1.0044, "step": 242 }, { "epoch": 1.572593800978793, "grad_norm": 0.09716796875, "learning_rate": 0.0001347463110703422, "loss": 0.9698, "step": 243 }, { "epoch": 1.5791190864600326, "grad_norm": 0.1015625, "learning_rate": 0.00013425649605554928, "loss": 0.9858, "step": 244 }, { "epoch": 1.5856443719412723, "grad_norm": 0.1396484375, "learning_rate": 0.00013376574811149253, "loss": 0.9513, "step": 245 }, { "epoch": 1.5921696574225122, "grad_norm": 0.10009765625, "learning_rate": 0.00013327408060302738, "loss": 0.9943, "step": 246 }, { "epoch": 1.598694942903752, "grad_norm": 0.0986328125, "learning_rate": 0.00013278150692005243, "loss": 0.9933, "step": 247 }, { "epoch": 1.605220228384992, "grad_norm": 0.1005859375, "learning_rate": 0.00013228804047714463, "loss": 0.9865, "step": 248 }, { "epoch": 1.6117455138662318, "grad_norm": 0.09716796875, "learning_rate": 0.00013179369471319404, "loss": 0.9466, "step": 249 }, { "epoch": 1.6182707993474714, "grad_norm": 0.0986328125, "learning_rate": 0.0001312984830910379, "loss": 0.9684, "step": 250 }, { "epoch": 1.6247960848287113, "grad_norm": 0.10205078125, "learning_rate": 0.00013080241909709387, "loss": 1.004, "step": 251 }, { "epoch": 1.631321370309951, "grad_norm": 0.10400390625, "learning_rate": 0.00013030551624099287, "loss": 0.9964, "step": 252 }, { "epoch": 1.6378466557911908, "grad_norm": 0.10888671875, "learning_rate": 0.000129807788055211, "loss": 0.9745, "step": 253 }, { "epoch": 1.6443719412724307, "grad_norm": 0.1044921875, "learning_rate": 0.00012930924809470115, "loss": 1.0074, "step": 254 }, { "epoch": 1.6508972267536706, "grad_norm": 0.1064453125, "learning_rate": 0.00012880990993652377, "loss": 0.9842, "step": 255 }, { "epoch": 1.6574225122349104, "grad_norm": 0.1025390625, "learning_rate": 0.00012830978717947718, "loss": 0.9699, "step": 256 }, { "epoch": 1.66394779771615, "grad_norm": 0.10009765625, "learning_rate": 0.00012780889344372718, "loss": 0.9892, "step": 257 }, { "epoch": 1.6704730831973897, "grad_norm": 0.1025390625, "learning_rate": 0.00012730724237043615, "loss": 0.9804, "step": 258 }, { "epoch": 1.6769983686786296, "grad_norm": 0.09814453125, "learning_rate": 0.0001268048476213914, "loss": 0.9873, "step": 259 }, { "epoch": 1.6835236541598695, "grad_norm": 0.09765625, "learning_rate": 0.0001263017228786334, "loss": 0.9737, "step": 260 }, { "epoch": 1.6900489396411094, "grad_norm": 0.10107421875, "learning_rate": 0.00012579788184408295, "loss": 1.0064, "step": 261 }, { "epoch": 1.6965742251223492, "grad_norm": 0.10888671875, "learning_rate": 0.00012529333823916807, "loss": 0.9745, "step": 262 }, { "epoch": 1.7030995106035889, "grad_norm": 0.1005859375, "learning_rate": 0.0001247881058044504, "loss": 0.9988, "step": 263 }, { "epoch": 1.7096247960848288, "grad_norm": 0.10595703125, "learning_rate": 0.00012428219829925083, "loss": 1.0021, "step": 264 }, { "epoch": 1.7161500815660684, "grad_norm": 0.09814453125, "learning_rate": 0.00012377562950127493, "loss": 0.9681, "step": 265 }, { "epoch": 1.7226753670473083, "grad_norm": 0.107421875, "learning_rate": 0.00012326841320623767, "loss": 0.9968, "step": 266 }, { "epoch": 1.7292006525285482, "grad_norm": 0.10791015625, "learning_rate": 0.00012276056322748778, "loss": 1.0235, "step": 267 }, { "epoch": 1.735725938009788, "grad_norm": 0.10498046875, "learning_rate": 0.00012225209339563145, "loss": 0.977, "step": 268 }, { "epoch": 1.7422512234910277, "grad_norm": 0.10302734375, "learning_rate": 0.00012174301755815571, "loss": 1.0021, "step": 269 }, { "epoch": 1.7487765089722676, "grad_norm": 0.09716796875, "learning_rate": 0.0001212333495790514, "loss": 0.9781, "step": 270 }, { "epoch": 1.7553017944535072, "grad_norm": 0.10546875, "learning_rate": 0.00012072310333843544, "loss": 0.9901, "step": 271 }, { "epoch": 1.761827079934747, "grad_norm": 0.10205078125, "learning_rate": 0.00012021229273217302, "loss": 0.9675, "step": 272 }, { "epoch": 1.768352365415987, "grad_norm": 0.10546875, "learning_rate": 0.00011970093167149905, "loss": 0.999, "step": 273 }, { "epoch": 1.768352365415987, "eval_loss": 1.0181984901428223, "eval_runtime": 236.6668, "eval_samples_per_second": 1.8, "eval_steps_per_second": 0.9, "step": 273 }, { "epoch": 1.7748776508972268, "grad_norm": 0.1005859375, "learning_rate": 0.00011918903408263924, "loss": 0.9798, "step": 274 }, { "epoch": 1.7814029363784667, "grad_norm": 0.103515625, "learning_rate": 0.000118676613906431, "loss": 0.9826, "step": 275 }, { "epoch": 1.7879282218597063, "grad_norm": 0.0986328125, "learning_rate": 0.00011816368509794364, "loss": 0.9781, "step": 276 }, { "epoch": 1.7944535073409462, "grad_norm": 0.099609375, "learning_rate": 0.00011765026162609847, "loss": 0.9811, "step": 277 }, { "epoch": 1.8009787928221859, "grad_norm": 0.10498046875, "learning_rate": 0.00011713635747328818, "loss": 0.9859, "step": 278 }, { "epoch": 1.8075040783034257, "grad_norm": 0.10107421875, "learning_rate": 0.00011662198663499619, "loss": 0.9644, "step": 279 }, { "epoch": 1.8140293637846656, "grad_norm": 0.1025390625, "learning_rate": 0.0001161071631194155, "loss": 0.9931, "step": 280 }, { "epoch": 1.8205546492659055, "grad_norm": 0.09716796875, "learning_rate": 0.00011559190094706714, "loss": 1.0061, "step": 281 }, { "epoch": 1.8270799347471451, "grad_norm": 0.1015625, "learning_rate": 0.00011507621415041837, "loss": 0.9511, "step": 282 }, { "epoch": 1.833605220228385, "grad_norm": 0.10888671875, "learning_rate": 0.00011456011677350051, "loss": 0.9875, "step": 283 }, { "epoch": 1.8401305057096247, "grad_norm": 0.099609375, "learning_rate": 0.00011404362287152646, "loss": 0.9577, "step": 284 }, { "epoch": 1.8466557911908645, "grad_norm": 0.10205078125, "learning_rate": 0.00011352674651050796, "loss": 1.0018, "step": 285 }, { "epoch": 1.8531810766721044, "grad_norm": 0.1025390625, "learning_rate": 0.00011300950176687255, "loss": 0.9738, "step": 286 }, { "epoch": 1.8597063621533443, "grad_norm": 0.10009765625, "learning_rate": 0.00011249190272708008, "loss": 0.9635, "step": 287 }, { "epoch": 1.8662316476345842, "grad_norm": 0.10302734375, "learning_rate": 0.00011197396348723923, "loss": 0.9783, "step": 288 }, { "epoch": 1.8727569331158238, "grad_norm": 0.1123046875, "learning_rate": 0.0001114556981527236, "loss": 0.9895, "step": 289 }, { "epoch": 1.8792822185970635, "grad_norm": 0.10009765625, "learning_rate": 0.00011093712083778746, "loss": 0.9993, "step": 290 }, { "epoch": 1.8858075040783033, "grad_norm": 0.1123046875, "learning_rate": 0.00011041824566518146, "loss": 0.9842, "step": 291 }, { "epoch": 1.8923327895595432, "grad_norm": 0.10498046875, "learning_rate": 0.00010989908676576807, "loss": 0.9693, "step": 292 }, { "epoch": 1.898858075040783, "grad_norm": 0.0986328125, "learning_rate": 0.00010937965827813661, "loss": 0.9838, "step": 293 }, { "epoch": 1.905383360522023, "grad_norm": 0.1064453125, "learning_rate": 0.00010885997434821831, "loss": 0.9992, "step": 294 }, { "epoch": 1.9119086460032626, "grad_norm": 0.09912109375, "learning_rate": 0.00010834004912890092, "loss": 1.0066, "step": 295 }, { "epoch": 1.9184339314845025, "grad_norm": 0.10009765625, "learning_rate": 0.00010781989677964355, "loss": 0.9737, "step": 296 }, { "epoch": 1.9249592169657421, "grad_norm": 0.10546875, "learning_rate": 0.00010729953146609076, "loss": 0.9913, "step": 297 }, { "epoch": 1.931484502446982, "grad_norm": 0.10546875, "learning_rate": 0.00010677896735968693, "loss": 0.9835, "step": 298 }, { "epoch": 1.9380097879282219, "grad_norm": 0.1015625, "learning_rate": 0.00010625821863729036, "loss": 0.967, "step": 299 }, { "epoch": 1.9445350734094617, "grad_norm": 0.1015625, "learning_rate": 0.00010573729948078699, "loss": 0.9724, "step": 300 }, { "epoch": 1.9510603588907016, "grad_norm": 0.1005859375, "learning_rate": 0.00010521622407670439, "loss": 0.9732, "step": 301 }, { "epoch": 1.9575856443719413, "grad_norm": 0.1005859375, "learning_rate": 0.00010469500661582536, "loss": 1.0138, "step": 302 }, { "epoch": 1.964110929853181, "grad_norm": 0.1015625, "learning_rate": 0.00010417366129280133, "loss": 0.9903, "step": 303 }, { "epoch": 1.9706362153344208, "grad_norm": 0.099609375, "learning_rate": 0.0001036522023057659, "loss": 0.962, "step": 304 }, { "epoch": 1.9771615008156607, "grad_norm": 0.10302734375, "learning_rate": 0.00010313064385594822, "loss": 0.9793, "step": 305 }, { "epoch": 1.9836867862969005, "grad_norm": 0.103515625, "learning_rate": 0.0001026090001472861, "loss": 0.975, "step": 306 } ], "logging_steps": 1, "max_steps": 612, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 153, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.127437882233979e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }