{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9814814814814814, "eval_steps": 81, "global_step": 648, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030864197530864196, "grad_norm": 0.11897344887256622, "learning_rate": 1.0000000000000002e-06, "loss": 0.6253, "step": 1 }, { "epoch": 0.0030864197530864196, "eval_loss": 0.6252603530883789, "eval_runtime": 44.2936, "eval_samples_per_second": 8.308, "eval_steps_per_second": 1.039, "step": 1 }, { "epoch": 0.006172839506172839, "grad_norm": 0.11417510360479355, "learning_rate": 2.0000000000000003e-06, "loss": 0.6376, "step": 2 }, { "epoch": 0.009259259259259259, "grad_norm": 0.0693814605474472, "learning_rate": 3e-06, "loss": 0.2684, "step": 3 }, { "epoch": 0.012345679012345678, "grad_norm": 0.1110842302441597, "learning_rate": 4.000000000000001e-06, "loss": 0.5096, "step": 4 }, { "epoch": 0.015432098765432098, "grad_norm": 0.09205043315887451, "learning_rate": 5e-06, "loss": 0.5674, "step": 5 }, { "epoch": 0.018518518518518517, "grad_norm": 0.1063380092382431, "learning_rate": 6e-06, "loss": 0.6219, "step": 6 }, { "epoch": 0.021604938271604937, "grad_norm": 0.0740552470088005, "learning_rate": 7e-06, "loss": 0.5478, "step": 7 }, { "epoch": 0.024691358024691357, "grad_norm": 0.10674550384283066, "learning_rate": 8.000000000000001e-06, "loss": 0.6168, "step": 8 }, { "epoch": 0.027777777777777776, "grad_norm": 0.1061239168047905, "learning_rate": 9e-06, "loss": 0.7106, "step": 9 }, { "epoch": 0.030864197530864196, "grad_norm": 0.10123332589864731, "learning_rate": 1e-05, "loss": 0.5221, "step": 10 }, { "epoch": 0.033950617283950615, "grad_norm": 0.06680818647146225, "learning_rate": 9.999939382570075e-06, "loss": 0.2592, "step": 11 }, { "epoch": 0.037037037037037035, "grad_norm": 0.09670277684926987, "learning_rate": 9.999757531750086e-06, "loss": 0.5183, "step": 12 }, { "epoch": 0.040123456790123455, "grad_norm": 0.07567557692527771, "learning_rate": 9.999454451949364e-06, "loss": 0.3257, "step": 13 }, { "epoch": 0.043209876543209874, "grad_norm": 0.10101059824228287, "learning_rate": 9.999030150516681e-06, "loss": 0.4788, "step": 14 }, { "epoch": 0.046296296296296294, "grad_norm": 0.1238669604063034, "learning_rate": 9.998484637740058e-06, "loss": 0.6218, "step": 15 }, { "epoch": 0.04938271604938271, "grad_norm": 0.10699903219938278, "learning_rate": 9.997817926846528e-06, "loss": 0.6429, "step": 16 }, { "epoch": 0.05246913580246913, "grad_norm": 0.08470468968153, "learning_rate": 9.997030034001815e-06, "loss": 0.3134, "step": 17 }, { "epoch": 0.05555555555555555, "grad_norm": 0.1229688748717308, "learning_rate": 9.99612097830993e-06, "loss": 0.712, "step": 18 }, { "epoch": 0.05864197530864197, "grad_norm": 0.10526233166456223, "learning_rate": 9.995090781812724e-06, "loss": 0.504, "step": 19 }, { "epoch": 0.06172839506172839, "grad_norm": 0.11165868490934372, "learning_rate": 9.993939469489342e-06, "loss": 0.5122, "step": 20 }, { "epoch": 0.06481481481481481, "grad_norm": 0.09065920859575272, "learning_rate": 9.99266706925562e-06, "loss": 0.4664, "step": 21 }, { "epoch": 0.06790123456790123, "grad_norm": 0.10060250014066696, "learning_rate": 9.991273611963413e-06, "loss": 0.4732, "step": 22 }, { "epoch": 0.07098765432098765, "grad_norm": 0.10402392596006393, "learning_rate": 9.98975913139984e-06, "loss": 0.4899, "step": 23 }, { "epoch": 0.07407407407407407, "grad_norm": 0.11345162242650986, "learning_rate": 9.98812366428647e-06, "loss": 0.5365, "step": 24 }, { "epoch": 0.07716049382716049, "grad_norm": 0.1189904510974884, "learning_rate": 9.986367250278423e-06, "loss": 0.6293, "step": 25 }, { "epoch": 0.08024691358024691, "grad_norm": 0.11722761392593384, "learning_rate": 9.984489931963429e-06, "loss": 0.4991, "step": 26 }, { "epoch": 0.08333333333333333, "grad_norm": 0.08803360909223557, "learning_rate": 9.982491754860763e-06, "loss": 0.381, "step": 27 }, { "epoch": 0.08641975308641975, "grad_norm": 0.11037921905517578, "learning_rate": 9.980372767420179e-06, "loss": 0.5814, "step": 28 }, { "epoch": 0.08950617283950617, "grad_norm": 0.0851665586233139, "learning_rate": 9.978133021020697e-06, "loss": 0.3629, "step": 29 }, { "epoch": 0.09259259259259259, "grad_norm": 0.10195960849523544, "learning_rate": 9.97577256996939e-06, "loss": 0.5672, "step": 30 }, { "epoch": 0.09567901234567901, "grad_norm": 0.12112904340028763, "learning_rate": 9.97329147150005e-06, "loss": 0.6165, "step": 31 }, { "epoch": 0.09876543209876543, "grad_norm": 0.07611838728189468, "learning_rate": 9.970689785771798e-06, "loss": 0.3902, "step": 32 }, { "epoch": 0.10185185185185185, "grad_norm": 0.1013374775648117, "learning_rate": 9.96796757586764e-06, "loss": 0.5096, "step": 33 }, { "epoch": 0.10493827160493827, "grad_norm": 0.08809865266084671, "learning_rate": 9.965124907792916e-06, "loss": 0.3333, "step": 34 }, { "epoch": 0.10802469135802469, "grad_norm": 0.0764087364077568, "learning_rate": 9.962161850473723e-06, "loss": 0.3461, "step": 35 }, { "epoch": 0.1111111111111111, "grad_norm": 0.0995788499712944, "learning_rate": 9.95907847575523e-06, "loss": 0.4225, "step": 36 }, { "epoch": 0.11419753086419752, "grad_norm": 0.11751396954059601, "learning_rate": 9.955874858399936e-06, "loss": 0.4991, "step": 37 }, { "epoch": 0.11728395061728394, "grad_norm": 0.10502217710018158, "learning_rate": 9.952551076085864e-06, "loss": 0.5847, "step": 38 }, { "epoch": 0.12037037037037036, "grad_norm": 0.1077880784869194, "learning_rate": 9.949107209404664e-06, "loss": 0.4901, "step": 39 }, { "epoch": 0.12345679012345678, "grad_norm": 0.08844556659460068, "learning_rate": 9.945543341859681e-06, "loss": 0.5752, "step": 40 }, { "epoch": 0.12654320987654322, "grad_norm": 0.10771756619215012, "learning_rate": 9.94185955986391e-06, "loss": 0.5393, "step": 41 }, { "epoch": 0.12962962962962962, "grad_norm": 0.07496192306280136, "learning_rate": 9.938055952737908e-06, "loss": 0.3334, "step": 42 }, { "epoch": 0.13271604938271606, "grad_norm": 0.106163389980793, "learning_rate": 9.934132612707631e-06, "loss": 0.5319, "step": 43 }, { "epoch": 0.13580246913580246, "grad_norm": 0.09276831895112991, "learning_rate": 9.930089634902197e-06, "loss": 0.486, "step": 44 }, { "epoch": 0.1388888888888889, "grad_norm": 0.09449384361505508, "learning_rate": 9.925927117351573e-06, "loss": 0.3858, "step": 45 }, { "epoch": 0.1419753086419753, "grad_norm": 0.07955848425626755, "learning_rate": 9.921645160984205e-06, "loss": 0.4648, "step": 46 }, { "epoch": 0.14506172839506173, "grad_norm": 0.10575301945209503, "learning_rate": 9.917243869624573e-06, "loss": 0.4704, "step": 47 }, { "epoch": 0.14814814814814814, "grad_norm": 0.0714716911315918, "learning_rate": 9.91272334999066e-06, "loss": 0.372, "step": 48 }, { "epoch": 0.15123456790123457, "grad_norm": 0.08894475549459457, "learning_rate": 9.908083711691383e-06, "loss": 0.5005, "step": 49 }, { "epoch": 0.15432098765432098, "grad_norm": 0.0800170972943306, "learning_rate": 9.903325067223918e-06, "loss": 0.3688, "step": 50 }, { "epoch": 0.1574074074074074, "grad_norm": 0.09310433268547058, "learning_rate": 9.898447531970989e-06, "loss": 0.5127, "step": 51 }, { "epoch": 0.16049382716049382, "grad_norm": 0.07690192013978958, "learning_rate": 9.893451224198051e-06, "loss": 0.2993, "step": 52 }, { "epoch": 0.16358024691358025, "grad_norm": 0.08025282621383667, "learning_rate": 9.888336265050443e-06, "loss": 0.4004, "step": 53 }, { "epoch": 0.16666666666666666, "grad_norm": 0.06500386446714401, "learning_rate": 9.883102778550434e-06, "loss": 0.3317, "step": 54 }, { "epoch": 0.1697530864197531, "grad_norm": 0.07926575839519501, "learning_rate": 9.877750891594224e-06, "loss": 0.3606, "step": 55 }, { "epoch": 0.1728395061728395, "grad_norm": 0.07245253026485443, "learning_rate": 9.872280733948867e-06, "loss": 0.4437, "step": 56 }, { "epoch": 0.17592592592592593, "grad_norm": 0.07353054732084274, "learning_rate": 9.866692438249124e-06, "loss": 0.36, "step": 57 }, { "epoch": 0.17901234567901234, "grad_norm": 0.09307980537414551, "learning_rate": 9.86098613999424e-06, "loss": 0.5175, "step": 58 }, { "epoch": 0.18209876543209877, "grad_norm": 0.07782690227031708, "learning_rate": 9.855161977544672e-06, "loss": 0.4332, "step": 59 }, { "epoch": 0.18518518518518517, "grad_norm": 0.06865860521793365, "learning_rate": 9.849220092118721e-06, "loss": 0.3464, "step": 60 }, { "epoch": 0.1882716049382716, "grad_norm": 0.0760008841753006, "learning_rate": 9.84316062778912e-06, "loss": 0.3808, "step": 61 }, { "epoch": 0.19135802469135801, "grad_norm": 0.07834326475858688, "learning_rate": 9.836983731479526e-06, "loss": 0.499, "step": 62 }, { "epoch": 0.19444444444444445, "grad_norm": 0.08240173012018204, "learning_rate": 9.830689552960974e-06, "loss": 0.4432, "step": 63 }, { "epoch": 0.19753086419753085, "grad_norm": 0.06976404786109924, "learning_rate": 9.824278244848236e-06, "loss": 0.3482, "step": 64 }, { "epoch": 0.2006172839506173, "grad_norm": 0.09335274249315262, "learning_rate": 9.817749962596115e-06, "loss": 0.4533, "step": 65 }, { "epoch": 0.2037037037037037, "grad_norm": 0.10973995178937912, "learning_rate": 9.811104864495691e-06, "loss": 0.6042, "step": 66 }, { "epoch": 0.20679012345679013, "grad_norm": 0.08284437656402588, "learning_rate": 9.804343111670472e-06, "loss": 0.4818, "step": 67 }, { "epoch": 0.20987654320987653, "grad_norm": 0.08448096364736557, "learning_rate": 9.797464868072489e-06, "loss": 0.518, "step": 68 }, { "epoch": 0.21296296296296297, "grad_norm": 0.07667321711778641, "learning_rate": 9.790470300478318e-06, "loss": 0.3757, "step": 69 }, { "epoch": 0.21604938271604937, "grad_norm": 0.0944654569029808, "learning_rate": 9.783359578485047e-06, "loss": 0.4863, "step": 70 }, { "epoch": 0.2191358024691358, "grad_norm": 0.07617281377315521, "learning_rate": 9.776132874506153e-06, "loss": 0.3484, "step": 71 }, { "epoch": 0.2222222222222222, "grad_norm": 0.09038567543029785, "learning_rate": 9.768790363767321e-06, "loss": 0.596, "step": 72 }, { "epoch": 0.22530864197530864, "grad_norm": 0.0843636766076088, "learning_rate": 9.761332224302209e-06, "loss": 0.4042, "step": 73 }, { "epoch": 0.22839506172839505, "grad_norm": 0.09003959596157074, "learning_rate": 9.753758636948112e-06, "loss": 0.5011, "step": 74 }, { "epoch": 0.23148148148148148, "grad_norm": 0.079057976603508, "learning_rate": 9.74606978534159e-06, "loss": 0.4703, "step": 75 }, { "epoch": 0.2345679012345679, "grad_norm": 0.07765232026576996, "learning_rate": 9.738265855914014e-06, "loss": 0.3294, "step": 76 }, { "epoch": 0.23765432098765432, "grad_norm": 0.07654544711112976, "learning_rate": 9.730347037887041e-06, "loss": 0.4039, "step": 77 }, { "epoch": 0.24074074074074073, "grad_norm": 0.05925621837377548, "learning_rate": 9.722313523268028e-06, "loss": 0.2295, "step": 78 }, { "epoch": 0.24382716049382716, "grad_norm": 0.07830403745174408, "learning_rate": 9.714165506845381e-06, "loss": 0.3721, "step": 79 }, { "epoch": 0.24691358024691357, "grad_norm": 0.09928114712238312, "learning_rate": 9.705903186183828e-06, "loss": 0.5154, "step": 80 }, { "epoch": 0.25, "grad_norm": 0.06352175772190094, "learning_rate": 9.697526761619621e-06, "loss": 0.2613, "step": 81 }, { "epoch": 0.25, "eval_loss": 0.5444870591163635, "eval_runtime": 44.3715, "eval_samples_per_second": 8.294, "eval_steps_per_second": 1.037, "step": 81 }, { "epoch": 0.25308641975308643, "grad_norm": 0.07308296114206314, "learning_rate": 9.689036436255698e-06, "loss": 0.3455, "step": 82 }, { "epoch": 0.25617283950617287, "grad_norm": 0.07788842916488647, "learning_rate": 9.680432415956736e-06, "loss": 0.4675, "step": 83 }, { "epoch": 0.25925925925925924, "grad_norm": 0.09506388008594513, "learning_rate": 9.671714909344175e-06, "loss": 0.5544, "step": 84 }, { "epoch": 0.2623456790123457, "grad_norm": 0.08810863643884659, "learning_rate": 9.66288412779115e-06, "loss": 0.497, "step": 85 }, { "epoch": 0.2654320987654321, "grad_norm": 0.06235141307115555, "learning_rate": 9.653940285417381e-06, "loss": 0.2775, "step": 86 }, { "epoch": 0.26851851851851855, "grad_norm": 0.07534658908843994, "learning_rate": 9.644883599083959e-06, "loss": 0.3706, "step": 87 }, { "epoch": 0.2716049382716049, "grad_norm": 0.11235971748828888, "learning_rate": 9.635714288388103e-06, "loss": 0.6166, "step": 88 }, { "epoch": 0.27469135802469136, "grad_norm": 0.07352706789970398, "learning_rate": 9.626432575657834e-06, "loss": 0.4254, "step": 89 }, { "epoch": 0.2777777777777778, "grad_norm": 0.10939712822437286, "learning_rate": 9.617038685946578e-06, "loss": 0.3768, "step": 90 }, { "epoch": 0.2808641975308642, "grad_norm": 0.0766228511929512, "learning_rate": 9.60753284702772e-06, "loss": 0.3562, "step": 91 }, { "epoch": 0.2839506172839506, "grad_norm": 0.08354140818119049, "learning_rate": 9.597915289389067e-06, "loss": 0.4783, "step": 92 }, { "epoch": 0.28703703703703703, "grad_norm": 0.08200543373823166, "learning_rate": 9.58818624622727e-06, "loss": 0.3947, "step": 93 }, { "epoch": 0.29012345679012347, "grad_norm": 0.08410683274269104, "learning_rate": 9.578345953442163e-06, "loss": 0.5048, "step": 94 }, { "epoch": 0.2932098765432099, "grad_norm": 0.1019473522901535, "learning_rate": 9.568394649631055e-06, "loss": 0.5842, "step": 95 }, { "epoch": 0.2962962962962963, "grad_norm": 0.08855041116476059, "learning_rate": 9.558332576082925e-06, "loss": 0.4176, "step": 96 }, { "epoch": 0.2993827160493827, "grad_norm": 0.08165948837995529, "learning_rate": 9.548159976772593e-06, "loss": 0.4098, "step": 97 }, { "epoch": 0.30246913580246915, "grad_norm": 0.07580746710300446, "learning_rate": 9.537877098354787e-06, "loss": 0.3886, "step": 98 }, { "epoch": 0.3055555555555556, "grad_norm": 0.0938824713230133, "learning_rate": 9.527484190158171e-06, "loss": 0.4551, "step": 99 }, { "epoch": 0.30864197530864196, "grad_norm": 0.07878723740577698, "learning_rate": 9.5169815041793e-06, "loss": 0.4042, "step": 100 }, { "epoch": 0.3117283950617284, "grad_norm": 0.07207982987165451, "learning_rate": 9.506369295076505e-06, "loss": 0.3541, "step": 101 }, { "epoch": 0.3148148148148148, "grad_norm": 0.06538520753383636, "learning_rate": 9.495647820163725e-06, "loss": 0.2972, "step": 102 }, { "epoch": 0.31790123456790126, "grad_norm": 0.08196717500686646, "learning_rate": 9.484817339404261e-06, "loss": 0.401, "step": 103 }, { "epoch": 0.32098765432098764, "grad_norm": 0.07677263766527176, "learning_rate": 9.473878115404477e-06, "loss": 0.4073, "step": 104 }, { "epoch": 0.32407407407407407, "grad_norm": 0.11730651557445526, "learning_rate": 9.462830413407427e-06, "loss": 0.4501, "step": 105 }, { "epoch": 0.3271604938271605, "grad_norm": 0.06849709898233414, "learning_rate": 9.451674501286436e-06, "loss": 0.2538, "step": 106 }, { "epoch": 0.33024691358024694, "grad_norm": 0.09413019567728043, "learning_rate": 9.440410649538592e-06, "loss": 0.4646, "step": 107 }, { "epoch": 0.3333333333333333, "grad_norm": 0.15361227095127106, "learning_rate": 9.42903913127819e-06, "loss": 0.5303, "step": 108 }, { "epoch": 0.33641975308641975, "grad_norm": 0.08900155127048492, "learning_rate": 9.417560222230115e-06, "loss": 0.383, "step": 109 }, { "epoch": 0.3395061728395062, "grad_norm": 0.07807417958974838, "learning_rate": 9.405974200723156e-06, "loss": 0.3673, "step": 110 }, { "epoch": 0.3425925925925926, "grad_norm": 0.1323561668395996, "learning_rate": 9.394281347683247e-06, "loss": 0.597, "step": 111 }, { "epoch": 0.345679012345679, "grad_norm": 0.11236107349395752, "learning_rate": 9.382481946626673e-06, "loss": 0.5051, "step": 112 }, { "epoch": 0.3487654320987654, "grad_norm": 0.09908317029476166, "learning_rate": 9.370576283653178e-06, "loss": 0.3208, "step": 113 }, { "epoch": 0.35185185185185186, "grad_norm": 0.08509659022092819, "learning_rate": 9.358564647439037e-06, "loss": 0.3801, "step": 114 }, { "epoch": 0.3549382716049383, "grad_norm": 0.05896300822496414, "learning_rate": 9.34644732923006e-06, "loss": 0.2217, "step": 115 }, { "epoch": 0.35802469135802467, "grad_norm": 0.06763949990272522, "learning_rate": 9.33422462283452e-06, "loss": 0.3583, "step": 116 }, { "epoch": 0.3611111111111111, "grad_norm": 0.0857081338763237, "learning_rate": 9.321896824616036e-06, "loss": 0.4122, "step": 117 }, { "epoch": 0.36419753086419754, "grad_norm": 0.07149571180343628, "learning_rate": 9.309464233486386e-06, "loss": 0.2959, "step": 118 }, { "epoch": 0.36728395061728397, "grad_norm": 0.09094710648059845, "learning_rate": 9.29692715089826e-06, "loss": 0.3633, "step": 119 }, { "epoch": 0.37037037037037035, "grad_norm": 0.07034748792648315, "learning_rate": 9.284285880837947e-06, "loss": 0.2826, "step": 120 }, { "epoch": 0.3734567901234568, "grad_norm": 0.0919278934597969, "learning_rate": 9.271540729817969e-06, "loss": 0.389, "step": 121 }, { "epoch": 0.3765432098765432, "grad_norm": 0.07186863571405411, "learning_rate": 9.258692006869644e-06, "loss": 0.296, "step": 122 }, { "epoch": 0.37962962962962965, "grad_norm": 0.09665773808956146, "learning_rate": 9.245740023535596e-06, "loss": 0.4324, "step": 123 }, { "epoch": 0.38271604938271603, "grad_norm": 0.08115452527999878, "learning_rate": 9.232685093862206e-06, "loss": 0.3555, "step": 124 }, { "epoch": 0.38580246913580246, "grad_norm": 0.07702954113483429, "learning_rate": 9.219527534391983e-06, "loss": 0.3385, "step": 125 }, { "epoch": 0.3888888888888889, "grad_norm": 0.10876493901014328, "learning_rate": 9.206267664155906e-06, "loss": 0.4446, "step": 126 }, { "epoch": 0.39197530864197533, "grad_norm": 0.07764764875173569, "learning_rate": 9.192905804665677e-06, "loss": 0.369, "step": 127 }, { "epoch": 0.3950617283950617, "grad_norm": 0.10887006670236588, "learning_rate": 9.179442279905927e-06, "loss": 0.4297, "step": 128 }, { "epoch": 0.39814814814814814, "grad_norm": 0.10183979570865631, "learning_rate": 9.165877416326365e-06, "loss": 0.5906, "step": 129 }, { "epoch": 0.4012345679012346, "grad_norm": 0.07278673350811005, "learning_rate": 9.152211542833856e-06, "loss": 0.3017, "step": 130 }, { "epoch": 0.404320987654321, "grad_norm": 0.08892305195331573, "learning_rate": 9.138444990784455e-06, "loss": 0.3919, "step": 131 }, { "epoch": 0.4074074074074074, "grad_norm": 0.0926053375005722, "learning_rate": 9.124578093975358e-06, "loss": 0.4833, "step": 132 }, { "epoch": 0.4104938271604938, "grad_norm": 0.1312541514635086, "learning_rate": 9.110611188636828e-06, "loss": 0.4139, "step": 133 }, { "epoch": 0.41358024691358025, "grad_norm": 0.07399484515190125, "learning_rate": 9.096544613424026e-06, "loss": 0.3156, "step": 134 }, { "epoch": 0.4166666666666667, "grad_norm": 0.0757204219698906, "learning_rate": 9.082378709408805e-06, "loss": 0.3355, "step": 135 }, { "epoch": 0.41975308641975306, "grad_norm": 0.08242496103048325, "learning_rate": 9.068113820071447e-06, "loss": 0.3647, "step": 136 }, { "epoch": 0.4228395061728395, "grad_norm": 0.08191465586423874, "learning_rate": 9.053750291292321e-06, "loss": 0.3801, "step": 137 }, { "epoch": 0.42592592592592593, "grad_norm": 0.08579788357019424, "learning_rate": 9.039288471343505e-06, "loss": 0.4375, "step": 138 }, { "epoch": 0.42901234567901236, "grad_norm": 0.09289571642875671, "learning_rate": 9.024728710880345e-06, "loss": 0.3733, "step": 139 }, { "epoch": 0.43209876543209874, "grad_norm": 0.09474348276853561, "learning_rate": 9.010071362932945e-06, "loss": 0.5004, "step": 140 }, { "epoch": 0.4351851851851852, "grad_norm": 0.09607541561126709, "learning_rate": 8.995316782897605e-06, "loss": 0.3496, "step": 141 }, { "epoch": 0.4382716049382716, "grad_norm": 0.08354438096284866, "learning_rate": 8.98046532852822e-06, "loss": 0.3528, "step": 142 }, { "epoch": 0.44135802469135804, "grad_norm": 0.08367566019296646, "learning_rate": 8.965517359927583e-06, "loss": 0.3365, "step": 143 }, { "epoch": 0.4444444444444444, "grad_norm": 0.08424922823905945, "learning_rate": 8.950473239538672e-06, "loss": 0.3636, "step": 144 }, { "epoch": 0.44753086419753085, "grad_norm": 0.07770823687314987, "learning_rate": 8.935333332135853e-06, "loss": 0.2757, "step": 145 }, { "epoch": 0.4506172839506173, "grad_norm": 0.08803431689739227, "learning_rate": 8.920098004816035e-06, "loss": 0.3397, "step": 146 }, { "epoch": 0.4537037037037037, "grad_norm": 0.11619243025779724, "learning_rate": 8.904767626989774e-06, "loss": 0.4058, "step": 147 }, { "epoch": 0.4567901234567901, "grad_norm": 0.08595902472734451, "learning_rate": 8.88934257037231e-06, "loss": 0.3447, "step": 148 }, { "epoch": 0.45987654320987653, "grad_norm": 0.08116041868925095, "learning_rate": 8.873823208974557e-06, "loss": 0.3578, "step": 149 }, { "epoch": 0.46296296296296297, "grad_norm": 0.13053898513317108, "learning_rate": 8.85820991909404e-06, "loss": 0.5429, "step": 150 }, { "epoch": 0.4660493827160494, "grad_norm": 0.08137528598308563, "learning_rate": 8.842503079305757e-06, "loss": 0.3078, "step": 151 }, { "epoch": 0.4691358024691358, "grad_norm": 0.0843534767627716, "learning_rate": 8.826703070453014e-06, "loss": 0.3807, "step": 152 }, { "epoch": 0.4722222222222222, "grad_norm": 0.13925758004188538, "learning_rate": 8.810810275638183e-06, "loss": 0.4771, "step": 153 }, { "epoch": 0.47530864197530864, "grad_norm": 0.08117470145225525, "learning_rate": 8.794825080213415e-06, "loss": 0.3197, "step": 154 }, { "epoch": 0.4783950617283951, "grad_norm": 0.07650022953748703, "learning_rate": 8.778747871771293e-06, "loss": 0.2993, "step": 155 }, { "epoch": 0.48148148148148145, "grad_norm": 0.09445349872112274, "learning_rate": 8.76257904013544e-06, "loss": 0.3641, "step": 156 }, { "epoch": 0.4845679012345679, "grad_norm": 0.097043976187706, "learning_rate": 8.746318977351066e-06, "loss": 0.4181, "step": 157 }, { "epoch": 0.4876543209876543, "grad_norm": 0.1167394146323204, "learning_rate": 8.729968077675454e-06, "loss": 0.5277, "step": 158 }, { "epoch": 0.49074074074074076, "grad_norm": 0.08402277529239655, "learning_rate": 8.713526737568415e-06, "loss": 0.2867, "step": 159 }, { "epoch": 0.49382716049382713, "grad_norm": 0.09060430526733398, "learning_rate": 8.696995355682656e-06, "loss": 0.3219, "step": 160 }, { "epoch": 0.49691358024691357, "grad_norm": 0.1259710192680359, "learning_rate": 8.680374332854134e-06, "loss": 0.5394, "step": 161 }, { "epoch": 0.5, "grad_norm": 0.09654678404331207, "learning_rate": 8.663664072092324e-06, "loss": 0.3679, "step": 162 }, { "epoch": 0.5, "eval_loss": 0.5044411420822144, "eval_runtime": 44.4479, "eval_samples_per_second": 8.279, "eval_steps_per_second": 1.035, "step": 162 }, { "epoch": 0.5030864197530864, "grad_norm": 0.13062100112438202, "learning_rate": 8.646864978570445e-06, "loss": 0.38, "step": 163 }, { "epoch": 0.5061728395061729, "grad_norm": 0.11305861920118332, "learning_rate": 8.629977459615655e-06, "loss": 0.3435, "step": 164 }, { "epoch": 0.5092592592592593, "grad_norm": 0.07454624772071838, "learning_rate": 8.613001924699146e-06, "loss": 0.2768, "step": 165 }, { "epoch": 0.5123456790123457, "grad_norm": 0.08615926653146744, "learning_rate": 8.595938785426241e-06, "loss": 0.3404, "step": 166 }, { "epoch": 0.5154320987654321, "grad_norm": 0.09183604270219803, "learning_rate": 8.578788455526398e-06, "loss": 0.3493, "step": 167 }, { "epoch": 0.5185185185185185, "grad_norm": 0.08047281205654144, "learning_rate": 8.561551350843185e-06, "loss": 0.3271, "step": 168 }, { "epoch": 0.5216049382716049, "grad_norm": 0.08007708936929703, "learning_rate": 8.544227889324199e-06, "loss": 0.2844, "step": 169 }, { "epoch": 0.5246913580246914, "grad_norm": 0.08152032643556595, "learning_rate": 8.526818491010922e-06, "loss": 0.3033, "step": 170 }, { "epoch": 0.5277777777777778, "grad_norm": 0.10703514516353607, "learning_rate": 8.509323578028547e-06, "loss": 0.4296, "step": 171 }, { "epoch": 0.5308641975308642, "grad_norm": 0.07901628315448761, "learning_rate": 8.491743574575743e-06, "loss": 0.29, "step": 172 }, { "epoch": 0.5339506172839507, "grad_norm": 0.09099699556827545, "learning_rate": 8.474078906914359e-06, "loss": 0.3021, "step": 173 }, { "epoch": 0.5370370370370371, "grad_norm": 0.0866774320602417, "learning_rate": 8.456330003359093e-06, "loss": 0.2633, "step": 174 }, { "epoch": 0.5401234567901234, "grad_norm": 0.10114055871963501, "learning_rate": 8.438497294267117e-06, "loss": 0.3735, "step": 175 }, { "epoch": 0.5432098765432098, "grad_norm": 0.1260298639535904, "learning_rate": 8.420581212027625e-06, "loss": 0.4687, "step": 176 }, { "epoch": 0.5462962962962963, "grad_norm": 0.1004004031419754, "learning_rate": 8.402582191051365e-06, "loss": 0.29, "step": 177 }, { "epoch": 0.5493827160493827, "grad_norm": 0.08794572949409485, "learning_rate": 8.38450066776009e-06, "loss": 0.3589, "step": 178 }, { "epoch": 0.5524691358024691, "grad_norm": 0.10174311697483063, "learning_rate": 8.36633708057599e-06, "loss": 0.3832, "step": 179 }, { "epoch": 0.5555555555555556, "grad_norm": 0.11463697254657745, "learning_rate": 8.348091869911054e-06, "loss": 0.4172, "step": 180 }, { "epoch": 0.558641975308642, "grad_norm": 0.11808864772319794, "learning_rate": 8.329765478156394e-06, "loss": 0.494, "step": 181 }, { "epoch": 0.5617283950617284, "grad_norm": 0.11152324080467224, "learning_rate": 8.311358349671516e-06, "loss": 0.3973, "step": 182 }, { "epoch": 0.5648148148148148, "grad_norm": 0.09295979887247086, "learning_rate": 8.292870930773551e-06, "loss": 0.3696, "step": 183 }, { "epoch": 0.5679012345679012, "grad_norm": 0.10292661935091019, "learning_rate": 8.274303669726427e-06, "loss": 0.3408, "step": 184 }, { "epoch": 0.5709876543209876, "grad_norm": 0.10190277546644211, "learning_rate": 8.255657016729997e-06, "loss": 0.3513, "step": 185 }, { "epoch": 0.5740740740740741, "grad_norm": 0.08307984471321106, "learning_rate": 8.23693142390914e-06, "loss": 0.2577, "step": 186 }, { "epoch": 0.5771604938271605, "grad_norm": 0.11023180931806564, "learning_rate": 8.218127345302775e-06, "loss": 0.4168, "step": 187 }, { "epoch": 0.5802469135802469, "grad_norm": 0.10529080033302307, "learning_rate": 8.199245236852871e-06, "loss": 0.4223, "step": 188 }, { "epoch": 0.5833333333333334, "grad_norm": 0.14696502685546875, "learning_rate": 8.180285556393384e-06, "loss": 0.5283, "step": 189 }, { "epoch": 0.5864197530864198, "grad_norm": 0.15351015329360962, "learning_rate": 8.161248763639154e-06, "loss": 0.5173, "step": 190 }, { "epoch": 0.5895061728395061, "grad_norm": 0.10003789514303207, "learning_rate": 8.142135320174758e-06, "loss": 0.3617, "step": 191 }, { "epoch": 0.5925925925925926, "grad_norm": 0.09017117321491241, "learning_rate": 8.122945689443328e-06, "loss": 0.2601, "step": 192 }, { "epoch": 0.595679012345679, "grad_norm": 0.11840925365686417, "learning_rate": 8.1036803367353e-06, "loss": 0.4291, "step": 193 }, { "epoch": 0.5987654320987654, "grad_norm": 0.09116993844509125, "learning_rate": 8.084339729177142e-06, "loss": 0.3061, "step": 194 }, { "epoch": 0.6018518518518519, "grad_norm": 0.11056546866893768, "learning_rate": 8.064924335720023e-06, "loss": 0.3712, "step": 195 }, { "epoch": 0.6049382716049383, "grad_norm": 0.10576466470956802, "learning_rate": 8.045434627128446e-06, "loss": 0.3591, "step": 196 }, { "epoch": 0.6080246913580247, "grad_norm": 0.09751347452402115, "learning_rate": 8.025871075968828e-06, "loss": 0.3268, "step": 197 }, { "epoch": 0.6111111111111112, "grad_norm": 0.11890437453985214, "learning_rate": 8.006234156598043e-06, "loss": 0.3256, "step": 198 }, { "epoch": 0.6141975308641975, "grad_norm": 0.12418389320373535, "learning_rate": 7.986524345151924e-06, "loss": 0.5357, "step": 199 }, { "epoch": 0.6172839506172839, "grad_norm": 0.11261377483606339, "learning_rate": 7.966742119533724e-06, "loss": 0.4537, "step": 200 }, { "epoch": 0.6203703703703703, "grad_norm": 0.12626801431179047, "learning_rate": 7.946887959402504e-06, "loss": 0.3786, "step": 201 }, { "epoch": 0.6234567901234568, "grad_norm": 0.12130914628505707, "learning_rate": 7.926962346161535e-06, "loss": 0.4564, "step": 202 }, { "epoch": 0.6265432098765432, "grad_norm": 0.10559491068124771, "learning_rate": 7.9069657629466e-06, "loss": 0.3984, "step": 203 }, { "epoch": 0.6296296296296297, "grad_norm": 0.11549825966358185, "learning_rate": 7.886898694614292e-06, "loss": 0.4251, "step": 204 }, { "epoch": 0.6327160493827161, "grad_norm": 0.10902281850576401, "learning_rate": 7.866761627730253e-06, "loss": 0.4012, "step": 205 }, { "epoch": 0.6358024691358025, "grad_norm": 0.11586394906044006, "learning_rate": 7.846555050557381e-06, "loss": 0.3586, "step": 206 }, { "epoch": 0.6388888888888888, "grad_norm": 0.10988422483205795, "learning_rate": 7.826279453043985e-06, "loss": 0.4294, "step": 207 }, { "epoch": 0.6419753086419753, "grad_norm": 0.1205698624253273, "learning_rate": 7.805935326811913e-06, "loss": 0.4782, "step": 208 }, { "epoch": 0.6450617283950617, "grad_norm": 0.08950233459472656, "learning_rate": 7.78552316514462e-06, "loss": 0.2901, "step": 209 }, { "epoch": 0.6481481481481481, "grad_norm": 0.13640360534191132, "learning_rate": 7.765043462975217e-06, "loss": 0.4403, "step": 210 }, { "epoch": 0.6512345679012346, "grad_norm": 0.13739749789237976, "learning_rate": 7.744496716874472e-06, "loss": 0.472, "step": 211 }, { "epoch": 0.654320987654321, "grad_norm": 0.10840674489736557, "learning_rate": 7.723883425038759e-06, "loss": 0.3961, "step": 212 }, { "epoch": 0.6574074074074074, "grad_norm": 0.11287008225917816, "learning_rate": 7.703204087277989e-06, "loss": 0.4169, "step": 213 }, { "epoch": 0.6604938271604939, "grad_norm": 0.1013006791472435, "learning_rate": 7.682459205003484e-06, "loss": 0.3537, "step": 214 }, { "epoch": 0.6635802469135802, "grad_norm": 0.12204479426145554, "learning_rate": 7.661649281215823e-06, "loss": 0.3444, "step": 215 }, { "epoch": 0.6666666666666666, "grad_norm": 0.1041225790977478, "learning_rate": 7.640774820492647e-06, "loss": 0.3432, "step": 216 }, { "epoch": 0.6697530864197531, "grad_norm": 0.12317519634962082, "learning_rate": 7.619836328976416e-06, "loss": 0.4119, "step": 217 }, { "epoch": 0.6728395061728395, "grad_norm": 0.15862716734409332, "learning_rate": 7.598834314362151e-06, "loss": 0.3585, "step": 218 }, { "epoch": 0.6759259259259259, "grad_norm": 0.10013571381568909, "learning_rate": 7.57776928588511e-06, "loss": 0.3589, "step": 219 }, { "epoch": 0.6790123456790124, "grad_norm": 0.11820396035909653, "learning_rate": 7.556641754308447e-06, "loss": 0.2838, "step": 220 }, { "epoch": 0.6820987654320988, "grad_norm": 0.08206115663051605, "learning_rate": 7.535452231910829e-06, "loss": 0.1639, "step": 221 }, { "epoch": 0.6851851851851852, "grad_norm": 0.13305512070655823, "learning_rate": 7.514201232474012e-06, "loss": 0.3923, "step": 222 }, { "epoch": 0.6882716049382716, "grad_norm": 0.1208796426653862, "learning_rate": 7.492889271270382e-06, "loss": 0.3698, "step": 223 }, { "epoch": 0.691358024691358, "grad_norm": 0.11946754902601242, "learning_rate": 7.471516865050468e-06, "loss": 0.3797, "step": 224 }, { "epoch": 0.6944444444444444, "grad_norm": 0.08816403150558472, "learning_rate": 7.450084532030402e-06, "loss": 0.2238, "step": 225 }, { "epoch": 0.6975308641975309, "grad_norm": 0.12045780569314957, "learning_rate": 7.428592791879361e-06, "loss": 0.3699, "step": 226 }, { "epoch": 0.7006172839506173, "grad_norm": 0.11096329241991043, "learning_rate": 7.407042165706969e-06, "loss": 0.362, "step": 227 }, { "epoch": 0.7037037037037037, "grad_norm": 0.14540982246398926, "learning_rate": 7.385433176050654e-06, "loss": 0.4543, "step": 228 }, { "epoch": 0.7067901234567902, "grad_norm": 0.11663732677698135, "learning_rate": 7.36376634686298e-06, "loss": 0.4606, "step": 229 }, { "epoch": 0.7098765432098766, "grad_norm": 0.11102988570928574, "learning_rate": 7.342042203498952e-06, "loss": 0.3526, "step": 230 }, { "epoch": 0.7129629629629629, "grad_norm": 0.11012902110815048, "learning_rate": 7.320261272703259e-06, "loss": 0.4337, "step": 231 }, { "epoch": 0.7160493827160493, "grad_norm": 0.09911687672138214, "learning_rate": 7.298424082597526e-06, "loss": 0.2504, "step": 232 }, { "epoch": 0.7191358024691358, "grad_norm": 0.13727596402168274, "learning_rate": 7.276531162667484e-06, "loss": 0.4725, "step": 233 }, { "epoch": 0.7222222222222222, "grad_norm": 0.10461889952421188, "learning_rate": 7.254583043750152e-06, "loss": 0.3202, "step": 234 }, { "epoch": 0.7253086419753086, "grad_norm": 0.18260876834392548, "learning_rate": 7.232580258020952e-06, "loss": 0.4248, "step": 235 }, { "epoch": 0.7283950617283951, "grad_norm": 0.13938364386558533, "learning_rate": 7.210523338980814e-06, "loss": 0.2602, "step": 236 }, { "epoch": 0.7314814814814815, "grad_norm": 0.11910004913806915, "learning_rate": 7.1884128214432366e-06, "loss": 0.4185, "step": 237 }, { "epoch": 0.7345679012345679, "grad_norm": 0.10073763877153397, "learning_rate": 7.1662492415213194e-06, "loss": 0.2697, "step": 238 }, { "epoch": 0.7376543209876543, "grad_norm": 0.11307626962661743, "learning_rate": 7.14403313661476e-06, "loss": 0.4232, "step": 239 }, { "epoch": 0.7407407407407407, "grad_norm": 0.10806172341108322, "learning_rate": 7.1217650453968335e-06, "loss": 0.2928, "step": 240 }, { "epoch": 0.7438271604938271, "grad_norm": 0.14010940492153168, "learning_rate": 7.099445507801324e-06, "loss": 0.3915, "step": 241 }, { "epoch": 0.7469135802469136, "grad_norm": 0.09002690017223358, "learning_rate": 7.0770750650094335e-06, "loss": 0.2801, "step": 242 }, { "epoch": 0.75, "grad_norm": 0.11942241340875626, "learning_rate": 7.0546542594366605e-06, "loss": 0.4149, "step": 243 }, { "epoch": 0.75, "eval_loss": 0.4767835736274719, "eval_runtime": 44.3688, "eval_samples_per_second": 8.294, "eval_steps_per_second": 1.037, "step": 243 }, { "epoch": 0.7530864197530864, "grad_norm": 0.16698460280895233, "learning_rate": 7.03218363471965e-06, "loss": 0.4605, "step": 244 }, { "epoch": 0.7561728395061729, "grad_norm": 0.12310118973255157, "learning_rate": 7.0096637357030105e-06, "loss": 0.4328, "step": 245 }, { "epoch": 0.7592592592592593, "grad_norm": 0.11915367841720581, "learning_rate": 6.987095108426102e-06, "loss": 0.3907, "step": 246 }, { "epoch": 0.7623456790123457, "grad_norm": 0.1066504493355751, "learning_rate": 6.964478300109796e-06, "loss": 0.3148, "step": 247 }, { "epoch": 0.7654320987654321, "grad_norm": 0.09711527079343796, "learning_rate": 6.94181385914321e-06, "loss": 0.2736, "step": 248 }, { "epoch": 0.7685185185185185, "grad_norm": 0.08204776048660278, "learning_rate": 6.91910233507041e-06, "loss": 0.1607, "step": 249 }, { "epoch": 0.7716049382716049, "grad_norm": 0.13877205550670624, "learning_rate": 6.896344278577083e-06, "loss": 0.3763, "step": 250 }, { "epoch": 0.7746913580246914, "grad_norm": 0.11828643828630447, "learning_rate": 6.873540241477189e-06, "loss": 0.4063, "step": 251 }, { "epoch": 0.7777777777777778, "grad_norm": 0.13950656354427338, "learning_rate": 6.850690776699574e-06, "loss": 0.4348, "step": 252 }, { "epoch": 0.7808641975308642, "grad_norm": 0.13861550390720367, "learning_rate": 6.8277964382745675e-06, "loss": 0.4007, "step": 253 }, { "epoch": 0.7839506172839507, "grad_norm": 0.12502089142799377, "learning_rate": 6.804857781320558e-06, "loss": 0.4157, "step": 254 }, { "epoch": 0.7870370370370371, "grad_norm": 0.1129172146320343, "learning_rate": 6.781875362030512e-06, "loss": 0.3087, "step": 255 }, { "epoch": 0.7901234567901234, "grad_norm": 0.18749450147151947, "learning_rate": 6.758849737658508e-06, "loss": 0.381, "step": 256 }, { "epoch": 0.7932098765432098, "grad_norm": 0.11505936086177826, "learning_rate": 6.735781466506216e-06, "loss": 0.3639, "step": 257 }, { "epoch": 0.7962962962962963, "grad_norm": 0.13606995344161987, "learning_rate": 6.712671107909359e-06, "loss": 0.4504, "step": 258 }, { "epoch": 0.7993827160493827, "grad_norm": 0.13360187411308289, "learning_rate": 6.6895192222241534e-06, "loss": 0.4113, "step": 259 }, { "epoch": 0.8024691358024691, "grad_norm": 0.1227497085928917, "learning_rate": 6.666326370813722e-06, "loss": 0.3156, "step": 260 }, { "epoch": 0.8055555555555556, "grad_norm": 0.1294088065624237, "learning_rate": 6.643093116034486e-06, "loss": 0.2544, "step": 261 }, { "epoch": 0.808641975308642, "grad_norm": 0.11842790246009827, "learning_rate": 6.619820021222518e-06, "loss": 0.2796, "step": 262 }, { "epoch": 0.8117283950617284, "grad_norm": 0.11302869021892548, "learning_rate": 6.5965076506799e-06, "loss": 0.3225, "step": 263 }, { "epoch": 0.8148148148148148, "grad_norm": 0.1153462752699852, "learning_rate": 6.573156569661026e-06, "loss": 0.3168, "step": 264 }, { "epoch": 0.8179012345679012, "grad_norm": 0.14865292608737946, "learning_rate": 6.549767344358903e-06, "loss": 0.3793, "step": 265 }, { "epoch": 0.8209876543209876, "grad_norm": 0.18601423501968384, "learning_rate": 6.526340541891418e-06, "loss": 0.383, "step": 266 }, { "epoch": 0.8240740740740741, "grad_norm": 0.11983994394540787, "learning_rate": 6.5028767302875974e-06, "loss": 0.3366, "step": 267 }, { "epoch": 0.8271604938271605, "grad_norm": 0.11204046756029129, "learning_rate": 6.479376478473822e-06, "loss": 0.2842, "step": 268 }, { "epoch": 0.8302469135802469, "grad_norm": 0.12731367349624634, "learning_rate": 6.455840356260041e-06, "loss": 0.3664, "step": 269 }, { "epoch": 0.8333333333333334, "grad_norm": 0.12762831151485443, "learning_rate": 6.432268934325947e-06, "loss": 0.4333, "step": 270 }, { "epoch": 0.8364197530864198, "grad_norm": 0.1425330489873886, "learning_rate": 6.408662784207149e-06, "loss": 0.283, "step": 271 }, { "epoch": 0.8395061728395061, "grad_norm": 0.1323920488357544, "learning_rate": 6.385022478281307e-06, "loss": 0.4108, "step": 272 }, { "epoch": 0.8425925925925926, "grad_norm": 0.1550484001636505, "learning_rate": 6.361348589754255e-06, "loss": 0.3396, "step": 273 }, { "epoch": 0.845679012345679, "grad_norm": 0.09628990292549133, "learning_rate": 6.337641692646106e-06, "loss": 0.246, "step": 274 }, { "epoch": 0.8487654320987654, "grad_norm": 0.1477012187242508, "learning_rate": 6.313902361777327e-06, "loss": 0.4705, "step": 275 }, { "epoch": 0.8518518518518519, "grad_norm": 0.14865955710411072, "learning_rate": 6.290131172754811e-06, "loss": 0.417, "step": 276 }, { "epoch": 0.8549382716049383, "grad_norm": 0.11468877643346786, "learning_rate": 6.266328701957911e-06, "loss": 0.3683, "step": 277 }, { "epoch": 0.8580246913580247, "grad_norm": 0.1273777186870575, "learning_rate": 6.24249552652447e-06, "loss": 0.2808, "step": 278 }, { "epoch": 0.8611111111111112, "grad_norm": 0.10113878548145294, "learning_rate": 6.2186322243368236e-06, "loss": 0.3368, "step": 279 }, { "epoch": 0.8641975308641975, "grad_norm": 0.1183820515871048, "learning_rate": 6.194739374007792e-06, "loss": 0.3095, "step": 280 }, { "epoch": 0.8672839506172839, "grad_norm": 0.12614701688289642, "learning_rate": 6.170817554866646e-06, "loss": 0.3772, "step": 281 }, { "epoch": 0.8703703703703703, "grad_norm": 0.19127966463565826, "learning_rate": 6.1468673469450655e-06, "loss": 0.3179, "step": 282 }, { "epoch": 0.8734567901234568, "grad_norm": 0.14781445264816284, "learning_rate": 6.122889330963069e-06, "loss": 0.3659, "step": 283 }, { "epoch": 0.8765432098765432, "grad_norm": 0.1360250860452652, "learning_rate": 6.098884088314938e-06, "loss": 0.4211, "step": 284 }, { "epoch": 0.8796296296296297, "grad_norm": 0.1149686872959137, "learning_rate": 6.074852201055121e-06, "loss": 0.2571, "step": 285 }, { "epoch": 0.8827160493827161, "grad_norm": 0.14958076179027557, "learning_rate": 6.050794251884112e-06, "loss": 0.4164, "step": 286 }, { "epoch": 0.8858024691358025, "grad_norm": 0.12140931189060211, "learning_rate": 6.026710824134331e-06, "loss": 0.2203, "step": 287 }, { "epoch": 0.8888888888888888, "grad_norm": 0.12924239039421082, "learning_rate": 6.002602501755974e-06, "loss": 0.4255, "step": 288 }, { "epoch": 0.8919753086419753, "grad_norm": 0.1369277834892273, "learning_rate": 5.978469869302861e-06, "loss": 0.4083, "step": 289 }, { "epoch": 0.8950617283950617, "grad_norm": 0.13165542483329773, "learning_rate": 5.954313511918252e-06, "loss": 0.3317, "step": 290 }, { "epoch": 0.8981481481481481, "grad_norm": 0.16248537600040436, "learning_rate": 5.9301340153206685e-06, "loss": 0.4079, "step": 291 }, { "epoch": 0.9012345679012346, "grad_norm": 0.14584743976593018, "learning_rate": 5.905931965789688e-06, "loss": 0.3508, "step": 292 }, { "epoch": 0.904320987654321, "grad_norm": 0.15875974297523499, "learning_rate": 5.881707950151725e-06, "loss": 0.3597, "step": 293 }, { "epoch": 0.9074074074074074, "grad_norm": 0.11724277585744858, "learning_rate": 5.857462555765809e-06, "loss": 0.3152, "step": 294 }, { "epoch": 0.9104938271604939, "grad_norm": 0.12342196702957153, "learning_rate": 5.8331963705093375e-06, "loss": 0.318, "step": 295 }, { "epoch": 0.9135802469135802, "grad_norm": 0.12013120949268341, "learning_rate": 5.808909982763825e-06, "loss": 0.3951, "step": 296 }, { "epoch": 0.9166666666666666, "grad_norm": 0.10280231386423111, "learning_rate": 5.784603981400632e-06, "loss": 0.2725, "step": 297 }, { "epoch": 0.9197530864197531, "grad_norm": 0.12491166591644287, "learning_rate": 5.760278955766695e-06, "loss": 0.3837, "step": 298 }, { "epoch": 0.9228395061728395, "grad_norm": 0.11760140210390091, "learning_rate": 5.735935495670229e-06, "loss": 0.2464, "step": 299 }, { "epoch": 0.9259259259259259, "grad_norm": 0.13774855434894562, "learning_rate": 5.711574191366427e-06, "loss": 0.3504, "step": 300 }, { "epoch": 0.9290123456790124, "grad_norm": 0.09982441365718842, "learning_rate": 5.687195633543151e-06, "loss": 0.2457, "step": 301 }, { "epoch": 0.9320987654320988, "grad_norm": 0.11534377187490463, "learning_rate": 5.662800413306611e-06, "loss": 0.2951, "step": 302 }, { "epoch": 0.9351851851851852, "grad_norm": 0.100958451628685, "learning_rate": 5.6383891221670275e-06, "loss": 0.19, "step": 303 }, { "epoch": 0.9382716049382716, "grad_norm": 0.17198745906352997, "learning_rate": 5.613962352024293e-06, "loss": 0.3832, "step": 304 }, { "epoch": 0.941358024691358, "grad_norm": 0.16045625507831573, "learning_rate": 5.589520695153618e-06, "loss": 0.4173, "step": 305 }, { "epoch": 0.9444444444444444, "grad_norm": 0.12690144777297974, "learning_rate": 5.5650647441911706e-06, "loss": 0.3318, "step": 306 }, { "epoch": 0.9475308641975309, "grad_norm": 0.12933467328548431, "learning_rate": 5.540595092119709e-06, "loss": 0.3169, "step": 307 }, { "epoch": 0.9506172839506173, "grad_norm": 0.1863582581281662, "learning_rate": 5.516112332254203e-06, "loss": 0.3925, "step": 308 }, { "epoch": 0.9537037037037037, "grad_norm": 0.15057547390460968, "learning_rate": 5.491617058227443e-06, "loss": 0.4953, "step": 309 }, { "epoch": 0.9567901234567902, "grad_norm": 0.159704327583313, "learning_rate": 5.46710986397565e-06, "loss": 0.3831, "step": 310 }, { "epoch": 0.9598765432098766, "grad_norm": 0.0988263189792633, "learning_rate": 5.442591343724081e-06, "loss": 0.1455, "step": 311 }, { "epoch": 0.9629629629629629, "grad_norm": 0.13106189668178558, "learning_rate": 5.418062091972604e-06, "loss": 0.227, "step": 312 }, { "epoch": 0.9660493827160493, "grad_norm": 0.17571298778057098, "learning_rate": 5.393522703481303e-06, "loss": 0.4638, "step": 313 }, { "epoch": 0.9691358024691358, "grad_norm": 0.12073665857315063, "learning_rate": 5.36897377325604e-06, "loss": 0.2587, "step": 314 }, { "epoch": 0.9722222222222222, "grad_norm": 0.08656695485115051, "learning_rate": 5.344415896534039e-06, "loss": 0.2088, "step": 315 }, { "epoch": 0.9753086419753086, "grad_norm": 0.1401841789484024, "learning_rate": 5.319849668769449e-06, "loss": 0.3667, "step": 316 }, { "epoch": 0.9783950617283951, "grad_norm": 0.1650845855474472, "learning_rate": 5.295275685618905e-06, "loss": 0.3667, "step": 317 }, { "epoch": 0.9814814814814815, "grad_norm": 0.13909409940242767, "learning_rate": 5.270694542927089e-06, "loss": 0.3811, "step": 318 }, { "epoch": 0.9845679012345679, "grad_norm": 0.11377997696399689, "learning_rate": 5.246106836712277e-06, "loss": 0.2349, "step": 319 }, { "epoch": 0.9876543209876543, "grad_norm": 0.12037783116102219, "learning_rate": 5.2215131631518945e-06, "loss": 0.2901, "step": 320 }, { "epoch": 0.9907407407407407, "grad_norm": 0.13020600378513336, "learning_rate": 5.196914118568054e-06, "loss": 0.3427, "step": 321 }, { "epoch": 0.9938271604938271, "grad_norm": 0.15103194117546082, "learning_rate": 5.1723102994130994e-06, "loss": 0.4012, "step": 322 }, { "epoch": 0.9969135802469136, "grad_norm": 0.105732262134552, "learning_rate": 5.147702302255143e-06, "loss": 0.175, "step": 323 }, { "epoch": 1.0, "grad_norm": 0.17236697673797607, "learning_rate": 5.123090723763607e-06, "loss": 0.3751, "step": 324 }, { "epoch": 1.0, "eval_loss": 0.4522034823894501, "eval_runtime": 44.5334, "eval_samples_per_second": 8.263, "eval_steps_per_second": 1.033, "step": 324 }, { "epoch": 1.0030864197530864, "grad_norm": 0.15303292870521545, "learning_rate": 5.098476160694741e-06, "loss": 0.4663, "step": 325 }, { "epoch": 1.0061728395061729, "grad_norm": 0.10959513485431671, "learning_rate": 5.073859209877167e-06, "loss": 0.2389, "step": 326 }, { "epoch": 1.0092592592592593, "grad_norm": 0.14050254225730896, "learning_rate": 5.049240468197401e-06, "loss": 0.3591, "step": 327 }, { "epoch": 1.0123456790123457, "grad_norm": 0.12712690234184265, "learning_rate": 5.0246205325853824e-06, "loss": 0.3452, "step": 328 }, { "epoch": 1.0154320987654322, "grad_norm": 0.1756986677646637, "learning_rate": 5e-06, "loss": 0.4289, "step": 329 }, { "epoch": 1.0185185185185186, "grad_norm": 0.14214292168617249, "learning_rate": 4.975379467414621e-06, "loss": 0.3695, "step": 330 }, { "epoch": 1.0030864197530864, "grad_norm": 0.1542719155550003, "learning_rate": 4.950759531802602e-06, "loss": 0.3824, "step": 331 }, { "epoch": 1.0061728395061729, "grad_norm": 0.12223492562770844, "learning_rate": 4.926140790122835e-06, "loss": 0.2753, "step": 332 }, { "epoch": 1.0092592592592593, "grad_norm": 0.12852071225643158, "learning_rate": 4.90152383930526e-06, "loss": 0.2418, "step": 333 }, { "epoch": 1.0123456790123457, "grad_norm": 0.1099737137556076, "learning_rate": 4.876909276236395e-06, "loss": 0.2964, "step": 334 }, { "epoch": 1.0154320987654322, "grad_norm": 0.1437702178955078, "learning_rate": 4.852297697744857e-06, "loss": 0.355, "step": 335 }, { "epoch": 1.0185185185185186, "grad_norm": 0.12063878774642944, "learning_rate": 4.827689700586902e-06, "loss": 0.2879, "step": 336 }, { "epoch": 1.021604938271605, "grad_norm": 0.19743777811527252, "learning_rate": 4.803085881431949e-06, "loss": 0.3412, "step": 337 }, { "epoch": 1.0246913580246915, "grad_norm": 0.22067442536354065, "learning_rate": 4.778486836848107e-06, "loss": 0.3051, "step": 338 }, { "epoch": 1.0277777777777777, "grad_norm": 0.1556781828403473, "learning_rate": 4.7538931632877254e-06, "loss": 0.3369, "step": 339 }, { "epoch": 1.0308641975308641, "grad_norm": 0.132530078291893, "learning_rate": 4.729305457072913e-06, "loss": 0.3452, "step": 340 }, { "epoch": 1.0339506172839505, "grad_norm": 0.16023634374141693, "learning_rate": 4.704724314381097e-06, "loss": 0.3887, "step": 341 }, { "epoch": 1.037037037037037, "grad_norm": 0.14671647548675537, "learning_rate": 4.680150331230552e-06, "loss": 0.3082, "step": 342 }, { "epoch": 1.0401234567901234, "grad_norm": 0.20157098770141602, "learning_rate": 4.6555841034659625e-06, "loss": 0.5004, "step": 343 }, { "epoch": 1.0432098765432098, "grad_norm": 0.14635726809501648, "learning_rate": 4.631026226743962e-06, "loss": 0.4104, "step": 344 }, { "epoch": 1.0462962962962963, "grad_norm": 0.14289334416389465, "learning_rate": 4.606477296518698e-06, "loss": 0.3206, "step": 345 }, { "epoch": 1.0493827160493827, "grad_norm": 0.14635069668293, "learning_rate": 4.581937908027397e-06, "loss": 0.2957, "step": 346 }, { "epoch": 1.0524691358024691, "grad_norm": 0.1479678899049759, "learning_rate": 4.55740865627592e-06, "loss": 0.3168, "step": 347 }, { "epoch": 1.0555555555555556, "grad_norm": 0.12210693210363388, "learning_rate": 4.532890136024351e-06, "loss": 0.2854, "step": 348 }, { "epoch": 1.058641975308642, "grad_norm": 0.16018199920654297, "learning_rate": 4.508382941772558e-06, "loss": 0.2937, "step": 349 }, { "epoch": 1.0617283950617284, "grad_norm": 0.14056287705898285, "learning_rate": 4.483887667745798e-06, "loss": 0.3246, "step": 350 }, { "epoch": 1.0648148148148149, "grad_norm": 0.14486226439476013, "learning_rate": 4.459404907880293e-06, "loss": 0.3133, "step": 351 }, { "epoch": 1.0679012345679013, "grad_norm": 0.1279231458902359, "learning_rate": 4.434935255808831e-06, "loss": 0.2219, "step": 352 }, { "epoch": 1.0709876543209877, "grad_norm": 0.16269516944885254, "learning_rate": 4.410479304846385e-06, "loss": 0.3531, "step": 353 }, { "epoch": 1.074074074074074, "grad_norm": 0.15139630436897278, "learning_rate": 4.386037647975708e-06, "loss": 0.2508, "step": 354 }, { "epoch": 1.0771604938271604, "grad_norm": 0.15115757286548615, "learning_rate": 4.361610877832974e-06, "loss": 0.3908, "step": 355 }, { "epoch": 1.0802469135802468, "grad_norm": 0.17080338299274445, "learning_rate": 4.337199586693389e-06, "loss": 0.4233, "step": 356 }, { "epoch": 1.0833333333333333, "grad_norm": 0.149905264377594, "learning_rate": 4.312804366456851e-06, "loss": 0.3354, "step": 357 }, { "epoch": 1.0864197530864197, "grad_norm": 0.2038925588130951, "learning_rate": 4.2884258086335755e-06, "loss": 0.422, "step": 358 }, { "epoch": 1.0895061728395061, "grad_norm": 0.1319386065006256, "learning_rate": 4.2640645043297715e-06, "loss": 0.2812, "step": 359 }, { "epoch": 1.0925925925925926, "grad_norm": 0.210116446018219, "learning_rate": 4.239721044233306e-06, "loss": 0.3266, "step": 360 }, { "epoch": 1.095679012345679, "grad_norm": 0.15533123910427094, "learning_rate": 4.215396018599369e-06, "loss": 0.3106, "step": 361 }, { "epoch": 1.0987654320987654, "grad_norm": 0.15208472311496735, "learning_rate": 4.191090017236177e-06, "loss": 0.3423, "step": 362 }, { "epoch": 1.1018518518518519, "grad_norm": 0.12684912979602814, "learning_rate": 4.166803629490664e-06, "loss": 0.2755, "step": 363 }, { "epoch": 1.1049382716049383, "grad_norm": 0.18555931746959686, "learning_rate": 4.142537444234192e-06, "loss": 0.4007, "step": 364 }, { "epoch": 1.1080246913580247, "grad_norm": 0.20792073011398315, "learning_rate": 4.118292049848277e-06, "loss": 0.2467, "step": 365 }, { "epoch": 1.1111111111111112, "grad_norm": 0.13857008516788483, "learning_rate": 4.094068034210313e-06, "loss": 0.3666, "step": 366 }, { "epoch": 1.1141975308641976, "grad_norm": 0.10900649428367615, "learning_rate": 4.069865984679332e-06, "loss": 0.1954, "step": 367 }, { "epoch": 1.117283950617284, "grad_norm": 0.13190750777721405, "learning_rate": 4.045686488081748e-06, "loss": 0.309, "step": 368 }, { "epoch": 1.1203703703703705, "grad_norm": 0.16032575070858002, "learning_rate": 4.021530130697141e-06, "loss": 0.3524, "step": 369 }, { "epoch": 1.123456790123457, "grad_norm": 0.14147287607192993, "learning_rate": 3.997397498244028e-06, "loss": 0.3088, "step": 370 }, { "epoch": 1.126543209876543, "grad_norm": 0.1288299709558487, "learning_rate": 3.97328917586567e-06, "loss": 0.3216, "step": 371 }, { "epoch": 1.1296296296296295, "grad_norm": 0.17235535383224487, "learning_rate": 3.9492057481158905e-06, "loss": 0.3339, "step": 372 }, { "epoch": 1.132716049382716, "grad_norm": 0.21856486797332764, "learning_rate": 3.92514779894488e-06, "loss": 0.3691, "step": 373 }, { "epoch": 1.1358024691358024, "grad_norm": 0.188248872756958, "learning_rate": 3.901115911685063e-06, "loss": 0.3879, "step": 374 }, { "epoch": 1.1388888888888888, "grad_norm": 0.17136438190937042, "learning_rate": 3.877110669036932e-06, "loss": 0.4754, "step": 375 }, { "epoch": 1.1419753086419753, "grad_norm": 0.14845937490463257, "learning_rate": 3.853132653054936e-06, "loss": 0.4178, "step": 376 }, { "epoch": 1.1450617283950617, "grad_norm": 0.14598865807056427, "learning_rate": 3.829182445133356e-06, "loss": 0.2653, "step": 377 }, { "epoch": 1.1481481481481481, "grad_norm": 0.12898695468902588, "learning_rate": 3.8052606259922097e-06, "loss": 0.2613, "step": 378 }, { "epoch": 1.1512345679012346, "grad_norm": 0.12332043796777725, "learning_rate": 3.7813677756631773e-06, "loss": 0.2803, "step": 379 }, { "epoch": 1.154320987654321, "grad_norm": 0.1356392502784729, "learning_rate": 3.75750447347553e-06, "loss": 0.4038, "step": 380 }, { "epoch": 1.1574074074074074, "grad_norm": 0.25393664836883545, "learning_rate": 3.7336712980420897e-06, "loss": 0.5067, "step": 381 }, { "epoch": 1.1604938271604939, "grad_norm": 0.12110210955142975, "learning_rate": 3.7098688272451893e-06, "loss": 0.2413, "step": 382 }, { "epoch": 1.1635802469135803, "grad_norm": 0.12632521986961365, "learning_rate": 3.6860976382226747e-06, "loss": 0.2583, "step": 383 }, { "epoch": 1.1666666666666667, "grad_norm": 0.15142959356307983, "learning_rate": 3.662358307353897e-06, "loss": 0.4542, "step": 384 }, { "epoch": 1.1697530864197532, "grad_norm": 0.11639465391635895, "learning_rate": 3.638651410245746e-06, "loss": 0.1849, "step": 385 }, { "epoch": 1.1728395061728394, "grad_norm": 0.14406833052635193, "learning_rate": 3.6149775217186954e-06, "loss": 0.3171, "step": 386 }, { "epoch": 1.175925925925926, "grad_norm": 0.1374572366476059, "learning_rate": 3.5913372157928515e-06, "loss": 0.2849, "step": 387 }, { "epoch": 1.1790123456790123, "grad_norm": 0.16935373842716217, "learning_rate": 3.5677310656740537e-06, "loss": 0.3982, "step": 388 }, { "epoch": 1.1820987654320987, "grad_norm": 0.1098417416214943, "learning_rate": 3.5441596437399596e-06, "loss": 0.2149, "step": 389 }, { "epoch": 1.1851851851851851, "grad_norm": 0.14076852798461914, "learning_rate": 3.5206235215261785e-06, "loss": 0.2685, "step": 390 }, { "epoch": 1.1882716049382716, "grad_norm": 0.12600207328796387, "learning_rate": 3.4971232697124046e-06, "loss": 0.2009, "step": 391 }, { "epoch": 1.191358024691358, "grad_norm": 0.13086476922035217, "learning_rate": 3.4736594581085837e-06, "loss": 0.3062, "step": 392 }, { "epoch": 1.1944444444444444, "grad_norm": 0.16587767004966736, "learning_rate": 3.4502326556411e-06, "loss": 0.2432, "step": 393 }, { "epoch": 1.1975308641975309, "grad_norm": 0.13524991273880005, "learning_rate": 3.4268434303389747e-06, "loss": 0.3204, "step": 394 }, { "epoch": 1.2006172839506173, "grad_norm": 0.15923044085502625, "learning_rate": 3.403492349320101e-06, "loss": 0.36, "step": 395 }, { "epoch": 1.2037037037037037, "grad_norm": 0.19655781984329224, "learning_rate": 3.380179978777482e-06, "loss": 0.4863, "step": 396 }, { "epoch": 1.2067901234567902, "grad_norm": 0.13031858205795288, "learning_rate": 3.356906883965516e-06, "loss": 0.2884, "step": 397 }, { "epoch": 1.2098765432098766, "grad_norm": 0.12421680986881256, "learning_rate": 3.33367362918628e-06, "loss": 0.1891, "step": 398 }, { "epoch": 1.212962962962963, "grad_norm": 0.15903340280056, "learning_rate": 3.3104807777758487e-06, "loss": 0.4381, "step": 399 }, { "epoch": 1.2160493827160495, "grad_norm": 0.11143235117197037, "learning_rate": 3.2873288920906436e-06, "loss": 0.2269, "step": 400 }, { "epoch": 1.2191358024691359, "grad_norm": 0.1427583545446396, "learning_rate": 3.2642185334937853e-06, "loss": 0.3874, "step": 401 }, { "epoch": 1.2222222222222223, "grad_norm": 0.21431690454483032, "learning_rate": 3.2411502623414925e-06, "loss": 0.4815, "step": 402 }, { "epoch": 1.2253086419753085, "grad_norm": 0.20369336009025574, "learning_rate": 3.2181246379694886e-06, "loss": 0.429, "step": 403 }, { "epoch": 1.228395061728395, "grad_norm": 0.21474803984165192, "learning_rate": 3.1951422186794447e-06, "loss": 0.4217, "step": 404 }, { "epoch": 1.2314814814814814, "grad_norm": 0.1690702587366104, "learning_rate": 3.1722035617254333e-06, "loss": 0.3388, "step": 405 }, { "epoch": 1.2314814814814814, "eval_loss": 0.4383295774459839, "eval_runtime": 44.45, "eval_samples_per_second": 8.279, "eval_steps_per_second": 1.035, "step": 405 }, { "epoch": 1.2345679012345678, "grad_norm": 0.13106146454811096, "learning_rate": 3.149309223300428e-06, "loss": 0.2537, "step": 406 }, { "epoch": 1.2376543209876543, "grad_norm": 0.18745112419128418, "learning_rate": 3.126459758522813e-06, "loss": 0.3825, "step": 407 }, { "epoch": 1.2407407407407407, "grad_norm": 0.1358872950077057, "learning_rate": 3.103655721422917e-06, "loss": 0.3057, "step": 408 }, { "epoch": 1.2438271604938271, "grad_norm": 0.15695077180862427, "learning_rate": 3.080897664929592e-06, "loss": 0.412, "step": 409 }, { "epoch": 1.2469135802469136, "grad_norm": 0.15740308165550232, "learning_rate": 3.0581861408567907e-06, "loss": 0.371, "step": 410 }, { "epoch": 1.25, "grad_norm": 0.17210154235363007, "learning_rate": 3.035521699890206e-06, "loss": 0.4671, "step": 411 }, { "epoch": 1.2530864197530864, "grad_norm": 0.1564391851425171, "learning_rate": 3.0129048915739013e-06, "loss": 0.397, "step": 412 }, { "epoch": 1.2561728395061729, "grad_norm": 0.15035340189933777, "learning_rate": 2.9903362642969903e-06, "loss": 0.3696, "step": 413 }, { "epoch": 1.2592592592592593, "grad_norm": 0.12334346026182175, "learning_rate": 2.967816365280351e-06, "loss": 0.2595, "step": 414 }, { "epoch": 1.2623456790123457, "grad_norm": 0.159285768866539, "learning_rate": 2.94534574056334e-06, "loss": 0.3444, "step": 415 }, { "epoch": 1.2654320987654322, "grad_norm": 0.14071713387966156, "learning_rate": 2.9229249349905686e-06, "loss": 0.264, "step": 416 }, { "epoch": 1.2685185185185186, "grad_norm": 0.17824961245059967, "learning_rate": 2.9005544921986774e-06, "loss": 0.3823, "step": 417 }, { "epoch": 1.2716049382716048, "grad_norm": 0.14212675392627716, "learning_rate": 2.8782349546031673e-06, "loss": 0.253, "step": 418 }, { "epoch": 1.2746913580246915, "grad_norm": 0.21493245661258698, "learning_rate": 2.8559668633852433e-06, "loss": 0.3181, "step": 419 }, { "epoch": 1.2777777777777777, "grad_norm": 0.14115536212921143, "learning_rate": 2.8337507584786826e-06, "loss": 0.3007, "step": 420 }, { "epoch": 1.2808641975308643, "grad_norm": 0.16807730495929718, "learning_rate": 2.811587178556764e-06, "loss": 0.271, "step": 421 }, { "epoch": 1.2839506172839505, "grad_norm": 0.19324727356433868, "learning_rate": 2.789476661019186e-06, "loss": 0.3613, "step": 422 }, { "epoch": 1.287037037037037, "grad_norm": 0.22242026031017303, "learning_rate": 2.7674197419790493e-06, "loss": 0.3391, "step": 423 }, { "epoch": 1.2901234567901234, "grad_norm": 0.1270921379327774, "learning_rate": 2.7454169562498503e-06, "loss": 0.2094, "step": 424 }, { "epoch": 1.2932098765432098, "grad_norm": 0.12505224347114563, "learning_rate": 2.723468837332517e-06, "loss": 0.2807, "step": 425 }, { "epoch": 1.2962962962962963, "grad_norm": 0.16030734777450562, "learning_rate": 2.7015759174024756e-06, "loss": 0.3266, "step": 426 }, { "epoch": 1.2993827160493827, "grad_norm": 0.1334860622882843, "learning_rate": 2.6797387272967414e-06, "loss": 0.2262, "step": 427 }, { "epoch": 1.3024691358024691, "grad_norm": 0.16829054057598114, "learning_rate": 2.65795779650105e-06, "loss": 0.3483, "step": 428 }, { "epoch": 1.3055555555555556, "grad_norm": 0.16048014163970947, "learning_rate": 2.63623365313702e-06, "loss": 0.3673, "step": 429 }, { "epoch": 1.308641975308642, "grad_norm": 0.22250574827194214, "learning_rate": 2.614566823949348e-06, "loss": 0.3418, "step": 430 }, { "epoch": 1.3117283950617284, "grad_norm": 0.13716565072536469, "learning_rate": 2.592957834293033e-06, "loss": 0.2986, "step": 431 }, { "epoch": 1.3148148148148149, "grad_norm": 0.15584644675254822, "learning_rate": 2.5714072081206407e-06, "loss": 0.3419, "step": 432 }, { "epoch": 1.3179012345679013, "grad_norm": 0.17043578624725342, "learning_rate": 2.5499154679696014e-06, "loss": 0.3133, "step": 433 }, { "epoch": 1.3209876543209877, "grad_norm": 0.1307077258825302, "learning_rate": 2.528483134949535e-06, "loss": 0.2484, "step": 434 }, { "epoch": 1.324074074074074, "grad_norm": 0.19332851469516754, "learning_rate": 2.50711072872962e-06, "loss": 0.338, "step": 435 }, { "epoch": 1.3271604938271606, "grad_norm": 0.18752485513687134, "learning_rate": 2.4857987675259887e-06, "loss": 0.3693, "step": 436 }, { "epoch": 1.3302469135802468, "grad_norm": 0.171221524477005, "learning_rate": 2.4645477680891734e-06, "loss": 0.3222, "step": 437 }, { "epoch": 1.3333333333333333, "grad_norm": 0.2540048062801361, "learning_rate": 2.4433582456915556e-06, "loss": 0.4404, "step": 438 }, { "epoch": 1.3364197530864197, "grad_norm": 0.13886091113090515, "learning_rate": 2.422230714114891e-06, "loss": 0.3246, "step": 439 }, { "epoch": 1.3395061728395061, "grad_norm": 0.11673127859830856, "learning_rate": 2.4011656856378513e-06, "loss": 0.1878, "step": 440 }, { "epoch": 1.3425925925925926, "grad_norm": 0.20191854238510132, "learning_rate": 2.3801636710235836e-06, "loss": 0.2979, "step": 441 }, { "epoch": 1.345679012345679, "grad_norm": 0.16786165535449982, "learning_rate": 2.3592251795073564e-06, "loss": 0.2931, "step": 442 }, { "epoch": 1.3487654320987654, "grad_norm": 0.1304280310869217, "learning_rate": 2.338350718784177e-06, "loss": 0.2368, "step": 443 }, { "epoch": 1.3518518518518519, "grad_norm": 0.14287714660167694, "learning_rate": 2.3175407949965167e-06, "loss": 0.286, "step": 444 }, { "epoch": 1.3549382716049383, "grad_norm": 0.13601404428482056, "learning_rate": 2.296795912722014e-06, "loss": 0.268, "step": 445 }, { "epoch": 1.3580246913580247, "grad_norm": 0.1764301061630249, "learning_rate": 2.2761165749612417e-06, "loss": 0.355, "step": 446 }, { "epoch": 1.3611111111111112, "grad_norm": 0.1622696816921234, "learning_rate": 2.25550328312553e-06, "loss": 0.3438, "step": 447 }, { "epoch": 1.3641975308641976, "grad_norm": 0.15518330037593842, "learning_rate": 2.2349565370247837e-06, "loss": 0.2844, "step": 448 }, { "epoch": 1.367283950617284, "grad_norm": 0.13542047142982483, "learning_rate": 2.214476834855382e-06, "loss": 0.324, "step": 449 }, { "epoch": 1.3703703703703702, "grad_norm": 0.20794177055358887, "learning_rate": 2.1940646731880887e-06, "loss": 0.5443, "step": 450 }, { "epoch": 1.373456790123457, "grad_norm": 0.1371917873620987, "learning_rate": 2.173720546956015e-06, "loss": 0.3663, "step": 451 }, { "epoch": 1.376543209876543, "grad_norm": 0.17952483892440796, "learning_rate": 2.1534449494426203e-06, "loss": 0.3209, "step": 452 }, { "epoch": 1.3796296296296298, "grad_norm": 0.1383998692035675, "learning_rate": 2.1332383722697483e-06, "loss": 0.2407, "step": 453 }, { "epoch": 1.382716049382716, "grad_norm": 0.17842058837413788, "learning_rate": 2.1131013053857097e-06, "loss": 0.5964, "step": 454 }, { "epoch": 1.3858024691358024, "grad_norm": 0.13012441992759705, "learning_rate": 2.0930342370534013e-06, "loss": 0.2686, "step": 455 }, { "epoch": 1.3888888888888888, "grad_norm": 0.1683279275894165, "learning_rate": 2.073037653838466e-06, "loss": 0.4134, "step": 456 }, { "epoch": 1.3919753086419753, "grad_norm": 0.18860593438148499, "learning_rate": 2.053112040597495e-06, "loss": 0.2766, "step": 457 }, { "epoch": 1.3950617283950617, "grad_norm": 0.15948981046676636, "learning_rate": 2.0332578804662783e-06, "loss": 0.452, "step": 458 }, { "epoch": 1.3981481481481481, "grad_norm": 0.13614550232887268, "learning_rate": 2.013475654848076e-06, "loss": 0.3028, "step": 459 }, { "epoch": 1.4012345679012346, "grad_norm": 0.1575852334499359, "learning_rate": 1.99376584340196e-06, "loss": 0.3772, "step": 460 }, { "epoch": 1.404320987654321, "grad_norm": 0.1815677434206009, "learning_rate": 1.9741289240311757e-06, "loss": 0.4218, "step": 461 }, { "epoch": 1.4074074074074074, "grad_norm": 0.16409048438072205, "learning_rate": 1.954565372871554e-06, "loss": 0.4449, "step": 462 }, { "epoch": 1.4104938271604939, "grad_norm": 0.17997804284095764, "learning_rate": 1.935075664279978e-06, "loss": 0.3908, "step": 463 }, { "epoch": 1.4135802469135803, "grad_norm": 0.17692823708057404, "learning_rate": 1.9156602708228584e-06, "loss": 0.3506, "step": 464 }, { "epoch": 1.4166666666666667, "grad_norm": 0.17066018283367157, "learning_rate": 1.8963196632647008e-06, "loss": 0.4187, "step": 465 }, { "epoch": 1.4197530864197532, "grad_norm": 0.17325402796268463, "learning_rate": 1.8770543105566752e-06, "loss": 0.3865, "step": 466 }, { "epoch": 1.4228395061728394, "grad_norm": 0.1373230516910553, "learning_rate": 1.8578646798252432e-06, "loss": 0.2194, "step": 467 }, { "epoch": 1.425925925925926, "grad_norm": 0.14924941956996918, "learning_rate": 1.8387512363608496e-06, "loss": 0.3415, "step": 468 }, { "epoch": 1.4290123456790123, "grad_norm": 0.15401771664619446, "learning_rate": 1.8197144436066167e-06, "loss": 0.3132, "step": 469 }, { "epoch": 1.4320987654320987, "grad_norm": 0.24441462755203247, "learning_rate": 1.8007547631471289e-06, "loss": 0.365, "step": 470 }, { "epoch": 1.4351851851851851, "grad_norm": 0.2641655206680298, "learning_rate": 1.781872654697226e-06, "loss": 0.4653, "step": 471 }, { "epoch": 1.4382716049382716, "grad_norm": 0.18639406561851501, "learning_rate": 1.7630685760908623e-06, "loss": 0.3422, "step": 472 }, { "epoch": 1.441358024691358, "grad_norm": 0.14547406136989594, "learning_rate": 1.7443429832700038e-06, "loss": 0.3541, "step": 473 }, { "epoch": 1.4444444444444444, "grad_norm": 0.179130420088768, "learning_rate": 1.7256963302735752e-06, "loss": 0.3341, "step": 474 }, { "epoch": 1.4475308641975309, "grad_norm": 0.1942981481552124, "learning_rate": 1.7071290692264492e-06, "loss": 0.392, "step": 475 }, { "epoch": 1.4506172839506173, "grad_norm": 0.10643615573644638, "learning_rate": 1.6886416503284835e-06, "loss": 0.2317, "step": 476 }, { "epoch": 1.4537037037037037, "grad_norm": 0.14966462552547455, "learning_rate": 1.6702345218436066e-06, "loss": 0.2882, "step": 477 }, { "epoch": 1.4567901234567902, "grad_norm": 0.1604948490858078, "learning_rate": 1.6519081300889472e-06, "loss": 0.3337, "step": 478 }, { "epoch": 1.4598765432098766, "grad_norm": 0.23344826698303223, "learning_rate": 1.6336629194240118e-06, "loss": 0.3655, "step": 479 }, { "epoch": 1.462962962962963, "grad_norm": 0.1553526222705841, "learning_rate": 1.6154993322399114e-06, "loss": 0.316, "step": 480 }, { "epoch": 1.4660493827160495, "grad_norm": 0.1312614530324936, "learning_rate": 1.5974178089486364e-06, "loss": 0.301, "step": 481 }, { "epoch": 1.4691358024691357, "grad_norm": 0.13480979204177856, "learning_rate": 1.5794187879723755e-06, "loss": 0.356, "step": 482 }, { "epoch": 1.4722222222222223, "grad_norm": 0.14350688457489014, "learning_rate": 1.561502705732883e-06, "loss": 0.3021, "step": 483 }, { "epoch": 1.4753086419753085, "grad_norm": 0.13871291279792786, "learning_rate": 1.543669996640908e-06, "loss": 0.4188, "step": 484 }, { "epoch": 1.4783950617283952, "grad_norm": 0.16152562201023102, "learning_rate": 1.5259210930856423e-06, "loss": 0.3632, "step": 485 }, { "epoch": 1.4814814814814814, "grad_norm": 0.17420196533203125, "learning_rate": 1.5082564254242583e-06, "loss": 0.3735, "step": 486 }, { "epoch": 1.4814814814814814, "eval_loss": 0.430364727973938, "eval_runtime": 44.4346, "eval_samples_per_second": 8.282, "eval_steps_per_second": 1.035, "step": 486 }, { "epoch": 1.4845679012345678, "grad_norm": 0.15298381447792053, "learning_rate": 1.4906764219714537e-06, "loss": 0.3162, "step": 487 }, { "epoch": 1.4876543209876543, "grad_norm": 0.17767275869846344, "learning_rate": 1.4731815089890795e-06, "loss": 0.451, "step": 488 }, { "epoch": 1.4907407407407407, "grad_norm": 0.2112477868795395, "learning_rate": 1.455772110675804e-06, "loss": 0.3914, "step": 489 }, { "epoch": 1.4938271604938271, "grad_norm": 0.18488173186779022, "learning_rate": 1.438448649156815e-06, "loss": 0.3242, "step": 490 }, { "epoch": 1.4969135802469136, "grad_norm": 0.19138255715370178, "learning_rate": 1.4212115444736024e-06, "loss": 0.3273, "step": 491 }, { "epoch": 1.5, "grad_norm": 0.17519411444664001, "learning_rate": 1.4040612145737608e-06, "loss": 0.314, "step": 492 }, { "epoch": 1.5030864197530864, "grad_norm": 0.11331440508365631, "learning_rate": 1.3869980753008537e-06, "loss": 0.2184, "step": 493 }, { "epoch": 1.5061728395061729, "grad_norm": 0.1674378216266632, "learning_rate": 1.370022540384347e-06, "loss": 0.3075, "step": 494 }, { "epoch": 1.5092592592592593, "grad_norm": 0.14736564457416534, "learning_rate": 1.353135021429554e-06, "loss": 0.3719, "step": 495 }, { "epoch": 1.5123456790123457, "grad_norm": 0.14618776738643646, "learning_rate": 1.3363359279076776e-06, "loss": 0.3625, "step": 496 }, { "epoch": 1.515432098765432, "grad_norm": 0.15497514605522156, "learning_rate": 1.3196256671458663e-06, "loss": 0.3522, "step": 497 }, { "epoch": 1.5185185185185186, "grad_norm": 0.1439277082681656, "learning_rate": 1.3030046443173445e-06, "loss": 0.2904, "step": 498 }, { "epoch": 1.5216049382716048, "grad_norm": 0.14361339807510376, "learning_rate": 1.2864732624315867e-06, "loss": 0.3338, "step": 499 }, { "epoch": 1.5246913580246915, "grad_norm": 0.1480712592601776, "learning_rate": 1.270031922324546e-06, "loss": 0.4092, "step": 500 }, { "epoch": 1.5277777777777777, "grad_norm": 0.156494140625, "learning_rate": 1.2536810226489354e-06, "loss": 0.3855, "step": 501 }, { "epoch": 1.5308641975308643, "grad_norm": 0.2111222743988037, "learning_rate": 1.237420959864561e-06, "loss": 0.4681, "step": 502 }, { "epoch": 1.5339506172839505, "grad_norm": 0.20178188383579254, "learning_rate": 1.2212521282287093e-06, "loss": 0.3472, "step": 503 }, { "epoch": 1.5370370370370372, "grad_norm": 0.14656566083431244, "learning_rate": 1.2051749197865875e-06, "loss": 0.2829, "step": 504 }, { "epoch": 1.5401234567901234, "grad_norm": 0.17030468583106995, "learning_rate": 1.1891897243618184e-06, "loss": 0.457, "step": 505 }, { "epoch": 1.5432098765432098, "grad_norm": 0.16490556299686432, "learning_rate": 1.173296929546987e-06, "loss": 0.4265, "step": 506 }, { "epoch": 1.5462962962962963, "grad_norm": 0.15814335644245148, "learning_rate": 1.1574969206942443e-06, "loss": 0.3079, "step": 507 }, { "epoch": 1.5493827160493827, "grad_norm": 0.15672267973423004, "learning_rate": 1.1417900809059623e-06, "loss": 0.2618, "step": 508 }, { "epoch": 1.5524691358024691, "grad_norm": 0.26926475763320923, "learning_rate": 1.1261767910254422e-06, "loss": 0.4501, "step": 509 }, { "epoch": 1.5555555555555556, "grad_norm": 0.22438615560531616, "learning_rate": 1.1106574296276923e-06, "loss": 0.5102, "step": 510 }, { "epoch": 1.558641975308642, "grad_norm": 0.16849224269390106, "learning_rate": 1.095232373010226e-06, "loss": 0.4356, "step": 511 }, { "epoch": 1.5617283950617284, "grad_norm": 0.15593089163303375, "learning_rate": 1.0799019951839656e-06, "loss": 0.2973, "step": 512 }, { "epoch": 1.5648148148148149, "grad_norm": 0.14039039611816406, "learning_rate": 1.0646666678641477e-06, "loss": 0.4104, "step": 513 }, { "epoch": 1.567901234567901, "grad_norm": 0.11041123420000076, "learning_rate": 1.0495267604613273e-06, "loss": 0.2541, "step": 514 }, { "epoch": 1.5709876543209877, "grad_norm": 0.1312185525894165, "learning_rate": 1.0344826400724185e-06, "loss": 0.2818, "step": 515 }, { "epoch": 1.574074074074074, "grad_norm": 0.20511452853679657, "learning_rate": 1.0195346714717813e-06, "loss": 0.3218, "step": 516 }, { "epoch": 1.5771604938271606, "grad_norm": 0.2118871957063675, "learning_rate": 1.0046832171023952e-06, "loss": 0.2921, "step": 517 }, { "epoch": 1.5802469135802468, "grad_norm": 0.18419800698757172, "learning_rate": 9.899286370670575e-07, "loss": 0.4502, "step": 518 }, { "epoch": 1.5833333333333335, "grad_norm": 0.1755116879940033, "learning_rate": 9.752712891196558e-07, "loss": 0.3514, "step": 519 }, { "epoch": 1.5864197530864197, "grad_norm": 0.16331788897514343, "learning_rate": 9.607115286564972e-07, "loss": 0.318, "step": 520 }, { "epoch": 1.5895061728395061, "grad_norm": 0.18510426580905914, "learning_rate": 9.46249708707681e-07, "loss": 0.3207, "step": 521 }, { "epoch": 1.5925925925925926, "grad_norm": 0.1467633843421936, "learning_rate": 9.318861799285539e-07, "loss": 0.32, "step": 522 }, { "epoch": 1.595679012345679, "grad_norm": 0.21128030121326447, "learning_rate": 9.176212905911946e-07, "loss": 0.4566, "step": 523 }, { "epoch": 1.5987654320987654, "grad_norm": 0.14944253861904144, "learning_rate": 9.034553865759754e-07, "loss": 0.4221, "step": 524 }, { "epoch": 1.6018518518518519, "grad_norm": 0.1913837343454361, "learning_rate": 8.893888113631732e-07, "loss": 0.3236, "step": 525 }, { "epoch": 1.6049382716049383, "grad_norm": 0.14830860495567322, "learning_rate": 8.754219060246432e-07, "loss": 0.3504, "step": 526 }, { "epoch": 1.6080246913580247, "grad_norm": 0.1303461194038391, "learning_rate": 8.615550092155478e-07, "loss": 0.2281, "step": 527 }, { "epoch": 1.6111111111111112, "grad_norm": 0.11773131787776947, "learning_rate": 8.477884571661449e-07, "loss": 0.2038, "step": 528 }, { "epoch": 1.6141975308641974, "grad_norm": 0.16557615995407104, "learning_rate": 8.341225836736367e-07, "loss": 0.2965, "step": 529 }, { "epoch": 1.617283950617284, "grad_norm": 0.15140382945537567, "learning_rate": 8.20557720094074e-07, "loss": 0.2804, "step": 530 }, { "epoch": 1.6203703703703702, "grad_norm": 0.15120923519134521, "learning_rate": 8.070941953343242e-07, "loss": 0.3037, "step": 531 }, { "epoch": 1.623456790123457, "grad_norm": 0.28693991899490356, "learning_rate": 7.937323358440935e-07, "loss": 0.4625, "step": 532 }, { "epoch": 1.626543209876543, "grad_norm": 0.226279154419899, "learning_rate": 7.804724656080182e-07, "loss": 0.3529, "step": 533 }, { "epoch": 1.6296296296296298, "grad_norm": 0.14384153485298157, "learning_rate": 7.673149061377966e-07, "loss": 0.4064, "step": 534 }, { "epoch": 1.632716049382716, "grad_norm": 0.153773695230484, "learning_rate": 7.542599764644049e-07, "loss": 0.2779, "step": 535 }, { "epoch": 1.6358024691358026, "grad_norm": 0.2235001176595688, "learning_rate": 7.413079931303591e-07, "loss": 0.4181, "step": 536 }, { "epoch": 1.6388888888888888, "grad_norm": 0.1906222552061081, "learning_rate": 7.284592701820325e-07, "loss": 0.2867, "step": 537 }, { "epoch": 1.6419753086419753, "grad_norm": 0.189738929271698, "learning_rate": 7.157141191620548e-07, "loss": 0.3274, "step": 538 }, { "epoch": 1.6450617283950617, "grad_norm": 0.15748707950115204, "learning_rate": 7.030728491017408e-07, "loss": 0.2892, "step": 539 }, { "epoch": 1.6481481481481481, "grad_norm": 0.2472158521413803, "learning_rate": 6.905357665136142e-07, "loss": 0.3892, "step": 540 }, { "epoch": 1.6512345679012346, "grad_norm": 0.18736745417118073, "learning_rate": 6.781031753839662e-07, "loss": 0.3192, "step": 541 }, { "epoch": 1.654320987654321, "grad_norm": 0.15377798676490784, "learning_rate": 6.657753771654812e-07, "loss": 0.2991, "step": 542 }, { "epoch": 1.6574074074074074, "grad_norm": 0.16992682218551636, "learning_rate": 6.535526707699408e-07, "loss": 0.3628, "step": 543 }, { "epoch": 1.6604938271604939, "grad_norm": 0.201069176197052, "learning_rate": 6.414353525609628e-07, "loss": 0.3127, "step": 544 }, { "epoch": 1.6635802469135803, "grad_norm": 0.14373762905597687, "learning_rate": 6.294237163468231e-07, "loss": 0.2488, "step": 545 }, { "epoch": 1.6666666666666665, "grad_norm": 0.16759946942329407, "learning_rate": 6.175180533733277e-07, "loss": 0.3833, "step": 546 }, { "epoch": 1.6697530864197532, "grad_norm": 0.2061176598072052, "learning_rate": 6.057186523167529e-07, "loss": 0.252, "step": 547 }, { "epoch": 1.6728395061728394, "grad_norm": 0.18383823335170746, "learning_rate": 5.940257992768456e-07, "loss": 0.3677, "step": 548 }, { "epoch": 1.675925925925926, "grad_norm": 0.2329624891281128, "learning_rate": 5.824397777698859e-07, "loss": 0.3821, "step": 549 }, { "epoch": 1.6790123456790123, "grad_norm": 0.16050845384597778, "learning_rate": 5.709608687218116e-07, "loss": 0.3203, "step": 550 }, { "epoch": 1.682098765432099, "grad_norm": 0.1575547456741333, "learning_rate": 5.595893504614097e-07, "loss": 0.4154, "step": 551 }, { "epoch": 1.6851851851851851, "grad_norm": 0.14166632294654846, "learning_rate": 5.483254987135644e-07, "loss": 0.2528, "step": 552 }, { "epoch": 1.6882716049382716, "grad_norm": 0.1413419544696808, "learning_rate": 5.371695865925736e-07, "loss": 0.2011, "step": 553 }, { "epoch": 1.691358024691358, "grad_norm": 0.14001396298408508, "learning_rate": 5.261218845955246e-07, "loss": 0.2521, "step": 554 }, { "epoch": 1.6944444444444444, "grad_norm": 0.2379157692193985, "learning_rate": 5.151826605957394e-07, "loss": 0.3396, "step": 555 }, { "epoch": 1.6975308641975309, "grad_norm": 0.1787138283252716, "learning_rate": 5.043521798362755e-07, "loss": 0.2596, "step": 556 }, { "epoch": 1.7006172839506173, "grad_norm": 0.41910964250564575, "learning_rate": 4.936307049234956e-07, "loss": 0.3327, "step": 557 }, { "epoch": 1.7037037037037037, "grad_norm": 0.1860780268907547, "learning_rate": 4.830184958207007e-07, "loss": 0.399, "step": 558 }, { "epoch": 1.7067901234567902, "grad_norm": 0.16398878395557404, "learning_rate": 4.725158098418309e-07, "loss": 0.3953, "step": 559 }, { "epoch": 1.7098765432098766, "grad_norm": 0.1744304746389389, "learning_rate": 4.6212290164521554e-07, "loss": 0.2567, "step": 560 }, { "epoch": 1.7129629629629628, "grad_norm": 0.19683323800563812, "learning_rate": 4.5184002322740784e-07, "loss": 0.4327, "step": 561 }, { "epoch": 1.7160493827160495, "grad_norm": 0.17663246393203735, "learning_rate": 4.4166742391707593e-07, "loss": 0.2145, "step": 562 }, { "epoch": 1.7191358024691357, "grad_norm": 0.16606709361076355, "learning_rate": 4.316053503689466e-07, "loss": 0.3419, "step": 563 }, { "epoch": 1.7222222222222223, "grad_norm": 0.21532438695430756, "learning_rate": 4.2165404655783836e-07, "loss": 0.379, "step": 564 }, { "epoch": 1.7253086419753085, "grad_norm": 0.1450224667787552, "learning_rate": 4.1181375377273237e-07, "loss": 0.19, "step": 565 }, { "epoch": 1.7283950617283952, "grad_norm": 0.18900087475776672, "learning_rate": 4.020847106109349e-07, "loss": 0.3304, "step": 566 }, { "epoch": 1.7314814814814814, "grad_norm": 0.1328793317079544, "learning_rate": 3.9246715297228176e-07, "loss": 0.283, "step": 567 }, { "epoch": 1.7314814814814814, "eval_loss": 0.42760223150253296, "eval_runtime": 44.2033, "eval_samples_per_second": 8.325, "eval_steps_per_second": 1.041, "step": 567 }, { "epoch": 1.734567901234568, "grad_norm": 0.14145122468471527, "learning_rate": 3.829613140534222e-07, "loss": 0.3045, "step": 568 }, { "epoch": 1.7376543209876543, "grad_norm": 0.1800602227449417, "learning_rate": 3.7356742434216775e-07, "loss": 0.2553, "step": 569 }, { "epoch": 1.7407407407407407, "grad_norm": 0.18250073492527008, "learning_rate": 3.642857116118986e-07, "loss": 0.23, "step": 570 }, { "epoch": 1.7438271604938271, "grad_norm": 0.14363303780555725, "learning_rate": 3.5511640091604293e-07, "loss": 0.2744, "step": 571 }, { "epoch": 1.7469135802469136, "grad_norm": 0.16794289648532867, "learning_rate": 3.4605971458262e-07, "loss": 0.3806, "step": 572 }, { "epoch": 1.75, "grad_norm": 0.15108714997768402, "learning_rate": 3.371158722088497e-07, "loss": 0.2868, "step": 573 }, { "epoch": 1.7530864197530864, "grad_norm": 0.2250644415616989, "learning_rate": 3.2828509065582713e-07, "loss": 0.4173, "step": 574 }, { "epoch": 1.7561728395061729, "grad_norm": 0.16634950041770935, "learning_rate": 3.195675840432655e-07, "loss": 0.3429, "step": 575 }, { "epoch": 1.7592592592592593, "grad_norm": 0.3840501010417938, "learning_rate": 3.109635637443026e-07, "loss": 0.3564, "step": 576 }, { "epoch": 1.7623456790123457, "grad_norm": 0.1317005604505539, "learning_rate": 3.02473238380378e-07, "loss": 0.2571, "step": 577 }, { "epoch": 1.765432098765432, "grad_norm": 0.16465657949447632, "learning_rate": 2.9409681381617315e-07, "loss": 0.3739, "step": 578 }, { "epoch": 1.7685185185185186, "grad_norm": 0.14124394953250885, "learning_rate": 2.858344931546181e-07, "loss": 0.2025, "step": 579 }, { "epoch": 1.7716049382716048, "grad_norm": 0.19090065360069275, "learning_rate": 2.776864767319731e-07, "loss": 0.3652, "step": 580 }, { "epoch": 1.7746913580246915, "grad_norm": 0.16761578619480133, "learning_rate": 2.696529621129618e-07, "loss": 0.3257, "step": 581 }, { "epoch": 1.7777777777777777, "grad_norm": 0.17358000576496124, "learning_rate": 2.617341440859883e-07, "loss": 0.3162, "step": 582 }, { "epoch": 1.7808641975308643, "grad_norm": 0.13688547909259796, "learning_rate": 2.539302146584116e-07, "loss": 0.2838, "step": 583 }, { "epoch": 1.7839506172839505, "grad_norm": 0.12233246117830276, "learning_rate": 2.4624136305188895e-07, "loss": 0.2656, "step": 584 }, { "epoch": 1.7870370370370372, "grad_norm": 0.14487585425376892, "learning_rate": 2.3866777569779234e-07, "loss": 0.2808, "step": 585 }, { "epoch": 1.7901234567901234, "grad_norm": 0.1593523919582367, "learning_rate": 2.3120963623267822e-07, "loss": 0.3441, "step": 586 }, { "epoch": 1.7932098765432098, "grad_norm": 0.1122526079416275, "learning_rate": 2.2386712549384848e-07, "loss": 0.1452, "step": 587 }, { "epoch": 1.7962962962962963, "grad_norm": 0.1848554015159607, "learning_rate": 2.1664042151495424e-07, "loss": 0.407, "step": 588 }, { "epoch": 1.7993827160493827, "grad_norm": 0.17059315741062164, "learning_rate": 2.095296995216828e-07, "loss": 0.3516, "step": 589 }, { "epoch": 1.8024691358024691, "grad_norm": 0.18412597477436066, "learning_rate": 2.0253513192751374e-07, "loss": 0.2922, "step": 590 }, { "epoch": 1.8055555555555556, "grad_norm": 0.17134982347488403, "learning_rate": 1.9565688832952846e-07, "loss": 0.2951, "step": 591 }, { "epoch": 1.808641975308642, "grad_norm": 0.11777715384960175, "learning_rate": 1.8889513550430892e-07, "loss": 0.24, "step": 592 }, { "epoch": 1.8117283950617284, "grad_norm": 0.18584772944450378, "learning_rate": 1.8225003740388546e-07, "loss": 0.3498, "step": 593 }, { "epoch": 1.8148148148148149, "grad_norm": 0.15893200039863586, "learning_rate": 1.7572175515176538e-07, "loss": 0.3392, "step": 594 }, { "epoch": 1.817901234567901, "grad_norm": 0.152305468916893, "learning_rate": 1.693104470390261e-07, "loss": 0.2333, "step": 595 }, { "epoch": 1.8209876543209877, "grad_norm": 0.15064826607704163, "learning_rate": 1.6301626852047504e-07, "loss": 0.2935, "step": 596 }, { "epoch": 1.824074074074074, "grad_norm": 0.18689890205860138, "learning_rate": 1.5683937221088242e-07, "loss": 0.4082, "step": 597 }, { "epoch": 1.8271604938271606, "grad_norm": 0.16067026555538177, "learning_rate": 1.5077990788127993e-07, "loss": 0.2624, "step": 598 }, { "epoch": 1.8302469135802468, "grad_norm": 0.15756982564926147, "learning_rate": 1.448380224553303e-07, "loss": 0.3681, "step": 599 }, { "epoch": 1.8333333333333335, "grad_norm": 0.16193000972270966, "learning_rate": 1.3901386000576112e-07, "loss": 0.5148, "step": 600 }, { "epoch": 1.8364197530864197, "grad_norm": 0.1545064002275467, "learning_rate": 1.3330756175087778e-07, "loss": 0.2837, "step": 601 }, { "epoch": 1.8395061728395061, "grad_norm": 0.1584656536579132, "learning_rate": 1.2771926605113283e-07, "loss": 0.267, "step": 602 }, { "epoch": 1.8425925925925926, "grad_norm": 0.23085588216781616, "learning_rate": 1.2224910840577642e-07, "loss": 0.3637, "step": 603 }, { "epoch": 1.845679012345679, "grad_norm": 0.15698540210723877, "learning_rate": 1.1689722144956672e-07, "loss": 0.2152, "step": 604 }, { "epoch": 1.8487654320987654, "grad_norm": 0.1545877605676651, "learning_rate": 1.1166373494955696e-07, "loss": 0.3073, "step": 605 }, { "epoch": 1.8518518518518519, "grad_norm": 0.16467563807964325, "learning_rate": 1.06548775801949e-07, "loss": 0.3654, "step": 606 }, { "epoch": 1.8549382716049383, "grad_norm": 0.20076429843902588, "learning_rate": 1.0155246802901198e-07, "loss": 0.3131, "step": 607 }, { "epoch": 1.8580246913580247, "grad_norm": 0.14146511256694794, "learning_rate": 9.667493277608187e-08, "loss": 0.3651, "step": 608 }, { "epoch": 1.8611111111111112, "grad_norm": 0.15111708641052246, "learning_rate": 9.191628830861832e-08, "loss": 0.267, "step": 609 }, { "epoch": 1.8641975308641974, "grad_norm": 0.13036541640758514, "learning_rate": 8.727665000934027e-08, "loss": 0.2568, "step": 610 }, { "epoch": 1.867283950617284, "grad_norm": 0.16827543079853058, "learning_rate": 8.275613037542873e-08, "loss": 0.4188, "step": 611 }, { "epoch": 1.8703703703703702, "grad_norm": 0.18110865354537964, "learning_rate": 7.835483901579454e-08, "loss": 0.3361, "step": 612 }, { "epoch": 1.873456790123457, "grad_norm": 0.1515679508447647, "learning_rate": 7.407288264842772e-08, "loss": 0.3421, "step": 613 }, { "epoch": 1.876543209876543, "grad_norm": 0.1735447645187378, "learning_rate": 6.991036509780391e-08, "loss": 0.3908, "step": 614 }, { "epoch": 1.8796296296296298, "grad_norm": 0.15131166577339172, "learning_rate": 6.58673872923693e-08, "loss": 0.2439, "step": 615 }, { "epoch": 1.882716049382716, "grad_norm": 0.12076130509376526, "learning_rate": 6.194404726209358e-08, "loss": 0.2178, "step": 616 }, { "epoch": 1.8858024691358026, "grad_norm": 0.1315135806798935, "learning_rate": 5.8140440136091326e-08, "loss": 0.2291, "step": 617 }, { "epoch": 1.8888888888888888, "grad_norm": 0.17915165424346924, "learning_rate": 5.445665814031942e-08, "loss": 0.2377, "step": 618 }, { "epoch": 1.8919753086419753, "grad_norm": 0.14008641242980957, "learning_rate": 5.089279059533658e-08, "loss": 0.2266, "step": 619 }, { "epoch": 1.8950617283950617, "grad_norm": 0.18772335350513458, "learning_rate": 4.744892391413791e-08, "loss": 0.4006, "step": 620 }, { "epoch": 1.8981481481481481, "grad_norm": 0.14937154948711395, "learning_rate": 4.412514160006376e-08, "loss": 0.3891, "step": 621 }, { "epoch": 1.9012345679012346, "grad_norm": 0.12767252326011658, "learning_rate": 4.092152424477025e-08, "loss": 0.2397, "step": 622 }, { "epoch": 1.904320987654321, "grad_norm": 0.16874873638153076, "learning_rate": 3.7838149526277514e-08, "loss": 0.3338, "step": 623 }, { "epoch": 1.9074074074074074, "grad_norm": 0.1845911145210266, "learning_rate": 3.487509220708563e-08, "loss": 0.4378, "step": 624 }, { "epoch": 1.9104938271604939, "grad_norm": 0.14064140617847443, "learning_rate": 3.2032424132362736e-08, "loss": 0.2801, "step": 625 }, { "epoch": 1.9135802469135803, "grad_norm": 0.14805810153484344, "learning_rate": 2.9310214228202016e-08, "loss": 0.3122, "step": 626 }, { "epoch": 1.9166666666666665, "grad_norm": 0.1921551674604416, "learning_rate": 2.6708528499950758e-08, "loss": 0.2982, "step": 627 }, { "epoch": 1.9197530864197532, "grad_norm": 0.14775682985782623, "learning_rate": 2.4227430030609455e-08, "loss": 0.3503, "step": 628 }, { "epoch": 1.9228395061728394, "grad_norm": 0.17906314134597778, "learning_rate": 2.1866978979303567e-08, "loss": 0.3863, "step": 629 }, { "epoch": 1.925925925925926, "grad_norm": 0.1467551589012146, "learning_rate": 1.962723257982302e-08, "loss": 0.2993, "step": 630 }, { "epoch": 1.9290123456790123, "grad_norm": 0.2205621749162674, "learning_rate": 1.7508245139236658e-08, "loss": 0.3168, "step": 631 }, { "epoch": 1.932098765432099, "grad_norm": 0.1704474836587906, "learning_rate": 1.5510068036573288e-08, "loss": 0.3177, "step": 632 }, { "epoch": 1.9351851851851851, "grad_norm": 0.15591393411159515, "learning_rate": 1.3632749721577132e-08, "loss": 0.2671, "step": 633 }, { "epoch": 1.9382716049382716, "grad_norm": 0.1339595913887024, "learning_rate": 1.1876335713532638e-08, "loss": 0.196, "step": 634 }, { "epoch": 1.941358024691358, "grad_norm": 0.15144091844558716, "learning_rate": 1.024086860016149e-08, "loss": 0.306, "step": 635 }, { "epoch": 1.9444444444444444, "grad_norm": 0.14868693053722382, "learning_rate": 8.726388036587874e-09, "loss": 0.271, "step": 636 }, { "epoch": 1.9475308641975309, "grad_norm": 0.14298443496227264, "learning_rate": 7.332930744380906e-09, "loss": 0.225, "step": 637 }, { "epoch": 1.9506172839506173, "grad_norm": 0.14053991436958313, "learning_rate": 6.060530510659246e-09, "loss": 0.32, "step": 638 }, { "epoch": 1.9537037037037037, "grad_norm": 0.2039446085691452, "learning_rate": 4.909218187276743e-09, "loss": 0.4306, "step": 639 }, { "epoch": 1.9567901234567902, "grad_norm": 0.20658931136131287, "learning_rate": 3.8790216900702615e-09, "loss": 0.4053, "step": 640 }, { "epoch": 1.9598765432098766, "grad_norm": 0.30260926485061646, "learning_rate": 2.9699659981863306e-09, "loss": 0.3979, "step": 641 }, { "epoch": 1.9629629629629628, "grad_norm": 0.1412692815065384, "learning_rate": 2.182073153471631e-09, "loss": 0.1879, "step": 642 }, { "epoch": 1.9660493827160495, "grad_norm": 0.11770602315664291, "learning_rate": 1.5153622599428652e-09, "loss": 0.2462, "step": 643 }, { "epoch": 1.9691358024691357, "grad_norm": 0.156539648771286, "learning_rate": 9.698494833199068e-10, "loss": 0.3218, "step": 644 }, { "epoch": 1.9722222222222223, "grad_norm": 0.19168072938919067, "learning_rate": 5.455480506355582e-10, "loss": 0.4821, "step": 645 }, { "epoch": 1.9753086419753085, "grad_norm": 0.13230177760124207, "learning_rate": 2.4246824991525085e-10, "loss": 0.3134, "step": 646 }, { "epoch": 1.9783950617283952, "grad_norm": 0.1942073255777359, "learning_rate": 6.061742992613529e-11, "loss": 0.3413, "step": 647 }, { "epoch": 1.9814814814814814, "grad_norm": 0.15652911365032196, "learning_rate": 0.0, "loss": 0.2942, "step": 648 }, { "epoch": 1.9814814814814814, "eval_loss": 0.42709851264953613, "eval_runtime": 44.317, "eval_samples_per_second": 8.304, "eval_steps_per_second": 1.038, "step": 648 } ], "logging_steps": 1, "max_steps": 648, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 162, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.584525189221712e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }