diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4641 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9814814814814814, + "eval_steps": 81, + "global_step": 648, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0030864197530864196, + "grad_norm": 0.11897344887256622, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.6253, + "step": 1 + }, + { + "epoch": 0.0030864197530864196, + "eval_loss": 0.6252603530883789, + "eval_runtime": 44.2936, + "eval_samples_per_second": 8.308, + "eval_steps_per_second": 1.039, + "step": 1 + }, + { + "epoch": 0.006172839506172839, + "grad_norm": 0.11417510360479355, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6376, + "step": 2 + }, + { + "epoch": 0.009259259259259259, + "grad_norm": 0.0693814605474472, + "learning_rate": 3e-06, + "loss": 0.2684, + "step": 3 + }, + { + "epoch": 0.012345679012345678, + "grad_norm": 0.1110842302441597, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5096, + "step": 4 + }, + { + "epoch": 0.015432098765432098, + "grad_norm": 0.09205043315887451, + "learning_rate": 5e-06, + "loss": 0.5674, + "step": 5 + }, + { + "epoch": 0.018518518518518517, + "grad_norm": 0.1063380092382431, + "learning_rate": 6e-06, + "loss": 0.6219, + "step": 6 + }, + { + "epoch": 0.021604938271604937, + "grad_norm": 0.0740552470088005, + "learning_rate": 7e-06, + "loss": 0.5478, + "step": 7 + }, + { + "epoch": 0.024691358024691357, + "grad_norm": 0.10674550384283066, + "learning_rate": 8.000000000000001e-06, + "loss": 0.6168, + "step": 8 + }, + { + "epoch": 0.027777777777777776, + "grad_norm": 0.1061239168047905, + "learning_rate": 9e-06, + "loss": 0.7106, + "step": 9 + }, + { + "epoch": 0.030864197530864196, + "grad_norm": 0.10123332589864731, + "learning_rate": 1e-05, + "loss": 0.5221, + "step": 10 + }, + { + "epoch": 0.033950617283950615, + "grad_norm": 0.06680818647146225, + "learning_rate": 9.999939382570075e-06, + "loss": 0.2592, + "step": 11 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.09670277684926987, + "learning_rate": 9.999757531750086e-06, + "loss": 0.5183, + "step": 12 + }, + { + "epoch": 0.040123456790123455, + "grad_norm": 0.07567557692527771, + "learning_rate": 9.999454451949364e-06, + "loss": 0.3257, + "step": 13 + }, + { + "epoch": 0.043209876543209874, + "grad_norm": 0.10101059824228287, + "learning_rate": 9.999030150516681e-06, + "loss": 0.4788, + "step": 14 + }, + { + "epoch": 0.046296296296296294, + "grad_norm": 0.1238669604063034, + "learning_rate": 9.998484637740058e-06, + "loss": 0.6218, + "step": 15 + }, + { + "epoch": 0.04938271604938271, + "grad_norm": 0.10699903219938278, + "learning_rate": 9.997817926846528e-06, + "loss": 0.6429, + "step": 16 + }, + { + "epoch": 0.05246913580246913, + "grad_norm": 0.08470468968153, + "learning_rate": 9.997030034001815e-06, + "loss": 0.3134, + "step": 17 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 0.1229688748717308, + "learning_rate": 9.99612097830993e-06, + "loss": 0.712, + "step": 18 + }, + { + "epoch": 0.05864197530864197, + "grad_norm": 0.10526233166456223, + "learning_rate": 9.995090781812724e-06, + "loss": 0.504, + "step": 19 + }, + { + "epoch": 0.06172839506172839, + "grad_norm": 0.11165868490934372, + "learning_rate": 9.993939469489342e-06, + "loss": 0.5122, + "step": 20 + }, + { + "epoch": 0.06481481481481481, + "grad_norm": 0.09065920859575272, + "learning_rate": 9.99266706925562e-06, + "loss": 0.4664, + "step": 21 + }, + { + "epoch": 0.06790123456790123, + "grad_norm": 0.10060250014066696, + "learning_rate": 9.991273611963413e-06, + "loss": 0.4732, + "step": 22 + }, + { + "epoch": 0.07098765432098765, + "grad_norm": 0.10402392596006393, + "learning_rate": 9.98975913139984e-06, + "loss": 0.4899, + "step": 23 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.11345162242650986, + "learning_rate": 9.98812366428647e-06, + "loss": 0.5365, + "step": 24 + }, + { + "epoch": 0.07716049382716049, + "grad_norm": 0.1189904510974884, + "learning_rate": 9.986367250278423e-06, + "loss": 0.6293, + "step": 25 + }, + { + "epoch": 0.08024691358024691, + "grad_norm": 0.11722761392593384, + "learning_rate": 9.984489931963429e-06, + "loss": 0.4991, + "step": 26 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.08803360909223557, + "learning_rate": 9.982491754860763e-06, + "loss": 0.381, + "step": 27 + }, + { + "epoch": 0.08641975308641975, + "grad_norm": 0.11037921905517578, + "learning_rate": 9.980372767420179e-06, + "loss": 0.5814, + "step": 28 + }, + { + "epoch": 0.08950617283950617, + "grad_norm": 0.0851665586233139, + "learning_rate": 9.978133021020697e-06, + "loss": 0.3629, + "step": 29 + }, + { + "epoch": 0.09259259259259259, + "grad_norm": 0.10195960849523544, + "learning_rate": 9.97577256996939e-06, + "loss": 0.5672, + "step": 30 + }, + { + "epoch": 0.09567901234567901, + "grad_norm": 0.12112904340028763, + "learning_rate": 9.97329147150005e-06, + "loss": 0.6165, + "step": 31 + }, + { + "epoch": 0.09876543209876543, + "grad_norm": 0.07611838728189468, + "learning_rate": 9.970689785771798e-06, + "loss": 0.3902, + "step": 32 + }, + { + "epoch": 0.10185185185185185, + "grad_norm": 0.1013374775648117, + "learning_rate": 9.96796757586764e-06, + "loss": 0.5096, + "step": 33 + }, + { + "epoch": 0.10493827160493827, + "grad_norm": 0.08809865266084671, + "learning_rate": 9.965124907792916e-06, + "loss": 0.3333, + "step": 34 + }, + { + "epoch": 0.10802469135802469, + "grad_norm": 0.0764087364077568, + "learning_rate": 9.962161850473723e-06, + "loss": 0.3461, + "step": 35 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.0995788499712944, + "learning_rate": 9.95907847575523e-06, + "loss": 0.4225, + "step": 36 + }, + { + "epoch": 0.11419753086419752, + "grad_norm": 0.11751396954059601, + "learning_rate": 9.955874858399936e-06, + "loss": 0.4991, + "step": 37 + }, + { + "epoch": 0.11728395061728394, + "grad_norm": 0.10502217710018158, + "learning_rate": 9.952551076085864e-06, + "loss": 0.5847, + "step": 38 + }, + { + "epoch": 0.12037037037037036, + "grad_norm": 0.1077880784869194, + "learning_rate": 9.949107209404664e-06, + "loss": 0.4901, + "step": 39 + }, + { + "epoch": 0.12345679012345678, + "grad_norm": 0.08844556659460068, + "learning_rate": 9.945543341859681e-06, + "loss": 0.5752, + "step": 40 + }, + { + "epoch": 0.12654320987654322, + "grad_norm": 0.10771756619215012, + "learning_rate": 9.94185955986391e-06, + "loss": 0.5393, + "step": 41 + }, + { + "epoch": 0.12962962962962962, + "grad_norm": 0.07496192306280136, + "learning_rate": 9.938055952737908e-06, + "loss": 0.3334, + "step": 42 + }, + { + "epoch": 0.13271604938271606, + "grad_norm": 0.106163389980793, + "learning_rate": 9.934132612707631e-06, + "loss": 0.5319, + "step": 43 + }, + { + "epoch": 0.13580246913580246, + "grad_norm": 0.09276831895112991, + "learning_rate": 9.930089634902197e-06, + "loss": 0.486, + "step": 44 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 0.09449384361505508, + "learning_rate": 9.925927117351573e-06, + "loss": 0.3858, + "step": 45 + }, + { + "epoch": 0.1419753086419753, + "grad_norm": 0.07955848425626755, + "learning_rate": 9.921645160984205e-06, + "loss": 0.4648, + "step": 46 + }, + { + "epoch": 0.14506172839506173, + "grad_norm": 0.10575301945209503, + "learning_rate": 9.917243869624573e-06, + "loss": 0.4704, + "step": 47 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.0714716911315918, + "learning_rate": 9.91272334999066e-06, + "loss": 0.372, + "step": 48 + }, + { + "epoch": 0.15123456790123457, + "grad_norm": 0.08894475549459457, + "learning_rate": 9.908083711691383e-06, + "loss": 0.5005, + "step": 49 + }, + { + "epoch": 0.15432098765432098, + "grad_norm": 0.0800170972943306, + "learning_rate": 9.903325067223918e-06, + "loss": 0.3688, + "step": 50 + }, + { + "epoch": 0.1574074074074074, + "grad_norm": 0.09310433268547058, + "learning_rate": 9.898447531970989e-06, + "loss": 0.5127, + "step": 51 + }, + { + "epoch": 0.16049382716049382, + "grad_norm": 0.07690192013978958, + "learning_rate": 9.893451224198051e-06, + "loss": 0.2993, + "step": 52 + }, + { + "epoch": 0.16358024691358025, + "grad_norm": 0.08025282621383667, + "learning_rate": 9.888336265050443e-06, + "loss": 0.4004, + "step": 53 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.06500386446714401, + "learning_rate": 9.883102778550434e-06, + "loss": 0.3317, + "step": 54 + }, + { + "epoch": 0.1697530864197531, + "grad_norm": 0.07926575839519501, + "learning_rate": 9.877750891594224e-06, + "loss": 0.3606, + "step": 55 + }, + { + "epoch": 0.1728395061728395, + "grad_norm": 0.07245253026485443, + "learning_rate": 9.872280733948867e-06, + "loss": 0.4437, + "step": 56 + }, + { + "epoch": 0.17592592592592593, + "grad_norm": 0.07353054732084274, + "learning_rate": 9.866692438249124e-06, + "loss": 0.36, + "step": 57 + }, + { + "epoch": 0.17901234567901234, + "grad_norm": 0.09307980537414551, + "learning_rate": 9.86098613999424e-06, + "loss": 0.5175, + "step": 58 + }, + { + "epoch": 0.18209876543209877, + "grad_norm": 0.07782690227031708, + "learning_rate": 9.855161977544672e-06, + "loss": 0.4332, + "step": 59 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.06865860521793365, + "learning_rate": 9.849220092118721e-06, + "loss": 0.3464, + "step": 60 + }, + { + "epoch": 0.1882716049382716, + "grad_norm": 0.0760008841753006, + "learning_rate": 9.84316062778912e-06, + "loss": 0.3808, + "step": 61 + }, + { + "epoch": 0.19135802469135801, + "grad_norm": 0.07834326475858688, + "learning_rate": 9.836983731479526e-06, + "loss": 0.499, + "step": 62 + }, + { + "epoch": 0.19444444444444445, + "grad_norm": 0.08240173012018204, + "learning_rate": 9.830689552960974e-06, + "loss": 0.4432, + "step": 63 + }, + { + "epoch": 0.19753086419753085, + "grad_norm": 0.06976404786109924, + "learning_rate": 9.824278244848236e-06, + "loss": 0.3482, + "step": 64 + }, + { + "epoch": 0.2006172839506173, + "grad_norm": 0.09335274249315262, + "learning_rate": 9.817749962596115e-06, + "loss": 0.4533, + "step": 65 + }, + { + "epoch": 0.2037037037037037, + "grad_norm": 0.10973995178937912, + "learning_rate": 9.811104864495691e-06, + "loss": 0.6042, + "step": 66 + }, + { + "epoch": 0.20679012345679013, + "grad_norm": 0.08284437656402588, + "learning_rate": 9.804343111670472e-06, + "loss": 0.4818, + "step": 67 + }, + { + "epoch": 0.20987654320987653, + "grad_norm": 0.08448096364736557, + "learning_rate": 9.797464868072489e-06, + "loss": 0.518, + "step": 68 + }, + { + "epoch": 0.21296296296296297, + "grad_norm": 0.07667321711778641, + "learning_rate": 9.790470300478318e-06, + "loss": 0.3757, + "step": 69 + }, + { + "epoch": 0.21604938271604937, + "grad_norm": 0.0944654569029808, + "learning_rate": 9.783359578485047e-06, + "loss": 0.4863, + "step": 70 + }, + { + "epoch": 0.2191358024691358, + "grad_norm": 0.07617281377315521, + "learning_rate": 9.776132874506153e-06, + "loss": 0.3484, + "step": 71 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.09038567543029785, + "learning_rate": 9.768790363767321e-06, + "loss": 0.596, + "step": 72 + }, + { + "epoch": 0.22530864197530864, + "grad_norm": 0.0843636766076088, + "learning_rate": 9.761332224302209e-06, + "loss": 0.4042, + "step": 73 + }, + { + "epoch": 0.22839506172839505, + "grad_norm": 0.09003959596157074, + "learning_rate": 9.753758636948112e-06, + "loss": 0.5011, + "step": 74 + }, + { + "epoch": 0.23148148148148148, + "grad_norm": 0.079057976603508, + "learning_rate": 9.74606978534159e-06, + "loss": 0.4703, + "step": 75 + }, + { + "epoch": 0.2345679012345679, + "grad_norm": 0.07765232026576996, + "learning_rate": 9.738265855914014e-06, + "loss": 0.3294, + "step": 76 + }, + { + "epoch": 0.23765432098765432, + "grad_norm": 0.07654544711112976, + "learning_rate": 9.730347037887041e-06, + "loss": 0.4039, + "step": 77 + }, + { + "epoch": 0.24074074074074073, + "grad_norm": 0.05925621837377548, + "learning_rate": 9.722313523268028e-06, + "loss": 0.2295, + "step": 78 + }, + { + "epoch": 0.24382716049382716, + "grad_norm": 0.07830403745174408, + "learning_rate": 9.714165506845381e-06, + "loss": 0.3721, + "step": 79 + }, + { + "epoch": 0.24691358024691357, + "grad_norm": 0.09928114712238312, + "learning_rate": 9.705903186183828e-06, + "loss": 0.5154, + "step": 80 + }, + { + "epoch": 0.25, + "grad_norm": 0.06352175772190094, + "learning_rate": 9.697526761619621e-06, + "loss": 0.2613, + "step": 81 + }, + { + "epoch": 0.25, + "eval_loss": 0.5444870591163635, + "eval_runtime": 44.3715, + "eval_samples_per_second": 8.294, + "eval_steps_per_second": 1.037, + "step": 81 + }, + { + "epoch": 0.25308641975308643, + "grad_norm": 0.07308296114206314, + "learning_rate": 9.689036436255698e-06, + "loss": 0.3455, + "step": 82 + }, + { + "epoch": 0.25617283950617287, + "grad_norm": 0.07788842916488647, + "learning_rate": 9.680432415956736e-06, + "loss": 0.4675, + "step": 83 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.09506388008594513, + "learning_rate": 9.671714909344175e-06, + "loss": 0.5544, + "step": 84 + }, + { + "epoch": 0.2623456790123457, + "grad_norm": 0.08810863643884659, + "learning_rate": 9.66288412779115e-06, + "loss": 0.497, + "step": 85 + }, + { + "epoch": 0.2654320987654321, + "grad_norm": 0.06235141307115555, + "learning_rate": 9.653940285417381e-06, + "loss": 0.2775, + "step": 86 + }, + { + "epoch": 0.26851851851851855, + "grad_norm": 0.07534658908843994, + "learning_rate": 9.644883599083959e-06, + "loss": 0.3706, + "step": 87 + }, + { + "epoch": 0.2716049382716049, + "grad_norm": 0.11235971748828888, + "learning_rate": 9.635714288388103e-06, + "loss": 0.6166, + "step": 88 + }, + { + "epoch": 0.27469135802469136, + "grad_norm": 0.07352706789970398, + "learning_rate": 9.626432575657834e-06, + "loss": 0.4254, + "step": 89 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.10939712822437286, + "learning_rate": 9.617038685946578e-06, + "loss": 0.3768, + "step": 90 + }, + { + "epoch": 0.2808641975308642, + "grad_norm": 0.0766228511929512, + "learning_rate": 9.60753284702772e-06, + "loss": 0.3562, + "step": 91 + }, + { + "epoch": 0.2839506172839506, + "grad_norm": 0.08354140818119049, + "learning_rate": 9.597915289389067e-06, + "loss": 0.4783, + "step": 92 + }, + { + "epoch": 0.28703703703703703, + "grad_norm": 0.08200543373823166, + "learning_rate": 9.58818624622727e-06, + "loss": 0.3947, + "step": 93 + }, + { + "epoch": 0.29012345679012347, + "grad_norm": 0.08410683274269104, + "learning_rate": 9.578345953442163e-06, + "loss": 0.5048, + "step": 94 + }, + { + "epoch": 0.2932098765432099, + "grad_norm": 0.1019473522901535, + "learning_rate": 9.568394649631055e-06, + "loss": 0.5842, + "step": 95 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.08855041116476059, + "learning_rate": 9.558332576082925e-06, + "loss": 0.4176, + "step": 96 + }, + { + "epoch": 0.2993827160493827, + "grad_norm": 0.08165948837995529, + "learning_rate": 9.548159976772593e-06, + "loss": 0.4098, + "step": 97 + }, + { + "epoch": 0.30246913580246915, + "grad_norm": 0.07580746710300446, + "learning_rate": 9.537877098354787e-06, + "loss": 0.3886, + "step": 98 + }, + { + "epoch": 0.3055555555555556, + "grad_norm": 0.0938824713230133, + "learning_rate": 9.527484190158171e-06, + "loss": 0.4551, + "step": 99 + }, + { + "epoch": 0.30864197530864196, + "grad_norm": 0.07878723740577698, + "learning_rate": 9.5169815041793e-06, + "loss": 0.4042, + "step": 100 + }, + { + "epoch": 0.3117283950617284, + "grad_norm": 0.07207982987165451, + "learning_rate": 9.506369295076505e-06, + "loss": 0.3541, + "step": 101 + }, + { + "epoch": 0.3148148148148148, + "grad_norm": 0.06538520753383636, + "learning_rate": 9.495647820163725e-06, + "loss": 0.2972, + "step": 102 + }, + { + "epoch": 0.31790123456790126, + "grad_norm": 0.08196717500686646, + "learning_rate": 9.484817339404261e-06, + "loss": 0.401, + "step": 103 + }, + { + "epoch": 0.32098765432098764, + "grad_norm": 0.07677263766527176, + "learning_rate": 9.473878115404477e-06, + "loss": 0.4073, + "step": 104 + }, + { + "epoch": 0.32407407407407407, + "grad_norm": 0.11730651557445526, + "learning_rate": 9.462830413407427e-06, + "loss": 0.4501, + "step": 105 + }, + { + "epoch": 0.3271604938271605, + "grad_norm": 0.06849709898233414, + "learning_rate": 9.451674501286436e-06, + "loss": 0.2538, + "step": 106 + }, + { + "epoch": 0.33024691358024694, + "grad_norm": 0.09413019567728043, + "learning_rate": 9.440410649538592e-06, + "loss": 0.4646, + "step": 107 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.15361227095127106, + "learning_rate": 9.42903913127819e-06, + "loss": 0.5303, + "step": 108 + }, + { + "epoch": 0.33641975308641975, + "grad_norm": 0.08900155127048492, + "learning_rate": 9.417560222230115e-06, + "loss": 0.383, + "step": 109 + }, + { + "epoch": 0.3395061728395062, + "grad_norm": 0.07807417958974838, + "learning_rate": 9.405974200723156e-06, + "loss": 0.3673, + "step": 110 + }, + { + "epoch": 0.3425925925925926, + "grad_norm": 0.1323561668395996, + "learning_rate": 9.394281347683247e-06, + "loss": 0.597, + "step": 111 + }, + { + "epoch": 0.345679012345679, + "grad_norm": 0.11236107349395752, + "learning_rate": 9.382481946626673e-06, + "loss": 0.5051, + "step": 112 + }, + { + "epoch": 0.3487654320987654, + "grad_norm": 0.09908317029476166, + "learning_rate": 9.370576283653178e-06, + "loss": 0.3208, + "step": 113 + }, + { + "epoch": 0.35185185185185186, + "grad_norm": 0.08509659022092819, + "learning_rate": 9.358564647439037e-06, + "loss": 0.3801, + "step": 114 + }, + { + "epoch": 0.3549382716049383, + "grad_norm": 0.05896300822496414, + "learning_rate": 9.34644732923006e-06, + "loss": 0.2217, + "step": 115 + }, + { + "epoch": 0.35802469135802467, + "grad_norm": 0.06763949990272522, + "learning_rate": 9.33422462283452e-06, + "loss": 0.3583, + "step": 116 + }, + { + "epoch": 0.3611111111111111, + "grad_norm": 0.0857081338763237, + "learning_rate": 9.321896824616036e-06, + "loss": 0.4122, + "step": 117 + }, + { + "epoch": 0.36419753086419754, + "grad_norm": 0.07149571180343628, + "learning_rate": 9.309464233486386e-06, + "loss": 0.2959, + "step": 118 + }, + { + "epoch": 0.36728395061728397, + "grad_norm": 0.09094710648059845, + "learning_rate": 9.29692715089826e-06, + "loss": 0.3633, + "step": 119 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.07034748792648315, + "learning_rate": 9.284285880837947e-06, + "loss": 0.2826, + "step": 120 + }, + { + "epoch": 0.3734567901234568, + "grad_norm": 0.0919278934597969, + "learning_rate": 9.271540729817969e-06, + "loss": 0.389, + "step": 121 + }, + { + "epoch": 0.3765432098765432, + "grad_norm": 0.07186863571405411, + "learning_rate": 9.258692006869644e-06, + "loss": 0.296, + "step": 122 + }, + { + "epoch": 0.37962962962962965, + "grad_norm": 0.09665773808956146, + "learning_rate": 9.245740023535596e-06, + "loss": 0.4324, + "step": 123 + }, + { + "epoch": 0.38271604938271603, + "grad_norm": 0.08115452527999878, + "learning_rate": 9.232685093862206e-06, + "loss": 0.3555, + "step": 124 + }, + { + "epoch": 0.38580246913580246, + "grad_norm": 0.07702954113483429, + "learning_rate": 9.219527534391983e-06, + "loss": 0.3385, + "step": 125 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 0.10876493901014328, + "learning_rate": 9.206267664155906e-06, + "loss": 0.4446, + "step": 126 + }, + { + "epoch": 0.39197530864197533, + "grad_norm": 0.07764764875173569, + "learning_rate": 9.192905804665677e-06, + "loss": 0.369, + "step": 127 + }, + { + "epoch": 0.3950617283950617, + "grad_norm": 0.10887006670236588, + "learning_rate": 9.179442279905927e-06, + "loss": 0.4297, + "step": 128 + }, + { + "epoch": 0.39814814814814814, + "grad_norm": 0.10183979570865631, + "learning_rate": 9.165877416326365e-06, + "loss": 0.5906, + "step": 129 + }, + { + "epoch": 0.4012345679012346, + "grad_norm": 0.07278673350811005, + "learning_rate": 9.152211542833856e-06, + "loss": 0.3017, + "step": 130 + }, + { + "epoch": 0.404320987654321, + "grad_norm": 0.08892305195331573, + "learning_rate": 9.138444990784455e-06, + "loss": 0.3919, + "step": 131 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.0926053375005722, + "learning_rate": 9.124578093975358e-06, + "loss": 0.4833, + "step": 132 + }, + { + "epoch": 0.4104938271604938, + "grad_norm": 0.1312541514635086, + "learning_rate": 9.110611188636828e-06, + "loss": 0.4139, + "step": 133 + }, + { + "epoch": 0.41358024691358025, + "grad_norm": 0.07399484515190125, + "learning_rate": 9.096544613424026e-06, + "loss": 0.3156, + "step": 134 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.0757204219698906, + "learning_rate": 9.082378709408805e-06, + "loss": 0.3355, + "step": 135 + }, + { + "epoch": 0.41975308641975306, + "grad_norm": 0.08242496103048325, + "learning_rate": 9.068113820071447e-06, + "loss": 0.3647, + "step": 136 + }, + { + "epoch": 0.4228395061728395, + "grad_norm": 0.08191465586423874, + "learning_rate": 9.053750291292321e-06, + "loss": 0.3801, + "step": 137 + }, + { + "epoch": 0.42592592592592593, + "grad_norm": 0.08579788357019424, + "learning_rate": 9.039288471343505e-06, + "loss": 0.4375, + "step": 138 + }, + { + "epoch": 0.42901234567901236, + "grad_norm": 0.09289571642875671, + "learning_rate": 9.024728710880345e-06, + "loss": 0.3733, + "step": 139 + }, + { + "epoch": 0.43209876543209874, + "grad_norm": 0.09474348276853561, + "learning_rate": 9.010071362932945e-06, + "loss": 0.5004, + "step": 140 + }, + { + "epoch": 0.4351851851851852, + "grad_norm": 0.09607541561126709, + "learning_rate": 8.995316782897605e-06, + "loss": 0.3496, + "step": 141 + }, + { + "epoch": 0.4382716049382716, + "grad_norm": 0.08354438096284866, + "learning_rate": 8.98046532852822e-06, + "loss": 0.3528, + "step": 142 + }, + { + "epoch": 0.44135802469135804, + "grad_norm": 0.08367566019296646, + "learning_rate": 8.965517359927583e-06, + "loss": 0.3365, + "step": 143 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.08424922823905945, + "learning_rate": 8.950473239538672e-06, + "loss": 0.3636, + "step": 144 + }, + { + "epoch": 0.44753086419753085, + "grad_norm": 0.07770823687314987, + "learning_rate": 8.935333332135853e-06, + "loss": 0.2757, + "step": 145 + }, + { + "epoch": 0.4506172839506173, + "grad_norm": 0.08803431689739227, + "learning_rate": 8.920098004816035e-06, + "loss": 0.3397, + "step": 146 + }, + { + "epoch": 0.4537037037037037, + "grad_norm": 0.11619243025779724, + "learning_rate": 8.904767626989774e-06, + "loss": 0.4058, + "step": 147 + }, + { + "epoch": 0.4567901234567901, + "grad_norm": 0.08595902472734451, + "learning_rate": 8.88934257037231e-06, + "loss": 0.3447, + "step": 148 + }, + { + "epoch": 0.45987654320987653, + "grad_norm": 0.08116041868925095, + "learning_rate": 8.873823208974557e-06, + "loss": 0.3578, + "step": 149 + }, + { + "epoch": 0.46296296296296297, + "grad_norm": 0.13053898513317108, + "learning_rate": 8.85820991909404e-06, + "loss": 0.5429, + "step": 150 + }, + { + "epoch": 0.4660493827160494, + "grad_norm": 0.08137528598308563, + "learning_rate": 8.842503079305757e-06, + "loss": 0.3078, + "step": 151 + }, + { + "epoch": 0.4691358024691358, + "grad_norm": 0.0843534767627716, + "learning_rate": 8.826703070453014e-06, + "loss": 0.3807, + "step": 152 + }, + { + "epoch": 0.4722222222222222, + "grad_norm": 0.13925758004188538, + "learning_rate": 8.810810275638183e-06, + "loss": 0.4771, + "step": 153 + }, + { + "epoch": 0.47530864197530864, + "grad_norm": 0.08117470145225525, + "learning_rate": 8.794825080213415e-06, + "loss": 0.3197, + "step": 154 + }, + { + "epoch": 0.4783950617283951, + "grad_norm": 0.07650022953748703, + "learning_rate": 8.778747871771293e-06, + "loss": 0.2993, + "step": 155 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.09445349872112274, + "learning_rate": 8.76257904013544e-06, + "loss": 0.3641, + "step": 156 + }, + { + "epoch": 0.4845679012345679, + "grad_norm": 0.097043976187706, + "learning_rate": 8.746318977351066e-06, + "loss": 0.4181, + "step": 157 + }, + { + "epoch": 0.4876543209876543, + "grad_norm": 0.1167394146323204, + "learning_rate": 8.729968077675454e-06, + "loss": 0.5277, + "step": 158 + }, + { + "epoch": 0.49074074074074076, + "grad_norm": 0.08402277529239655, + "learning_rate": 8.713526737568415e-06, + "loss": 0.2867, + "step": 159 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 0.09060430526733398, + "learning_rate": 8.696995355682656e-06, + "loss": 0.3219, + "step": 160 + }, + { + "epoch": 0.49691358024691357, + "grad_norm": 0.1259710192680359, + "learning_rate": 8.680374332854134e-06, + "loss": 0.5394, + "step": 161 + }, + { + "epoch": 0.5, + "grad_norm": 0.09654678404331207, + "learning_rate": 8.663664072092324e-06, + "loss": 0.3679, + "step": 162 + }, + { + "epoch": 0.5, + "eval_loss": 0.5044411420822144, + "eval_runtime": 44.4479, + "eval_samples_per_second": 8.279, + "eval_steps_per_second": 1.035, + "step": 162 + }, + { + "epoch": 0.5030864197530864, + "grad_norm": 0.13062100112438202, + "learning_rate": 8.646864978570445e-06, + "loss": 0.38, + "step": 163 + }, + { + "epoch": 0.5061728395061729, + "grad_norm": 0.11305861920118332, + "learning_rate": 8.629977459615655e-06, + "loss": 0.3435, + "step": 164 + }, + { + "epoch": 0.5092592592592593, + "grad_norm": 0.07454624772071838, + "learning_rate": 8.613001924699146e-06, + "loss": 0.2768, + "step": 165 + }, + { + "epoch": 0.5123456790123457, + "grad_norm": 0.08615926653146744, + "learning_rate": 8.595938785426241e-06, + "loss": 0.3404, + "step": 166 + }, + { + "epoch": 0.5154320987654321, + "grad_norm": 0.09183604270219803, + "learning_rate": 8.578788455526398e-06, + "loss": 0.3493, + "step": 167 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.08047281205654144, + "learning_rate": 8.561551350843185e-06, + "loss": 0.3271, + "step": 168 + }, + { + "epoch": 0.5216049382716049, + "grad_norm": 0.08007708936929703, + "learning_rate": 8.544227889324199e-06, + "loss": 0.2844, + "step": 169 + }, + { + "epoch": 0.5246913580246914, + "grad_norm": 0.08152032643556595, + "learning_rate": 8.526818491010922e-06, + "loss": 0.3033, + "step": 170 + }, + { + "epoch": 0.5277777777777778, + "grad_norm": 0.10703514516353607, + "learning_rate": 8.509323578028547e-06, + "loss": 0.4296, + "step": 171 + }, + { + "epoch": 0.5308641975308642, + "grad_norm": 0.07901628315448761, + "learning_rate": 8.491743574575743e-06, + "loss": 0.29, + "step": 172 + }, + { + "epoch": 0.5339506172839507, + "grad_norm": 0.09099699556827545, + "learning_rate": 8.474078906914359e-06, + "loss": 0.3021, + "step": 173 + }, + { + "epoch": 0.5370370370370371, + "grad_norm": 0.0866774320602417, + "learning_rate": 8.456330003359093e-06, + "loss": 0.2633, + "step": 174 + }, + { + "epoch": 0.5401234567901234, + "grad_norm": 0.10114055871963501, + "learning_rate": 8.438497294267117e-06, + "loss": 0.3735, + "step": 175 + }, + { + "epoch": 0.5432098765432098, + "grad_norm": 0.1260298639535904, + "learning_rate": 8.420581212027625e-06, + "loss": 0.4687, + "step": 176 + }, + { + "epoch": 0.5462962962962963, + "grad_norm": 0.1004004031419754, + "learning_rate": 8.402582191051365e-06, + "loss": 0.29, + "step": 177 + }, + { + "epoch": 0.5493827160493827, + "grad_norm": 0.08794572949409485, + "learning_rate": 8.38450066776009e-06, + "loss": 0.3589, + "step": 178 + }, + { + "epoch": 0.5524691358024691, + "grad_norm": 0.10174311697483063, + "learning_rate": 8.36633708057599e-06, + "loss": 0.3832, + "step": 179 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.11463697254657745, + "learning_rate": 8.348091869911054e-06, + "loss": 0.4172, + "step": 180 + }, + { + "epoch": 0.558641975308642, + "grad_norm": 0.11808864772319794, + "learning_rate": 8.329765478156394e-06, + "loss": 0.494, + "step": 181 + }, + { + "epoch": 0.5617283950617284, + "grad_norm": 0.11152324080467224, + "learning_rate": 8.311358349671516e-06, + "loss": 0.3973, + "step": 182 + }, + { + "epoch": 0.5648148148148148, + "grad_norm": 0.09295979887247086, + "learning_rate": 8.292870930773551e-06, + "loss": 0.3696, + "step": 183 + }, + { + "epoch": 0.5679012345679012, + "grad_norm": 0.10292661935091019, + "learning_rate": 8.274303669726427e-06, + "loss": 0.3408, + "step": 184 + }, + { + "epoch": 0.5709876543209876, + "grad_norm": 0.10190277546644211, + "learning_rate": 8.255657016729997e-06, + "loss": 0.3513, + "step": 185 + }, + { + "epoch": 0.5740740740740741, + "grad_norm": 0.08307984471321106, + "learning_rate": 8.23693142390914e-06, + "loss": 0.2577, + "step": 186 + }, + { + "epoch": 0.5771604938271605, + "grad_norm": 0.11023180931806564, + "learning_rate": 8.218127345302775e-06, + "loss": 0.4168, + "step": 187 + }, + { + "epoch": 0.5802469135802469, + "grad_norm": 0.10529080033302307, + "learning_rate": 8.199245236852871e-06, + "loss": 0.4223, + "step": 188 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 0.14696502685546875, + "learning_rate": 8.180285556393384e-06, + "loss": 0.5283, + "step": 189 + }, + { + "epoch": 0.5864197530864198, + "grad_norm": 0.15351015329360962, + "learning_rate": 8.161248763639154e-06, + "loss": 0.5173, + "step": 190 + }, + { + "epoch": 0.5895061728395061, + "grad_norm": 0.10003789514303207, + "learning_rate": 8.142135320174758e-06, + "loss": 0.3617, + "step": 191 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.09017117321491241, + "learning_rate": 8.122945689443328e-06, + "loss": 0.2601, + "step": 192 + }, + { + "epoch": 0.595679012345679, + "grad_norm": 0.11840925365686417, + "learning_rate": 8.1036803367353e-06, + "loss": 0.4291, + "step": 193 + }, + { + "epoch": 0.5987654320987654, + "grad_norm": 0.09116993844509125, + "learning_rate": 8.084339729177142e-06, + "loss": 0.3061, + "step": 194 + }, + { + "epoch": 0.6018518518518519, + "grad_norm": 0.11056546866893768, + "learning_rate": 8.064924335720023e-06, + "loss": 0.3712, + "step": 195 + }, + { + "epoch": 0.6049382716049383, + "grad_norm": 0.10576466470956802, + "learning_rate": 8.045434627128446e-06, + "loss": 0.3591, + "step": 196 + }, + { + "epoch": 0.6080246913580247, + "grad_norm": 0.09751347452402115, + "learning_rate": 8.025871075968828e-06, + "loss": 0.3268, + "step": 197 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.11890437453985214, + "learning_rate": 8.006234156598043e-06, + "loss": 0.3256, + "step": 198 + }, + { + "epoch": 0.6141975308641975, + "grad_norm": 0.12418389320373535, + "learning_rate": 7.986524345151924e-06, + "loss": 0.5357, + "step": 199 + }, + { + "epoch": 0.6172839506172839, + "grad_norm": 0.11261377483606339, + "learning_rate": 7.966742119533724e-06, + "loss": 0.4537, + "step": 200 + }, + { + "epoch": 0.6203703703703703, + "grad_norm": 0.12626801431179047, + "learning_rate": 7.946887959402504e-06, + "loss": 0.3786, + "step": 201 + }, + { + "epoch": 0.6234567901234568, + "grad_norm": 0.12130914628505707, + "learning_rate": 7.926962346161535e-06, + "loss": 0.4564, + "step": 202 + }, + { + "epoch": 0.6265432098765432, + "grad_norm": 0.10559491068124771, + "learning_rate": 7.9069657629466e-06, + "loss": 0.3984, + "step": 203 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.11549825966358185, + "learning_rate": 7.886898694614292e-06, + "loss": 0.4251, + "step": 204 + }, + { + "epoch": 0.6327160493827161, + "grad_norm": 0.10902281850576401, + "learning_rate": 7.866761627730253e-06, + "loss": 0.4012, + "step": 205 + }, + { + "epoch": 0.6358024691358025, + "grad_norm": 0.11586394906044006, + "learning_rate": 7.846555050557381e-06, + "loss": 0.3586, + "step": 206 + }, + { + "epoch": 0.6388888888888888, + "grad_norm": 0.10988422483205795, + "learning_rate": 7.826279453043985e-06, + "loss": 0.4294, + "step": 207 + }, + { + "epoch": 0.6419753086419753, + "grad_norm": 0.1205698624253273, + "learning_rate": 7.805935326811913e-06, + "loss": 0.4782, + "step": 208 + }, + { + "epoch": 0.6450617283950617, + "grad_norm": 0.08950233459472656, + "learning_rate": 7.78552316514462e-06, + "loss": 0.2901, + "step": 209 + }, + { + "epoch": 0.6481481481481481, + "grad_norm": 0.13640360534191132, + "learning_rate": 7.765043462975217e-06, + "loss": 0.4403, + "step": 210 + }, + { + "epoch": 0.6512345679012346, + "grad_norm": 0.13739749789237976, + "learning_rate": 7.744496716874472e-06, + "loss": 0.472, + "step": 211 + }, + { + "epoch": 0.654320987654321, + "grad_norm": 0.10840674489736557, + "learning_rate": 7.723883425038759e-06, + "loss": 0.3961, + "step": 212 + }, + { + "epoch": 0.6574074074074074, + "grad_norm": 0.11287008225917816, + "learning_rate": 7.703204087277989e-06, + "loss": 0.4169, + "step": 213 + }, + { + "epoch": 0.6604938271604939, + "grad_norm": 0.1013006791472435, + "learning_rate": 7.682459205003484e-06, + "loss": 0.3537, + "step": 214 + }, + { + "epoch": 0.6635802469135802, + "grad_norm": 0.12204479426145554, + "learning_rate": 7.661649281215823e-06, + "loss": 0.3444, + "step": 215 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.1041225790977478, + "learning_rate": 7.640774820492647e-06, + "loss": 0.3432, + "step": 216 + }, + { + "epoch": 0.6697530864197531, + "grad_norm": 0.12317519634962082, + "learning_rate": 7.619836328976416e-06, + "loss": 0.4119, + "step": 217 + }, + { + "epoch": 0.6728395061728395, + "grad_norm": 0.15862716734409332, + "learning_rate": 7.598834314362151e-06, + "loss": 0.3585, + "step": 218 + }, + { + "epoch": 0.6759259259259259, + "grad_norm": 0.10013571381568909, + "learning_rate": 7.57776928588511e-06, + "loss": 0.3589, + "step": 219 + }, + { + "epoch": 0.6790123456790124, + "grad_norm": 0.11820396035909653, + "learning_rate": 7.556641754308447e-06, + "loss": 0.2838, + "step": 220 + }, + { + "epoch": 0.6820987654320988, + "grad_norm": 0.08206115663051605, + "learning_rate": 7.535452231910829e-06, + "loss": 0.1639, + "step": 221 + }, + { + "epoch": 0.6851851851851852, + "grad_norm": 0.13305512070655823, + "learning_rate": 7.514201232474012e-06, + "loss": 0.3923, + "step": 222 + }, + { + "epoch": 0.6882716049382716, + "grad_norm": 0.1208796426653862, + "learning_rate": 7.492889271270382e-06, + "loss": 0.3698, + "step": 223 + }, + { + "epoch": 0.691358024691358, + "grad_norm": 0.11946754902601242, + "learning_rate": 7.471516865050468e-06, + "loss": 0.3797, + "step": 224 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 0.08816403150558472, + "learning_rate": 7.450084532030402e-06, + "loss": 0.2238, + "step": 225 + }, + { + "epoch": 0.6975308641975309, + "grad_norm": 0.12045780569314957, + "learning_rate": 7.428592791879361e-06, + "loss": 0.3699, + "step": 226 + }, + { + "epoch": 0.7006172839506173, + "grad_norm": 0.11096329241991043, + "learning_rate": 7.407042165706969e-06, + "loss": 0.362, + "step": 227 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.14540982246398926, + "learning_rate": 7.385433176050654e-06, + "loss": 0.4543, + "step": 228 + }, + { + "epoch": 0.7067901234567902, + "grad_norm": 0.11663732677698135, + "learning_rate": 7.36376634686298e-06, + "loss": 0.4606, + "step": 229 + }, + { + "epoch": 0.7098765432098766, + "grad_norm": 0.11102988570928574, + "learning_rate": 7.342042203498952e-06, + "loss": 0.3526, + "step": 230 + }, + { + "epoch": 0.7129629629629629, + "grad_norm": 0.11012902110815048, + "learning_rate": 7.320261272703259e-06, + "loss": 0.4337, + "step": 231 + }, + { + "epoch": 0.7160493827160493, + "grad_norm": 0.09911687672138214, + "learning_rate": 7.298424082597526e-06, + "loss": 0.2504, + "step": 232 + }, + { + "epoch": 0.7191358024691358, + "grad_norm": 0.13727596402168274, + "learning_rate": 7.276531162667484e-06, + "loss": 0.4725, + "step": 233 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.10461889952421188, + "learning_rate": 7.254583043750152e-06, + "loss": 0.3202, + "step": 234 + }, + { + "epoch": 0.7253086419753086, + "grad_norm": 0.18260876834392548, + "learning_rate": 7.232580258020952e-06, + "loss": 0.4248, + "step": 235 + }, + { + "epoch": 0.7283950617283951, + "grad_norm": 0.13938364386558533, + "learning_rate": 7.210523338980814e-06, + "loss": 0.2602, + "step": 236 + }, + { + "epoch": 0.7314814814814815, + "grad_norm": 0.11910004913806915, + "learning_rate": 7.1884128214432366e-06, + "loss": 0.4185, + "step": 237 + }, + { + "epoch": 0.7345679012345679, + "grad_norm": 0.10073763877153397, + "learning_rate": 7.1662492415213194e-06, + "loss": 0.2697, + "step": 238 + }, + { + "epoch": 0.7376543209876543, + "grad_norm": 0.11307626962661743, + "learning_rate": 7.14403313661476e-06, + "loss": 0.4232, + "step": 239 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.10806172341108322, + "learning_rate": 7.1217650453968335e-06, + "loss": 0.2928, + "step": 240 + }, + { + "epoch": 0.7438271604938271, + "grad_norm": 0.14010940492153168, + "learning_rate": 7.099445507801324e-06, + "loss": 0.3915, + "step": 241 + }, + { + "epoch": 0.7469135802469136, + "grad_norm": 0.09002690017223358, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.2801, + "step": 242 + }, + { + "epoch": 0.75, + "grad_norm": 0.11942241340875626, + "learning_rate": 7.0546542594366605e-06, + "loss": 0.4149, + "step": 243 + }, + { + "epoch": 0.75, + "eval_loss": 0.4767835736274719, + "eval_runtime": 44.3688, + "eval_samples_per_second": 8.294, + "eval_steps_per_second": 1.037, + "step": 243 + }, + { + "epoch": 0.7530864197530864, + "grad_norm": 0.16698460280895233, + "learning_rate": 7.03218363471965e-06, + "loss": 0.4605, + "step": 244 + }, + { + "epoch": 0.7561728395061729, + "grad_norm": 0.12310118973255157, + "learning_rate": 7.0096637357030105e-06, + "loss": 0.4328, + "step": 245 + }, + { + "epoch": 0.7592592592592593, + "grad_norm": 0.11915367841720581, + "learning_rate": 6.987095108426102e-06, + "loss": 0.3907, + "step": 246 + }, + { + "epoch": 0.7623456790123457, + "grad_norm": 0.1066504493355751, + "learning_rate": 6.964478300109796e-06, + "loss": 0.3148, + "step": 247 + }, + { + "epoch": 0.7654320987654321, + "grad_norm": 0.09711527079343796, + "learning_rate": 6.94181385914321e-06, + "loss": 0.2736, + "step": 248 + }, + { + "epoch": 0.7685185185185185, + "grad_norm": 0.08204776048660278, + "learning_rate": 6.91910233507041e-06, + "loss": 0.1607, + "step": 249 + }, + { + "epoch": 0.7716049382716049, + "grad_norm": 0.13877205550670624, + "learning_rate": 6.896344278577083e-06, + "loss": 0.3763, + "step": 250 + }, + { + "epoch": 0.7746913580246914, + "grad_norm": 0.11828643828630447, + "learning_rate": 6.873540241477189e-06, + "loss": 0.4063, + "step": 251 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.13950656354427338, + "learning_rate": 6.850690776699574e-06, + "loss": 0.4348, + "step": 252 + }, + { + "epoch": 0.7808641975308642, + "grad_norm": 0.13861550390720367, + "learning_rate": 6.8277964382745675e-06, + "loss": 0.4007, + "step": 253 + }, + { + "epoch": 0.7839506172839507, + "grad_norm": 0.12502089142799377, + "learning_rate": 6.804857781320558e-06, + "loss": 0.4157, + "step": 254 + }, + { + "epoch": 0.7870370370370371, + "grad_norm": 0.1129172146320343, + "learning_rate": 6.781875362030512e-06, + "loss": 0.3087, + "step": 255 + }, + { + "epoch": 0.7901234567901234, + "grad_norm": 0.18749450147151947, + "learning_rate": 6.758849737658508e-06, + "loss": 0.381, + "step": 256 + }, + { + "epoch": 0.7932098765432098, + "grad_norm": 0.11505936086177826, + "learning_rate": 6.735781466506216e-06, + "loss": 0.3639, + "step": 257 + }, + { + "epoch": 0.7962962962962963, + "grad_norm": 0.13606995344161987, + "learning_rate": 6.712671107909359e-06, + "loss": 0.4504, + "step": 258 + }, + { + "epoch": 0.7993827160493827, + "grad_norm": 0.13360187411308289, + "learning_rate": 6.6895192222241534e-06, + "loss": 0.4113, + "step": 259 + }, + { + "epoch": 0.8024691358024691, + "grad_norm": 0.1227497085928917, + "learning_rate": 6.666326370813722e-06, + "loss": 0.3156, + "step": 260 + }, + { + "epoch": 0.8055555555555556, + "grad_norm": 0.1294088065624237, + "learning_rate": 6.643093116034486e-06, + "loss": 0.2544, + "step": 261 + }, + { + "epoch": 0.808641975308642, + "grad_norm": 0.11842790246009827, + "learning_rate": 6.619820021222518e-06, + "loss": 0.2796, + "step": 262 + }, + { + "epoch": 0.8117283950617284, + "grad_norm": 0.11302869021892548, + "learning_rate": 6.5965076506799e-06, + "loss": 0.3225, + "step": 263 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.1153462752699852, + "learning_rate": 6.573156569661026e-06, + "loss": 0.3168, + "step": 264 + }, + { + "epoch": 0.8179012345679012, + "grad_norm": 0.14865292608737946, + "learning_rate": 6.549767344358903e-06, + "loss": 0.3793, + "step": 265 + }, + { + "epoch": 0.8209876543209876, + "grad_norm": 0.18601423501968384, + "learning_rate": 6.526340541891418e-06, + "loss": 0.383, + "step": 266 + }, + { + "epoch": 0.8240740740740741, + "grad_norm": 0.11983994394540787, + "learning_rate": 6.5028767302875974e-06, + "loss": 0.3366, + "step": 267 + }, + { + "epoch": 0.8271604938271605, + "grad_norm": 0.11204046756029129, + "learning_rate": 6.479376478473822e-06, + "loss": 0.2842, + "step": 268 + }, + { + "epoch": 0.8302469135802469, + "grad_norm": 0.12731367349624634, + "learning_rate": 6.455840356260041e-06, + "loss": 0.3664, + "step": 269 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.12762831151485443, + "learning_rate": 6.432268934325947e-06, + "loss": 0.4333, + "step": 270 + }, + { + "epoch": 0.8364197530864198, + "grad_norm": 0.1425330489873886, + "learning_rate": 6.408662784207149e-06, + "loss": 0.283, + "step": 271 + }, + { + "epoch": 0.8395061728395061, + "grad_norm": 0.1323920488357544, + "learning_rate": 6.385022478281307e-06, + "loss": 0.4108, + "step": 272 + }, + { + "epoch": 0.8425925925925926, + "grad_norm": 0.1550484001636505, + "learning_rate": 6.361348589754255e-06, + "loss": 0.3396, + "step": 273 + }, + { + "epoch": 0.845679012345679, + "grad_norm": 0.09628990292549133, + "learning_rate": 6.337641692646106e-06, + "loss": 0.246, + "step": 274 + }, + { + "epoch": 0.8487654320987654, + "grad_norm": 0.1477012187242508, + "learning_rate": 6.313902361777327e-06, + "loss": 0.4705, + "step": 275 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.14865955710411072, + "learning_rate": 6.290131172754811e-06, + "loss": 0.417, + "step": 276 + }, + { + "epoch": 0.8549382716049383, + "grad_norm": 0.11468877643346786, + "learning_rate": 6.266328701957911e-06, + "loss": 0.3683, + "step": 277 + }, + { + "epoch": 0.8580246913580247, + "grad_norm": 0.1273777186870575, + "learning_rate": 6.24249552652447e-06, + "loss": 0.2808, + "step": 278 + }, + { + "epoch": 0.8611111111111112, + "grad_norm": 0.10113878548145294, + "learning_rate": 6.2186322243368236e-06, + "loss": 0.3368, + "step": 279 + }, + { + "epoch": 0.8641975308641975, + "grad_norm": 0.1183820515871048, + "learning_rate": 6.194739374007792e-06, + "loss": 0.3095, + "step": 280 + }, + { + "epoch": 0.8672839506172839, + "grad_norm": 0.12614701688289642, + "learning_rate": 6.170817554866646e-06, + "loss": 0.3772, + "step": 281 + }, + { + "epoch": 0.8703703703703703, + "grad_norm": 0.19127966463565826, + "learning_rate": 6.1468673469450655e-06, + "loss": 0.3179, + "step": 282 + }, + { + "epoch": 0.8734567901234568, + "grad_norm": 0.14781445264816284, + "learning_rate": 6.122889330963069e-06, + "loss": 0.3659, + "step": 283 + }, + { + "epoch": 0.8765432098765432, + "grad_norm": 0.1360250860452652, + "learning_rate": 6.098884088314938e-06, + "loss": 0.4211, + "step": 284 + }, + { + "epoch": 0.8796296296296297, + "grad_norm": 0.1149686872959137, + "learning_rate": 6.074852201055121e-06, + "loss": 0.2571, + "step": 285 + }, + { + "epoch": 0.8827160493827161, + "grad_norm": 0.14958076179027557, + "learning_rate": 6.050794251884112e-06, + "loss": 0.4164, + "step": 286 + }, + { + "epoch": 0.8858024691358025, + "grad_norm": 0.12140931189060211, + "learning_rate": 6.026710824134331e-06, + "loss": 0.2203, + "step": 287 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.12924239039421082, + "learning_rate": 6.002602501755974e-06, + "loss": 0.4255, + "step": 288 + }, + { + "epoch": 0.8919753086419753, + "grad_norm": 0.1369277834892273, + "learning_rate": 5.978469869302861e-06, + "loss": 0.4083, + "step": 289 + }, + { + "epoch": 0.8950617283950617, + "grad_norm": 0.13165542483329773, + "learning_rate": 5.954313511918252e-06, + "loss": 0.3317, + "step": 290 + }, + { + "epoch": 0.8981481481481481, + "grad_norm": 0.16248537600040436, + "learning_rate": 5.9301340153206685e-06, + "loss": 0.4079, + "step": 291 + }, + { + "epoch": 0.9012345679012346, + "grad_norm": 0.14584743976593018, + "learning_rate": 5.905931965789688e-06, + "loss": 0.3508, + "step": 292 + }, + { + "epoch": 0.904320987654321, + "grad_norm": 0.15875974297523499, + "learning_rate": 5.881707950151725e-06, + "loss": 0.3597, + "step": 293 + }, + { + "epoch": 0.9074074074074074, + "grad_norm": 0.11724277585744858, + "learning_rate": 5.857462555765809e-06, + "loss": 0.3152, + "step": 294 + }, + { + "epoch": 0.9104938271604939, + "grad_norm": 0.12342196702957153, + "learning_rate": 5.8331963705093375e-06, + "loss": 0.318, + "step": 295 + }, + { + "epoch": 0.9135802469135802, + "grad_norm": 0.12013120949268341, + "learning_rate": 5.808909982763825e-06, + "loss": 0.3951, + "step": 296 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 0.10280231386423111, + "learning_rate": 5.784603981400632e-06, + "loss": 0.2725, + "step": 297 + }, + { + "epoch": 0.9197530864197531, + "grad_norm": 0.12491166591644287, + "learning_rate": 5.760278955766695e-06, + "loss": 0.3837, + "step": 298 + }, + { + "epoch": 0.9228395061728395, + "grad_norm": 0.11760140210390091, + "learning_rate": 5.735935495670229e-06, + "loss": 0.2464, + "step": 299 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.13774855434894562, + "learning_rate": 5.711574191366427e-06, + "loss": 0.3504, + "step": 300 + }, + { + "epoch": 0.9290123456790124, + "grad_norm": 0.09982441365718842, + "learning_rate": 5.687195633543151e-06, + "loss": 0.2457, + "step": 301 + }, + { + "epoch": 0.9320987654320988, + "grad_norm": 0.11534377187490463, + "learning_rate": 5.662800413306611e-06, + "loss": 0.2951, + "step": 302 + }, + { + "epoch": 0.9351851851851852, + "grad_norm": 0.100958451628685, + "learning_rate": 5.6383891221670275e-06, + "loss": 0.19, + "step": 303 + }, + { + "epoch": 0.9382716049382716, + "grad_norm": 0.17198745906352997, + "learning_rate": 5.613962352024293e-06, + "loss": 0.3832, + "step": 304 + }, + { + "epoch": 0.941358024691358, + "grad_norm": 0.16045625507831573, + "learning_rate": 5.589520695153618e-06, + "loss": 0.4173, + "step": 305 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.12690144777297974, + "learning_rate": 5.5650647441911706e-06, + "loss": 0.3318, + "step": 306 + }, + { + "epoch": 0.9475308641975309, + "grad_norm": 0.12933467328548431, + "learning_rate": 5.540595092119709e-06, + "loss": 0.3169, + "step": 307 + }, + { + "epoch": 0.9506172839506173, + "grad_norm": 0.1863582581281662, + "learning_rate": 5.516112332254203e-06, + "loss": 0.3925, + "step": 308 + }, + { + "epoch": 0.9537037037037037, + "grad_norm": 0.15057547390460968, + "learning_rate": 5.491617058227443e-06, + "loss": 0.4953, + "step": 309 + }, + { + "epoch": 0.9567901234567902, + "grad_norm": 0.159704327583313, + "learning_rate": 5.46710986397565e-06, + "loss": 0.3831, + "step": 310 + }, + { + "epoch": 0.9598765432098766, + "grad_norm": 0.0988263189792633, + "learning_rate": 5.442591343724081e-06, + "loss": 0.1455, + "step": 311 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.13106189668178558, + "learning_rate": 5.418062091972604e-06, + "loss": 0.227, + "step": 312 + }, + { + "epoch": 0.9660493827160493, + "grad_norm": 0.17571298778057098, + "learning_rate": 5.393522703481303e-06, + "loss": 0.4638, + "step": 313 + }, + { + "epoch": 0.9691358024691358, + "grad_norm": 0.12073665857315063, + "learning_rate": 5.36897377325604e-06, + "loss": 0.2587, + "step": 314 + }, + { + "epoch": 0.9722222222222222, + "grad_norm": 0.08656695485115051, + "learning_rate": 5.344415896534039e-06, + "loss": 0.2088, + "step": 315 + }, + { + "epoch": 0.9753086419753086, + "grad_norm": 0.1401841789484024, + "learning_rate": 5.319849668769449e-06, + "loss": 0.3667, + "step": 316 + }, + { + "epoch": 0.9783950617283951, + "grad_norm": 0.1650845855474472, + "learning_rate": 5.295275685618905e-06, + "loss": 0.3667, + "step": 317 + }, + { + "epoch": 0.9814814814814815, + "grad_norm": 0.13909409940242767, + "learning_rate": 5.270694542927089e-06, + "loss": 0.3811, + "step": 318 + }, + { + "epoch": 0.9845679012345679, + "grad_norm": 0.11377997696399689, + "learning_rate": 5.246106836712277e-06, + "loss": 0.2349, + "step": 319 + }, + { + "epoch": 0.9876543209876543, + "grad_norm": 0.12037783116102219, + "learning_rate": 5.2215131631518945e-06, + "loss": 0.2901, + "step": 320 + }, + { + "epoch": 0.9907407407407407, + "grad_norm": 0.13020600378513336, + "learning_rate": 5.196914118568054e-06, + "loss": 0.3427, + "step": 321 + }, + { + "epoch": 0.9938271604938271, + "grad_norm": 0.15103194117546082, + "learning_rate": 5.1723102994130994e-06, + "loss": 0.4012, + "step": 322 + }, + { + "epoch": 0.9969135802469136, + "grad_norm": 0.105732262134552, + "learning_rate": 5.147702302255143e-06, + "loss": 0.175, + "step": 323 + }, + { + "epoch": 1.0, + "grad_norm": 0.17236697673797607, + "learning_rate": 5.123090723763607e-06, + "loss": 0.3751, + "step": 324 + }, + { + "epoch": 1.0, + "eval_loss": 0.4522034823894501, + "eval_runtime": 44.5334, + "eval_samples_per_second": 8.263, + "eval_steps_per_second": 1.033, + "step": 324 + }, + { + "epoch": 1.0030864197530864, + "grad_norm": 0.15303292870521545, + "learning_rate": 5.098476160694741e-06, + "loss": 0.4663, + "step": 325 + }, + { + "epoch": 1.0061728395061729, + "grad_norm": 0.10959513485431671, + "learning_rate": 5.073859209877167e-06, + "loss": 0.2389, + "step": 326 + }, + { + "epoch": 1.0092592592592593, + "grad_norm": 0.14050254225730896, + "learning_rate": 5.049240468197401e-06, + "loss": 0.3591, + "step": 327 + }, + { + "epoch": 1.0123456790123457, + "grad_norm": 0.12712690234184265, + "learning_rate": 5.0246205325853824e-06, + "loss": 0.3452, + "step": 328 + }, + { + "epoch": 1.0154320987654322, + "grad_norm": 0.1756986677646637, + "learning_rate": 5e-06, + "loss": 0.4289, + "step": 329 + }, + { + "epoch": 1.0185185185185186, + "grad_norm": 0.14214292168617249, + "learning_rate": 4.975379467414621e-06, + "loss": 0.3695, + "step": 330 + }, + { + "epoch": 1.0030864197530864, + "grad_norm": 0.1542719155550003, + "learning_rate": 4.950759531802602e-06, + "loss": 0.3824, + "step": 331 + }, + { + "epoch": 1.0061728395061729, + "grad_norm": 0.12223492562770844, + "learning_rate": 4.926140790122835e-06, + "loss": 0.2753, + "step": 332 + }, + { + "epoch": 1.0092592592592593, + "grad_norm": 0.12852071225643158, + "learning_rate": 4.90152383930526e-06, + "loss": 0.2418, + "step": 333 + }, + { + "epoch": 1.0123456790123457, + "grad_norm": 0.1099737137556076, + "learning_rate": 4.876909276236395e-06, + "loss": 0.2964, + "step": 334 + }, + { + "epoch": 1.0154320987654322, + "grad_norm": 0.1437702178955078, + "learning_rate": 4.852297697744857e-06, + "loss": 0.355, + "step": 335 + }, + { + "epoch": 1.0185185185185186, + "grad_norm": 0.12063878774642944, + "learning_rate": 4.827689700586902e-06, + "loss": 0.2879, + "step": 336 + }, + { + "epoch": 1.021604938271605, + "grad_norm": 0.19743777811527252, + "learning_rate": 4.803085881431949e-06, + "loss": 0.3412, + "step": 337 + }, + { + "epoch": 1.0246913580246915, + "grad_norm": 0.22067442536354065, + "learning_rate": 4.778486836848107e-06, + "loss": 0.3051, + "step": 338 + }, + { + "epoch": 1.0277777777777777, + "grad_norm": 0.1556781828403473, + "learning_rate": 4.7538931632877254e-06, + "loss": 0.3369, + "step": 339 + }, + { + "epoch": 1.0308641975308641, + "grad_norm": 0.132530078291893, + "learning_rate": 4.729305457072913e-06, + "loss": 0.3452, + "step": 340 + }, + { + "epoch": 1.0339506172839505, + "grad_norm": 0.16023634374141693, + "learning_rate": 4.704724314381097e-06, + "loss": 0.3887, + "step": 341 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 0.14671647548675537, + "learning_rate": 4.680150331230552e-06, + "loss": 0.3082, + "step": 342 + }, + { + "epoch": 1.0401234567901234, + "grad_norm": 0.20157098770141602, + "learning_rate": 4.6555841034659625e-06, + "loss": 0.5004, + "step": 343 + }, + { + "epoch": 1.0432098765432098, + "grad_norm": 0.14635726809501648, + "learning_rate": 4.631026226743962e-06, + "loss": 0.4104, + "step": 344 + }, + { + "epoch": 1.0462962962962963, + "grad_norm": 0.14289334416389465, + "learning_rate": 4.606477296518698e-06, + "loss": 0.3206, + "step": 345 + }, + { + "epoch": 1.0493827160493827, + "grad_norm": 0.14635069668293, + "learning_rate": 4.581937908027397e-06, + "loss": 0.2957, + "step": 346 + }, + { + "epoch": 1.0524691358024691, + "grad_norm": 0.1479678899049759, + "learning_rate": 4.55740865627592e-06, + "loss": 0.3168, + "step": 347 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 0.12210693210363388, + "learning_rate": 4.532890136024351e-06, + "loss": 0.2854, + "step": 348 + }, + { + "epoch": 1.058641975308642, + "grad_norm": 0.16018199920654297, + "learning_rate": 4.508382941772558e-06, + "loss": 0.2937, + "step": 349 + }, + { + "epoch": 1.0617283950617284, + "grad_norm": 0.14056287705898285, + "learning_rate": 4.483887667745798e-06, + "loss": 0.3246, + "step": 350 + }, + { + "epoch": 1.0648148148148149, + "grad_norm": 0.14486226439476013, + "learning_rate": 4.459404907880293e-06, + "loss": 0.3133, + "step": 351 + }, + { + "epoch": 1.0679012345679013, + "grad_norm": 0.1279231458902359, + "learning_rate": 4.434935255808831e-06, + "loss": 0.2219, + "step": 352 + }, + { + "epoch": 1.0709876543209877, + "grad_norm": 0.16269516944885254, + "learning_rate": 4.410479304846385e-06, + "loss": 0.3531, + "step": 353 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 0.15139630436897278, + "learning_rate": 4.386037647975708e-06, + "loss": 0.2508, + "step": 354 + }, + { + "epoch": 1.0771604938271604, + "grad_norm": 0.15115757286548615, + "learning_rate": 4.361610877832974e-06, + "loss": 0.3908, + "step": 355 + }, + { + "epoch": 1.0802469135802468, + "grad_norm": 0.17080338299274445, + "learning_rate": 4.337199586693389e-06, + "loss": 0.4233, + "step": 356 + }, + { + "epoch": 1.0833333333333333, + "grad_norm": 0.149905264377594, + "learning_rate": 4.312804366456851e-06, + "loss": 0.3354, + "step": 357 + }, + { + "epoch": 1.0864197530864197, + "grad_norm": 0.2038925588130951, + "learning_rate": 4.2884258086335755e-06, + "loss": 0.422, + "step": 358 + }, + { + "epoch": 1.0895061728395061, + "grad_norm": 0.1319386065006256, + "learning_rate": 4.2640645043297715e-06, + "loss": 0.2812, + "step": 359 + }, + { + "epoch": 1.0925925925925926, + "grad_norm": 0.210116446018219, + "learning_rate": 4.239721044233306e-06, + "loss": 0.3266, + "step": 360 + }, + { + "epoch": 1.095679012345679, + "grad_norm": 0.15533123910427094, + "learning_rate": 4.215396018599369e-06, + "loss": 0.3106, + "step": 361 + }, + { + "epoch": 1.0987654320987654, + "grad_norm": 0.15208472311496735, + "learning_rate": 4.191090017236177e-06, + "loss": 0.3423, + "step": 362 + }, + { + "epoch": 1.1018518518518519, + "grad_norm": 0.12684912979602814, + "learning_rate": 4.166803629490664e-06, + "loss": 0.2755, + "step": 363 + }, + { + "epoch": 1.1049382716049383, + "grad_norm": 0.18555931746959686, + "learning_rate": 4.142537444234192e-06, + "loss": 0.4007, + "step": 364 + }, + { + "epoch": 1.1080246913580247, + "grad_norm": 0.20792073011398315, + "learning_rate": 4.118292049848277e-06, + "loss": 0.2467, + "step": 365 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.13857008516788483, + "learning_rate": 4.094068034210313e-06, + "loss": 0.3666, + "step": 366 + }, + { + "epoch": 1.1141975308641976, + "grad_norm": 0.10900649428367615, + "learning_rate": 4.069865984679332e-06, + "loss": 0.1954, + "step": 367 + }, + { + "epoch": 1.117283950617284, + "grad_norm": 0.13190750777721405, + "learning_rate": 4.045686488081748e-06, + "loss": 0.309, + "step": 368 + }, + { + "epoch": 1.1203703703703705, + "grad_norm": 0.16032575070858002, + "learning_rate": 4.021530130697141e-06, + "loss": 0.3524, + "step": 369 + }, + { + "epoch": 1.123456790123457, + "grad_norm": 0.14147287607192993, + "learning_rate": 3.997397498244028e-06, + "loss": 0.3088, + "step": 370 + }, + { + "epoch": 1.126543209876543, + "grad_norm": 0.1288299709558487, + "learning_rate": 3.97328917586567e-06, + "loss": 0.3216, + "step": 371 + }, + { + "epoch": 1.1296296296296295, + "grad_norm": 0.17235535383224487, + "learning_rate": 3.9492057481158905e-06, + "loss": 0.3339, + "step": 372 + }, + { + "epoch": 1.132716049382716, + "grad_norm": 0.21856486797332764, + "learning_rate": 3.92514779894488e-06, + "loss": 0.3691, + "step": 373 + }, + { + "epoch": 1.1358024691358024, + "grad_norm": 0.188248872756958, + "learning_rate": 3.901115911685063e-06, + "loss": 0.3879, + "step": 374 + }, + { + "epoch": 1.1388888888888888, + "grad_norm": 0.17136438190937042, + "learning_rate": 3.877110669036932e-06, + "loss": 0.4754, + "step": 375 + }, + { + "epoch": 1.1419753086419753, + "grad_norm": 0.14845937490463257, + "learning_rate": 3.853132653054936e-06, + "loss": 0.4178, + "step": 376 + }, + { + "epoch": 1.1450617283950617, + "grad_norm": 0.14598865807056427, + "learning_rate": 3.829182445133356e-06, + "loss": 0.2653, + "step": 377 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.12898695468902588, + "learning_rate": 3.8052606259922097e-06, + "loss": 0.2613, + "step": 378 + }, + { + "epoch": 1.1512345679012346, + "grad_norm": 0.12332043796777725, + "learning_rate": 3.7813677756631773e-06, + "loss": 0.2803, + "step": 379 + }, + { + "epoch": 1.154320987654321, + "grad_norm": 0.1356392502784729, + "learning_rate": 3.75750447347553e-06, + "loss": 0.4038, + "step": 380 + }, + { + "epoch": 1.1574074074074074, + "grad_norm": 0.25393664836883545, + "learning_rate": 3.7336712980420897e-06, + "loss": 0.5067, + "step": 381 + }, + { + "epoch": 1.1604938271604939, + "grad_norm": 0.12110210955142975, + "learning_rate": 3.7098688272451893e-06, + "loss": 0.2413, + "step": 382 + }, + { + "epoch": 1.1635802469135803, + "grad_norm": 0.12632521986961365, + "learning_rate": 3.6860976382226747e-06, + "loss": 0.2583, + "step": 383 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.15142959356307983, + "learning_rate": 3.662358307353897e-06, + "loss": 0.4542, + "step": 384 + }, + { + "epoch": 1.1697530864197532, + "grad_norm": 0.11639465391635895, + "learning_rate": 3.638651410245746e-06, + "loss": 0.1849, + "step": 385 + }, + { + "epoch": 1.1728395061728394, + "grad_norm": 0.14406833052635193, + "learning_rate": 3.6149775217186954e-06, + "loss": 0.3171, + "step": 386 + }, + { + "epoch": 1.175925925925926, + "grad_norm": 0.1374572366476059, + "learning_rate": 3.5913372157928515e-06, + "loss": 0.2849, + "step": 387 + }, + { + "epoch": 1.1790123456790123, + "grad_norm": 0.16935373842716217, + "learning_rate": 3.5677310656740537e-06, + "loss": 0.3982, + "step": 388 + }, + { + "epoch": 1.1820987654320987, + "grad_norm": 0.1098417416214943, + "learning_rate": 3.5441596437399596e-06, + "loss": 0.2149, + "step": 389 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 0.14076852798461914, + "learning_rate": 3.5206235215261785e-06, + "loss": 0.2685, + "step": 390 + }, + { + "epoch": 1.1882716049382716, + "grad_norm": 0.12600207328796387, + "learning_rate": 3.4971232697124046e-06, + "loss": 0.2009, + "step": 391 + }, + { + "epoch": 1.191358024691358, + "grad_norm": 0.13086476922035217, + "learning_rate": 3.4736594581085837e-06, + "loss": 0.3062, + "step": 392 + }, + { + "epoch": 1.1944444444444444, + "grad_norm": 0.16587767004966736, + "learning_rate": 3.4502326556411e-06, + "loss": 0.2432, + "step": 393 + }, + { + "epoch": 1.1975308641975309, + "grad_norm": 0.13524991273880005, + "learning_rate": 3.4268434303389747e-06, + "loss": 0.3204, + "step": 394 + }, + { + "epoch": 1.2006172839506173, + "grad_norm": 0.15923044085502625, + "learning_rate": 3.403492349320101e-06, + "loss": 0.36, + "step": 395 + }, + { + "epoch": 1.2037037037037037, + "grad_norm": 0.19655781984329224, + "learning_rate": 3.380179978777482e-06, + "loss": 0.4863, + "step": 396 + }, + { + "epoch": 1.2067901234567902, + "grad_norm": 0.13031858205795288, + "learning_rate": 3.356906883965516e-06, + "loss": 0.2884, + "step": 397 + }, + { + "epoch": 1.2098765432098766, + "grad_norm": 0.12421680986881256, + "learning_rate": 3.33367362918628e-06, + "loss": 0.1891, + "step": 398 + }, + { + "epoch": 1.212962962962963, + "grad_norm": 0.15903340280056, + "learning_rate": 3.3104807777758487e-06, + "loss": 0.4381, + "step": 399 + }, + { + "epoch": 1.2160493827160495, + "grad_norm": 0.11143235117197037, + "learning_rate": 3.2873288920906436e-06, + "loss": 0.2269, + "step": 400 + }, + { + "epoch": 1.2191358024691359, + "grad_norm": 0.1427583545446396, + "learning_rate": 3.2642185334937853e-06, + "loss": 0.3874, + "step": 401 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.21431690454483032, + "learning_rate": 3.2411502623414925e-06, + "loss": 0.4815, + "step": 402 + }, + { + "epoch": 1.2253086419753085, + "grad_norm": 0.20369336009025574, + "learning_rate": 3.2181246379694886e-06, + "loss": 0.429, + "step": 403 + }, + { + "epoch": 1.228395061728395, + "grad_norm": 0.21474803984165192, + "learning_rate": 3.1951422186794447e-06, + "loss": 0.4217, + "step": 404 + }, + { + "epoch": 1.2314814814814814, + "grad_norm": 0.1690702587366104, + "learning_rate": 3.1722035617254333e-06, + "loss": 0.3388, + "step": 405 + }, + { + "epoch": 1.2314814814814814, + "eval_loss": 0.4383295774459839, + "eval_runtime": 44.45, + "eval_samples_per_second": 8.279, + "eval_steps_per_second": 1.035, + "step": 405 + }, + { + "epoch": 1.2345679012345678, + "grad_norm": 0.13106146454811096, + "learning_rate": 3.149309223300428e-06, + "loss": 0.2537, + "step": 406 + }, + { + "epoch": 1.2376543209876543, + "grad_norm": 0.18745112419128418, + "learning_rate": 3.126459758522813e-06, + "loss": 0.3825, + "step": 407 + }, + { + "epoch": 1.2407407407407407, + "grad_norm": 0.1358872950077057, + "learning_rate": 3.103655721422917e-06, + "loss": 0.3057, + "step": 408 + }, + { + "epoch": 1.2438271604938271, + "grad_norm": 0.15695077180862427, + "learning_rate": 3.080897664929592e-06, + "loss": 0.412, + "step": 409 + }, + { + "epoch": 1.2469135802469136, + "grad_norm": 0.15740308165550232, + "learning_rate": 3.0581861408567907e-06, + "loss": 0.371, + "step": 410 + }, + { + "epoch": 1.25, + "grad_norm": 0.17210154235363007, + "learning_rate": 3.035521699890206e-06, + "loss": 0.4671, + "step": 411 + }, + { + "epoch": 1.2530864197530864, + "grad_norm": 0.1564391851425171, + "learning_rate": 3.0129048915739013e-06, + "loss": 0.397, + "step": 412 + }, + { + "epoch": 1.2561728395061729, + "grad_norm": 0.15035340189933777, + "learning_rate": 2.9903362642969903e-06, + "loss": 0.3696, + "step": 413 + }, + { + "epoch": 1.2592592592592593, + "grad_norm": 0.12334346026182175, + "learning_rate": 2.967816365280351e-06, + "loss": 0.2595, + "step": 414 + }, + { + "epoch": 1.2623456790123457, + "grad_norm": 0.159285768866539, + "learning_rate": 2.94534574056334e-06, + "loss": 0.3444, + "step": 415 + }, + { + "epoch": 1.2654320987654322, + "grad_norm": 0.14071713387966156, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.264, + "step": 416 + }, + { + "epoch": 1.2685185185185186, + "grad_norm": 0.17824961245059967, + "learning_rate": 2.9005544921986774e-06, + "loss": 0.3823, + "step": 417 + }, + { + "epoch": 1.2716049382716048, + "grad_norm": 0.14212675392627716, + "learning_rate": 2.8782349546031673e-06, + "loss": 0.253, + "step": 418 + }, + { + "epoch": 1.2746913580246915, + "grad_norm": 0.21493245661258698, + "learning_rate": 2.8559668633852433e-06, + "loss": 0.3181, + "step": 419 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.14115536212921143, + "learning_rate": 2.8337507584786826e-06, + "loss": 0.3007, + "step": 420 + }, + { + "epoch": 1.2808641975308643, + "grad_norm": 0.16807730495929718, + "learning_rate": 2.811587178556764e-06, + "loss": 0.271, + "step": 421 + }, + { + "epoch": 1.2839506172839505, + "grad_norm": 0.19324727356433868, + "learning_rate": 2.789476661019186e-06, + "loss": 0.3613, + "step": 422 + }, + { + "epoch": 1.287037037037037, + "grad_norm": 0.22242026031017303, + "learning_rate": 2.7674197419790493e-06, + "loss": 0.3391, + "step": 423 + }, + { + "epoch": 1.2901234567901234, + "grad_norm": 0.1270921379327774, + "learning_rate": 2.7454169562498503e-06, + "loss": 0.2094, + "step": 424 + }, + { + "epoch": 1.2932098765432098, + "grad_norm": 0.12505224347114563, + "learning_rate": 2.723468837332517e-06, + "loss": 0.2807, + "step": 425 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.16030734777450562, + "learning_rate": 2.7015759174024756e-06, + "loss": 0.3266, + "step": 426 + }, + { + "epoch": 1.2993827160493827, + "grad_norm": 0.1334860622882843, + "learning_rate": 2.6797387272967414e-06, + "loss": 0.2262, + "step": 427 + }, + { + "epoch": 1.3024691358024691, + "grad_norm": 0.16829054057598114, + "learning_rate": 2.65795779650105e-06, + "loss": 0.3483, + "step": 428 + }, + { + "epoch": 1.3055555555555556, + "grad_norm": 0.16048014163970947, + "learning_rate": 2.63623365313702e-06, + "loss": 0.3673, + "step": 429 + }, + { + "epoch": 1.308641975308642, + "grad_norm": 0.22250574827194214, + "learning_rate": 2.614566823949348e-06, + "loss": 0.3418, + "step": 430 + }, + { + "epoch": 1.3117283950617284, + "grad_norm": 0.13716565072536469, + "learning_rate": 2.592957834293033e-06, + "loss": 0.2986, + "step": 431 + }, + { + "epoch": 1.3148148148148149, + "grad_norm": 0.15584644675254822, + "learning_rate": 2.5714072081206407e-06, + "loss": 0.3419, + "step": 432 + }, + { + "epoch": 1.3179012345679013, + "grad_norm": 0.17043578624725342, + "learning_rate": 2.5499154679696014e-06, + "loss": 0.3133, + "step": 433 + }, + { + "epoch": 1.3209876543209877, + "grad_norm": 0.1307077258825302, + "learning_rate": 2.528483134949535e-06, + "loss": 0.2484, + "step": 434 + }, + { + "epoch": 1.324074074074074, + "grad_norm": 0.19332851469516754, + "learning_rate": 2.50711072872962e-06, + "loss": 0.338, + "step": 435 + }, + { + "epoch": 1.3271604938271606, + "grad_norm": 0.18752485513687134, + "learning_rate": 2.4857987675259887e-06, + "loss": 0.3693, + "step": 436 + }, + { + "epoch": 1.3302469135802468, + "grad_norm": 0.171221524477005, + "learning_rate": 2.4645477680891734e-06, + "loss": 0.3222, + "step": 437 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.2540048062801361, + "learning_rate": 2.4433582456915556e-06, + "loss": 0.4404, + "step": 438 + }, + { + "epoch": 1.3364197530864197, + "grad_norm": 0.13886091113090515, + "learning_rate": 2.422230714114891e-06, + "loss": 0.3246, + "step": 439 + }, + { + "epoch": 1.3395061728395061, + "grad_norm": 0.11673127859830856, + "learning_rate": 2.4011656856378513e-06, + "loss": 0.1878, + "step": 440 + }, + { + "epoch": 1.3425925925925926, + "grad_norm": 0.20191854238510132, + "learning_rate": 2.3801636710235836e-06, + "loss": 0.2979, + "step": 441 + }, + { + "epoch": 1.345679012345679, + "grad_norm": 0.16786165535449982, + "learning_rate": 2.3592251795073564e-06, + "loss": 0.2931, + "step": 442 + }, + { + "epoch": 1.3487654320987654, + "grad_norm": 0.1304280310869217, + "learning_rate": 2.338350718784177e-06, + "loss": 0.2368, + "step": 443 + }, + { + "epoch": 1.3518518518518519, + "grad_norm": 0.14287714660167694, + "learning_rate": 2.3175407949965167e-06, + "loss": 0.286, + "step": 444 + }, + { + "epoch": 1.3549382716049383, + "grad_norm": 0.13601404428482056, + "learning_rate": 2.296795912722014e-06, + "loss": 0.268, + "step": 445 + }, + { + "epoch": 1.3580246913580247, + "grad_norm": 0.1764301061630249, + "learning_rate": 2.2761165749612417e-06, + "loss": 0.355, + "step": 446 + }, + { + "epoch": 1.3611111111111112, + "grad_norm": 0.1622696816921234, + "learning_rate": 2.25550328312553e-06, + "loss": 0.3438, + "step": 447 + }, + { + "epoch": 1.3641975308641976, + "grad_norm": 0.15518330037593842, + "learning_rate": 2.2349565370247837e-06, + "loss": 0.2844, + "step": 448 + }, + { + "epoch": 1.367283950617284, + "grad_norm": 0.13542047142982483, + "learning_rate": 2.214476834855382e-06, + "loss": 0.324, + "step": 449 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 0.20794177055358887, + "learning_rate": 2.1940646731880887e-06, + "loss": 0.5443, + "step": 450 + }, + { + "epoch": 1.373456790123457, + "grad_norm": 0.1371917873620987, + "learning_rate": 2.173720546956015e-06, + "loss": 0.3663, + "step": 451 + }, + { + "epoch": 1.376543209876543, + "grad_norm": 0.17952483892440796, + "learning_rate": 2.1534449494426203e-06, + "loss": 0.3209, + "step": 452 + }, + { + "epoch": 1.3796296296296298, + "grad_norm": 0.1383998692035675, + "learning_rate": 2.1332383722697483e-06, + "loss": 0.2407, + "step": 453 + }, + { + "epoch": 1.382716049382716, + "grad_norm": 0.17842058837413788, + "learning_rate": 2.1131013053857097e-06, + "loss": 0.5964, + "step": 454 + }, + { + "epoch": 1.3858024691358024, + "grad_norm": 0.13012441992759705, + "learning_rate": 2.0930342370534013e-06, + "loss": 0.2686, + "step": 455 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.1683279275894165, + "learning_rate": 2.073037653838466e-06, + "loss": 0.4134, + "step": 456 + }, + { + "epoch": 1.3919753086419753, + "grad_norm": 0.18860593438148499, + "learning_rate": 2.053112040597495e-06, + "loss": 0.2766, + "step": 457 + }, + { + "epoch": 1.3950617283950617, + "grad_norm": 0.15948981046676636, + "learning_rate": 2.0332578804662783e-06, + "loss": 0.452, + "step": 458 + }, + { + "epoch": 1.3981481481481481, + "grad_norm": 0.13614550232887268, + "learning_rate": 2.013475654848076e-06, + "loss": 0.3028, + "step": 459 + }, + { + "epoch": 1.4012345679012346, + "grad_norm": 0.1575852334499359, + "learning_rate": 1.99376584340196e-06, + "loss": 0.3772, + "step": 460 + }, + { + "epoch": 1.404320987654321, + "grad_norm": 0.1815677434206009, + "learning_rate": 1.9741289240311757e-06, + "loss": 0.4218, + "step": 461 + }, + { + "epoch": 1.4074074074074074, + "grad_norm": 0.16409048438072205, + "learning_rate": 1.954565372871554e-06, + "loss": 0.4449, + "step": 462 + }, + { + "epoch": 1.4104938271604939, + "grad_norm": 0.17997804284095764, + "learning_rate": 1.935075664279978e-06, + "loss": 0.3908, + "step": 463 + }, + { + "epoch": 1.4135802469135803, + "grad_norm": 0.17692823708057404, + "learning_rate": 1.9156602708228584e-06, + "loss": 0.3506, + "step": 464 + }, + { + "epoch": 1.4166666666666667, + "grad_norm": 0.17066018283367157, + "learning_rate": 1.8963196632647008e-06, + "loss": 0.4187, + "step": 465 + }, + { + "epoch": 1.4197530864197532, + "grad_norm": 0.17325402796268463, + "learning_rate": 1.8770543105566752e-06, + "loss": 0.3865, + "step": 466 + }, + { + "epoch": 1.4228395061728394, + "grad_norm": 0.1373230516910553, + "learning_rate": 1.8578646798252432e-06, + "loss": 0.2194, + "step": 467 + }, + { + "epoch": 1.425925925925926, + "grad_norm": 0.14924941956996918, + "learning_rate": 1.8387512363608496e-06, + "loss": 0.3415, + "step": 468 + }, + { + "epoch": 1.4290123456790123, + "grad_norm": 0.15401771664619446, + "learning_rate": 1.8197144436066167e-06, + "loss": 0.3132, + "step": 469 + }, + { + "epoch": 1.4320987654320987, + "grad_norm": 0.24441462755203247, + "learning_rate": 1.8007547631471289e-06, + "loss": 0.365, + "step": 470 + }, + { + "epoch": 1.4351851851851851, + "grad_norm": 0.2641655206680298, + "learning_rate": 1.781872654697226e-06, + "loss": 0.4653, + "step": 471 + }, + { + "epoch": 1.4382716049382716, + "grad_norm": 0.18639406561851501, + "learning_rate": 1.7630685760908623e-06, + "loss": 0.3422, + "step": 472 + }, + { + "epoch": 1.441358024691358, + "grad_norm": 0.14547406136989594, + "learning_rate": 1.7443429832700038e-06, + "loss": 0.3541, + "step": 473 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.179130420088768, + "learning_rate": 1.7256963302735752e-06, + "loss": 0.3341, + "step": 474 + }, + { + "epoch": 1.4475308641975309, + "grad_norm": 0.1942981481552124, + "learning_rate": 1.7071290692264492e-06, + "loss": 0.392, + "step": 475 + }, + { + "epoch": 1.4506172839506173, + "grad_norm": 0.10643615573644638, + "learning_rate": 1.6886416503284835e-06, + "loss": 0.2317, + "step": 476 + }, + { + "epoch": 1.4537037037037037, + "grad_norm": 0.14966462552547455, + "learning_rate": 1.6702345218436066e-06, + "loss": 0.2882, + "step": 477 + }, + { + "epoch": 1.4567901234567902, + "grad_norm": 0.1604948490858078, + "learning_rate": 1.6519081300889472e-06, + "loss": 0.3337, + "step": 478 + }, + { + "epoch": 1.4598765432098766, + "grad_norm": 0.23344826698303223, + "learning_rate": 1.6336629194240118e-06, + "loss": 0.3655, + "step": 479 + }, + { + "epoch": 1.462962962962963, + "grad_norm": 0.1553526222705841, + "learning_rate": 1.6154993322399114e-06, + "loss": 0.316, + "step": 480 + }, + { + "epoch": 1.4660493827160495, + "grad_norm": 0.1312614530324936, + "learning_rate": 1.5974178089486364e-06, + "loss": 0.301, + "step": 481 + }, + { + "epoch": 1.4691358024691357, + "grad_norm": 0.13480979204177856, + "learning_rate": 1.5794187879723755e-06, + "loss": 0.356, + "step": 482 + }, + { + "epoch": 1.4722222222222223, + "grad_norm": 0.14350688457489014, + "learning_rate": 1.561502705732883e-06, + "loss": 0.3021, + "step": 483 + }, + { + "epoch": 1.4753086419753085, + "grad_norm": 0.13871291279792786, + "learning_rate": 1.543669996640908e-06, + "loss": 0.4188, + "step": 484 + }, + { + "epoch": 1.4783950617283952, + "grad_norm": 0.16152562201023102, + "learning_rate": 1.5259210930856423e-06, + "loss": 0.3632, + "step": 485 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.17420196533203125, + "learning_rate": 1.5082564254242583e-06, + "loss": 0.3735, + "step": 486 + }, + { + "epoch": 1.4814814814814814, + "eval_loss": 0.430364727973938, + "eval_runtime": 44.4346, + "eval_samples_per_second": 8.282, + "eval_steps_per_second": 1.035, + "step": 486 + }, + { + "epoch": 1.4845679012345678, + "grad_norm": 0.15298381447792053, + "learning_rate": 1.4906764219714537e-06, + "loss": 0.3162, + "step": 487 + }, + { + "epoch": 1.4876543209876543, + "grad_norm": 0.17767275869846344, + "learning_rate": 1.4731815089890795e-06, + "loss": 0.451, + "step": 488 + }, + { + "epoch": 1.4907407407407407, + "grad_norm": 0.2112477868795395, + "learning_rate": 1.455772110675804e-06, + "loss": 0.3914, + "step": 489 + }, + { + "epoch": 1.4938271604938271, + "grad_norm": 0.18488173186779022, + "learning_rate": 1.438448649156815e-06, + "loss": 0.3242, + "step": 490 + }, + { + "epoch": 1.4969135802469136, + "grad_norm": 0.19138255715370178, + "learning_rate": 1.4212115444736024e-06, + "loss": 0.3273, + "step": 491 + }, + { + "epoch": 1.5, + "grad_norm": 0.17519411444664001, + "learning_rate": 1.4040612145737608e-06, + "loss": 0.314, + "step": 492 + }, + { + "epoch": 1.5030864197530864, + "grad_norm": 0.11331440508365631, + "learning_rate": 1.3869980753008537e-06, + "loss": 0.2184, + "step": 493 + }, + { + "epoch": 1.5061728395061729, + "grad_norm": 0.1674378216266632, + "learning_rate": 1.370022540384347e-06, + "loss": 0.3075, + "step": 494 + }, + { + "epoch": 1.5092592592592593, + "grad_norm": 0.14736564457416534, + "learning_rate": 1.353135021429554e-06, + "loss": 0.3719, + "step": 495 + }, + { + "epoch": 1.5123456790123457, + "grad_norm": 0.14618776738643646, + "learning_rate": 1.3363359279076776e-06, + "loss": 0.3625, + "step": 496 + }, + { + "epoch": 1.515432098765432, + "grad_norm": 0.15497514605522156, + "learning_rate": 1.3196256671458663e-06, + "loss": 0.3522, + "step": 497 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 0.1439277082681656, + "learning_rate": 1.3030046443173445e-06, + "loss": 0.2904, + "step": 498 + }, + { + "epoch": 1.5216049382716048, + "grad_norm": 0.14361339807510376, + "learning_rate": 1.2864732624315867e-06, + "loss": 0.3338, + "step": 499 + }, + { + "epoch": 1.5246913580246915, + "grad_norm": 0.1480712592601776, + "learning_rate": 1.270031922324546e-06, + "loss": 0.4092, + "step": 500 + }, + { + "epoch": 1.5277777777777777, + "grad_norm": 0.156494140625, + "learning_rate": 1.2536810226489354e-06, + "loss": 0.3855, + "step": 501 + }, + { + "epoch": 1.5308641975308643, + "grad_norm": 0.2111222743988037, + "learning_rate": 1.237420959864561e-06, + "loss": 0.4681, + "step": 502 + }, + { + "epoch": 1.5339506172839505, + "grad_norm": 0.20178188383579254, + "learning_rate": 1.2212521282287093e-06, + "loss": 0.3472, + "step": 503 + }, + { + "epoch": 1.5370370370370372, + "grad_norm": 0.14656566083431244, + "learning_rate": 1.2051749197865875e-06, + "loss": 0.2829, + "step": 504 + }, + { + "epoch": 1.5401234567901234, + "grad_norm": 0.17030468583106995, + "learning_rate": 1.1891897243618184e-06, + "loss": 0.457, + "step": 505 + }, + { + "epoch": 1.5432098765432098, + "grad_norm": 0.16490556299686432, + "learning_rate": 1.173296929546987e-06, + "loss": 0.4265, + "step": 506 + }, + { + "epoch": 1.5462962962962963, + "grad_norm": 0.15814335644245148, + "learning_rate": 1.1574969206942443e-06, + "loss": 0.3079, + "step": 507 + }, + { + "epoch": 1.5493827160493827, + "grad_norm": 0.15672267973423004, + "learning_rate": 1.1417900809059623e-06, + "loss": 0.2618, + "step": 508 + }, + { + "epoch": 1.5524691358024691, + "grad_norm": 0.26926475763320923, + "learning_rate": 1.1261767910254422e-06, + "loss": 0.4501, + "step": 509 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.22438615560531616, + "learning_rate": 1.1106574296276923e-06, + "loss": 0.5102, + "step": 510 + }, + { + "epoch": 1.558641975308642, + "grad_norm": 0.16849224269390106, + "learning_rate": 1.095232373010226e-06, + "loss": 0.4356, + "step": 511 + }, + { + "epoch": 1.5617283950617284, + "grad_norm": 0.15593089163303375, + "learning_rate": 1.0799019951839656e-06, + "loss": 0.2973, + "step": 512 + }, + { + "epoch": 1.5648148148148149, + "grad_norm": 0.14039039611816406, + "learning_rate": 1.0646666678641477e-06, + "loss": 0.4104, + "step": 513 + }, + { + "epoch": 1.567901234567901, + "grad_norm": 0.11041123420000076, + "learning_rate": 1.0495267604613273e-06, + "loss": 0.2541, + "step": 514 + }, + { + "epoch": 1.5709876543209877, + "grad_norm": 0.1312185525894165, + "learning_rate": 1.0344826400724185e-06, + "loss": 0.2818, + "step": 515 + }, + { + "epoch": 1.574074074074074, + "grad_norm": 0.20511452853679657, + "learning_rate": 1.0195346714717813e-06, + "loss": 0.3218, + "step": 516 + }, + { + "epoch": 1.5771604938271606, + "grad_norm": 0.2118871957063675, + "learning_rate": 1.0046832171023952e-06, + "loss": 0.2921, + "step": 517 + }, + { + "epoch": 1.5802469135802468, + "grad_norm": 0.18419800698757172, + "learning_rate": 9.899286370670575e-07, + "loss": 0.4502, + "step": 518 + }, + { + "epoch": 1.5833333333333335, + "grad_norm": 0.1755116879940033, + "learning_rate": 9.752712891196558e-07, + "loss": 0.3514, + "step": 519 + }, + { + "epoch": 1.5864197530864197, + "grad_norm": 0.16331788897514343, + "learning_rate": 9.607115286564972e-07, + "loss": 0.318, + "step": 520 + }, + { + "epoch": 1.5895061728395061, + "grad_norm": 0.18510426580905914, + "learning_rate": 9.46249708707681e-07, + "loss": 0.3207, + "step": 521 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.1467633843421936, + "learning_rate": 9.318861799285539e-07, + "loss": 0.32, + "step": 522 + }, + { + "epoch": 1.595679012345679, + "grad_norm": 0.21128030121326447, + "learning_rate": 9.176212905911946e-07, + "loss": 0.4566, + "step": 523 + }, + { + "epoch": 1.5987654320987654, + "grad_norm": 0.14944253861904144, + "learning_rate": 9.034553865759754e-07, + "loss": 0.4221, + "step": 524 + }, + { + "epoch": 1.6018518518518519, + "grad_norm": 0.1913837343454361, + "learning_rate": 8.893888113631732e-07, + "loss": 0.3236, + "step": 525 + }, + { + "epoch": 1.6049382716049383, + "grad_norm": 0.14830860495567322, + "learning_rate": 8.754219060246432e-07, + "loss": 0.3504, + "step": 526 + }, + { + "epoch": 1.6080246913580247, + "grad_norm": 0.1303461194038391, + "learning_rate": 8.615550092155478e-07, + "loss": 0.2281, + "step": 527 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.11773131787776947, + "learning_rate": 8.477884571661449e-07, + "loss": 0.2038, + "step": 528 + }, + { + "epoch": 1.6141975308641974, + "grad_norm": 0.16557615995407104, + "learning_rate": 8.341225836736367e-07, + "loss": 0.2965, + "step": 529 + }, + { + "epoch": 1.617283950617284, + "grad_norm": 0.15140382945537567, + "learning_rate": 8.20557720094074e-07, + "loss": 0.2804, + "step": 530 + }, + { + "epoch": 1.6203703703703702, + "grad_norm": 0.15120923519134521, + "learning_rate": 8.070941953343242e-07, + "loss": 0.3037, + "step": 531 + }, + { + "epoch": 1.623456790123457, + "grad_norm": 0.28693991899490356, + "learning_rate": 7.937323358440935e-07, + "loss": 0.4625, + "step": 532 + }, + { + "epoch": 1.626543209876543, + "grad_norm": 0.226279154419899, + "learning_rate": 7.804724656080182e-07, + "loss": 0.3529, + "step": 533 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 0.14384153485298157, + "learning_rate": 7.673149061377966e-07, + "loss": 0.4064, + "step": 534 + }, + { + "epoch": 1.632716049382716, + "grad_norm": 0.153773695230484, + "learning_rate": 7.542599764644049e-07, + "loss": 0.2779, + "step": 535 + }, + { + "epoch": 1.6358024691358026, + "grad_norm": 0.2235001176595688, + "learning_rate": 7.413079931303591e-07, + "loss": 0.4181, + "step": 536 + }, + { + "epoch": 1.6388888888888888, + "grad_norm": 0.1906222552061081, + "learning_rate": 7.284592701820325e-07, + "loss": 0.2867, + "step": 537 + }, + { + "epoch": 1.6419753086419753, + "grad_norm": 0.189738929271698, + "learning_rate": 7.157141191620548e-07, + "loss": 0.3274, + "step": 538 + }, + { + "epoch": 1.6450617283950617, + "grad_norm": 0.15748707950115204, + "learning_rate": 7.030728491017408e-07, + "loss": 0.2892, + "step": 539 + }, + { + "epoch": 1.6481481481481481, + "grad_norm": 0.2472158521413803, + "learning_rate": 6.905357665136142e-07, + "loss": 0.3892, + "step": 540 + }, + { + "epoch": 1.6512345679012346, + "grad_norm": 0.18736745417118073, + "learning_rate": 6.781031753839662e-07, + "loss": 0.3192, + "step": 541 + }, + { + "epoch": 1.654320987654321, + "grad_norm": 0.15377798676490784, + "learning_rate": 6.657753771654812e-07, + "loss": 0.2991, + "step": 542 + }, + { + "epoch": 1.6574074074074074, + "grad_norm": 0.16992682218551636, + "learning_rate": 6.535526707699408e-07, + "loss": 0.3628, + "step": 543 + }, + { + "epoch": 1.6604938271604939, + "grad_norm": 0.201069176197052, + "learning_rate": 6.414353525609628e-07, + "loss": 0.3127, + "step": 544 + }, + { + "epoch": 1.6635802469135803, + "grad_norm": 0.14373762905597687, + "learning_rate": 6.294237163468231e-07, + "loss": 0.2488, + "step": 545 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.16759946942329407, + "learning_rate": 6.175180533733277e-07, + "loss": 0.3833, + "step": 546 + }, + { + "epoch": 1.6697530864197532, + "grad_norm": 0.2061176598072052, + "learning_rate": 6.057186523167529e-07, + "loss": 0.252, + "step": 547 + }, + { + "epoch": 1.6728395061728394, + "grad_norm": 0.18383823335170746, + "learning_rate": 5.940257992768456e-07, + "loss": 0.3677, + "step": 548 + }, + { + "epoch": 1.675925925925926, + "grad_norm": 0.2329624891281128, + "learning_rate": 5.824397777698859e-07, + "loss": 0.3821, + "step": 549 + }, + { + "epoch": 1.6790123456790123, + "grad_norm": 0.16050845384597778, + "learning_rate": 5.709608687218116e-07, + "loss": 0.3203, + "step": 550 + }, + { + "epoch": 1.682098765432099, + "grad_norm": 0.1575547456741333, + "learning_rate": 5.595893504614097e-07, + "loss": 0.4154, + "step": 551 + }, + { + "epoch": 1.6851851851851851, + "grad_norm": 0.14166632294654846, + "learning_rate": 5.483254987135644e-07, + "loss": 0.2528, + "step": 552 + }, + { + "epoch": 1.6882716049382716, + "grad_norm": 0.1413419544696808, + "learning_rate": 5.371695865925736e-07, + "loss": 0.2011, + "step": 553 + }, + { + "epoch": 1.691358024691358, + "grad_norm": 0.14001396298408508, + "learning_rate": 5.261218845955246e-07, + "loss": 0.2521, + "step": 554 + }, + { + "epoch": 1.6944444444444444, + "grad_norm": 0.2379157692193985, + "learning_rate": 5.151826605957394e-07, + "loss": 0.3396, + "step": 555 + }, + { + "epoch": 1.6975308641975309, + "grad_norm": 0.1787138283252716, + "learning_rate": 5.043521798362755e-07, + "loss": 0.2596, + "step": 556 + }, + { + "epoch": 1.7006172839506173, + "grad_norm": 0.41910964250564575, + "learning_rate": 4.936307049234956e-07, + "loss": 0.3327, + "step": 557 + }, + { + "epoch": 1.7037037037037037, + "grad_norm": 0.1860780268907547, + "learning_rate": 4.830184958207007e-07, + "loss": 0.399, + "step": 558 + }, + { + "epoch": 1.7067901234567902, + "grad_norm": 0.16398878395557404, + "learning_rate": 4.725158098418309e-07, + "loss": 0.3953, + "step": 559 + }, + { + "epoch": 1.7098765432098766, + "grad_norm": 0.1744304746389389, + "learning_rate": 4.6212290164521554e-07, + "loss": 0.2567, + "step": 560 + }, + { + "epoch": 1.7129629629629628, + "grad_norm": 0.19683323800563812, + "learning_rate": 4.5184002322740784e-07, + "loss": 0.4327, + "step": 561 + }, + { + "epoch": 1.7160493827160495, + "grad_norm": 0.17663246393203735, + "learning_rate": 4.4166742391707593e-07, + "loss": 0.2145, + "step": 562 + }, + { + "epoch": 1.7191358024691357, + "grad_norm": 0.16606709361076355, + "learning_rate": 4.316053503689466e-07, + "loss": 0.3419, + "step": 563 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.21532438695430756, + "learning_rate": 4.2165404655783836e-07, + "loss": 0.379, + "step": 564 + }, + { + "epoch": 1.7253086419753085, + "grad_norm": 0.1450224667787552, + "learning_rate": 4.1181375377273237e-07, + "loss": 0.19, + "step": 565 + }, + { + "epoch": 1.7283950617283952, + "grad_norm": 0.18900087475776672, + "learning_rate": 4.020847106109349e-07, + "loss": 0.3304, + "step": 566 + }, + { + "epoch": 1.7314814814814814, + "grad_norm": 0.1328793317079544, + "learning_rate": 3.9246715297228176e-07, + "loss": 0.283, + "step": 567 + }, + { + "epoch": 1.7314814814814814, + "eval_loss": 0.42760223150253296, + "eval_runtime": 44.2033, + "eval_samples_per_second": 8.325, + "eval_steps_per_second": 1.041, + "step": 567 + }, + { + "epoch": 1.734567901234568, + "grad_norm": 0.14145122468471527, + "learning_rate": 3.829613140534222e-07, + "loss": 0.3045, + "step": 568 + }, + { + "epoch": 1.7376543209876543, + "grad_norm": 0.1800602227449417, + "learning_rate": 3.7356742434216775e-07, + "loss": 0.2553, + "step": 569 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.18250073492527008, + "learning_rate": 3.642857116118986e-07, + "loss": 0.23, + "step": 570 + }, + { + "epoch": 1.7438271604938271, + "grad_norm": 0.14363303780555725, + "learning_rate": 3.5511640091604293e-07, + "loss": 0.2744, + "step": 571 + }, + { + "epoch": 1.7469135802469136, + "grad_norm": 0.16794289648532867, + "learning_rate": 3.4605971458262e-07, + "loss": 0.3806, + "step": 572 + }, + { + "epoch": 1.75, + "grad_norm": 0.15108714997768402, + "learning_rate": 3.371158722088497e-07, + "loss": 0.2868, + "step": 573 + }, + { + "epoch": 1.7530864197530864, + "grad_norm": 0.2250644415616989, + "learning_rate": 3.2828509065582713e-07, + "loss": 0.4173, + "step": 574 + }, + { + "epoch": 1.7561728395061729, + "grad_norm": 0.16634950041770935, + "learning_rate": 3.195675840432655e-07, + "loss": 0.3429, + "step": 575 + }, + { + "epoch": 1.7592592592592593, + "grad_norm": 0.3840501010417938, + "learning_rate": 3.109635637443026e-07, + "loss": 0.3564, + "step": 576 + }, + { + "epoch": 1.7623456790123457, + "grad_norm": 0.1317005604505539, + "learning_rate": 3.02473238380378e-07, + "loss": 0.2571, + "step": 577 + }, + { + "epoch": 1.765432098765432, + "grad_norm": 0.16465657949447632, + "learning_rate": 2.9409681381617315e-07, + "loss": 0.3739, + "step": 578 + }, + { + "epoch": 1.7685185185185186, + "grad_norm": 0.14124394953250885, + "learning_rate": 2.858344931546181e-07, + "loss": 0.2025, + "step": 579 + }, + { + "epoch": 1.7716049382716048, + "grad_norm": 0.19090065360069275, + "learning_rate": 2.776864767319731e-07, + "loss": 0.3652, + "step": 580 + }, + { + "epoch": 1.7746913580246915, + "grad_norm": 0.16761578619480133, + "learning_rate": 2.696529621129618e-07, + "loss": 0.3257, + "step": 581 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.17358000576496124, + "learning_rate": 2.617341440859883e-07, + "loss": 0.3162, + "step": 582 + }, + { + "epoch": 1.7808641975308643, + "grad_norm": 0.13688547909259796, + "learning_rate": 2.539302146584116e-07, + "loss": 0.2838, + "step": 583 + }, + { + "epoch": 1.7839506172839505, + "grad_norm": 0.12233246117830276, + "learning_rate": 2.4624136305188895e-07, + "loss": 0.2656, + "step": 584 + }, + { + "epoch": 1.7870370370370372, + "grad_norm": 0.14487585425376892, + "learning_rate": 2.3866777569779234e-07, + "loss": 0.2808, + "step": 585 + }, + { + "epoch": 1.7901234567901234, + "grad_norm": 0.1593523919582367, + "learning_rate": 2.3120963623267822e-07, + "loss": 0.3441, + "step": 586 + }, + { + "epoch": 1.7932098765432098, + "grad_norm": 0.1122526079416275, + "learning_rate": 2.2386712549384848e-07, + "loss": 0.1452, + "step": 587 + }, + { + "epoch": 1.7962962962962963, + "grad_norm": 0.1848554015159607, + "learning_rate": 2.1664042151495424e-07, + "loss": 0.407, + "step": 588 + }, + { + "epoch": 1.7993827160493827, + "grad_norm": 0.17059315741062164, + "learning_rate": 2.095296995216828e-07, + "loss": 0.3516, + "step": 589 + }, + { + "epoch": 1.8024691358024691, + "grad_norm": 0.18412597477436066, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.2922, + "step": 590 + }, + { + "epoch": 1.8055555555555556, + "grad_norm": 0.17134982347488403, + "learning_rate": 1.9565688832952846e-07, + "loss": 0.2951, + "step": 591 + }, + { + "epoch": 1.808641975308642, + "grad_norm": 0.11777715384960175, + "learning_rate": 1.8889513550430892e-07, + "loss": 0.24, + "step": 592 + }, + { + "epoch": 1.8117283950617284, + "grad_norm": 0.18584772944450378, + "learning_rate": 1.8225003740388546e-07, + "loss": 0.3498, + "step": 593 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 0.15893200039863586, + "learning_rate": 1.7572175515176538e-07, + "loss": 0.3392, + "step": 594 + }, + { + "epoch": 1.817901234567901, + "grad_norm": 0.152305468916893, + "learning_rate": 1.693104470390261e-07, + "loss": 0.2333, + "step": 595 + }, + { + "epoch": 1.8209876543209877, + "grad_norm": 0.15064826607704163, + "learning_rate": 1.6301626852047504e-07, + "loss": 0.2935, + "step": 596 + }, + { + "epoch": 1.824074074074074, + "grad_norm": 0.18689890205860138, + "learning_rate": 1.5683937221088242e-07, + "loss": 0.4082, + "step": 597 + }, + { + "epoch": 1.8271604938271606, + "grad_norm": 0.16067026555538177, + "learning_rate": 1.5077990788127993e-07, + "loss": 0.2624, + "step": 598 + }, + { + "epoch": 1.8302469135802468, + "grad_norm": 0.15756982564926147, + "learning_rate": 1.448380224553303e-07, + "loss": 0.3681, + "step": 599 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.16193000972270966, + "learning_rate": 1.3901386000576112e-07, + "loss": 0.5148, + "step": 600 + }, + { + "epoch": 1.8364197530864197, + "grad_norm": 0.1545064002275467, + "learning_rate": 1.3330756175087778e-07, + "loss": 0.2837, + "step": 601 + }, + { + "epoch": 1.8395061728395061, + "grad_norm": 0.1584656536579132, + "learning_rate": 1.2771926605113283e-07, + "loss": 0.267, + "step": 602 + }, + { + "epoch": 1.8425925925925926, + "grad_norm": 0.23085588216781616, + "learning_rate": 1.2224910840577642e-07, + "loss": 0.3637, + "step": 603 + }, + { + "epoch": 1.845679012345679, + "grad_norm": 0.15698540210723877, + "learning_rate": 1.1689722144956672e-07, + "loss": 0.2152, + "step": 604 + }, + { + "epoch": 1.8487654320987654, + "grad_norm": 0.1545877605676651, + "learning_rate": 1.1166373494955696e-07, + "loss": 0.3073, + "step": 605 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 0.16467563807964325, + "learning_rate": 1.06548775801949e-07, + "loss": 0.3654, + "step": 606 + }, + { + "epoch": 1.8549382716049383, + "grad_norm": 0.20076429843902588, + "learning_rate": 1.0155246802901198e-07, + "loss": 0.3131, + "step": 607 + }, + { + "epoch": 1.8580246913580247, + "grad_norm": 0.14146511256694794, + "learning_rate": 9.667493277608187e-08, + "loss": 0.3651, + "step": 608 + }, + { + "epoch": 1.8611111111111112, + "grad_norm": 0.15111708641052246, + "learning_rate": 9.191628830861832e-08, + "loss": 0.267, + "step": 609 + }, + { + "epoch": 1.8641975308641974, + "grad_norm": 0.13036541640758514, + "learning_rate": 8.727665000934027e-08, + "loss": 0.2568, + "step": 610 + }, + { + "epoch": 1.867283950617284, + "grad_norm": 0.16827543079853058, + "learning_rate": 8.275613037542873e-08, + "loss": 0.4188, + "step": 611 + }, + { + "epoch": 1.8703703703703702, + "grad_norm": 0.18110865354537964, + "learning_rate": 7.835483901579454e-08, + "loss": 0.3361, + "step": 612 + }, + { + "epoch": 1.873456790123457, + "grad_norm": 0.1515679508447647, + "learning_rate": 7.407288264842772e-08, + "loss": 0.3421, + "step": 613 + }, + { + "epoch": 1.876543209876543, + "grad_norm": 0.1735447645187378, + "learning_rate": 6.991036509780391e-08, + "loss": 0.3908, + "step": 614 + }, + { + "epoch": 1.8796296296296298, + "grad_norm": 0.15131166577339172, + "learning_rate": 6.58673872923693e-08, + "loss": 0.2439, + "step": 615 + }, + { + "epoch": 1.882716049382716, + "grad_norm": 0.12076130509376526, + "learning_rate": 6.194404726209358e-08, + "loss": 0.2178, + "step": 616 + }, + { + "epoch": 1.8858024691358026, + "grad_norm": 0.1315135806798935, + "learning_rate": 5.8140440136091326e-08, + "loss": 0.2291, + "step": 617 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.17915165424346924, + "learning_rate": 5.445665814031942e-08, + "loss": 0.2377, + "step": 618 + }, + { + "epoch": 1.8919753086419753, + "grad_norm": 0.14008641242980957, + "learning_rate": 5.089279059533658e-08, + "loss": 0.2266, + "step": 619 + }, + { + "epoch": 1.8950617283950617, + "grad_norm": 0.18772335350513458, + "learning_rate": 4.744892391413791e-08, + "loss": 0.4006, + "step": 620 + }, + { + "epoch": 1.8981481481481481, + "grad_norm": 0.14937154948711395, + "learning_rate": 4.412514160006376e-08, + "loss": 0.3891, + "step": 621 + }, + { + "epoch": 1.9012345679012346, + "grad_norm": 0.12767252326011658, + "learning_rate": 4.092152424477025e-08, + "loss": 0.2397, + "step": 622 + }, + { + "epoch": 1.904320987654321, + "grad_norm": 0.16874873638153076, + "learning_rate": 3.7838149526277514e-08, + "loss": 0.3338, + "step": 623 + }, + { + "epoch": 1.9074074074074074, + "grad_norm": 0.1845911145210266, + "learning_rate": 3.487509220708563e-08, + "loss": 0.4378, + "step": 624 + }, + { + "epoch": 1.9104938271604939, + "grad_norm": 0.14064140617847443, + "learning_rate": 3.2032424132362736e-08, + "loss": 0.2801, + "step": 625 + }, + { + "epoch": 1.9135802469135803, + "grad_norm": 0.14805810153484344, + "learning_rate": 2.9310214228202016e-08, + "loss": 0.3122, + "step": 626 + }, + { + "epoch": 1.9166666666666665, + "grad_norm": 0.1921551674604416, + "learning_rate": 2.6708528499950758e-08, + "loss": 0.2982, + "step": 627 + }, + { + "epoch": 1.9197530864197532, + "grad_norm": 0.14775682985782623, + "learning_rate": 2.4227430030609455e-08, + "loss": 0.3503, + "step": 628 + }, + { + "epoch": 1.9228395061728394, + "grad_norm": 0.17906314134597778, + "learning_rate": 2.1866978979303567e-08, + "loss": 0.3863, + "step": 629 + }, + { + "epoch": 1.925925925925926, + "grad_norm": 0.1467551589012146, + "learning_rate": 1.962723257982302e-08, + "loss": 0.2993, + "step": 630 + }, + { + "epoch": 1.9290123456790123, + "grad_norm": 0.2205621749162674, + "learning_rate": 1.7508245139236658e-08, + "loss": 0.3168, + "step": 631 + }, + { + "epoch": 1.932098765432099, + "grad_norm": 0.1704474836587906, + "learning_rate": 1.5510068036573288e-08, + "loss": 0.3177, + "step": 632 + }, + { + "epoch": 1.9351851851851851, + "grad_norm": 0.15591393411159515, + "learning_rate": 1.3632749721577132e-08, + "loss": 0.2671, + "step": 633 + }, + { + "epoch": 1.9382716049382716, + "grad_norm": 0.1339595913887024, + "learning_rate": 1.1876335713532638e-08, + "loss": 0.196, + "step": 634 + }, + { + "epoch": 1.941358024691358, + "grad_norm": 0.15144091844558716, + "learning_rate": 1.024086860016149e-08, + "loss": 0.306, + "step": 635 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.14868693053722382, + "learning_rate": 8.726388036587874e-09, + "loss": 0.271, + "step": 636 + }, + { + "epoch": 1.9475308641975309, + "grad_norm": 0.14298443496227264, + "learning_rate": 7.332930744380906e-09, + "loss": 0.225, + "step": 637 + }, + { + "epoch": 1.9506172839506173, + "grad_norm": 0.14053991436958313, + "learning_rate": 6.060530510659246e-09, + "loss": 0.32, + "step": 638 + }, + { + "epoch": 1.9537037037037037, + "grad_norm": 0.2039446085691452, + "learning_rate": 4.909218187276743e-09, + "loss": 0.4306, + "step": 639 + }, + { + "epoch": 1.9567901234567902, + "grad_norm": 0.20658931136131287, + "learning_rate": 3.8790216900702615e-09, + "loss": 0.4053, + "step": 640 + }, + { + "epoch": 1.9598765432098766, + "grad_norm": 0.30260926485061646, + "learning_rate": 2.9699659981863306e-09, + "loss": 0.3979, + "step": 641 + }, + { + "epoch": 1.9629629629629628, + "grad_norm": 0.1412692815065384, + "learning_rate": 2.182073153471631e-09, + "loss": 0.1879, + "step": 642 + }, + { + "epoch": 1.9660493827160495, + "grad_norm": 0.11770602315664291, + "learning_rate": 1.5153622599428652e-09, + "loss": 0.2462, + "step": 643 + }, + { + "epoch": 1.9691358024691357, + "grad_norm": 0.156539648771286, + "learning_rate": 9.698494833199068e-10, + "loss": 0.3218, + "step": 644 + }, + { + "epoch": 1.9722222222222223, + "grad_norm": 0.19168072938919067, + "learning_rate": 5.455480506355582e-10, + "loss": 0.4821, + "step": 645 + }, + { + "epoch": 1.9753086419753085, + "grad_norm": 0.13230177760124207, + "learning_rate": 2.4246824991525085e-10, + "loss": 0.3134, + "step": 646 + }, + { + "epoch": 1.9783950617283952, + "grad_norm": 0.1942073255777359, + "learning_rate": 6.061742992613529e-11, + "loss": 0.3413, + "step": 647 + }, + { + "epoch": 1.9814814814814814, + "grad_norm": 0.15652911365032196, + "learning_rate": 0.0, + "loss": 0.2942, + "step": 648 + }, + { + "epoch": 1.9814814814814814, + "eval_loss": 0.42709851264953613, + "eval_runtime": 44.317, + "eval_samples_per_second": 8.304, + "eval_steps_per_second": 1.038, + "step": 648 + } + ], + "logging_steps": 1, + "max_steps": 648, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 162, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.584525189221712e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}