|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9814814814814814, |
|
"eval_steps": 81, |
|
"global_step": 648, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0030864197530864196, |
|
"grad_norm": 0.11897344887256622, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.6253, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0030864197530864196, |
|
"eval_loss": 0.6252603530883789, |
|
"eval_runtime": 44.2936, |
|
"eval_samples_per_second": 8.308, |
|
"eval_steps_per_second": 1.039, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006172839506172839, |
|
"grad_norm": 0.11417510360479355, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.6376, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.009259259259259259, |
|
"grad_norm": 0.0693814605474472, |
|
"learning_rate": 3e-06, |
|
"loss": 0.2684, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.012345679012345678, |
|
"grad_norm": 0.1110842302441597, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.5096, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.015432098765432098, |
|
"grad_norm": 0.09205043315887451, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5674, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.018518518518518517, |
|
"grad_norm": 0.1063380092382431, |
|
"learning_rate": 6e-06, |
|
"loss": 0.6219, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.021604938271604937, |
|
"grad_norm": 0.0740552470088005, |
|
"learning_rate": 7e-06, |
|
"loss": 0.5478, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.024691358024691357, |
|
"grad_norm": 0.10674550384283066, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.6168, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.027777777777777776, |
|
"grad_norm": 0.1061239168047905, |
|
"learning_rate": 9e-06, |
|
"loss": 0.7106, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.030864197530864196, |
|
"grad_norm": 0.10123332589864731, |
|
"learning_rate": 1e-05, |
|
"loss": 0.5221, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.033950617283950615, |
|
"grad_norm": 0.06680818647146225, |
|
"learning_rate": 9.999939382570075e-06, |
|
"loss": 0.2592, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.037037037037037035, |
|
"grad_norm": 0.09670277684926987, |
|
"learning_rate": 9.999757531750086e-06, |
|
"loss": 0.5183, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.040123456790123455, |
|
"grad_norm": 0.07567557692527771, |
|
"learning_rate": 9.999454451949364e-06, |
|
"loss": 0.3257, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.043209876543209874, |
|
"grad_norm": 0.10101059824228287, |
|
"learning_rate": 9.999030150516681e-06, |
|
"loss": 0.4788, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.046296296296296294, |
|
"grad_norm": 0.1238669604063034, |
|
"learning_rate": 9.998484637740058e-06, |
|
"loss": 0.6218, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04938271604938271, |
|
"grad_norm": 0.10699903219938278, |
|
"learning_rate": 9.997817926846528e-06, |
|
"loss": 0.6429, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.05246913580246913, |
|
"grad_norm": 0.08470468968153, |
|
"learning_rate": 9.997030034001815e-06, |
|
"loss": 0.3134, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.05555555555555555, |
|
"grad_norm": 0.1229688748717308, |
|
"learning_rate": 9.99612097830993e-06, |
|
"loss": 0.712, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05864197530864197, |
|
"grad_norm": 0.10526233166456223, |
|
"learning_rate": 9.995090781812724e-06, |
|
"loss": 0.504, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.06172839506172839, |
|
"grad_norm": 0.11165868490934372, |
|
"learning_rate": 9.993939469489342e-06, |
|
"loss": 0.5122, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06481481481481481, |
|
"grad_norm": 0.09065920859575272, |
|
"learning_rate": 9.99266706925562e-06, |
|
"loss": 0.4664, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.06790123456790123, |
|
"grad_norm": 0.10060250014066696, |
|
"learning_rate": 9.991273611963413e-06, |
|
"loss": 0.4732, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.07098765432098765, |
|
"grad_norm": 0.10402392596006393, |
|
"learning_rate": 9.98975913139984e-06, |
|
"loss": 0.4899, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 0.11345162242650986, |
|
"learning_rate": 9.98812366428647e-06, |
|
"loss": 0.5365, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.07716049382716049, |
|
"grad_norm": 0.1189904510974884, |
|
"learning_rate": 9.986367250278423e-06, |
|
"loss": 0.6293, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08024691358024691, |
|
"grad_norm": 0.11722761392593384, |
|
"learning_rate": 9.984489931963429e-06, |
|
"loss": 0.4991, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.08333333333333333, |
|
"grad_norm": 0.08803360909223557, |
|
"learning_rate": 9.982491754860763e-06, |
|
"loss": 0.381, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.08641975308641975, |
|
"grad_norm": 0.11037921905517578, |
|
"learning_rate": 9.980372767420179e-06, |
|
"loss": 0.5814, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.08950617283950617, |
|
"grad_norm": 0.0851665586233139, |
|
"learning_rate": 9.978133021020697e-06, |
|
"loss": 0.3629, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.09259259259259259, |
|
"grad_norm": 0.10195960849523544, |
|
"learning_rate": 9.97577256996939e-06, |
|
"loss": 0.5672, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09567901234567901, |
|
"grad_norm": 0.12112904340028763, |
|
"learning_rate": 9.97329147150005e-06, |
|
"loss": 0.6165, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.09876543209876543, |
|
"grad_norm": 0.07611838728189468, |
|
"learning_rate": 9.970689785771798e-06, |
|
"loss": 0.3902, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.10185185185185185, |
|
"grad_norm": 0.1013374775648117, |
|
"learning_rate": 9.96796757586764e-06, |
|
"loss": 0.5096, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.10493827160493827, |
|
"grad_norm": 0.08809865266084671, |
|
"learning_rate": 9.965124907792916e-06, |
|
"loss": 0.3333, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.10802469135802469, |
|
"grad_norm": 0.0764087364077568, |
|
"learning_rate": 9.962161850473723e-06, |
|
"loss": 0.3461, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1111111111111111, |
|
"grad_norm": 0.0995788499712944, |
|
"learning_rate": 9.95907847575523e-06, |
|
"loss": 0.4225, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.11419753086419752, |
|
"grad_norm": 0.11751396954059601, |
|
"learning_rate": 9.955874858399936e-06, |
|
"loss": 0.4991, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.11728395061728394, |
|
"grad_norm": 0.10502217710018158, |
|
"learning_rate": 9.952551076085864e-06, |
|
"loss": 0.5847, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.12037037037037036, |
|
"grad_norm": 0.1077880784869194, |
|
"learning_rate": 9.949107209404664e-06, |
|
"loss": 0.4901, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.12345679012345678, |
|
"grad_norm": 0.08844556659460068, |
|
"learning_rate": 9.945543341859681e-06, |
|
"loss": 0.5752, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12654320987654322, |
|
"grad_norm": 0.10771756619215012, |
|
"learning_rate": 9.94185955986391e-06, |
|
"loss": 0.5393, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.12962962962962962, |
|
"grad_norm": 0.07496192306280136, |
|
"learning_rate": 9.938055952737908e-06, |
|
"loss": 0.3334, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.13271604938271606, |
|
"grad_norm": 0.106163389980793, |
|
"learning_rate": 9.934132612707631e-06, |
|
"loss": 0.5319, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.13580246913580246, |
|
"grad_norm": 0.09276831895112991, |
|
"learning_rate": 9.930089634902197e-06, |
|
"loss": 0.486, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.1388888888888889, |
|
"grad_norm": 0.09449384361505508, |
|
"learning_rate": 9.925927117351573e-06, |
|
"loss": 0.3858, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1419753086419753, |
|
"grad_norm": 0.07955848425626755, |
|
"learning_rate": 9.921645160984205e-06, |
|
"loss": 0.4648, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.14506172839506173, |
|
"grad_norm": 0.10575301945209503, |
|
"learning_rate": 9.917243869624573e-06, |
|
"loss": 0.4704, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.0714716911315918, |
|
"learning_rate": 9.91272334999066e-06, |
|
"loss": 0.372, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.15123456790123457, |
|
"grad_norm": 0.08894475549459457, |
|
"learning_rate": 9.908083711691383e-06, |
|
"loss": 0.5005, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.15432098765432098, |
|
"grad_norm": 0.0800170972943306, |
|
"learning_rate": 9.903325067223918e-06, |
|
"loss": 0.3688, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1574074074074074, |
|
"grad_norm": 0.09310433268547058, |
|
"learning_rate": 9.898447531970989e-06, |
|
"loss": 0.5127, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.16049382716049382, |
|
"grad_norm": 0.07690192013978958, |
|
"learning_rate": 9.893451224198051e-06, |
|
"loss": 0.2993, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.16358024691358025, |
|
"grad_norm": 0.08025282621383667, |
|
"learning_rate": 9.888336265050443e-06, |
|
"loss": 0.4004, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 0.06500386446714401, |
|
"learning_rate": 9.883102778550434e-06, |
|
"loss": 0.3317, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1697530864197531, |
|
"grad_norm": 0.07926575839519501, |
|
"learning_rate": 9.877750891594224e-06, |
|
"loss": 0.3606, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1728395061728395, |
|
"grad_norm": 0.07245253026485443, |
|
"learning_rate": 9.872280733948867e-06, |
|
"loss": 0.4437, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.17592592592592593, |
|
"grad_norm": 0.07353054732084274, |
|
"learning_rate": 9.866692438249124e-06, |
|
"loss": 0.36, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.17901234567901234, |
|
"grad_norm": 0.09307980537414551, |
|
"learning_rate": 9.86098613999424e-06, |
|
"loss": 0.5175, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.18209876543209877, |
|
"grad_norm": 0.07782690227031708, |
|
"learning_rate": 9.855161977544672e-06, |
|
"loss": 0.4332, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.18518518518518517, |
|
"grad_norm": 0.06865860521793365, |
|
"learning_rate": 9.849220092118721e-06, |
|
"loss": 0.3464, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1882716049382716, |
|
"grad_norm": 0.0760008841753006, |
|
"learning_rate": 9.84316062778912e-06, |
|
"loss": 0.3808, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.19135802469135801, |
|
"grad_norm": 0.07834326475858688, |
|
"learning_rate": 9.836983731479526e-06, |
|
"loss": 0.499, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.19444444444444445, |
|
"grad_norm": 0.08240173012018204, |
|
"learning_rate": 9.830689552960974e-06, |
|
"loss": 0.4432, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.19753086419753085, |
|
"grad_norm": 0.06976404786109924, |
|
"learning_rate": 9.824278244848236e-06, |
|
"loss": 0.3482, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.2006172839506173, |
|
"grad_norm": 0.09335274249315262, |
|
"learning_rate": 9.817749962596115e-06, |
|
"loss": 0.4533, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2037037037037037, |
|
"grad_norm": 0.10973995178937912, |
|
"learning_rate": 9.811104864495691e-06, |
|
"loss": 0.6042, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.20679012345679013, |
|
"grad_norm": 0.08284437656402588, |
|
"learning_rate": 9.804343111670472e-06, |
|
"loss": 0.4818, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.20987654320987653, |
|
"grad_norm": 0.08448096364736557, |
|
"learning_rate": 9.797464868072489e-06, |
|
"loss": 0.518, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.21296296296296297, |
|
"grad_norm": 0.07667321711778641, |
|
"learning_rate": 9.790470300478318e-06, |
|
"loss": 0.3757, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.21604938271604937, |
|
"grad_norm": 0.0944654569029808, |
|
"learning_rate": 9.783359578485047e-06, |
|
"loss": 0.4863, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2191358024691358, |
|
"grad_norm": 0.07617281377315521, |
|
"learning_rate": 9.776132874506153e-06, |
|
"loss": 0.3484, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.09038567543029785, |
|
"learning_rate": 9.768790363767321e-06, |
|
"loss": 0.596, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.22530864197530864, |
|
"grad_norm": 0.0843636766076088, |
|
"learning_rate": 9.761332224302209e-06, |
|
"loss": 0.4042, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.22839506172839505, |
|
"grad_norm": 0.09003959596157074, |
|
"learning_rate": 9.753758636948112e-06, |
|
"loss": 0.5011, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.23148148148148148, |
|
"grad_norm": 0.079057976603508, |
|
"learning_rate": 9.74606978534159e-06, |
|
"loss": 0.4703, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2345679012345679, |
|
"grad_norm": 0.07765232026576996, |
|
"learning_rate": 9.738265855914014e-06, |
|
"loss": 0.3294, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.23765432098765432, |
|
"grad_norm": 0.07654544711112976, |
|
"learning_rate": 9.730347037887041e-06, |
|
"loss": 0.4039, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.24074074074074073, |
|
"grad_norm": 0.05925621837377548, |
|
"learning_rate": 9.722313523268028e-06, |
|
"loss": 0.2295, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.24382716049382716, |
|
"grad_norm": 0.07830403745174408, |
|
"learning_rate": 9.714165506845381e-06, |
|
"loss": 0.3721, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.24691358024691357, |
|
"grad_norm": 0.09928114712238312, |
|
"learning_rate": 9.705903186183828e-06, |
|
"loss": 0.5154, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.06352175772190094, |
|
"learning_rate": 9.697526761619621e-06, |
|
"loss": 0.2613, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 0.5444870591163635, |
|
"eval_runtime": 44.3715, |
|
"eval_samples_per_second": 8.294, |
|
"eval_steps_per_second": 1.037, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.25308641975308643, |
|
"grad_norm": 0.07308296114206314, |
|
"learning_rate": 9.689036436255698e-06, |
|
"loss": 0.3455, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.25617283950617287, |
|
"grad_norm": 0.07788842916488647, |
|
"learning_rate": 9.680432415956736e-06, |
|
"loss": 0.4675, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.25925925925925924, |
|
"grad_norm": 0.09506388008594513, |
|
"learning_rate": 9.671714909344175e-06, |
|
"loss": 0.5544, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2623456790123457, |
|
"grad_norm": 0.08810863643884659, |
|
"learning_rate": 9.66288412779115e-06, |
|
"loss": 0.497, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2654320987654321, |
|
"grad_norm": 0.06235141307115555, |
|
"learning_rate": 9.653940285417381e-06, |
|
"loss": 0.2775, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.26851851851851855, |
|
"grad_norm": 0.07534658908843994, |
|
"learning_rate": 9.644883599083959e-06, |
|
"loss": 0.3706, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2716049382716049, |
|
"grad_norm": 0.11235971748828888, |
|
"learning_rate": 9.635714288388103e-06, |
|
"loss": 0.6166, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.27469135802469136, |
|
"grad_norm": 0.07352706789970398, |
|
"learning_rate": 9.626432575657834e-06, |
|
"loss": 0.4254, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"grad_norm": 0.10939712822437286, |
|
"learning_rate": 9.617038685946578e-06, |
|
"loss": 0.3768, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2808641975308642, |
|
"grad_norm": 0.0766228511929512, |
|
"learning_rate": 9.60753284702772e-06, |
|
"loss": 0.3562, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.2839506172839506, |
|
"grad_norm": 0.08354140818119049, |
|
"learning_rate": 9.597915289389067e-06, |
|
"loss": 0.4783, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.28703703703703703, |
|
"grad_norm": 0.08200543373823166, |
|
"learning_rate": 9.58818624622727e-06, |
|
"loss": 0.3947, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.29012345679012347, |
|
"grad_norm": 0.08410683274269104, |
|
"learning_rate": 9.578345953442163e-06, |
|
"loss": 0.5048, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2932098765432099, |
|
"grad_norm": 0.1019473522901535, |
|
"learning_rate": 9.568394649631055e-06, |
|
"loss": 0.5842, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.08855041116476059, |
|
"learning_rate": 9.558332576082925e-06, |
|
"loss": 0.4176, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2993827160493827, |
|
"grad_norm": 0.08165948837995529, |
|
"learning_rate": 9.548159976772593e-06, |
|
"loss": 0.4098, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.30246913580246915, |
|
"grad_norm": 0.07580746710300446, |
|
"learning_rate": 9.537877098354787e-06, |
|
"loss": 0.3886, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.3055555555555556, |
|
"grad_norm": 0.0938824713230133, |
|
"learning_rate": 9.527484190158171e-06, |
|
"loss": 0.4551, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.30864197530864196, |
|
"grad_norm": 0.07878723740577698, |
|
"learning_rate": 9.5169815041793e-06, |
|
"loss": 0.4042, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3117283950617284, |
|
"grad_norm": 0.07207982987165451, |
|
"learning_rate": 9.506369295076505e-06, |
|
"loss": 0.3541, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.3148148148148148, |
|
"grad_norm": 0.06538520753383636, |
|
"learning_rate": 9.495647820163725e-06, |
|
"loss": 0.2972, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.31790123456790126, |
|
"grad_norm": 0.08196717500686646, |
|
"learning_rate": 9.484817339404261e-06, |
|
"loss": 0.401, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.32098765432098764, |
|
"grad_norm": 0.07677263766527176, |
|
"learning_rate": 9.473878115404477e-06, |
|
"loss": 0.4073, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.32407407407407407, |
|
"grad_norm": 0.11730651557445526, |
|
"learning_rate": 9.462830413407427e-06, |
|
"loss": 0.4501, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3271604938271605, |
|
"grad_norm": 0.06849709898233414, |
|
"learning_rate": 9.451674501286436e-06, |
|
"loss": 0.2538, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.33024691358024694, |
|
"grad_norm": 0.09413019567728043, |
|
"learning_rate": 9.440410649538592e-06, |
|
"loss": 0.4646, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.15361227095127106, |
|
"learning_rate": 9.42903913127819e-06, |
|
"loss": 0.5303, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.33641975308641975, |
|
"grad_norm": 0.08900155127048492, |
|
"learning_rate": 9.417560222230115e-06, |
|
"loss": 0.383, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.3395061728395062, |
|
"grad_norm": 0.07807417958974838, |
|
"learning_rate": 9.405974200723156e-06, |
|
"loss": 0.3673, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3425925925925926, |
|
"grad_norm": 0.1323561668395996, |
|
"learning_rate": 9.394281347683247e-06, |
|
"loss": 0.597, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.345679012345679, |
|
"grad_norm": 0.11236107349395752, |
|
"learning_rate": 9.382481946626673e-06, |
|
"loss": 0.5051, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.3487654320987654, |
|
"grad_norm": 0.09908317029476166, |
|
"learning_rate": 9.370576283653178e-06, |
|
"loss": 0.3208, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.35185185185185186, |
|
"grad_norm": 0.08509659022092819, |
|
"learning_rate": 9.358564647439037e-06, |
|
"loss": 0.3801, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.3549382716049383, |
|
"grad_norm": 0.05896300822496414, |
|
"learning_rate": 9.34644732923006e-06, |
|
"loss": 0.2217, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.35802469135802467, |
|
"grad_norm": 0.06763949990272522, |
|
"learning_rate": 9.33422462283452e-06, |
|
"loss": 0.3583, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.3611111111111111, |
|
"grad_norm": 0.0857081338763237, |
|
"learning_rate": 9.321896824616036e-06, |
|
"loss": 0.4122, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.36419753086419754, |
|
"grad_norm": 0.07149571180343628, |
|
"learning_rate": 9.309464233486386e-06, |
|
"loss": 0.2959, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.36728395061728397, |
|
"grad_norm": 0.09094710648059845, |
|
"learning_rate": 9.29692715089826e-06, |
|
"loss": 0.3633, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.07034748792648315, |
|
"learning_rate": 9.284285880837947e-06, |
|
"loss": 0.2826, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3734567901234568, |
|
"grad_norm": 0.0919278934597969, |
|
"learning_rate": 9.271540729817969e-06, |
|
"loss": 0.389, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.3765432098765432, |
|
"grad_norm": 0.07186863571405411, |
|
"learning_rate": 9.258692006869644e-06, |
|
"loss": 0.296, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.37962962962962965, |
|
"grad_norm": 0.09665773808956146, |
|
"learning_rate": 9.245740023535596e-06, |
|
"loss": 0.4324, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.38271604938271603, |
|
"grad_norm": 0.08115452527999878, |
|
"learning_rate": 9.232685093862206e-06, |
|
"loss": 0.3555, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.38580246913580246, |
|
"grad_norm": 0.07702954113483429, |
|
"learning_rate": 9.219527534391983e-06, |
|
"loss": 0.3385, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3888888888888889, |
|
"grad_norm": 0.10876493901014328, |
|
"learning_rate": 9.206267664155906e-06, |
|
"loss": 0.4446, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.39197530864197533, |
|
"grad_norm": 0.07764764875173569, |
|
"learning_rate": 9.192905804665677e-06, |
|
"loss": 0.369, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3950617283950617, |
|
"grad_norm": 0.10887006670236588, |
|
"learning_rate": 9.179442279905927e-06, |
|
"loss": 0.4297, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.39814814814814814, |
|
"grad_norm": 0.10183979570865631, |
|
"learning_rate": 9.165877416326365e-06, |
|
"loss": 0.5906, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.4012345679012346, |
|
"grad_norm": 0.07278673350811005, |
|
"learning_rate": 9.152211542833856e-06, |
|
"loss": 0.3017, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.404320987654321, |
|
"grad_norm": 0.08892305195331573, |
|
"learning_rate": 9.138444990784455e-06, |
|
"loss": 0.3919, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.4074074074074074, |
|
"grad_norm": 0.0926053375005722, |
|
"learning_rate": 9.124578093975358e-06, |
|
"loss": 0.4833, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.4104938271604938, |
|
"grad_norm": 0.1312541514635086, |
|
"learning_rate": 9.110611188636828e-06, |
|
"loss": 0.4139, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.41358024691358025, |
|
"grad_norm": 0.07399484515190125, |
|
"learning_rate": 9.096544613424026e-06, |
|
"loss": 0.3156, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 0.0757204219698906, |
|
"learning_rate": 9.082378709408805e-06, |
|
"loss": 0.3355, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.41975308641975306, |
|
"grad_norm": 0.08242496103048325, |
|
"learning_rate": 9.068113820071447e-06, |
|
"loss": 0.3647, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.4228395061728395, |
|
"grad_norm": 0.08191465586423874, |
|
"learning_rate": 9.053750291292321e-06, |
|
"loss": 0.3801, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.42592592592592593, |
|
"grad_norm": 0.08579788357019424, |
|
"learning_rate": 9.039288471343505e-06, |
|
"loss": 0.4375, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.42901234567901236, |
|
"grad_norm": 0.09289571642875671, |
|
"learning_rate": 9.024728710880345e-06, |
|
"loss": 0.3733, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.43209876543209874, |
|
"grad_norm": 0.09474348276853561, |
|
"learning_rate": 9.010071362932945e-06, |
|
"loss": 0.5004, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4351851851851852, |
|
"grad_norm": 0.09607541561126709, |
|
"learning_rate": 8.995316782897605e-06, |
|
"loss": 0.3496, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.4382716049382716, |
|
"grad_norm": 0.08354438096284866, |
|
"learning_rate": 8.98046532852822e-06, |
|
"loss": 0.3528, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.44135802469135804, |
|
"grad_norm": 0.08367566019296646, |
|
"learning_rate": 8.965517359927583e-06, |
|
"loss": 0.3365, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.08424922823905945, |
|
"learning_rate": 8.950473239538672e-06, |
|
"loss": 0.3636, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.44753086419753085, |
|
"grad_norm": 0.07770823687314987, |
|
"learning_rate": 8.935333332135853e-06, |
|
"loss": 0.2757, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4506172839506173, |
|
"grad_norm": 0.08803431689739227, |
|
"learning_rate": 8.920098004816035e-06, |
|
"loss": 0.3397, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.4537037037037037, |
|
"grad_norm": 0.11619243025779724, |
|
"learning_rate": 8.904767626989774e-06, |
|
"loss": 0.4058, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.4567901234567901, |
|
"grad_norm": 0.08595902472734451, |
|
"learning_rate": 8.88934257037231e-06, |
|
"loss": 0.3447, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.45987654320987653, |
|
"grad_norm": 0.08116041868925095, |
|
"learning_rate": 8.873823208974557e-06, |
|
"loss": 0.3578, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.46296296296296297, |
|
"grad_norm": 0.13053898513317108, |
|
"learning_rate": 8.85820991909404e-06, |
|
"loss": 0.5429, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4660493827160494, |
|
"grad_norm": 0.08137528598308563, |
|
"learning_rate": 8.842503079305757e-06, |
|
"loss": 0.3078, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.4691358024691358, |
|
"grad_norm": 0.0843534767627716, |
|
"learning_rate": 8.826703070453014e-06, |
|
"loss": 0.3807, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.4722222222222222, |
|
"grad_norm": 0.13925758004188538, |
|
"learning_rate": 8.810810275638183e-06, |
|
"loss": 0.4771, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.47530864197530864, |
|
"grad_norm": 0.08117470145225525, |
|
"learning_rate": 8.794825080213415e-06, |
|
"loss": 0.3197, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.4783950617283951, |
|
"grad_norm": 0.07650022953748703, |
|
"learning_rate": 8.778747871771293e-06, |
|
"loss": 0.2993, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.48148148148148145, |
|
"grad_norm": 0.09445349872112274, |
|
"learning_rate": 8.76257904013544e-06, |
|
"loss": 0.3641, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.4845679012345679, |
|
"grad_norm": 0.097043976187706, |
|
"learning_rate": 8.746318977351066e-06, |
|
"loss": 0.4181, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.4876543209876543, |
|
"grad_norm": 0.1167394146323204, |
|
"learning_rate": 8.729968077675454e-06, |
|
"loss": 0.5277, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.49074074074074076, |
|
"grad_norm": 0.08402277529239655, |
|
"learning_rate": 8.713526737568415e-06, |
|
"loss": 0.2867, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.49382716049382713, |
|
"grad_norm": 0.09060430526733398, |
|
"learning_rate": 8.696995355682656e-06, |
|
"loss": 0.3219, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.49691358024691357, |
|
"grad_norm": 0.1259710192680359, |
|
"learning_rate": 8.680374332854134e-06, |
|
"loss": 0.5394, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.09654678404331207, |
|
"learning_rate": 8.663664072092324e-06, |
|
"loss": 0.3679, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.5044411420822144, |
|
"eval_runtime": 44.4479, |
|
"eval_samples_per_second": 8.279, |
|
"eval_steps_per_second": 1.035, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.5030864197530864, |
|
"grad_norm": 0.13062100112438202, |
|
"learning_rate": 8.646864978570445e-06, |
|
"loss": 0.38, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.5061728395061729, |
|
"grad_norm": 0.11305861920118332, |
|
"learning_rate": 8.629977459615655e-06, |
|
"loss": 0.3435, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.5092592592592593, |
|
"grad_norm": 0.07454624772071838, |
|
"learning_rate": 8.613001924699146e-06, |
|
"loss": 0.2768, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5123456790123457, |
|
"grad_norm": 0.08615926653146744, |
|
"learning_rate": 8.595938785426241e-06, |
|
"loss": 0.3404, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.5154320987654321, |
|
"grad_norm": 0.09183604270219803, |
|
"learning_rate": 8.578788455526398e-06, |
|
"loss": 0.3493, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.5185185185185185, |
|
"grad_norm": 0.08047281205654144, |
|
"learning_rate": 8.561551350843185e-06, |
|
"loss": 0.3271, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.5216049382716049, |
|
"grad_norm": 0.08007708936929703, |
|
"learning_rate": 8.544227889324199e-06, |
|
"loss": 0.2844, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.5246913580246914, |
|
"grad_norm": 0.08152032643556595, |
|
"learning_rate": 8.526818491010922e-06, |
|
"loss": 0.3033, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5277777777777778, |
|
"grad_norm": 0.10703514516353607, |
|
"learning_rate": 8.509323578028547e-06, |
|
"loss": 0.4296, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.5308641975308642, |
|
"grad_norm": 0.07901628315448761, |
|
"learning_rate": 8.491743574575743e-06, |
|
"loss": 0.29, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.5339506172839507, |
|
"grad_norm": 0.09099699556827545, |
|
"learning_rate": 8.474078906914359e-06, |
|
"loss": 0.3021, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.5370370370370371, |
|
"grad_norm": 0.0866774320602417, |
|
"learning_rate": 8.456330003359093e-06, |
|
"loss": 0.2633, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.5401234567901234, |
|
"grad_norm": 0.10114055871963501, |
|
"learning_rate": 8.438497294267117e-06, |
|
"loss": 0.3735, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5432098765432098, |
|
"grad_norm": 0.1260298639535904, |
|
"learning_rate": 8.420581212027625e-06, |
|
"loss": 0.4687, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.5462962962962963, |
|
"grad_norm": 0.1004004031419754, |
|
"learning_rate": 8.402582191051365e-06, |
|
"loss": 0.29, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.5493827160493827, |
|
"grad_norm": 0.08794572949409485, |
|
"learning_rate": 8.38450066776009e-06, |
|
"loss": 0.3589, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.5524691358024691, |
|
"grad_norm": 0.10174311697483063, |
|
"learning_rate": 8.36633708057599e-06, |
|
"loss": 0.3832, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 0.11463697254657745, |
|
"learning_rate": 8.348091869911054e-06, |
|
"loss": 0.4172, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.558641975308642, |
|
"grad_norm": 0.11808864772319794, |
|
"learning_rate": 8.329765478156394e-06, |
|
"loss": 0.494, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.5617283950617284, |
|
"grad_norm": 0.11152324080467224, |
|
"learning_rate": 8.311358349671516e-06, |
|
"loss": 0.3973, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.5648148148148148, |
|
"grad_norm": 0.09295979887247086, |
|
"learning_rate": 8.292870930773551e-06, |
|
"loss": 0.3696, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.5679012345679012, |
|
"grad_norm": 0.10292661935091019, |
|
"learning_rate": 8.274303669726427e-06, |
|
"loss": 0.3408, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.5709876543209876, |
|
"grad_norm": 0.10190277546644211, |
|
"learning_rate": 8.255657016729997e-06, |
|
"loss": 0.3513, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5740740740740741, |
|
"grad_norm": 0.08307984471321106, |
|
"learning_rate": 8.23693142390914e-06, |
|
"loss": 0.2577, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.5771604938271605, |
|
"grad_norm": 0.11023180931806564, |
|
"learning_rate": 8.218127345302775e-06, |
|
"loss": 0.4168, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.5802469135802469, |
|
"grad_norm": 0.10529080033302307, |
|
"learning_rate": 8.199245236852871e-06, |
|
"loss": 0.4223, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.5833333333333334, |
|
"grad_norm": 0.14696502685546875, |
|
"learning_rate": 8.180285556393384e-06, |
|
"loss": 0.5283, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.5864197530864198, |
|
"grad_norm": 0.15351015329360962, |
|
"learning_rate": 8.161248763639154e-06, |
|
"loss": 0.5173, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5895061728395061, |
|
"grad_norm": 0.10003789514303207, |
|
"learning_rate": 8.142135320174758e-06, |
|
"loss": 0.3617, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.09017117321491241, |
|
"learning_rate": 8.122945689443328e-06, |
|
"loss": 0.2601, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.595679012345679, |
|
"grad_norm": 0.11840925365686417, |
|
"learning_rate": 8.1036803367353e-06, |
|
"loss": 0.4291, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.5987654320987654, |
|
"grad_norm": 0.09116993844509125, |
|
"learning_rate": 8.084339729177142e-06, |
|
"loss": 0.3061, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.6018518518518519, |
|
"grad_norm": 0.11056546866893768, |
|
"learning_rate": 8.064924335720023e-06, |
|
"loss": 0.3712, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6049382716049383, |
|
"grad_norm": 0.10576466470956802, |
|
"learning_rate": 8.045434627128446e-06, |
|
"loss": 0.3591, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.6080246913580247, |
|
"grad_norm": 0.09751347452402115, |
|
"learning_rate": 8.025871075968828e-06, |
|
"loss": 0.3268, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.6111111111111112, |
|
"grad_norm": 0.11890437453985214, |
|
"learning_rate": 8.006234156598043e-06, |
|
"loss": 0.3256, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.6141975308641975, |
|
"grad_norm": 0.12418389320373535, |
|
"learning_rate": 7.986524345151924e-06, |
|
"loss": 0.5357, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.6172839506172839, |
|
"grad_norm": 0.11261377483606339, |
|
"learning_rate": 7.966742119533724e-06, |
|
"loss": 0.4537, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6203703703703703, |
|
"grad_norm": 0.12626801431179047, |
|
"learning_rate": 7.946887959402504e-06, |
|
"loss": 0.3786, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.6234567901234568, |
|
"grad_norm": 0.12130914628505707, |
|
"learning_rate": 7.926962346161535e-06, |
|
"loss": 0.4564, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.6265432098765432, |
|
"grad_norm": 0.10559491068124771, |
|
"learning_rate": 7.9069657629466e-06, |
|
"loss": 0.3984, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.6296296296296297, |
|
"grad_norm": 0.11549825966358185, |
|
"learning_rate": 7.886898694614292e-06, |
|
"loss": 0.4251, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.6327160493827161, |
|
"grad_norm": 0.10902281850576401, |
|
"learning_rate": 7.866761627730253e-06, |
|
"loss": 0.4012, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6358024691358025, |
|
"grad_norm": 0.11586394906044006, |
|
"learning_rate": 7.846555050557381e-06, |
|
"loss": 0.3586, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.6388888888888888, |
|
"grad_norm": 0.10988422483205795, |
|
"learning_rate": 7.826279453043985e-06, |
|
"loss": 0.4294, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.6419753086419753, |
|
"grad_norm": 0.1205698624253273, |
|
"learning_rate": 7.805935326811913e-06, |
|
"loss": 0.4782, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.6450617283950617, |
|
"grad_norm": 0.08950233459472656, |
|
"learning_rate": 7.78552316514462e-06, |
|
"loss": 0.2901, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.6481481481481481, |
|
"grad_norm": 0.13640360534191132, |
|
"learning_rate": 7.765043462975217e-06, |
|
"loss": 0.4403, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6512345679012346, |
|
"grad_norm": 0.13739749789237976, |
|
"learning_rate": 7.744496716874472e-06, |
|
"loss": 0.472, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.654320987654321, |
|
"grad_norm": 0.10840674489736557, |
|
"learning_rate": 7.723883425038759e-06, |
|
"loss": 0.3961, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.6574074074074074, |
|
"grad_norm": 0.11287008225917816, |
|
"learning_rate": 7.703204087277989e-06, |
|
"loss": 0.4169, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.6604938271604939, |
|
"grad_norm": 0.1013006791472435, |
|
"learning_rate": 7.682459205003484e-06, |
|
"loss": 0.3537, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.6635802469135802, |
|
"grad_norm": 0.12204479426145554, |
|
"learning_rate": 7.661649281215823e-06, |
|
"loss": 0.3444, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.1041225790977478, |
|
"learning_rate": 7.640774820492647e-06, |
|
"loss": 0.3432, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.6697530864197531, |
|
"grad_norm": 0.12317519634962082, |
|
"learning_rate": 7.619836328976416e-06, |
|
"loss": 0.4119, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.6728395061728395, |
|
"grad_norm": 0.15862716734409332, |
|
"learning_rate": 7.598834314362151e-06, |
|
"loss": 0.3585, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.6759259259259259, |
|
"grad_norm": 0.10013571381568909, |
|
"learning_rate": 7.57776928588511e-06, |
|
"loss": 0.3589, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.6790123456790124, |
|
"grad_norm": 0.11820396035909653, |
|
"learning_rate": 7.556641754308447e-06, |
|
"loss": 0.2838, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6820987654320988, |
|
"grad_norm": 0.08206115663051605, |
|
"learning_rate": 7.535452231910829e-06, |
|
"loss": 0.1639, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.6851851851851852, |
|
"grad_norm": 0.13305512070655823, |
|
"learning_rate": 7.514201232474012e-06, |
|
"loss": 0.3923, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.6882716049382716, |
|
"grad_norm": 0.1208796426653862, |
|
"learning_rate": 7.492889271270382e-06, |
|
"loss": 0.3698, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.691358024691358, |
|
"grad_norm": 0.11946754902601242, |
|
"learning_rate": 7.471516865050468e-06, |
|
"loss": 0.3797, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.6944444444444444, |
|
"grad_norm": 0.08816403150558472, |
|
"learning_rate": 7.450084532030402e-06, |
|
"loss": 0.2238, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6975308641975309, |
|
"grad_norm": 0.12045780569314957, |
|
"learning_rate": 7.428592791879361e-06, |
|
"loss": 0.3699, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.7006172839506173, |
|
"grad_norm": 0.11096329241991043, |
|
"learning_rate": 7.407042165706969e-06, |
|
"loss": 0.362, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.7037037037037037, |
|
"grad_norm": 0.14540982246398926, |
|
"learning_rate": 7.385433176050654e-06, |
|
"loss": 0.4543, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.7067901234567902, |
|
"grad_norm": 0.11663732677698135, |
|
"learning_rate": 7.36376634686298e-06, |
|
"loss": 0.4606, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.7098765432098766, |
|
"grad_norm": 0.11102988570928574, |
|
"learning_rate": 7.342042203498952e-06, |
|
"loss": 0.3526, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7129629629629629, |
|
"grad_norm": 0.11012902110815048, |
|
"learning_rate": 7.320261272703259e-06, |
|
"loss": 0.4337, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.7160493827160493, |
|
"grad_norm": 0.09911687672138214, |
|
"learning_rate": 7.298424082597526e-06, |
|
"loss": 0.2504, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.7191358024691358, |
|
"grad_norm": 0.13727596402168274, |
|
"learning_rate": 7.276531162667484e-06, |
|
"loss": 0.4725, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.7222222222222222, |
|
"grad_norm": 0.10461889952421188, |
|
"learning_rate": 7.254583043750152e-06, |
|
"loss": 0.3202, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.7253086419753086, |
|
"grad_norm": 0.18260876834392548, |
|
"learning_rate": 7.232580258020952e-06, |
|
"loss": 0.4248, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7283950617283951, |
|
"grad_norm": 0.13938364386558533, |
|
"learning_rate": 7.210523338980814e-06, |
|
"loss": 0.2602, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.7314814814814815, |
|
"grad_norm": 0.11910004913806915, |
|
"learning_rate": 7.1884128214432366e-06, |
|
"loss": 0.4185, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.7345679012345679, |
|
"grad_norm": 0.10073763877153397, |
|
"learning_rate": 7.1662492415213194e-06, |
|
"loss": 0.2697, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.7376543209876543, |
|
"grad_norm": 0.11307626962661743, |
|
"learning_rate": 7.14403313661476e-06, |
|
"loss": 0.4232, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 0.10806172341108322, |
|
"learning_rate": 7.1217650453968335e-06, |
|
"loss": 0.2928, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7438271604938271, |
|
"grad_norm": 0.14010940492153168, |
|
"learning_rate": 7.099445507801324e-06, |
|
"loss": 0.3915, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.7469135802469136, |
|
"grad_norm": 0.09002690017223358, |
|
"learning_rate": 7.0770750650094335e-06, |
|
"loss": 0.2801, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.11942241340875626, |
|
"learning_rate": 7.0546542594366605e-06, |
|
"loss": 0.4149, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.4767835736274719, |
|
"eval_runtime": 44.3688, |
|
"eval_samples_per_second": 8.294, |
|
"eval_steps_per_second": 1.037, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.7530864197530864, |
|
"grad_norm": 0.16698460280895233, |
|
"learning_rate": 7.03218363471965e-06, |
|
"loss": 0.4605, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.7561728395061729, |
|
"grad_norm": 0.12310118973255157, |
|
"learning_rate": 7.0096637357030105e-06, |
|
"loss": 0.4328, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.7592592592592593, |
|
"grad_norm": 0.11915367841720581, |
|
"learning_rate": 6.987095108426102e-06, |
|
"loss": 0.3907, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.7623456790123457, |
|
"grad_norm": 0.1066504493355751, |
|
"learning_rate": 6.964478300109796e-06, |
|
"loss": 0.3148, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.7654320987654321, |
|
"grad_norm": 0.09711527079343796, |
|
"learning_rate": 6.94181385914321e-06, |
|
"loss": 0.2736, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.7685185185185185, |
|
"grad_norm": 0.08204776048660278, |
|
"learning_rate": 6.91910233507041e-06, |
|
"loss": 0.1607, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.7716049382716049, |
|
"grad_norm": 0.13877205550670624, |
|
"learning_rate": 6.896344278577083e-06, |
|
"loss": 0.3763, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7746913580246914, |
|
"grad_norm": 0.11828643828630447, |
|
"learning_rate": 6.873540241477189e-06, |
|
"loss": 0.4063, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.7777777777777778, |
|
"grad_norm": 0.13950656354427338, |
|
"learning_rate": 6.850690776699574e-06, |
|
"loss": 0.4348, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.7808641975308642, |
|
"grad_norm": 0.13861550390720367, |
|
"learning_rate": 6.8277964382745675e-06, |
|
"loss": 0.4007, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.7839506172839507, |
|
"grad_norm": 0.12502089142799377, |
|
"learning_rate": 6.804857781320558e-06, |
|
"loss": 0.4157, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.7870370370370371, |
|
"grad_norm": 0.1129172146320343, |
|
"learning_rate": 6.781875362030512e-06, |
|
"loss": 0.3087, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.7901234567901234, |
|
"grad_norm": 0.18749450147151947, |
|
"learning_rate": 6.758849737658508e-06, |
|
"loss": 0.381, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.7932098765432098, |
|
"grad_norm": 0.11505936086177826, |
|
"learning_rate": 6.735781466506216e-06, |
|
"loss": 0.3639, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.7962962962962963, |
|
"grad_norm": 0.13606995344161987, |
|
"learning_rate": 6.712671107909359e-06, |
|
"loss": 0.4504, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.7993827160493827, |
|
"grad_norm": 0.13360187411308289, |
|
"learning_rate": 6.6895192222241534e-06, |
|
"loss": 0.4113, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.8024691358024691, |
|
"grad_norm": 0.1227497085928917, |
|
"learning_rate": 6.666326370813722e-06, |
|
"loss": 0.3156, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8055555555555556, |
|
"grad_norm": 0.1294088065624237, |
|
"learning_rate": 6.643093116034486e-06, |
|
"loss": 0.2544, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.808641975308642, |
|
"grad_norm": 0.11842790246009827, |
|
"learning_rate": 6.619820021222518e-06, |
|
"loss": 0.2796, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.8117283950617284, |
|
"grad_norm": 0.11302869021892548, |
|
"learning_rate": 6.5965076506799e-06, |
|
"loss": 0.3225, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.8148148148148148, |
|
"grad_norm": 0.1153462752699852, |
|
"learning_rate": 6.573156569661026e-06, |
|
"loss": 0.3168, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.8179012345679012, |
|
"grad_norm": 0.14865292608737946, |
|
"learning_rate": 6.549767344358903e-06, |
|
"loss": 0.3793, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.8209876543209876, |
|
"grad_norm": 0.18601423501968384, |
|
"learning_rate": 6.526340541891418e-06, |
|
"loss": 0.383, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.8240740740740741, |
|
"grad_norm": 0.11983994394540787, |
|
"learning_rate": 6.5028767302875974e-06, |
|
"loss": 0.3366, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.8271604938271605, |
|
"grad_norm": 0.11204046756029129, |
|
"learning_rate": 6.479376478473822e-06, |
|
"loss": 0.2842, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.8302469135802469, |
|
"grad_norm": 0.12731367349624634, |
|
"learning_rate": 6.455840356260041e-06, |
|
"loss": 0.3664, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.12762831151485443, |
|
"learning_rate": 6.432268934325947e-06, |
|
"loss": 0.4333, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8364197530864198, |
|
"grad_norm": 0.1425330489873886, |
|
"learning_rate": 6.408662784207149e-06, |
|
"loss": 0.283, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.8395061728395061, |
|
"grad_norm": 0.1323920488357544, |
|
"learning_rate": 6.385022478281307e-06, |
|
"loss": 0.4108, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.8425925925925926, |
|
"grad_norm": 0.1550484001636505, |
|
"learning_rate": 6.361348589754255e-06, |
|
"loss": 0.3396, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.845679012345679, |
|
"grad_norm": 0.09628990292549133, |
|
"learning_rate": 6.337641692646106e-06, |
|
"loss": 0.246, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.8487654320987654, |
|
"grad_norm": 0.1477012187242508, |
|
"learning_rate": 6.313902361777327e-06, |
|
"loss": 0.4705, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.8518518518518519, |
|
"grad_norm": 0.14865955710411072, |
|
"learning_rate": 6.290131172754811e-06, |
|
"loss": 0.417, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.8549382716049383, |
|
"grad_norm": 0.11468877643346786, |
|
"learning_rate": 6.266328701957911e-06, |
|
"loss": 0.3683, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.8580246913580247, |
|
"grad_norm": 0.1273777186870575, |
|
"learning_rate": 6.24249552652447e-06, |
|
"loss": 0.2808, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.8611111111111112, |
|
"grad_norm": 0.10113878548145294, |
|
"learning_rate": 6.2186322243368236e-06, |
|
"loss": 0.3368, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.8641975308641975, |
|
"grad_norm": 0.1183820515871048, |
|
"learning_rate": 6.194739374007792e-06, |
|
"loss": 0.3095, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8672839506172839, |
|
"grad_norm": 0.12614701688289642, |
|
"learning_rate": 6.170817554866646e-06, |
|
"loss": 0.3772, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.8703703703703703, |
|
"grad_norm": 0.19127966463565826, |
|
"learning_rate": 6.1468673469450655e-06, |
|
"loss": 0.3179, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.8734567901234568, |
|
"grad_norm": 0.14781445264816284, |
|
"learning_rate": 6.122889330963069e-06, |
|
"loss": 0.3659, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.8765432098765432, |
|
"grad_norm": 0.1360250860452652, |
|
"learning_rate": 6.098884088314938e-06, |
|
"loss": 0.4211, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.8796296296296297, |
|
"grad_norm": 0.1149686872959137, |
|
"learning_rate": 6.074852201055121e-06, |
|
"loss": 0.2571, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.8827160493827161, |
|
"grad_norm": 0.14958076179027557, |
|
"learning_rate": 6.050794251884112e-06, |
|
"loss": 0.4164, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.8858024691358025, |
|
"grad_norm": 0.12140931189060211, |
|
"learning_rate": 6.026710824134331e-06, |
|
"loss": 0.2203, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.12924239039421082, |
|
"learning_rate": 6.002602501755974e-06, |
|
"loss": 0.4255, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.8919753086419753, |
|
"grad_norm": 0.1369277834892273, |
|
"learning_rate": 5.978469869302861e-06, |
|
"loss": 0.4083, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.8950617283950617, |
|
"grad_norm": 0.13165542483329773, |
|
"learning_rate": 5.954313511918252e-06, |
|
"loss": 0.3317, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8981481481481481, |
|
"grad_norm": 0.16248537600040436, |
|
"learning_rate": 5.9301340153206685e-06, |
|
"loss": 0.4079, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.9012345679012346, |
|
"grad_norm": 0.14584743976593018, |
|
"learning_rate": 5.905931965789688e-06, |
|
"loss": 0.3508, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.904320987654321, |
|
"grad_norm": 0.15875974297523499, |
|
"learning_rate": 5.881707950151725e-06, |
|
"loss": 0.3597, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.9074074074074074, |
|
"grad_norm": 0.11724277585744858, |
|
"learning_rate": 5.857462555765809e-06, |
|
"loss": 0.3152, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.9104938271604939, |
|
"grad_norm": 0.12342196702957153, |
|
"learning_rate": 5.8331963705093375e-06, |
|
"loss": 0.318, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.9135802469135802, |
|
"grad_norm": 0.12013120949268341, |
|
"learning_rate": 5.808909982763825e-06, |
|
"loss": 0.3951, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.9166666666666666, |
|
"grad_norm": 0.10280231386423111, |
|
"learning_rate": 5.784603981400632e-06, |
|
"loss": 0.2725, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.9197530864197531, |
|
"grad_norm": 0.12491166591644287, |
|
"learning_rate": 5.760278955766695e-06, |
|
"loss": 0.3837, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.9228395061728395, |
|
"grad_norm": 0.11760140210390091, |
|
"learning_rate": 5.735935495670229e-06, |
|
"loss": 0.2464, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.9259259259259259, |
|
"grad_norm": 0.13774855434894562, |
|
"learning_rate": 5.711574191366427e-06, |
|
"loss": 0.3504, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9290123456790124, |
|
"grad_norm": 0.09982441365718842, |
|
"learning_rate": 5.687195633543151e-06, |
|
"loss": 0.2457, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.9320987654320988, |
|
"grad_norm": 0.11534377187490463, |
|
"learning_rate": 5.662800413306611e-06, |
|
"loss": 0.2951, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.9351851851851852, |
|
"grad_norm": 0.100958451628685, |
|
"learning_rate": 5.6383891221670275e-06, |
|
"loss": 0.19, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.9382716049382716, |
|
"grad_norm": 0.17198745906352997, |
|
"learning_rate": 5.613962352024293e-06, |
|
"loss": 0.3832, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.941358024691358, |
|
"grad_norm": 0.16045625507831573, |
|
"learning_rate": 5.589520695153618e-06, |
|
"loss": 0.4173, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.9444444444444444, |
|
"grad_norm": 0.12690144777297974, |
|
"learning_rate": 5.5650647441911706e-06, |
|
"loss": 0.3318, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.9475308641975309, |
|
"grad_norm": 0.12933467328548431, |
|
"learning_rate": 5.540595092119709e-06, |
|
"loss": 0.3169, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.9506172839506173, |
|
"grad_norm": 0.1863582581281662, |
|
"learning_rate": 5.516112332254203e-06, |
|
"loss": 0.3925, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.9537037037037037, |
|
"grad_norm": 0.15057547390460968, |
|
"learning_rate": 5.491617058227443e-06, |
|
"loss": 0.4953, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.9567901234567902, |
|
"grad_norm": 0.159704327583313, |
|
"learning_rate": 5.46710986397565e-06, |
|
"loss": 0.3831, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9598765432098766, |
|
"grad_norm": 0.0988263189792633, |
|
"learning_rate": 5.442591343724081e-06, |
|
"loss": 0.1455, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.9629629629629629, |
|
"grad_norm": 0.13106189668178558, |
|
"learning_rate": 5.418062091972604e-06, |
|
"loss": 0.227, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.9660493827160493, |
|
"grad_norm": 0.17571298778057098, |
|
"learning_rate": 5.393522703481303e-06, |
|
"loss": 0.4638, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.9691358024691358, |
|
"grad_norm": 0.12073665857315063, |
|
"learning_rate": 5.36897377325604e-06, |
|
"loss": 0.2587, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.9722222222222222, |
|
"grad_norm": 0.08656695485115051, |
|
"learning_rate": 5.344415896534039e-06, |
|
"loss": 0.2088, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.9753086419753086, |
|
"grad_norm": 0.1401841789484024, |
|
"learning_rate": 5.319849668769449e-06, |
|
"loss": 0.3667, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.9783950617283951, |
|
"grad_norm": 0.1650845855474472, |
|
"learning_rate": 5.295275685618905e-06, |
|
"loss": 0.3667, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.9814814814814815, |
|
"grad_norm": 0.13909409940242767, |
|
"learning_rate": 5.270694542927089e-06, |
|
"loss": 0.3811, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.9845679012345679, |
|
"grad_norm": 0.11377997696399689, |
|
"learning_rate": 5.246106836712277e-06, |
|
"loss": 0.2349, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.9876543209876543, |
|
"grad_norm": 0.12037783116102219, |
|
"learning_rate": 5.2215131631518945e-06, |
|
"loss": 0.2901, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9907407407407407, |
|
"grad_norm": 0.13020600378513336, |
|
"learning_rate": 5.196914118568054e-06, |
|
"loss": 0.3427, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.9938271604938271, |
|
"grad_norm": 0.15103194117546082, |
|
"learning_rate": 5.1723102994130994e-06, |
|
"loss": 0.4012, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.9969135802469136, |
|
"grad_norm": 0.105732262134552, |
|
"learning_rate": 5.147702302255143e-06, |
|
"loss": 0.175, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.17236697673797607, |
|
"learning_rate": 5.123090723763607e-06, |
|
"loss": 0.3751, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.4522034823894501, |
|
"eval_runtime": 44.5334, |
|
"eval_samples_per_second": 8.263, |
|
"eval_steps_per_second": 1.033, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.0030864197530864, |
|
"grad_norm": 0.15303292870521545, |
|
"learning_rate": 5.098476160694741e-06, |
|
"loss": 0.4663, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.0061728395061729, |
|
"grad_norm": 0.10959513485431671, |
|
"learning_rate": 5.073859209877167e-06, |
|
"loss": 0.2389, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.0092592592592593, |
|
"grad_norm": 0.14050254225730896, |
|
"learning_rate": 5.049240468197401e-06, |
|
"loss": 0.3591, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.0123456790123457, |
|
"grad_norm": 0.12712690234184265, |
|
"learning_rate": 5.0246205325853824e-06, |
|
"loss": 0.3452, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.0154320987654322, |
|
"grad_norm": 0.1756986677646637, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4289, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.0185185185185186, |
|
"grad_norm": 0.14214292168617249, |
|
"learning_rate": 4.975379467414621e-06, |
|
"loss": 0.3695, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0030864197530864, |
|
"grad_norm": 0.1542719155550003, |
|
"learning_rate": 4.950759531802602e-06, |
|
"loss": 0.3824, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.0061728395061729, |
|
"grad_norm": 0.12223492562770844, |
|
"learning_rate": 4.926140790122835e-06, |
|
"loss": 0.2753, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.0092592592592593, |
|
"grad_norm": 0.12852071225643158, |
|
"learning_rate": 4.90152383930526e-06, |
|
"loss": 0.2418, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.0123456790123457, |
|
"grad_norm": 0.1099737137556076, |
|
"learning_rate": 4.876909276236395e-06, |
|
"loss": 0.2964, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.0154320987654322, |
|
"grad_norm": 0.1437702178955078, |
|
"learning_rate": 4.852297697744857e-06, |
|
"loss": 0.355, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.0185185185185186, |
|
"grad_norm": 0.12063878774642944, |
|
"learning_rate": 4.827689700586902e-06, |
|
"loss": 0.2879, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.021604938271605, |
|
"grad_norm": 0.19743777811527252, |
|
"learning_rate": 4.803085881431949e-06, |
|
"loss": 0.3412, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.0246913580246915, |
|
"grad_norm": 0.22067442536354065, |
|
"learning_rate": 4.778486836848107e-06, |
|
"loss": 0.3051, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.0277777777777777, |
|
"grad_norm": 0.1556781828403473, |
|
"learning_rate": 4.7538931632877254e-06, |
|
"loss": 0.3369, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.0308641975308641, |
|
"grad_norm": 0.132530078291893, |
|
"learning_rate": 4.729305457072913e-06, |
|
"loss": 0.3452, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0339506172839505, |
|
"grad_norm": 0.16023634374141693, |
|
"learning_rate": 4.704724314381097e-06, |
|
"loss": 0.3887, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.037037037037037, |
|
"grad_norm": 0.14671647548675537, |
|
"learning_rate": 4.680150331230552e-06, |
|
"loss": 0.3082, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.0401234567901234, |
|
"grad_norm": 0.20157098770141602, |
|
"learning_rate": 4.6555841034659625e-06, |
|
"loss": 0.5004, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.0432098765432098, |
|
"grad_norm": 0.14635726809501648, |
|
"learning_rate": 4.631026226743962e-06, |
|
"loss": 0.4104, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.0462962962962963, |
|
"grad_norm": 0.14289334416389465, |
|
"learning_rate": 4.606477296518698e-06, |
|
"loss": 0.3206, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.0493827160493827, |
|
"grad_norm": 0.14635069668293, |
|
"learning_rate": 4.581937908027397e-06, |
|
"loss": 0.2957, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.0524691358024691, |
|
"grad_norm": 0.1479678899049759, |
|
"learning_rate": 4.55740865627592e-06, |
|
"loss": 0.3168, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.0555555555555556, |
|
"grad_norm": 0.12210693210363388, |
|
"learning_rate": 4.532890136024351e-06, |
|
"loss": 0.2854, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.058641975308642, |
|
"grad_norm": 0.16018199920654297, |
|
"learning_rate": 4.508382941772558e-06, |
|
"loss": 0.2937, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.0617283950617284, |
|
"grad_norm": 0.14056287705898285, |
|
"learning_rate": 4.483887667745798e-06, |
|
"loss": 0.3246, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.0648148148148149, |
|
"grad_norm": 0.14486226439476013, |
|
"learning_rate": 4.459404907880293e-06, |
|
"loss": 0.3133, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.0679012345679013, |
|
"grad_norm": 0.1279231458902359, |
|
"learning_rate": 4.434935255808831e-06, |
|
"loss": 0.2219, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.0709876543209877, |
|
"grad_norm": 0.16269516944885254, |
|
"learning_rate": 4.410479304846385e-06, |
|
"loss": 0.3531, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.074074074074074, |
|
"grad_norm": 0.15139630436897278, |
|
"learning_rate": 4.386037647975708e-06, |
|
"loss": 0.2508, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.0771604938271604, |
|
"grad_norm": 0.15115757286548615, |
|
"learning_rate": 4.361610877832974e-06, |
|
"loss": 0.3908, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.0802469135802468, |
|
"grad_norm": 0.17080338299274445, |
|
"learning_rate": 4.337199586693389e-06, |
|
"loss": 0.4233, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.0833333333333333, |
|
"grad_norm": 0.149905264377594, |
|
"learning_rate": 4.312804366456851e-06, |
|
"loss": 0.3354, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.0864197530864197, |
|
"grad_norm": 0.2038925588130951, |
|
"learning_rate": 4.2884258086335755e-06, |
|
"loss": 0.422, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.0895061728395061, |
|
"grad_norm": 0.1319386065006256, |
|
"learning_rate": 4.2640645043297715e-06, |
|
"loss": 0.2812, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.0925925925925926, |
|
"grad_norm": 0.210116446018219, |
|
"learning_rate": 4.239721044233306e-06, |
|
"loss": 0.3266, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.095679012345679, |
|
"grad_norm": 0.15533123910427094, |
|
"learning_rate": 4.215396018599369e-06, |
|
"loss": 0.3106, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.0987654320987654, |
|
"grad_norm": 0.15208472311496735, |
|
"learning_rate": 4.191090017236177e-06, |
|
"loss": 0.3423, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.1018518518518519, |
|
"grad_norm": 0.12684912979602814, |
|
"learning_rate": 4.166803629490664e-06, |
|
"loss": 0.2755, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.1049382716049383, |
|
"grad_norm": 0.18555931746959686, |
|
"learning_rate": 4.142537444234192e-06, |
|
"loss": 0.4007, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.1080246913580247, |
|
"grad_norm": 0.20792073011398315, |
|
"learning_rate": 4.118292049848277e-06, |
|
"loss": 0.2467, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.13857008516788483, |
|
"learning_rate": 4.094068034210313e-06, |
|
"loss": 0.3666, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.1141975308641976, |
|
"grad_norm": 0.10900649428367615, |
|
"learning_rate": 4.069865984679332e-06, |
|
"loss": 0.1954, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.117283950617284, |
|
"grad_norm": 0.13190750777721405, |
|
"learning_rate": 4.045686488081748e-06, |
|
"loss": 0.309, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.1203703703703705, |
|
"grad_norm": 0.16032575070858002, |
|
"learning_rate": 4.021530130697141e-06, |
|
"loss": 0.3524, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.123456790123457, |
|
"grad_norm": 0.14147287607192993, |
|
"learning_rate": 3.997397498244028e-06, |
|
"loss": 0.3088, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.126543209876543, |
|
"grad_norm": 0.1288299709558487, |
|
"learning_rate": 3.97328917586567e-06, |
|
"loss": 0.3216, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.1296296296296295, |
|
"grad_norm": 0.17235535383224487, |
|
"learning_rate": 3.9492057481158905e-06, |
|
"loss": 0.3339, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.132716049382716, |
|
"grad_norm": 0.21856486797332764, |
|
"learning_rate": 3.92514779894488e-06, |
|
"loss": 0.3691, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.1358024691358024, |
|
"grad_norm": 0.188248872756958, |
|
"learning_rate": 3.901115911685063e-06, |
|
"loss": 0.3879, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.1388888888888888, |
|
"grad_norm": 0.17136438190937042, |
|
"learning_rate": 3.877110669036932e-06, |
|
"loss": 0.4754, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.1419753086419753, |
|
"grad_norm": 0.14845937490463257, |
|
"learning_rate": 3.853132653054936e-06, |
|
"loss": 0.4178, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.1450617283950617, |
|
"grad_norm": 0.14598865807056427, |
|
"learning_rate": 3.829182445133356e-06, |
|
"loss": 0.2653, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.1481481481481481, |
|
"grad_norm": 0.12898695468902588, |
|
"learning_rate": 3.8052606259922097e-06, |
|
"loss": 0.2613, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.1512345679012346, |
|
"grad_norm": 0.12332043796777725, |
|
"learning_rate": 3.7813677756631773e-06, |
|
"loss": 0.2803, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.154320987654321, |
|
"grad_norm": 0.1356392502784729, |
|
"learning_rate": 3.75750447347553e-06, |
|
"loss": 0.4038, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.1574074074074074, |
|
"grad_norm": 0.25393664836883545, |
|
"learning_rate": 3.7336712980420897e-06, |
|
"loss": 0.5067, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.1604938271604939, |
|
"grad_norm": 0.12110210955142975, |
|
"learning_rate": 3.7098688272451893e-06, |
|
"loss": 0.2413, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.1635802469135803, |
|
"grad_norm": 0.12632521986961365, |
|
"learning_rate": 3.6860976382226747e-06, |
|
"loss": 0.2583, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.1666666666666667, |
|
"grad_norm": 0.15142959356307983, |
|
"learning_rate": 3.662358307353897e-06, |
|
"loss": 0.4542, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.1697530864197532, |
|
"grad_norm": 0.11639465391635895, |
|
"learning_rate": 3.638651410245746e-06, |
|
"loss": 0.1849, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.1728395061728394, |
|
"grad_norm": 0.14406833052635193, |
|
"learning_rate": 3.6149775217186954e-06, |
|
"loss": 0.3171, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.175925925925926, |
|
"grad_norm": 0.1374572366476059, |
|
"learning_rate": 3.5913372157928515e-06, |
|
"loss": 0.2849, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.1790123456790123, |
|
"grad_norm": 0.16935373842716217, |
|
"learning_rate": 3.5677310656740537e-06, |
|
"loss": 0.3982, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.1820987654320987, |
|
"grad_norm": 0.1098417416214943, |
|
"learning_rate": 3.5441596437399596e-06, |
|
"loss": 0.2149, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.1851851851851851, |
|
"grad_norm": 0.14076852798461914, |
|
"learning_rate": 3.5206235215261785e-06, |
|
"loss": 0.2685, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.1882716049382716, |
|
"grad_norm": 0.12600207328796387, |
|
"learning_rate": 3.4971232697124046e-06, |
|
"loss": 0.2009, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.191358024691358, |
|
"grad_norm": 0.13086476922035217, |
|
"learning_rate": 3.4736594581085837e-06, |
|
"loss": 0.3062, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.1944444444444444, |
|
"grad_norm": 0.16587767004966736, |
|
"learning_rate": 3.4502326556411e-06, |
|
"loss": 0.2432, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.1975308641975309, |
|
"grad_norm": 0.13524991273880005, |
|
"learning_rate": 3.4268434303389747e-06, |
|
"loss": 0.3204, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.2006172839506173, |
|
"grad_norm": 0.15923044085502625, |
|
"learning_rate": 3.403492349320101e-06, |
|
"loss": 0.36, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.2037037037037037, |
|
"grad_norm": 0.19655781984329224, |
|
"learning_rate": 3.380179978777482e-06, |
|
"loss": 0.4863, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.2067901234567902, |
|
"grad_norm": 0.13031858205795288, |
|
"learning_rate": 3.356906883965516e-06, |
|
"loss": 0.2884, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.2098765432098766, |
|
"grad_norm": 0.12421680986881256, |
|
"learning_rate": 3.33367362918628e-06, |
|
"loss": 0.1891, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.212962962962963, |
|
"grad_norm": 0.15903340280056, |
|
"learning_rate": 3.3104807777758487e-06, |
|
"loss": 0.4381, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.2160493827160495, |
|
"grad_norm": 0.11143235117197037, |
|
"learning_rate": 3.2873288920906436e-06, |
|
"loss": 0.2269, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2191358024691359, |
|
"grad_norm": 0.1427583545446396, |
|
"learning_rate": 3.2642185334937853e-06, |
|
"loss": 0.3874, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.2222222222222223, |
|
"grad_norm": 0.21431690454483032, |
|
"learning_rate": 3.2411502623414925e-06, |
|
"loss": 0.4815, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.2253086419753085, |
|
"grad_norm": 0.20369336009025574, |
|
"learning_rate": 3.2181246379694886e-06, |
|
"loss": 0.429, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.228395061728395, |
|
"grad_norm": 0.21474803984165192, |
|
"learning_rate": 3.1951422186794447e-06, |
|
"loss": 0.4217, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.2314814814814814, |
|
"grad_norm": 0.1690702587366104, |
|
"learning_rate": 3.1722035617254333e-06, |
|
"loss": 0.3388, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.2314814814814814, |
|
"eval_loss": 0.4383295774459839, |
|
"eval_runtime": 44.45, |
|
"eval_samples_per_second": 8.279, |
|
"eval_steps_per_second": 1.035, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.2345679012345678, |
|
"grad_norm": 0.13106146454811096, |
|
"learning_rate": 3.149309223300428e-06, |
|
"loss": 0.2537, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.2376543209876543, |
|
"grad_norm": 0.18745112419128418, |
|
"learning_rate": 3.126459758522813e-06, |
|
"loss": 0.3825, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.2407407407407407, |
|
"grad_norm": 0.1358872950077057, |
|
"learning_rate": 3.103655721422917e-06, |
|
"loss": 0.3057, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.2438271604938271, |
|
"grad_norm": 0.15695077180862427, |
|
"learning_rate": 3.080897664929592e-06, |
|
"loss": 0.412, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.2469135802469136, |
|
"grad_norm": 0.15740308165550232, |
|
"learning_rate": 3.0581861408567907e-06, |
|
"loss": 0.371, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.17210154235363007, |
|
"learning_rate": 3.035521699890206e-06, |
|
"loss": 0.4671, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.2530864197530864, |
|
"grad_norm": 0.1564391851425171, |
|
"learning_rate": 3.0129048915739013e-06, |
|
"loss": 0.397, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.2561728395061729, |
|
"grad_norm": 0.15035340189933777, |
|
"learning_rate": 2.9903362642969903e-06, |
|
"loss": 0.3696, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.2592592592592593, |
|
"grad_norm": 0.12334346026182175, |
|
"learning_rate": 2.967816365280351e-06, |
|
"loss": 0.2595, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.2623456790123457, |
|
"grad_norm": 0.159285768866539, |
|
"learning_rate": 2.94534574056334e-06, |
|
"loss": 0.3444, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.2654320987654322, |
|
"grad_norm": 0.14071713387966156, |
|
"learning_rate": 2.9229249349905686e-06, |
|
"loss": 0.264, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.2685185185185186, |
|
"grad_norm": 0.17824961245059967, |
|
"learning_rate": 2.9005544921986774e-06, |
|
"loss": 0.3823, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.2716049382716048, |
|
"grad_norm": 0.14212675392627716, |
|
"learning_rate": 2.8782349546031673e-06, |
|
"loss": 0.253, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.2746913580246915, |
|
"grad_norm": 0.21493245661258698, |
|
"learning_rate": 2.8559668633852433e-06, |
|
"loss": 0.3181, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.2777777777777777, |
|
"grad_norm": 0.14115536212921143, |
|
"learning_rate": 2.8337507584786826e-06, |
|
"loss": 0.3007, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.2808641975308643, |
|
"grad_norm": 0.16807730495929718, |
|
"learning_rate": 2.811587178556764e-06, |
|
"loss": 0.271, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.2839506172839505, |
|
"grad_norm": 0.19324727356433868, |
|
"learning_rate": 2.789476661019186e-06, |
|
"loss": 0.3613, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.287037037037037, |
|
"grad_norm": 0.22242026031017303, |
|
"learning_rate": 2.7674197419790493e-06, |
|
"loss": 0.3391, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.2901234567901234, |
|
"grad_norm": 0.1270921379327774, |
|
"learning_rate": 2.7454169562498503e-06, |
|
"loss": 0.2094, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.2932098765432098, |
|
"grad_norm": 0.12505224347114563, |
|
"learning_rate": 2.723468837332517e-06, |
|
"loss": 0.2807, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.2962962962962963, |
|
"grad_norm": 0.16030734777450562, |
|
"learning_rate": 2.7015759174024756e-06, |
|
"loss": 0.3266, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.2993827160493827, |
|
"grad_norm": 0.1334860622882843, |
|
"learning_rate": 2.6797387272967414e-06, |
|
"loss": 0.2262, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.3024691358024691, |
|
"grad_norm": 0.16829054057598114, |
|
"learning_rate": 2.65795779650105e-06, |
|
"loss": 0.3483, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.3055555555555556, |
|
"grad_norm": 0.16048014163970947, |
|
"learning_rate": 2.63623365313702e-06, |
|
"loss": 0.3673, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.308641975308642, |
|
"grad_norm": 0.22250574827194214, |
|
"learning_rate": 2.614566823949348e-06, |
|
"loss": 0.3418, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.3117283950617284, |
|
"grad_norm": 0.13716565072536469, |
|
"learning_rate": 2.592957834293033e-06, |
|
"loss": 0.2986, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.3148148148148149, |
|
"grad_norm": 0.15584644675254822, |
|
"learning_rate": 2.5714072081206407e-06, |
|
"loss": 0.3419, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.3179012345679013, |
|
"grad_norm": 0.17043578624725342, |
|
"learning_rate": 2.5499154679696014e-06, |
|
"loss": 0.3133, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.3209876543209877, |
|
"grad_norm": 0.1307077258825302, |
|
"learning_rate": 2.528483134949535e-06, |
|
"loss": 0.2484, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.324074074074074, |
|
"grad_norm": 0.19332851469516754, |
|
"learning_rate": 2.50711072872962e-06, |
|
"loss": 0.338, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.3271604938271606, |
|
"grad_norm": 0.18752485513687134, |
|
"learning_rate": 2.4857987675259887e-06, |
|
"loss": 0.3693, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.3302469135802468, |
|
"grad_norm": 0.171221524477005, |
|
"learning_rate": 2.4645477680891734e-06, |
|
"loss": 0.3222, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.2540048062801361, |
|
"learning_rate": 2.4433582456915556e-06, |
|
"loss": 0.4404, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.3364197530864197, |
|
"grad_norm": 0.13886091113090515, |
|
"learning_rate": 2.422230714114891e-06, |
|
"loss": 0.3246, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.3395061728395061, |
|
"grad_norm": 0.11673127859830856, |
|
"learning_rate": 2.4011656856378513e-06, |
|
"loss": 0.1878, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3425925925925926, |
|
"grad_norm": 0.20191854238510132, |
|
"learning_rate": 2.3801636710235836e-06, |
|
"loss": 0.2979, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.345679012345679, |
|
"grad_norm": 0.16786165535449982, |
|
"learning_rate": 2.3592251795073564e-06, |
|
"loss": 0.2931, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.3487654320987654, |
|
"grad_norm": 0.1304280310869217, |
|
"learning_rate": 2.338350718784177e-06, |
|
"loss": 0.2368, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.3518518518518519, |
|
"grad_norm": 0.14287714660167694, |
|
"learning_rate": 2.3175407949965167e-06, |
|
"loss": 0.286, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.3549382716049383, |
|
"grad_norm": 0.13601404428482056, |
|
"learning_rate": 2.296795912722014e-06, |
|
"loss": 0.268, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.3580246913580247, |
|
"grad_norm": 0.1764301061630249, |
|
"learning_rate": 2.2761165749612417e-06, |
|
"loss": 0.355, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.3611111111111112, |
|
"grad_norm": 0.1622696816921234, |
|
"learning_rate": 2.25550328312553e-06, |
|
"loss": 0.3438, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.3641975308641976, |
|
"grad_norm": 0.15518330037593842, |
|
"learning_rate": 2.2349565370247837e-06, |
|
"loss": 0.2844, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.367283950617284, |
|
"grad_norm": 0.13542047142982483, |
|
"learning_rate": 2.214476834855382e-06, |
|
"loss": 0.324, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.3703703703703702, |
|
"grad_norm": 0.20794177055358887, |
|
"learning_rate": 2.1940646731880887e-06, |
|
"loss": 0.5443, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.373456790123457, |
|
"grad_norm": 0.1371917873620987, |
|
"learning_rate": 2.173720546956015e-06, |
|
"loss": 0.3663, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.376543209876543, |
|
"grad_norm": 0.17952483892440796, |
|
"learning_rate": 2.1534449494426203e-06, |
|
"loss": 0.3209, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.3796296296296298, |
|
"grad_norm": 0.1383998692035675, |
|
"learning_rate": 2.1332383722697483e-06, |
|
"loss": 0.2407, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.382716049382716, |
|
"grad_norm": 0.17842058837413788, |
|
"learning_rate": 2.1131013053857097e-06, |
|
"loss": 0.5964, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.3858024691358024, |
|
"grad_norm": 0.13012441992759705, |
|
"learning_rate": 2.0930342370534013e-06, |
|
"loss": 0.2686, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"grad_norm": 0.1683279275894165, |
|
"learning_rate": 2.073037653838466e-06, |
|
"loss": 0.4134, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.3919753086419753, |
|
"grad_norm": 0.18860593438148499, |
|
"learning_rate": 2.053112040597495e-06, |
|
"loss": 0.2766, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.3950617283950617, |
|
"grad_norm": 0.15948981046676636, |
|
"learning_rate": 2.0332578804662783e-06, |
|
"loss": 0.452, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.3981481481481481, |
|
"grad_norm": 0.13614550232887268, |
|
"learning_rate": 2.013475654848076e-06, |
|
"loss": 0.3028, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.4012345679012346, |
|
"grad_norm": 0.1575852334499359, |
|
"learning_rate": 1.99376584340196e-06, |
|
"loss": 0.3772, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.404320987654321, |
|
"grad_norm": 0.1815677434206009, |
|
"learning_rate": 1.9741289240311757e-06, |
|
"loss": 0.4218, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.4074074074074074, |
|
"grad_norm": 0.16409048438072205, |
|
"learning_rate": 1.954565372871554e-06, |
|
"loss": 0.4449, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.4104938271604939, |
|
"grad_norm": 0.17997804284095764, |
|
"learning_rate": 1.935075664279978e-06, |
|
"loss": 0.3908, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.4135802469135803, |
|
"grad_norm": 0.17692823708057404, |
|
"learning_rate": 1.9156602708228584e-06, |
|
"loss": 0.3506, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.4166666666666667, |
|
"grad_norm": 0.17066018283367157, |
|
"learning_rate": 1.8963196632647008e-06, |
|
"loss": 0.4187, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.4197530864197532, |
|
"grad_norm": 0.17325402796268463, |
|
"learning_rate": 1.8770543105566752e-06, |
|
"loss": 0.3865, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.4228395061728394, |
|
"grad_norm": 0.1373230516910553, |
|
"learning_rate": 1.8578646798252432e-06, |
|
"loss": 0.2194, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.425925925925926, |
|
"grad_norm": 0.14924941956996918, |
|
"learning_rate": 1.8387512363608496e-06, |
|
"loss": 0.3415, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.4290123456790123, |
|
"grad_norm": 0.15401771664619446, |
|
"learning_rate": 1.8197144436066167e-06, |
|
"loss": 0.3132, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.4320987654320987, |
|
"grad_norm": 0.24441462755203247, |
|
"learning_rate": 1.8007547631471289e-06, |
|
"loss": 0.365, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.4351851851851851, |
|
"grad_norm": 0.2641655206680298, |
|
"learning_rate": 1.781872654697226e-06, |
|
"loss": 0.4653, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.4382716049382716, |
|
"grad_norm": 0.18639406561851501, |
|
"learning_rate": 1.7630685760908623e-06, |
|
"loss": 0.3422, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.441358024691358, |
|
"grad_norm": 0.14547406136989594, |
|
"learning_rate": 1.7443429832700038e-06, |
|
"loss": 0.3541, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.4444444444444444, |
|
"grad_norm": 0.179130420088768, |
|
"learning_rate": 1.7256963302735752e-06, |
|
"loss": 0.3341, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.4475308641975309, |
|
"grad_norm": 0.1942981481552124, |
|
"learning_rate": 1.7071290692264492e-06, |
|
"loss": 0.392, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.4506172839506173, |
|
"grad_norm": 0.10643615573644638, |
|
"learning_rate": 1.6886416503284835e-06, |
|
"loss": 0.2317, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.4537037037037037, |
|
"grad_norm": 0.14966462552547455, |
|
"learning_rate": 1.6702345218436066e-06, |
|
"loss": 0.2882, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.4567901234567902, |
|
"grad_norm": 0.1604948490858078, |
|
"learning_rate": 1.6519081300889472e-06, |
|
"loss": 0.3337, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.4598765432098766, |
|
"grad_norm": 0.23344826698303223, |
|
"learning_rate": 1.6336629194240118e-06, |
|
"loss": 0.3655, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.462962962962963, |
|
"grad_norm": 0.1553526222705841, |
|
"learning_rate": 1.6154993322399114e-06, |
|
"loss": 0.316, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.4660493827160495, |
|
"grad_norm": 0.1312614530324936, |
|
"learning_rate": 1.5974178089486364e-06, |
|
"loss": 0.301, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.4691358024691357, |
|
"grad_norm": 0.13480979204177856, |
|
"learning_rate": 1.5794187879723755e-06, |
|
"loss": 0.356, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.4722222222222223, |
|
"grad_norm": 0.14350688457489014, |
|
"learning_rate": 1.561502705732883e-06, |
|
"loss": 0.3021, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.4753086419753085, |
|
"grad_norm": 0.13871291279792786, |
|
"learning_rate": 1.543669996640908e-06, |
|
"loss": 0.4188, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.4783950617283952, |
|
"grad_norm": 0.16152562201023102, |
|
"learning_rate": 1.5259210930856423e-06, |
|
"loss": 0.3632, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.4814814814814814, |
|
"grad_norm": 0.17420196533203125, |
|
"learning_rate": 1.5082564254242583e-06, |
|
"loss": 0.3735, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.4814814814814814, |
|
"eval_loss": 0.430364727973938, |
|
"eval_runtime": 44.4346, |
|
"eval_samples_per_second": 8.282, |
|
"eval_steps_per_second": 1.035, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.4845679012345678, |
|
"grad_norm": 0.15298381447792053, |
|
"learning_rate": 1.4906764219714537e-06, |
|
"loss": 0.3162, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.4876543209876543, |
|
"grad_norm": 0.17767275869846344, |
|
"learning_rate": 1.4731815089890795e-06, |
|
"loss": 0.451, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.4907407407407407, |
|
"grad_norm": 0.2112477868795395, |
|
"learning_rate": 1.455772110675804e-06, |
|
"loss": 0.3914, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.4938271604938271, |
|
"grad_norm": 0.18488173186779022, |
|
"learning_rate": 1.438448649156815e-06, |
|
"loss": 0.3242, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.4969135802469136, |
|
"grad_norm": 0.19138255715370178, |
|
"learning_rate": 1.4212115444736024e-06, |
|
"loss": 0.3273, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.17519411444664001, |
|
"learning_rate": 1.4040612145737608e-06, |
|
"loss": 0.314, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.5030864197530864, |
|
"grad_norm": 0.11331440508365631, |
|
"learning_rate": 1.3869980753008537e-06, |
|
"loss": 0.2184, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.5061728395061729, |
|
"grad_norm": 0.1674378216266632, |
|
"learning_rate": 1.370022540384347e-06, |
|
"loss": 0.3075, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.5092592592592593, |
|
"grad_norm": 0.14736564457416534, |
|
"learning_rate": 1.353135021429554e-06, |
|
"loss": 0.3719, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.5123456790123457, |
|
"grad_norm": 0.14618776738643646, |
|
"learning_rate": 1.3363359279076776e-06, |
|
"loss": 0.3625, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.515432098765432, |
|
"grad_norm": 0.15497514605522156, |
|
"learning_rate": 1.3196256671458663e-06, |
|
"loss": 0.3522, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.5185185185185186, |
|
"grad_norm": 0.1439277082681656, |
|
"learning_rate": 1.3030046443173445e-06, |
|
"loss": 0.2904, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.5216049382716048, |
|
"grad_norm": 0.14361339807510376, |
|
"learning_rate": 1.2864732624315867e-06, |
|
"loss": 0.3338, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.5246913580246915, |
|
"grad_norm": 0.1480712592601776, |
|
"learning_rate": 1.270031922324546e-06, |
|
"loss": 0.4092, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.5277777777777777, |
|
"grad_norm": 0.156494140625, |
|
"learning_rate": 1.2536810226489354e-06, |
|
"loss": 0.3855, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.5308641975308643, |
|
"grad_norm": 0.2111222743988037, |
|
"learning_rate": 1.237420959864561e-06, |
|
"loss": 0.4681, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.5339506172839505, |
|
"grad_norm": 0.20178188383579254, |
|
"learning_rate": 1.2212521282287093e-06, |
|
"loss": 0.3472, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.5370370370370372, |
|
"grad_norm": 0.14656566083431244, |
|
"learning_rate": 1.2051749197865875e-06, |
|
"loss": 0.2829, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.5401234567901234, |
|
"grad_norm": 0.17030468583106995, |
|
"learning_rate": 1.1891897243618184e-06, |
|
"loss": 0.457, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.5432098765432098, |
|
"grad_norm": 0.16490556299686432, |
|
"learning_rate": 1.173296929546987e-06, |
|
"loss": 0.4265, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.5462962962962963, |
|
"grad_norm": 0.15814335644245148, |
|
"learning_rate": 1.1574969206942443e-06, |
|
"loss": 0.3079, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.5493827160493827, |
|
"grad_norm": 0.15672267973423004, |
|
"learning_rate": 1.1417900809059623e-06, |
|
"loss": 0.2618, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.5524691358024691, |
|
"grad_norm": 0.26926475763320923, |
|
"learning_rate": 1.1261767910254422e-06, |
|
"loss": 0.4501, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.5555555555555556, |
|
"grad_norm": 0.22438615560531616, |
|
"learning_rate": 1.1106574296276923e-06, |
|
"loss": 0.5102, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.558641975308642, |
|
"grad_norm": 0.16849224269390106, |
|
"learning_rate": 1.095232373010226e-06, |
|
"loss": 0.4356, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.5617283950617284, |
|
"grad_norm": 0.15593089163303375, |
|
"learning_rate": 1.0799019951839656e-06, |
|
"loss": 0.2973, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.5648148148148149, |
|
"grad_norm": 0.14039039611816406, |
|
"learning_rate": 1.0646666678641477e-06, |
|
"loss": 0.4104, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.567901234567901, |
|
"grad_norm": 0.11041123420000076, |
|
"learning_rate": 1.0495267604613273e-06, |
|
"loss": 0.2541, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.5709876543209877, |
|
"grad_norm": 0.1312185525894165, |
|
"learning_rate": 1.0344826400724185e-06, |
|
"loss": 0.2818, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.574074074074074, |
|
"grad_norm": 0.20511452853679657, |
|
"learning_rate": 1.0195346714717813e-06, |
|
"loss": 0.3218, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.5771604938271606, |
|
"grad_norm": 0.2118871957063675, |
|
"learning_rate": 1.0046832171023952e-06, |
|
"loss": 0.2921, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.5802469135802468, |
|
"grad_norm": 0.18419800698757172, |
|
"learning_rate": 9.899286370670575e-07, |
|
"loss": 0.4502, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.5833333333333335, |
|
"grad_norm": 0.1755116879940033, |
|
"learning_rate": 9.752712891196558e-07, |
|
"loss": 0.3514, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.5864197530864197, |
|
"grad_norm": 0.16331788897514343, |
|
"learning_rate": 9.607115286564972e-07, |
|
"loss": 0.318, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.5895061728395061, |
|
"grad_norm": 0.18510426580905914, |
|
"learning_rate": 9.46249708707681e-07, |
|
"loss": 0.3207, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.5925925925925926, |
|
"grad_norm": 0.1467633843421936, |
|
"learning_rate": 9.318861799285539e-07, |
|
"loss": 0.32, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.595679012345679, |
|
"grad_norm": 0.21128030121326447, |
|
"learning_rate": 9.176212905911946e-07, |
|
"loss": 0.4566, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.5987654320987654, |
|
"grad_norm": 0.14944253861904144, |
|
"learning_rate": 9.034553865759754e-07, |
|
"loss": 0.4221, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.6018518518518519, |
|
"grad_norm": 0.1913837343454361, |
|
"learning_rate": 8.893888113631732e-07, |
|
"loss": 0.3236, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.6049382716049383, |
|
"grad_norm": 0.14830860495567322, |
|
"learning_rate": 8.754219060246432e-07, |
|
"loss": 0.3504, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.6080246913580247, |
|
"grad_norm": 0.1303461194038391, |
|
"learning_rate": 8.615550092155478e-07, |
|
"loss": 0.2281, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.6111111111111112, |
|
"grad_norm": 0.11773131787776947, |
|
"learning_rate": 8.477884571661449e-07, |
|
"loss": 0.2038, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.6141975308641974, |
|
"grad_norm": 0.16557615995407104, |
|
"learning_rate": 8.341225836736367e-07, |
|
"loss": 0.2965, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.617283950617284, |
|
"grad_norm": 0.15140382945537567, |
|
"learning_rate": 8.20557720094074e-07, |
|
"loss": 0.2804, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.6203703703703702, |
|
"grad_norm": 0.15120923519134521, |
|
"learning_rate": 8.070941953343242e-07, |
|
"loss": 0.3037, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.623456790123457, |
|
"grad_norm": 0.28693991899490356, |
|
"learning_rate": 7.937323358440935e-07, |
|
"loss": 0.4625, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.626543209876543, |
|
"grad_norm": 0.226279154419899, |
|
"learning_rate": 7.804724656080182e-07, |
|
"loss": 0.3529, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.6296296296296298, |
|
"grad_norm": 0.14384153485298157, |
|
"learning_rate": 7.673149061377966e-07, |
|
"loss": 0.4064, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.632716049382716, |
|
"grad_norm": 0.153773695230484, |
|
"learning_rate": 7.542599764644049e-07, |
|
"loss": 0.2779, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.6358024691358026, |
|
"grad_norm": 0.2235001176595688, |
|
"learning_rate": 7.413079931303591e-07, |
|
"loss": 0.4181, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.6388888888888888, |
|
"grad_norm": 0.1906222552061081, |
|
"learning_rate": 7.284592701820325e-07, |
|
"loss": 0.2867, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.6419753086419753, |
|
"grad_norm": 0.189738929271698, |
|
"learning_rate": 7.157141191620548e-07, |
|
"loss": 0.3274, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.6450617283950617, |
|
"grad_norm": 0.15748707950115204, |
|
"learning_rate": 7.030728491017408e-07, |
|
"loss": 0.2892, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.6481481481481481, |
|
"grad_norm": 0.2472158521413803, |
|
"learning_rate": 6.905357665136142e-07, |
|
"loss": 0.3892, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.6512345679012346, |
|
"grad_norm": 0.18736745417118073, |
|
"learning_rate": 6.781031753839662e-07, |
|
"loss": 0.3192, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.654320987654321, |
|
"grad_norm": 0.15377798676490784, |
|
"learning_rate": 6.657753771654812e-07, |
|
"loss": 0.2991, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.6574074074074074, |
|
"grad_norm": 0.16992682218551636, |
|
"learning_rate": 6.535526707699408e-07, |
|
"loss": 0.3628, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.6604938271604939, |
|
"grad_norm": 0.201069176197052, |
|
"learning_rate": 6.414353525609628e-07, |
|
"loss": 0.3127, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.6635802469135803, |
|
"grad_norm": 0.14373762905597687, |
|
"learning_rate": 6.294237163468231e-07, |
|
"loss": 0.2488, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.16759946942329407, |
|
"learning_rate": 6.175180533733277e-07, |
|
"loss": 0.3833, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.6697530864197532, |
|
"grad_norm": 0.2061176598072052, |
|
"learning_rate": 6.057186523167529e-07, |
|
"loss": 0.252, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.6728395061728394, |
|
"grad_norm": 0.18383823335170746, |
|
"learning_rate": 5.940257992768456e-07, |
|
"loss": 0.3677, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.675925925925926, |
|
"grad_norm": 0.2329624891281128, |
|
"learning_rate": 5.824397777698859e-07, |
|
"loss": 0.3821, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.6790123456790123, |
|
"grad_norm": 0.16050845384597778, |
|
"learning_rate": 5.709608687218116e-07, |
|
"loss": 0.3203, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.682098765432099, |
|
"grad_norm": 0.1575547456741333, |
|
"learning_rate": 5.595893504614097e-07, |
|
"loss": 0.4154, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.6851851851851851, |
|
"grad_norm": 0.14166632294654846, |
|
"learning_rate": 5.483254987135644e-07, |
|
"loss": 0.2528, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.6882716049382716, |
|
"grad_norm": 0.1413419544696808, |
|
"learning_rate": 5.371695865925736e-07, |
|
"loss": 0.2011, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.691358024691358, |
|
"grad_norm": 0.14001396298408508, |
|
"learning_rate": 5.261218845955246e-07, |
|
"loss": 0.2521, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.6944444444444444, |
|
"grad_norm": 0.2379157692193985, |
|
"learning_rate": 5.151826605957394e-07, |
|
"loss": 0.3396, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.6975308641975309, |
|
"grad_norm": 0.1787138283252716, |
|
"learning_rate": 5.043521798362755e-07, |
|
"loss": 0.2596, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.7006172839506173, |
|
"grad_norm": 0.41910964250564575, |
|
"learning_rate": 4.936307049234956e-07, |
|
"loss": 0.3327, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.7037037037037037, |
|
"grad_norm": 0.1860780268907547, |
|
"learning_rate": 4.830184958207007e-07, |
|
"loss": 0.399, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.7067901234567902, |
|
"grad_norm": 0.16398878395557404, |
|
"learning_rate": 4.725158098418309e-07, |
|
"loss": 0.3953, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.7098765432098766, |
|
"grad_norm": 0.1744304746389389, |
|
"learning_rate": 4.6212290164521554e-07, |
|
"loss": 0.2567, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.7129629629629628, |
|
"grad_norm": 0.19683323800563812, |
|
"learning_rate": 4.5184002322740784e-07, |
|
"loss": 0.4327, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.7160493827160495, |
|
"grad_norm": 0.17663246393203735, |
|
"learning_rate": 4.4166742391707593e-07, |
|
"loss": 0.2145, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.7191358024691357, |
|
"grad_norm": 0.16606709361076355, |
|
"learning_rate": 4.316053503689466e-07, |
|
"loss": 0.3419, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.7222222222222223, |
|
"grad_norm": 0.21532438695430756, |
|
"learning_rate": 4.2165404655783836e-07, |
|
"loss": 0.379, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.7253086419753085, |
|
"grad_norm": 0.1450224667787552, |
|
"learning_rate": 4.1181375377273237e-07, |
|
"loss": 0.19, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.7283950617283952, |
|
"grad_norm": 0.18900087475776672, |
|
"learning_rate": 4.020847106109349e-07, |
|
"loss": 0.3304, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.7314814814814814, |
|
"grad_norm": 0.1328793317079544, |
|
"learning_rate": 3.9246715297228176e-07, |
|
"loss": 0.283, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.7314814814814814, |
|
"eval_loss": 0.42760223150253296, |
|
"eval_runtime": 44.2033, |
|
"eval_samples_per_second": 8.325, |
|
"eval_steps_per_second": 1.041, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.734567901234568, |
|
"grad_norm": 0.14145122468471527, |
|
"learning_rate": 3.829613140534222e-07, |
|
"loss": 0.3045, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.7376543209876543, |
|
"grad_norm": 0.1800602227449417, |
|
"learning_rate": 3.7356742434216775e-07, |
|
"loss": 0.2553, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.7407407407407407, |
|
"grad_norm": 0.18250073492527008, |
|
"learning_rate": 3.642857116118986e-07, |
|
"loss": 0.23, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.7438271604938271, |
|
"grad_norm": 0.14363303780555725, |
|
"learning_rate": 3.5511640091604293e-07, |
|
"loss": 0.2744, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.7469135802469136, |
|
"grad_norm": 0.16794289648532867, |
|
"learning_rate": 3.4605971458262e-07, |
|
"loss": 0.3806, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.15108714997768402, |
|
"learning_rate": 3.371158722088497e-07, |
|
"loss": 0.2868, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.7530864197530864, |
|
"grad_norm": 0.2250644415616989, |
|
"learning_rate": 3.2828509065582713e-07, |
|
"loss": 0.4173, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.7561728395061729, |
|
"grad_norm": 0.16634950041770935, |
|
"learning_rate": 3.195675840432655e-07, |
|
"loss": 0.3429, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.7592592592592593, |
|
"grad_norm": 0.3840501010417938, |
|
"learning_rate": 3.109635637443026e-07, |
|
"loss": 0.3564, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.7623456790123457, |
|
"grad_norm": 0.1317005604505539, |
|
"learning_rate": 3.02473238380378e-07, |
|
"loss": 0.2571, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.765432098765432, |
|
"grad_norm": 0.16465657949447632, |
|
"learning_rate": 2.9409681381617315e-07, |
|
"loss": 0.3739, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.7685185185185186, |
|
"grad_norm": 0.14124394953250885, |
|
"learning_rate": 2.858344931546181e-07, |
|
"loss": 0.2025, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.7716049382716048, |
|
"grad_norm": 0.19090065360069275, |
|
"learning_rate": 2.776864767319731e-07, |
|
"loss": 0.3652, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.7746913580246915, |
|
"grad_norm": 0.16761578619480133, |
|
"learning_rate": 2.696529621129618e-07, |
|
"loss": 0.3257, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.17358000576496124, |
|
"learning_rate": 2.617341440859883e-07, |
|
"loss": 0.3162, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.7808641975308643, |
|
"grad_norm": 0.13688547909259796, |
|
"learning_rate": 2.539302146584116e-07, |
|
"loss": 0.2838, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.7839506172839505, |
|
"grad_norm": 0.12233246117830276, |
|
"learning_rate": 2.4624136305188895e-07, |
|
"loss": 0.2656, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.7870370370370372, |
|
"grad_norm": 0.14487585425376892, |
|
"learning_rate": 2.3866777569779234e-07, |
|
"loss": 0.2808, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.7901234567901234, |
|
"grad_norm": 0.1593523919582367, |
|
"learning_rate": 2.3120963623267822e-07, |
|
"loss": 0.3441, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.7932098765432098, |
|
"grad_norm": 0.1122526079416275, |
|
"learning_rate": 2.2386712549384848e-07, |
|
"loss": 0.1452, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.7962962962962963, |
|
"grad_norm": 0.1848554015159607, |
|
"learning_rate": 2.1664042151495424e-07, |
|
"loss": 0.407, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.7993827160493827, |
|
"grad_norm": 0.17059315741062164, |
|
"learning_rate": 2.095296995216828e-07, |
|
"loss": 0.3516, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.8024691358024691, |
|
"grad_norm": 0.18412597477436066, |
|
"learning_rate": 2.0253513192751374e-07, |
|
"loss": 0.2922, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.8055555555555556, |
|
"grad_norm": 0.17134982347488403, |
|
"learning_rate": 1.9565688832952846e-07, |
|
"loss": 0.2951, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.808641975308642, |
|
"grad_norm": 0.11777715384960175, |
|
"learning_rate": 1.8889513550430892e-07, |
|
"loss": 0.24, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.8117283950617284, |
|
"grad_norm": 0.18584772944450378, |
|
"learning_rate": 1.8225003740388546e-07, |
|
"loss": 0.3498, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.8148148148148149, |
|
"grad_norm": 0.15893200039863586, |
|
"learning_rate": 1.7572175515176538e-07, |
|
"loss": 0.3392, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.817901234567901, |
|
"grad_norm": 0.152305468916893, |
|
"learning_rate": 1.693104470390261e-07, |
|
"loss": 0.2333, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.8209876543209877, |
|
"grad_norm": 0.15064826607704163, |
|
"learning_rate": 1.6301626852047504e-07, |
|
"loss": 0.2935, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.824074074074074, |
|
"grad_norm": 0.18689890205860138, |
|
"learning_rate": 1.5683937221088242e-07, |
|
"loss": 0.4082, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.8271604938271606, |
|
"grad_norm": 0.16067026555538177, |
|
"learning_rate": 1.5077990788127993e-07, |
|
"loss": 0.2624, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.8302469135802468, |
|
"grad_norm": 0.15756982564926147, |
|
"learning_rate": 1.448380224553303e-07, |
|
"loss": 0.3681, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.8333333333333335, |
|
"grad_norm": 0.16193000972270966, |
|
"learning_rate": 1.3901386000576112e-07, |
|
"loss": 0.5148, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.8364197530864197, |
|
"grad_norm": 0.1545064002275467, |
|
"learning_rate": 1.3330756175087778e-07, |
|
"loss": 0.2837, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.8395061728395061, |
|
"grad_norm": 0.1584656536579132, |
|
"learning_rate": 1.2771926605113283e-07, |
|
"loss": 0.267, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.8425925925925926, |
|
"grad_norm": 0.23085588216781616, |
|
"learning_rate": 1.2224910840577642e-07, |
|
"loss": 0.3637, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.845679012345679, |
|
"grad_norm": 0.15698540210723877, |
|
"learning_rate": 1.1689722144956672e-07, |
|
"loss": 0.2152, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.8487654320987654, |
|
"grad_norm": 0.1545877605676651, |
|
"learning_rate": 1.1166373494955696e-07, |
|
"loss": 0.3073, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.8518518518518519, |
|
"grad_norm": 0.16467563807964325, |
|
"learning_rate": 1.06548775801949e-07, |
|
"loss": 0.3654, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.8549382716049383, |
|
"grad_norm": 0.20076429843902588, |
|
"learning_rate": 1.0155246802901198e-07, |
|
"loss": 0.3131, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 1.8580246913580247, |
|
"grad_norm": 0.14146511256694794, |
|
"learning_rate": 9.667493277608187e-08, |
|
"loss": 0.3651, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.8611111111111112, |
|
"grad_norm": 0.15111708641052246, |
|
"learning_rate": 9.191628830861832e-08, |
|
"loss": 0.267, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 1.8641975308641974, |
|
"grad_norm": 0.13036541640758514, |
|
"learning_rate": 8.727665000934027e-08, |
|
"loss": 0.2568, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.867283950617284, |
|
"grad_norm": 0.16827543079853058, |
|
"learning_rate": 8.275613037542873e-08, |
|
"loss": 0.4188, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 1.8703703703703702, |
|
"grad_norm": 0.18110865354537964, |
|
"learning_rate": 7.835483901579454e-08, |
|
"loss": 0.3361, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.873456790123457, |
|
"grad_norm": 0.1515679508447647, |
|
"learning_rate": 7.407288264842772e-08, |
|
"loss": 0.3421, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 1.876543209876543, |
|
"grad_norm": 0.1735447645187378, |
|
"learning_rate": 6.991036509780391e-08, |
|
"loss": 0.3908, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.8796296296296298, |
|
"grad_norm": 0.15131166577339172, |
|
"learning_rate": 6.58673872923693e-08, |
|
"loss": 0.2439, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.882716049382716, |
|
"grad_norm": 0.12076130509376526, |
|
"learning_rate": 6.194404726209358e-08, |
|
"loss": 0.2178, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.8858024691358026, |
|
"grad_norm": 0.1315135806798935, |
|
"learning_rate": 5.8140440136091326e-08, |
|
"loss": 0.2291, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 1.8888888888888888, |
|
"grad_norm": 0.17915165424346924, |
|
"learning_rate": 5.445665814031942e-08, |
|
"loss": 0.2377, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.8919753086419753, |
|
"grad_norm": 0.14008641242980957, |
|
"learning_rate": 5.089279059533658e-08, |
|
"loss": 0.2266, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 1.8950617283950617, |
|
"grad_norm": 0.18772335350513458, |
|
"learning_rate": 4.744892391413791e-08, |
|
"loss": 0.4006, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.8981481481481481, |
|
"grad_norm": 0.14937154948711395, |
|
"learning_rate": 4.412514160006376e-08, |
|
"loss": 0.3891, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.9012345679012346, |
|
"grad_norm": 0.12767252326011658, |
|
"learning_rate": 4.092152424477025e-08, |
|
"loss": 0.2397, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.904320987654321, |
|
"grad_norm": 0.16874873638153076, |
|
"learning_rate": 3.7838149526277514e-08, |
|
"loss": 0.3338, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 1.9074074074074074, |
|
"grad_norm": 0.1845911145210266, |
|
"learning_rate": 3.487509220708563e-08, |
|
"loss": 0.4378, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.9104938271604939, |
|
"grad_norm": 0.14064140617847443, |
|
"learning_rate": 3.2032424132362736e-08, |
|
"loss": 0.2801, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.9135802469135803, |
|
"grad_norm": 0.14805810153484344, |
|
"learning_rate": 2.9310214228202016e-08, |
|
"loss": 0.3122, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.9166666666666665, |
|
"grad_norm": 0.1921551674604416, |
|
"learning_rate": 2.6708528499950758e-08, |
|
"loss": 0.2982, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.9197530864197532, |
|
"grad_norm": 0.14775682985782623, |
|
"learning_rate": 2.4227430030609455e-08, |
|
"loss": 0.3503, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.9228395061728394, |
|
"grad_norm": 0.17906314134597778, |
|
"learning_rate": 2.1866978979303567e-08, |
|
"loss": 0.3863, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.925925925925926, |
|
"grad_norm": 0.1467551589012146, |
|
"learning_rate": 1.962723257982302e-08, |
|
"loss": 0.2993, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.9290123456790123, |
|
"grad_norm": 0.2205621749162674, |
|
"learning_rate": 1.7508245139236658e-08, |
|
"loss": 0.3168, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.932098765432099, |
|
"grad_norm": 0.1704474836587906, |
|
"learning_rate": 1.5510068036573288e-08, |
|
"loss": 0.3177, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.9351851851851851, |
|
"grad_norm": 0.15591393411159515, |
|
"learning_rate": 1.3632749721577132e-08, |
|
"loss": 0.2671, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.9382716049382716, |
|
"grad_norm": 0.1339595913887024, |
|
"learning_rate": 1.1876335713532638e-08, |
|
"loss": 0.196, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.941358024691358, |
|
"grad_norm": 0.15144091844558716, |
|
"learning_rate": 1.024086860016149e-08, |
|
"loss": 0.306, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.9444444444444444, |
|
"grad_norm": 0.14868693053722382, |
|
"learning_rate": 8.726388036587874e-09, |
|
"loss": 0.271, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.9475308641975309, |
|
"grad_norm": 0.14298443496227264, |
|
"learning_rate": 7.332930744380906e-09, |
|
"loss": 0.225, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.9506172839506173, |
|
"grad_norm": 0.14053991436958313, |
|
"learning_rate": 6.060530510659246e-09, |
|
"loss": 0.32, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.9537037037037037, |
|
"grad_norm": 0.2039446085691452, |
|
"learning_rate": 4.909218187276743e-09, |
|
"loss": 0.4306, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.9567901234567902, |
|
"grad_norm": 0.20658931136131287, |
|
"learning_rate": 3.8790216900702615e-09, |
|
"loss": 0.4053, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.9598765432098766, |
|
"grad_norm": 0.30260926485061646, |
|
"learning_rate": 2.9699659981863306e-09, |
|
"loss": 0.3979, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.9629629629629628, |
|
"grad_norm": 0.1412692815065384, |
|
"learning_rate": 2.182073153471631e-09, |
|
"loss": 0.1879, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.9660493827160495, |
|
"grad_norm": 0.11770602315664291, |
|
"learning_rate": 1.5153622599428652e-09, |
|
"loss": 0.2462, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.9691358024691357, |
|
"grad_norm": 0.156539648771286, |
|
"learning_rate": 9.698494833199068e-10, |
|
"loss": 0.3218, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.9722222222222223, |
|
"grad_norm": 0.19168072938919067, |
|
"learning_rate": 5.455480506355582e-10, |
|
"loss": 0.4821, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.9753086419753085, |
|
"grad_norm": 0.13230177760124207, |
|
"learning_rate": 2.4246824991525085e-10, |
|
"loss": 0.3134, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.9783950617283952, |
|
"grad_norm": 0.1942073255777359, |
|
"learning_rate": 6.061742992613529e-11, |
|
"loss": 0.3413, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.9814814814814814, |
|
"grad_norm": 0.15652911365032196, |
|
"learning_rate": 0.0, |
|
"loss": 0.2942, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.9814814814814814, |
|
"eval_loss": 0.42709851264953613, |
|
"eval_runtime": 44.317, |
|
"eval_samples_per_second": 8.304, |
|
"eval_steps_per_second": 1.038, |
|
"step": 648 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 648, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 162, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.584525189221712e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|