|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9924139799512326, |
|
"eval_steps": 58, |
|
"global_step": 460, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004336630979807562, |
|
"grad_norm": 7.40625, |
|
"learning_rate": 2.173913043478261e-07, |
|
"loss": 1.0297, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004336630979807562, |
|
"eval_loss": 1.1468182802200317, |
|
"eval_runtime": 109.2361, |
|
"eval_samples_per_second": 7.909, |
|
"eval_steps_per_second": 1.977, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008673261959615123, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 4.347826086956522e-07, |
|
"loss": 1.0442, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.013009892939422686, |
|
"grad_norm": 8.75, |
|
"learning_rate": 6.521739130434783e-07, |
|
"loss": 1.0301, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.017346523919230247, |
|
"grad_norm": 13.3125, |
|
"learning_rate": 8.695652173913044e-07, |
|
"loss": 1.0477, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.02168315489903781, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 1.0869565217391306e-06, |
|
"loss": 1.038, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02601978587884537, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.3043478260869566e-06, |
|
"loss": 1.0415, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.030356416858652934, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.521739130434783e-06, |
|
"loss": 1.0229, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.03469304783846049, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 1.7391304347826088e-06, |
|
"loss": 1.0266, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.039029678818268056, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 1.956521739130435e-06, |
|
"loss": 1.0237, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.04336630979807562, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 2.173913043478261e-06, |
|
"loss": 1.0316, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04770294077788318, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 2.391304347826087e-06, |
|
"loss": 1.0134, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.05203957175769074, |
|
"grad_norm": 6.375, |
|
"learning_rate": 2.6086956521739132e-06, |
|
"loss": 1.0288, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.056376202737498306, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 2.8260869565217393e-06, |
|
"loss": 1.0208, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.06071283371730587, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 3.043478260869566e-06, |
|
"loss": 0.9986, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.06504946469711342, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 3.2608695652173914e-06, |
|
"loss": 1.0102, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06938609567692099, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 3.4782608695652175e-06, |
|
"loss": 0.991, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.07372272665672855, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 3.6956521739130436e-06, |
|
"loss": 0.9974, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.07805935763653611, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 3.91304347826087e-06, |
|
"loss": 0.9997, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.08239598861634367, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 4.130434782608696e-06, |
|
"loss": 0.9896, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.08673261959615124, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 4.347826086956522e-06, |
|
"loss": 0.973, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0910692505759588, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 4.565217391304348e-06, |
|
"loss": 0.9764, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.09540588155576636, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 4.782608695652174e-06, |
|
"loss": 0.9461, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.09974251253557392, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9355, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.10407914351538149, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 5.2173913043478265e-06, |
|
"loss": 0.9725, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.10841577449518905, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 5.4347826086956525e-06, |
|
"loss": 0.9244, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.11275240547499661, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 5.652173913043479e-06, |
|
"loss": 0.929, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.11708903645480417, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 5.8695652173913055e-06, |
|
"loss": 0.9434, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.12142566743461174, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 6.086956521739132e-06, |
|
"loss": 0.9331, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.1257622984144193, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 6.304347826086958e-06, |
|
"loss": 0.9264, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.13009892939422685, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 6.521739130434783e-06, |
|
"loss": 0.9114, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.13443556037403442, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 6.739130434782609e-06, |
|
"loss": 0.9277, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.13877219135384197, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 6.956521739130435e-06, |
|
"loss": 0.9154, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.14310882233364955, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 7.173913043478261e-06, |
|
"loss": 0.939, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.1474454533134571, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 7.391304347826087e-06, |
|
"loss": 0.9146, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.15178208429326467, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 7.608695652173914e-06, |
|
"loss": 0.9063, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.15611871527307222, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 7.82608695652174e-06, |
|
"loss": 0.9178, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.1604553462528798, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 8.043478260869566e-06, |
|
"loss": 0.9184, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.16479197723268735, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 8.260869565217392e-06, |
|
"loss": 0.911, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.16912860821249492, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 8.478260869565218e-06, |
|
"loss": 0.9031, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.17346523919230247, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 8.695652173913044e-06, |
|
"loss": 0.8881, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.17780187017211005, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 8.91304347826087e-06, |
|
"loss": 0.8846, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.1821385011519176, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 9.130434782608697e-06, |
|
"loss": 0.8895, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.18647513213172517, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 9.347826086956523e-06, |
|
"loss": 0.8683, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.19081176311153272, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 9.565217391304349e-06, |
|
"loss": 0.8795, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.1951483940913403, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 9.782608695652175e-06, |
|
"loss": 0.8829, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.19948502507114785, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8703, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.20382165605095542, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 9.999856041607732e-06, |
|
"loss": 0.8702, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.20815828703076297, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 9.99942417472053e-06, |
|
"loss": 0.869, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.21249491801057055, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 9.998704424206747e-06, |
|
"loss": 0.8748, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.2168315489903781, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 9.997696831512027e-06, |
|
"loss": 0.8737, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.22116817997018567, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 9.996401454656941e-06, |
|
"loss": 0.8745, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.22550481094999322, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 9.994818368233639e-06, |
|
"loss": 0.8677, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.2298414419298008, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 9.992947663401548e-06, |
|
"loss": 0.863, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.23417807290960835, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 9.990789447882136e-06, |
|
"loss": 0.8709, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.2385147038894159, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 9.988343845952697e-06, |
|
"loss": 0.8543, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.24285133486922347, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 9.985610998439198e-06, |
|
"loss": 0.8735, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.24718796584903102, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 9.982591062708172e-06, |
|
"loss": 0.8631, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.2515245968288386, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 9.979284212657658e-06, |
|
"loss": 0.8512, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.2515245968288386, |
|
"eval_loss": 0.8729492425918579, |
|
"eval_runtime": 109.2389, |
|
"eval_samples_per_second": 7.909, |
|
"eval_steps_per_second": 1.977, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.2558612278086462, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 9.97569063870718e-06, |
|
"loss": 0.8554, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.2601978587884537, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 9.971810547786794e-06, |
|
"loss": 0.8661, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.26453448976826127, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 9.967644163325157e-06, |
|
"loss": 0.8592, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.26887112074806885, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 9.963191725236672e-06, |
|
"loss": 0.8614, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.2732077517278764, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 9.958453489907673e-06, |
|
"loss": 0.8555, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.27754438270768395, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.953429730181653e-06, |
|
"loss": 0.8572, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.2818810136874915, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 9.948120735343566e-06, |
|
"loss": 0.8583, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2862176446672991, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 9.942526811103153e-06, |
|
"loss": 0.8433, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.2905542756471067, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 9.93664827957735e-06, |
|
"loss": 0.8505, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.2948909066269142, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 9.930485479271735e-06, |
|
"loss": 0.8403, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.29922753760672177, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.924038765061042e-06, |
|
"loss": 0.8585, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.30356416858652935, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 9.917308508168712e-06, |
|
"loss": 0.8567, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3079007995663369, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.91029509614553e-06, |
|
"loss": 0.8543, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.31223743054614445, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 9.902998932847308e-06, |
|
"loss": 0.8752, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.316574061525952, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 9.895420438411616e-06, |
|
"loss": 0.8535, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.3209106925057596, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 9.887560049233606e-06, |
|
"loss": 0.8601, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.3252473234855672, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 9.879418217940872e-06, |
|
"loss": 0.8543, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.3295839544653747, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 9.870995413367397e-06, |
|
"loss": 0.8113, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.33392058544518227, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 9.862292120526536e-06, |
|
"loss": 0.8583, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.33825721642498985, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 9.85330884058311e-06, |
|
"loss": 0.832, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.3425938474047974, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 9.844046090824533e-06, |
|
"loss": 0.8271, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.34693047838460495, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 9.834504404631032e-06, |
|
"loss": 0.8503, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3512671093644125, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 9.824684331444926e-06, |
|
"loss": 0.8189, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3556037403442201, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 9.814586436738998e-06, |
|
"loss": 0.8465, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.3599403713240276, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 9.804211301983919e-06, |
|
"loss": 0.8159, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.3642770023038352, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 9.793559524614779e-06, |
|
"loss": 0.8392, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.36861363328364277, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 9.782631717996675e-06, |
|
"loss": 0.8291, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.37295026426345035, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 9.771428511389395e-06, |
|
"loss": 0.8398, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.37728689524325787, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 9.759950549911185e-06, |
|
"loss": 0.8499, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.38162352622306545, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 9.748198494501598e-06, |
|
"loss": 0.8244, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.385960157202873, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 9.736173021883433e-06, |
|
"loss": 0.8281, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.3902967881826806, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 9.72387482452377e-06, |
|
"loss": 0.8165, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3946334191624881, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 9.711304610594104e-06, |
|
"loss": 0.8329, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.3989700501422957, |
|
"grad_norm": 0.375, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 0.8218, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.40330668112210327, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 9.685351043987151e-06, |
|
"loss": 0.8132, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.40764331210191085, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 9.671969185803357e-06, |
|
"loss": 0.8357, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.41197994308171837, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 9.658318299950473e-06, |
|
"loss": 0.8352, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.41631657406152595, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 9.644399172492337e-06, |
|
"loss": 0.8112, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.4206532050413335, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 9.630212604939026e-06, |
|
"loss": 0.8376, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.4249898360211411, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 9.615759414200729e-06, |
|
"loss": 0.8304, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.4293264670009486, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 9.601040432540684e-06, |
|
"loss": 0.8403, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.4336630979807562, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 9.586056507527266e-06, |
|
"loss": 0.8331, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4379997289605638, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 9.570808501985176e-06, |
|
"loss": 0.8268, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.44233635994037135, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 9.55529729394576e-06, |
|
"loss": 0.8264, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.44667299092017887, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 9.539523776596446e-06, |
|
"loss": 0.8235, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.45100962189998645, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 9.523488858229313e-06, |
|
"loss": 0.8276, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.455346252879794, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 9.507193462188791e-06, |
|
"loss": 0.8142, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.4596828838596016, |
|
"grad_norm": 0.375, |
|
"learning_rate": 9.490638526818482e-06, |
|
"loss": 0.8092, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.4640195148394091, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.47382500540714e-06, |
|
"loss": 0.8256, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.4683561458192167, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 9.45675386613377e-06, |
|
"loss": 0.8342, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.4726927767990243, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 9.439426092011877e-06, |
|
"loss": 0.8087, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.4770294077788318, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 9.421842680832862e-06, |
|
"loss": 0.8316, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.48136603875863937, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 9.40400464510857e-06, |
|
"loss": 0.8257, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.48570266973844695, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 9.385913012012972e-06, |
|
"loss": 0.8246, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.4900393007182545, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 9.367568823323039e-06, |
|
"loss": 0.8206, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.49437593169806204, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 9.348973135358734e-06, |
|
"loss": 0.8358, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.4987125626778696, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 9.330127018922195e-06, |
|
"loss": 0.8017, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.5030491936576772, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 9.311031559236067e-06, |
|
"loss": 0.8496, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.5030491936576772, |
|
"eval_loss": 0.8192870616912842, |
|
"eval_runtime": 109.0108, |
|
"eval_samples_per_second": 7.926, |
|
"eval_steps_per_second": 1.981, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.5073858246374847, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 9.291687855881027e-06, |
|
"loss": 0.8147, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.5117224556172923, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 9.272097022732444e-06, |
|
"loss": 0.8277, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.5160590865970999, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.252260187896257e-06, |
|
"loss": 0.8212, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.5203957175769074, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.232178493644006e-06, |
|
"loss": 0.8375, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.524732348556715, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 9.211853096347059e-06, |
|
"loss": 0.8386, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.5290689795365225, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 9.191285166410023e-06, |
|
"loss": 0.8118, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.5334056105163302, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 9.170475888203348e-06, |
|
"loss": 0.8181, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.5377422414961377, |
|
"grad_norm": 0.375, |
|
"learning_rate": 9.149426459995127e-06, |
|
"loss": 0.8213, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.5420788724759452, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 9.128138093882098e-06, |
|
"loss": 0.8392, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5464155034557528, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 9.106612015719845e-06, |
|
"loss": 0.8286, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.5507521344355604, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 9.08484946505221e-06, |
|
"loss": 0.8324, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.5550887654153679, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 9.062851695039915e-06, |
|
"loss": 0.8271, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.5594253963951755, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 9.040619972388402e-06, |
|
"loss": 0.8179, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.563762027374983, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 9.018155577274891e-06, |
|
"loss": 0.8214, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5680986583547907, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 8.995459803274664e-06, |
|
"loss": 0.8255, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.5724352893345982, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 8.972533957286574e-06, |
|
"loss": 0.8167, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.5767719203144057, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 8.949379359457795e-06, |
|
"loss": 0.8012, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.5811085512942133, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 8.925997343107796e-06, |
|
"loss": 0.8182, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.5854451822740209, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 8.902389254651568e-06, |
|
"loss": 0.8073, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5897818132538284, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 8.8785564535221e-06, |
|
"loss": 0.8195, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.594118444233636, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 8.854500312092081e-06, |
|
"loss": 0.8292, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.5984550752134435, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 8.83022221559489e-06, |
|
"loss": 0.8204, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.6027917061932511, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 8.805723562044825e-06, |
|
"loss": 0.8175, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.6071283371730587, |
|
"grad_norm": 0.375, |
|
"learning_rate": 8.781005762156593e-06, |
|
"loss": 0.8044, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6114649681528662, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 8.756070239264089e-06, |
|
"loss": 0.8187, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.6158015991326738, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 8.730918429238429e-06, |
|
"loss": 0.8164, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.6201382301124814, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 8.705551780405264e-06, |
|
"loss": 0.8051, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.6244748610922889, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 8.679971753461388e-06, |
|
"loss": 0.8127, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.6288114920720965, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 8.65417982139062e-06, |
|
"loss": 0.8283, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.633148123051904, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 8.628177469378995e-06, |
|
"loss": 0.8169, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.6374847540317116, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 8.601966194729228e-06, |
|
"loss": 0.8209, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.6418213850115192, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 8.575547506774498e-06, |
|
"loss": 0.8388, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.6461580159913267, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 8.548922926791545e-06, |
|
"loss": 0.8129, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.6504946469711343, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 8.522093987913063e-06, |
|
"loss": 0.8282, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6548312779509419, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 8.49506223503941e-06, |
|
"loss": 0.813, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.6591679089307494, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 8.467829224749665e-06, |
|
"loss": 0.8313, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.663504539910557, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 8.440396525211976e-06, |
|
"loss": 0.828, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.6678411708903645, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 8.412765716093273e-06, |
|
"loss": 0.8247, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.6721778018701721, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 8.384938388468296e-06, |
|
"loss": 0.8046, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.6765144328499797, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 8.356916144727985e-06, |
|
"loss": 0.814, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.6808510638297872, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 8.328700598487203e-06, |
|
"loss": 0.8147, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.6851876948095948, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 8.300293374491821e-06, |
|
"loss": 0.8083, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.6895243257894024, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 8.271696108525156e-06, |
|
"loss": 0.801, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.6938609567692099, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 8.24291044731378e-06, |
|
"loss": 0.8155, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6981975877490175, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 8.213938048432697e-06, |
|
"loss": 0.7988, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.702534218728825, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 8.184780580209892e-06, |
|
"loss": 0.8184, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.7068708497086326, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 8.155439721630265e-06, |
|
"loss": 0.8212, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.7112074806884402, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 8.125917162238945e-06, |
|
"loss": 0.8401, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.7155441116682477, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 8.096214602044011e-06, |
|
"loss": 0.7886, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.7198807426480552, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 8.066333751418582e-06, |
|
"loss": 0.8181, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.7242173736278629, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 8.036276331002348e-06, |
|
"loss": 0.8188, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.7285540046076704, |
|
"grad_norm": 0.375, |
|
"learning_rate": 8.006044071602476e-06, |
|
"loss": 0.7999, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.732890635587478, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 7.97563871409395e-06, |
|
"loss": 0.8273, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.7372272665672855, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 7.94506200931932e-06, |
|
"loss": 0.7848, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7415638975470931, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 7.914315717987892e-06, |
|
"loss": 0.82, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.7459005285269007, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 7.883401610574338e-06, |
|
"loss": 0.805, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.7502371595067082, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 7.85232146721673e-06, |
|
"loss": 0.8017, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.7545737904865157, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 7.821077077614062e-06, |
|
"loss": 0.8175, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.7545737904865157, |
|
"eval_loss": 0.8033392429351807, |
|
"eval_runtime": 109.1503, |
|
"eval_samples_per_second": 7.916, |
|
"eval_steps_per_second": 1.979, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.7589104214663234, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 7.789670240923169e-06, |
|
"loss": 0.825, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7632470524461309, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 7.758102765655136e-06, |
|
"loss": 0.8155, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.7675836834259385, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 7.726376469571165e-06, |
|
"loss": 0.8138, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.771920314405746, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 7.69449317957788e-06, |
|
"loss": 0.8055, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.7762569453855536, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 7.66245473162215e-06, |
|
"loss": 0.8046, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.7805935763653612, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 7.630262970585355e-06, |
|
"loss": 0.8138, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7849302073451687, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.597919750177168e-06, |
|
"loss": 0.8366, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.7892668383249762, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 7.56542693282879e-06, |
|
"loss": 0.8303, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.7936034693047839, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 7.532786389585715e-06, |
|
"loss": 0.8139, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.7979401002845914, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.8098, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.802276731264399, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 7.467069652022017e-06, |
|
"loss": 0.8116, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.8066133622442065, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 7.433997241891743e-06, |
|
"loss": 0.7941, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.8109499932240141, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 7.400784674029579e-06, |
|
"loss": 0.8123, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.8152866242038217, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 7.3674338609266705e-06, |
|
"loss": 0.8237, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.8196232551836292, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 7.333946723034794e-06, |
|
"loss": 0.8241, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.8239598861634367, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 7.300325188655762e-06, |
|
"loss": 0.8072, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8282965171432444, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.266571193830387e-06, |
|
"loss": 0.8027, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.8326331481230519, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 7.232686682227001e-06, |
|
"loss": 0.8351, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.8369697791028594, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 7.198673605029529e-06, |
|
"loss": 0.8108, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.841306410082667, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 7.164533920825137e-06, |
|
"loss": 0.8248, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.8456430410624746, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 7.130269595491443e-06, |
|
"loss": 0.8117, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.8499796720422822, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 7.095882602083321e-06, |
|
"loss": 0.832, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.8543163030220897, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 7.061374920719288e-06, |
|
"loss": 0.8196, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.8586529340018972, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 7.026748538467474e-06, |
|
"loss": 0.8023, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.8629895649817049, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 6.9920054492312086e-06, |
|
"loss": 0.8149, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.8673261959615124, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.957147653634198e-06, |
|
"loss": 0.8166, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8716628269413199, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 6.922177158905326e-06, |
|
"loss": 0.8198, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.8759994579211275, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 6.887095978763072e-06, |
|
"loss": 0.797, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.8803360889009351, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 6.851906133299556e-06, |
|
"loss": 0.8162, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.8846727198807427, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 6.816609648864208e-06, |
|
"loss": 0.8272, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.8890093508605502, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 6.781208557947085e-06, |
|
"loss": 0.7975, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.8933459818403577, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 6.745704899061843e-06, |
|
"loss": 0.8349, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.8976826128201654, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 6.710100716628345e-06, |
|
"loss": 0.7963, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.9020192437999729, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 6.674398060854931e-06, |
|
"loss": 0.8233, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.9063558747797804, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 6.638598987620375e-06, |
|
"loss": 0.8137, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.910692505759588, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 6.6027055583554865e-06, |
|
"loss": 0.8076, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9150291367393956, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 6.566719839924412e-06, |
|
"loss": 0.8046, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.9193657677192032, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 6.530643904505622e-06, |
|
"loss": 0.8211, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.9237023986990107, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 6.49447982947258e-06, |
|
"loss": 0.8135, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.9280390296788182, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 6.458229697274125e-06, |
|
"loss": 0.7993, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.9323756606586259, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 6.42189559531456e-06, |
|
"loss": 0.7944, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.9367122916384334, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 6.385479615833445e-06, |
|
"loss": 0.8078, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.9410489226182409, |
|
"grad_norm": 0.375, |
|
"learning_rate": 6.348983855785122e-06, |
|
"loss": 0.7926, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.9453855535980485, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 6.312410416717969e-06, |
|
"loss": 0.8212, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.9497221845778561, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 6.275761404653381e-06, |
|
"loss": 0.7814, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.9540588155576636, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 6.2390389299645e-06, |
|
"loss": 0.8039, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9583954465374712, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 6.2022451072546926e-06, |
|
"loss": 0.802, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.9627320775172787, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 6.165382055235784e-06, |
|
"loss": 0.7972, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.9670687084970864, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 6.128451896606054e-06, |
|
"loss": 0.7882, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.9714053394768939, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 6.091456757928008e-06, |
|
"loss": 0.7859, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.9757419704567014, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 6.0543987695059236e-06, |
|
"loss": 0.7966, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.980078601436509, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 6.0172800652631706e-06, |
|
"loss": 0.8079, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.9844152324163166, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 5.980102782619343e-06, |
|
"loss": 0.8123, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.9887518633961241, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 5.9428690623671796e-06, |
|
"loss": 0.8359, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.9930884943759317, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 5.905581048549279e-06, |
|
"loss": 0.8287, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.9974251253557392, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 5.8682408883346535e-06, |
|
"loss": 0.8032, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.0017617563355468, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 5.830850731895071e-06, |
|
"loss": 0.8129, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.0040639393118396, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 5.793412732281258e-06, |
|
"loss": 0.7868, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.0040639393118396, |
|
"eval_loss": 0.7960610389709473, |
|
"eval_runtime": 110.4506, |
|
"eval_samples_per_second": 7.822, |
|
"eval_steps_per_second": 1.956, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.0083988079111352, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 5.755929045298905e-06, |
|
"loss": 0.8008, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.0127336765104307, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 5.718401829384541e-06, |
|
"loss": 0.8084, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.0170685451097263, |
|
"grad_norm": 0.375, |
|
"learning_rate": 5.680833245481234e-06, |
|
"loss": 0.8068, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.021403413709022, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 5.6432254569141565e-06, |
|
"loss": 0.796, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.0257382823083174, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 5.605580629266021e-06, |
|
"loss": 0.8198, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.030073150907613, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 5.567900930252375e-06, |
|
"loss": 0.7929, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.0344080195069087, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 5.530188529596774e-06, |
|
"loss": 0.8029, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.0387428881062042, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 5.492445598905843e-06, |
|
"loss": 0.8121, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.0430777567054998, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 5.454674311544236e-06, |
|
"loss": 0.7917, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.0474126253047955, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 5.416876842509468e-06, |
|
"loss": 0.7988, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.051747493904091, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 5.379055368306693e-06, |
|
"loss": 0.7804, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.0560823625033866, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 5.341212066823356e-06, |
|
"loss": 0.8167, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.0604172311026823, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 5.3033491172037935e-06, |
|
"loss": 0.8158, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.0647520997019777, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 5.265468699723748e-06, |
|
"loss": 0.7957, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.0690869683012734, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 5.227572995664819e-06, |
|
"loss": 0.7902, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.073421836900569, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 5.189664187188857e-06, |
|
"loss": 0.7994, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.0777567054998645, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 5.151744457212312e-06, |
|
"loss": 0.809, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.0820915740991601, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 5.113815989280528e-06, |
|
"loss": 0.7849, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.0864264426984558, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 5.075880967442014e-06, |
|
"loss": 0.8067, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.0907613112977512, |
|
"grad_norm": 0.375, |
|
"learning_rate": 5.037941576122667e-06, |
|
"loss": 0.798, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.0950961798970469, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7891, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.0994310484963425, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.962058423877335e-06, |
|
"loss": 0.8044, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.103765917095638, |
|
"grad_norm": 0.375, |
|
"learning_rate": 4.924119032557988e-06, |
|
"loss": 0.7842, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.1081007856949336, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 4.886184010719472e-06, |
|
"loss": 0.7962, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.1124356542942293, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 4.848255542787689e-06, |
|
"loss": 0.8043, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.1167705228935247, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 4.8103358128111435e-06, |
|
"loss": 0.8075, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.1211053914928204, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.772427004335183e-06, |
|
"loss": 0.8023, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.125440260092116, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 4.7345313002762545e-06, |
|
"loss": 0.7959, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.1297751286914115, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 4.696650882796207e-06, |
|
"loss": 0.7883, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.1341099972907072, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 4.6587879331766465e-06, |
|
"loss": 0.8036, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.1384448658900026, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 4.620944631693309e-06, |
|
"loss": 0.8016, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.1427797344892983, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 4.583123157490533e-06, |
|
"loss": 0.7982, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.147114603088594, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 4.545325688455766e-06, |
|
"loss": 0.794, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.1514494716878896, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 4.507554401094157e-06, |
|
"loss": 0.7905, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.155784340287185, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 4.469811470403228e-06, |
|
"loss": 0.7941, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.1601192088864807, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 4.432099069747625e-06, |
|
"loss": 0.801, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.1644540774857761, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 4.394419370733981e-06, |
|
"loss": 0.7985, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.1687889460850718, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 4.356774543085845e-06, |
|
"loss": 0.7837, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.1731238146843674, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 4.319166754518768e-06, |
|
"loss": 0.8008, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.1774586832836629, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 4.28159817061546e-06, |
|
"loss": 0.8059, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.1817935518829585, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 4.244070954701096e-06, |
|
"loss": 0.812, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.1861284204822542, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 4.206587267718743e-06, |
|
"loss": 0.7948, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.1904632890815496, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.1691492681049305e-06, |
|
"loss": 0.8005, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.1947981576808453, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 4.131759111665349e-06, |
|
"loss": 0.7992, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.199133026280141, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 4.094418951450721e-06, |
|
"loss": 0.8091, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.2034678948794364, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 4.057130937632821e-06, |
|
"loss": 0.799, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.207802763478732, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 4.01989721738066e-06, |
|
"loss": 0.8093, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.2121376320780277, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 3.982719934736832e-06, |
|
"loss": 0.8073, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.2164725006773232, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 3.945601230494079e-06, |
|
"loss": 0.8099, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.2208073692766188, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 3.9085432420719934e-06, |
|
"loss": 0.7912, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.2251422378759145, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.871548103393947e-06, |
|
"loss": 0.8105, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.22947710647521, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 3.834617944764218e-06, |
|
"loss": 0.7751, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.2338119750745056, |
|
"grad_norm": 0.375, |
|
"learning_rate": 3.797754892745309e-06, |
|
"loss": 0.8028, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.2381468436738012, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.7609610700355014e-06, |
|
"loss": 0.7939, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.2424817122730967, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.724238595346619e-06, |
|
"loss": 0.809, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.2468165808723923, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.687589583282031e-06, |
|
"loss": 0.8082, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.251151449471688, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 3.6510161442148783e-06, |
|
"loss": 0.7822, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.2554863180709834, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.6145203841665577e-06, |
|
"loss": 0.8119, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.2554863180709834, |
|
"eval_loss": 0.7933911681175232, |
|
"eval_runtime": 110.3505, |
|
"eval_samples_per_second": 7.83, |
|
"eval_steps_per_second": 1.957, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.259821186670279, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 3.578104404685442e-06, |
|
"loss": 0.806, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.2641560552695745, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 3.5417703027258752e-06, |
|
"loss": 0.8055, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.2684909238688702, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 3.5055201705274223e-06, |
|
"loss": 0.8039, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.2728257924681659, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 3.46935609549438e-06, |
|
"loss": 0.8149, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.2771606610674615, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 3.4332801600755895e-06, |
|
"loss": 0.7849, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.281495529666757, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 3.397294441644515e-06, |
|
"loss": 0.7956, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.2858303982660526, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.3614010123796257e-06, |
|
"loss": 0.7933, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.290165266865348, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 3.3256019391450696e-06, |
|
"loss": 0.8174, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.2945001354646437, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.289899283371657e-06, |
|
"loss": 0.7988, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.2988350040639394, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 3.2542951009381584e-06, |
|
"loss": 0.8037, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.303169872663235, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.2187914420529176e-06, |
|
"loss": 0.782, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.3075047412625305, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 3.1833903511357943e-06, |
|
"loss": 0.8037, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.3118396098618261, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 3.148093866700445e-06, |
|
"loss": 0.8053, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.3161744784611216, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.1129040212369286e-06, |
|
"loss": 0.7896, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.3205093470604172, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 3.077822841094675e-06, |
|
"loss": 0.8078, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.3248442156597129, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 3.0428523463658046e-06, |
|
"loss": 0.8084, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.3291790842590083, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.007994550768793e-06, |
|
"loss": 0.8277, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.333513952858304, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 2.973251461532527e-06, |
|
"loss": 0.8079, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.3378488214575996, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.9386250792807124e-06, |
|
"loss": 0.8168, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.342183690056895, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 2.9041173979166813e-06, |
|
"loss": 0.8047, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.3465185586561907, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 2.86973040450856e-06, |
|
"loss": 0.8037, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.3508534272554864, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 2.835466079174866e-06, |
|
"loss": 0.8001, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.3551882958547818, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 2.8013263949704706e-06, |
|
"loss": 0.8006, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.3595231644540775, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 2.767313317773e-06, |
|
"loss": 0.8156, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.363858033053373, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.7334288061696146e-06, |
|
"loss": 0.7992, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.3681929016526686, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 2.6996748113442397e-06, |
|
"loss": 0.7812, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.3725277702519643, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 2.666053276965207e-06, |
|
"loss": 0.7857, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.37686263885126, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 2.6325661390733303e-06, |
|
"loss": 0.7985, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.3811975074505554, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 2.599215325970423e-06, |
|
"loss": 0.7811, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.385532376049851, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 2.566002758108256e-06, |
|
"loss": 0.7975, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.3898672446491465, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.5329303479779855e-06, |
|
"loss": 0.8305, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.3942021132484421, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": 0.8006, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.3985369818477378, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 2.467213610414286e-06, |
|
"loss": 0.791, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.4028718504470334, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.434573067171213e-06, |
|
"loss": 0.7853, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.4072067190463289, |
|
"grad_norm": 0.375, |
|
"learning_rate": 2.4020802498228333e-06, |
|
"loss": 0.8011, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.4115415876456245, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 2.369737029414644e-06, |
|
"loss": 0.7996, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.41587645624492, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 2.337545268377853e-06, |
|
"loss": 0.8144, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.4202113248442156, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 2.3055068204221226e-06, |
|
"loss": 0.8064, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.4245461934435113, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 2.2736235304288373e-06, |
|
"loss": 0.7983, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.428881062042807, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 2.241897234344864e-06, |
|
"loss": 0.7919, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.4332159306421024, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 2.2103297590768334e-06, |
|
"loss": 0.785, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.437550799241398, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.1789229223859403e-06, |
|
"loss": 0.789, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.4418856678406935, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 2.1476785327832715e-06, |
|
"loss": 0.8104, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.4462205364399892, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 2.1165983894256647e-06, |
|
"loss": 0.7929, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.4505554050392848, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 2.085684282012108e-06, |
|
"loss": 0.8129, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.4548902736385803, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.0549379906806816e-06, |
|
"loss": 0.7983, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.459225142237876, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 2.0243612859060526e-06, |
|
"loss": 0.7915, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.4635600108371716, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.9939559283975237e-06, |
|
"loss": 0.8021, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.467894879436467, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.9637236689976517e-06, |
|
"loss": 0.8164, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.4722297480357627, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.933666248581418e-06, |
|
"loss": 0.7876, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.4765646166350583, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.9037853979559923e-06, |
|
"loss": 0.7911, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.4808994852343538, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 1.8740828377610564e-06, |
|
"loss": 0.786, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.4852343538336494, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.8445602783697375e-06, |
|
"loss": 0.8243, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.4895692224329449, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 1.8152194197901086e-06, |
|
"loss": 0.8162, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.4939040910322405, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.7860619515673034e-06, |
|
"loss": 0.8081, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.4982389596315362, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.7570895526862202e-06, |
|
"loss": 0.814, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.5025738282308319, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.7283038914748446e-06, |
|
"loss": 0.7814, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.5069086968301273, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 1.6997066255081795e-06, |
|
"loss": 0.799, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.5069086968301273, |
|
"eval_loss": 0.7925707697868347, |
|
"eval_runtime": 110.4666, |
|
"eval_samples_per_second": 7.821, |
|
"eval_steps_per_second": 1.955, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.511243565429423, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 1.6712994015127976e-06, |
|
"loss": 0.798, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.5155784340287184, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 1.6430838552720168e-06, |
|
"loss": 0.8019, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.519913302628014, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.6150616115317052e-06, |
|
"loss": 0.77, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.5242481712273097, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.5872342839067305e-06, |
|
"loss": 0.7969, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.5285830398266054, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 1.5596034747880263e-06, |
|
"loss": 0.8047, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.5329179084259008, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 1.5321707752503367e-06, |
|
"loss": 0.7922, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.5372527770251965, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 1.5049377649605906e-06, |
|
"loss": 0.8011, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.541587645624492, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 1.4779060120869393e-06, |
|
"loss": 0.7937, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.5459225142237876, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.451077073208455e-06, |
|
"loss": 0.7822, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.5502573828230832, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.4244524932255026e-06, |
|
"loss": 0.7985, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.554592251422379, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.3980338052707737e-06, |
|
"loss": 0.7968, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.5589271200216743, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 1.3718225306210049e-06, |
|
"loss": 0.8111, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.5632619886209698, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.3458201786093795e-06, |
|
"loss": 0.7918, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.5675968572202654, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.3200282465386156e-06, |
|
"loss": 0.8026, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.571931725819561, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.2944482195947384e-06, |
|
"loss": 0.8124, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.5762665944188567, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.2690815707615727e-06, |
|
"loss": 0.7961, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.5806014630181524, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.2439297607359118e-06, |
|
"loss": 0.8055, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.5849363316174478, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 1.2189942378434083e-06, |
|
"loss": 0.786, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.5892712002167433, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.194276437955177e-06, |
|
"loss": 0.8009, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.593606068816039, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.1697777844051105e-06, |
|
"loss": 0.8016, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.5979409374153346, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.1454996879079205e-06, |
|
"loss": 0.7954, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.6022758060146303, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.1214435464779006e-06, |
|
"loss": 0.8051, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.606610674613926, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 1.0976107453484314e-06, |
|
"loss": 0.7912, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.6109455432132214, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 1.0740026568922058e-06, |
|
"loss": 0.8041, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.6152804118125168, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 1.050620640542208e-06, |
|
"loss": 0.7959, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.6196152804118125, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 1.027466042713428e-06, |
|
"loss": 0.8097, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.6239501490111081, |
|
"grad_norm": 0.375, |
|
"learning_rate": 1.0045401967253382e-06, |
|
"loss": 0.7924, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.6282850176104038, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 9.81844422725109e-07, |
|
"loss": 0.8068, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.6326198862096992, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 9.593800276115978e-07, |
|
"loss": 0.8052, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.6369547548089949, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 9.371483049600849e-07, |
|
"loss": 0.7862, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.6412896234082903, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.151505349477901e-07, |
|
"loss": 0.8059, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.645624492007586, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 8.933879842801558e-07, |
|
"loss": 0.785, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.6499593606068816, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 8.718619061179029e-07, |
|
"loss": 0.7866, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.6542942292061773, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 8.505735400048748e-07, |
|
"loss": 0.7948, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.6586290978054727, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 8.29524111796654e-07, |
|
"loss": 0.8076, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.6629639664047684, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 8.087148335899786e-07, |
|
"loss": 0.8034, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.6672988350040638, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 7.881469036529427e-07, |
|
"loss": 0.7956, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.6716337036033595, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.678215063559957e-07, |
|
"loss": 0.797, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.6759685722026552, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 7.477398121037449e-07, |
|
"loss": 0.777, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.6803034408019508, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 7.279029772675572e-07, |
|
"loss": 0.8072, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.6846383094012463, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 7.083121441189739e-07, |
|
"loss": 0.7878, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.6889731780005417, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 6.889684407639324e-07, |
|
"loss": 0.8186, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.6933080465998374, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 6.698729810778065e-07, |
|
"loss": 0.8195, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.697642915199133, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 6.510268646412665e-07, |
|
"loss": 0.7844, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.7019777837984287, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 6.324311766769631e-07, |
|
"loss": 0.7936, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.7063126523977243, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 6.140869879870287e-07, |
|
"loss": 0.795, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.7106475209970198, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 5.959953548914327e-07, |
|
"loss": 0.7961, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.7149823895963152, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 5.781573191671386e-07, |
|
"loss": 0.7819, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.7193172581956109, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 5.60573907988124e-07, |
|
"loss": 0.8076, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.7236521267949065, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 5.43246133866231e-07, |
|
"loss": 0.8044, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.7279869953942022, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 5.261749945928613e-07, |
|
"loss": 0.8001, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.7323218639934979, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 5.0936147318152e-07, |
|
"loss": 0.7955, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.7366567325927933, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 4.928065378112107e-07, |
|
"loss": 0.7974, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.7409916011920887, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.7651114177068694e-07, |
|
"loss": 0.8025, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.7453264697913844, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.604762234035548e-07, |
|
"loss": 0.7857, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.74966133839068, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 4.4470270605424195e-07, |
|
"loss": 0.8064, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.7539962069899757, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.2919149801482596e-07, |
|
"loss": 0.7966, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.7583310755892712, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 4.139434924727359e-07, |
|
"loss": 0.7891, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.7583310755892712, |
|
"eval_loss": 0.7922915816307068, |
|
"eval_runtime": 110.2564, |
|
"eval_samples_per_second": 7.836, |
|
"eval_steps_per_second": 1.959, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.7626659441885668, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 3.989595674593161e-07, |
|
"loss": 0.7935, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.7670008127878623, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 3.8424058579927147e-07, |
|
"loss": 0.7943, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.771335681387158, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 3.697873950609737e-07, |
|
"loss": 0.796, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.7756705499864536, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 3.55600827507665e-07, |
|
"loss": 0.8139, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.7800054185857492, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 3.416817000495271e-07, |
|
"loss": 0.7962, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.7843402871850447, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 3.2803081419664483e-07, |
|
"loss": 0.8059, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.7886751557843403, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 3.146489560128496e-07, |
|
"loss": 0.8073, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.7930100243836358, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 3.015368960704584e-07, |
|
"loss": 0.7965, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.7973448929829314, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.88695389405898e-07, |
|
"loss": 0.8057, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.801679761582227, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 2.7612517547622955e-07, |
|
"loss": 0.7942, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.8060146301815228, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.638269781165692e-07, |
|
"loss": 0.7904, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.8103494987808182, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.518015054984041e-07, |
|
"loss": 0.8075, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.8146843673801136, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.4004945008881617e-07, |
|
"loss": 0.8082, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.8190192359794093, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 2.2857148861060552e-07, |
|
"loss": 0.7803, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.823354104578705, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 2.1736828200332628e-07, |
|
"loss": 0.7705, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.8276889731780006, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 2.0644047538522226e-07, |
|
"loss": 0.8031, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.8320238417772963, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.9578869801608168e-07, |
|
"loss": 0.7753, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.8363587103765917, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.8541356326100436e-07, |
|
"loss": 0.8151, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.8406935789758871, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.7531566855507442e-07, |
|
"loss": 0.7754, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.8450284475751828, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 1.6549559536896964e-07, |
|
"loss": 0.795, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.8493633161744785, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.559539091754686e-07, |
|
"loss": 0.7999, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.8536981847737741, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.4669115941689182e-07, |
|
"loss": 0.7965, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.8580330533730698, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.3770787947346597e-07, |
|
"loss": 0.8072, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.8623679219723652, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.2900458663260506e-07, |
|
"loss": 0.8134, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.8667027905716607, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.2058178205912763e-07, |
|
"loss": 0.8142, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.8710376591709563, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 1.1243995076639535e-07, |
|
"loss": 0.7983, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.875372527770252, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 1.0457956158838545e-07, |
|
"loss": 0.7892, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.8797073963695476, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 9.700106715269386e-08, |
|
"loss": 0.793, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.884042264968843, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 8.970490385447061e-08, |
|
"loss": 0.8028, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.8883771335681387, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 8.269149183128988e-08, |
|
"loss": 0.8004, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.8927120021674342, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 7.59612349389599e-08, |
|
"loss": 0.7909, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.8970468707667298, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 6.951452072826547e-08, |
|
"loss": 0.7832, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.9013817393660255, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 6.335172042265192e-08, |
|
"loss": 0.794, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.9057166079653212, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 5.747318889684883e-08, |
|
"loss": 0.763, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.9100514765646166, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 5.187926465643478e-08, |
|
"loss": 0.7852, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.9143863451639123, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 4.657026981834623e-08, |
|
"loss": 0.8118, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.9187212137632077, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 4.1546510092327906e-08, |
|
"loss": 0.8019, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.9230560823625034, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 3.680827476332804e-08, |
|
"loss": 0.8194, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.927390950961799, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 3.235583667484443e-08, |
|
"loss": 0.8032, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.9317258195610947, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 2.8189452213207014e-08, |
|
"loss": 0.7975, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.9360606881603901, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 2.4309361292820245e-08, |
|
"loss": 0.8016, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.9403955567596856, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 2.0715787342343586e-08, |
|
"loss": 0.8265, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.9447304253589812, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.7408937291829575e-08, |
|
"loss": 0.8057, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.9490652939582769, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.4389001560803917e-08, |
|
"loss": 0.7954, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.9534001625575725, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.1656154047303691e-08, |
|
"loss": 0.797, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.9577350311568682, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 9.210552117863703e-09, |
|
"loss": 0.7966, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.9620698997561636, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 7.052336598451504e-09, |
|
"loss": 0.8071, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.966404768355459, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 5.181631766362216e-09, |
|
"loss": 0.8095, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.9707396369547547, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.5985453430598115e-09, |
|
"loss": 0.7737, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.9750745055540504, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 2.3031684879742944e-09, |
|
"loss": 0.8073, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.979409374153346, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.2955757932542334e-09, |
|
"loss": 0.7904, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.9837442427526417, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 5.758252794690888e-10, |
|
"loss": 0.8063, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.9880791113519372, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.4395839226910568e-10, |
|
"loss": 0.8058, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.9924139799512326, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0, |
|
"loss": 0.7995, |
|
"step": 460 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 460, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 115, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.046286453205369e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|