{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9924139799512326, "eval_steps": 58, "global_step": 460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004336630979807562, "grad_norm": 7.40625, "learning_rate": 2.173913043478261e-07, "loss": 1.0297, "step": 1 }, { "epoch": 0.004336630979807562, "eval_loss": 1.1468182802200317, "eval_runtime": 109.2361, "eval_samples_per_second": 7.909, "eval_steps_per_second": 1.977, "step": 1 }, { "epoch": 0.008673261959615123, "grad_norm": 7.21875, "learning_rate": 4.347826086956522e-07, "loss": 1.0442, "step": 2 }, { "epoch": 0.013009892939422686, "grad_norm": 8.75, "learning_rate": 6.521739130434783e-07, "loss": 1.0301, "step": 3 }, { "epoch": 0.017346523919230247, "grad_norm": 13.3125, "learning_rate": 8.695652173913044e-07, "loss": 1.0477, "step": 4 }, { "epoch": 0.02168315489903781, "grad_norm": 12.3125, "learning_rate": 1.0869565217391306e-06, "loss": 1.038, "step": 5 }, { "epoch": 0.02601978587884537, "grad_norm": 6.0, "learning_rate": 1.3043478260869566e-06, "loss": 1.0415, "step": 6 }, { "epoch": 0.030356416858652934, "grad_norm": 5.0625, "learning_rate": 1.521739130434783e-06, "loss": 1.0229, "step": 7 }, { "epoch": 0.03469304783846049, "grad_norm": 6.1875, "learning_rate": 1.7391304347826088e-06, "loss": 1.0266, "step": 8 }, { "epoch": 0.039029678818268056, "grad_norm": 5.71875, "learning_rate": 1.956521739130435e-06, "loss": 1.0237, "step": 9 }, { "epoch": 0.04336630979807562, "grad_norm": 6.15625, "learning_rate": 2.173913043478261e-06, "loss": 1.0316, "step": 10 }, { "epoch": 0.04770294077788318, "grad_norm": 5.59375, "learning_rate": 2.391304347826087e-06, "loss": 1.0134, "step": 11 }, { "epoch": 0.05203957175769074, "grad_norm": 6.375, "learning_rate": 2.6086956521739132e-06, "loss": 1.0288, "step": 12 }, { "epoch": 0.056376202737498306, "grad_norm": 4.71875, "learning_rate": 2.8260869565217393e-06, "loss": 1.0208, "step": 13 }, { "epoch": 0.06071283371730587, "grad_norm": 3.765625, "learning_rate": 3.043478260869566e-06, "loss": 0.9986, "step": 14 }, { "epoch": 0.06504946469711342, "grad_norm": 4.84375, "learning_rate": 3.2608695652173914e-06, "loss": 1.0102, "step": 15 }, { "epoch": 0.06938609567692099, "grad_norm": 5.15625, "learning_rate": 3.4782608695652175e-06, "loss": 0.991, "step": 16 }, { "epoch": 0.07372272665672855, "grad_norm": 3.453125, "learning_rate": 3.6956521739130436e-06, "loss": 0.9974, "step": 17 }, { "epoch": 0.07805935763653611, "grad_norm": 2.796875, "learning_rate": 3.91304347826087e-06, "loss": 0.9997, "step": 18 }, { "epoch": 0.08239598861634367, "grad_norm": 2.40625, "learning_rate": 4.130434782608696e-06, "loss": 0.9896, "step": 19 }, { "epoch": 0.08673261959615124, "grad_norm": 8.3125, "learning_rate": 4.347826086956522e-06, "loss": 0.973, "step": 20 }, { "epoch": 0.0910692505759588, "grad_norm": 2.0625, "learning_rate": 4.565217391304348e-06, "loss": 0.9764, "step": 21 }, { "epoch": 0.09540588155576636, "grad_norm": 1.65625, "learning_rate": 4.782608695652174e-06, "loss": 0.9461, "step": 22 }, { "epoch": 0.09974251253557392, "grad_norm": 1.40625, "learning_rate": 5e-06, "loss": 0.9355, "step": 23 }, { "epoch": 0.10407914351538149, "grad_norm": 1.3203125, "learning_rate": 5.2173913043478265e-06, "loss": 0.9725, "step": 24 }, { "epoch": 0.10841577449518905, "grad_norm": 1.1875, "learning_rate": 5.4347826086956525e-06, "loss": 0.9244, "step": 25 }, { "epoch": 0.11275240547499661, "grad_norm": 1.1875, "learning_rate": 5.652173913043479e-06, "loss": 0.929, "step": 26 }, { "epoch": 0.11708903645480417, "grad_norm": 1.1015625, "learning_rate": 5.8695652173913055e-06, "loss": 0.9434, "step": 27 }, { "epoch": 0.12142566743461174, "grad_norm": 1.265625, "learning_rate": 6.086956521739132e-06, "loss": 0.9331, "step": 28 }, { "epoch": 0.1257622984144193, "grad_norm": 1.03125, "learning_rate": 6.304347826086958e-06, "loss": 0.9264, "step": 29 }, { "epoch": 0.13009892939422685, "grad_norm": 1.015625, "learning_rate": 6.521739130434783e-06, "loss": 0.9114, "step": 30 }, { "epoch": 0.13443556037403442, "grad_norm": 1.03125, "learning_rate": 6.739130434782609e-06, "loss": 0.9277, "step": 31 }, { "epoch": 0.13877219135384197, "grad_norm": 1.0234375, "learning_rate": 6.956521739130435e-06, "loss": 0.9154, "step": 32 }, { "epoch": 0.14310882233364955, "grad_norm": 1.015625, "learning_rate": 7.173913043478261e-06, "loss": 0.939, "step": 33 }, { "epoch": 0.1474454533134571, "grad_norm": 0.96875, "learning_rate": 7.391304347826087e-06, "loss": 0.9146, "step": 34 }, { "epoch": 0.15178208429326467, "grad_norm": 0.9609375, "learning_rate": 7.608695652173914e-06, "loss": 0.9063, "step": 35 }, { "epoch": 0.15611871527307222, "grad_norm": 0.9765625, "learning_rate": 7.82608695652174e-06, "loss": 0.9178, "step": 36 }, { "epoch": 0.1604553462528798, "grad_norm": 1.0078125, "learning_rate": 8.043478260869566e-06, "loss": 0.9184, "step": 37 }, { "epoch": 0.16479197723268735, "grad_norm": 0.98046875, "learning_rate": 8.260869565217392e-06, "loss": 0.911, "step": 38 }, { "epoch": 0.16912860821249492, "grad_norm": 0.9609375, "learning_rate": 8.478260869565218e-06, "loss": 0.9031, "step": 39 }, { "epoch": 0.17346523919230247, "grad_norm": 0.97265625, "learning_rate": 8.695652173913044e-06, "loss": 0.8881, "step": 40 }, { "epoch": 0.17780187017211005, "grad_norm": 0.9296875, "learning_rate": 8.91304347826087e-06, "loss": 0.8846, "step": 41 }, { "epoch": 0.1821385011519176, "grad_norm": 0.953125, "learning_rate": 9.130434782608697e-06, "loss": 0.8895, "step": 42 }, { "epoch": 0.18647513213172517, "grad_norm": 0.94921875, "learning_rate": 9.347826086956523e-06, "loss": 0.8683, "step": 43 }, { "epoch": 0.19081176311153272, "grad_norm": 0.99609375, "learning_rate": 9.565217391304349e-06, "loss": 0.8795, "step": 44 }, { "epoch": 0.1951483940913403, "grad_norm": 0.94140625, "learning_rate": 9.782608695652175e-06, "loss": 0.8829, "step": 45 }, { "epoch": 0.19948502507114785, "grad_norm": 0.93359375, "learning_rate": 1e-05, "loss": 0.8703, "step": 46 }, { "epoch": 0.20382165605095542, "grad_norm": 0.92578125, "learning_rate": 9.999856041607732e-06, "loss": 0.8702, "step": 47 }, { "epoch": 0.20815828703076297, "grad_norm": 0.94921875, "learning_rate": 9.99942417472053e-06, "loss": 0.869, "step": 48 }, { "epoch": 0.21249491801057055, "grad_norm": 0.95703125, "learning_rate": 9.998704424206747e-06, "loss": 0.8748, "step": 49 }, { "epoch": 0.2168315489903781, "grad_norm": 0.91796875, "learning_rate": 9.997696831512027e-06, "loss": 0.8737, "step": 50 }, { "epoch": 0.22116817997018567, "grad_norm": 0.91015625, "learning_rate": 9.996401454656941e-06, "loss": 0.8745, "step": 51 }, { "epoch": 0.22550481094999322, "grad_norm": 0.94921875, "learning_rate": 9.994818368233639e-06, "loss": 0.8677, "step": 52 }, { "epoch": 0.2298414419298008, "grad_norm": 0.90625, "learning_rate": 9.992947663401548e-06, "loss": 0.863, "step": 53 }, { "epoch": 0.23417807290960835, "grad_norm": 0.9140625, "learning_rate": 9.990789447882136e-06, "loss": 0.8709, "step": 54 }, { "epoch": 0.2385147038894159, "grad_norm": 0.9609375, "learning_rate": 9.988343845952697e-06, "loss": 0.8543, "step": 55 }, { "epoch": 0.24285133486922347, "grad_norm": 0.9375, "learning_rate": 9.985610998439198e-06, "loss": 0.8735, "step": 56 }, { "epoch": 0.24718796584903102, "grad_norm": 0.8984375, "learning_rate": 9.982591062708172e-06, "loss": 0.8631, "step": 57 }, { "epoch": 0.2515245968288386, "grad_norm": 0.8984375, "learning_rate": 9.979284212657658e-06, "loss": 0.8512, "step": 58 }, { "epoch": 0.2515245968288386, "eval_loss": 0.8729492425918579, "eval_runtime": 109.2389, "eval_samples_per_second": 7.909, "eval_steps_per_second": 1.977, "step": 58 }, { "epoch": 0.2558612278086462, "grad_norm": 0.90234375, "learning_rate": 9.97569063870718e-06, "loss": 0.8554, "step": 59 }, { "epoch": 0.2601978587884537, "grad_norm": 0.859375, "learning_rate": 9.971810547786794e-06, "loss": 0.8661, "step": 60 }, { "epoch": 0.26453448976826127, "grad_norm": 0.84765625, "learning_rate": 9.967644163325157e-06, "loss": 0.8592, "step": 61 }, { "epoch": 0.26887112074806885, "grad_norm": 0.83203125, "learning_rate": 9.963191725236672e-06, "loss": 0.8614, "step": 62 }, { "epoch": 0.2732077517278764, "grad_norm": 0.8203125, "learning_rate": 9.958453489907673e-06, "loss": 0.8555, "step": 63 }, { "epoch": 0.27754438270768395, "grad_norm": 0.80078125, "learning_rate": 9.953429730181653e-06, "loss": 0.8572, "step": 64 }, { "epoch": 0.2818810136874915, "grad_norm": 0.78515625, "learning_rate": 9.948120735343566e-06, "loss": 0.8583, "step": 65 }, { "epoch": 0.2862176446672991, "grad_norm": 0.734375, "learning_rate": 9.942526811103153e-06, "loss": 0.8433, "step": 66 }, { "epoch": 0.2905542756471067, "grad_norm": 0.703125, "learning_rate": 9.93664827957735e-06, "loss": 0.8505, "step": 67 }, { "epoch": 0.2948909066269142, "grad_norm": 0.6640625, "learning_rate": 9.930485479271735e-06, "loss": 0.8403, "step": 68 }, { "epoch": 0.29922753760672177, "grad_norm": 0.63671875, "learning_rate": 9.924038765061042e-06, "loss": 0.8585, "step": 69 }, { "epoch": 0.30356416858652935, "grad_norm": 1.2109375, "learning_rate": 9.917308508168712e-06, "loss": 0.8567, "step": 70 }, { "epoch": 0.3079007995663369, "grad_norm": 0.58984375, "learning_rate": 9.91029509614553e-06, "loss": 0.8543, "step": 71 }, { "epoch": 0.31223743054614445, "grad_norm": 0.54296875, "learning_rate": 9.902998932847308e-06, "loss": 0.8752, "step": 72 }, { "epoch": 0.316574061525952, "grad_norm": 0.5078125, "learning_rate": 9.895420438411616e-06, "loss": 0.8535, "step": 73 }, { "epoch": 0.3209106925057596, "grad_norm": 0.4921875, "learning_rate": 9.887560049233606e-06, "loss": 0.8601, "step": 74 }, { "epoch": 0.3252473234855672, "grad_norm": 0.494140625, "learning_rate": 9.879418217940872e-06, "loss": 0.8543, "step": 75 }, { "epoch": 0.3295839544653747, "grad_norm": 0.4375, "learning_rate": 9.870995413367397e-06, "loss": 0.8113, "step": 76 }, { "epoch": 0.33392058544518227, "grad_norm": 0.427734375, "learning_rate": 9.862292120526536e-06, "loss": 0.8583, "step": 77 }, { "epoch": 0.33825721642498985, "grad_norm": 0.447265625, "learning_rate": 9.85330884058311e-06, "loss": 0.832, "step": 78 }, { "epoch": 0.3425938474047974, "grad_norm": 0.41015625, "learning_rate": 9.844046090824533e-06, "loss": 0.8271, "step": 79 }, { "epoch": 0.34693047838460495, "grad_norm": 0.3984375, "learning_rate": 9.834504404631032e-06, "loss": 0.8503, "step": 80 }, { "epoch": 0.3512671093644125, "grad_norm": 0.408203125, "learning_rate": 9.824684331444926e-06, "loss": 0.8189, "step": 81 }, { "epoch": 0.3556037403442201, "grad_norm": 0.416015625, "learning_rate": 9.814586436738998e-06, "loss": 0.8465, "step": 82 }, { "epoch": 0.3599403713240276, "grad_norm": 0.40625, "learning_rate": 9.804211301983919e-06, "loss": 0.8159, "step": 83 }, { "epoch": 0.3642770023038352, "grad_norm": 0.40234375, "learning_rate": 9.793559524614779e-06, "loss": 0.8392, "step": 84 }, { "epoch": 0.36861363328364277, "grad_norm": 0.3671875, "learning_rate": 9.782631717996675e-06, "loss": 0.8291, "step": 85 }, { "epoch": 0.37295026426345035, "grad_norm": 0.392578125, "learning_rate": 9.771428511389395e-06, "loss": 0.8398, "step": 86 }, { "epoch": 0.37728689524325787, "grad_norm": 0.39453125, "learning_rate": 9.759950549911185e-06, "loss": 0.8499, "step": 87 }, { "epoch": 0.38162352622306545, "grad_norm": 0.373046875, "learning_rate": 9.748198494501598e-06, "loss": 0.8244, "step": 88 }, { "epoch": 0.385960157202873, "grad_norm": 0.373046875, "learning_rate": 9.736173021883433e-06, "loss": 0.8281, "step": 89 }, { "epoch": 0.3902967881826806, "grad_norm": 0.39453125, "learning_rate": 9.72387482452377e-06, "loss": 0.8165, "step": 90 }, { "epoch": 0.3946334191624881, "grad_norm": 0.396484375, "learning_rate": 9.711304610594104e-06, "loss": 0.8329, "step": 91 }, { "epoch": 0.3989700501422957, "grad_norm": 0.375, "learning_rate": 9.698463103929542e-06, "loss": 0.8218, "step": 92 }, { "epoch": 0.40330668112210327, "grad_norm": 0.384765625, "learning_rate": 9.685351043987151e-06, "loss": 0.8132, "step": 93 }, { "epoch": 0.40764331210191085, "grad_norm": 0.39453125, "learning_rate": 9.671969185803357e-06, "loss": 0.8357, "step": 94 }, { "epoch": 0.41197994308171837, "grad_norm": 0.392578125, "learning_rate": 9.658318299950473e-06, "loss": 0.8352, "step": 95 }, { "epoch": 0.41631657406152595, "grad_norm": 0.353515625, "learning_rate": 9.644399172492337e-06, "loss": 0.8112, "step": 96 }, { "epoch": 0.4206532050413335, "grad_norm": 0.380859375, "learning_rate": 9.630212604939026e-06, "loss": 0.8376, "step": 97 }, { "epoch": 0.4249898360211411, "grad_norm": 0.419921875, "learning_rate": 9.615759414200729e-06, "loss": 0.8304, "step": 98 }, { "epoch": 0.4293264670009486, "grad_norm": 0.357421875, "learning_rate": 9.601040432540684e-06, "loss": 0.8403, "step": 99 }, { "epoch": 0.4336630979807562, "grad_norm": 0.380859375, "learning_rate": 9.586056507527266e-06, "loss": 0.8331, "step": 100 }, { "epoch": 0.4379997289605638, "grad_norm": 0.39453125, "learning_rate": 9.570808501985176e-06, "loss": 0.8268, "step": 101 }, { "epoch": 0.44233635994037135, "grad_norm": 0.396484375, "learning_rate": 9.55529729394576e-06, "loss": 0.8264, "step": 102 }, { "epoch": 0.44667299092017887, "grad_norm": 0.396484375, "learning_rate": 9.539523776596446e-06, "loss": 0.8235, "step": 103 }, { "epoch": 0.45100962189998645, "grad_norm": 0.369140625, "learning_rate": 9.523488858229313e-06, "loss": 0.8276, "step": 104 }, { "epoch": 0.455346252879794, "grad_norm": 0.435546875, "learning_rate": 9.507193462188791e-06, "loss": 0.8142, "step": 105 }, { "epoch": 0.4596828838596016, "grad_norm": 0.375, "learning_rate": 9.490638526818482e-06, "loss": 0.8092, "step": 106 }, { "epoch": 0.4640195148394091, "grad_norm": 0.361328125, "learning_rate": 9.47382500540714e-06, "loss": 0.8256, "step": 107 }, { "epoch": 0.4683561458192167, "grad_norm": 0.384765625, "learning_rate": 9.45675386613377e-06, "loss": 0.8342, "step": 108 }, { "epoch": 0.4726927767990243, "grad_norm": 0.400390625, "learning_rate": 9.439426092011877e-06, "loss": 0.8087, "step": 109 }, { "epoch": 0.4770294077788318, "grad_norm": 0.396484375, "learning_rate": 9.421842680832862e-06, "loss": 0.8316, "step": 110 }, { "epoch": 0.48136603875863937, "grad_norm": 0.37890625, "learning_rate": 9.40400464510857e-06, "loss": 0.8257, "step": 111 }, { "epoch": 0.48570266973844695, "grad_norm": 0.359375, "learning_rate": 9.385913012012972e-06, "loss": 0.8246, "step": 112 }, { "epoch": 0.4900393007182545, "grad_norm": 0.39453125, "learning_rate": 9.367568823323039e-06, "loss": 0.8206, "step": 113 }, { "epoch": 0.49437593169806204, "grad_norm": 0.380859375, "learning_rate": 9.348973135358734e-06, "loss": 0.8358, "step": 114 }, { "epoch": 0.4987125626778696, "grad_norm": 0.376953125, "learning_rate": 9.330127018922195e-06, "loss": 0.8017, "step": 115 }, { "epoch": 0.5030491936576772, "grad_norm": 0.392578125, "learning_rate": 9.311031559236067e-06, "loss": 0.8496, "step": 116 }, { "epoch": 0.5030491936576772, "eval_loss": 0.8192870616912842, "eval_runtime": 109.0108, "eval_samples_per_second": 7.926, "eval_steps_per_second": 1.981, "step": 116 }, { "epoch": 0.5073858246374847, "grad_norm": 0.404296875, "learning_rate": 9.291687855881027e-06, "loss": 0.8147, "step": 117 }, { "epoch": 0.5117224556172923, "grad_norm": 0.392578125, "learning_rate": 9.272097022732444e-06, "loss": 0.8277, "step": 118 }, { "epoch": 0.5160590865970999, "grad_norm": 0.361328125, "learning_rate": 9.252260187896257e-06, "loss": 0.8212, "step": 119 }, { "epoch": 0.5203957175769074, "grad_norm": 0.361328125, "learning_rate": 9.232178493644006e-06, "loss": 0.8375, "step": 120 }, { "epoch": 0.524732348556715, "grad_norm": 0.396484375, "learning_rate": 9.211853096347059e-06, "loss": 0.8386, "step": 121 }, { "epoch": 0.5290689795365225, "grad_norm": 0.380859375, "learning_rate": 9.191285166410023e-06, "loss": 0.8118, "step": 122 }, { "epoch": 0.5334056105163302, "grad_norm": 0.380859375, "learning_rate": 9.170475888203348e-06, "loss": 0.8181, "step": 123 }, { "epoch": 0.5377422414961377, "grad_norm": 0.375, "learning_rate": 9.149426459995127e-06, "loss": 0.8213, "step": 124 }, { "epoch": 0.5420788724759452, "grad_norm": 0.359375, "learning_rate": 9.128138093882098e-06, "loss": 0.8392, "step": 125 }, { "epoch": 0.5464155034557528, "grad_norm": 0.373046875, "learning_rate": 9.106612015719845e-06, "loss": 0.8286, "step": 126 }, { "epoch": 0.5507521344355604, "grad_norm": 0.3671875, "learning_rate": 9.08484946505221e-06, "loss": 0.8324, "step": 127 }, { "epoch": 0.5550887654153679, "grad_norm": 0.38671875, "learning_rate": 9.062851695039915e-06, "loss": 0.8271, "step": 128 }, { "epoch": 0.5594253963951755, "grad_norm": 0.36328125, "learning_rate": 9.040619972388402e-06, "loss": 0.8179, "step": 129 }, { "epoch": 0.563762027374983, "grad_norm": 0.384765625, "learning_rate": 9.018155577274891e-06, "loss": 0.8214, "step": 130 }, { "epoch": 0.5680986583547907, "grad_norm": 0.3671875, "learning_rate": 8.995459803274664e-06, "loss": 0.8255, "step": 131 }, { "epoch": 0.5724352893345982, "grad_norm": 0.359375, "learning_rate": 8.972533957286574e-06, "loss": 0.8167, "step": 132 }, { "epoch": 0.5767719203144057, "grad_norm": 0.3671875, "learning_rate": 8.949379359457795e-06, "loss": 0.8012, "step": 133 }, { "epoch": 0.5811085512942133, "grad_norm": 0.35546875, "learning_rate": 8.925997343107796e-06, "loss": 0.8182, "step": 134 }, { "epoch": 0.5854451822740209, "grad_norm": 0.3671875, "learning_rate": 8.902389254651568e-06, "loss": 0.8073, "step": 135 }, { "epoch": 0.5897818132538284, "grad_norm": 0.3515625, "learning_rate": 8.8785564535221e-06, "loss": 0.8195, "step": 136 }, { "epoch": 0.594118444233636, "grad_norm": 0.423828125, "learning_rate": 8.854500312092081e-06, "loss": 0.8292, "step": 137 }, { "epoch": 0.5984550752134435, "grad_norm": 0.384765625, "learning_rate": 8.83022221559489e-06, "loss": 0.8204, "step": 138 }, { "epoch": 0.6027917061932511, "grad_norm": 0.373046875, "learning_rate": 8.805723562044825e-06, "loss": 0.8175, "step": 139 }, { "epoch": 0.6071283371730587, "grad_norm": 0.375, "learning_rate": 8.781005762156593e-06, "loss": 0.8044, "step": 140 }, { "epoch": 0.6114649681528662, "grad_norm": 0.3671875, "learning_rate": 8.756070239264089e-06, "loss": 0.8187, "step": 141 }, { "epoch": 0.6158015991326738, "grad_norm": 0.369140625, "learning_rate": 8.730918429238429e-06, "loss": 0.8164, "step": 142 }, { "epoch": 0.6201382301124814, "grad_norm": 0.36328125, "learning_rate": 8.705551780405264e-06, "loss": 0.8051, "step": 143 }, { "epoch": 0.6244748610922889, "grad_norm": 0.369140625, "learning_rate": 8.679971753461388e-06, "loss": 0.8127, "step": 144 }, { "epoch": 0.6288114920720965, "grad_norm": 0.37109375, "learning_rate": 8.65417982139062e-06, "loss": 0.8283, "step": 145 }, { "epoch": 0.633148123051904, "grad_norm": 0.400390625, "learning_rate": 8.628177469378995e-06, "loss": 0.8169, "step": 146 }, { "epoch": 0.6374847540317116, "grad_norm": 0.3671875, "learning_rate": 8.601966194729228e-06, "loss": 0.8209, "step": 147 }, { "epoch": 0.6418213850115192, "grad_norm": 0.37890625, "learning_rate": 8.575547506774498e-06, "loss": 0.8388, "step": 148 }, { "epoch": 0.6461580159913267, "grad_norm": 0.359375, "learning_rate": 8.548922926791545e-06, "loss": 0.8129, "step": 149 }, { "epoch": 0.6504946469711343, "grad_norm": 0.373046875, "learning_rate": 8.522093987913063e-06, "loss": 0.8282, "step": 150 }, { "epoch": 0.6548312779509419, "grad_norm": 0.3515625, "learning_rate": 8.49506223503941e-06, "loss": 0.813, "step": 151 }, { "epoch": 0.6591679089307494, "grad_norm": 0.3984375, "learning_rate": 8.467829224749665e-06, "loss": 0.8313, "step": 152 }, { "epoch": 0.663504539910557, "grad_norm": 0.38671875, "learning_rate": 8.440396525211976e-06, "loss": 0.828, "step": 153 }, { "epoch": 0.6678411708903645, "grad_norm": 0.380859375, "learning_rate": 8.412765716093273e-06, "loss": 0.8247, "step": 154 }, { "epoch": 0.6721778018701721, "grad_norm": 0.37890625, "learning_rate": 8.384938388468296e-06, "loss": 0.8046, "step": 155 }, { "epoch": 0.6765144328499797, "grad_norm": 0.349609375, "learning_rate": 8.356916144727985e-06, "loss": 0.814, "step": 156 }, { "epoch": 0.6808510638297872, "grad_norm": 0.36328125, "learning_rate": 8.328700598487203e-06, "loss": 0.8147, "step": 157 }, { "epoch": 0.6851876948095948, "grad_norm": 0.365234375, "learning_rate": 8.300293374491821e-06, "loss": 0.8083, "step": 158 }, { "epoch": 0.6895243257894024, "grad_norm": 0.365234375, "learning_rate": 8.271696108525156e-06, "loss": 0.801, "step": 159 }, { "epoch": 0.6938609567692099, "grad_norm": 0.3515625, "learning_rate": 8.24291044731378e-06, "loss": 0.8155, "step": 160 }, { "epoch": 0.6981975877490175, "grad_norm": 0.3515625, "learning_rate": 8.213938048432697e-06, "loss": 0.7988, "step": 161 }, { "epoch": 0.702534218728825, "grad_norm": 0.41015625, "learning_rate": 8.184780580209892e-06, "loss": 0.8184, "step": 162 }, { "epoch": 0.7068708497086326, "grad_norm": 0.369140625, "learning_rate": 8.155439721630265e-06, "loss": 0.8212, "step": 163 }, { "epoch": 0.7112074806884402, "grad_norm": 0.37890625, "learning_rate": 8.125917162238945e-06, "loss": 0.8401, "step": 164 }, { "epoch": 0.7155441116682477, "grad_norm": 0.37109375, "learning_rate": 8.096214602044011e-06, "loss": 0.7886, "step": 165 }, { "epoch": 0.7198807426480552, "grad_norm": 0.38671875, "learning_rate": 8.066333751418582e-06, "loss": 0.8181, "step": 166 }, { "epoch": 0.7242173736278629, "grad_norm": 0.361328125, "learning_rate": 8.036276331002348e-06, "loss": 0.8188, "step": 167 }, { "epoch": 0.7285540046076704, "grad_norm": 0.375, "learning_rate": 8.006044071602476e-06, "loss": 0.7999, "step": 168 }, { "epoch": 0.732890635587478, "grad_norm": 0.38671875, "learning_rate": 7.97563871409395e-06, "loss": 0.8273, "step": 169 }, { "epoch": 0.7372272665672855, "grad_norm": 0.35546875, "learning_rate": 7.94506200931932e-06, "loss": 0.7848, "step": 170 }, { "epoch": 0.7415638975470931, "grad_norm": 0.3515625, "learning_rate": 7.914315717987892e-06, "loss": 0.82, "step": 171 }, { "epoch": 0.7459005285269007, "grad_norm": 0.392578125, "learning_rate": 7.883401610574338e-06, "loss": 0.805, "step": 172 }, { "epoch": 0.7502371595067082, "grad_norm": 0.388671875, "learning_rate": 7.85232146721673e-06, "loss": 0.8017, "step": 173 }, { "epoch": 0.7545737904865157, "grad_norm": 0.35546875, "learning_rate": 7.821077077614062e-06, "loss": 0.8175, "step": 174 }, { "epoch": 0.7545737904865157, "eval_loss": 0.8033392429351807, "eval_runtime": 109.1503, "eval_samples_per_second": 7.916, "eval_steps_per_second": 1.979, "step": 174 }, { "epoch": 0.7589104214663234, "grad_norm": 0.365234375, "learning_rate": 7.789670240923169e-06, "loss": 0.825, "step": 175 }, { "epoch": 0.7632470524461309, "grad_norm": 0.36328125, "learning_rate": 7.758102765655136e-06, "loss": 0.8155, "step": 176 }, { "epoch": 0.7675836834259385, "grad_norm": 0.384765625, "learning_rate": 7.726376469571165e-06, "loss": 0.8138, "step": 177 }, { "epoch": 0.771920314405746, "grad_norm": 0.396484375, "learning_rate": 7.69449317957788e-06, "loss": 0.8055, "step": 178 }, { "epoch": 0.7762569453855536, "grad_norm": 0.37109375, "learning_rate": 7.66245473162215e-06, "loss": 0.8046, "step": 179 }, { "epoch": 0.7805935763653612, "grad_norm": 0.37109375, "learning_rate": 7.630262970585355e-06, "loss": 0.8138, "step": 180 }, { "epoch": 0.7849302073451687, "grad_norm": 0.353515625, "learning_rate": 7.597919750177168e-06, "loss": 0.8366, "step": 181 }, { "epoch": 0.7892668383249762, "grad_norm": 0.408203125, "learning_rate": 7.56542693282879e-06, "loss": 0.8303, "step": 182 }, { "epoch": 0.7936034693047839, "grad_norm": 0.357421875, "learning_rate": 7.532786389585715e-06, "loss": 0.8139, "step": 183 }, { "epoch": 0.7979401002845914, "grad_norm": 0.384765625, "learning_rate": 7.500000000000001e-06, "loss": 0.8098, "step": 184 }, { "epoch": 0.802276731264399, "grad_norm": 0.359375, "learning_rate": 7.467069652022017e-06, "loss": 0.8116, "step": 185 }, { "epoch": 0.8066133622442065, "grad_norm": 0.359375, "learning_rate": 7.433997241891743e-06, "loss": 0.7941, "step": 186 }, { "epoch": 0.8109499932240141, "grad_norm": 0.37109375, "learning_rate": 7.400784674029579e-06, "loss": 0.8123, "step": 187 }, { "epoch": 0.8152866242038217, "grad_norm": 0.376953125, "learning_rate": 7.3674338609266705e-06, "loss": 0.8237, "step": 188 }, { "epoch": 0.8196232551836292, "grad_norm": 0.380859375, "learning_rate": 7.333946723034794e-06, "loss": 0.8241, "step": 189 }, { "epoch": 0.8239598861634367, "grad_norm": 0.357421875, "learning_rate": 7.300325188655762e-06, "loss": 0.8072, "step": 190 }, { "epoch": 0.8282965171432444, "grad_norm": 0.353515625, "learning_rate": 7.266571193830387e-06, "loss": 0.8027, "step": 191 }, { "epoch": 0.8326331481230519, "grad_norm": 0.365234375, "learning_rate": 7.232686682227001e-06, "loss": 0.8351, "step": 192 }, { "epoch": 0.8369697791028594, "grad_norm": 0.369140625, "learning_rate": 7.198673605029529e-06, "loss": 0.8108, "step": 193 }, { "epoch": 0.841306410082667, "grad_norm": 0.369140625, "learning_rate": 7.164533920825137e-06, "loss": 0.8248, "step": 194 }, { "epoch": 0.8456430410624746, "grad_norm": 0.365234375, "learning_rate": 7.130269595491443e-06, "loss": 0.8117, "step": 195 }, { "epoch": 0.8499796720422822, "grad_norm": 0.376953125, "learning_rate": 7.095882602083321e-06, "loss": 0.832, "step": 196 }, { "epoch": 0.8543163030220897, "grad_norm": 0.37890625, "learning_rate": 7.061374920719288e-06, "loss": 0.8196, "step": 197 }, { "epoch": 0.8586529340018972, "grad_norm": 0.390625, "learning_rate": 7.026748538467474e-06, "loss": 0.8023, "step": 198 }, { "epoch": 0.8629895649817049, "grad_norm": 0.392578125, "learning_rate": 6.9920054492312086e-06, "loss": 0.8149, "step": 199 }, { "epoch": 0.8673261959615124, "grad_norm": 0.353515625, "learning_rate": 6.957147653634198e-06, "loss": 0.8166, "step": 200 }, { "epoch": 0.8716628269413199, "grad_norm": 0.359375, "learning_rate": 6.922177158905326e-06, "loss": 0.8198, "step": 201 }, { "epoch": 0.8759994579211275, "grad_norm": 0.380859375, "learning_rate": 6.887095978763072e-06, "loss": 0.797, "step": 202 }, { "epoch": 0.8803360889009351, "grad_norm": 0.361328125, "learning_rate": 6.851906133299556e-06, "loss": 0.8162, "step": 203 }, { "epoch": 0.8846727198807427, "grad_norm": 0.34375, "learning_rate": 6.816609648864208e-06, "loss": 0.8272, "step": 204 }, { "epoch": 0.8890093508605502, "grad_norm": 0.38671875, "learning_rate": 6.781208557947085e-06, "loss": 0.7975, "step": 205 }, { "epoch": 0.8933459818403577, "grad_norm": 0.35546875, "learning_rate": 6.745704899061843e-06, "loss": 0.8349, "step": 206 }, { "epoch": 0.8976826128201654, "grad_norm": 0.388671875, "learning_rate": 6.710100716628345e-06, "loss": 0.7963, "step": 207 }, { "epoch": 0.9020192437999729, "grad_norm": 0.35546875, "learning_rate": 6.674398060854931e-06, "loss": 0.8233, "step": 208 }, { "epoch": 0.9063558747797804, "grad_norm": 0.36328125, "learning_rate": 6.638598987620375e-06, "loss": 0.8137, "step": 209 }, { "epoch": 0.910692505759588, "grad_norm": 0.36328125, "learning_rate": 6.6027055583554865e-06, "loss": 0.8076, "step": 210 }, { "epoch": 0.9150291367393956, "grad_norm": 0.376953125, "learning_rate": 6.566719839924412e-06, "loss": 0.8046, "step": 211 }, { "epoch": 0.9193657677192032, "grad_norm": 0.40625, "learning_rate": 6.530643904505622e-06, "loss": 0.8211, "step": 212 }, { "epoch": 0.9237023986990107, "grad_norm": 0.380859375, "learning_rate": 6.49447982947258e-06, "loss": 0.8135, "step": 213 }, { "epoch": 0.9280390296788182, "grad_norm": 0.390625, "learning_rate": 6.458229697274125e-06, "loss": 0.7993, "step": 214 }, { "epoch": 0.9323756606586259, "grad_norm": 0.390625, "learning_rate": 6.42189559531456e-06, "loss": 0.7944, "step": 215 }, { "epoch": 0.9367122916384334, "grad_norm": 0.380859375, "learning_rate": 6.385479615833445e-06, "loss": 0.8078, "step": 216 }, { "epoch": 0.9410489226182409, "grad_norm": 0.375, "learning_rate": 6.348983855785122e-06, "loss": 0.7926, "step": 217 }, { "epoch": 0.9453855535980485, "grad_norm": 0.365234375, "learning_rate": 6.312410416717969e-06, "loss": 0.8212, "step": 218 }, { "epoch": 0.9497221845778561, "grad_norm": 0.361328125, "learning_rate": 6.275761404653381e-06, "loss": 0.7814, "step": 219 }, { "epoch": 0.9540588155576636, "grad_norm": 0.3984375, "learning_rate": 6.2390389299645e-06, "loss": 0.8039, "step": 220 }, { "epoch": 0.9583954465374712, "grad_norm": 0.40234375, "learning_rate": 6.2022451072546926e-06, "loss": 0.802, "step": 221 }, { "epoch": 0.9627320775172787, "grad_norm": 0.408203125, "learning_rate": 6.165382055235784e-06, "loss": 0.7972, "step": 222 }, { "epoch": 0.9670687084970864, "grad_norm": 0.37109375, "learning_rate": 6.128451896606054e-06, "loss": 0.7882, "step": 223 }, { "epoch": 0.9714053394768939, "grad_norm": 0.36328125, "learning_rate": 6.091456757928008e-06, "loss": 0.7859, "step": 224 }, { "epoch": 0.9757419704567014, "grad_norm": 0.34375, "learning_rate": 6.0543987695059236e-06, "loss": 0.7966, "step": 225 }, { "epoch": 0.980078601436509, "grad_norm": 0.345703125, "learning_rate": 6.0172800652631706e-06, "loss": 0.8079, "step": 226 }, { "epoch": 0.9844152324163166, "grad_norm": 0.396484375, "learning_rate": 5.980102782619343e-06, "loss": 0.8123, "step": 227 }, { "epoch": 0.9887518633961241, "grad_norm": 0.373046875, "learning_rate": 5.9428690623671796e-06, "loss": 0.8359, "step": 228 }, { "epoch": 0.9930884943759317, "grad_norm": 0.353515625, "learning_rate": 5.905581048549279e-06, "loss": 0.8287, "step": 229 }, { "epoch": 0.9974251253557392, "grad_norm": 0.3671875, "learning_rate": 5.8682408883346535e-06, "loss": 0.8032, "step": 230 }, { "epoch": 1.0017617563355468, "grad_norm": 0.345703125, "learning_rate": 5.830850731895071e-06, "loss": 0.8129, "step": 231 }, { "epoch": 1.0040639393118396, "grad_norm": 0.36328125, "learning_rate": 5.793412732281258e-06, "loss": 0.7868, "step": 232 }, { "epoch": 1.0040639393118396, "eval_loss": 0.7960610389709473, "eval_runtime": 110.4506, "eval_samples_per_second": 7.822, "eval_steps_per_second": 1.956, "step": 232 }, { "epoch": 1.0083988079111352, "grad_norm": 0.34765625, "learning_rate": 5.755929045298905e-06, "loss": 0.8008, "step": 233 }, { "epoch": 1.0127336765104307, "grad_norm": 0.37109375, "learning_rate": 5.718401829384541e-06, "loss": 0.8084, "step": 234 }, { "epoch": 1.0170685451097263, "grad_norm": 0.375, "learning_rate": 5.680833245481234e-06, "loss": 0.8068, "step": 235 }, { "epoch": 1.021403413709022, "grad_norm": 0.349609375, "learning_rate": 5.6432254569141565e-06, "loss": 0.796, "step": 236 }, { "epoch": 1.0257382823083174, "grad_norm": 0.36328125, "learning_rate": 5.605580629266021e-06, "loss": 0.8198, "step": 237 }, { "epoch": 1.030073150907613, "grad_norm": 0.361328125, "learning_rate": 5.567900930252375e-06, "loss": 0.7929, "step": 238 }, { "epoch": 1.0344080195069087, "grad_norm": 0.353515625, "learning_rate": 5.530188529596774e-06, "loss": 0.8029, "step": 239 }, { "epoch": 1.0387428881062042, "grad_norm": 0.369140625, "learning_rate": 5.492445598905843e-06, "loss": 0.8121, "step": 240 }, { "epoch": 1.0430777567054998, "grad_norm": 0.33984375, "learning_rate": 5.454674311544236e-06, "loss": 0.7917, "step": 241 }, { "epoch": 1.0474126253047955, "grad_norm": 0.35546875, "learning_rate": 5.416876842509468e-06, "loss": 0.7988, "step": 242 }, { "epoch": 1.051747493904091, "grad_norm": 0.376953125, "learning_rate": 5.379055368306693e-06, "loss": 0.7804, "step": 243 }, { "epoch": 1.0560823625033866, "grad_norm": 0.37109375, "learning_rate": 5.341212066823356e-06, "loss": 0.8167, "step": 244 }, { "epoch": 1.0604172311026823, "grad_norm": 0.36328125, "learning_rate": 5.3033491172037935e-06, "loss": 0.8158, "step": 245 }, { "epoch": 1.0647520997019777, "grad_norm": 0.365234375, "learning_rate": 5.265468699723748e-06, "loss": 0.7957, "step": 246 }, { "epoch": 1.0690869683012734, "grad_norm": 0.40625, "learning_rate": 5.227572995664819e-06, "loss": 0.7902, "step": 247 }, { "epoch": 1.073421836900569, "grad_norm": 0.35546875, "learning_rate": 5.189664187188857e-06, "loss": 0.7994, "step": 248 }, { "epoch": 1.0777567054998645, "grad_norm": 0.365234375, "learning_rate": 5.151744457212312e-06, "loss": 0.809, "step": 249 }, { "epoch": 1.0820915740991601, "grad_norm": 0.35546875, "learning_rate": 5.113815989280528e-06, "loss": 0.7849, "step": 250 }, { "epoch": 1.0864264426984558, "grad_norm": 0.36328125, "learning_rate": 5.075880967442014e-06, "loss": 0.8067, "step": 251 }, { "epoch": 1.0907613112977512, "grad_norm": 0.375, "learning_rate": 5.037941576122667e-06, "loss": 0.798, "step": 252 }, { "epoch": 1.0950961798970469, "grad_norm": 0.36328125, "learning_rate": 5e-06, "loss": 0.7891, "step": 253 }, { "epoch": 1.0994310484963425, "grad_norm": 0.3515625, "learning_rate": 4.962058423877335e-06, "loss": 0.8044, "step": 254 }, { "epoch": 1.103765917095638, "grad_norm": 0.375, "learning_rate": 4.924119032557988e-06, "loss": 0.7842, "step": 255 }, { "epoch": 1.1081007856949336, "grad_norm": 0.33984375, "learning_rate": 4.886184010719472e-06, "loss": 0.7962, "step": 256 }, { "epoch": 1.1124356542942293, "grad_norm": 0.345703125, "learning_rate": 4.848255542787689e-06, "loss": 0.8043, "step": 257 }, { "epoch": 1.1167705228935247, "grad_norm": 0.361328125, "learning_rate": 4.8103358128111435e-06, "loss": 0.8075, "step": 258 }, { "epoch": 1.1211053914928204, "grad_norm": 0.3515625, "learning_rate": 4.772427004335183e-06, "loss": 0.8023, "step": 259 }, { "epoch": 1.125440260092116, "grad_norm": 0.365234375, "learning_rate": 4.7345313002762545e-06, "loss": 0.7959, "step": 260 }, { "epoch": 1.1297751286914115, "grad_norm": 0.36328125, "learning_rate": 4.696650882796207e-06, "loss": 0.7883, "step": 261 }, { "epoch": 1.1341099972907072, "grad_norm": 0.357421875, "learning_rate": 4.6587879331766465e-06, "loss": 0.8036, "step": 262 }, { "epoch": 1.1384448658900026, "grad_norm": 0.361328125, "learning_rate": 4.620944631693309e-06, "loss": 0.8016, "step": 263 }, { "epoch": 1.1427797344892983, "grad_norm": 0.333984375, "learning_rate": 4.583123157490533e-06, "loss": 0.7982, "step": 264 }, { "epoch": 1.147114603088594, "grad_norm": 0.34765625, "learning_rate": 4.545325688455766e-06, "loss": 0.794, "step": 265 }, { "epoch": 1.1514494716878896, "grad_norm": 0.357421875, "learning_rate": 4.507554401094157e-06, "loss": 0.7905, "step": 266 }, { "epoch": 1.155784340287185, "grad_norm": 0.369140625, "learning_rate": 4.469811470403228e-06, "loss": 0.7941, "step": 267 }, { "epoch": 1.1601192088864807, "grad_norm": 0.359375, "learning_rate": 4.432099069747625e-06, "loss": 0.801, "step": 268 }, { "epoch": 1.1644540774857761, "grad_norm": 0.3671875, "learning_rate": 4.394419370733981e-06, "loss": 0.7985, "step": 269 }, { "epoch": 1.1687889460850718, "grad_norm": 0.36328125, "learning_rate": 4.356774543085845e-06, "loss": 0.7837, "step": 270 }, { "epoch": 1.1731238146843674, "grad_norm": 0.37890625, "learning_rate": 4.319166754518768e-06, "loss": 0.8008, "step": 271 }, { "epoch": 1.1774586832836629, "grad_norm": 0.357421875, "learning_rate": 4.28159817061546e-06, "loss": 0.8059, "step": 272 }, { "epoch": 1.1817935518829585, "grad_norm": 0.365234375, "learning_rate": 4.244070954701096e-06, "loss": 0.812, "step": 273 }, { "epoch": 1.1861284204822542, "grad_norm": 0.390625, "learning_rate": 4.206587267718743e-06, "loss": 0.7948, "step": 274 }, { "epoch": 1.1904632890815496, "grad_norm": 0.349609375, "learning_rate": 4.1691492681049305e-06, "loss": 0.8005, "step": 275 }, { "epoch": 1.1947981576808453, "grad_norm": 0.34765625, "learning_rate": 4.131759111665349e-06, "loss": 0.7992, "step": 276 }, { "epoch": 1.199133026280141, "grad_norm": 0.365234375, "learning_rate": 4.094418951450721e-06, "loss": 0.8091, "step": 277 }, { "epoch": 1.2034678948794364, "grad_norm": 0.3671875, "learning_rate": 4.057130937632821e-06, "loss": 0.799, "step": 278 }, { "epoch": 1.207802763478732, "grad_norm": 0.35546875, "learning_rate": 4.01989721738066e-06, "loss": 0.8093, "step": 279 }, { "epoch": 1.2121376320780277, "grad_norm": 0.361328125, "learning_rate": 3.982719934736832e-06, "loss": 0.8073, "step": 280 }, { "epoch": 1.2164725006773232, "grad_norm": 0.337890625, "learning_rate": 3.945601230494079e-06, "loss": 0.8099, "step": 281 }, { "epoch": 1.2208073692766188, "grad_norm": 0.400390625, "learning_rate": 3.9085432420719934e-06, "loss": 0.7912, "step": 282 }, { "epoch": 1.2251422378759145, "grad_norm": 0.357421875, "learning_rate": 3.871548103393947e-06, "loss": 0.8105, "step": 283 }, { "epoch": 1.22947710647521, "grad_norm": 0.37890625, "learning_rate": 3.834617944764218e-06, "loss": 0.7751, "step": 284 }, { "epoch": 1.2338119750745056, "grad_norm": 0.375, "learning_rate": 3.797754892745309e-06, "loss": 0.8028, "step": 285 }, { "epoch": 1.2381468436738012, "grad_norm": 0.34765625, "learning_rate": 3.7609610700355014e-06, "loss": 0.7939, "step": 286 }, { "epoch": 1.2424817122730967, "grad_norm": 0.357421875, "learning_rate": 3.724238595346619e-06, "loss": 0.809, "step": 287 }, { "epoch": 1.2468165808723923, "grad_norm": 0.357421875, "learning_rate": 3.687589583282031e-06, "loss": 0.8082, "step": 288 }, { "epoch": 1.251151449471688, "grad_norm": 0.345703125, "learning_rate": 3.6510161442148783e-06, "loss": 0.7822, "step": 289 }, { "epoch": 1.2554863180709834, "grad_norm": 0.353515625, "learning_rate": 3.6145203841665577e-06, "loss": 0.8119, "step": 290 }, { "epoch": 1.2554863180709834, "eval_loss": 0.7933911681175232, "eval_runtime": 110.3505, "eval_samples_per_second": 7.83, "eval_steps_per_second": 1.957, "step": 290 }, { "epoch": 1.259821186670279, "grad_norm": 0.345703125, "learning_rate": 3.578104404685442e-06, "loss": 0.806, "step": 291 }, { "epoch": 1.2641560552695745, "grad_norm": 0.359375, "learning_rate": 3.5417703027258752e-06, "loss": 0.8055, "step": 292 }, { "epoch": 1.2684909238688702, "grad_norm": 0.365234375, "learning_rate": 3.5055201705274223e-06, "loss": 0.8039, "step": 293 }, { "epoch": 1.2728257924681659, "grad_norm": 0.33984375, "learning_rate": 3.46935609549438e-06, "loss": 0.8149, "step": 294 }, { "epoch": 1.2771606610674615, "grad_norm": 0.3515625, "learning_rate": 3.4332801600755895e-06, "loss": 0.7849, "step": 295 }, { "epoch": 1.281495529666757, "grad_norm": 0.345703125, "learning_rate": 3.397294441644515e-06, "loss": 0.7956, "step": 296 }, { "epoch": 1.2858303982660526, "grad_norm": 0.353515625, "learning_rate": 3.3614010123796257e-06, "loss": 0.7933, "step": 297 }, { "epoch": 1.290165266865348, "grad_norm": 0.36328125, "learning_rate": 3.3256019391450696e-06, "loss": 0.8174, "step": 298 }, { "epoch": 1.2945001354646437, "grad_norm": 0.353515625, "learning_rate": 3.289899283371657e-06, "loss": 0.7988, "step": 299 }, { "epoch": 1.2988350040639394, "grad_norm": 0.34375, "learning_rate": 3.2542951009381584e-06, "loss": 0.8037, "step": 300 }, { "epoch": 1.303169872663235, "grad_norm": 0.34765625, "learning_rate": 3.2187914420529176e-06, "loss": 0.782, "step": 301 }, { "epoch": 1.3075047412625305, "grad_norm": 0.36328125, "learning_rate": 3.1833903511357943e-06, "loss": 0.8037, "step": 302 }, { "epoch": 1.3118396098618261, "grad_norm": 0.384765625, "learning_rate": 3.148093866700445e-06, "loss": 0.8053, "step": 303 }, { "epoch": 1.3161744784611216, "grad_norm": 0.357421875, "learning_rate": 3.1129040212369286e-06, "loss": 0.7896, "step": 304 }, { "epoch": 1.3205093470604172, "grad_norm": 0.359375, "learning_rate": 3.077822841094675e-06, "loss": 0.8078, "step": 305 }, { "epoch": 1.3248442156597129, "grad_norm": 0.341796875, "learning_rate": 3.0428523463658046e-06, "loss": 0.8084, "step": 306 }, { "epoch": 1.3291790842590083, "grad_norm": 0.357421875, "learning_rate": 3.007994550768793e-06, "loss": 0.8277, "step": 307 }, { "epoch": 1.333513952858304, "grad_norm": 0.341796875, "learning_rate": 2.973251461532527e-06, "loss": 0.8079, "step": 308 }, { "epoch": 1.3378488214575996, "grad_norm": 0.349609375, "learning_rate": 2.9386250792807124e-06, "loss": 0.8168, "step": 309 }, { "epoch": 1.342183690056895, "grad_norm": 0.3671875, "learning_rate": 2.9041173979166813e-06, "loss": 0.8047, "step": 310 }, { "epoch": 1.3465185586561907, "grad_norm": 0.37890625, "learning_rate": 2.86973040450856e-06, "loss": 0.8037, "step": 311 }, { "epoch": 1.3508534272554864, "grad_norm": 0.361328125, "learning_rate": 2.835466079174866e-06, "loss": 0.8001, "step": 312 }, { "epoch": 1.3551882958547818, "grad_norm": 0.365234375, "learning_rate": 2.8013263949704706e-06, "loss": 0.8006, "step": 313 }, { "epoch": 1.3595231644540775, "grad_norm": 0.359375, "learning_rate": 2.767313317773e-06, "loss": 0.8156, "step": 314 }, { "epoch": 1.363858033053373, "grad_norm": 0.349609375, "learning_rate": 2.7334288061696146e-06, "loss": 0.7992, "step": 315 }, { "epoch": 1.3681929016526686, "grad_norm": 0.3671875, "learning_rate": 2.6996748113442397e-06, "loss": 0.7812, "step": 316 }, { "epoch": 1.3725277702519643, "grad_norm": 0.341796875, "learning_rate": 2.666053276965207e-06, "loss": 0.7857, "step": 317 }, { "epoch": 1.37686263885126, "grad_norm": 0.35546875, "learning_rate": 2.6325661390733303e-06, "loss": 0.7985, "step": 318 }, { "epoch": 1.3811975074505554, "grad_norm": 0.341796875, "learning_rate": 2.599215325970423e-06, "loss": 0.7811, "step": 319 }, { "epoch": 1.385532376049851, "grad_norm": 0.412109375, "learning_rate": 2.566002758108256e-06, "loss": 0.7975, "step": 320 }, { "epoch": 1.3898672446491465, "grad_norm": 0.345703125, "learning_rate": 2.5329303479779855e-06, "loss": 0.8305, "step": 321 }, { "epoch": 1.3942021132484421, "grad_norm": 0.3515625, "learning_rate": 2.5000000000000015e-06, "loss": 0.8006, "step": 322 }, { "epoch": 1.3985369818477378, "grad_norm": 0.35546875, "learning_rate": 2.467213610414286e-06, "loss": 0.791, "step": 323 }, { "epoch": 1.4028718504470334, "grad_norm": 0.345703125, "learning_rate": 2.434573067171213e-06, "loss": 0.7853, "step": 324 }, { "epoch": 1.4072067190463289, "grad_norm": 0.375, "learning_rate": 2.4020802498228333e-06, "loss": 0.8011, "step": 325 }, { "epoch": 1.4115415876456245, "grad_norm": 0.3671875, "learning_rate": 2.369737029414644e-06, "loss": 0.7996, "step": 326 }, { "epoch": 1.41587645624492, "grad_norm": 0.39453125, "learning_rate": 2.337545268377853e-06, "loss": 0.8144, "step": 327 }, { "epoch": 1.4202113248442156, "grad_norm": 0.40234375, "learning_rate": 2.3055068204221226e-06, "loss": 0.8064, "step": 328 }, { "epoch": 1.4245461934435113, "grad_norm": 0.36328125, "learning_rate": 2.2736235304288373e-06, "loss": 0.7983, "step": 329 }, { "epoch": 1.428881062042807, "grad_norm": 0.365234375, "learning_rate": 2.241897234344864e-06, "loss": 0.7919, "step": 330 }, { "epoch": 1.4332159306421024, "grad_norm": 0.365234375, "learning_rate": 2.2103297590768334e-06, "loss": 0.785, "step": 331 }, { "epoch": 1.437550799241398, "grad_norm": 0.34765625, "learning_rate": 2.1789229223859403e-06, "loss": 0.789, "step": 332 }, { "epoch": 1.4418856678406935, "grad_norm": 0.369140625, "learning_rate": 2.1476785327832715e-06, "loss": 0.8104, "step": 333 }, { "epoch": 1.4462205364399892, "grad_norm": 0.361328125, "learning_rate": 2.1165983894256647e-06, "loss": 0.7929, "step": 334 }, { "epoch": 1.4505554050392848, "grad_norm": 0.40625, "learning_rate": 2.085684282012108e-06, "loss": 0.8129, "step": 335 }, { "epoch": 1.4548902736385803, "grad_norm": 0.34765625, "learning_rate": 2.0549379906806816e-06, "loss": 0.7983, "step": 336 }, { "epoch": 1.459225142237876, "grad_norm": 0.35546875, "learning_rate": 2.0243612859060526e-06, "loss": 0.7915, "step": 337 }, { "epoch": 1.4635600108371716, "grad_norm": 0.35546875, "learning_rate": 1.9939559283975237e-06, "loss": 0.8021, "step": 338 }, { "epoch": 1.467894879436467, "grad_norm": 0.3515625, "learning_rate": 1.9637236689976517e-06, "loss": 0.8164, "step": 339 }, { "epoch": 1.4722297480357627, "grad_norm": 0.361328125, "learning_rate": 1.933666248581418e-06, "loss": 0.7876, "step": 340 }, { "epoch": 1.4765646166350583, "grad_norm": 0.35546875, "learning_rate": 1.9037853979559923e-06, "loss": 0.7911, "step": 341 }, { "epoch": 1.4808994852343538, "grad_norm": 0.357421875, "learning_rate": 1.8740828377610564e-06, "loss": 0.786, "step": 342 }, { "epoch": 1.4852343538336494, "grad_norm": 0.37109375, "learning_rate": 1.8445602783697375e-06, "loss": 0.8243, "step": 343 }, { "epoch": 1.4895692224329449, "grad_norm": 0.353515625, "learning_rate": 1.8152194197901086e-06, "loss": 0.8162, "step": 344 }, { "epoch": 1.4939040910322405, "grad_norm": 0.35546875, "learning_rate": 1.7860619515673034e-06, "loss": 0.8081, "step": 345 }, { "epoch": 1.4982389596315362, "grad_norm": 0.361328125, "learning_rate": 1.7570895526862202e-06, "loss": 0.814, "step": 346 }, { "epoch": 1.5025738282308319, "grad_norm": 0.34375, "learning_rate": 1.7283038914748446e-06, "loss": 0.7814, "step": 347 }, { "epoch": 1.5069086968301273, "grad_norm": 0.353515625, "learning_rate": 1.6997066255081795e-06, "loss": 0.799, "step": 348 }, { "epoch": 1.5069086968301273, "eval_loss": 0.7925707697868347, "eval_runtime": 110.4666, "eval_samples_per_second": 7.821, "eval_steps_per_second": 1.955, "step": 348 }, { "epoch": 1.511243565429423, "grad_norm": 0.396484375, "learning_rate": 1.6712994015127976e-06, "loss": 0.798, "step": 349 }, { "epoch": 1.5155784340287184, "grad_norm": 0.365234375, "learning_rate": 1.6430838552720168e-06, "loss": 0.8019, "step": 350 }, { "epoch": 1.519913302628014, "grad_norm": 0.345703125, "learning_rate": 1.6150616115317052e-06, "loss": 0.77, "step": 351 }, { "epoch": 1.5242481712273097, "grad_norm": 0.35546875, "learning_rate": 1.5872342839067305e-06, "loss": 0.7969, "step": 352 }, { "epoch": 1.5285830398266054, "grad_norm": 0.3671875, "learning_rate": 1.5596034747880263e-06, "loss": 0.8047, "step": 353 }, { "epoch": 1.5329179084259008, "grad_norm": 0.37890625, "learning_rate": 1.5321707752503367e-06, "loss": 0.7922, "step": 354 }, { "epoch": 1.5372527770251965, "grad_norm": 0.37890625, "learning_rate": 1.5049377649605906e-06, "loss": 0.8011, "step": 355 }, { "epoch": 1.541587645624492, "grad_norm": 0.341796875, "learning_rate": 1.4779060120869393e-06, "loss": 0.7937, "step": 356 }, { "epoch": 1.5459225142237876, "grad_norm": 0.361328125, "learning_rate": 1.451077073208455e-06, "loss": 0.7822, "step": 357 }, { "epoch": 1.5502573828230832, "grad_norm": 0.3515625, "learning_rate": 1.4244524932255026e-06, "loss": 0.7985, "step": 358 }, { "epoch": 1.554592251422379, "grad_norm": 0.392578125, "learning_rate": 1.3980338052707737e-06, "loss": 0.7968, "step": 359 }, { "epoch": 1.5589271200216743, "grad_norm": 0.341796875, "learning_rate": 1.3718225306210049e-06, "loss": 0.8111, "step": 360 }, { "epoch": 1.5632619886209698, "grad_norm": 0.35546875, "learning_rate": 1.3458201786093795e-06, "loss": 0.7918, "step": 361 }, { "epoch": 1.5675968572202654, "grad_norm": 0.349609375, "learning_rate": 1.3200282465386156e-06, "loss": 0.8026, "step": 362 }, { "epoch": 1.571931725819561, "grad_norm": 0.37109375, "learning_rate": 1.2944482195947384e-06, "loss": 0.8124, "step": 363 }, { "epoch": 1.5762665944188567, "grad_norm": 0.3515625, "learning_rate": 1.2690815707615727e-06, "loss": 0.7961, "step": 364 }, { "epoch": 1.5806014630181524, "grad_norm": 0.34375, "learning_rate": 1.2439297607359118e-06, "loss": 0.8055, "step": 365 }, { "epoch": 1.5849363316174478, "grad_norm": 0.380859375, "learning_rate": 1.2189942378434083e-06, "loss": 0.786, "step": 366 }, { "epoch": 1.5892712002167433, "grad_norm": 0.359375, "learning_rate": 1.194276437955177e-06, "loss": 0.8009, "step": 367 }, { "epoch": 1.593606068816039, "grad_norm": 0.361328125, "learning_rate": 1.1697777844051105e-06, "loss": 0.8016, "step": 368 }, { "epoch": 1.5979409374153346, "grad_norm": 0.3515625, "learning_rate": 1.1454996879079205e-06, "loss": 0.7954, "step": 369 }, { "epoch": 1.6022758060146303, "grad_norm": 0.34765625, "learning_rate": 1.1214435464779006e-06, "loss": 0.8051, "step": 370 }, { "epoch": 1.606610674613926, "grad_norm": 0.369140625, "learning_rate": 1.0976107453484314e-06, "loss": 0.7912, "step": 371 }, { "epoch": 1.6109455432132214, "grad_norm": 0.365234375, "learning_rate": 1.0740026568922058e-06, "loss": 0.8041, "step": 372 }, { "epoch": 1.6152804118125168, "grad_norm": 0.3359375, "learning_rate": 1.050620640542208e-06, "loss": 0.7959, "step": 373 }, { "epoch": 1.6196152804118125, "grad_norm": 0.357421875, "learning_rate": 1.027466042713428e-06, "loss": 0.8097, "step": 374 }, { "epoch": 1.6239501490111081, "grad_norm": 0.375, "learning_rate": 1.0045401967253382e-06, "loss": 0.7924, "step": 375 }, { "epoch": 1.6282850176104038, "grad_norm": 0.37890625, "learning_rate": 9.81844422725109e-07, "loss": 0.8068, "step": 376 }, { "epoch": 1.6326198862096992, "grad_norm": 0.353515625, "learning_rate": 9.593800276115978e-07, "loss": 0.8052, "step": 377 }, { "epoch": 1.6369547548089949, "grad_norm": 0.3671875, "learning_rate": 9.371483049600849e-07, "loss": 0.7862, "step": 378 }, { "epoch": 1.6412896234082903, "grad_norm": 0.361328125, "learning_rate": 9.151505349477901e-07, "loss": 0.8059, "step": 379 }, { "epoch": 1.645624492007586, "grad_norm": 0.3515625, "learning_rate": 8.933879842801558e-07, "loss": 0.785, "step": 380 }, { "epoch": 1.6499593606068816, "grad_norm": 0.373046875, "learning_rate": 8.718619061179029e-07, "loss": 0.7866, "step": 381 }, { "epoch": 1.6542942292061773, "grad_norm": 0.3828125, "learning_rate": 8.505735400048748e-07, "loss": 0.7948, "step": 382 }, { "epoch": 1.6586290978054727, "grad_norm": 0.353515625, "learning_rate": 8.29524111796654e-07, "loss": 0.8076, "step": 383 }, { "epoch": 1.6629639664047684, "grad_norm": 0.34765625, "learning_rate": 8.087148335899786e-07, "loss": 0.8034, "step": 384 }, { "epoch": 1.6672988350040638, "grad_norm": 0.345703125, "learning_rate": 7.881469036529427e-07, "loss": 0.7956, "step": 385 }, { "epoch": 1.6716337036033595, "grad_norm": 0.353515625, "learning_rate": 7.678215063559957e-07, "loss": 0.797, "step": 386 }, { "epoch": 1.6759685722026552, "grad_norm": 0.36328125, "learning_rate": 7.477398121037449e-07, "loss": 0.777, "step": 387 }, { "epoch": 1.6803034408019508, "grad_norm": 0.390625, "learning_rate": 7.279029772675572e-07, "loss": 0.8072, "step": 388 }, { "epoch": 1.6846383094012463, "grad_norm": 0.337890625, "learning_rate": 7.083121441189739e-07, "loss": 0.7878, "step": 389 }, { "epoch": 1.6889731780005417, "grad_norm": 0.33984375, "learning_rate": 6.889684407639324e-07, "loss": 0.8186, "step": 390 }, { "epoch": 1.6933080465998374, "grad_norm": 0.421875, "learning_rate": 6.698729810778065e-07, "loss": 0.8195, "step": 391 }, { "epoch": 1.697642915199133, "grad_norm": 0.359375, "learning_rate": 6.510268646412665e-07, "loss": 0.7844, "step": 392 }, { "epoch": 1.7019777837984287, "grad_norm": 0.34765625, "learning_rate": 6.324311766769631e-07, "loss": 0.7936, "step": 393 }, { "epoch": 1.7063126523977243, "grad_norm": 0.37109375, "learning_rate": 6.140869879870287e-07, "loss": 0.795, "step": 394 }, { "epoch": 1.7106475209970198, "grad_norm": 0.359375, "learning_rate": 5.959953548914327e-07, "loss": 0.7961, "step": 395 }, { "epoch": 1.7149823895963152, "grad_norm": 0.341796875, "learning_rate": 5.781573191671386e-07, "loss": 0.7819, "step": 396 }, { "epoch": 1.7193172581956109, "grad_norm": 0.345703125, "learning_rate": 5.60573907988124e-07, "loss": 0.8076, "step": 397 }, { "epoch": 1.7236521267949065, "grad_norm": 0.373046875, "learning_rate": 5.43246133866231e-07, "loss": 0.8044, "step": 398 }, { "epoch": 1.7279869953942022, "grad_norm": 0.3359375, "learning_rate": 5.261749945928613e-07, "loss": 0.8001, "step": 399 }, { "epoch": 1.7323218639934979, "grad_norm": 0.3359375, "learning_rate": 5.0936147318152e-07, "loss": 0.7955, "step": 400 }, { "epoch": 1.7366567325927933, "grad_norm": 0.34375, "learning_rate": 4.928065378112107e-07, "loss": 0.7974, "step": 401 }, { "epoch": 1.7409916011920887, "grad_norm": 0.3515625, "learning_rate": 4.7651114177068694e-07, "loss": 0.8025, "step": 402 }, { "epoch": 1.7453264697913844, "grad_norm": 0.349609375, "learning_rate": 4.604762234035548e-07, "loss": 0.7857, "step": 403 }, { "epoch": 1.74966133839068, "grad_norm": 0.353515625, "learning_rate": 4.4470270605424195e-07, "loss": 0.8064, "step": 404 }, { "epoch": 1.7539962069899757, "grad_norm": 0.349609375, "learning_rate": 4.2919149801482596e-07, "loss": 0.7966, "step": 405 }, { "epoch": 1.7583310755892712, "grad_norm": 0.337890625, "learning_rate": 4.139434924727359e-07, "loss": 0.7891, "step": 406 }, { "epoch": 1.7583310755892712, "eval_loss": 0.7922915816307068, "eval_runtime": 110.2564, "eval_samples_per_second": 7.836, "eval_steps_per_second": 1.959, "step": 406 }, { "epoch": 1.7626659441885668, "grad_norm": 0.365234375, "learning_rate": 3.989595674593161e-07, "loss": 0.7935, "step": 407 }, { "epoch": 1.7670008127878623, "grad_norm": 0.37109375, "learning_rate": 3.8424058579927147e-07, "loss": 0.7943, "step": 408 }, { "epoch": 1.771335681387158, "grad_norm": 0.34375, "learning_rate": 3.697873950609737e-07, "loss": 0.796, "step": 409 }, { "epoch": 1.7756705499864536, "grad_norm": 0.34375, "learning_rate": 3.55600827507665e-07, "loss": 0.8139, "step": 410 }, { "epoch": 1.7800054185857492, "grad_norm": 0.361328125, "learning_rate": 3.416817000495271e-07, "loss": 0.7962, "step": 411 }, { "epoch": 1.7843402871850447, "grad_norm": 0.3515625, "learning_rate": 3.2803081419664483e-07, "loss": 0.8059, "step": 412 }, { "epoch": 1.7886751557843403, "grad_norm": 0.359375, "learning_rate": 3.146489560128496e-07, "loss": 0.8073, "step": 413 }, { "epoch": 1.7930100243836358, "grad_norm": 0.380859375, "learning_rate": 3.015368960704584e-07, "loss": 0.7965, "step": 414 }, { "epoch": 1.7973448929829314, "grad_norm": 0.349609375, "learning_rate": 2.88695389405898e-07, "loss": 0.8057, "step": 415 }, { "epoch": 1.801679761582227, "grad_norm": 0.361328125, "learning_rate": 2.7612517547622955e-07, "loss": 0.7942, "step": 416 }, { "epoch": 1.8060146301815228, "grad_norm": 0.34375, "learning_rate": 2.638269781165692e-07, "loss": 0.7904, "step": 417 }, { "epoch": 1.8103494987808182, "grad_norm": 0.345703125, "learning_rate": 2.518015054984041e-07, "loss": 0.8075, "step": 418 }, { "epoch": 1.8146843673801136, "grad_norm": 0.34765625, "learning_rate": 2.4004945008881617e-07, "loss": 0.8082, "step": 419 }, { "epoch": 1.8190192359794093, "grad_norm": 0.353515625, "learning_rate": 2.2857148861060552e-07, "loss": 0.7803, "step": 420 }, { "epoch": 1.823354104578705, "grad_norm": 0.33984375, "learning_rate": 2.1736828200332628e-07, "loss": 0.7705, "step": 421 }, { "epoch": 1.8276889731780006, "grad_norm": 0.36328125, "learning_rate": 2.0644047538522226e-07, "loss": 0.8031, "step": 422 }, { "epoch": 1.8320238417772963, "grad_norm": 0.345703125, "learning_rate": 1.9578869801608168e-07, "loss": 0.7753, "step": 423 }, { "epoch": 1.8363587103765917, "grad_norm": 0.349609375, "learning_rate": 1.8541356326100436e-07, "loss": 0.8151, "step": 424 }, { "epoch": 1.8406935789758871, "grad_norm": 0.349609375, "learning_rate": 1.7531566855507442e-07, "loss": 0.7754, "step": 425 }, { "epoch": 1.8450284475751828, "grad_norm": 0.36328125, "learning_rate": 1.6549559536896964e-07, "loss": 0.795, "step": 426 }, { "epoch": 1.8493633161744785, "grad_norm": 0.35546875, "learning_rate": 1.559539091754686e-07, "loss": 0.7999, "step": 427 }, { "epoch": 1.8536981847737741, "grad_norm": 0.359375, "learning_rate": 1.4669115941689182e-07, "loss": 0.7965, "step": 428 }, { "epoch": 1.8580330533730698, "grad_norm": 0.34765625, "learning_rate": 1.3770787947346597e-07, "loss": 0.8072, "step": 429 }, { "epoch": 1.8623679219723652, "grad_norm": 0.35546875, "learning_rate": 1.2900458663260506e-07, "loss": 0.8134, "step": 430 }, { "epoch": 1.8667027905716607, "grad_norm": 0.3515625, "learning_rate": 1.2058178205912763e-07, "loss": 0.8142, "step": 431 }, { "epoch": 1.8710376591709563, "grad_norm": 0.365234375, "learning_rate": 1.1243995076639535e-07, "loss": 0.7983, "step": 432 }, { "epoch": 1.875372527770252, "grad_norm": 0.357421875, "learning_rate": 1.0457956158838545e-07, "loss": 0.7892, "step": 433 }, { "epoch": 1.8797073963695476, "grad_norm": 0.3515625, "learning_rate": 9.700106715269386e-08, "loss": 0.793, "step": 434 }, { "epoch": 1.884042264968843, "grad_norm": 0.361328125, "learning_rate": 8.970490385447061e-08, "loss": 0.8028, "step": 435 }, { "epoch": 1.8883771335681387, "grad_norm": 0.353515625, "learning_rate": 8.269149183128988e-08, "loss": 0.8004, "step": 436 }, { "epoch": 1.8927120021674342, "grad_norm": 0.337890625, "learning_rate": 7.59612349389599e-08, "loss": 0.7909, "step": 437 }, { "epoch": 1.8970468707667298, "grad_norm": 0.3671875, "learning_rate": 6.951452072826547e-08, "loss": 0.7832, "step": 438 }, { "epoch": 1.9013817393660255, "grad_norm": 0.33984375, "learning_rate": 6.335172042265192e-08, "loss": 0.794, "step": 439 }, { "epoch": 1.9057166079653212, "grad_norm": 0.36328125, "learning_rate": 5.747318889684883e-08, "loss": 0.763, "step": 440 }, { "epoch": 1.9100514765646166, "grad_norm": 0.353515625, "learning_rate": 5.187926465643478e-08, "loss": 0.7852, "step": 441 }, { "epoch": 1.9143863451639123, "grad_norm": 0.34765625, "learning_rate": 4.657026981834623e-08, "loss": 0.8118, "step": 442 }, { "epoch": 1.9187212137632077, "grad_norm": 0.369140625, "learning_rate": 4.1546510092327906e-08, "loss": 0.8019, "step": 443 }, { "epoch": 1.9230560823625034, "grad_norm": 0.361328125, "learning_rate": 3.680827476332804e-08, "loss": 0.8194, "step": 444 }, { "epoch": 1.927390950961799, "grad_norm": 0.359375, "learning_rate": 3.235583667484443e-08, "loss": 0.8032, "step": 445 }, { "epoch": 1.9317258195610947, "grad_norm": 0.361328125, "learning_rate": 2.8189452213207014e-08, "loss": 0.7975, "step": 446 }, { "epoch": 1.9360606881603901, "grad_norm": 0.35546875, "learning_rate": 2.4309361292820245e-08, "loss": 0.8016, "step": 447 }, { "epoch": 1.9403955567596856, "grad_norm": 0.396484375, "learning_rate": 2.0715787342343586e-08, "loss": 0.8265, "step": 448 }, { "epoch": 1.9447304253589812, "grad_norm": 0.345703125, "learning_rate": 1.7408937291829575e-08, "loss": 0.8057, "step": 449 }, { "epoch": 1.9490652939582769, "grad_norm": 0.35546875, "learning_rate": 1.4389001560803917e-08, "loss": 0.7954, "step": 450 }, { "epoch": 1.9534001625575725, "grad_norm": 0.35546875, "learning_rate": 1.1656154047303691e-08, "loss": 0.797, "step": 451 }, { "epoch": 1.9577350311568682, "grad_norm": 0.353515625, "learning_rate": 9.210552117863703e-09, "loss": 0.7966, "step": 452 }, { "epoch": 1.9620698997561636, "grad_norm": 0.349609375, "learning_rate": 7.052336598451504e-09, "loss": 0.8071, "step": 453 }, { "epoch": 1.966404768355459, "grad_norm": 0.345703125, "learning_rate": 5.181631766362216e-09, "loss": 0.8095, "step": 454 }, { "epoch": 1.9707396369547547, "grad_norm": 0.357421875, "learning_rate": 3.5985453430598115e-09, "loss": 0.7737, "step": 455 }, { "epoch": 1.9750745055540504, "grad_norm": 0.359375, "learning_rate": 2.3031684879742944e-09, "loss": 0.8073, "step": 456 }, { "epoch": 1.979409374153346, "grad_norm": 0.35546875, "learning_rate": 1.2955757932542334e-09, "loss": 0.7904, "step": 457 }, { "epoch": 1.9837442427526417, "grad_norm": 0.353515625, "learning_rate": 5.758252794690888e-10, "loss": 0.8063, "step": 458 }, { "epoch": 1.9880791113519372, "grad_norm": 0.359375, "learning_rate": 1.4395839226910568e-10, "loss": 0.8058, "step": 459 }, { "epoch": 1.9924139799512326, "grad_norm": 0.34375, "learning_rate": 0.0, "loss": 0.7995, "step": 460 } ], "logging_steps": 1, "max_steps": 460, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 115, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.046286453205369e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }