{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9971671388101983, "eval_steps": 55, "global_step": 220, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.0000000000000001e-07, "loss": 1.4814, "step": 1 }, { "epoch": 0.0, "eval_loss": 1.4537283182144165, "eval_runtime": 5.2647, "eval_samples_per_second": 494.046, "eval_steps_per_second": 17.665, "step": 1 }, { "epoch": 0.01, "learning_rate": 2.0000000000000002e-07, "loss": 1.3904, "step": 2 }, { "epoch": 0.01, "learning_rate": 3.0000000000000004e-07, "loss": 1.5035, "step": 3 }, { "epoch": 0.02, "learning_rate": 4.0000000000000003e-07, "loss": 1.5594, "step": 4 }, { "epoch": 0.02, "learning_rate": 5.000000000000001e-07, "loss": 1.3623, "step": 5 }, { "epoch": 0.03, "learning_rate": 6.000000000000001e-07, "loss": 1.4823, "step": 6 }, { "epoch": 0.03, "learning_rate": 7.000000000000001e-07, "loss": 1.3817, "step": 7 }, { "epoch": 0.04, "learning_rate": 8.000000000000001e-07, "loss": 1.4402, "step": 8 }, { "epoch": 0.04, "learning_rate": 9.000000000000001e-07, "loss": 1.3728, "step": 9 }, { "epoch": 0.05, "learning_rate": 1.0000000000000002e-06, "loss": 1.5025, "step": 10 }, { "epoch": 0.05, "learning_rate": 1.1e-06, "loss": 1.4032, "step": 11 }, { "epoch": 0.05, "learning_rate": 1.2000000000000002e-06, "loss": 1.3889, "step": 12 }, { "epoch": 0.06, "learning_rate": 1.3e-06, "loss": 1.6009, "step": 13 }, { "epoch": 0.06, "learning_rate": 1.4000000000000001e-06, "loss": 1.4421, "step": 14 }, { "epoch": 0.07, "learning_rate": 1.5e-06, "loss": 1.5008, "step": 15 }, { "epoch": 0.07, "learning_rate": 1.6000000000000001e-06, "loss": 1.5311, "step": 16 }, { "epoch": 0.08, "learning_rate": 1.7000000000000002e-06, "loss": 1.3308, "step": 17 }, { "epoch": 0.08, "learning_rate": 1.8000000000000001e-06, "loss": 1.3582, "step": 18 }, { "epoch": 0.09, "learning_rate": 1.9000000000000002e-06, "loss": 1.5426, "step": 19 }, { "epoch": 0.09, "learning_rate": 2.0000000000000003e-06, "loss": 1.3096, "step": 20 }, { "epoch": 0.1, "learning_rate": 2.1000000000000002e-06, "loss": 1.4453, "step": 21 }, { "epoch": 0.1, "learning_rate": 2.2e-06, "loss": 1.5165, "step": 22 }, { "epoch": 0.1, "learning_rate": 2.3000000000000004e-06, "loss": 1.3552, "step": 23 }, { "epoch": 0.11, "learning_rate": 2.4000000000000003e-06, "loss": 1.3963, "step": 24 }, { "epoch": 0.11, "learning_rate": 2.5e-06, "loss": 1.3657, "step": 25 }, { "epoch": 0.12, "learning_rate": 2.6e-06, "loss": 1.4383, "step": 26 }, { "epoch": 0.12, "learning_rate": 2.7000000000000004e-06, "loss": 1.4539, "step": 27 }, { "epoch": 0.13, "learning_rate": 2.8000000000000003e-06, "loss": 1.3158, "step": 28 }, { "epoch": 0.13, "learning_rate": 2.9e-06, "loss": 1.4113, "step": 29 }, { "epoch": 0.14, "learning_rate": 3e-06, "loss": 1.4426, "step": 30 }, { "epoch": 0.14, "learning_rate": 3.1000000000000004e-06, "loss": 1.3938, "step": 31 }, { "epoch": 0.15, "learning_rate": 3.2000000000000003e-06, "loss": 1.3704, "step": 32 }, { "epoch": 0.15, "learning_rate": 3.3000000000000006e-06, "loss": 1.4268, "step": 33 }, { "epoch": 0.15, "learning_rate": 3.4000000000000005e-06, "loss": 1.3662, "step": 34 }, { "epoch": 0.16, "learning_rate": 3.5e-06, "loss": 1.4189, "step": 35 }, { "epoch": 0.16, "learning_rate": 3.6000000000000003e-06, "loss": 1.3602, "step": 36 }, { "epoch": 0.17, "learning_rate": 3.7e-06, "loss": 1.2845, "step": 37 }, { "epoch": 0.17, "learning_rate": 3.8000000000000005e-06, "loss": 1.4432, "step": 38 }, { "epoch": 0.18, "learning_rate": 3.900000000000001e-06, "loss": 1.412, "step": 39 }, { "epoch": 0.18, "learning_rate": 4.000000000000001e-06, "loss": 1.396, "step": 40 }, { "epoch": 0.19, "learning_rate": 4.1e-06, "loss": 1.3463, "step": 41 }, { "epoch": 0.19, "learning_rate": 4.2000000000000004e-06, "loss": 1.3552, "step": 42 }, { "epoch": 0.19, "learning_rate": 4.3e-06, "loss": 1.3606, "step": 43 }, { "epoch": 0.2, "learning_rate": 4.4e-06, "loss": 1.33, "step": 44 }, { "epoch": 0.2, "learning_rate": 4.5e-06, "loss": 1.3242, "step": 45 }, { "epoch": 0.21, "learning_rate": 4.600000000000001e-06, "loss": 1.3012, "step": 46 }, { "epoch": 0.21, "learning_rate": 4.7e-06, "loss": 1.3414, "step": 47 }, { "epoch": 0.22, "learning_rate": 4.800000000000001e-06, "loss": 1.4236, "step": 48 }, { "epoch": 0.22, "learning_rate": 4.9000000000000005e-06, "loss": 1.3572, "step": 49 }, { "epoch": 0.23, "learning_rate": 5e-06, "loss": 1.4086, "step": 50 }, { "epoch": 0.23, "learning_rate": 5.1e-06, "loss": 1.2868, "step": 51 }, { "epoch": 0.24, "learning_rate": 5.2e-06, "loss": 1.2877, "step": 52 }, { "epoch": 0.24, "learning_rate": 5.300000000000001e-06, "loss": 1.2699, "step": 53 }, { "epoch": 0.24, "learning_rate": 5.400000000000001e-06, "loss": 1.3287, "step": 54 }, { "epoch": 0.25, "learning_rate": 5.500000000000001e-06, "loss": 1.3529, "step": 55 }, { "epoch": 0.25, "eval_loss": 1.300736904144287, "eval_runtime": 5.7333, "eval_samples_per_second": 453.669, "eval_steps_per_second": 16.221, "step": 55 }, { "epoch": 0.25, "learning_rate": 5.600000000000001e-06, "loss": 1.2862, "step": 56 }, { "epoch": 0.26, "learning_rate": 5.7e-06, "loss": 1.3115, "step": 57 }, { "epoch": 0.26, "learning_rate": 5.8e-06, "loss": 1.3377, "step": 58 }, { "epoch": 0.27, "learning_rate": 5.9e-06, "loss": 1.3842, "step": 59 }, { "epoch": 0.27, "learning_rate": 6e-06, "loss": 1.38, "step": 60 }, { "epoch": 0.28, "learning_rate": 6.1e-06, "loss": 1.3856, "step": 61 }, { "epoch": 0.28, "learning_rate": 6.200000000000001e-06, "loss": 1.2891, "step": 62 }, { "epoch": 0.29, "learning_rate": 6.300000000000001e-06, "loss": 1.2762, "step": 63 }, { "epoch": 0.29, "learning_rate": 6.4000000000000006e-06, "loss": 1.356, "step": 64 }, { "epoch": 0.29, "learning_rate": 6.5000000000000004e-06, "loss": 1.3018, "step": 65 }, { "epoch": 0.3, "learning_rate": 6.600000000000001e-06, "loss": 1.3118, "step": 66 }, { "epoch": 0.3, "learning_rate": 6.700000000000001e-06, "loss": 1.3355, "step": 67 }, { "epoch": 0.31, "learning_rate": 6.800000000000001e-06, "loss": 1.2342, "step": 68 }, { "epoch": 0.31, "learning_rate": 6.9e-06, "loss": 1.3015, "step": 69 }, { "epoch": 0.32, "learning_rate": 7e-06, "loss": 1.2471, "step": 70 }, { "epoch": 0.32, "learning_rate": 7.100000000000001e-06, "loss": 1.2873, "step": 71 }, { "epoch": 0.33, "learning_rate": 7.2000000000000005e-06, "loss": 1.2805, "step": 72 }, { "epoch": 0.33, "learning_rate": 7.3e-06, "loss": 1.2326, "step": 73 }, { "epoch": 0.34, "learning_rate": 7.4e-06, "loss": 1.238, "step": 74 }, { "epoch": 0.34, "learning_rate": 7.500000000000001e-06, "loss": 1.3337, "step": 75 }, { "epoch": 0.34, "learning_rate": 7.600000000000001e-06, "loss": 1.3518, "step": 76 }, { "epoch": 0.35, "learning_rate": 7.7e-06, "loss": 1.2328, "step": 77 }, { "epoch": 0.35, "learning_rate": 7.800000000000002e-06, "loss": 1.2552, "step": 78 }, { "epoch": 0.36, "learning_rate": 7.9e-06, "loss": 1.2211, "step": 79 }, { "epoch": 0.36, "learning_rate": 8.000000000000001e-06, "loss": 1.2458, "step": 80 }, { "epoch": 0.37, "learning_rate": 8.1e-06, "loss": 1.1735, "step": 81 }, { "epoch": 0.37, "learning_rate": 8.2e-06, "loss": 1.1613, "step": 82 }, { "epoch": 0.38, "learning_rate": 8.3e-06, "loss": 1.2147, "step": 83 }, { "epoch": 0.38, "learning_rate": 8.400000000000001e-06, "loss": 1.2297, "step": 84 }, { "epoch": 0.39, "learning_rate": 8.5e-06, "loss": 1.2053, "step": 85 }, { "epoch": 0.39, "learning_rate": 8.6e-06, "loss": 1.2524, "step": 86 }, { "epoch": 0.39, "learning_rate": 8.700000000000001e-06, "loss": 1.2613, "step": 87 }, { "epoch": 0.4, "learning_rate": 8.8e-06, "loss": 1.1453, "step": 88 }, { "epoch": 0.4, "learning_rate": 8.900000000000001e-06, "loss": 1.2025, "step": 89 }, { "epoch": 0.41, "learning_rate": 9e-06, "loss": 1.1106, "step": 90 }, { "epoch": 0.41, "learning_rate": 9.100000000000001e-06, "loss": 1.1556, "step": 91 }, { "epoch": 0.42, "learning_rate": 9.200000000000002e-06, "loss": 1.2154, "step": 92 }, { "epoch": 0.42, "learning_rate": 9.3e-06, "loss": 1.1388, "step": 93 }, { "epoch": 0.43, "learning_rate": 9.4e-06, "loss": 1.1614, "step": 94 }, { "epoch": 0.43, "learning_rate": 9.5e-06, "loss": 1.1671, "step": 95 }, { "epoch": 0.44, "learning_rate": 9.600000000000001e-06, "loss": 1.1532, "step": 96 }, { "epoch": 0.44, "learning_rate": 9.7e-06, "loss": 1.1536, "step": 97 }, { "epoch": 0.44, "learning_rate": 9.800000000000001e-06, "loss": 1.1557, "step": 98 }, { "epoch": 0.45, "learning_rate": 9.9e-06, "loss": 1.1084, "step": 99 }, { "epoch": 0.45, "learning_rate": 1e-05, "loss": 1.1748, "step": 100 }, { "epoch": 0.46, "learning_rate": 9.998286624877786e-06, "loss": 1.1997, "step": 101 }, { "epoch": 0.46, "learning_rate": 9.993147673772869e-06, "loss": 1.0219, "step": 102 }, { "epoch": 0.47, "learning_rate": 9.984586668665641e-06, "loss": 1.1311, "step": 103 }, { "epoch": 0.47, "learning_rate": 9.972609476841368e-06, "loss": 1.0767, "step": 104 }, { "epoch": 0.48, "learning_rate": 9.957224306869053e-06, "loss": 1.0199, "step": 105 }, { "epoch": 0.48, "learning_rate": 9.938441702975689e-06, "loss": 1.2051, "step": 106 }, { "epoch": 0.48, "learning_rate": 9.916274537819774e-06, "loss": 1.1329, "step": 107 }, { "epoch": 0.49, "learning_rate": 9.890738003669029e-06, "loss": 1.2139, "step": 108 }, { "epoch": 0.49, "learning_rate": 9.861849601988384e-06, "loss": 1.2148, "step": 109 }, { "epoch": 0.5, "learning_rate": 9.829629131445342e-06, "loss": 1.2246, "step": 110 }, { "epoch": 0.5, "eval_loss": 1.0940097570419312, "eval_runtime": 5.4854, "eval_samples_per_second": 474.171, "eval_steps_per_second": 16.954, "step": 110 }, { "epoch": 0.5, "learning_rate": 9.794098674340966e-06, "loss": 1.191, "step": 111 }, { "epoch": 0.51, "learning_rate": 9.755282581475769e-06, "loss": 1.1356, "step": 112 }, { "epoch": 0.51, "learning_rate": 9.713207455460893e-06, "loss": 1.0826, "step": 113 }, { "epoch": 0.52, "learning_rate": 9.667902132486009e-06, "loss": 1.2344, "step": 114 }, { "epoch": 0.52, "learning_rate": 9.619397662556434e-06, "loss": 1.205, "step": 115 }, { "epoch": 0.53, "learning_rate": 9.567727288213005e-06, "loss": 1.1462, "step": 116 }, { "epoch": 0.53, "learning_rate": 9.512926421749305e-06, "loss": 1.1341, "step": 117 }, { "epoch": 0.53, "learning_rate": 9.45503262094184e-06, "loss": 1.1104, "step": 118 }, { "epoch": 0.54, "learning_rate": 9.394085563309827e-06, "loss": 1.1868, "step": 119 }, { "epoch": 0.54, "learning_rate": 9.330127018922195e-06, "loss": 1.0813, "step": 120 }, { "epoch": 0.55, "learning_rate": 9.263200821770462e-06, "loss": 1.1324, "step": 121 }, { "epoch": 0.55, "learning_rate": 9.193352839727122e-06, "loss": 1.1167, "step": 122 }, { "epoch": 0.56, "learning_rate": 9.120630943110078e-06, "loss": 1.056, "step": 123 }, { "epoch": 0.56, "learning_rate": 9.045084971874738e-06, "loss": 1.1708, "step": 124 }, { "epoch": 0.57, "learning_rate": 8.966766701456177e-06, "loss": 1.1312, "step": 125 }, { "epoch": 0.57, "learning_rate": 8.885729807284855e-06, "loss": 1.1008, "step": 126 }, { "epoch": 0.58, "learning_rate": 8.802029828000157e-06, "loss": 1.0592, "step": 127 }, { "epoch": 0.58, "learning_rate": 8.715724127386971e-06, "loss": 1.0699, "step": 128 }, { "epoch": 0.58, "learning_rate": 8.626871855061438e-06, "loss": 1.0373, "step": 129 }, { "epoch": 0.59, "learning_rate": 8.535533905932739e-06, "loss": 1.1162, "step": 130 }, { "epoch": 0.59, "learning_rate": 8.44177287846877e-06, "loss": 1.0207, "step": 131 }, { "epoch": 0.6, "learning_rate": 8.345653031794292e-06, "loss": 1.2075, "step": 132 }, { "epoch": 0.6, "learning_rate": 8.247240241650918e-06, "loss": 1.0471, "step": 133 }, { "epoch": 0.61, "learning_rate": 8.146601955249187e-06, "loss": 1.1402, "step": 134 }, { "epoch": 0.61, "learning_rate": 8.043807145043604e-06, "loss": 1.0375, "step": 135 }, { "epoch": 0.62, "learning_rate": 7.938926261462366e-06, "loss": 1.0273, "step": 136 }, { "epoch": 0.62, "learning_rate": 7.832031184624165e-06, "loss": 1.046, "step": 137 }, { "epoch": 0.63, "learning_rate": 7.723195175075136e-06, "loss": 1.1205, "step": 138 }, { "epoch": 0.63, "learning_rate": 7.612492823579744e-06, "loss": 1.1016, "step": 139 }, { "epoch": 0.63, "learning_rate": 7.500000000000001e-06, "loss": 1.1173, "step": 140 }, { "epoch": 0.64, "learning_rate": 7.3857938012980425e-06, "loss": 1.0941, "step": 141 }, { "epoch": 0.64, "learning_rate": 7.269952498697734e-06, "loss": 1.1158, "step": 142 }, { "epoch": 0.65, "learning_rate": 7.1525554840414765e-06, "loss": 1.061, "step": 143 }, { "epoch": 0.65, "learning_rate": 7.033683215379002e-06, "loss": 1.1158, "step": 144 }, { "epoch": 0.66, "learning_rate": 6.913417161825449e-06, "loss": 1.1206, "step": 145 }, { "epoch": 0.66, "learning_rate": 6.7918397477265e-06, "loss": 1.1196, "step": 146 }, { "epoch": 0.67, "learning_rate": 6.669034296168855e-06, "loss": 1.1284, "step": 147 }, { "epoch": 0.67, "learning_rate": 6.545084971874738e-06, "loss": 1.0504, "step": 148 }, { "epoch": 0.68, "learning_rate": 6.420076723519615e-06, "loss": 1.1095, "step": 149 }, { "epoch": 0.68, "learning_rate": 6.294095225512604e-06, "loss": 1.154, "step": 150 }, { "epoch": 0.68, "learning_rate": 6.1672268192795285e-06, "loss": 1.0097, "step": 151 }, { "epoch": 0.69, "learning_rate": 6.039558454088796e-06, "loss": 0.997, "step": 152 }, { "epoch": 0.69, "learning_rate": 5.911177627460739e-06, "loss": 1.0522, "step": 153 }, { "epoch": 0.7, "learning_rate": 5.782172325201155e-06, "loss": 1.0514, "step": 154 }, { "epoch": 0.7, "learning_rate": 5.65263096110026e-06, "loss": 1.0633, "step": 155 }, { "epoch": 0.71, "learning_rate": 5.522642316338268e-06, "loss": 1.0988, "step": 156 }, { "epoch": 0.71, "learning_rate": 5.392295478639226e-06, "loss": 1.1034, "step": 157 }, { "epoch": 0.72, "learning_rate": 5.2616797812147205e-06, "loss": 1.1069, "step": 158 }, { "epoch": 0.72, "learning_rate": 5.130884741539367e-06, "loss": 1.142, "step": 159 }, { "epoch": 0.73, "learning_rate": 5e-06, "loss": 1.0829, "step": 160 }, { "epoch": 0.73, "learning_rate": 4.869115258460636e-06, "loss": 1.0947, "step": 161 }, { "epoch": 0.73, "learning_rate": 4.738320218785281e-06, "loss": 1.0733, "step": 162 }, { "epoch": 0.74, "learning_rate": 4.6077045213607765e-06, "loss": 1.1207, "step": 163 }, { "epoch": 0.74, "learning_rate": 4.477357683661734e-06, "loss": 1.1621, "step": 164 }, { "epoch": 0.75, "learning_rate": 4.347369038899744e-06, "loss": 1.0636, "step": 165 }, { "epoch": 0.75, "eval_loss": 0.9949154853820801, "eval_runtime": 5.4121, "eval_samples_per_second": 480.59, "eval_steps_per_second": 17.184, "step": 165 }, { "epoch": 0.75, "learning_rate": 4.217827674798845e-06, "loss": 1.0529, "step": 166 }, { "epoch": 0.76, "learning_rate": 4.088822372539263e-06, "loss": 1.0638, "step": 167 }, { "epoch": 0.76, "learning_rate": 3.960441545911205e-06, "loss": 1.0807, "step": 168 }, { "epoch": 0.77, "learning_rate": 3.832773180720475e-06, "loss": 1.0677, "step": 169 }, { "epoch": 0.77, "learning_rate": 3.705904774487396e-06, "loss": 1.0897, "step": 170 }, { "epoch": 0.78, "learning_rate": 3.579923276480387e-06, "loss": 1.0828, "step": 171 }, { "epoch": 0.78, "learning_rate": 3.4549150281252635e-06, "loss": 1.0671, "step": 172 }, { "epoch": 0.78, "learning_rate": 3.330965703831146e-06, "loss": 1.0613, "step": 173 }, { "epoch": 0.79, "learning_rate": 3.2081602522734987e-06, "loss": 1.1383, "step": 174 }, { "epoch": 0.79, "learning_rate": 3.0865828381745515e-06, "loss": 1.1004, "step": 175 }, { "epoch": 0.8, "learning_rate": 2.966316784621e-06, "loss": 1.048, "step": 176 }, { "epoch": 0.8, "learning_rate": 2.8474445159585235e-06, "loss": 1.083, "step": 177 }, { "epoch": 0.81, "learning_rate": 2.7300475013022666e-06, "loss": 1.0262, "step": 178 }, { "epoch": 0.81, "learning_rate": 2.614206198701958e-06, "loss": 1.076, "step": 179 }, { "epoch": 0.82, "learning_rate": 2.5000000000000015e-06, "loss": 1.1453, "step": 180 }, { "epoch": 0.82, "learning_rate": 2.387507176420256e-06, "loss": 1.0674, "step": 181 }, { "epoch": 0.82, "learning_rate": 2.2768048249248648e-06, "loss": 1.0622, "step": 182 }, { "epoch": 0.83, "learning_rate": 2.1679688153758373e-06, "loss": 1.0837, "step": 183 }, { "epoch": 0.83, "learning_rate": 2.061073738537635e-06, "loss": 1.1283, "step": 184 }, { "epoch": 0.84, "learning_rate": 1.956192854956397e-06, "loss": 1.0808, "step": 185 }, { "epoch": 0.84, "learning_rate": 1.8533980447508138e-06, "loss": 1.1307, "step": 186 }, { "epoch": 0.85, "learning_rate": 1.7527597583490825e-06, "loss": 0.9674, "step": 187 }, { "epoch": 0.85, "learning_rate": 1.6543469682057105e-06, "loss": 1.0615, "step": 188 }, { "epoch": 0.86, "learning_rate": 1.5582271215312294e-06, "loss": 1.1103, "step": 189 }, { "epoch": 0.86, "learning_rate": 1.4644660940672628e-06, "loss": 1.1133, "step": 190 }, { "epoch": 0.87, "learning_rate": 1.373128144938563e-06, "loss": 1.0925, "step": 191 }, { "epoch": 0.87, "learning_rate": 1.2842758726130283e-06, "loss": 1.081, "step": 192 }, { "epoch": 0.87, "learning_rate": 1.1979701719998454e-06, "loss": 1.0841, "step": 193 }, { "epoch": 0.88, "learning_rate": 1.1142701927151456e-06, "loss": 1.1522, "step": 194 }, { "epoch": 0.88, "learning_rate": 1.0332332985438248e-06, "loss": 1.1564, "step": 195 }, { "epoch": 0.89, "learning_rate": 9.549150281252633e-07, "loss": 1.0965, "step": 196 }, { "epoch": 0.89, "learning_rate": 8.793690568899216e-07, "loss": 1.1434, "step": 197 }, { "epoch": 0.9, "learning_rate": 8.066471602728804e-07, "loss": 1.0312, "step": 198 }, { "epoch": 0.9, "learning_rate": 7.367991782295392e-07, "loss": 1.1378, "step": 199 }, { "epoch": 0.91, "learning_rate": 6.698729810778065e-07, "loss": 1.0707, "step": 200 }, { "epoch": 0.91, "learning_rate": 6.059144366901737e-07, "loss": 1.0679, "step": 201 }, { "epoch": 0.92, "learning_rate": 5.449673790581611e-07, "loss": 1.107, "step": 202 }, { "epoch": 0.92, "learning_rate": 4.87073578250698e-07, "loss": 0.9918, "step": 203 }, { "epoch": 0.92, "learning_rate": 4.322727117869951e-07, "loss": 1.0946, "step": 204 }, { "epoch": 0.93, "learning_rate": 3.8060233744356634e-07, "loss": 1.0381, "step": 205 }, { "epoch": 0.93, "learning_rate": 3.320978675139919e-07, "loss": 1.0833, "step": 206 }, { "epoch": 0.94, "learning_rate": 2.867925445391079e-07, "loss": 1.0792, "step": 207 }, { "epoch": 0.94, "learning_rate": 2.447174185242324e-07, "loss": 1.1722, "step": 208 }, { "epoch": 0.95, "learning_rate": 2.0590132565903475e-07, "loss": 1.0136, "step": 209 }, { "epoch": 0.95, "learning_rate": 1.7037086855465902e-07, "loss": 1.1095, "step": 210 }, { "epoch": 0.96, "learning_rate": 1.3815039801161723e-07, "loss": 1.1202, "step": 211 }, { "epoch": 0.96, "learning_rate": 1.0926199633097156e-07, "loss": 1.0463, "step": 212 }, { "epoch": 0.97, "learning_rate": 8.372546218022747e-08, "loss": 1.0762, "step": 213 }, { "epoch": 0.97, "learning_rate": 6.15582970243117e-08, "loss": 1.0496, "step": 214 }, { "epoch": 0.97, "learning_rate": 4.2775693130948094e-08, "loss": 1.0197, "step": 215 }, { "epoch": 0.98, "learning_rate": 2.7390523158633552e-08, "loss": 1.0559, "step": 216 }, { "epoch": 0.98, "learning_rate": 1.541333133436018e-08, "loss": 1.1146, "step": 217 }, { "epoch": 0.99, "learning_rate": 6.852326227130835e-09, "loss": 1.0802, "step": 218 }, { "epoch": 0.99, "learning_rate": 1.7133751222137007e-09, "loss": 1.0113, "step": 219 }, { "epoch": 1.0, "learning_rate": 0.0, "loss": 1.0758, "step": 220 }, { "epoch": 1.0, "eval_loss": 0.9915103912353516, "eval_runtime": 5.4164, "eval_samples_per_second": 480.209, "eval_steps_per_second": 17.17, "step": 220 } ], "logging_steps": 1, "max_steps": 220, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 1.589146774667264e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }