{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.972307692307693, "eval_steps": 1000, "global_step": 505, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009846153846153846, "grad_norm": 38.5, "learning_rate": 3.125e-06, "loss": 1.348, "step": 1 }, { "epoch": 0.019692307692307693, "grad_norm": 28.625, "learning_rate": 6.25e-06, "loss": 1.3239, "step": 2 }, { "epoch": 0.039384615384615386, "grad_norm": 18.375, "learning_rate": 1.25e-05, "loss": 1.3144, "step": 4 }, { "epoch": 0.059076923076923075, "grad_norm": 14.3125, "learning_rate": 1.8750000000000002e-05, "loss": 1.4069, "step": 6 }, { "epoch": 0.07876923076923077, "grad_norm": 11.5625, "learning_rate": 2.5e-05, "loss": 1.247, "step": 8 }, { "epoch": 0.09846153846153846, "grad_norm": 11.25, "learning_rate": 3.125e-05, "loss": 1.2618, "step": 10 }, { "epoch": 0.11815384615384615, "grad_norm": 10.25, "learning_rate": 3.7500000000000003e-05, "loss": 1.287, "step": 12 }, { "epoch": 0.13784615384615384, "grad_norm": 5.9375, "learning_rate": 4.375e-05, "loss": 1.1393, "step": 14 }, { "epoch": 0.15753846153846154, "grad_norm": 7.34375, "learning_rate": 5e-05, "loss": 1.1422, "step": 16 }, { "epoch": 0.17723076923076922, "grad_norm": 7.84375, "learning_rate": 4.9997936302412985e-05, "loss": 1.0547, "step": 18 }, { "epoch": 0.19692307692307692, "grad_norm": 7.0, "learning_rate": 4.9991745550359746e-05, "loss": 1.0486, "step": 20 }, { "epoch": 0.21661538461538463, "grad_norm": 7.75, "learning_rate": 4.99814287659075e-05, "loss": 1.0206, "step": 22 }, { "epoch": 0.2363076923076923, "grad_norm": 13.0625, "learning_rate": 4.996698765231409e-05, "loss": 0.9878, "step": 24 }, { "epoch": 0.256, "grad_norm": 4.59375, "learning_rate": 4.994842459374682e-05, "loss": 0.9275, "step": 26 }, { "epoch": 0.2756923076923077, "grad_norm": 6.46875, "learning_rate": 4.992574265488883e-05, "loss": 0.9555, "step": 28 }, { "epoch": 0.2953846153846154, "grad_norm": 3.890625, "learning_rate": 4.989894558043312e-05, "loss": 0.9275, "step": 30 }, { "epoch": 0.3150769230769231, "grad_norm": 4.75, "learning_rate": 4.986803779446432e-05, "loss": 0.9301, "step": 32 }, { "epoch": 0.33476923076923076, "grad_norm": 2.71875, "learning_rate": 4.983302439972829e-05, "loss": 0.8875, "step": 34 }, { "epoch": 0.35446153846153844, "grad_norm": 6.65625, "learning_rate": 4.979391117678969e-05, "loss": 0.8649, "step": 36 }, { "epoch": 0.37415384615384617, "grad_norm": 2.59375, "learning_rate": 4.975070458307763e-05, "loss": 0.8497, "step": 38 }, { "epoch": 0.39384615384615385, "grad_norm": 3.625, "learning_rate": 4.970341175181956e-05, "loss": 0.8358, "step": 40 }, { "epoch": 0.4135384615384615, "grad_norm": 4.1875, "learning_rate": 4.9652040490863624e-05, "loss": 0.8191, "step": 42 }, { "epoch": 0.43323076923076925, "grad_norm": 4.75, "learning_rate": 4.95965992813896e-05, "loss": 0.8425, "step": 44 }, { "epoch": 0.45292307692307693, "grad_norm": 2.796875, "learning_rate": 4.9537097276508704e-05, "loss": 0.8027, "step": 46 }, { "epoch": 0.4726153846153846, "grad_norm": 2.453125, "learning_rate": 4.947354429975245e-05, "loss": 0.812, "step": 48 }, { "epoch": 0.49230769230769234, "grad_norm": 1.9140625, "learning_rate": 4.940595084345082e-05, "loss": 0.7979, "step": 50 }, { "epoch": 0.512, "grad_norm": 2.3125, "learning_rate": 4.933432806700004e-05, "loss": 0.7927, "step": 52 }, { "epoch": 0.5316923076923077, "grad_norm": 2.953125, "learning_rate": 4.925868779502015e-05, "loss": 0.7773, "step": 54 }, { "epoch": 0.5513846153846154, "grad_norm": 3.625, "learning_rate": 4.9179042515402926e-05, "loss": 0.7694, "step": 56 }, { "epoch": 0.571076923076923, "grad_norm": 2.109375, "learning_rate": 4.909540537725007e-05, "loss": 0.7703, "step": 58 }, { "epoch": 0.5907692307692308, "grad_norm": 1.890625, "learning_rate": 4.900779018870239e-05, "loss": 0.8162, "step": 60 }, { "epoch": 0.6104615384615385, "grad_norm": 2.15625, "learning_rate": 4.891621141466014e-05, "loss": 0.743, "step": 62 }, { "epoch": 0.6301538461538462, "grad_norm": 1.6875, "learning_rate": 4.882068417439493e-05, "loss": 0.7572, "step": 64 }, { "epoch": 0.6498461538461539, "grad_norm": 2.078125, "learning_rate": 4.872122423905358e-05, "loss": 0.7445, "step": 66 }, { "epoch": 0.6695384615384615, "grad_norm": 1.4609375, "learning_rate": 4.8617848029054354e-05, "loss": 0.7419, "step": 68 }, { "epoch": 0.6892307692307692, "grad_norm": 1.46875, "learning_rate": 4.851057261137608e-05, "loss": 0.7402, "step": 70 }, { "epoch": 0.7089230769230769, "grad_norm": 4.09375, "learning_rate": 4.839941569674041e-05, "loss": 0.7131, "step": 72 }, { "epoch": 0.7286153846153847, "grad_norm": 4.25, "learning_rate": 4.8284395636687854e-05, "loss": 0.6954, "step": 74 }, { "epoch": 0.7483076923076923, "grad_norm": 3.21875, "learning_rate": 4.816553142054805e-05, "loss": 0.699, "step": 76 }, { "epoch": 0.768, "grad_norm": 4.125, "learning_rate": 4.804284267230468e-05, "loss": 0.6775, "step": 78 }, { "epoch": 0.7876923076923077, "grad_norm": 2.984375, "learning_rate": 4.791634964735564e-05, "loss": 0.7056, "step": 80 }, { "epoch": 0.8073846153846154, "grad_norm": 2.59375, "learning_rate": 4.778607322916896e-05, "loss": 0.6944, "step": 82 }, { "epoch": 0.827076923076923, "grad_norm": 1.859375, "learning_rate": 4.765203492583502e-05, "loss": 0.668, "step": 84 }, { "epoch": 0.8467692307692307, "grad_norm": 2.109375, "learning_rate": 4.751425686651568e-05, "loss": 0.673, "step": 86 }, { "epoch": 0.8664615384615385, "grad_norm": 1.84375, "learning_rate": 4.737276179779083e-05, "loss": 0.7153, "step": 88 }, { "epoch": 0.8861538461538462, "grad_norm": 2.125, "learning_rate": 4.722757307990302e-05, "loss": 0.7234, "step": 90 }, { "epoch": 0.9058461538461539, "grad_norm": 1.3984375, "learning_rate": 4.707871468290078e-05, "loss": 0.6231, "step": 92 }, { "epoch": 0.9255384615384615, "grad_norm": 1.6484375, "learning_rate": 4.69262111826813e-05, "loss": 0.642, "step": 94 }, { "epoch": 0.9452307692307692, "grad_norm": 1.5625, "learning_rate": 4.6770087756932995e-05, "loss": 0.6231, "step": 96 }, { "epoch": 0.9649230769230769, "grad_norm": 1.78125, "learning_rate": 4.661037018097884e-05, "loss": 0.671, "step": 98 }, { "epoch": 0.9846153846153847, "grad_norm": 1.5625, "learning_rate": 4.6447084823520926e-05, "loss": 0.6657, "step": 100 }, { "epoch": 1.0043076923076923, "grad_norm": 1.5390625, "learning_rate": 4.62802586422871e-05, "loss": 0.607, "step": 102 }, { "epoch": 1.024, "grad_norm": 1.5078125, "learning_rate": 4.610991917958037e-05, "loss": 0.5739, "step": 104 }, { "epoch": 1.0436923076923077, "grad_norm": 1.484375, "learning_rate": 4.593609455773181e-05, "loss": 0.6011, "step": 106 }, { "epoch": 1.0633846153846154, "grad_norm": 1.5234375, "learning_rate": 4.5758813474457606e-05, "loss": 0.5776, "step": 108 }, { "epoch": 1.083076923076923, "grad_norm": 1.234375, "learning_rate": 4.557810519812128e-05, "loss": 0.5808, "step": 110 }, { "epoch": 1.1027692307692307, "grad_norm": 1.7890625, "learning_rate": 4.539399956290152e-05, "loss": 0.5965, "step": 112 }, { "epoch": 1.1224615384615384, "grad_norm": 1.2109375, "learning_rate": 4.520652696386677e-05, "loss": 0.608, "step": 114 }, { "epoch": 1.142153846153846, "grad_norm": 1.625, "learning_rate": 4.5015718351957015e-05, "loss": 0.5714, "step": 116 }, { "epoch": 1.1618461538461538, "grad_norm": 1.5703125, "learning_rate": 4.482160522887403e-05, "loss": 0.5876, "step": 118 }, { "epoch": 1.1815384615384614, "grad_norm": 1.5078125, "learning_rate": 4.462421964188052e-05, "loss": 0.5835, "step": 120 }, { "epoch": 1.2012307692307693, "grad_norm": 1.3515625, "learning_rate": 4.442359417850924e-05, "loss": 0.5881, "step": 122 }, { "epoch": 1.220923076923077, "grad_norm": 1.9140625, "learning_rate": 4.421976196118297e-05, "loss": 0.5471, "step": 124 }, { "epoch": 1.2406153846153847, "grad_norm": 1.625, "learning_rate": 4.401275664174611e-05, "loss": 0.5417, "step": 126 }, { "epoch": 1.2603076923076924, "grad_norm": 1.625, "learning_rate": 4.380261239590892e-05, "loss": 0.5337, "step": 128 }, { "epoch": 1.28, "grad_norm": 1.5625, "learning_rate": 4.358936391760524e-05, "loss": 0.5731, "step": 130 }, { "epoch": 1.2996923076923077, "grad_norm": 1.5625, "learning_rate": 4.337304641326467e-05, "loss": 0.5363, "step": 132 }, { "epoch": 1.3193846153846154, "grad_norm": 1.515625, "learning_rate": 4.315369559600018e-05, "loss": 0.5566, "step": 134 }, { "epoch": 1.339076923076923, "grad_norm": 1.3359375, "learning_rate": 4.2931347679711924e-05, "loss": 0.586, "step": 136 }, { "epoch": 1.3587692307692307, "grad_norm": 1.6015625, "learning_rate": 4.270603937310859e-05, "loss": 0.5535, "step": 138 }, { "epoch": 1.3784615384615384, "grad_norm": 1.2734375, "learning_rate": 4.2477807873646845e-05, "loss": 0.5788, "step": 140 }, { "epoch": 1.398153846153846, "grad_norm": 1.484375, "learning_rate": 4.2246690861390294e-05, "loss": 0.538, "step": 142 }, { "epoch": 1.417846153846154, "grad_norm": 1.453125, "learning_rate": 4.201272649278856e-05, "loss": 0.5531, "step": 144 }, { "epoch": 1.4375384615384617, "grad_norm": 1.5, "learning_rate": 4.177595339437789e-05, "loss": 0.55, "step": 146 }, { "epoch": 1.4572307692307693, "grad_norm": 1.2890625, "learning_rate": 4.153641065640402e-05, "loss": 0.5333, "step": 148 }, { "epoch": 1.476923076923077, "grad_norm": 1.4296875, "learning_rate": 4.129413782636859e-05, "loss": 0.5372, "step": 150 }, { "epoch": 1.4966153846153847, "grad_norm": 1.359375, "learning_rate": 4.1049174902499974e-05, "loss": 0.5575, "step": 152 }, { "epoch": 1.5163076923076924, "grad_norm": 1.4296875, "learning_rate": 4.080156232714976e-05, "loss": 0.5571, "step": 154 }, { "epoch": 1.536, "grad_norm": 1.2421875, "learning_rate": 4.055134098011589e-05, "loss": 0.5246, "step": 156 }, { "epoch": 1.5556923076923077, "grad_norm": 1.5546875, "learning_rate": 4.0298552171893576e-05, "loss": 0.5597, "step": 158 }, { "epoch": 1.5753846153846154, "grad_norm": 1.3046875, "learning_rate": 4.0043237636855116e-05, "loss": 0.5536, "step": 160 }, { "epoch": 1.595076923076923, "grad_norm": 1.5078125, "learning_rate": 3.978543952635967e-05, "loss": 0.5527, "step": 162 }, { "epoch": 1.6147692307692307, "grad_norm": 1.3359375, "learning_rate": 3.952520040179434e-05, "loss": 0.5137, "step": 164 }, { "epoch": 1.6344615384615384, "grad_norm": 1.34375, "learning_rate": 3.92625632275474e-05, "loss": 0.5795, "step": 166 }, { "epoch": 1.654153846153846, "grad_norm": 1.46875, "learning_rate": 3.899757136391507e-05, "loss": 0.5237, "step": 168 }, { "epoch": 1.6738461538461538, "grad_norm": 1.28125, "learning_rate": 3.873026855994292e-05, "loss": 0.5326, "step": 170 }, { "epoch": 1.6935384615384614, "grad_norm": 1.3125, "learning_rate": 3.8460698946203054e-05, "loss": 0.5231, "step": 172 }, { "epoch": 1.7132307692307691, "grad_norm": 1.5546875, "learning_rate": 3.818890702750841e-05, "loss": 0.5492, "step": 174 }, { "epoch": 1.7329230769230768, "grad_norm": 1.453125, "learning_rate": 3.791493767556511e-05, "loss": 0.6126, "step": 176 }, { "epoch": 1.7526153846153845, "grad_norm": 1.1953125, "learning_rate": 3.7638836121564415e-05, "loss": 0.5463, "step": 178 }, { "epoch": 1.7723076923076924, "grad_norm": 1.3515625, "learning_rate": 3.7360647948715164e-05, "loss": 0.515, "step": 180 }, { "epoch": 1.792, "grad_norm": 1.296875, "learning_rate": 3.708041908471827e-05, "loss": 0.5259, "step": 182 }, { "epoch": 1.8116923076923077, "grad_norm": 1.578125, "learning_rate": 3.679819579418414e-05, "loss": 0.5059, "step": 184 }, { "epoch": 1.8313846153846154, "grad_norm": 1.375, "learning_rate": 3.651402467099468e-05, "loss": 0.5709, "step": 186 }, { "epoch": 1.851076923076923, "grad_norm": 1.21875, "learning_rate": 3.622795263061079e-05, "loss": 0.5628, "step": 188 }, { "epoch": 1.8707692307692307, "grad_norm": 1.4609375, "learning_rate": 3.594002690232682e-05, "loss": 0.5066, "step": 190 }, { "epoch": 1.8904615384615384, "grad_norm": 1.3203125, "learning_rate": 3.565029502147323e-05, "loss": 0.5625, "step": 192 }, { "epoch": 1.9101538461538463, "grad_norm": 1.6796875, "learning_rate": 3.53588048215687e-05, "loss": 0.5336, "step": 194 }, { "epoch": 1.929846153846154, "grad_norm": 1.4921875, "learning_rate": 3.506560442642299e-05, "loss": 0.5215, "step": 196 }, { "epoch": 1.9495384615384617, "grad_norm": 1.546875, "learning_rate": 3.4770742242191945e-05, "loss": 0.5296, "step": 198 }, { "epoch": 1.9692307692307693, "grad_norm": 2.421875, "learning_rate": 3.4474266949385817e-05, "loss": 0.523, "step": 200 }, { "epoch": 1.988923076923077, "grad_norm": 1.34375, "learning_rate": 3.4176227494832305e-05, "loss": 0.4856, "step": 202 }, { "epoch": 2.0086153846153847, "grad_norm": 1.3984375, "learning_rate": 3.387667308359568e-05, "loss": 0.5298, "step": 204 }, { "epoch": 2.0283076923076924, "grad_norm": 1.375, "learning_rate": 3.3575653170853175e-05, "loss": 0.4869, "step": 206 }, { "epoch": 2.048, "grad_norm": 1.0703125, "learning_rate": 3.327321745373021e-05, "loss": 0.479, "step": 208 }, { "epoch": 2.0676923076923077, "grad_norm": 1.234375, "learning_rate": 3.2969415863095556e-05, "loss": 0.4935, "step": 210 }, { "epoch": 2.0873846153846154, "grad_norm": 1.2734375, "learning_rate": 3.266429855531797e-05, "loss": 0.4773, "step": 212 }, { "epoch": 2.107076923076923, "grad_norm": 1.3515625, "learning_rate": 3.2357915903985605e-05, "loss": 0.4611, "step": 214 }, { "epoch": 2.1267692307692307, "grad_norm": 1.2265625, "learning_rate": 3.2050318491589506e-05, "loss": 0.469, "step": 216 }, { "epoch": 2.1464615384615384, "grad_norm": 1.2109375, "learning_rate": 3.174155710117271e-05, "loss": 0.4758, "step": 218 }, { "epoch": 2.166153846153846, "grad_norm": 1.34375, "learning_rate": 3.143168270794612e-05, "loss": 0.4933, "step": 220 }, { "epoch": 2.1858461538461538, "grad_norm": 1.2109375, "learning_rate": 3.112074647087274e-05, "loss": 0.4814, "step": 222 }, { "epoch": 2.2055384615384614, "grad_norm": 1.203125, "learning_rate": 3.080879972422154e-05, "loss": 0.5064, "step": 224 }, { "epoch": 2.225230769230769, "grad_norm": 1.34375, "learning_rate": 3.0495893969092392e-05, "loss": 0.4576, "step": 226 }, { "epoch": 2.244923076923077, "grad_norm": 1.4453125, "learning_rate": 3.0182080864913452e-05, "loss": 0.4902, "step": 228 }, { "epoch": 2.2646153846153845, "grad_norm": 1.21875, "learning_rate": 2.9867412220912373e-05, "loss": 0.4486, "step": 230 }, { "epoch": 2.284307692307692, "grad_norm": 1.1953125, "learning_rate": 2.9551939987562866e-05, "loss": 0.4786, "step": 232 }, { "epoch": 2.304, "grad_norm": 1.2578125, "learning_rate": 2.923571624800787e-05, "loss": 0.4814, "step": 234 }, { "epoch": 2.3236923076923075, "grad_norm": 1.3359375, "learning_rate": 2.891879320946086e-05, "loss": 0.4915, "step": 236 }, { "epoch": 2.3433846153846156, "grad_norm": 1.2265625, "learning_rate": 2.8601223194586612e-05, "loss": 0.4931, "step": 238 }, { "epoch": 2.363076923076923, "grad_norm": 1.4453125, "learning_rate": 2.8283058632863003e-05, "loss": 0.481, "step": 240 }, { "epoch": 2.382769230769231, "grad_norm": 1.4140625, "learning_rate": 2.7964352051925103e-05, "loss": 0.4458, "step": 242 }, { "epoch": 2.4024615384615386, "grad_norm": 1.28125, "learning_rate": 2.7645156068893073e-05, "loss": 0.499, "step": 244 }, { "epoch": 2.4221538461538463, "grad_norm": 2.078125, "learning_rate": 2.732552338168531e-05, "loss": 0.4937, "step": 246 }, { "epoch": 2.441846153846154, "grad_norm": 1.1875, "learning_rate": 2.7005506760318235e-05, "loss": 0.4628, "step": 248 }, { "epoch": 2.4615384615384617, "grad_norm": 1.421875, "learning_rate": 2.66851590381942e-05, "loss": 0.4741, "step": 250 }, { "epoch": 2.4812307692307694, "grad_norm": 1.21875, "learning_rate": 2.6364533103378896e-05, "loss": 0.4569, "step": 252 }, { "epoch": 2.500923076923077, "grad_norm": 1.3515625, "learning_rate": 2.604368188986977e-05, "loss": 0.4851, "step": 254 }, { "epoch": 2.5206153846153847, "grad_norm": 1.4921875, "learning_rate": 2.5722658368856816e-05, "loss": 0.4935, "step": 256 }, { "epoch": 2.5403076923076924, "grad_norm": 1.203125, "learning_rate": 2.5401515539977305e-05, "loss": 0.4947, "step": 258 }, { "epoch": 2.56, "grad_norm": 1.328125, "learning_rate": 2.5080306422565707e-05, "loss": 0.4642, "step": 260 }, { "epoch": 2.5796923076923077, "grad_norm": 1.4140625, "learning_rate": 2.4759084046900486e-05, "loss": 0.5064, "step": 262 }, { "epoch": 2.5993846153846154, "grad_norm": 1.2734375, "learning_rate": 2.4437901445448936e-05, "loss": 0.4376, "step": 264 }, { "epoch": 2.619076923076923, "grad_norm": 1.2734375, "learning_rate": 2.4116811644111852e-05, "loss": 0.4861, "step": 266 }, { "epoch": 2.6387692307692308, "grad_norm": 1.3203125, "learning_rate": 2.379586765346907e-05, "loss": 0.4878, "step": 268 }, { "epoch": 2.6584615384615384, "grad_norm": 1.3125, "learning_rate": 2.347512246002774e-05, "loss": 0.4827, "step": 270 }, { "epoch": 2.678153846153846, "grad_norm": 1.3359375, "learning_rate": 2.3154629017474384e-05, "loss": 0.4769, "step": 272 }, { "epoch": 2.697846153846154, "grad_norm": 1.3203125, "learning_rate": 2.2834440237932536e-05, "loss": 0.5063, "step": 274 }, { "epoch": 2.7175384615384615, "grad_norm": 1.15625, "learning_rate": 2.251460898322712e-05, "loss": 0.4483, "step": 276 }, { "epoch": 2.737230769230769, "grad_norm": 1.3984375, "learning_rate": 2.219518805615724e-05, "loss": 0.4855, "step": 278 }, { "epoch": 2.756923076923077, "grad_norm": 1.296875, "learning_rate": 2.1876230191778598e-05, "loss": 0.4663, "step": 280 }, { "epoch": 2.7766153846153845, "grad_norm": 1.109375, "learning_rate": 2.155778804869721e-05, "loss": 0.5065, "step": 282 }, { "epoch": 2.796307692307692, "grad_norm": 1.25, "learning_rate": 2.123991420037565e-05, "loss": 0.4757, "step": 284 }, { "epoch": 2.816, "grad_norm": 1.359375, "learning_rate": 2.0922661126453432e-05, "loss": 0.4768, "step": 286 }, { "epoch": 2.835692307692308, "grad_norm": 1.265625, "learning_rate": 2.0606081204082797e-05, "loss": 0.4383, "step": 288 }, { "epoch": 2.855384615384615, "grad_norm": 1.5625, "learning_rate": 2.02902266992815e-05, "loss": 0.4976, "step": 290 }, { "epoch": 2.8750769230769233, "grad_norm": 1.078125, "learning_rate": 1.9975149758303883e-05, "loss": 0.4871, "step": 292 }, { "epoch": 2.8947692307692305, "grad_norm": 1.3125, "learning_rate": 1.9660902399031782e-05, "loss": 0.4807, "step": 294 }, { "epoch": 2.9144615384615387, "grad_norm": 1.4765625, "learning_rate": 1.9347536502386553e-05, "loss": 0.4544, "step": 296 }, { "epoch": 2.934153846153846, "grad_norm": 1.2109375, "learning_rate": 1.9035103803763792e-05, "loss": 0.4924, "step": 298 }, { "epoch": 2.953846153846154, "grad_norm": 1.3828125, "learning_rate": 1.8723655884491982e-05, "loss": 0.4846, "step": 300 }, { "epoch": 2.9735384615384617, "grad_norm": 1.21875, "learning_rate": 1.8413244163316696e-05, "loss": 0.4921, "step": 302 }, { "epoch": 2.9932307692307694, "grad_norm": 1.3359375, "learning_rate": 1.8103919887911526e-05, "loss": 0.4728, "step": 304 }, { "epoch": 3.012923076923077, "grad_norm": 1.109375, "learning_rate": 1.7795734126417326e-05, "loss": 0.4531, "step": 306 }, { "epoch": 3.0326153846153847, "grad_norm": 1.328125, "learning_rate": 1.7488737759011105e-05, "loss": 0.4468, "step": 308 }, { "epoch": 3.0523076923076924, "grad_norm": 1.234375, "learning_rate": 1.718298146950585e-05, "loss": 0.4727, "step": 310 }, { "epoch": 3.072, "grad_norm": 1.09375, "learning_rate": 1.6878515736982915e-05, "loss": 0.4429, "step": 312 }, { "epoch": 3.0916923076923077, "grad_norm": 1.1484375, "learning_rate": 1.657539082745811e-05, "loss": 0.4304, "step": 314 }, { "epoch": 3.1113846153846154, "grad_norm": 1.125, "learning_rate": 1.6273656785582986e-05, "loss": 0.4814, "step": 316 }, { "epoch": 3.131076923076923, "grad_norm": 1.0625, "learning_rate": 1.597336342638266e-05, "loss": 0.411, "step": 318 }, { "epoch": 3.1507692307692308, "grad_norm": 1.40625, "learning_rate": 1.5674560327031613e-05, "loss": 0.4318, "step": 320 }, { "epoch": 3.1704615384615384, "grad_norm": 1.5703125, "learning_rate": 1.5377296818668638e-05, "loss": 0.4685, "step": 322 }, { "epoch": 3.190153846153846, "grad_norm": 1.109375, "learning_rate": 1.5081621978252548e-05, "loss": 0.423, "step": 324 }, { "epoch": 3.209846153846154, "grad_norm": 1.2734375, "learning_rate": 1.47875846204597e-05, "loss": 0.4587, "step": 326 }, { "epoch": 3.2295384615384615, "grad_norm": 1.078125, "learning_rate": 1.449523328962496e-05, "loss": 0.4341, "step": 328 }, { "epoch": 3.249230769230769, "grad_norm": 1.140625, "learning_rate": 1.420461625172721e-05, "loss": 0.4596, "step": 330 }, { "epoch": 3.268923076923077, "grad_norm": 1.3828125, "learning_rate": 1.3915781486420848e-05, "loss": 0.4357, "step": 332 }, { "epoch": 3.2886153846153845, "grad_norm": 1.2421875, "learning_rate": 1.3628776679114517e-05, "loss": 0.4672, "step": 334 }, { "epoch": 3.308307692307692, "grad_norm": 1.34375, "learning_rate": 1.3343649213098486e-05, "loss": 0.4494, "step": 336 }, { "epoch": 3.328, "grad_norm": 1.296875, "learning_rate": 1.3060446161721855e-05, "loss": 0.4619, "step": 338 }, { "epoch": 3.3476923076923075, "grad_norm": 1.171875, "learning_rate": 1.277921428062091e-05, "loss": 0.4561, "step": 340 }, { "epoch": 3.367384615384615, "grad_norm": 1.1484375, "learning_rate": 1.2500000000000006e-05, "loss": 0.4275, "step": 342 }, { "epoch": 3.387076923076923, "grad_norm": 1.390625, "learning_rate": 1.2222849416966117e-05, "loss": 0.4704, "step": 344 }, { "epoch": 3.406769230769231, "grad_norm": 1.296875, "learning_rate": 1.1947808287918404e-05, "loss": 0.4283, "step": 346 }, { "epoch": 3.4264615384615382, "grad_norm": 1.1953125, "learning_rate": 1.1674922020994022e-05, "loss": 0.4346, "step": 348 }, { "epoch": 3.4461538461538463, "grad_norm": 1.203125, "learning_rate": 1.14042356685714e-05, "loss": 0.4613, "step": 350 }, { "epoch": 3.465846153846154, "grad_norm": 1.3046875, "learning_rate": 1.1135793919832336e-05, "loss": 0.4634, "step": 352 }, { "epoch": 3.4855384615384617, "grad_norm": 1.3828125, "learning_rate": 1.0869641093383962e-05, "loss": 0.4702, "step": 354 }, { "epoch": 3.5052307692307694, "grad_norm": 1.2578125, "learning_rate": 1.0605821129941934e-05, "loss": 0.458, "step": 356 }, { "epoch": 3.524923076923077, "grad_norm": 1.6796875, "learning_rate": 1.0344377585075998e-05, "loss": 0.4286, "step": 358 }, { "epoch": 3.5446153846153847, "grad_norm": 1.3125, "learning_rate": 1.0085353622019175e-05, "loss": 0.46, "step": 360 }, { "epoch": 3.5643076923076924, "grad_norm": 1.25, "learning_rate": 9.82879200454167e-06, "loss": 0.4323, "step": 362 }, { "epoch": 3.584, "grad_norm": 1.3125, "learning_rate": 9.574735089890766e-06, "loss": 0.4452, "step": 364 }, { "epoch": 3.6036923076923078, "grad_norm": 1.2734375, "learning_rate": 9.323224821797782e-06, "loss": 0.4605, "step": 366 }, { "epoch": 3.6233846153846154, "grad_norm": 1.09375, "learning_rate": 9.074302723553398e-06, "loss": 0.4871, "step": 368 }, { "epoch": 3.643076923076923, "grad_norm": 1.21875, "learning_rate": 8.8280098911523e-06, "loss": 0.4801, "step": 370 }, { "epoch": 3.6627692307692308, "grad_norm": 1.15625, "learning_rate": 8.584386986508388e-06, "loss": 0.4666, "step": 372 }, { "epoch": 3.6824615384615385, "grad_norm": 1.4296875, "learning_rate": 8.343474230741715e-06, "loss": 0.4404, "step": 374 }, { "epoch": 3.702153846153846, "grad_norm": 1.1484375, "learning_rate": 8.105311397538085e-06, "loss": 0.4526, "step": 376 }, { "epoch": 3.721846153846154, "grad_norm": 1.4296875, "learning_rate": 7.869937806582642e-06, "loss": 0.4433, "step": 378 }, { "epoch": 3.7415384615384615, "grad_norm": 1.1484375, "learning_rate": 7.63739231706833e-06, "loss": 0.4287, "step": 380 }, { "epoch": 3.761230769230769, "grad_norm": 1.40625, "learning_rate": 7.407713321280377e-06, "loss": 0.465, "step": 382 }, { "epoch": 3.780923076923077, "grad_norm": 1.203125, "learning_rate": 7.180938738257944e-06, "loss": 0.445, "step": 384 }, { "epoch": 3.8006153846153845, "grad_norm": 1.3203125, "learning_rate": 6.957106007533826e-06, "loss": 0.4544, "step": 386 }, { "epoch": 3.820307692307692, "grad_norm": 1.15625, "learning_rate": 6.736252082953307e-06, "loss": 0.4508, "step": 388 }, { "epoch": 3.84, "grad_norm": 1.3671875, "learning_rate": 6.5184134265733e-06, "loss": 0.4575, "step": 390 }, { "epoch": 3.8596923076923075, "grad_norm": 1.28125, "learning_rate": 6.303626002642554e-06, "loss": 0.4432, "step": 392 }, { "epoch": 3.879384615384615, "grad_norm": 1.34375, "learning_rate": 6.091925271664156e-06, "loss": 0.4614, "step": 394 }, { "epoch": 3.8990769230769233, "grad_norm": 1.0625, "learning_rate": 5.883346184541128e-06, "loss": 0.4645, "step": 396 }, { "epoch": 3.9187692307692306, "grad_norm": 1.234375, "learning_rate": 5.67792317680616e-06, "loss": 0.4533, "step": 398 }, { "epoch": 3.9384615384615387, "grad_norm": 1.2421875, "learning_rate": 5.475690162936489e-06, "loss": 0.4232, "step": 400 }, { "epoch": 3.958153846153846, "grad_norm": 2.046875, "learning_rate": 5.27668053075474e-06, "loss": 0.4266, "step": 402 }, { "epoch": 3.977846153846154, "grad_norm": 1.2890625, "learning_rate": 5.0809271359167215e-06, "loss": 0.4529, "step": 404 }, { "epoch": 3.9975384615384613, "grad_norm": 1.40625, "learning_rate": 4.888462296487128e-06, "loss": 0.4429, "step": 406 }, { "epoch": 4.017230769230769, "grad_norm": 1.109375, "learning_rate": 4.699317787603927e-06, "loss": 0.4537, "step": 408 }, { "epoch": 4.036923076923077, "grad_norm": 1.3046875, "learning_rate": 4.513524836232458e-06, "loss": 0.4659, "step": 410 }, { "epoch": 4.056615384615385, "grad_norm": 1.09375, "learning_rate": 4.331114116009938e-06, "loss": 0.4156, "step": 412 }, { "epoch": 4.076307692307692, "grad_norm": 1.3125, "learning_rate": 4.152115742181434e-06, "loss": 0.4561, "step": 414 }, { "epoch": 4.096, "grad_norm": 1.5390625, "learning_rate": 3.97655926662791e-06, "loss": 0.4438, "step": 416 }, { "epoch": 4.115692307692307, "grad_norm": 1.234375, "learning_rate": 3.80447367298738e-06, "loss": 0.4331, "step": 418 }, { "epoch": 4.135384615384615, "grad_norm": 1.1328125, "learning_rate": 3.6358873718697726e-06, "loss": 0.4261, "step": 420 }, { "epoch": 4.155076923076923, "grad_norm": 1.2890625, "learning_rate": 3.470828196166523e-06, "loss": 0.4629, "step": 422 }, { "epoch": 4.174769230769231, "grad_norm": 1.1875, "learning_rate": 3.3093233964554466e-06, "loss": 0.4271, "step": 424 }, { "epoch": 4.194461538461539, "grad_norm": 1.2578125, "learning_rate": 3.151399636501773e-06, "loss": 0.4229, "step": 426 }, { "epoch": 4.214153846153846, "grad_norm": 1.28125, "learning_rate": 2.997082988856087e-06, "loss": 0.4504, "step": 428 }, { "epoch": 4.233846153846154, "grad_norm": 1.4921875, "learning_rate": 2.8463989305498596e-06, "loss": 0.428, "step": 430 }, { "epoch": 4.2535384615384615, "grad_norm": 1.3125, "learning_rate": 2.699372338889297e-06, "loss": 0.4399, "step": 432 }, { "epoch": 4.27323076923077, "grad_norm": 1.2890625, "learning_rate": 2.5560274873481975e-06, "loss": 0.4375, "step": 434 }, { "epoch": 4.292923076923077, "grad_norm": 1.1875, "learning_rate": 2.416388041560491e-06, "loss": 0.4231, "step": 436 }, { "epoch": 4.312615384615385, "grad_norm": 1.2578125, "learning_rate": 2.2804770554131686e-06, "loss": 0.4409, "step": 438 }, { "epoch": 4.332307692307692, "grad_norm": 1.25, "learning_rate": 2.1483169672401686e-06, "loss": 0.4693, "step": 440 }, { "epoch": 4.352, "grad_norm": 1.234375, "learning_rate": 2.0199295961178893e-06, "loss": 0.4454, "step": 442 }, { "epoch": 4.3716923076923075, "grad_norm": 1.40625, "learning_rate": 1.895336138262968e-06, "loss": 0.4543, "step": 444 }, { "epoch": 4.391384615384616, "grad_norm": 1.1015625, "learning_rate": 1.7745571635328723e-06, "loss": 0.4302, "step": 446 }, { "epoch": 4.411076923076923, "grad_norm": 1.125, "learning_rate": 1.6576126120299045e-06, "loss": 0.4325, "step": 448 }, { "epoch": 4.430769230769231, "grad_norm": 1.2734375, "learning_rate": 1.5445217908091613e-06, "loss": 0.4406, "step": 450 }, { "epoch": 4.450461538461538, "grad_norm": 1.21875, "learning_rate": 1.4353033706910296e-06, "loss": 0.4631, "step": 452 }, { "epoch": 4.470153846153846, "grad_norm": 1.15625, "learning_rate": 1.3299753831787192e-06, "loss": 0.4466, "step": 454 }, { "epoch": 4.489846153846154, "grad_norm": 1.4140625, "learning_rate": 1.2285552174813225e-06, "loss": 0.4379, "step": 456 }, { "epoch": 4.509538461538462, "grad_norm": 1.171875, "learning_rate": 1.131059617642935e-06, "loss": 0.443, "step": 458 }, { "epoch": 4.529230769230769, "grad_norm": 1.1796875, "learning_rate": 1.0375046797782866e-06, "loss": 0.4793, "step": 460 }, { "epoch": 4.548923076923077, "grad_norm": 1.3046875, "learning_rate": 9.479058494153425e-07, "loss": 0.4512, "step": 462 }, { "epoch": 4.568615384615384, "grad_norm": 1.1328125, "learning_rate": 8.622779189453007e-07, "loss": 0.4558, "step": 464 }, { "epoch": 4.588307692307692, "grad_norm": 1.3125, "learning_rate": 7.806350251804484e-07, "loss": 0.4365, "step": 466 }, { "epoch": 4.608, "grad_norm": 1.3359375, "learning_rate": 7.029906470202046e-07, "loss": 0.4499, "step": 468 }, { "epoch": 4.627692307692308, "grad_norm": 1.421875, "learning_rate": 6.293576032258413e-07, "loss": 0.4228, "step": 470 }, { "epoch": 4.647384615384615, "grad_norm": 1.3828125, "learning_rate": 5.597480503041486e-07, "loss": 0.4443, "step": 472 }, { "epoch": 4.667076923076923, "grad_norm": 1.375, "learning_rate": 4.941734805004289e-07, "loss": 0.4462, "step": 474 }, { "epoch": 4.686769230769231, "grad_norm": 1.140625, "learning_rate": 4.326447199012068e-07, "loss": 0.4136, "step": 476 }, { "epoch": 4.7064615384615385, "grad_norm": 1.078125, "learning_rate": 3.751719266468584e-07, "loss": 0.418, "step": 478 }, { "epoch": 4.726153846153846, "grad_norm": 1.28125, "learning_rate": 3.217645892545695e-07, "loss": 0.437, "step": 480 }, { "epoch": 4.745846153846154, "grad_norm": 1.234375, "learning_rate": 2.724315250518056e-07, "loss": 0.4599, "step": 482 }, { "epoch": 4.765538461538462, "grad_norm": 1.2578125, "learning_rate": 2.271808787206092e-07, "loss": 0.4741, "step": 484 }, { "epoch": 4.785230769230769, "grad_norm": 1.0625, "learning_rate": 1.860201209529483e-07, "loss": 0.454, "step": 486 }, { "epoch": 4.804923076923077, "grad_norm": 1.2421875, "learning_rate": 1.489560472173468e-07, "loss": 0.4625, "step": 488 }, { "epoch": 4.8246153846153845, "grad_norm": 1.5390625, "learning_rate": 1.1599477663696845e-07, "loss": 0.443, "step": 490 }, { "epoch": 4.844307692307693, "grad_norm": 1.109375, "learning_rate": 8.714175097937204e-08, "loss": 0.4617, "step": 492 }, { "epoch": 4.864, "grad_norm": 1.2578125, "learning_rate": 6.240173375811343e-08, "loss": 0.4432, "step": 494 }, { "epoch": 4.883692307692308, "grad_norm": 1.3125, "learning_rate": 4.1778809446302304e-08, "loss": 0.4661, "step": 496 }, { "epoch": 4.903384615384615, "grad_norm": 1.1953125, "learning_rate": 2.5276382802272292e-08, "loss": 0.4307, "step": 498 }, { "epoch": 4.923076923076923, "grad_norm": 1.2890625, "learning_rate": 1.2897178307461067e-08, "loss": 0.4554, "step": 500 }, { "epoch": 4.942769230769231, "grad_norm": 1.140625, "learning_rate": 4.6432397166285e-09, "loss": 0.4637, "step": 502 }, { "epoch": 4.962461538461539, "grad_norm": 1.3515625, "learning_rate": 5.159297204238023e-10, "loss": 0.459, "step": 504 } ], "logging_steps": 2, "max_steps": 505, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4919954461790044e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }