|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.972307692307693, |
|
"eval_steps": 1000, |
|
"global_step": 505, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009846153846153846, |
|
"grad_norm": 38.5, |
|
"learning_rate": 3.125e-06, |
|
"loss": 1.348, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.019692307692307693, |
|
"grad_norm": 28.625, |
|
"learning_rate": 6.25e-06, |
|
"loss": 1.3239, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.039384615384615386, |
|
"grad_norm": 18.375, |
|
"learning_rate": 1.25e-05, |
|
"loss": 1.3144, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.059076923076923075, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 1.4069, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.07876923076923077, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.247, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.09846153846153846, |
|
"grad_norm": 11.25, |
|
"learning_rate": 3.125e-05, |
|
"loss": 1.2618, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11815384615384615, |
|
"grad_norm": 10.25, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 1.287, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.13784615384615384, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 4.375e-05, |
|
"loss": 1.1393, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.15753846153846154, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1422, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.17723076923076922, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 4.9997936302412985e-05, |
|
"loss": 1.0547, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.19692307692307692, |
|
"grad_norm": 7.0, |
|
"learning_rate": 4.9991745550359746e-05, |
|
"loss": 1.0486, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21661538461538463, |
|
"grad_norm": 7.75, |
|
"learning_rate": 4.99814287659075e-05, |
|
"loss": 1.0206, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.2363076923076923, |
|
"grad_norm": 13.0625, |
|
"learning_rate": 4.996698765231409e-05, |
|
"loss": 0.9878, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 4.994842459374682e-05, |
|
"loss": 0.9275, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2756923076923077, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 4.992574265488883e-05, |
|
"loss": 0.9555, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.2953846153846154, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 4.989894558043312e-05, |
|
"loss": 0.9275, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3150769230769231, |
|
"grad_norm": 4.75, |
|
"learning_rate": 4.986803779446432e-05, |
|
"loss": 0.9301, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.33476923076923076, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 4.983302439972829e-05, |
|
"loss": 0.8875, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.35446153846153844, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 4.979391117678969e-05, |
|
"loss": 0.8649, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.37415384615384617, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.975070458307763e-05, |
|
"loss": 0.8497, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.39384615384615385, |
|
"grad_norm": 3.625, |
|
"learning_rate": 4.970341175181956e-05, |
|
"loss": 0.8358, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.4135384615384615, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 4.9652040490863624e-05, |
|
"loss": 0.8191, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.43323076923076925, |
|
"grad_norm": 4.75, |
|
"learning_rate": 4.95965992813896e-05, |
|
"loss": 0.8425, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.45292307692307693, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 4.9537097276508704e-05, |
|
"loss": 0.8027, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4726153846153846, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 4.947354429975245e-05, |
|
"loss": 0.812, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 4.940595084345082e-05, |
|
"loss": 0.7979, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 4.933432806700004e-05, |
|
"loss": 0.7927, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5316923076923077, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 4.925868779502015e-05, |
|
"loss": 0.7773, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5513846153846154, |
|
"grad_norm": 3.625, |
|
"learning_rate": 4.9179042515402926e-05, |
|
"loss": 0.7694, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.571076923076923, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.909540537725007e-05, |
|
"loss": 0.7703, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.5907692307692308, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 4.900779018870239e-05, |
|
"loss": 0.8162, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6104615384615385, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 4.891621141466014e-05, |
|
"loss": 0.743, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6301538461538462, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.882068417439493e-05, |
|
"loss": 0.7572, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6498461538461539, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.872122423905358e-05, |
|
"loss": 0.7445, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6695384615384615, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 4.8617848029054354e-05, |
|
"loss": 0.7419, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.6892307692307692, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 4.851057261137608e-05, |
|
"loss": 0.7402, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7089230769230769, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 4.839941569674041e-05, |
|
"loss": 0.7131, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7286153846153847, |
|
"grad_norm": 4.25, |
|
"learning_rate": 4.8284395636687854e-05, |
|
"loss": 0.6954, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7483076923076923, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 4.816553142054805e-05, |
|
"loss": 0.699, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 4.125, |
|
"learning_rate": 4.804284267230468e-05, |
|
"loss": 0.6775, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.7876923076923077, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 4.791634964735564e-05, |
|
"loss": 0.7056, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8073846153846154, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.778607322916896e-05, |
|
"loss": 0.6944, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.827076923076923, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 4.765203492583502e-05, |
|
"loss": 0.668, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.8467692307692307, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.751425686651568e-05, |
|
"loss": 0.673, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8664615384615385, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.737276179779083e-05, |
|
"loss": 0.7153, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.8861538461538462, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.722757307990302e-05, |
|
"loss": 0.7234, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9058461538461539, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 4.707871468290078e-05, |
|
"loss": 0.6231, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.9255384615384615, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.69262111826813e-05, |
|
"loss": 0.642, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.9452307692307692, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 4.6770087756932995e-05, |
|
"loss": 0.6231, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.9649230769230769, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.661037018097884e-05, |
|
"loss": 0.671, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 4.6447084823520926e-05, |
|
"loss": 0.6657, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0043076923076923, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.62802586422871e-05, |
|
"loss": 0.607, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 4.610991917958037e-05, |
|
"loss": 0.5739, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.0436923076923077, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.593609455773181e-05, |
|
"loss": 0.6011, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.0633846153846154, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 4.5758813474457606e-05, |
|
"loss": 0.5776, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.083076923076923, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.557810519812128e-05, |
|
"loss": 0.5808, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.1027692307692307, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 4.539399956290152e-05, |
|
"loss": 0.5965, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.1224615384615384, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 4.520652696386677e-05, |
|
"loss": 0.608, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.142153846153846, |
|
"grad_norm": 1.625, |
|
"learning_rate": 4.5015718351957015e-05, |
|
"loss": 0.5714, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.1618461538461538, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 4.482160522887403e-05, |
|
"loss": 0.5876, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.1815384615384614, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 4.462421964188052e-05, |
|
"loss": 0.5835, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.2012307692307693, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 4.442359417850924e-05, |
|
"loss": 0.5881, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.220923076923077, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 4.421976196118297e-05, |
|
"loss": 0.5471, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.2406153846153847, |
|
"grad_norm": 1.625, |
|
"learning_rate": 4.401275664174611e-05, |
|
"loss": 0.5417, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.2603076923076924, |
|
"grad_norm": 1.625, |
|
"learning_rate": 4.380261239590892e-05, |
|
"loss": 0.5337, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 4.358936391760524e-05, |
|
"loss": 0.5731, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.2996923076923077, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 4.337304641326467e-05, |
|
"loss": 0.5363, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.3193846153846154, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 4.315369559600018e-05, |
|
"loss": 0.5566, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.339076923076923, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 4.2931347679711924e-05, |
|
"loss": 0.586, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.3587692307692307, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 4.270603937310859e-05, |
|
"loss": 0.5535, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.3784615384615384, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 4.2477807873646845e-05, |
|
"loss": 0.5788, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.398153846153846, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.2246690861390294e-05, |
|
"loss": 0.538, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.417846153846154, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 4.201272649278856e-05, |
|
"loss": 0.5531, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.4375384615384617, |
|
"grad_norm": 1.5, |
|
"learning_rate": 4.177595339437789e-05, |
|
"loss": 0.55, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.4572307692307693, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.153641065640402e-05, |
|
"loss": 0.5333, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 4.129413782636859e-05, |
|
"loss": 0.5372, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.4966153846153847, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.1049174902499974e-05, |
|
"loss": 0.5575, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.5163076923076924, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 4.080156232714976e-05, |
|
"loss": 0.5571, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.055134098011589e-05, |
|
"loss": 0.5246, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.5556923076923077, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 4.0298552171893576e-05, |
|
"loss": 0.5597, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.5753846153846154, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.0043237636855116e-05, |
|
"loss": 0.5536, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.595076923076923, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 3.978543952635967e-05, |
|
"loss": 0.5527, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.6147692307692307, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 3.952520040179434e-05, |
|
"loss": 0.5137, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.6344615384615384, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 3.92625632275474e-05, |
|
"loss": 0.5795, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.654153846153846, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 3.899757136391507e-05, |
|
"loss": 0.5237, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.6738461538461538, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 3.873026855994292e-05, |
|
"loss": 0.5326, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.6935384615384614, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 3.8460698946203054e-05, |
|
"loss": 0.5231, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.7132307692307691, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 3.818890702750841e-05, |
|
"loss": 0.5492, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.7329230769230768, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 3.791493767556511e-05, |
|
"loss": 0.6126, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.7526153846153845, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 3.7638836121564415e-05, |
|
"loss": 0.5463, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.7723076923076924, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 3.7360647948715164e-05, |
|
"loss": 0.515, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 3.708041908471827e-05, |
|
"loss": 0.5259, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.8116923076923077, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 3.679819579418414e-05, |
|
"loss": 0.5059, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.8313846153846154, |
|
"grad_norm": 1.375, |
|
"learning_rate": 3.651402467099468e-05, |
|
"loss": 0.5709, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.851076923076923, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 3.622795263061079e-05, |
|
"loss": 0.5628, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.8707692307692307, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 3.594002690232682e-05, |
|
"loss": 0.5066, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.8904615384615384, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 3.565029502147323e-05, |
|
"loss": 0.5625, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.9101538461538463, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.53588048215687e-05, |
|
"loss": 0.5336, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.929846153846154, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 3.506560442642299e-05, |
|
"loss": 0.5215, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.9495384615384617, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 3.4770742242191945e-05, |
|
"loss": 0.5296, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.9692307692307693, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.4474266949385817e-05, |
|
"loss": 0.523, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.988923076923077, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 3.4176227494832305e-05, |
|
"loss": 0.4856, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.0086153846153847, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 3.387667308359568e-05, |
|
"loss": 0.5298, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.0283076923076924, |
|
"grad_norm": 1.375, |
|
"learning_rate": 3.3575653170853175e-05, |
|
"loss": 0.4869, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.048, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 3.327321745373021e-05, |
|
"loss": 0.479, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.0676923076923077, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 3.2969415863095556e-05, |
|
"loss": 0.4935, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.0873846153846154, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.266429855531797e-05, |
|
"loss": 0.4773, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.107076923076923, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 3.2357915903985605e-05, |
|
"loss": 0.4611, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.1267692307692307, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 3.2050318491589506e-05, |
|
"loss": 0.469, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.1464615384615384, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.174155710117271e-05, |
|
"loss": 0.4758, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.166153846153846, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 3.143168270794612e-05, |
|
"loss": 0.4933, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.1858461538461538, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.112074647087274e-05, |
|
"loss": 0.4814, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.2055384615384614, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 3.080879972422154e-05, |
|
"loss": 0.5064, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.225230769230769, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 3.0495893969092392e-05, |
|
"loss": 0.4576, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.244923076923077, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 3.0182080864913452e-05, |
|
"loss": 0.4902, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.2646153846153845, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 2.9867412220912373e-05, |
|
"loss": 0.4486, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.284307692307692, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 2.9551939987562866e-05, |
|
"loss": 0.4786, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.304, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 2.923571624800787e-05, |
|
"loss": 0.4814, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.3236923076923075, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 2.891879320946086e-05, |
|
"loss": 0.4915, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.3433846153846156, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 2.8601223194586612e-05, |
|
"loss": 0.4931, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.363076923076923, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.8283058632863003e-05, |
|
"loss": 0.481, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.382769230769231, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 2.7964352051925103e-05, |
|
"loss": 0.4458, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.4024615384615386, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.7645156068893073e-05, |
|
"loss": 0.499, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.4221538461538463, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.732552338168531e-05, |
|
"loss": 0.4937, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.441846153846154, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 2.7005506760318235e-05, |
|
"loss": 0.4628, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 2.66851590381942e-05, |
|
"loss": 0.4741, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.4812307692307694, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 2.6364533103378896e-05, |
|
"loss": 0.4569, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.500923076923077, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 2.604368188986977e-05, |
|
"loss": 0.4851, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.5206153846153847, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.5722658368856816e-05, |
|
"loss": 0.4935, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.5403076923076924, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 2.5401515539977305e-05, |
|
"loss": 0.4947, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 2.5080306422565707e-05, |
|
"loss": 0.4642, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.5796923076923077, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 2.4759084046900486e-05, |
|
"loss": 0.5064, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.5993846153846154, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 2.4437901445448936e-05, |
|
"loss": 0.4376, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.619076923076923, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 2.4116811644111852e-05, |
|
"loss": 0.4861, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.6387692307692308, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 2.379586765346907e-05, |
|
"loss": 0.4878, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.6584615384615384, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 2.347512246002774e-05, |
|
"loss": 0.4827, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.678153846153846, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 2.3154629017474384e-05, |
|
"loss": 0.4769, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.697846153846154, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 2.2834440237932536e-05, |
|
"loss": 0.5063, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.7175384615384615, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 2.251460898322712e-05, |
|
"loss": 0.4483, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.737230769230769, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 2.219518805615724e-05, |
|
"loss": 0.4855, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.756923076923077, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 2.1876230191778598e-05, |
|
"loss": 0.4663, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.7766153846153845, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 2.155778804869721e-05, |
|
"loss": 0.5065, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.796307692307692, |
|
"grad_norm": 1.25, |
|
"learning_rate": 2.123991420037565e-05, |
|
"loss": 0.4757, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.816, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 2.0922661126453432e-05, |
|
"loss": 0.4768, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.835692307692308, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 2.0606081204082797e-05, |
|
"loss": 0.4383, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.855384615384615, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.02902266992815e-05, |
|
"loss": 0.4976, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.8750769230769233, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 1.9975149758303883e-05, |
|
"loss": 0.4871, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.8947692307692305, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 1.9660902399031782e-05, |
|
"loss": 0.4807, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.9144615384615387, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.9347536502386553e-05, |
|
"loss": 0.4544, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.934153846153846, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.9035103803763792e-05, |
|
"loss": 0.4924, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.953846153846154, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.8723655884491982e-05, |
|
"loss": 0.4846, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.9735384615384617, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 1.8413244163316696e-05, |
|
"loss": 0.4921, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.9932307692307694, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.8103919887911526e-05, |
|
"loss": 0.4728, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 3.012923076923077, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 1.7795734126417326e-05, |
|
"loss": 0.4531, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 3.0326153846153847, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.7488737759011105e-05, |
|
"loss": 0.4468, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 3.0523076923076924, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.718298146950585e-05, |
|
"loss": 0.4727, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.072, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.6878515736982915e-05, |
|
"loss": 0.4429, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 3.0916923076923077, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 1.657539082745811e-05, |
|
"loss": 0.4304, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 3.1113846153846154, |
|
"grad_norm": 1.125, |
|
"learning_rate": 1.6273656785582986e-05, |
|
"loss": 0.4814, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 3.131076923076923, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.597336342638266e-05, |
|
"loss": 0.411, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 3.1507692307692308, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.5674560327031613e-05, |
|
"loss": 0.4318, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.1704615384615384, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.5377296818668638e-05, |
|
"loss": 0.4685, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 3.190153846153846, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 1.5081621978252548e-05, |
|
"loss": 0.423, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 3.209846153846154, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.47875846204597e-05, |
|
"loss": 0.4587, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 3.2295384615384615, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 1.449523328962496e-05, |
|
"loss": 0.4341, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 3.249230769230769, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.420461625172721e-05, |
|
"loss": 0.4596, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.268923076923077, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.3915781486420848e-05, |
|
"loss": 0.4357, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 3.2886153846153845, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 1.3628776679114517e-05, |
|
"loss": 0.4672, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 3.308307692307692, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.3343649213098486e-05, |
|
"loss": 0.4494, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 3.328, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.3060446161721855e-05, |
|
"loss": 0.4619, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 3.3476923076923075, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 1.277921428062091e-05, |
|
"loss": 0.4561, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.367384615384615, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 1.2500000000000006e-05, |
|
"loss": 0.4275, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 3.387076923076923, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.2222849416966117e-05, |
|
"loss": 0.4704, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 3.406769230769231, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.1947808287918404e-05, |
|
"loss": 0.4283, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 3.4264615384615382, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 1.1674922020994022e-05, |
|
"loss": 0.4346, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 3.4461538461538463, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1.14042356685714e-05, |
|
"loss": 0.4613, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.465846153846154, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.1135793919832336e-05, |
|
"loss": 0.4634, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 3.4855384615384617, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.0869641093383962e-05, |
|
"loss": 0.4702, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 3.5052307692307694, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 1.0605821129941934e-05, |
|
"loss": 0.458, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 3.524923076923077, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.0344377585075998e-05, |
|
"loss": 0.4286, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 3.5446153846153847, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 1.0085353622019175e-05, |
|
"loss": 0.46, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.5643076923076924, |
|
"grad_norm": 1.25, |
|
"learning_rate": 9.82879200454167e-06, |
|
"loss": 0.4323, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 3.584, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.574735089890766e-06, |
|
"loss": 0.4452, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 3.6036923076923078, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 9.323224821797782e-06, |
|
"loss": 0.4605, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 3.6233846153846154, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 9.074302723553398e-06, |
|
"loss": 0.4871, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 3.643076923076923, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 8.8280098911523e-06, |
|
"loss": 0.4801, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.6627692307692308, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 8.584386986508388e-06, |
|
"loss": 0.4666, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 3.6824615384615385, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 8.343474230741715e-06, |
|
"loss": 0.4404, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.702153846153846, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 8.105311397538085e-06, |
|
"loss": 0.4526, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 3.721846153846154, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 7.869937806582642e-06, |
|
"loss": 0.4433, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 3.7415384615384615, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 7.63739231706833e-06, |
|
"loss": 0.4287, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.761230769230769, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 7.407713321280377e-06, |
|
"loss": 0.465, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 3.780923076923077, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 7.180938738257944e-06, |
|
"loss": 0.445, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 3.8006153846153845, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 6.957106007533826e-06, |
|
"loss": 0.4544, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 3.820307692307692, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 6.736252082953307e-06, |
|
"loss": 0.4508, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 6.5184134265733e-06, |
|
"loss": 0.4575, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.8596923076923075, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 6.303626002642554e-06, |
|
"loss": 0.4432, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.879384615384615, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 6.091925271664156e-06, |
|
"loss": 0.4614, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 3.8990769230769233, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 5.883346184541128e-06, |
|
"loss": 0.4645, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.9187692307692306, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 5.67792317680616e-06, |
|
"loss": 0.4533, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 3.9384615384615387, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 5.475690162936489e-06, |
|
"loss": 0.4232, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.958153846153846, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 5.27668053075474e-06, |
|
"loss": 0.4266, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 3.977846153846154, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 5.0809271359167215e-06, |
|
"loss": 0.4529, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 3.9975384615384613, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 4.888462296487128e-06, |
|
"loss": 0.4429, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 4.017230769230769, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.699317787603927e-06, |
|
"loss": 0.4537, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 4.036923076923077, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.513524836232458e-06, |
|
"loss": 0.4659, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 4.056615384615385, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.331114116009938e-06, |
|
"loss": 0.4156, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 4.076307692307692, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 4.152115742181434e-06, |
|
"loss": 0.4561, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 4.096, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 3.97655926662791e-06, |
|
"loss": 0.4438, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 4.115692307692307, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 3.80447367298738e-06, |
|
"loss": 0.4331, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 4.135384615384615, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 3.6358873718697726e-06, |
|
"loss": 0.4261, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 4.155076923076923, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 3.470828196166523e-06, |
|
"loss": 0.4629, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 4.174769230769231, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 3.3093233964554466e-06, |
|
"loss": 0.4271, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 4.194461538461539, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 3.151399636501773e-06, |
|
"loss": 0.4229, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 4.214153846153846, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.997082988856087e-06, |
|
"loss": 0.4504, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 4.233846153846154, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.8463989305498596e-06, |
|
"loss": 0.428, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 4.2535384615384615, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 2.699372338889297e-06, |
|
"loss": 0.4399, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 4.27323076923077, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 2.5560274873481975e-06, |
|
"loss": 0.4375, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 4.292923076923077, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 2.416388041560491e-06, |
|
"loss": 0.4231, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 4.312615384615385, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 2.2804770554131686e-06, |
|
"loss": 0.4409, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 4.332307692307692, |
|
"grad_norm": 1.25, |
|
"learning_rate": 2.1483169672401686e-06, |
|
"loss": 0.4693, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 4.352, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.0199295961178893e-06, |
|
"loss": 0.4454, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 4.3716923076923075, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.895336138262968e-06, |
|
"loss": 0.4543, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 4.391384615384616, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.7745571635328723e-06, |
|
"loss": 0.4302, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 4.411076923076923, |
|
"grad_norm": 1.125, |
|
"learning_rate": 1.6576126120299045e-06, |
|
"loss": 0.4325, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 4.430769230769231, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.5445217908091613e-06, |
|
"loss": 0.4406, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 4.450461538461538, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 1.4353033706910296e-06, |
|
"loss": 0.4631, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 4.470153846153846, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 1.3299753831787192e-06, |
|
"loss": 0.4466, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 4.489846153846154, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 1.2285552174813225e-06, |
|
"loss": 0.4379, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 4.509538461538462, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 1.131059617642935e-06, |
|
"loss": 0.443, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 4.529230769230769, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 1.0375046797782866e-06, |
|
"loss": 0.4793, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 4.548923076923077, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 9.479058494153425e-07, |
|
"loss": 0.4512, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 4.568615384615384, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 8.622779189453007e-07, |
|
"loss": 0.4558, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 4.588307692307692, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 7.806350251804484e-07, |
|
"loss": 0.4365, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 4.608, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 7.029906470202046e-07, |
|
"loss": 0.4499, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 4.627692307692308, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 6.293576032258413e-07, |
|
"loss": 0.4228, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 4.647384615384615, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 5.597480503041486e-07, |
|
"loss": 0.4443, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 4.667076923076923, |
|
"grad_norm": 1.375, |
|
"learning_rate": 4.941734805004289e-07, |
|
"loss": 0.4462, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 4.686769230769231, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.326447199012068e-07, |
|
"loss": 0.4136, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 4.7064615384615385, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 3.751719266468584e-07, |
|
"loss": 0.418, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 4.726153846153846, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 3.217645892545695e-07, |
|
"loss": 0.437, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 4.745846153846154, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.724315250518056e-07, |
|
"loss": 0.4599, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 4.765538461538462, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 2.271808787206092e-07, |
|
"loss": 0.4741, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 4.785230769230769, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.860201209529483e-07, |
|
"loss": 0.454, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 4.804923076923077, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 1.489560472173468e-07, |
|
"loss": 0.4625, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 4.8246153846153845, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.1599477663696845e-07, |
|
"loss": 0.443, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 4.844307692307693, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 8.714175097937204e-08, |
|
"loss": 0.4617, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 4.864, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 6.240173375811343e-08, |
|
"loss": 0.4432, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 4.883692307692308, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 4.1778809446302304e-08, |
|
"loss": 0.4661, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 4.903384615384615, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 2.5276382802272292e-08, |
|
"loss": 0.4307, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.2897178307461067e-08, |
|
"loss": 0.4554, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.942769230769231, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.6432397166285e-09, |
|
"loss": 0.4637, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 4.962461538461539, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 5.159297204238023e-10, |
|
"loss": 0.459, |
|
"step": 504 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 505, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.4919954461790044e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|