|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.944622991347343, |
|
"eval_steps": 5000, |
|
"global_step": 20001, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00024721878862793575, |
|
"grad_norm": 1448.0, |
|
"learning_rate": 2.0000000000000002e-07, |
|
"loss": 13.9716, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006180469715698393, |
|
"grad_norm": 334.0, |
|
"learning_rate": 5e-06, |
|
"loss": 9.5244, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.012360939431396786, |
|
"grad_norm": 78.5, |
|
"learning_rate": 1e-05, |
|
"loss": 7.5535, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.018541409147095178, |
|
"grad_norm": 34.5, |
|
"learning_rate": 1.5e-05, |
|
"loss": 7.0022, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.024721878862793572, |
|
"grad_norm": 18.75, |
|
"learning_rate": 2e-05, |
|
"loss": 6.4562, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.030902348578491966, |
|
"grad_norm": 63.75, |
|
"learning_rate": 2.5e-05, |
|
"loss": 5.6999, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.037082818294190356, |
|
"grad_norm": 29.125, |
|
"learning_rate": 3e-05, |
|
"loss": 5.2536, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04326328800988875, |
|
"grad_norm": 29.5, |
|
"learning_rate": 3.5e-05, |
|
"loss": 4.6767, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.049443757725587144, |
|
"grad_norm": 20.25, |
|
"learning_rate": 4e-05, |
|
"loss": 4.2913, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05562422744128554, |
|
"grad_norm": 16.625, |
|
"learning_rate": 4.5e-05, |
|
"loss": 3.7505, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.06180469715698393, |
|
"grad_norm": 17.375, |
|
"learning_rate": 5e-05, |
|
"loss": 3.3317, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06798516687268233, |
|
"grad_norm": 112.0, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 4.1318, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.07416563658838071, |
|
"grad_norm": 31.75, |
|
"learning_rate": 6e-05, |
|
"loss": 3.558, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08034610630407911, |
|
"grad_norm": 13.3125, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 2.9825, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.0865265760197775, |
|
"grad_norm": 12.875, |
|
"learning_rate": 7e-05, |
|
"loss": 2.6931, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09270704573547589, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 2.4328, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.09888751545117429, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 8e-05, |
|
"loss": 2.285, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10506798516687268, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 8.5e-05, |
|
"loss": 2.1667, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.11124845488257108, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 9e-05, |
|
"loss": 2.0534, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.11742892459826947, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 9.5e-05, |
|
"loss": 1.9744, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.12360939431396786, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.9209, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12978986402966625, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 9.987179487179488e-05, |
|
"loss": 1.8793, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.13597033374536466, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 9.974358974358975e-05, |
|
"loss": 1.8301, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.14215080346106304, |
|
"grad_norm": 2.25, |
|
"learning_rate": 9.961538461538463e-05, |
|
"loss": 1.7935, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.14833127317676142, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 9.948717948717949e-05, |
|
"loss": 1.7651, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.15451174289245984, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.935897435897437e-05, |
|
"loss": 1.737, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.16069221260815822, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 9.923076923076923e-05, |
|
"loss": 1.7098, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1668726823238566, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 9.910256410256411e-05, |
|
"loss": 1.69, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.173053152039555, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 9.897435897435898e-05, |
|
"loss": 1.6622, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.1792336217552534, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 9.884615384615386e-05, |
|
"loss": 1.6502, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.18541409147095178, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 9.871794871794872e-05, |
|
"loss": 1.629, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.1915945611866502, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 9.85897435897436e-05, |
|
"loss": 1.6102, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.19777503090234858, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 9.846153846153848e-05, |
|
"loss": 1.5991, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.20395550061804696, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 9.833333333333333e-05, |
|
"loss": 1.5884, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.21013597033374537, |
|
"grad_norm": 1.5, |
|
"learning_rate": 9.820512820512821e-05, |
|
"loss": 1.5704, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.21631644004944375, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 9.807692307692307e-05, |
|
"loss": 1.5573, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.22249690976514216, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.794871794871795e-05, |
|
"loss": 1.5416, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.22867737948084055, |
|
"grad_norm": 1.25, |
|
"learning_rate": 9.782051282051282e-05, |
|
"loss": 1.5409, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.23485784919653893, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 9.76923076923077e-05, |
|
"loss": 1.5344, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.24103831891223734, |
|
"grad_norm": 1.5, |
|
"learning_rate": 9.756410256410257e-05, |
|
"loss": 1.5195, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.24721878862793573, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 9.743589743589744e-05, |
|
"loss": 1.5051, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.25339925834363414, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 9.730769230769232e-05, |
|
"loss": 1.505, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.2595797280593325, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 9.717948717948718e-05, |
|
"loss": 1.4932, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.2657601977750309, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 9.705128205128206e-05, |
|
"loss": 1.4901, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.2719406674907293, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 9.692307692307692e-05, |
|
"loss": 1.4806, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.27812113720642767, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 9.67948717948718e-05, |
|
"loss": 1.4745, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.2843016069221261, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.666666666666667e-05, |
|
"loss": 1.4604, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.2904820766378245, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 9.653846153846155e-05, |
|
"loss": 1.4584, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.29666254635352285, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 9.641025641025641e-05, |
|
"loss": 1.4489, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.30284301606922126, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 9.628205128205129e-05, |
|
"loss": 1.4439, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.30902348578491967, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 9.615384615384617e-05, |
|
"loss": 1.4367, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.315203955500618, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 9.602564102564103e-05, |
|
"loss": 1.4392, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.32138442521631644, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 9.589743589743591e-05, |
|
"loss": 1.4281, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.32756489493201485, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 9.576923076923078e-05, |
|
"loss": 1.4251, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.3337453646477132, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 9.564102564102565e-05, |
|
"loss": 1.4222, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.3399258343634116, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 9.551282051282052e-05, |
|
"loss": 1.4259, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.34610630407911, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 9.53846153846154e-05, |
|
"loss": 1.4154, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3522867737948084, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.525641025641026e-05, |
|
"loss": 1.4074, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.3584672435105068, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 9.512820512820513e-05, |
|
"loss": 1.4051, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.3646477132262052, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 9.5e-05, |
|
"loss": 1.4007, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.37082818294190356, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 9.487179487179487e-05, |
|
"loss": 1.389, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.377008652657602, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 9.474358974358975e-05, |
|
"loss": 1.3924, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.3831891223733004, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 9.461538461538461e-05, |
|
"loss": 1.3868, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.38936959208899874, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 9.448717948717949e-05, |
|
"loss": 1.3889, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.39555006180469715, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 9.435897435897436e-05, |
|
"loss": 1.3825, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.40173053152039556, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 9.423076923076924e-05, |
|
"loss": 1.3709, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.4079110012360939, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 9.41025641025641e-05, |
|
"loss": 1.3719, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.41409147095179233, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 9.397435897435898e-05, |
|
"loss": 1.3718, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.42027194066749074, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.384615384615386e-05, |
|
"loss": 1.3673, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4264524103831891, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 9.371794871794872e-05, |
|
"loss": 1.3629, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.4326328800988875, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 9.35897435897436e-05, |
|
"loss": 1.3616, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.4388133498145859, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 9.346153846153846e-05, |
|
"loss": 1.3599, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.44499381953028433, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 9.333333333333334e-05, |
|
"loss": 1.3516, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.4511742892459827, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 9.320512820512821e-05, |
|
"loss": 1.3552, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.4573547589616811, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 9.307692307692309e-05, |
|
"loss": 1.3448, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.4635352286773795, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 9.294871794871795e-05, |
|
"loss": 1.3492, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.46971569839307786, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 9.282051282051283e-05, |
|
"loss": 1.34, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.4758961681087763, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 9.26923076923077e-05, |
|
"loss": 1.3406, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.4820766378244747, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 9.256410256410257e-05, |
|
"loss": 1.3369, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.48825710754017304, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 9.243589743589745e-05, |
|
"loss": 1.3297, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.49443757725587145, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 9.230769230769232e-05, |
|
"loss": 1.3347, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5006180469715699, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 9.217948717948718e-05, |
|
"loss": 1.3269, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.5067985166872683, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 9.205128205128205e-05, |
|
"loss": 1.3316, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.5129789864029666, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 9.192307692307692e-05, |
|
"loss": 1.3212, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.519159456118665, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 9.179487179487179e-05, |
|
"loss": 1.3177, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5253399258343634, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 9.166666666666667e-05, |
|
"loss": 1.3155, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.5315203955500618, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 9.153846153846155e-05, |
|
"loss": 1.3206, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.5377008652657602, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 9.141025641025641e-05, |
|
"loss": 1.3131, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.5438813349814586, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 9.128205128205129e-05, |
|
"loss": 1.3143, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5500618046971569, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 9.115384615384615e-05, |
|
"loss": 1.3099, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.5562422744128553, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 9.102564102564103e-05, |
|
"loss": 1.3104, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.5624227441285538, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 9.08974358974359e-05, |
|
"loss": 1.3026, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.5686032138442522, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 9.076923076923078e-05, |
|
"loss": 1.2985, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5747836835599506, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.064102564102564e-05, |
|
"loss": 1.3109, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.580964153275649, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 9.051282051282052e-05, |
|
"loss": 1.3061, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.5871446229913473, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 9.038461538461538e-05, |
|
"loss": 1.3022, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.5933250927070457, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 9.025641025641026e-05, |
|
"loss": 1.3006, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.5995055624227441, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 9.012820512820514e-05, |
|
"loss": 1.2938, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.6056860321384425, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 9e-05, |
|
"loss": 1.2968, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.6118665018541409, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 8.987179487179488e-05, |
|
"loss": 1.2872, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.6180469715698393, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 8.974358974358975e-05, |
|
"loss": 1.2902, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6242274412855378, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 8.961538461538463e-05, |
|
"loss": 1.2848, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.630407911001236, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 8.948717948717949e-05, |
|
"loss": 1.2902, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.6365883807169345, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.935897435897437e-05, |
|
"loss": 1.2917, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.6427688504326329, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 8.923076923076924e-05, |
|
"loss": 1.2882, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6489493201483313, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 8.910256410256411e-05, |
|
"loss": 1.277, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.6551297898640297, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 8.897435897435898e-05, |
|
"loss": 1.2832, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.6613102595797281, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 8.884615384615384e-05, |
|
"loss": 1.2822, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.6674907292954264, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 8.871794871794872e-05, |
|
"loss": 1.2809, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.6736711990111248, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.858974358974359e-05, |
|
"loss": 1.2696, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.6798516687268232, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 8.846153846153847e-05, |
|
"loss": 1.2768, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.6860321384425216, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 8.833333333333333e-05, |
|
"loss": 1.2735, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.69221260815822, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 8.820512820512821e-05, |
|
"loss": 1.2743, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.6983930778739185, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 8.807692307692307e-05, |
|
"loss": 1.2766, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.7045735475896168, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 8.794871794871795e-05, |
|
"loss": 1.2693, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.7107540173053152, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 8.782051282051283e-05, |
|
"loss": 1.263, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.7169344870210136, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 8.76923076923077e-05, |
|
"loss": 1.2618, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.723114956736712, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.756410256410257e-05, |
|
"loss": 1.2626, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.7292954264524104, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 8.743589743589744e-05, |
|
"loss": 1.2644, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.7354758961681088, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 8.730769230769232e-05, |
|
"loss": 1.2623, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.7416563658838071, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.717948717948718e-05, |
|
"loss": 1.2633, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7478368355995055, |
|
"grad_norm": 0.75, |
|
"learning_rate": 8.705128205128206e-05, |
|
"loss": 1.261, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.754017305315204, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 8.692307692307692e-05, |
|
"loss": 1.259, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.7601977750309024, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 8.67948717948718e-05, |
|
"loss": 1.2546, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.7663782447466008, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 8.666666666666667e-05, |
|
"loss": 1.2551, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.7725587144622992, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 8.653846153846155e-05, |
|
"loss": 1.2565, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.7787391841779975, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 8.641025641025642e-05, |
|
"loss": 1.2559, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.7849196538936959, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 8.628205128205129e-05, |
|
"loss": 1.253, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.7911001236093943, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 8.615384615384617e-05, |
|
"loss": 1.255, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.7972805933250927, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 8.602564102564103e-05, |
|
"loss": 1.2486, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.8034610630407911, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 8.58974358974359e-05, |
|
"loss": 1.24, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.8096415327564895, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 8.576923076923076e-05, |
|
"loss": 1.2466, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.8158220024721878, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 8.564102564102564e-05, |
|
"loss": 1.2455, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.8220024721878862, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 8.551282051282052e-05, |
|
"loss": 1.2409, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.8281829419035847, |
|
"grad_norm": 0.875, |
|
"learning_rate": 8.538461538461538e-05, |
|
"loss": 1.237, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.8343634116192831, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 8.525641025641026e-05, |
|
"loss": 1.2437, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.8405438813349815, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 8.512820512820513e-05, |
|
"loss": 1.2388, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.8467243510506799, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 8.5e-05, |
|
"loss": 1.238, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.8529048207663782, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 8.487179487179487e-05, |
|
"loss": 1.231, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.8590852904820766, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 8.474358974358975e-05, |
|
"loss": 1.2381, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.865265760197775, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 8.461538461538461e-05, |
|
"loss": 1.2305, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.8714462299134734, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 8.448717948717949e-05, |
|
"loss": 1.2326, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.8776266996291718, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 8.435897435897436e-05, |
|
"loss": 1.2288, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.8838071693448702, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 8.423076923076924e-05, |
|
"loss": 1.2268, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.8899876390605687, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 8.410256410256411e-05, |
|
"loss": 1.2314, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.896168108776267, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 8.397435897435898e-05, |
|
"loss": 1.2292, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.9023485784919654, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 8.384615384615386e-05, |
|
"loss": 1.2304, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.9085290482076638, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 8.371794871794872e-05, |
|
"loss": 1.2256, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.9147095179233622, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 8.35897435897436e-05, |
|
"loss": 1.2268, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.9208899876390606, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 8.346153846153847e-05, |
|
"loss": 1.2287, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.927070457354759, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.2252, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.9332509270704573, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 8.320512820512821e-05, |
|
"loss": 1.225, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.9394313967861557, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 8.307692307692309e-05, |
|
"loss": 1.2183, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.9456118665018541, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 8.294871794871795e-05, |
|
"loss": 1.2244, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.9517923362175525, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 8.282051282051283e-05, |
|
"loss": 1.2194, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.957972805933251, |
|
"grad_norm": 0.75, |
|
"learning_rate": 8.26923076923077e-05, |
|
"loss": 1.2162, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.9641532756489494, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 8.256410256410256e-05, |
|
"loss": 1.2146, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.9703337453646477, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 8.243589743589744e-05, |
|
"loss": 1.2144, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.9765142150803461, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 8.23076923076923e-05, |
|
"loss": 1.2146, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.9826946847960445, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 8.217948717948718e-05, |
|
"loss": 1.2151, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.9888751545117429, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 8.205128205128205e-05, |
|
"loss": 1.2146, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.9950556242274413, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 8.192307692307693e-05, |
|
"loss": 1.211, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 1.0012360939431397, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 8.179487179487179e-05, |
|
"loss": 1.198, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.0074165636588381, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 8.166666666666667e-05, |
|
"loss": 1.1373, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 1.0135970333745365, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 8.153846153846155e-05, |
|
"loss": 1.1436, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.019777503090235, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 8.141025641025641e-05, |
|
"loss": 1.138, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 1.0259579728059331, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 8.128205128205129e-05, |
|
"loss": 1.1334, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.0321384425216316, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 8.115384615384616e-05, |
|
"loss": 1.1423, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 1.03831891223733, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 8.102564102564103e-05, |
|
"loss": 1.1398, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.0444993819530284, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 8.08974358974359e-05, |
|
"loss": 1.1378, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 1.0506798516687268, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 8.076923076923078e-05, |
|
"loss": 1.1376, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.0568603213844252, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 8.064102564102564e-05, |
|
"loss": 1.1443, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 1.0630407911001236, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 8.051282051282052e-05, |
|
"loss": 1.1441, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.069221260815822, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.038461538461538e-05, |
|
"loss": 1.1435, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 1.0754017305315204, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 8.025641025641026e-05, |
|
"loss": 1.1449, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.0815822002472189, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 8.012820512820514e-05, |
|
"loss": 1.1468, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 1.0877626699629173, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 8e-05, |
|
"loss": 1.1402, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.0939431396786157, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 7.987179487179488e-05, |
|
"loss": 1.1405, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 1.100123609394314, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 7.974358974358975e-05, |
|
"loss": 1.1441, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.1063040791100123, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 7.961538461538461e-05, |
|
"loss": 1.1486, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 1.1124845488257107, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 7.948717948717948e-05, |
|
"loss": 1.1439, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.118665018541409, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 7.935897435897436e-05, |
|
"loss": 1.1408, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 1.1248454882571075, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 7.923076923076924e-05, |
|
"loss": 1.1381, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.131025957972806, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 7.91025641025641e-05, |
|
"loss": 1.1441, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 1.1372064276885043, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 7.897435897435898e-05, |
|
"loss": 1.1377, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.1433868974042027, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.884615384615384e-05, |
|
"loss": 1.1405, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 1.1495673671199012, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 7.871794871794872e-05, |
|
"loss": 1.146, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.1557478368355996, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 7.858974358974359e-05, |
|
"loss": 1.1416, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 1.161928306551298, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 7.846153846153847e-05, |
|
"loss": 1.1352, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.1681087762669964, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 7.833333333333333e-05, |
|
"loss": 1.1398, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 1.1742892459826946, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 7.820512820512821e-05, |
|
"loss": 1.1422, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.180469715698393, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 7.807692307692307e-05, |
|
"loss": 1.1416, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 1.1866501854140914, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 7.794871794871795e-05, |
|
"loss": 1.1416, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.1928306551297898, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 7.782051282051283e-05, |
|
"loss": 1.1387, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 1.1990111248454882, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 7.76923076923077e-05, |
|
"loss": 1.1372, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.2051915945611866, |
|
"grad_norm": 0.75, |
|
"learning_rate": 7.756410256410257e-05, |
|
"loss": 1.1381, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 1.211372064276885, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 7.743589743589744e-05, |
|
"loss": 1.1342, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.2175525339925835, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 7.730769230769232e-05, |
|
"loss": 1.1355, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 1.2237330037082819, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 7.717948717948718e-05, |
|
"loss": 1.1406, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 1.2299134734239803, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.705128205128206e-05, |
|
"loss": 1.1411, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 1.2360939431396787, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 7.692307692307693e-05, |
|
"loss": 1.135, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.2360939431396787, |
|
"eval_loss": 1.048352837562561, |
|
"eval_runtime": 1.5386, |
|
"eval_samples_per_second": 415.323, |
|
"eval_steps_per_second": 1.95, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.242274412855377, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 7.67948717948718e-05, |
|
"loss": 1.1348, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 1.2484548825710755, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 7.666666666666667e-05, |
|
"loss": 1.1338, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.254635352286774, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 7.653846153846153e-05, |
|
"loss": 1.1291, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 1.260815822002472, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.641025641025641e-05, |
|
"loss": 1.1325, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.2669962917181705, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 7.628205128205128e-05, |
|
"loss": 1.1359, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 1.273176761433869, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.615384615384616e-05, |
|
"loss": 1.1337, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.2793572311495673, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 7.602564102564102e-05, |
|
"loss": 1.132, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 1.2855377008652658, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 7.58974358974359e-05, |
|
"loss": 1.1308, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.2917181705809642, |
|
"grad_norm": 0.75, |
|
"learning_rate": 7.576923076923076e-05, |
|
"loss": 1.1291, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 1.2978986402966626, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 7.564102564102564e-05, |
|
"loss": 1.1248, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.304079110012361, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.551282051282052e-05, |
|
"loss": 1.1262, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 1.3102595797280594, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.538461538461539e-05, |
|
"loss": 1.1246, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.3164400494437576, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 7.525641025641026e-05, |
|
"loss": 1.1234, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 1.322620519159456, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 7.512820512820513e-05, |
|
"loss": 1.1252, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.3288009888751544, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.1288, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 1.3349814585908528, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 7.487179487179487e-05, |
|
"loss": 1.1227, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.3411619283065512, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 7.474358974358975e-05, |
|
"loss": 1.1233, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 1.3473423980222496, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 7.461538461538462e-05, |
|
"loss": 1.1233, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.353522867737948, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 7.44871794871795e-05, |
|
"loss": 1.1248, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 1.3597033374536465, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 7.435897435897436e-05, |
|
"loss": 1.1264, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.3658838071693449, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 7.423076923076924e-05, |
|
"loss": 1.1215, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 1.3720642768850433, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 7.410256410256412e-05, |
|
"loss": 1.1237, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.3782447466007417, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 7.397435897435898e-05, |
|
"loss": 1.1212, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 1.38442521631644, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 7.384615384615386e-05, |
|
"loss": 1.118, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.3906056860321385, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 7.371794871794872e-05, |
|
"loss": 1.1163, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 1.396786155747837, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 7.35897435897436e-05, |
|
"loss": 1.1168, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.4029666254635353, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 7.346153846153847e-05, |
|
"loss": 1.118, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 1.4091470951792338, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 7.333333333333333e-05, |
|
"loss": 1.1153, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.415327564894932, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 7.320512820512821e-05, |
|
"loss": 1.1237, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 1.4215080346106304, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 7.307692307692307e-05, |
|
"loss": 1.1225, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 1.4276885043263288, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 7.294871794871795e-05, |
|
"loss": 1.1224, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 1.4338689740420272, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 7.282051282051282e-05, |
|
"loss": 1.1187, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.4400494437577256, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 7.26923076923077e-05, |
|
"loss": 1.1186, |
|
"step": 5825 |
|
}, |
|
{ |
|
"epoch": 1.446229913473424, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 7.256410256410256e-05, |
|
"loss": 1.1164, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 1.4524103831891224, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 7.243589743589744e-05, |
|
"loss": 1.1194, |
|
"step": 5875 |
|
}, |
|
{ |
|
"epoch": 1.4585908529048208, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.23076923076923e-05, |
|
"loss": 1.1164, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.4647713226205192, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 7.217948717948718e-05, |
|
"loss": 1.1159, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 1.4709517923362174, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 7.205128205128205e-05, |
|
"loss": 1.1184, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 1.4771322620519158, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 7.192307692307693e-05, |
|
"loss": 1.1136, |
|
"step": 5975 |
|
}, |
|
{ |
|
"epoch": 1.4833127317676142, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 7.17948717948718e-05, |
|
"loss": 1.1086, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.4894932014833127, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.166666666666667e-05, |
|
"loss": 1.1098, |
|
"step": 6025 |
|
}, |
|
{ |
|
"epoch": 1.495673671199011, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 7.153846153846155e-05, |
|
"loss": 1.1132, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 1.5018541409147095, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 7.141025641025641e-05, |
|
"loss": 1.1044, |
|
"step": 6075 |
|
}, |
|
{ |
|
"epoch": 1.508034610630408, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 7.128205128205129e-05, |
|
"loss": 1.1081, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.5142150803461063, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 7.115384615384616e-05, |
|
"loss": 1.1109, |
|
"step": 6125 |
|
}, |
|
{ |
|
"epoch": 1.5203955500618047, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.102564102564103e-05, |
|
"loss": 1.1085, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 1.5265760197775031, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 7.08974358974359e-05, |
|
"loss": 1.1143, |
|
"step": 6175 |
|
}, |
|
{ |
|
"epoch": 1.5327564894932015, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 7.076923076923078e-05, |
|
"loss": 1.106, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.5389369592089, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 7.064102564102564e-05, |
|
"loss": 1.1094, |
|
"step": 6225 |
|
}, |
|
{ |
|
"epoch": 1.5451174289245984, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 7.051282051282052e-05, |
|
"loss": 1.1063, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 1.5512978986402968, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 7.03846153846154e-05, |
|
"loss": 1.105, |
|
"step": 6275 |
|
}, |
|
{ |
|
"epoch": 1.5574783683559952, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 7.025641025641025e-05, |
|
"loss": 1.1063, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.5636588380716936, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 7.012820512820513e-05, |
|
"loss": 1.1029, |
|
"step": 6325 |
|
}, |
|
{ |
|
"epoch": 1.569839307787392, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7e-05, |
|
"loss": 1.105, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 1.5760197775030902, |
|
"grad_norm": 0.75, |
|
"learning_rate": 6.987179487179487e-05, |
|
"loss": 1.1036, |
|
"step": 6375 |
|
}, |
|
{ |
|
"epoch": 1.5822002472187886, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 6.974358974358974e-05, |
|
"loss": 1.1072, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.588380716934487, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 6.961538461538462e-05, |
|
"loss": 1.098, |
|
"step": 6425 |
|
}, |
|
{ |
|
"epoch": 1.5945611866501854, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.94871794871795e-05, |
|
"loss": 1.101, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 1.6007416563658838, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.935897435897436e-05, |
|
"loss": 1.1008, |
|
"step": 6475 |
|
}, |
|
{ |
|
"epoch": 1.6069221260815822, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 6.923076923076924e-05, |
|
"loss": 1.1, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.6131025957972804, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 6.91025641025641e-05, |
|
"loss": 1.0997, |
|
"step": 6525 |
|
}, |
|
{ |
|
"epoch": 1.6192830655129788, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 6.897435897435898e-05, |
|
"loss": 1.1005, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 1.6254635352286773, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.884615384615385e-05, |
|
"loss": 1.0953, |
|
"step": 6575 |
|
}, |
|
{ |
|
"epoch": 1.6316440049443757, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 6.871794871794872e-05, |
|
"loss": 1.0999, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.637824474660074, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 6.858974358974359e-05, |
|
"loss": 1.0965, |
|
"step": 6625 |
|
}, |
|
{ |
|
"epoch": 1.6440049443757725, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 6.846153846153847e-05, |
|
"loss": 1.098, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 1.650185414091471, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 6.833333333333333e-05, |
|
"loss": 1.09, |
|
"step": 6675 |
|
}, |
|
{ |
|
"epoch": 1.6563658838071693, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 6.820512820512821e-05, |
|
"loss": 1.0992, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.6625463535228677, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.807692307692309e-05, |
|
"loss": 1.0948, |
|
"step": 6725 |
|
}, |
|
{ |
|
"epoch": 1.6687268232385661, |
|
"grad_norm": 0.75, |
|
"learning_rate": 6.794871794871795e-05, |
|
"loss": 1.098, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 1.6749072929542645, |
|
"grad_norm": 0.75, |
|
"learning_rate": 6.782051282051283e-05, |
|
"loss": 1.0915, |
|
"step": 6775 |
|
}, |
|
{ |
|
"epoch": 1.681087762669963, |
|
"grad_norm": 0.75, |
|
"learning_rate": 6.76923076923077e-05, |
|
"loss": 1.0943, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.6872682323856614, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 6.756410256410258e-05, |
|
"loss": 1.0913, |
|
"step": 6825 |
|
}, |
|
{ |
|
"epoch": 1.6934487021013598, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 6.743589743589744e-05, |
|
"loss": 1.092, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 1.6996291718170582, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 6.730769230769232e-05, |
|
"loss": 1.0904, |
|
"step": 6875 |
|
}, |
|
{ |
|
"epoch": 1.7058096415327566, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 6.717948717948718e-05, |
|
"loss": 1.091, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.711990111248455, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.705128205128205e-05, |
|
"loss": 1.0901, |
|
"step": 6925 |
|
}, |
|
{ |
|
"epoch": 1.7181705809641534, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 6.692307692307693e-05, |
|
"loss": 1.0885, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 1.7243510506798516, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 6.679487179487179e-05, |
|
"loss": 1.0897, |
|
"step": 6975 |
|
}, |
|
{ |
|
"epoch": 1.73053152039555, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.0888, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.7367119901112484, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.653846153846153e-05, |
|
"loss": 1.0887, |
|
"step": 7025 |
|
}, |
|
{ |
|
"epoch": 1.7428924598269468, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.641025641025641e-05, |
|
"loss": 1.0857, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 1.7490729295426453, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 6.628205128205128e-05, |
|
"loss": 1.089, |
|
"step": 7075 |
|
}, |
|
{ |
|
"epoch": 1.7552533992583437, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 6.615384615384616e-05, |
|
"loss": 1.0866, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.7614338689740419, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 6.602564102564102e-05, |
|
"loss": 1.0871, |
|
"step": 7125 |
|
}, |
|
{ |
|
"epoch": 1.7676143386897403, |
|
"grad_norm": 0.75, |
|
"learning_rate": 6.58974358974359e-05, |
|
"loss": 1.0779, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 1.7737948084054387, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 6.576923076923078e-05, |
|
"loss": 1.0852, |
|
"step": 7175 |
|
}, |
|
{ |
|
"epoch": 1.779975278121137, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 6.564102564102564e-05, |
|
"loss": 1.0836, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.7861557478368355, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.551282051282052e-05, |
|
"loss": 1.0843, |
|
"step": 7225 |
|
}, |
|
{ |
|
"epoch": 1.792336217552534, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 6.538461538461539e-05, |
|
"loss": 1.0802, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 1.7985166872682323, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.525641025641026e-05, |
|
"loss": 1.083, |
|
"step": 7275 |
|
}, |
|
{ |
|
"epoch": 1.8046971569839307, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 6.512820512820513e-05, |
|
"loss": 1.0798, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.8108776266996292, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 1.0823, |
|
"step": 7325 |
|
}, |
|
{ |
|
"epoch": 1.8170580964153276, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 6.487179487179487e-05, |
|
"loss": 1.0814, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 1.823238566131026, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 6.474358974358975e-05, |
|
"loss": 1.0827, |
|
"step": 7375 |
|
}, |
|
{ |
|
"epoch": 1.8294190358467244, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 6.461538461538462e-05, |
|
"loss": 1.0827, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.8355995055624228, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 6.44871794871795e-05, |
|
"loss": 1.0799, |
|
"step": 7425 |
|
}, |
|
{ |
|
"epoch": 1.8417799752781212, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 6.435897435897437e-05, |
|
"loss": 1.0783, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 1.8479604449938196, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 6.423076923076924e-05, |
|
"loss": 1.0758, |
|
"step": 7475 |
|
}, |
|
{ |
|
"epoch": 1.854140914709518, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.410256410256412e-05, |
|
"loss": 1.078, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.8603213844252164, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.397435897435897e-05, |
|
"loss": 1.0715, |
|
"step": 7525 |
|
}, |
|
{ |
|
"epoch": 1.8665018541409149, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 6.384615384615385e-05, |
|
"loss": 1.0761, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 1.8726823238566133, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 6.371794871794871e-05, |
|
"loss": 1.076, |
|
"step": 7575 |
|
}, |
|
{ |
|
"epoch": 1.8788627935723115, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 6.358974358974359e-05, |
|
"loss": 1.0732, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.8850432632880099, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 6.346153846153847e-05, |
|
"loss": 1.073, |
|
"step": 7625 |
|
}, |
|
{ |
|
"epoch": 1.8912237330037083, |
|
"grad_norm": 0.75, |
|
"learning_rate": 6.333333333333333e-05, |
|
"loss": 1.07, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 1.8974042027194067, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 6.320512820512821e-05, |
|
"loss": 1.0739, |
|
"step": 7675 |
|
}, |
|
{ |
|
"epoch": 1.903584672435105, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 6.307692307692308e-05, |
|
"loss": 1.071, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.9097651421508035, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 6.294871794871795e-05, |
|
"loss": 1.0706, |
|
"step": 7725 |
|
}, |
|
{ |
|
"epoch": 1.9159456118665017, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 6.282051282051282e-05, |
|
"loss": 1.0725, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 1.9221260815822, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 6.26923076923077e-05, |
|
"loss": 1.0666, |
|
"step": 7775 |
|
}, |
|
{ |
|
"epoch": 1.9283065512978985, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 6.256410256410256e-05, |
|
"loss": 1.0661, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.934487021013597, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 6.243589743589744e-05, |
|
"loss": 1.0677, |
|
"step": 7825 |
|
}, |
|
{ |
|
"epoch": 1.9406674907292953, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 6.23076923076923e-05, |
|
"loss": 1.0683, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 1.9468479604449938, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 6.217948717948718e-05, |
|
"loss": 1.0676, |
|
"step": 7875 |
|
}, |
|
{ |
|
"epoch": 1.9530284301606922, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 6.205128205128206e-05, |
|
"loss": 1.068, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.9592088998763906, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 6.192307692307693e-05, |
|
"loss": 1.0662, |
|
"step": 7925 |
|
}, |
|
{ |
|
"epoch": 1.965389369592089, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 6.17948717948718e-05, |
|
"loss": 1.0653, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 1.9715698393077874, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 6.166666666666667e-05, |
|
"loss": 1.0629, |
|
"step": 7975 |
|
}, |
|
{ |
|
"epoch": 1.9777503090234858, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 6.153846153846155e-05, |
|
"loss": 1.0669, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.9839307787391842, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 6.141025641025641e-05, |
|
"loss": 1.0648, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 1.9901112484548826, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 6.128205128205129e-05, |
|
"loss": 1.0646, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 1.996291718170581, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 6.115384615384616e-05, |
|
"loss": 1.0591, |
|
"step": 8075 |
|
}, |
|
{ |
|
"epoch": 2.0024721878862795, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 6.1025641025641035e-05, |
|
"loss": 1.0239, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.008652657601978, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 6.089743589743589e-05, |
|
"loss": 0.9635, |
|
"step": 8125 |
|
}, |
|
{ |
|
"epoch": 2.0148331273176763, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 6.0769230769230765e-05, |
|
"loss": 0.9628, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 2.0210135970333747, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 6.0641025641025637e-05, |
|
"loss": 0.9707, |
|
"step": 8175 |
|
}, |
|
{ |
|
"epoch": 2.027194066749073, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 6.0512820512820515e-05, |
|
"loss": 0.9658, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.0333745364647715, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 6.038461538461539e-05, |
|
"loss": 0.9695, |
|
"step": 8225 |
|
}, |
|
{ |
|
"epoch": 2.03955500618047, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 6.025641025641026e-05, |
|
"loss": 0.9714, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 2.0457354758961683, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.012820512820513e-05, |
|
"loss": 0.973, |
|
"step": 8275 |
|
}, |
|
{ |
|
"epoch": 2.0519159456118663, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 6e-05, |
|
"loss": 0.9738, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.0580964153275647, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 5.987179487179487e-05, |
|
"loss": 0.9714, |
|
"step": 8325 |
|
}, |
|
{ |
|
"epoch": 2.064276885043263, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 5.9743589743589745e-05, |
|
"loss": 0.9718, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 2.0704573547589615, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 5.9615384615384616e-05, |
|
"loss": 0.9768, |
|
"step": 8375 |
|
}, |
|
{ |
|
"epoch": 2.07663782447466, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 5.948717948717949e-05, |
|
"loss": 0.9756, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.0828182941903584, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 5.935897435897436e-05, |
|
"loss": 0.9768, |
|
"step": 8425 |
|
}, |
|
{ |
|
"epoch": 2.0889987639060568, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 5.923076923076923e-05, |
|
"loss": 0.9755, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 2.095179233621755, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 5.910256410256411e-05, |
|
"loss": 0.9758, |
|
"step": 8475 |
|
}, |
|
{ |
|
"epoch": 2.1013597033374536, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 5.897435897435898e-05, |
|
"loss": 0.9762, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.107540173053152, |
|
"grad_norm": 0.75, |
|
"learning_rate": 5.884615384615385e-05, |
|
"loss": 0.9774, |
|
"step": 8525 |
|
}, |
|
{ |
|
"epoch": 2.1137206427688504, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 5.8717948717948725e-05, |
|
"loss": 0.9795, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 2.119901112484549, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 5.8589743589743596e-05, |
|
"loss": 0.9762, |
|
"step": 8575 |
|
}, |
|
{ |
|
"epoch": 2.1260815822002472, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 5.846153846153847e-05, |
|
"loss": 0.9808, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.1322620519159456, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 5.833333333333334e-05, |
|
"loss": 0.9781, |
|
"step": 8625 |
|
}, |
|
{ |
|
"epoch": 2.138442521631644, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 5.820512820512821e-05, |
|
"loss": 0.9772, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 2.1446229913473425, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 5.807692307692308e-05, |
|
"loss": 0.9774, |
|
"step": 8675 |
|
}, |
|
{ |
|
"epoch": 2.150803461063041, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 5.7948717948717954e-05, |
|
"loss": 0.9764, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.1569839307787393, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 5.7820512820512826e-05, |
|
"loss": 0.9781, |
|
"step": 8725 |
|
}, |
|
{ |
|
"epoch": 2.1631644004944377, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 5.769230769230769e-05, |
|
"loss": 0.9748, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 2.169344870210136, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 5.756410256410256e-05, |
|
"loss": 0.9762, |
|
"step": 8775 |
|
}, |
|
{ |
|
"epoch": 2.1755253399258345, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 5.7435897435897434e-05, |
|
"loss": 0.9789, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.181705809641533, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 5.7307692307692306e-05, |
|
"loss": 0.9764, |
|
"step": 8825 |
|
}, |
|
{ |
|
"epoch": 2.1878862793572313, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 5.717948717948718e-05, |
|
"loss": 0.9767, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 2.1940667490729293, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 5.705128205128205e-05, |
|
"loss": 0.9798, |
|
"step": 8875 |
|
}, |
|
{ |
|
"epoch": 2.200247218788628, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 5.692307692307692e-05, |
|
"loss": 0.9766, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 2.206427688504326, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 5.679487179487179e-05, |
|
"loss": 0.9814, |
|
"step": 8925 |
|
}, |
|
{ |
|
"epoch": 2.2126081582200245, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 5.666666666666667e-05, |
|
"loss": 0.9784, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 2.218788627935723, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 5.653846153846154e-05, |
|
"loss": 0.9749, |
|
"step": 8975 |
|
}, |
|
{ |
|
"epoch": 2.2249690976514214, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 5.6410256410256414e-05, |
|
"loss": 0.9814, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.23114956736712, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 5.6282051282051286e-05, |
|
"loss": 0.9805, |
|
"step": 9025 |
|
}, |
|
{ |
|
"epoch": 2.237330037082818, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 5.615384615384616e-05, |
|
"loss": 0.9749, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 2.2435105067985166, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 5.602564102564103e-05, |
|
"loss": 0.9747, |
|
"step": 9075 |
|
}, |
|
{ |
|
"epoch": 2.249690976514215, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 5.58974358974359e-05, |
|
"loss": 0.9755, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 2.2558714462299134, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 5.576923076923077e-05, |
|
"loss": 0.9767, |
|
"step": 9125 |
|
}, |
|
{ |
|
"epoch": 2.262051915945612, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 5.5641025641025644e-05, |
|
"loss": 0.9799, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 2.2682323856613102, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 5.5512820512820515e-05, |
|
"loss": 0.9807, |
|
"step": 9175 |
|
}, |
|
{ |
|
"epoch": 2.2744128553770087, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 5.538461538461539e-05, |
|
"loss": 0.9765, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.280593325092707, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 5.5256410256410265e-05, |
|
"loss": 0.9767, |
|
"step": 9225 |
|
}, |
|
{ |
|
"epoch": 2.2867737948084055, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 5.512820512820514e-05, |
|
"loss": 0.9751, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 2.292954264524104, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.9749, |
|
"step": 9275 |
|
}, |
|
{ |
|
"epoch": 2.2991347342398023, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 5.487179487179488e-05, |
|
"loss": 0.975, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.3053152039555007, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 5.474358974358975e-05, |
|
"loss": 0.9754, |
|
"step": 9325 |
|
}, |
|
{ |
|
"epoch": 2.311495673671199, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 5.461538461538461e-05, |
|
"loss": 0.9721, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 2.3176761433868975, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 5.448717948717948e-05, |
|
"loss": 0.9739, |
|
"step": 9375 |
|
}, |
|
{ |
|
"epoch": 2.323856613102596, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 5.435897435897436e-05, |
|
"loss": 0.9759, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 2.3300370828182944, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 5.423076923076923e-05, |
|
"loss": 0.9716, |
|
"step": 9425 |
|
}, |
|
{ |
|
"epoch": 2.3362175525339928, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 5.41025641025641e-05, |
|
"loss": 0.9757, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 2.342398022249691, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 5.3974358974358975e-05, |
|
"loss": 0.9723, |
|
"step": 9475 |
|
}, |
|
{ |
|
"epoch": 2.348578491965389, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 5.384615384615385e-05, |
|
"loss": 0.973, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.354758961681088, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 5.371794871794872e-05, |
|
"loss": 0.9721, |
|
"step": 9525 |
|
}, |
|
{ |
|
"epoch": 2.360939431396786, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 5.358974358974359e-05, |
|
"loss": 0.9762, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 2.3671199011124844, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 5.346153846153846e-05, |
|
"loss": 0.9781, |
|
"step": 9575 |
|
}, |
|
{ |
|
"epoch": 2.373300370828183, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 0.9767, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 2.379480840543881, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 5.3205128205128205e-05, |
|
"loss": 0.974, |
|
"step": 9625 |
|
}, |
|
{ |
|
"epoch": 2.3856613102595796, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 5.3076923076923076e-05, |
|
"loss": 0.9717, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 2.391841779975278, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 5.2948717948717955e-05, |
|
"loss": 0.9717, |
|
"step": 9675 |
|
}, |
|
{ |
|
"epoch": 2.3980222496909764, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 5.2820512820512826e-05, |
|
"loss": 0.9726, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 2.404202719406675, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 5.26923076923077e-05, |
|
"loss": 0.9746, |
|
"step": 9725 |
|
}, |
|
{ |
|
"epoch": 2.4103831891223733, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 5.256410256410257e-05, |
|
"loss": 0.9739, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 2.4165636588380717, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 5.243589743589744e-05, |
|
"loss": 0.9702, |
|
"step": 9775 |
|
}, |
|
{ |
|
"epoch": 2.42274412855377, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 5.230769230769231e-05, |
|
"loss": 0.97, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 2.4289245982694685, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 5.2179487179487185e-05, |
|
"loss": 0.9698, |
|
"step": 9825 |
|
}, |
|
{ |
|
"epoch": 2.435105067985167, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 5.2051282051282056e-05, |
|
"loss": 0.9687, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 2.4412855377008653, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 5.192307692307693e-05, |
|
"loss": 0.9673, |
|
"step": 9875 |
|
}, |
|
{ |
|
"epoch": 2.4474660074165637, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 5.17948717948718e-05, |
|
"loss": 0.9671, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 2.453646477132262, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 5.166666666666667e-05, |
|
"loss": 0.9685, |
|
"step": 9925 |
|
}, |
|
{ |
|
"epoch": 2.4598269468479605, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 5.1538461538461536e-05, |
|
"loss": 0.9722, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 2.466007416563659, |
|
"grad_norm": 0.875, |
|
"learning_rate": 5.141025641025641e-05, |
|
"loss": 0.969, |
|
"step": 9975 |
|
}, |
|
{ |
|
"epoch": 2.4721878862793574, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 5.128205128205128e-05, |
|
"loss": 0.9717, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.4721878862793574, |
|
"eval_loss": 1.0057789087295532, |
|
"eval_runtime": 1.5251, |
|
"eval_samples_per_second": 418.982, |
|
"eval_steps_per_second": 1.967, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.478368355995056, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 5.115384615384615e-05, |
|
"loss": 0.9694, |
|
"step": 10025 |
|
}, |
|
{ |
|
"epoch": 2.484548825710754, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 5.102564102564102e-05, |
|
"loss": 0.9696, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 2.490729295426452, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 5.0897435897435894e-05, |
|
"loss": 0.9687, |
|
"step": 10075 |
|
}, |
|
{ |
|
"epoch": 2.496909765142151, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 5.0769230769230766e-05, |
|
"loss": 0.964, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 2.503090234857849, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 5.0641025641025644e-05, |
|
"loss": 0.9691, |
|
"step": 10125 |
|
}, |
|
{ |
|
"epoch": 2.509270704573548, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 5.0512820512820516e-05, |
|
"loss": 0.9649, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 2.515451174289246, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 5.038461538461539e-05, |
|
"loss": 0.9673, |
|
"step": 10175 |
|
}, |
|
{ |
|
"epoch": 2.521631644004944, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 5.025641025641026e-05, |
|
"loss": 0.9669, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 2.5278121137206426, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 5.012820512820513e-05, |
|
"loss": 0.9653, |
|
"step": 10225 |
|
}, |
|
{ |
|
"epoch": 2.533992583436341, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9682, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 2.5401730531520395, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.9871794871794874e-05, |
|
"loss": 0.9615, |
|
"step": 10275 |
|
}, |
|
{ |
|
"epoch": 2.546353522867738, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.9743589743589746e-05, |
|
"loss": 0.9618, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 2.5525339925834363, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 4.961538461538462e-05, |
|
"loss": 0.9633, |
|
"step": 10325 |
|
}, |
|
{ |
|
"epoch": 2.5587144622991347, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.948717948717949e-05, |
|
"loss": 0.9624, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 2.564894932014833, |
|
"grad_norm": 0.75, |
|
"learning_rate": 4.935897435897436e-05, |
|
"loss": 0.9645, |
|
"step": 10375 |
|
}, |
|
{ |
|
"epoch": 2.5710754017305315, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 4.923076923076924e-05, |
|
"loss": 0.9594, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 2.57725587144623, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.9102564102564104e-05, |
|
"loss": 0.9652, |
|
"step": 10425 |
|
}, |
|
{ |
|
"epoch": 2.5834363411619283, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.8974358974358975e-05, |
|
"loss": 0.9606, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 2.5896168108776267, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 4.884615384615385e-05, |
|
"loss": 0.9639, |
|
"step": 10475 |
|
}, |
|
{ |
|
"epoch": 2.595797280593325, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 4.871794871794872e-05, |
|
"loss": 0.9615, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.6019777503090236, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 4.858974358974359e-05, |
|
"loss": 0.961, |
|
"step": 10525 |
|
}, |
|
{ |
|
"epoch": 2.608158220024722, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 4.846153846153846e-05, |
|
"loss": 0.9608, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 2.6143386897404204, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4.8333333333333334e-05, |
|
"loss": 0.9605, |
|
"step": 10575 |
|
}, |
|
{ |
|
"epoch": 2.620519159456119, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 4.8205128205128205e-05, |
|
"loss": 0.9572, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 2.626699629171817, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 4.8076923076923084e-05, |
|
"loss": 0.961, |
|
"step": 10625 |
|
}, |
|
{ |
|
"epoch": 2.632880098887515, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.7948717948717955e-05, |
|
"loss": 0.9538, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 2.639060568603214, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.782051282051283e-05, |
|
"loss": 0.9529, |
|
"step": 10675 |
|
}, |
|
{ |
|
"epoch": 2.645241038318912, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 4.76923076923077e-05, |
|
"loss": 0.9579, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 2.651421508034611, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 4.7564102564102563e-05, |
|
"loss": 0.956, |
|
"step": 10725 |
|
}, |
|
{ |
|
"epoch": 2.657601977750309, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.7435897435897435e-05, |
|
"loss": 0.9592, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 2.6637824474660077, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.730769230769231e-05, |
|
"loss": 0.9578, |
|
"step": 10775 |
|
}, |
|
{ |
|
"epoch": 2.6699629171817056, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.717948717948718e-05, |
|
"loss": 0.9579, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 2.676143386897404, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.705128205128205e-05, |
|
"loss": 0.9561, |
|
"step": 10825 |
|
}, |
|
{ |
|
"epoch": 2.6823238566131025, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.692307692307693e-05, |
|
"loss": 0.9567, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 2.688504326328801, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 4.67948717948718e-05, |
|
"loss": 0.956, |
|
"step": 10875 |
|
}, |
|
{ |
|
"epoch": 2.6946847960444993, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 4.666666666666667e-05, |
|
"loss": 0.9567, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 2.7008652657601977, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 4.653846153846154e-05, |
|
"loss": 0.9514, |
|
"step": 10925 |
|
}, |
|
{ |
|
"epoch": 2.707045735475896, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.6410256410256415e-05, |
|
"loss": 0.9535, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 2.7132262051915945, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.6282051282051287e-05, |
|
"loss": 0.9571, |
|
"step": 10975 |
|
}, |
|
{ |
|
"epoch": 2.719406674907293, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 4.615384615384616e-05, |
|
"loss": 0.9511, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.7255871446229913, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 4.602564102564102e-05, |
|
"loss": 0.9511, |
|
"step": 11025 |
|
}, |
|
{ |
|
"epoch": 2.7317676143386898, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 4.5897435897435895e-05, |
|
"loss": 0.9554, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 2.737948084054388, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 4.576923076923077e-05, |
|
"loss": 0.95, |
|
"step": 11075 |
|
}, |
|
{ |
|
"epoch": 2.7441285537700866, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 4.5641025641025645e-05, |
|
"loss": 0.951, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 2.750309023485785, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 4.5512820512820516e-05, |
|
"loss": 0.9475, |
|
"step": 11125 |
|
}, |
|
{ |
|
"epoch": 2.7564894932014834, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 4.538461538461539e-05, |
|
"loss": 0.9511, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 2.762669962917182, |
|
"grad_norm": 0.75, |
|
"learning_rate": 4.525641025641026e-05, |
|
"loss": 0.9489, |
|
"step": 11175 |
|
}, |
|
{ |
|
"epoch": 2.76885043263288, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 4.512820512820513e-05, |
|
"loss": 0.9511, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 2.7750309023485786, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.951, |
|
"step": 11225 |
|
}, |
|
{ |
|
"epoch": 2.781211372064277, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.4871794871794874e-05, |
|
"loss": 0.9532, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 2.787391841779975, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 4.4743589743589746e-05, |
|
"loss": 0.9517, |
|
"step": 11275 |
|
}, |
|
{ |
|
"epoch": 2.793572311495674, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.461538461538462e-05, |
|
"loss": 0.9479, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 2.799752781211372, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.448717948717949e-05, |
|
"loss": 0.9493, |
|
"step": 11325 |
|
}, |
|
{ |
|
"epoch": 2.8059332509270707, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.435897435897436e-05, |
|
"loss": 0.9489, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 2.8121137206427687, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.423076923076923e-05, |
|
"loss": 0.945, |
|
"step": 11375 |
|
}, |
|
{ |
|
"epoch": 2.8182941903584675, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 4.4102564102564104e-05, |
|
"loss": 0.9511, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 2.8244746600741655, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.3974358974358976e-05, |
|
"loss": 0.9455, |
|
"step": 11425 |
|
}, |
|
{ |
|
"epoch": 2.830655129789864, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 4.384615384615385e-05, |
|
"loss": 0.9468, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 2.8368355995055623, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 4.371794871794872e-05, |
|
"loss": 0.9478, |
|
"step": 11475 |
|
}, |
|
{ |
|
"epoch": 2.8430160692212607, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.358974358974359e-05, |
|
"loss": 0.9448, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.849196538936959, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 4.346153846153846e-05, |
|
"loss": 0.9439, |
|
"step": 11525 |
|
}, |
|
{ |
|
"epoch": 2.8553770086526575, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.3333333333333334e-05, |
|
"loss": 0.948, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 2.861557478368356, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 4.320512820512821e-05, |
|
"loss": 0.9465, |
|
"step": 11575 |
|
}, |
|
{ |
|
"epoch": 2.8677379480840544, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 4.3076923076923084e-05, |
|
"loss": 0.9469, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 2.8739184177997528, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 4.294871794871795e-05, |
|
"loss": 0.9417, |
|
"step": 11625 |
|
}, |
|
{ |
|
"epoch": 2.880098887515451, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 4.282051282051282e-05, |
|
"loss": 0.9431, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 2.8862793572311496, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.269230769230769e-05, |
|
"loss": 0.9446, |
|
"step": 11675 |
|
}, |
|
{ |
|
"epoch": 2.892459826946848, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.2564102564102564e-05, |
|
"loss": 0.943, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 2.8986402966625464, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.2435897435897435e-05, |
|
"loss": 0.9467, |
|
"step": 11725 |
|
}, |
|
{ |
|
"epoch": 2.904820766378245, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 4.230769230769231e-05, |
|
"loss": 0.9489, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 2.9110012360939432, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 4.217948717948718e-05, |
|
"loss": 0.9435, |
|
"step": 11775 |
|
}, |
|
{ |
|
"epoch": 2.9171817058096416, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.205128205128206e-05, |
|
"loss": 0.9405, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 2.92336217552534, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.192307692307693e-05, |
|
"loss": 0.9422, |
|
"step": 11825 |
|
}, |
|
{ |
|
"epoch": 2.9295426452410385, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 4.17948717948718e-05, |
|
"loss": 0.9421, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 2.935723114956737, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 0.9394, |
|
"step": 11875 |
|
}, |
|
{ |
|
"epoch": 2.941903584672435, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.1538461538461544e-05, |
|
"loss": 0.9416, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 2.9480840543881337, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 4.1410256410256415e-05, |
|
"loss": 0.9412, |
|
"step": 11925 |
|
}, |
|
{ |
|
"epoch": 2.9542645241038317, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.128205128205128e-05, |
|
"loss": 0.9399, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 2.9604449938195305, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.115384615384615e-05, |
|
"loss": 0.9396, |
|
"step": 11975 |
|
}, |
|
{ |
|
"epoch": 2.9666254635352285, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.1025641025641023e-05, |
|
"loss": 0.9366, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.9728059332509273, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 4.0897435897435895e-05, |
|
"loss": 0.9368, |
|
"step": 12025 |
|
}, |
|
{ |
|
"epoch": 2.9789864029666253, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 4.0769230769230773e-05, |
|
"loss": 0.9392, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 2.9851668726823237, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 4.0641025641025645e-05, |
|
"loss": 0.9397, |
|
"step": 12075 |
|
}, |
|
{ |
|
"epoch": 2.991347342398022, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 4.051282051282052e-05, |
|
"loss": 0.9385, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 2.9975278121137205, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.038461538461539e-05, |
|
"loss": 0.9397, |
|
"step": 12125 |
|
}, |
|
{ |
|
"epoch": 3.003708281829419, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.025641025641026e-05, |
|
"loss": 0.8884, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 3.0098887515451174, |
|
"grad_norm": 0.75, |
|
"learning_rate": 4.012820512820513e-05, |
|
"loss": 0.8596, |
|
"step": 12175 |
|
}, |
|
{ |
|
"epoch": 3.016069221260816, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4e-05, |
|
"loss": 0.8606, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 3.022249690976514, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.9871794871794875e-05, |
|
"loss": 0.8623, |
|
"step": 12225 |
|
}, |
|
{ |
|
"epoch": 3.0284301606922126, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.974358974358974e-05, |
|
"loss": 0.8606, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 3.034610630407911, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.961538461538462e-05, |
|
"loss": 0.8626, |
|
"step": 12275 |
|
}, |
|
{ |
|
"epoch": 3.0407911001236094, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.948717948717949e-05, |
|
"loss": 0.8634, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 3.046971569839308, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 3.935897435897436e-05, |
|
"loss": 0.8615, |
|
"step": 12325 |
|
}, |
|
{ |
|
"epoch": 3.0531520395550062, |
|
"grad_norm": 0.75, |
|
"learning_rate": 3.923076923076923e-05, |
|
"loss": 0.8622, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 3.0593325092707047, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.9102564102564105e-05, |
|
"loss": 0.8627, |
|
"step": 12375 |
|
}, |
|
{ |
|
"epoch": 3.065512978986403, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 3.8974358974358976e-05, |
|
"loss": 0.8603, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 3.0716934487021015, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 3.884615384615385e-05, |
|
"loss": 0.8676, |
|
"step": 12425 |
|
}, |
|
{ |
|
"epoch": 3.0778739184178, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 3.871794871794872e-05, |
|
"loss": 0.8622, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 3.0840543881334983, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 3.858974358974359e-05, |
|
"loss": 0.8625, |
|
"step": 12475 |
|
}, |
|
{ |
|
"epoch": 3.0902348578491967, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.846153846153846e-05, |
|
"loss": 0.8649, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.096415327564895, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 3.8333333333333334e-05, |
|
"loss": 0.8663, |
|
"step": 12525 |
|
}, |
|
{ |
|
"epoch": 3.1025957972805935, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.8205128205128206e-05, |
|
"loss": 0.8633, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 3.1087762669962915, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 3.807692307692308e-05, |
|
"loss": 0.8652, |
|
"step": 12575 |
|
}, |
|
{ |
|
"epoch": 3.11495673671199, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.794871794871795e-05, |
|
"loss": 0.8671, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 3.1211372064276883, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.782051282051282e-05, |
|
"loss": 0.8633, |
|
"step": 12625 |
|
}, |
|
{ |
|
"epoch": 3.1273176761433867, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.769230769230769e-05, |
|
"loss": 0.8672, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 3.133498145859085, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 3.7564102564102564e-05, |
|
"loss": 0.868, |
|
"step": 12675 |
|
}, |
|
{ |
|
"epoch": 3.1396786155747836, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 3.7435897435897436e-05, |
|
"loss": 0.8656, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 3.145859085290482, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 3.730769230769231e-05, |
|
"loss": 0.8663, |
|
"step": 12725 |
|
}, |
|
{ |
|
"epoch": 3.1520395550061804, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.717948717948718e-05, |
|
"loss": 0.8679, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 3.158220024721879, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.705128205128206e-05, |
|
"loss": 0.867, |
|
"step": 12775 |
|
}, |
|
{ |
|
"epoch": 3.164400494437577, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 3.692307692307693e-05, |
|
"loss": 0.8685, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 3.1705809641532756, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 3.67948717948718e-05, |
|
"loss": 0.8673, |
|
"step": 12825 |
|
}, |
|
{ |
|
"epoch": 3.176761433868974, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.6666666666666666e-05, |
|
"loss": 0.8688, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 3.1829419035846724, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.653846153846154e-05, |
|
"loss": 0.8638, |
|
"step": 12875 |
|
}, |
|
{ |
|
"epoch": 3.189122373300371, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.641025641025641e-05, |
|
"loss": 0.87, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 3.1953028430160693, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.628205128205128e-05, |
|
"loss": 0.8717, |
|
"step": 12925 |
|
}, |
|
{ |
|
"epoch": 3.2014833127317677, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 3.615384615384615e-05, |
|
"loss": 0.863, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 3.207663782447466, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.6025641025641024e-05, |
|
"loss": 0.8704, |
|
"step": 12975 |
|
}, |
|
{ |
|
"epoch": 3.2138442521631645, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 3.58974358974359e-05, |
|
"loss": 0.8695, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 3.220024721878863, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 3.5769230769230774e-05, |
|
"loss": 0.8695, |
|
"step": 13025 |
|
}, |
|
{ |
|
"epoch": 3.2262051915945613, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 3.5641025641025646e-05, |
|
"loss": 0.8668, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 3.2323856613102597, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 3.551282051282052e-05, |
|
"loss": 0.8703, |
|
"step": 13075 |
|
}, |
|
{ |
|
"epoch": 3.238566131025958, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.538461538461539e-05, |
|
"loss": 0.8671, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 3.2447466007416566, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.525641025641026e-05, |
|
"loss": 0.8677, |
|
"step": 13125 |
|
}, |
|
{ |
|
"epoch": 3.2509270704573545, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 3.5128205128205125e-05, |
|
"loss": 0.8671, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 3.2571075401730534, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.8682, |
|
"step": 13175 |
|
}, |
|
{ |
|
"epoch": 3.2632880098887513, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.487179487179487e-05, |
|
"loss": 0.8677, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 3.2694684796044498, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 3.474358974358975e-05, |
|
"loss": 0.871, |
|
"step": 13225 |
|
}, |
|
{ |
|
"epoch": 3.275648949320148, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.461538461538462e-05, |
|
"loss": 0.8682, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 3.2818294190358466, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.448717948717949e-05, |
|
"loss": 0.8653, |
|
"step": 13275 |
|
}, |
|
{ |
|
"epoch": 3.288009888751545, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 3.435897435897436e-05, |
|
"loss": 0.8697, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 3.2941903584672434, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 3.4230769230769234e-05, |
|
"loss": 0.8697, |
|
"step": 13325 |
|
}, |
|
{ |
|
"epoch": 3.300370828182942, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 3.4102564102564105e-05, |
|
"loss": 0.869, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 3.30655129789864, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 3.397435897435898e-05, |
|
"loss": 0.8687, |
|
"step": 13375 |
|
}, |
|
{ |
|
"epoch": 3.3127317676143386, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.384615384615385e-05, |
|
"loss": 0.8692, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 3.318912237330037, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.371794871794872e-05, |
|
"loss": 0.8687, |
|
"step": 13425 |
|
}, |
|
{ |
|
"epoch": 3.3250927070457355, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.358974358974359e-05, |
|
"loss": 0.8708, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 3.331273176761434, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.346153846153846e-05, |
|
"loss": 0.8659, |
|
"step": 13475 |
|
}, |
|
{ |
|
"epoch": 3.3374536464771323, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.8662, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 3.3436341161928307, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.3205128205128207e-05, |
|
"loss": 0.8692, |
|
"step": 13525 |
|
}, |
|
{ |
|
"epoch": 3.349814585908529, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 3.307692307692308e-05, |
|
"loss": 0.8677, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 3.3559950556242275, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.294871794871795e-05, |
|
"loss": 0.8672, |
|
"step": 13575 |
|
}, |
|
{ |
|
"epoch": 3.362175525339926, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.282051282051282e-05, |
|
"loss": 0.8703, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 3.3683559950556243, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 3.269230769230769e-05, |
|
"loss": 0.8658, |
|
"step": 13625 |
|
}, |
|
{ |
|
"epoch": 3.3745364647713227, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.2564102564102565e-05, |
|
"loss": 0.8679, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 3.380716934487021, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 3.2435897435897436e-05, |
|
"loss": 0.8654, |
|
"step": 13675 |
|
}, |
|
{ |
|
"epoch": 3.3868974042027196, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 3.230769230769231e-05, |
|
"loss": 0.8662, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 3.393077873918418, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 3.2179487179487186e-05, |
|
"loss": 0.8654, |
|
"step": 13725 |
|
}, |
|
{ |
|
"epoch": 3.3992583436341164, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 3.205128205128206e-05, |
|
"loss": 0.8696, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 3.4054388133498144, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 3.192307692307692e-05, |
|
"loss": 0.8648, |
|
"step": 13775 |
|
}, |
|
{ |
|
"epoch": 3.411619283065513, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.1794871794871795e-05, |
|
"loss": 0.8677, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 3.417799752781211, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.1666666666666666e-05, |
|
"loss": 0.8665, |
|
"step": 13825 |
|
}, |
|
{ |
|
"epoch": 3.4239802224969096, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.153846153846154e-05, |
|
"loss": 0.8662, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 3.430160692212608, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.141025641025641e-05, |
|
"loss": 0.8658, |
|
"step": 13875 |
|
}, |
|
{ |
|
"epoch": 3.4363411619283064, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.128205128205128e-05, |
|
"loss": 0.8652, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 3.442521631644005, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.115384615384615e-05, |
|
"loss": 0.8667, |
|
"step": 13925 |
|
}, |
|
{ |
|
"epoch": 3.4487021013597032, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 3.102564102564103e-05, |
|
"loss": 0.8654, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 3.4548825710754016, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.08974358974359e-05, |
|
"loss": 0.8666, |
|
"step": 13975 |
|
}, |
|
{ |
|
"epoch": 3.4610630407911, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.0769230769230774e-05, |
|
"loss": 0.8712, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 3.4672435105067985, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 3.0641025641025646e-05, |
|
"loss": 0.867, |
|
"step": 14025 |
|
}, |
|
{ |
|
"epoch": 3.473423980222497, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.0512820512820518e-05, |
|
"loss": 0.8635, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 3.4796044499381953, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.0384615384615382e-05, |
|
"loss": 0.8686, |
|
"step": 14075 |
|
}, |
|
{ |
|
"epoch": 3.4857849196538937, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 3.0256410256410257e-05, |
|
"loss": 0.8654, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 3.491965389369592, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 3.012820512820513e-05, |
|
"loss": 0.869, |
|
"step": 14125 |
|
}, |
|
{ |
|
"epoch": 3.4981458590852905, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 3e-05, |
|
"loss": 0.862, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 3.504326328800989, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 2.9871794871794872e-05, |
|
"loss": 0.866, |
|
"step": 14175 |
|
}, |
|
{ |
|
"epoch": 3.5105067985166873, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 2.9743589743589744e-05, |
|
"loss": 0.8701, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 3.5166872682323858, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 2.9615384615384616e-05, |
|
"loss": 0.8679, |
|
"step": 14225 |
|
}, |
|
{ |
|
"epoch": 3.522867737948084, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.948717948717949e-05, |
|
"loss": 0.8639, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 3.5290482076637826, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.9358974358974362e-05, |
|
"loss": 0.8674, |
|
"step": 14275 |
|
}, |
|
{ |
|
"epoch": 3.535228677379481, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.9230769230769234e-05, |
|
"loss": 0.8639, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 3.5414091470951794, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.9102564102564106e-05, |
|
"loss": 0.8652, |
|
"step": 14325 |
|
}, |
|
{ |
|
"epoch": 3.5475896168108774, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 2.8974358974358977e-05, |
|
"loss": 0.8687, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 3.553770086526576, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 2.8846153846153845e-05, |
|
"loss": 0.8664, |
|
"step": 14375 |
|
}, |
|
{ |
|
"epoch": 3.559950556242274, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 2.8717948717948717e-05, |
|
"loss": 0.8671, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 3.566131025957973, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 2.858974358974359e-05, |
|
"loss": 0.8658, |
|
"step": 14425 |
|
}, |
|
{ |
|
"epoch": 3.572311495673671, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 2.846153846153846e-05, |
|
"loss": 0.8598, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 3.57849196538937, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 2.8333333333333335e-05, |
|
"loss": 0.8643, |
|
"step": 14475 |
|
}, |
|
{ |
|
"epoch": 3.584672435105068, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.8205128205128207e-05, |
|
"loss": 0.8641, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 3.5908529048207662, |
|
"grad_norm": 0.75, |
|
"learning_rate": 2.807692307692308e-05, |
|
"loss": 0.8665, |
|
"step": 14525 |
|
}, |
|
{ |
|
"epoch": 3.5970333745364647, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.794871794871795e-05, |
|
"loss": 0.8649, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 3.603213844252163, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 2.7820512820512822e-05, |
|
"loss": 0.8671, |
|
"step": 14575 |
|
}, |
|
{ |
|
"epoch": 3.6093943139678615, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.7692307692307694e-05, |
|
"loss": 0.8656, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 3.61557478368356, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.756410256410257e-05, |
|
"loss": 0.8642, |
|
"step": 14625 |
|
}, |
|
{ |
|
"epoch": 3.6217552533992583, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 2.743589743589744e-05, |
|
"loss": 0.8628, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 3.6279357231149567, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.7307692307692305e-05, |
|
"loss": 0.8648, |
|
"step": 14675 |
|
}, |
|
{ |
|
"epoch": 3.634116192830655, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 2.717948717948718e-05, |
|
"loss": 0.8674, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 3.6402966625463535, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.705128205128205e-05, |
|
"loss": 0.8664, |
|
"step": 14725 |
|
}, |
|
{ |
|
"epoch": 3.646477132262052, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 2.6923076923076923e-05, |
|
"loss": 0.8643, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 3.6526576019777504, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 2.6794871794871795e-05, |
|
"loss": 0.8679, |
|
"step": 14775 |
|
}, |
|
{ |
|
"epoch": 3.6588380716934488, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 0.8669, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 3.665018541409147, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 2.6538461538461538e-05, |
|
"loss": 0.8619, |
|
"step": 14825 |
|
}, |
|
{ |
|
"epoch": 3.6711990111248456, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 2.6410256410256413e-05, |
|
"loss": 0.8685, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 3.677379480840544, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.6282051282051285e-05, |
|
"loss": 0.8666, |
|
"step": 14875 |
|
}, |
|
{ |
|
"epoch": 3.6835599505562424, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 2.6153846153846157e-05, |
|
"loss": 0.8645, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 3.689740420271941, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 2.6025641025641028e-05, |
|
"loss": 0.8657, |
|
"step": 14925 |
|
}, |
|
{ |
|
"epoch": 3.6959208899876392, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.58974358974359e-05, |
|
"loss": 0.8611, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 3.702101359703337, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 2.5769230769230768e-05, |
|
"loss": 0.8628, |
|
"step": 14975 |
|
}, |
|
{ |
|
"epoch": 3.708281829419036, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.564102564102564e-05, |
|
"loss": 0.8643, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.708281829419036, |
|
"eval_loss": 0.9966387748718262, |
|
"eval_runtime": 1.5383, |
|
"eval_samples_per_second": 415.389, |
|
"eval_steps_per_second": 1.95, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.714462299134734, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 2.551282051282051e-05, |
|
"loss": 0.864, |
|
"step": 15025 |
|
}, |
|
{ |
|
"epoch": 3.720642768850433, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.5384615384615383e-05, |
|
"loss": 0.8643, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 3.726823238566131, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 2.5256410256410258e-05, |
|
"loss": 0.8614, |
|
"step": 15075 |
|
}, |
|
{ |
|
"epoch": 3.7330037082818293, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 2.512820512820513e-05, |
|
"loss": 0.8694, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 3.7391841779975277, |
|
"grad_norm": 0.75, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.8666, |
|
"step": 15125 |
|
}, |
|
{ |
|
"epoch": 3.745364647713226, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.4871794871794873e-05, |
|
"loss": 0.8616, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 3.7515451174289245, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.4743589743589744e-05, |
|
"loss": 0.8646, |
|
"step": 15175 |
|
}, |
|
{ |
|
"epoch": 3.757725587144623, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.461538461538462e-05, |
|
"loss": 0.8623, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 3.7639060568603213, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 2.4487179487179488e-05, |
|
"loss": 0.8645, |
|
"step": 15225 |
|
}, |
|
{ |
|
"epoch": 3.7700865265760197, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.435897435897436e-05, |
|
"loss": 0.8612, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 3.776266996291718, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 2.423076923076923e-05, |
|
"loss": 0.8642, |
|
"step": 15275 |
|
}, |
|
{ |
|
"epoch": 3.7824474660074165, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.4102564102564103e-05, |
|
"loss": 0.8615, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 3.788627935723115, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 2.3974358974358978e-05, |
|
"loss": 0.8603, |
|
"step": 15325 |
|
}, |
|
{ |
|
"epoch": 3.7948084054388134, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 2.384615384615385e-05, |
|
"loss": 0.8675, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 3.800988875154512, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 2.3717948717948718e-05, |
|
"loss": 0.8655, |
|
"step": 15375 |
|
}, |
|
{ |
|
"epoch": 3.80716934487021, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 2.358974358974359e-05, |
|
"loss": 0.8632, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 3.8133498145859086, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.3461538461538464e-05, |
|
"loss": 0.8599, |
|
"step": 15425 |
|
}, |
|
{ |
|
"epoch": 3.819530284301607, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 2.3333333333333336e-05, |
|
"loss": 0.8656, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 3.8257107540173054, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 2.3205128205128207e-05, |
|
"loss": 0.8638, |
|
"step": 15475 |
|
}, |
|
{ |
|
"epoch": 3.831891223733004, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 2.307692307692308e-05, |
|
"loss": 0.8621, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 3.8380716934487022, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.2948717948717947e-05, |
|
"loss": 0.8651, |
|
"step": 15525 |
|
}, |
|
{ |
|
"epoch": 3.8442521631644007, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 2.2820512820512822e-05, |
|
"loss": 0.8607, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 3.850432632880099, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 2.2692307692307694e-05, |
|
"loss": 0.8632, |
|
"step": 15575 |
|
}, |
|
{ |
|
"epoch": 3.856613102595797, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 2.2564102564102566e-05, |
|
"loss": 0.8632, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 3.862793572311496, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 2.2435897435897437e-05, |
|
"loss": 0.8631, |
|
"step": 15625 |
|
}, |
|
{ |
|
"epoch": 3.868974042027194, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 2.230769230769231e-05, |
|
"loss": 0.8594, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 3.8751545117428927, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 2.217948717948718e-05, |
|
"loss": 0.8617, |
|
"step": 15675 |
|
}, |
|
{ |
|
"epoch": 3.8813349814585907, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 2.2051282051282052e-05, |
|
"loss": 0.8617, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 3.887515451174289, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 2.1923076923076924e-05, |
|
"loss": 0.8629, |
|
"step": 15725 |
|
}, |
|
{ |
|
"epoch": 3.8936959208899875, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 2.1794871794871795e-05, |
|
"loss": 0.8639, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 3.899876390605686, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.1666666666666667e-05, |
|
"loss": 0.8616, |
|
"step": 15775 |
|
}, |
|
{ |
|
"epoch": 3.9060568603213843, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 2.1538461538461542e-05, |
|
"loss": 0.865, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 3.9122373300370827, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 2.141025641025641e-05, |
|
"loss": 0.8609, |
|
"step": 15825 |
|
}, |
|
{ |
|
"epoch": 3.918417799752781, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 2.1282051282051282e-05, |
|
"loss": 0.861, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 3.9245982694684796, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 2.1153846153846154e-05, |
|
"loss": 0.8613, |
|
"step": 15875 |
|
}, |
|
{ |
|
"epoch": 3.930778739184178, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 2.102564102564103e-05, |
|
"loss": 0.8624, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 3.9369592088998764, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 2.08974358974359e-05, |
|
"loss": 0.8633, |
|
"step": 15925 |
|
}, |
|
{ |
|
"epoch": 3.943139678615575, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 2.0769230769230772e-05, |
|
"loss": 0.8589, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 3.949320148331273, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 2.064102564102564e-05, |
|
"loss": 0.8605, |
|
"step": 15975 |
|
}, |
|
{ |
|
"epoch": 3.9555006180469716, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 2.0512820512820512e-05, |
|
"loss": 0.8612, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 3.96168108776267, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 2.0384615384615387e-05, |
|
"loss": 0.8612, |
|
"step": 16025 |
|
}, |
|
{ |
|
"epoch": 3.9678615574783684, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 2.025641025641026e-05, |
|
"loss": 0.861, |
|
"step": 16050 |
|
}, |
|
{ |
|
"epoch": 3.974042027194067, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 2.012820512820513e-05, |
|
"loss": 0.8593, |
|
"step": 16075 |
|
}, |
|
{ |
|
"epoch": 3.9802224969097653, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8612, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 3.9864029666254637, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 1.987179487179487e-05, |
|
"loss": 0.8664, |
|
"step": 16125 |
|
}, |
|
{ |
|
"epoch": 3.992583436341162, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.9743589743589745e-05, |
|
"loss": 0.8599, |
|
"step": 16150 |
|
}, |
|
{ |
|
"epoch": 3.99876390605686, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.9615384615384617e-05, |
|
"loss": 0.8649, |
|
"step": 16175 |
|
}, |
|
{ |
|
"epoch": 4.004944375772559, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.9487179487179488e-05, |
|
"loss": 0.8264, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 4.011124845488257, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.935897435897436e-05, |
|
"loss": 0.8184, |
|
"step": 16225 |
|
}, |
|
{ |
|
"epoch": 4.017305315203956, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.923076923076923e-05, |
|
"loss": 0.8214, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 4.023485784919654, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.9102564102564103e-05, |
|
"loss": 0.8166, |
|
"step": 16275 |
|
}, |
|
{ |
|
"epoch": 4.0296662546353526, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 1.8974358974358975e-05, |
|
"loss": 0.8191, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 4.0358467243510505, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.8846153846153846e-05, |
|
"loss": 0.8194, |
|
"step": 16325 |
|
}, |
|
{ |
|
"epoch": 4.042027194066749, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 1.8717948717948718e-05, |
|
"loss": 0.8189, |
|
"step": 16350 |
|
}, |
|
{ |
|
"epoch": 4.048207663782447, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.858974358974359e-05, |
|
"loss": 0.822, |
|
"step": 16375 |
|
}, |
|
{ |
|
"epoch": 4.054388133498146, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 1.8461538461538465e-05, |
|
"loss": 0.8146, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 4.060568603213844, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.8333333333333333e-05, |
|
"loss": 0.8168, |
|
"step": 16425 |
|
}, |
|
{ |
|
"epoch": 4.066749072929543, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.8205128205128204e-05, |
|
"loss": 0.8221, |
|
"step": 16450 |
|
}, |
|
{ |
|
"epoch": 4.072929542645241, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.8076923076923076e-05, |
|
"loss": 0.8179, |
|
"step": 16475 |
|
}, |
|
{ |
|
"epoch": 4.07911001236094, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.794871794871795e-05, |
|
"loss": 0.8182, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 4.085290482076638, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 1.7820512820512823e-05, |
|
"loss": 0.8145, |
|
"step": 16525 |
|
}, |
|
{ |
|
"epoch": 4.091470951792337, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.7692307692307694e-05, |
|
"loss": 0.8239, |
|
"step": 16550 |
|
}, |
|
{ |
|
"epoch": 4.097651421508035, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.7564102564102563e-05, |
|
"loss": 0.8151, |
|
"step": 16575 |
|
}, |
|
{ |
|
"epoch": 4.103831891223733, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.7435897435897434e-05, |
|
"loss": 0.8193, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 4.1100123609394315, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.730769230769231e-05, |
|
"loss": 0.8193, |
|
"step": 16625 |
|
}, |
|
{ |
|
"epoch": 4.116192830655129, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.717948717948718e-05, |
|
"loss": 0.8193, |
|
"step": 16650 |
|
}, |
|
{ |
|
"epoch": 4.122373300370828, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.7051282051282053e-05, |
|
"loss": 0.8214, |
|
"step": 16675 |
|
}, |
|
{ |
|
"epoch": 4.128553770086526, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.6923076923076924e-05, |
|
"loss": 0.8211, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 4.134734239802225, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.6794871794871796e-05, |
|
"loss": 0.8213, |
|
"step": 16725 |
|
}, |
|
{ |
|
"epoch": 4.140914709517923, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.8209, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 4.147095179233622, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.653846153846154e-05, |
|
"loss": 0.8185, |
|
"step": 16775 |
|
}, |
|
{ |
|
"epoch": 4.15327564894932, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.641025641025641e-05, |
|
"loss": 0.8188, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 4.159456118665019, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.6282051282051282e-05, |
|
"loss": 0.8209, |
|
"step": 16825 |
|
}, |
|
{ |
|
"epoch": 4.165636588380717, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.6153846153846154e-05, |
|
"loss": 0.82, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 4.171817058096416, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.602564102564103e-05, |
|
"loss": 0.8203, |
|
"step": 16875 |
|
}, |
|
{ |
|
"epoch": 4.1779975278121135, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 1.5897435897435897e-05, |
|
"loss": 0.8175, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 4.184177997527812, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 1.576923076923077e-05, |
|
"loss": 0.8207, |
|
"step": 16925 |
|
}, |
|
{ |
|
"epoch": 4.19035846724351, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.564102564102564e-05, |
|
"loss": 0.8213, |
|
"step": 16950 |
|
}, |
|
{ |
|
"epoch": 4.196538936959209, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.5512820512820516e-05, |
|
"loss": 0.8235, |
|
"step": 16975 |
|
}, |
|
{ |
|
"epoch": 4.202719406674907, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.8166, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 4.208899876390606, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.5256410256410259e-05, |
|
"loss": 0.8178, |
|
"step": 17025 |
|
}, |
|
{ |
|
"epoch": 4.215080346106304, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.5128205128205129e-05, |
|
"loss": 0.8214, |
|
"step": 17050 |
|
}, |
|
{ |
|
"epoch": 4.221260815822003, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.8228, |
|
"step": 17075 |
|
}, |
|
{ |
|
"epoch": 4.227441285537701, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.4871794871794872e-05, |
|
"loss": 0.8216, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 4.2336217552534, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 1.4743589743589745e-05, |
|
"loss": 0.8202, |
|
"step": 17125 |
|
}, |
|
{ |
|
"epoch": 4.239802224969098, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.4615384615384617e-05, |
|
"loss": 0.8245, |
|
"step": 17150 |
|
}, |
|
{ |
|
"epoch": 4.2459826946847965, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 1.4487179487179489e-05, |
|
"loss": 0.8228, |
|
"step": 17175 |
|
}, |
|
{ |
|
"epoch": 4.2521631644004945, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.4358974358974359e-05, |
|
"loss": 0.8222, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 4.258343634116192, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.423076923076923e-05, |
|
"loss": 0.8213, |
|
"step": 17225 |
|
}, |
|
{ |
|
"epoch": 4.264524103831891, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.4102564102564104e-05, |
|
"loss": 0.8226, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 4.270704573547589, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.3974358974358975e-05, |
|
"loss": 0.8171, |
|
"step": 17275 |
|
}, |
|
{ |
|
"epoch": 4.276885043263288, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.3846153846153847e-05, |
|
"loss": 0.8161, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 4.283065512978986, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.371794871794872e-05, |
|
"loss": 0.8203, |
|
"step": 17325 |
|
}, |
|
{ |
|
"epoch": 4.289245982694685, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 1.358974358974359e-05, |
|
"loss": 0.8184, |
|
"step": 17350 |
|
}, |
|
{ |
|
"epoch": 4.295426452410383, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.3461538461538462e-05, |
|
"loss": 0.8223, |
|
"step": 17375 |
|
}, |
|
{ |
|
"epoch": 4.301606922126082, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.8207, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 4.30778739184178, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 1.3205128205128207e-05, |
|
"loss": 0.8233, |
|
"step": 17425 |
|
}, |
|
{ |
|
"epoch": 4.313967861557479, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 1.3076923076923078e-05, |
|
"loss": 0.8211, |
|
"step": 17450 |
|
}, |
|
{ |
|
"epoch": 4.3201483312731765, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 1.294871794871795e-05, |
|
"loss": 0.8193, |
|
"step": 17475 |
|
}, |
|
{ |
|
"epoch": 4.326328800988875, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.282051282051282e-05, |
|
"loss": 0.8207, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 4.332509270704573, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 1.2692307692307691e-05, |
|
"loss": 0.8247, |
|
"step": 17525 |
|
}, |
|
{ |
|
"epoch": 4.338689740420272, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.2564102564102565e-05, |
|
"loss": 0.826, |
|
"step": 17550 |
|
}, |
|
{ |
|
"epoch": 4.34487021013597, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.2435897435897436e-05, |
|
"loss": 0.82, |
|
"step": 17575 |
|
}, |
|
{ |
|
"epoch": 4.351050679851669, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.230769230769231e-05, |
|
"loss": 0.8161, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 4.357231149567367, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 1.217948717948718e-05, |
|
"loss": 0.826, |
|
"step": 17625 |
|
}, |
|
{ |
|
"epoch": 4.363411619283066, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 1.2051282051282051e-05, |
|
"loss": 0.8199, |
|
"step": 17650 |
|
}, |
|
{ |
|
"epoch": 4.369592088998764, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.1923076923076925e-05, |
|
"loss": 0.8251, |
|
"step": 17675 |
|
}, |
|
{ |
|
"epoch": 4.375772558714463, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.1794871794871795e-05, |
|
"loss": 0.8209, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 4.381953028430161, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 0.8206, |
|
"step": 17725 |
|
}, |
|
{ |
|
"epoch": 4.388133498145859, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.153846153846154e-05, |
|
"loss": 0.8209, |
|
"step": 17750 |
|
}, |
|
{ |
|
"epoch": 4.3943139678615575, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.1410256410256411e-05, |
|
"loss": 0.8218, |
|
"step": 17775 |
|
}, |
|
{ |
|
"epoch": 4.400494437577256, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 1.1282051282051283e-05, |
|
"loss": 0.8216, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 4.406674907292954, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.1153846153846154e-05, |
|
"loss": 0.82, |
|
"step": 17825 |
|
}, |
|
{ |
|
"epoch": 4.412855377008652, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 1.1025641025641026e-05, |
|
"loss": 0.8224, |
|
"step": 17850 |
|
}, |
|
{ |
|
"epoch": 4.419035846724351, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.0897435897435898e-05, |
|
"loss": 0.8209, |
|
"step": 17875 |
|
}, |
|
{ |
|
"epoch": 4.425216316440049, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.0769230769230771e-05, |
|
"loss": 0.8198, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 4.431396786155748, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.0641025641025641e-05, |
|
"loss": 0.8259, |
|
"step": 17925 |
|
}, |
|
{ |
|
"epoch": 4.437577255871446, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.0512820512820514e-05, |
|
"loss": 0.8211, |
|
"step": 17950 |
|
}, |
|
{ |
|
"epoch": 4.443757725587145, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 1.0384615384615386e-05, |
|
"loss": 0.8213, |
|
"step": 17975 |
|
}, |
|
{ |
|
"epoch": 4.449938195302843, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.0256410256410256e-05, |
|
"loss": 0.8266, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 4.456118665018542, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 1.012820512820513e-05, |
|
"loss": 0.821, |
|
"step": 18025 |
|
}, |
|
{ |
|
"epoch": 4.46229913473424, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8223, |
|
"step": 18050 |
|
}, |
|
{ |
|
"epoch": 4.468479604449938, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 9.871794871794872e-06, |
|
"loss": 0.8216, |
|
"step": 18075 |
|
}, |
|
{ |
|
"epoch": 4.474660074165636, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 9.743589743589744e-06, |
|
"loss": 0.8183, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 4.480840543881335, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 9.615384615384616e-06, |
|
"loss": 0.8243, |
|
"step": 18125 |
|
}, |
|
{ |
|
"epoch": 4.487021013597033, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 9.487179487179487e-06, |
|
"loss": 0.8197, |
|
"step": 18150 |
|
}, |
|
{ |
|
"epoch": 4.493201483312732, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 9.358974358974359e-06, |
|
"loss": 0.8243, |
|
"step": 18175 |
|
}, |
|
{ |
|
"epoch": 4.49938195302843, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 0.8211, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 4.505562422744129, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 9.102564102564102e-06, |
|
"loss": 0.8222, |
|
"step": 18225 |
|
}, |
|
{ |
|
"epoch": 4.511742892459827, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 8.974358974358976e-06, |
|
"loss": 0.8225, |
|
"step": 18250 |
|
}, |
|
{ |
|
"epoch": 4.517923362175526, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 8.846153846153847e-06, |
|
"loss": 0.8261, |
|
"step": 18275 |
|
}, |
|
{ |
|
"epoch": 4.524103831891224, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 8.717948717948717e-06, |
|
"loss": 0.8234, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 4.5302843016069225, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 8.58974358974359e-06, |
|
"loss": 0.8213, |
|
"step": 18325 |
|
}, |
|
{ |
|
"epoch": 4.5364647713226205, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 8.461538461538462e-06, |
|
"loss": 0.8272, |
|
"step": 18350 |
|
}, |
|
{ |
|
"epoch": 4.5426452410383185, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.8223, |
|
"step": 18375 |
|
}, |
|
{ |
|
"epoch": 4.548825710754017, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 8.205128205128205e-06, |
|
"loss": 0.8218, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 4.555006180469716, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 8.076923076923077e-06, |
|
"loss": 0.8195, |
|
"step": 18425 |
|
}, |
|
{ |
|
"epoch": 4.561186650185414, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 7.948717948717949e-06, |
|
"loss": 0.8222, |
|
"step": 18450 |
|
}, |
|
{ |
|
"epoch": 4.567367119901112, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 7.82051282051282e-06, |
|
"loss": 0.8212, |
|
"step": 18475 |
|
}, |
|
{ |
|
"epoch": 4.573547589616811, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.8263, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 4.579728059332509, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 7.564102564102564e-06, |
|
"loss": 0.824, |
|
"step": 18525 |
|
}, |
|
{ |
|
"epoch": 4.585908529048208, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 7.435897435897436e-06, |
|
"loss": 0.8189, |
|
"step": 18550 |
|
}, |
|
{ |
|
"epoch": 4.592088998763906, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 7.3076923076923085e-06, |
|
"loss": 0.8213, |
|
"step": 18575 |
|
}, |
|
{ |
|
"epoch": 4.598269468479605, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 7.179487179487179e-06, |
|
"loss": 0.8183, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 4.604449938195303, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 7.051282051282052e-06, |
|
"loss": 0.8203, |
|
"step": 18625 |
|
}, |
|
{ |
|
"epoch": 4.610630407911001, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 6.923076923076923e-06, |
|
"loss": 0.819, |
|
"step": 18650 |
|
}, |
|
{ |
|
"epoch": 4.616810877626699, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 6.794871794871795e-06, |
|
"loss": 0.8235, |
|
"step": 18675 |
|
}, |
|
{ |
|
"epoch": 4.622991347342398, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.8229, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 4.629171817058096, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 6.538461538461539e-06, |
|
"loss": 0.8189, |
|
"step": 18725 |
|
}, |
|
{ |
|
"epoch": 4.635352286773795, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 6.41025641025641e-06, |
|
"loss": 0.8224, |
|
"step": 18750 |
|
}, |
|
{ |
|
"epoch": 4.641532756489493, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 6.282051282051282e-06, |
|
"loss": 0.8225, |
|
"step": 18775 |
|
}, |
|
{ |
|
"epoch": 4.647713226205192, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 0.8194, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 4.65389369592089, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 6.025641025641026e-06, |
|
"loss": 0.8225, |
|
"step": 18825 |
|
}, |
|
{ |
|
"epoch": 4.660074165636589, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 5.897435897435897e-06, |
|
"loss": 0.8169, |
|
"step": 18850 |
|
}, |
|
{ |
|
"epoch": 4.666254635352287, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 5.76923076923077e-06, |
|
"loss": 0.8153, |
|
"step": 18875 |
|
}, |
|
{ |
|
"epoch": 4.6724351050679855, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 5.641025641025641e-06, |
|
"loss": 0.822, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 4.6786155747836835, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 5.512820512820513e-06, |
|
"loss": 0.822, |
|
"step": 18925 |
|
}, |
|
{ |
|
"epoch": 4.684796044499382, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 5.3846153846153855e-06, |
|
"loss": 0.8184, |
|
"step": 18950 |
|
}, |
|
{ |
|
"epoch": 4.69097651421508, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 5.256410256410257e-06, |
|
"loss": 0.8247, |
|
"step": 18975 |
|
}, |
|
{ |
|
"epoch": 4.697156983930778, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 5.128205128205128e-06, |
|
"loss": 0.8212, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 4.703337453646477, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8226, |
|
"step": 19025 |
|
}, |
|
{ |
|
"epoch": 4.709517923362176, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.871794871794872e-06, |
|
"loss": 0.8206, |
|
"step": 19050 |
|
}, |
|
{ |
|
"epoch": 4.715698393077874, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 4.743589743589744e-06, |
|
"loss": 0.8183, |
|
"step": 19075 |
|
}, |
|
{ |
|
"epoch": 4.721878862793572, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 0.8184, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 4.728059332509271, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.487179487179488e-06, |
|
"loss": 0.8212, |
|
"step": 19125 |
|
}, |
|
{ |
|
"epoch": 4.734239802224969, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.3589743589743586e-06, |
|
"loss": 0.8198, |
|
"step": 19150 |
|
}, |
|
{ |
|
"epoch": 4.740420271940668, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 4.230769230769231e-06, |
|
"loss": 0.8235, |
|
"step": 19175 |
|
}, |
|
{ |
|
"epoch": 4.746600741656366, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 4.102564102564103e-06, |
|
"loss": 0.8228, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 4.752781211372064, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 3.974358974358974e-06, |
|
"loss": 0.8278, |
|
"step": 19225 |
|
}, |
|
{ |
|
"epoch": 4.758961681087762, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 3.846153846153847e-06, |
|
"loss": 0.817, |
|
"step": 19250 |
|
}, |
|
{ |
|
"epoch": 4.765142150803461, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 3.717948717948718e-06, |
|
"loss": 0.8226, |
|
"step": 19275 |
|
}, |
|
{ |
|
"epoch": 4.771322620519159, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 3.5897435897435896e-06, |
|
"loss": 0.8218, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 4.777503090234858, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 3.4615384615384617e-06, |
|
"loss": 0.8209, |
|
"step": 19325 |
|
}, |
|
{ |
|
"epoch": 4.783683559950556, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.8231, |
|
"step": 19350 |
|
}, |
|
{ |
|
"epoch": 4.789864029666255, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 3.205128205128205e-06, |
|
"loss": 0.8199, |
|
"step": 19375 |
|
}, |
|
{ |
|
"epoch": 4.796044499381953, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.8214, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 4.802224969097652, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 2.9487179487179486e-06, |
|
"loss": 0.8221, |
|
"step": 19425 |
|
}, |
|
{ |
|
"epoch": 4.80840543881335, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 2.8205128205128207e-06, |
|
"loss": 0.8198, |
|
"step": 19450 |
|
}, |
|
{ |
|
"epoch": 4.8145859085290486, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 2.6923076923076928e-06, |
|
"loss": 0.8195, |
|
"step": 19475 |
|
}, |
|
{ |
|
"epoch": 4.8207663782447465, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 2.564102564102564e-06, |
|
"loss": 0.8217, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 4.826946847960445, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 2.435897435897436e-06, |
|
"loss": 0.8185, |
|
"step": 19525 |
|
}, |
|
{ |
|
"epoch": 4.833127317676143, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 2.307692307692308e-06, |
|
"loss": 0.8196, |
|
"step": 19550 |
|
}, |
|
{ |
|
"epoch": 4.839307787391842, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 2.1794871794871793e-06, |
|
"loss": 0.8204, |
|
"step": 19575 |
|
}, |
|
{ |
|
"epoch": 4.84548825710754, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 2.0512820512820513e-06, |
|
"loss": 0.8201, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 4.851668726823238, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 1.9230769230769234e-06, |
|
"loss": 0.8229, |
|
"step": 19625 |
|
}, |
|
{ |
|
"epoch": 4.857849196538937, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 1.7948717948717948e-06, |
|
"loss": 0.8222, |
|
"step": 19650 |
|
}, |
|
{ |
|
"epoch": 4.864029666254636, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.8232, |
|
"step": 19675 |
|
}, |
|
{ |
|
"epoch": 4.870210135970334, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 0.8224, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 4.876390605686032, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 1.4102564102564104e-06, |
|
"loss": 0.8235, |
|
"step": 19725 |
|
}, |
|
{ |
|
"epoch": 4.882571075401731, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 1.282051282051282e-06, |
|
"loss": 0.8164, |
|
"step": 19750 |
|
}, |
|
{ |
|
"epoch": 4.888751545117429, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.153846153846154e-06, |
|
"loss": 0.8217, |
|
"step": 19775 |
|
}, |
|
{ |
|
"epoch": 4.8949320148331275, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.0256410256410257e-06, |
|
"loss": 0.8185, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 4.901112484548825, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 8.974358974358974e-07, |
|
"loss": 0.8256, |
|
"step": 19825 |
|
}, |
|
{ |
|
"epoch": 4.907292954264524, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 7.692307692307694e-07, |
|
"loss": 0.8235, |
|
"step": 19850 |
|
}, |
|
{ |
|
"epoch": 4.913473423980222, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 6.41025641025641e-07, |
|
"loss": 0.8234, |
|
"step": 19875 |
|
}, |
|
{ |
|
"epoch": 4.919653893695921, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 5.128205128205128e-07, |
|
"loss": 0.8208, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 4.925834363411619, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 3.846153846153847e-07, |
|
"loss": 0.8235, |
|
"step": 19925 |
|
}, |
|
{ |
|
"epoch": 4.932014833127318, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 2.564102564102564e-07, |
|
"loss": 0.8218, |
|
"step": 19950 |
|
}, |
|
{ |
|
"epoch": 4.938195302843016, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 1.282051282051282e-07, |
|
"loss": 0.8207, |
|
"step": 19975 |
|
}, |
|
{ |
|
"epoch": 4.944375772558715, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0, |
|
"loss": 0.8191, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 4.944375772558715, |
|
"eval_loss": 1.0042184591293335, |
|
"eval_runtime": 1.5308, |
|
"eval_samples_per_second": 417.441, |
|
"eval_steps_per_second": 1.96, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 4.944622991347343, |
|
"step": 20001, |
|
"total_flos": 9.058565045825516e+19, |
|
"train_loss": 4.2969823139893115e-05, |
|
"train_runtime": 7.0157, |
|
"train_samples_per_second": 729792.908, |
|
"train_steps_per_second": 2850.754 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 20000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 5000, |
|
"total_flos": 9.058565045825516e+19, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|