|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9999558635300348, |
|
"eval_steps": 2000, |
|
"global_step": 11328, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 8.827293993026438e-05, |
|
"eval_accuracy": 0.31068875219818615, |
|
"eval_loss": 5.8817362785339355, |
|
"eval_runtime": 7.2025, |
|
"eval_samples_per_second": 44.152, |
|
"eval_steps_per_second": 0.417, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0008827293993026437, |
|
"grad_norm": 11.6875, |
|
"learning_rate": 1e-05, |
|
"loss": 6.1657, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0017654587986052875, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 2e-05, |
|
"loss": 5.7042, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0026481881979079315, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 3e-05, |
|
"loss": 5.1376, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.003530917597210575, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4e-05, |
|
"loss": 4.8318, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0044136469965132185, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 5e-05, |
|
"loss": 4.5196, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.005296376395815863, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 6e-05, |
|
"loss": 4.2342, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0061791057951185065, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 7e-05, |
|
"loss": 3.9608, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.00706183519442115, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 8e-05, |
|
"loss": 3.6844, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.007944564593723794, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 9e-05, |
|
"loss": 3.5164, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.008827293993026437, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001, |
|
"loss": 3.3741, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.009710023392329082, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 9.999980428024528e-05, |
|
"loss": 3.2844, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.010592752791631726, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 9.999921712251341e-05, |
|
"loss": 3.2223, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.01147548219093437, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 9.999823853140109e-05, |
|
"loss": 3.1739, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.012358211590237013, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 9.99968685145695e-05, |
|
"loss": 3.1245, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.013240940989539656, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 9.999510708274425e-05, |
|
"loss": 3.0851, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0141236703888423, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 9.99929542497152e-05, |
|
"loss": 3.0726, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.015006399788144944, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 9.999041003233641e-05, |
|
"loss": 3.0398, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.015889129187447587, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 9.998747445052606e-05, |
|
"loss": 3.015, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.016771858586750232, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 9.998414752726617e-05, |
|
"loss": 3.0002, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.017654587986052874, |
|
"grad_norm": 0.375, |
|
"learning_rate": 9.998042928860257e-05, |
|
"loss": 2.9856, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01853731738535552, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 9.997631976364451e-05, |
|
"loss": 2.9849, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.019420046784658165, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 9.997181898456465e-05, |
|
"loss": 2.9434, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.020302776183960806, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 9.99669269865986e-05, |
|
"loss": 2.9379, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.02118550558326345, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 9.996164380804483e-05, |
|
"loss": 2.9335, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.022068234982566094, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 9.995596949026422e-05, |
|
"loss": 2.9216, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.02295096438186874, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 9.99499040776798e-05, |
|
"loss": 2.9121, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.02383369378117138, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 9.994344761777644e-05, |
|
"loss": 2.9013, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.024716423180474026, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 9.993660016110036e-05, |
|
"loss": 2.8823, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.02559915257977667, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 9.992936176125893e-05, |
|
"loss": 2.8883, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.026481881979079313, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 9.992173247492e-05, |
|
"loss": 2.8729, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.027364611378381958, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 9.991371236181169e-05, |
|
"loss": 2.8636, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.0282473407776846, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 9.990530148472176e-05, |
|
"loss": 2.8728, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.029130070176987245, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 9.989649990949726e-05, |
|
"loss": 2.8614, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.030012799576289887, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 9.988730770504379e-05, |
|
"loss": 2.8757, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.030895528975592532, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 9.987772494332524e-05, |
|
"loss": 2.8752, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.031778258374895174, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 9.986775169936302e-05, |
|
"loss": 2.8469, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03266098777419782, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 9.985738805123558e-05, |
|
"loss": 2.8615, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.033543717173500465, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 9.984663408007777e-05, |
|
"loss": 2.8378, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.03442644657280311, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 9.983548987008014e-05, |
|
"loss": 2.846, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.03530917597210575, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 9.982395550848835e-05, |
|
"loss": 2.8153, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.036191905371408394, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 9.981203108560256e-05, |
|
"loss": 2.8368, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.03707463477071104, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 9.979971669477651e-05, |
|
"loss": 2.8201, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.037957364170013684, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 9.978701243241703e-05, |
|
"loss": 2.8193, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.03884009356931633, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 9.977391839798309e-05, |
|
"loss": 2.8326, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.03972282296861897, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 9.976043469398517e-05, |
|
"loss": 2.8105, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.04060555236792161, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 9.974656142598435e-05, |
|
"loss": 2.8017, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.04148828176722426, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 9.973229870259152e-05, |
|
"loss": 2.8168, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.0423710111665269, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 9.971764663546656e-05, |
|
"loss": 2.7931, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.04325374056582954, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 9.970260533931744e-05, |
|
"loss": 2.7863, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.04413646996513219, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 9.968717493189927e-05, |
|
"loss": 2.7798, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04501919936443483, |
|
"grad_norm": 0.25, |
|
"learning_rate": 9.967135553401353e-05, |
|
"loss": 2.7948, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.04590192876373748, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 9.965514726950693e-05, |
|
"loss": 2.7823, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.04678465816304012, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 9.963855026527057e-05, |
|
"loss": 2.7968, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.04766738756234276, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 9.962156465123893e-05, |
|
"loss": 2.7856, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.048550116961645406, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 9.960419056038882e-05, |
|
"loss": 2.7871, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.04943284636094805, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 9.958642812873833e-05, |
|
"loss": 2.7721, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.0503155757602507, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 9.956827749534584e-05, |
|
"loss": 2.7855, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.05119830515955334, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 9.954973880230882e-05, |
|
"loss": 2.7887, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.05208103455885598, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 9.953081219476285e-05, |
|
"loss": 2.7762, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.052963763958158626, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 9.951149782088033e-05, |
|
"loss": 2.7848, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.05384649335746127, |
|
"grad_norm": 0.25, |
|
"learning_rate": 9.949179583186945e-05, |
|
"loss": 2.7645, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.054729222756763916, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 9.947170638197296e-05, |
|
"loss": 2.7697, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.055611952156066555, |
|
"grad_norm": 0.25, |
|
"learning_rate": 9.945122962846694e-05, |
|
"loss": 2.7792, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.0564946815553692, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 9.943036573165959e-05, |
|
"loss": 2.7721, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.057377410954671845, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 9.940911485489e-05, |
|
"loss": 2.7592, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.05826014035397449, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 9.93874771645268e-05, |
|
"loss": 2.7639, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.059142869753277136, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 9.936545282996696e-05, |
|
"loss": 2.7603, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.060025599152579774, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 9.934304202363434e-05, |
|
"loss": 2.7615, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.06090832855188242, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 9.932024492097847e-05, |
|
"loss": 2.7559, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.061791057951185065, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 9.929706170047305e-05, |
|
"loss": 2.7478, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.0626737873504877, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 9.927349254361471e-05, |
|
"loss": 2.7611, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.06355651674979035, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 9.924953763492136e-05, |
|
"loss": 2.7436, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.064439246149093, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 9.9225197161931e-05, |
|
"loss": 2.7339, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.06532197554839564, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 9.920047131520007e-05, |
|
"loss": 2.7479, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.06620470494769828, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 9.917536028830205e-05, |
|
"loss": 2.7432, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.06708743434700093, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 9.914986427782588e-05, |
|
"loss": 2.763, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.06797016374630357, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 9.91239834833745e-05, |
|
"loss": 2.7424, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.06885289314560622, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 9.90977181075632e-05, |
|
"loss": 2.7259, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.06973562254490885, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.90710683560181e-05, |
|
"loss": 2.7367, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.0706183519442115, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 9.904403443737454e-05, |
|
"loss": 2.7398, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07150108134351414, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 9.901661656327534e-05, |
|
"loss": 2.7436, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.07238381074281679, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 9.898881494836933e-05, |
|
"loss": 2.7435, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.07326654014211943, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 9.896062981030952e-05, |
|
"loss": 2.7367, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.07414926954142208, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 9.893206136975144e-05, |
|
"loss": 2.7307, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.07503199894072472, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 9.890310985035139e-05, |
|
"loss": 2.7328, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.07591472834002737, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 9.887377547876475e-05, |
|
"loss": 2.7388, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.07679745773933001, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 9.88440584846442e-05, |
|
"loss": 2.7299, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.07768018713863266, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 9.88139591006378e-05, |
|
"loss": 2.7311, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.07856291653793529, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 9.878347756238733e-05, |
|
"loss": 2.7234, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.07944564593723794, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 9.875261410852637e-05, |
|
"loss": 2.7167, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08032837533654058, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 9.872136898067841e-05, |
|
"loss": 2.7305, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.08121110473584323, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 9.8689742423455e-05, |
|
"loss": 2.7279, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.08209383413514587, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 9.865773468445385e-05, |
|
"loss": 2.7065, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.08297656353444852, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 9.862534601425681e-05, |
|
"loss": 2.7143, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.08385929293375116, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 9.859257666642796e-05, |
|
"loss": 2.7277, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.0847420223330538, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 9.85594268975117e-05, |
|
"loss": 2.7248, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.08562475173235645, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 9.852589696703057e-05, |
|
"loss": 2.7202, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.08650748113165908, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 9.849198713748337e-05, |
|
"loss": 2.7214, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.08739021053096173, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 9.845769767434307e-05, |
|
"loss": 2.7024, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.08827293993026437, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 9.842302884605466e-05, |
|
"loss": 2.7205, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08915566932956702, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 9.838798092403311e-05, |
|
"loss": 2.7136, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.09003839872886966, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 9.835255418266128e-05, |
|
"loss": 2.7087, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.09092112812817231, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 9.831674889928766e-05, |
|
"loss": 2.727, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.09180385752747496, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 9.828056535422435e-05, |
|
"loss": 2.7114, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.0926865869267776, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 9.824400383074467e-05, |
|
"loss": 2.7267, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.09356931632608025, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 9.820706461508115e-05, |
|
"loss": 2.7116, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.09445204572538288, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 9.816974799642318e-05, |
|
"loss": 2.7174, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.09533477512468552, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 9.81320542669147e-05, |
|
"loss": 2.7118, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.09621750452398817, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 9.809398372165204e-05, |
|
"loss": 2.7049, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.09710023392329081, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 9.805553665868148e-05, |
|
"loss": 2.7097, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.09798296332259346, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 9.801671337899703e-05, |
|
"loss": 2.6988, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.0988656927218961, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 9.7977514186538e-05, |
|
"loss": 2.7014, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.09974842212119875, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 9.793793938818664e-05, |
|
"loss": 2.7058, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.1006311515205014, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 9.789798929376575e-05, |
|
"loss": 2.7129, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.10151388091980404, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 9.785766421603621e-05, |
|
"loss": 2.7107, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.10239661031910668, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 9.781696447069463e-05, |
|
"loss": 2.7264, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.10327933971840932, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 9.777589037637075e-05, |
|
"loss": 2.6923, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.10416206911771196, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 9.773444225462508e-05, |
|
"loss": 2.7033, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.1050447985170146, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 9.76926204299462e-05, |
|
"loss": 2.7079, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.10592752791631725, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 9.765042522974844e-05, |
|
"loss": 2.7167, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1068102573156199, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 9.760785698436919e-05, |
|
"loss": 2.7181, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.10769298671492254, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 9.756491602706626e-05, |
|
"loss": 2.7034, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.10857571611422519, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 9.752160269401544e-05, |
|
"loss": 2.6932, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.10945844551352783, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 9.74779173243077e-05, |
|
"loss": 2.706, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.11034117491283048, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 9.743386025994665e-05, |
|
"loss": 2.697, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.11122390431213311, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 9.738943184584578e-05, |
|
"loss": 2.7085, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.11210663371143575, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 9.734463242982587e-05, |
|
"loss": 2.7015, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.1129893631107384, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 9.72994623626121e-05, |
|
"loss": 2.688, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.11387209251004105, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 9.725392199783146e-05, |
|
"loss": 2.7, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.11475482190934369, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 9.720801169200993e-05, |
|
"loss": 2.7074, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.11563755130864634, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 9.716173180456965e-05, |
|
"loss": 2.6907, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.11652028070794898, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.711508269782615e-05, |
|
"loss": 2.7017, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.11740301010725163, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 9.706806473698549e-05, |
|
"loss": 2.6937, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.11828573950655427, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 9.702067829014142e-05, |
|
"loss": 2.6991, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.1191684689058569, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 9.697292372827251e-05, |
|
"loss": 2.7, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.12005119830515955, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.692480142523918e-05, |
|
"loss": 2.6776, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.1209339277044622, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 9.687631175778087e-05, |
|
"loss": 2.6791, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.12181665710376484, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 9.682745510551301e-05, |
|
"loss": 2.6928, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.12269938650306748, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 9.677823185092405e-05, |
|
"loss": 2.6816, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.12358211590237013, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 9.672864237937255e-05, |
|
"loss": 2.6905, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.12446484530167277, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 9.667868707908407e-05, |
|
"loss": 2.6788, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.1253475747009754, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 9.662836634114818e-05, |
|
"loss": 2.6776, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.12623030410027805, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 9.657768055951537e-05, |
|
"loss": 2.6698, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.1271130334995807, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 9.6526630130994e-05, |
|
"loss": 2.6884, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.12799576289888334, |
|
"grad_norm": 0.25, |
|
"learning_rate": 9.647521545524716e-05, |
|
"loss": 2.6813, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.128878492298186, |
|
"grad_norm": 0.375, |
|
"learning_rate": 9.642343693478955e-05, |
|
"loss": 2.6874, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.12976122169748863, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 9.637129497498436e-05, |
|
"loss": 2.6745, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.13064395109679128, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 9.631878998404003e-05, |
|
"loss": 2.6895, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.13152668049609392, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 9.626592237300714e-05, |
|
"loss": 2.6957, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.13240940989539657, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 9.621269255577512e-05, |
|
"loss": 2.6802, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.1332921392946992, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 9.615910094906904e-05, |
|
"loss": 2.6879, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.13417486869400186, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 9.610514797244634e-05, |
|
"loss": 2.6831, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.1350575980933045, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 9.605083404829355e-05, |
|
"loss": 2.6818, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.13594032749260715, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 9.5996159601823e-05, |
|
"loss": 2.6825, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.1368230568919098, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 9.594112506106945e-05, |
|
"loss": 2.6874, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.13770578629121244, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 9.588573085688676e-05, |
|
"loss": 2.679, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.13858851569051509, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 9.582997742294458e-05, |
|
"loss": 2.685, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.1394712450898177, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 9.57738651957248e-05, |
|
"loss": 2.6716, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.14035397448912035, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 9.571739461451829e-05, |
|
"loss": 2.684, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.141236703888423, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 9.566056612142138e-05, |
|
"loss": 2.6782, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.14211943328772564, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 9.560338016133241e-05, |
|
"loss": 2.6686, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.14300216268702828, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 9.554583718194827e-05, |
|
"loss": 2.676, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.14388489208633093, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 9.548793763376089e-05, |
|
"loss": 2.6822, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.14476762148563357, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 9.542968197005367e-05, |
|
"loss": 2.6774, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.14565035088493622, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 9.537107064689798e-05, |
|
"loss": 2.6687, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.14653308028423886, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 9.531210412314955e-05, |
|
"loss": 2.674, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.1474158096835415, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 9.525278286044497e-05, |
|
"loss": 2.6848, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.14829853908284416, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 9.519310732319792e-05, |
|
"loss": 2.6602, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.1491812684821468, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 9.513307797859568e-05, |
|
"loss": 2.6783, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.15006399788144945, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 9.507269529659536e-05, |
|
"loss": 2.6784, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.1509467272807521, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 9.501195974992037e-05, |
|
"loss": 2.6796, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.15182945668005474, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 9.495087181405651e-05, |
|
"loss": 2.6764, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.15271218607935738, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 9.488943196724842e-05, |
|
"loss": 2.6727, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.15359491547866003, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 9.482764069049579e-05, |
|
"loss": 2.6753, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.15447764487796267, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 9.476549846754956e-05, |
|
"loss": 2.6651, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.15536037427726532, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 9.470300578490813e-05, |
|
"loss": 2.6674, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.15624310367656793, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 9.464016313181363e-05, |
|
"loss": 2.684, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.15712583307587058, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 9.457697100024798e-05, |
|
"loss": 2.6822, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.15800856247517323, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 9.451342988492915e-05, |
|
"loss": 2.6699, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.15889129187447587, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 9.444954028330716e-05, |
|
"loss": 2.6636, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.15977402127377852, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 9.438530269556035e-05, |
|
"loss": 2.6715, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.16065675067308116, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.432071762459127e-05, |
|
"loss": 2.6564, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.1615394800723838, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.42557855760229e-05, |
|
"loss": 2.6819, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.16242220947168645, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 9.419050705819463e-05, |
|
"loss": 2.6713, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.1633049388709891, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 9.412488258215829e-05, |
|
"loss": 2.6659, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.16418766827029174, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 9.405891266167411e-05, |
|
"loss": 2.6674, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.1650703976695944, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 9.399259781320678e-05, |
|
"loss": 2.6686, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.16595312706889703, |
|
"grad_norm": 0.25, |
|
"learning_rate": 9.392593855592133e-05, |
|
"loss": 2.6784, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.16683585646819968, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 9.38589354116791e-05, |
|
"loss": 2.6759, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.16771858586750232, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 9.379158890503362e-05, |
|
"loss": 2.6753, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.16860131526680497, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 9.37238995632266e-05, |
|
"loss": 2.6811, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.1694840446661076, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 9.365586791618368e-05, |
|
"loss": 2.6739, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.17036677406541026, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 9.358749449651035e-05, |
|
"loss": 2.667, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.1712495034647129, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 9.351877983948775e-05, |
|
"loss": 2.6697, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.17213223286401555, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 9.344972448306854e-05, |
|
"loss": 2.6712, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.17301496226331817, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 9.338032896787263e-05, |
|
"loss": 2.661, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.1738976916626208, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 9.331059383718289e-05, |
|
"loss": 2.6646, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.17478042106192346, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.324051963694107e-05, |
|
"loss": 2.6658, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.1756631504612261, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 9.317010691574338e-05, |
|
"loss": 2.6684, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.17654587986052875, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 9.309935622483623e-05, |
|
"loss": 2.6873, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.17654587986052875, |
|
"eval_accuracy": 0.4942451678300735, |
|
"eval_loss": 2.5415842533111572, |
|
"eval_runtime": 7.2847, |
|
"eval_samples_per_second": 43.653, |
|
"eval_steps_per_second": 0.412, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1774286092598314, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 9.302826811811195e-05, |
|
"loss": 2.6576, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.17831133865913404, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 9.295684315210439e-05, |
|
"loss": 2.6712, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.17919406805843668, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 9.288508188598464e-05, |
|
"loss": 2.6641, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.18007679745773933, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 9.281298488155659e-05, |
|
"loss": 2.6525, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.18095952685704197, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 9.274055270325255e-05, |
|
"loss": 2.6661, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.18184225625634462, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 9.266778591812886e-05, |
|
"loss": 2.6672, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.18272498565564727, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 9.259468509586142e-05, |
|
"loss": 2.6612, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.1836077150549499, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 9.252125080874121e-05, |
|
"loss": 2.6762, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.18449044445425256, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 9.244748363166989e-05, |
|
"loss": 2.6509, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.1853731738535552, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 9.237338414215517e-05, |
|
"loss": 2.6488, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.18625590325285785, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 9.229895292030644e-05, |
|
"loss": 2.661, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.1871386326521605, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 9.22241905488301e-05, |
|
"loss": 2.6639, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.18802136205146314, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 9.214909761302506e-05, |
|
"loss": 2.664, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.18890409145076575, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 9.20736747007782e-05, |
|
"loss": 2.6656, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.1897868208500684, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 9.199792240255963e-05, |
|
"loss": 2.6482, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.19066955024937104, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 9.192184131141824e-05, |
|
"loss": 2.6512, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.1915522796486737, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 9.184543202297688e-05, |
|
"loss": 2.6396, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.19243500904797634, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 9.176869513542789e-05, |
|
"loss": 2.6514, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.19331773844727898, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 9.169163124952825e-05, |
|
"loss": 2.6762, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.19420046784658163, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 9.161424096859493e-05, |
|
"loss": 2.6632, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.19508319724588427, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 9.15365248985002e-05, |
|
"loss": 2.6461, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.19596592664518692, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 9.145848364766691e-05, |
|
"loss": 2.6589, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.19684865604448956, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 9.13801178270636e-05, |
|
"loss": 2.6617, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.1977313854437922, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 9.130142805019984e-05, |
|
"loss": 2.6491, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.19861411484309485, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 9.12224149331214e-05, |
|
"loss": 2.6635, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.1994968442423975, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 9.114307909440539e-05, |
|
"loss": 2.6504, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.20037957364170014, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 9.106342115515543e-05, |
|
"loss": 2.6628, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.2012623030410028, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 9.098344173899684e-05, |
|
"loss": 2.6479, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.20214503244030543, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 9.090314147207168e-05, |
|
"loss": 2.6649, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.20302776183960808, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 9.082252098303387e-05, |
|
"loss": 2.676, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.20391049123891072, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 9.074158090304433e-05, |
|
"loss": 2.6463, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.20479322063821337, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 9.066032186576596e-05, |
|
"loss": 2.6386, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.205675950037516, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 9.057874450735869e-05, |
|
"loss": 2.66, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.20655867943681863, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 9.049684946647459e-05, |
|
"loss": 2.6644, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.20744140883612128, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 9.041463738425268e-05, |
|
"loss": 2.6506, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.20832413823542392, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 9.033210890431418e-05, |
|
"loss": 2.6504, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.20920686763472657, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 9.02492646727572e-05, |
|
"loss": 2.6466, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.2100895970340292, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 9.016610533815187e-05, |
|
"loss": 2.6563, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.21097232643333186, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 9.008263155153516e-05, |
|
"loss": 2.6583, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.2118550558326345, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 8.999884396640581e-05, |
|
"loss": 2.6557, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.21273778523193715, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 8.991474323871929e-05, |
|
"loss": 2.6673, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.2136205146312398, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 8.983033002688252e-05, |
|
"loss": 2.6577, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.21450324403054244, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 8.974560499174883e-05, |
|
"loss": 2.6645, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.21538597342984508, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 8.966056879661275e-05, |
|
"loss": 2.6603, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.21626870282914773, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 8.95752221072048e-05, |
|
"loss": 2.6586, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.21715143222845038, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 8.948956559168632e-05, |
|
"loss": 2.659, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.21803416162775302, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 8.940359992064418e-05, |
|
"loss": 2.6559, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.21891689102705567, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 8.93173257670856e-05, |
|
"loss": 2.6566, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.2197996204263583, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 8.923074380643281e-05, |
|
"loss": 2.6384, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.22068234982566096, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 8.914385471651784e-05, |
|
"loss": 2.6573, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.22156507922496357, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 8.905665917757712e-05, |
|
"loss": 2.6591, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.22244780862426622, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 8.896915787224622e-05, |
|
"loss": 2.6562, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.22333053802356886, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 8.888135148555455e-05, |
|
"loss": 2.6495, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.2242132674228715, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 8.879324070491984e-05, |
|
"loss": 2.6478, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.22509599682217415, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 8.870482622014292e-05, |
|
"loss": 2.6387, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.2259787262214768, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 8.861610872340222e-05, |
|
"loss": 2.656, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.22686145562077945, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 8.852708890924845e-05, |
|
"loss": 2.65, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.2277441850200821, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 8.843776747459906e-05, |
|
"loss": 2.6446, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.22862691441938474, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 8.834814511873277e-05, |
|
"loss": 2.6418, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.22950964381868738, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 8.825822254328424e-05, |
|
"loss": 2.6581, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.23039237321799003, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 8.816800045223843e-05, |
|
"loss": 2.6397, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.23127510261729267, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 8.807747955192516e-05, |
|
"loss": 2.638, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.23215783201659532, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 8.79866605510136e-05, |
|
"loss": 2.6362, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.23304056141589796, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 8.789554416050661e-05, |
|
"loss": 2.6515, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.2339232908152006, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 8.780413109373532e-05, |
|
"loss": 2.6462, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.23480602021450325, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 8.771242206635341e-05, |
|
"loss": 2.6401, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.2356887496138059, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 8.762041779633167e-05, |
|
"loss": 2.633, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.23657147901310854, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 8.75281190039522e-05, |
|
"loss": 2.6553, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.2374542084124112, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 8.743552641180287e-05, |
|
"loss": 2.6483, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.2383369378117138, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 8.734264074477167e-05, |
|
"loss": 2.6421, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.23921966721101645, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 8.724946273004102e-05, |
|
"loss": 2.6469, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.2401023966103191, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 8.7155993097082e-05, |
|
"loss": 2.6436, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.24098512600962174, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 8.706223257764878e-05, |
|
"loss": 2.651, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.2418678554089244, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 8.69681819057728e-05, |
|
"loss": 2.6524, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.24275058480822703, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 8.687384181775704e-05, |
|
"loss": 2.6303, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.24363331420752968, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 8.677921305217023e-05, |
|
"loss": 2.6415, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.24451604360683232, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 8.668429634984114e-05, |
|
"loss": 2.6243, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.24539877300613497, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 8.658909245385273e-05, |
|
"loss": 2.6362, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.2462815024054376, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 8.64936021095363e-05, |
|
"loss": 2.6479, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.24716423180474026, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 8.639782606446574e-05, |
|
"loss": 2.6345, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.2480469612040429, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 8.63017650684516e-05, |
|
"loss": 2.6506, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.24892969060334555, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 8.620541987353529e-05, |
|
"loss": 2.6516, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.2498124200026482, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 8.610879123398307e-05, |
|
"loss": 2.6419, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.2506951494019508, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 8.601187990628035e-05, |
|
"loss": 2.6398, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.2515778788012535, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 8.591468664912554e-05, |
|
"loss": 2.6197, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.2524606082005561, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 8.581721222342427e-05, |
|
"loss": 2.6387, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.2533433375998588, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 8.571945739228337e-05, |
|
"loss": 2.6444, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.2542260669991614, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 8.56214229210049e-05, |
|
"loss": 2.6342, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.25510879639846407, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 8.552310957708017e-05, |
|
"loss": 2.6653, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.2559915257977667, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 8.542451813018372e-05, |
|
"loss": 2.6429, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.25687425519706936, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 8.53256493521673e-05, |
|
"loss": 2.6366, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.257756984596372, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 8.522650401705383e-05, |
|
"loss": 2.6578, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.25863971399567465, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 8.512708290103134e-05, |
|
"loss": 2.6271, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.25952244339497726, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 8.502738678244689e-05, |
|
"loss": 2.654, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.26040517279427994, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 8.492741644180046e-05, |
|
"loss": 2.6578, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.26128790219358256, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 8.482717266173889e-05, |
|
"loss": 2.6654, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.26217063159288523, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 8.47266562270497e-05, |
|
"loss": 2.6488, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.26305336099218785, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 8.462586792465494e-05, |
|
"loss": 2.6538, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.26393609039149046, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 8.452480854360512e-05, |
|
"loss": 2.6424, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.26481881979079314, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 8.442347887507291e-05, |
|
"loss": 2.6501, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.26570154919009575, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 8.432187971234704e-05, |
|
"loss": 2.6407, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.2665842785893984, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 8.422001185082602e-05, |
|
"loss": 2.6435, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.26746700798870104, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 8.411787608801199e-05, |
|
"loss": 2.6409, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.2683497373880037, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 8.401547322350438e-05, |
|
"loss": 2.6317, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.26923246678730633, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 8.391280405899375e-05, |
|
"loss": 2.6278, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.270115196186609, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 8.380986939825542e-05, |
|
"loss": 2.6442, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.2709979255859116, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 8.370667004714329e-05, |
|
"loss": 2.6286, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.2718806549852143, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 8.36032068135834e-05, |
|
"loss": 2.6341, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.2727633843845169, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 8.34994805075677e-05, |
|
"loss": 2.638, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.2736461137838196, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 8.339549194114771e-05, |
|
"loss": 2.6486, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.2745288431831222, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 8.329124192842806e-05, |
|
"loss": 2.6425, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.2754115725824249, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 8.318673128556025e-05, |
|
"loss": 2.6637, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.2762943019817275, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 8.308196083073614e-05, |
|
"loss": 2.6533, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.27717703138103017, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 8.297693138418168e-05, |
|
"loss": 2.6561, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.2780597607803328, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 8.287164376815035e-05, |
|
"loss": 2.6431, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.2789424901796354, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 8.276609880691682e-05, |
|
"loss": 2.6482, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.2798252195789381, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 8.266029732677043e-05, |
|
"loss": 2.6505, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.2807079489782407, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 8.255424015600876e-05, |
|
"loss": 2.6284, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.28159067837754337, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 8.244792812493119e-05, |
|
"loss": 2.6487, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.282473407776846, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 8.234136206583227e-05, |
|
"loss": 2.6439, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.28335613717614866, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 8.223454281299533e-05, |
|
"loss": 2.6303, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.2842388665754513, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 8.212747120268587e-05, |
|
"loss": 2.6383, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.28512159597475395, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 8.20201480731451e-05, |
|
"loss": 2.6437, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.28600432537405657, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 8.191257426458326e-05, |
|
"loss": 2.6279, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.28688705477335924, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 8.180475061917312e-05, |
|
"loss": 2.6436, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.28776978417266186, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 8.169667798104339e-05, |
|
"loss": 2.6419, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.28865251357196453, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 8.158835719627206e-05, |
|
"loss": 2.6335, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.28953524297126715, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 8.147978911287988e-05, |
|
"loss": 2.6538, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.2904179723705698, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 8.137097458082353e-05, |
|
"loss": 2.6449, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.29130070176987244, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 8.126191445198918e-05, |
|
"loss": 2.6366, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.2921834311691751, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 8.115260958018571e-05, |
|
"loss": 2.634, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.29306616056847773, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 8.104306082113802e-05, |
|
"loss": 2.6574, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.2939488899677804, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 8.093326903248033e-05, |
|
"loss": 2.6419, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.294831619367083, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 8.082323507374956e-05, |
|
"loss": 2.628, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.29571434876638564, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 8.071295980637847e-05, |
|
"loss": 2.6513, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.2965970781656883, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 8.060244409368897e-05, |
|
"loss": 2.6383, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.29747980756499093, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 8.049168880088543e-05, |
|
"loss": 2.6321, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.2983625369642936, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 8.038069479504775e-05, |
|
"loss": 2.6384, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.2992452663635962, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 8.026946294512477e-05, |
|
"loss": 2.635, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.3001279957628989, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 8.015799412192726e-05, |
|
"loss": 2.6338, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.3010107251622015, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 8.004628919812125e-05, |
|
"loss": 2.6216, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.3018934545615042, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 7.993434904822119e-05, |
|
"loss": 2.6542, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.3027761839608068, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 7.9822174548583e-05, |
|
"loss": 2.6336, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.3036589133601095, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 7.970976657739731e-05, |
|
"loss": 2.6511, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.3045416427594121, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 7.959712601468253e-05, |
|
"loss": 2.6327, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.30542437215871476, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.948425374227799e-05, |
|
"loss": 2.6391, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.3063071015580174, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 7.937115064383704e-05, |
|
"loss": 2.6476, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.30718983095732005, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 7.925781760482011e-05, |
|
"loss": 2.6443, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.30807256035662267, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 7.914425551248781e-05, |
|
"loss": 2.6388, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.30895528975592534, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 7.903046525589386e-05, |
|
"loss": 2.6365, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.30983801915522796, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 7.891644772587837e-05, |
|
"loss": 2.647, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.31072074855453063, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 7.880220381506065e-05, |
|
"loss": 2.6398, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.31160347795383325, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 7.868773441783228e-05, |
|
"loss": 2.6379, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.31248620735313587, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 7.857304043035017e-05, |
|
"loss": 2.6211, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.31336893675243854, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 7.845812275052948e-05, |
|
"loss": 2.6467, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.31425166615174116, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 7.834298227803665e-05, |
|
"loss": 2.6416, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.31513439555104383, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 7.822761991428223e-05, |
|
"loss": 2.6576, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.31601712495034645, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 7.811203656241397e-05, |
|
"loss": 2.6294, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.3168998543496491, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 7.799623312730971e-05, |
|
"loss": 2.6442, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.31778258374895174, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 7.788021051557022e-05, |
|
"loss": 2.6513, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.3186653131482544, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 7.77639696355122e-05, |
|
"loss": 2.6454, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.31954804254755703, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 7.764751139716109e-05, |
|
"loss": 2.6339, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.3204307719468597, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 7.753083671224404e-05, |
|
"loss": 2.6511, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.3213135013461623, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 7.741394649418264e-05, |
|
"loss": 2.6476, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.322196230745465, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 7.72968416580859e-05, |
|
"loss": 2.6456, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.3230789601447676, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 7.717952312074304e-05, |
|
"loss": 2.6436, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.3239616895440703, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 7.706199180061622e-05, |
|
"loss": 2.6234, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.3248444189433729, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 7.694424861783351e-05, |
|
"loss": 2.6349, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.3257271483426756, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 7.682629449418159e-05, |
|
"loss": 2.651, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.3266098777419782, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 7.670813035309855e-05, |
|
"loss": 2.6523, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.32749260714128087, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 7.658975711966664e-05, |
|
"loss": 2.6352, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.3283753365405835, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 7.647117572060509e-05, |
|
"loss": 2.6395, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.3292580659398861, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 7.635238708426277e-05, |
|
"loss": 2.6431, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.3301407953391888, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 7.6233392140611e-05, |
|
"loss": 2.6479, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.3310235247384914, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 7.611419182123625e-05, |
|
"loss": 2.6355, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.33190625413779407, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 7.599478705933278e-05, |
|
"loss": 2.6458, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.3327889835370967, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 7.587517878969544e-05, |
|
"loss": 2.6242, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.33367171293639936, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 7.575536794871223e-05, |
|
"loss": 2.6361, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.334554442335702, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 7.563535547435713e-05, |
|
"loss": 2.635, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.33543717173500465, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 7.551514230618265e-05, |
|
"loss": 2.6397, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.33631990113430726, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 7.539472938531242e-05, |
|
"loss": 2.6383, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.33720263053360994, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 7.527411765443392e-05, |
|
"loss": 2.6261, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.33808535993291255, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 7.51533080577911e-05, |
|
"loss": 2.6423, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.3389680893322152, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 7.503230154117698e-05, |
|
"loss": 2.6283, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.33985081873151785, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 7.491109905192614e-05, |
|
"loss": 2.6351, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.3407335481308205, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 7.478970153890747e-05, |
|
"loss": 2.6321, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.34161627753012314, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 7.466810995251661e-05, |
|
"loss": 2.6127, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.3424990069294258, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 7.454632524466857e-05, |
|
"loss": 2.6389, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.3433817363287284, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 7.442434836879032e-05, |
|
"loss": 2.6247, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.3442644657280311, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 7.430218027981318e-05, |
|
"loss": 2.634, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.3451471951273337, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 7.417982193416553e-05, |
|
"loss": 2.6388, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.34602992452663633, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 7.405727428976515e-05, |
|
"loss": 2.6456, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.346912653925939, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 7.393453830601184e-05, |
|
"loss": 2.6407, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.3477953833252416, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 7.38116149437799e-05, |
|
"loss": 2.6438, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.3486781127245443, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 7.36885051654105e-05, |
|
"loss": 2.6213, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.3495608421238469, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 7.356520993470428e-05, |
|
"loss": 2.6393, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.3504435715231496, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 7.344173021691374e-05, |
|
"loss": 2.6329, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.3513263009224522, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 7.331806697873567e-05, |
|
"loss": 2.6339, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.3522090303217549, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 7.319422118830364e-05, |
|
"loss": 2.6397, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.3530917597210575, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 7.307019381518033e-05, |
|
"loss": 2.6477, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3530917597210575, |
|
"eval_accuracy": 0.497904331866596, |
|
"eval_loss": 2.5108346939086914, |
|
"eval_runtime": 7.1683, |
|
"eval_samples_per_second": 44.362, |
|
"eval_steps_per_second": 0.419, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.35397448912036017, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 7.294598583035004e-05, |
|
"loss": 2.6278, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.3548572185196628, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 7.2821598206211e-05, |
|
"loss": 2.6518, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.35573994791896546, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 7.269703191656786e-05, |
|
"loss": 2.6343, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.3566226773182681, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 7.257228793662393e-05, |
|
"loss": 2.6334, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.35750540671757075, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 7.24473672429737e-05, |
|
"loss": 2.6371, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.35838813611687337, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 7.232227081359501e-05, |
|
"loss": 2.6252, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.35927086551617604, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 7.219699962784159e-05, |
|
"loss": 2.629, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.36015359491547866, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 7.207155466643528e-05, |
|
"loss": 2.6241, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.3610363243147813, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 7.194593691145834e-05, |
|
"loss": 2.6259, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.36191905371408395, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 7.182014734634587e-05, |
|
"loss": 2.6297, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.36280178311338657, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 7.169418695587791e-05, |
|
"loss": 2.6244, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.36368451251268924, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 7.156805672617197e-05, |
|
"loss": 2.6242, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.36456724191199186, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 7.144175764467516e-05, |
|
"loss": 2.6189, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.36544997131129453, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 7.131529070015647e-05, |
|
"loss": 2.6543, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.36633270071059715, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 7.118865688269908e-05, |
|
"loss": 2.6412, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.3672154301098998, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 7.106185718369259e-05, |
|
"loss": 2.632, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.36809815950920244, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 7.093489259582522e-05, |
|
"loss": 2.6252, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.3689808889085051, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 7.080776411307609e-05, |
|
"loss": 2.62, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.36986361830780773, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 7.068047273070745e-05, |
|
"loss": 2.6305, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.3707463477071104, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 7.055301944525679e-05, |
|
"loss": 2.6414, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.371629077106413, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 7.042540525452916e-05, |
|
"loss": 2.6361, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.3725118065057157, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 7.029763115758927e-05, |
|
"loss": 2.6278, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.3733945359050183, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 7.016969815475373e-05, |
|
"loss": 2.6235, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.374277265304321, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 7.004160724758318e-05, |
|
"loss": 2.6404, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.3751599947036236, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 6.991335943887445e-05, |
|
"loss": 2.6512, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.3760427241029263, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 6.978495573265271e-05, |
|
"loss": 2.6292, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.3769254535022289, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 6.965639713416365e-05, |
|
"loss": 2.6219, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.3778081829015315, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 6.952768464986558e-05, |
|
"loss": 2.6488, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.3786909123008342, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 6.939881928742153e-05, |
|
"loss": 2.6413, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.3795736417001368, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 6.926980205569136e-05, |
|
"loss": 2.6315, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.38045637109943947, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 6.914063396472392e-05, |
|
"loss": 2.6297, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.3813391004987421, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 6.90113160257491e-05, |
|
"loss": 2.6413, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.38222182989804476, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 6.888184925116991e-05, |
|
"loss": 2.6318, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.3831045592973474, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 6.875223465455456e-05, |
|
"loss": 2.6235, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.38398728869665005, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 6.862247325062854e-05, |
|
"loss": 2.6341, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.38487001809595267, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 6.849256605526663e-05, |
|
"loss": 2.6208, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.38575274749525534, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 6.836251408548506e-05, |
|
"loss": 2.6244, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.38663547689455796, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 6.823231835943336e-05, |
|
"loss": 2.6548, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.38751820629386063, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 6.810197989638658e-05, |
|
"loss": 2.6418, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.38840093569316325, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 6.79714997167372e-05, |
|
"loss": 2.6277, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.3892836650924659, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 6.784087884198717e-05, |
|
"loss": 2.6405, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.39016639449176854, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 6.771011829473989e-05, |
|
"loss": 2.6341, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.3910491238910712, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 6.75792190986923e-05, |
|
"loss": 2.6282, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.39193185329037383, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 6.744818227862668e-05, |
|
"loss": 2.6219, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.3928145826896765, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 6.731700886040284e-05, |
|
"loss": 2.621, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.3936973120889791, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 6.718569987094993e-05, |
|
"loss": 2.6387, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.39458004148828174, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 6.705425633825851e-05, |
|
"loss": 2.6402, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.3954627708875844, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 6.692267929137239e-05, |
|
"loss": 2.6313, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.39634550028688703, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 6.679096976038066e-05, |
|
"loss": 2.6274, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.3972282296861897, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 6.665912877640963e-05, |
|
"loss": 2.6202, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.3981109590854923, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 6.652715737161468e-05, |
|
"loss": 2.6326, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.398993688484795, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 6.639505657917226e-05, |
|
"loss": 2.6372, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.3998764178840976, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 6.626282743327174e-05, |
|
"loss": 2.6441, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.4007591472834003, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 6.613047096910739e-05, |
|
"loss": 2.6544, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.4016418766827029, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 6.599798822287018e-05, |
|
"loss": 2.6333, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.4025246060820056, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 6.586538023173972e-05, |
|
"loss": 2.6251, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.4034073354813082, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 6.573264803387618e-05, |
|
"loss": 2.6134, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.40429006488061087, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 6.559979266841209e-05, |
|
"loss": 2.6325, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.4051727942799135, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.54668151754442e-05, |
|
"loss": 2.649, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.40605552367921616, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 6.53337165960254e-05, |
|
"loss": 2.6455, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.4069382530785188, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 6.520049797215659e-05, |
|
"loss": 2.6323, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.40782098247782145, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 6.506716034677838e-05, |
|
"loss": 2.6184, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.40870371187712407, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 6.493370476376308e-05, |
|
"loss": 2.6307, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.40958644127642674, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 6.480013226790646e-05, |
|
"loss": 2.6404, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.41046917067572936, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 6.466644390491954e-05, |
|
"loss": 2.6364, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.411351900075032, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 6.453264072142048e-05, |
|
"loss": 2.6346, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.41223462947433465, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 6.439872376492634e-05, |
|
"loss": 2.638, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.41311735887363726, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 6.426469408384486e-05, |
|
"loss": 2.6309, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.41400008827293994, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 6.413055272746628e-05, |
|
"loss": 2.6294, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.41488281767224255, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 6.399630074595515e-05, |
|
"loss": 2.6368, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.4157655470715452, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 6.386193919034207e-05, |
|
"loss": 2.6235, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.41664827647084784, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 6.372746911251548e-05, |
|
"loss": 2.6224, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.4175310058701505, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 6.359289156521338e-05, |
|
"loss": 2.6258, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.41841373526945314, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 6.345820760201516e-05, |
|
"loss": 2.6244, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.4192964646687558, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 6.33234182773333e-05, |
|
"loss": 2.637, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.4201791940680584, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 6.318852464640516e-05, |
|
"loss": 2.6284, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.4210619234673611, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 6.305352776528468e-05, |
|
"loss": 2.6276, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.4219446528666637, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 6.29184286908341e-05, |
|
"loss": 2.605, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.4228273822659664, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 6.278322848071572e-05, |
|
"loss": 2.6241, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.423710111665269, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 6.264792819338366e-05, |
|
"loss": 2.6347, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.4245928410645717, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 6.251252888807544e-05, |
|
"loss": 2.615, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.4254755704638743, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 6.237703162480384e-05, |
|
"loss": 2.6338, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.42635829986317697, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 6.224143746434847e-05, |
|
"loss": 2.6288, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.4272410292624796, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 6.21057474682476e-05, |
|
"loss": 2.6013, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.4281237586617822, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 6.196996269878973e-05, |
|
"loss": 2.6295, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.4290064880610849, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 6.183408421900532e-05, |
|
"loss": 2.6321, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.4298892174603875, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 6.16981130926585e-05, |
|
"loss": 2.6196, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.43077194685969017, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 6.156205038423867e-05, |
|
"loss": 2.6076, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.4316546762589928, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 6.142589715895223e-05, |
|
"loss": 2.6335, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.43253740565829546, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 6.128965448271422e-05, |
|
"loss": 2.6463, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.4334201350575981, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 6.115332342213997e-05, |
|
"loss": 2.6223, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.43430286445690075, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 6.1016905044536744e-05, |
|
"loss": 2.6422, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.43518559385620337, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 6.088040041789539e-05, |
|
"loss": 2.6299, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.43606832325550604, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 6.0743810610882005e-05, |
|
"loss": 2.6315, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.43695105265480866, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 6.060713669282951e-05, |
|
"loss": 2.621, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.43783378205411133, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 6.0470379733729374e-05, |
|
"loss": 2.6292, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.43871651145341395, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 6.0333540804223086e-05, |
|
"loss": 2.6342, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.4395992408527166, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 6.0196620975593935e-05, |
|
"loss": 2.6236, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.44048197025201924, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 6.005962131975855e-05, |
|
"loss": 2.6246, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.4413646996513219, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 5.992254290925847e-05, |
|
"loss": 2.6367, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.44224742905062453, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 5.978538681725182e-05, |
|
"loss": 2.623, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.44313015844992715, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 5.964815411750486e-05, |
|
"loss": 2.6337, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.4440128878492298, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 5.951084588438361e-05, |
|
"loss": 2.6208, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.44489561724853244, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 5.937346319284542e-05, |
|
"loss": 2.6353, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.4457783466478351, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 5.923600711843056e-05, |
|
"loss": 2.6409, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.44666107604713773, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 5.909847873725378e-05, |
|
"loss": 2.6264, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.4475438054464404, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 5.896087912599593e-05, |
|
"loss": 2.6371, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.448426534845743, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 5.882320936189551e-05, |
|
"loss": 2.6229, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.4493092642450457, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 5.8685470522740194e-05, |
|
"loss": 2.6237, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.4501919936443483, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 5.8547663686858475e-05, |
|
"loss": 2.6216, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.451074723043651, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 5.840978993311116e-05, |
|
"loss": 2.6376, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.4519574524429536, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 5.827185034088292e-05, |
|
"loss": 2.6413, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.4528401818422563, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 5.813384599007389e-05, |
|
"loss": 2.6376, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.4537229112415589, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 5.7995777961091167e-05, |
|
"loss": 2.6248, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.45460564064086156, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 5.785764733484041e-05, |
|
"loss": 2.6199, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.4554883700401642, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 5.771945519271729e-05, |
|
"loss": 2.6338, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.45637109943946685, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 5.758120261659909e-05, |
|
"loss": 2.6285, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.45725382883876947, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 5.744289068883624e-05, |
|
"loss": 2.6269, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.45813655823807214, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 5.7304520492243786e-05, |
|
"loss": 2.6404, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.45901928763737476, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 5.716609311009298e-05, |
|
"loss": 2.6181, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.4599020170366774, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 5.7027609626102727e-05, |
|
"loss": 2.6245, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.46078474643598005, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 5.688907112443119e-05, |
|
"loss": 2.6233, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.46166747583528267, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 5.675047868966723e-05, |
|
"loss": 2.6241, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.46255020523458534, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 5.6611833406821945e-05, |
|
"loss": 2.6386, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.46343293463388796, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 5.6473136361320144e-05, |
|
"loss": 2.618, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.46431566403319063, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 5.633438863899192e-05, |
|
"loss": 2.6352, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.46519839343249325, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 5.6195591326064056e-05, |
|
"loss": 2.6421, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.4660811228317959, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 5.605674550915161e-05, |
|
"loss": 2.6392, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.46696385223109854, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 5.591785227524935e-05, |
|
"loss": 2.6228, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.4678465816304012, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 5.577891271172328e-05, |
|
"loss": 2.6408, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.46872931102970383, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 5.563992790630205e-05, |
|
"loss": 2.6193, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.4696120404290065, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 5.550089894706856e-05, |
|
"loss": 2.6388, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.4704947698283091, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 5.536182692245138e-05, |
|
"loss": 2.6328, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.4713774992276118, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 5.52227129212162e-05, |
|
"loss": 2.6255, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.4722602286269144, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 5.508355803245733e-05, |
|
"loss": 2.614, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.4731429580262171, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 5.494436334558922e-05, |
|
"loss": 2.6339, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.4740256874255197, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 5.480512995033785e-05, |
|
"loss": 2.6168, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.4749084168248224, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 5.466585893673229e-05, |
|
"loss": 2.6204, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.475791146224125, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 5.452655139509607e-05, |
|
"loss": 2.6301, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.4766738756234276, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 5.438720841603869e-05, |
|
"loss": 2.6382, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.4775566050227303, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 5.42478310904471e-05, |
|
"loss": 2.6265, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.4784393344220329, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 5.4108420509477145e-05, |
|
"loss": 2.631, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.4793220638213356, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 5.3968977764545025e-05, |
|
"loss": 2.6213, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.4802047932206382, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 5.3829503947318726e-05, |
|
"loss": 2.6119, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.48108752261994087, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 5.3690000149709505e-05, |
|
"loss": 2.6383, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.4819702520192435, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 5.3550467463863294e-05, |
|
"loss": 2.6316, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.48285298141854616, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 5.3410906982152257e-05, |
|
"loss": 2.6253, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.4837357108178488, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 5.327131979716608e-05, |
|
"loss": 2.622, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.48461844021715145, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 5.3131707001703604e-05, |
|
"loss": 2.6402, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.48550116961645406, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 5.299206968876407e-05, |
|
"loss": 2.6287, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.48638389901575674, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 5.2852408951538704e-05, |
|
"loss": 2.6174, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.48726662841505936, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 5.2712725883402126e-05, |
|
"loss": 2.6355, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.48814935781436203, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 5.257302157790377e-05, |
|
"loss": 2.6322, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.48903208721366465, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 5.243329712875934e-05, |
|
"loss": 2.6241, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.4899148166129673, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 5.229355362984221e-05, |
|
"loss": 2.6364, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.49079754601226994, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 5.215379217517492e-05, |
|
"loss": 2.6495, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.4916802754115726, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 5.2014013858920597e-05, |
|
"loss": 2.6038, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.4925630048108752, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 5.187421977537433e-05, |
|
"loss": 2.6297, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.49344573421017784, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 5.1734411018954674e-05, |
|
"loss": 2.6243, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.4943284636094805, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 5.159458868419505e-05, |
|
"loss": 2.6305, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.49521119300878313, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 5.145475386573518e-05, |
|
"loss": 2.6353, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.4960939224080858, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 5.1314907658312514e-05, |
|
"loss": 2.6267, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.4969766518073884, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 5.117505115675368e-05, |
|
"loss": 2.6248, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.4978593812066911, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 5.1035185455965894e-05, |
|
"loss": 2.6212, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.4987421106059937, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 5.0895311650928344e-05, |
|
"loss": 2.6456, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.4996248400052964, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 5.075543083668374e-05, |
|
"loss": 2.6206, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.5005075694045991, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 5.06155441083296e-05, |
|
"loss": 2.6386, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.5013902988039016, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 5.04756525610098e-05, |
|
"loss": 2.6223, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.5022730282032043, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 5.0335757289905885e-05, |
|
"loss": 2.6338, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.503155757602507, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 5.019585939022859e-05, |
|
"loss": 2.6239, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.5040384870018096, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 5.005595995720923e-05, |
|
"loss": 2.6258, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.5049212164011122, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 4.991606008609111e-05, |
|
"loss": 2.6213, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.5058039458004149, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 4.9776160872120955e-05, |
|
"loss": 2.631, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.5066866751997176, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 4.963626341054038e-05, |
|
"loss": 2.6331, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.5075694045990202, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 4.949636879657725e-05, |
|
"loss": 2.6257, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.5084521339983228, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 4.935647812543712e-05, |
|
"loss": 2.6318, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.5093348633976255, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 4.9216592492294746e-05, |
|
"loss": 2.6316, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.5102175927969281, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 4.9076712992285386e-05, |
|
"loss": 2.6222, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.5111003221962307, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 4.893684072049629e-05, |
|
"loss": 2.6312, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.5119830515955334, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 4.879697677195811e-05, |
|
"loss": 2.6249, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.512865780994836, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 4.8657122241636395e-05, |
|
"loss": 2.6336, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.5137485103941387, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 4.851727822442291e-05, |
|
"loss": 2.6424, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.5146312397934413, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 4.837744581512708e-05, |
|
"loss": 2.6247, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.515513969192744, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 4.823762610846755e-05, |
|
"loss": 2.6188, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.5163966985920466, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 4.8097820199063456e-05, |
|
"loss": 2.6359, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.5172794279913493, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 4.795802918142592e-05, |
|
"loss": 2.615, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.5181621573906519, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 4.781825414994949e-05, |
|
"loss": 2.6306, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.5190448867899545, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 4.7678496198903567e-05, |
|
"loss": 2.6259, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.5199276161892572, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 4.753875642242383e-05, |
|
"loss": 2.6101, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.5208103455885599, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 4.7399035914503655e-05, |
|
"loss": 2.6196, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.5216930749878624, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 4.7259335768985596e-05, |
|
"loss": 2.6288, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.5225758043871651, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 4.711965707955279e-05, |
|
"loss": 2.624, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.5234585337864678, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 4.698000093972037e-05, |
|
"loss": 2.6236, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.5243412631857705, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 4.684036844282694e-05, |
|
"loss": 2.6206, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.525223992585073, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 4.6700760682026066e-05, |
|
"loss": 2.6388, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.5261067219843757, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 4.6561178750277604e-05, |
|
"loss": 2.6421, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.5269894513836784, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 4.642162374033918e-05, |
|
"loss": 2.6268, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.5278721807829809, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 4.628209674475773e-05, |
|
"loss": 2.6289, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.5287549101822836, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 4.6142598855860795e-05, |
|
"loss": 2.6257, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.5296376395815863, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 4.600313116574809e-05, |
|
"loss": 2.6277, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5296376395815863, |
|
"eval_accuracy": 0.49875058554303836, |
|
"eval_loss": 2.503570795059204, |
|
"eval_runtime": 7.1297, |
|
"eval_samples_per_second": 44.602, |
|
"eval_steps_per_second": 0.421, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.530520368980889, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 4.5863694766282894e-05, |
|
"loss": 2.615, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 0.5314030983801915, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 4.572429074908354e-05, |
|
"loss": 2.626, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.5322858277794942, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 4.558492020551482e-05, |
|
"loss": 2.6262, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.5331685571787969, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 4.544558422667944e-05, |
|
"loss": 2.6279, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.5340512865780995, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 4.530628390340963e-05, |
|
"loss": 2.639, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.5349340159774021, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 4.5167020326258326e-05, |
|
"loss": 2.612, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.5358167453767048, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 4.502779458549087e-05, |
|
"loss": 2.6433, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 0.5366994747760074, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 4.488860777107636e-05, |
|
"loss": 2.6328, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.5375822041753101, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 4.4749460972679205e-05, |
|
"loss": 2.6262, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.5384649335746127, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 4.461035527965046e-05, |
|
"loss": 2.6392, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.5393476629739153, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 4.4471291781019425e-05, |
|
"loss": 2.6324, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 0.540230392373218, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 4.4332271565485053e-05, |
|
"loss": 2.6216, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.5411131217725207, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 4.419329572140744e-05, |
|
"loss": 2.624, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 0.5419958511718233, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 4.405436533679931e-05, |
|
"loss": 2.624, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.5428785805711259, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 4.3915481499317484e-05, |
|
"loss": 2.6225, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.5437613099704286, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 4.3776645296254415e-05, |
|
"loss": 2.6373, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.5446440393697312, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 4.363785781452958e-05, |
|
"loss": 2.631, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 0.5455267687690338, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 4.349912014068107e-05, |
|
"loss": 2.6262, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.5464094981683365, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 4.3360433360857e-05, |
|
"loss": 2.6247, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 0.5472922275676392, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 4.322179856080711e-05, |
|
"loss": 2.6323, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.5481749569669417, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 4.308321682587415e-05, |
|
"loss": 2.6169, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.5490576863662444, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 4.2944689240985406e-05, |
|
"loss": 2.638, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.5499404157655471, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 4.2806216890644337e-05, |
|
"loss": 2.633, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 0.5508231451648498, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 4.266780085892188e-05, |
|
"loss": 2.6206, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.5517058745641523, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 4.252944222944813e-05, |
|
"loss": 2.6257, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.552588603963455, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 4.239114208540374e-05, |
|
"loss": 2.6352, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.5534713333627577, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 4.225290150951155e-05, |
|
"loss": 2.6231, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.5543540627620603, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 4.2114721584028014e-05, |
|
"loss": 2.6201, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.5552367921613629, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 4.1976603390734756e-05, |
|
"loss": 2.6209, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 0.5561195215606656, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 4.183854801093017e-05, |
|
"loss": 2.6257, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.5570022509599682, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 4.170055652542083e-05, |
|
"loss": 2.6307, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 0.5578849803592708, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 4.156263001451314e-05, |
|
"loss": 2.629, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.5587677097585735, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 4.1424769558004786e-05, |
|
"loss": 2.6266, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.5596504391578762, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 4.12869762351764e-05, |
|
"loss": 2.6237, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.5605331685571788, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 4.114925112478299e-05, |
|
"loss": 2.628, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.5614158979564814, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 4.1011595305045504e-05, |
|
"loss": 2.6286, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.5622986273557841, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 4.0874009853642515e-05, |
|
"loss": 2.6312, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 0.5631813567550867, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 4.073649584770165e-05, |
|
"loss": 2.6292, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.5640640861543894, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 4.059905436379121e-05, |
|
"loss": 2.6139, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 0.564946815553692, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 4.046168647791171e-05, |
|
"loss": 2.6176, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.5658295449529946, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 4.032439326548754e-05, |
|
"loss": 2.634, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 0.5667122743522973, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 4.0187175801358436e-05, |
|
"loss": 2.6211, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.5675950037516, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 4.005003515977113e-05, |
|
"loss": 2.6386, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 0.5684777331509026, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 3.991297241437095e-05, |
|
"loss": 2.6258, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.5693604625502052, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 3.977598863819336e-05, |
|
"loss": 2.6364, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.5702431919495079, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 3.963908490365562e-05, |
|
"loss": 2.6291, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.5711259213488106, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 3.9502262282548296e-05, |
|
"loss": 2.6231, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 0.5720086507481131, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 3.936552184602704e-05, |
|
"loss": 2.6223, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.5728913801474158, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 3.9228864664604014e-05, |
|
"loss": 2.6224, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 0.5737741095467185, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 3.9092291808139614e-05, |
|
"loss": 2.6227, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.574656838946021, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 3.895580434583409e-05, |
|
"loss": 2.6219, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 0.5755395683453237, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 3.881940334621914e-05, |
|
"loss": 2.6411, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.5764222977446264, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 3.868308987714958e-05, |
|
"loss": 2.6267, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 0.5773050271439291, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 3.854686500579494e-05, |
|
"loss": 2.6416, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.5781877565432316, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 3.841072979863118e-05, |
|
"loss": 2.614, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.5790704859425343, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 3.8274685321432266e-05, |
|
"loss": 2.6288, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.579953215341837, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 3.8138732639261846e-05, |
|
"loss": 2.628, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 0.5808359447411396, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 3.800287281646499e-05, |
|
"loss": 2.6223, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.5817186741404422, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 3.786710691665971e-05, |
|
"loss": 2.6236, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 0.5826014035397449, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 3.7731436002728755e-05, |
|
"loss": 2.616, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.5834841329390476, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 3.759586113681123e-05, |
|
"loss": 2.6246, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 0.5843668623383502, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 3.746038338029435e-05, |
|
"loss": 2.6202, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.5852495917376528, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 3.732500379380504e-05, |
|
"loss": 2.6071, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 0.5861323211369555, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 3.718972343720164e-05, |
|
"loss": 2.6136, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.5870150505362581, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 3.705454336956573e-05, |
|
"loss": 2.6261, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.5878977799355608, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 3.691946464919367e-05, |
|
"loss": 2.6299, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.5887805093348634, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 3.6784488333588416e-05, |
|
"loss": 2.6416, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 0.589663238734166, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 3.6649615479451226e-05, |
|
"loss": 2.6213, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.5905459681334687, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 3.65148471426734e-05, |
|
"loss": 2.6365, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 0.5914286975327713, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 3.638018437832795e-05, |
|
"loss": 2.6543, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.592311426932074, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 3.624562824066138e-05, |
|
"loss": 2.6174, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 0.5931941563313766, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 3.6111179783085514e-05, |
|
"loss": 2.6264, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.5940768857306793, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 3.59768400581691e-05, |
|
"loss": 2.619, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 0.5949596151299819, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 3.5842610117629636e-05, |
|
"loss": 2.6272, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 0.5958423445292845, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 3.5708491012325164e-05, |
|
"loss": 2.6186, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.5967250739285872, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 3.5574483792246046e-05, |
|
"loss": 2.6323, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.5976078033278899, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 3.5440589506506686e-05, |
|
"loss": 2.633, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 0.5984905327271924, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 3.530680920333733e-05, |
|
"loss": 2.6164, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.5993732621264951, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 3.517314393007594e-05, |
|
"loss": 2.6319, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 0.6002559915257978, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 3.503959473315986e-05, |
|
"loss": 2.6203, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.6011387209251005, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 3.490616265811776e-05, |
|
"loss": 2.6206, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 0.602021450324403, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 3.477284874956134e-05, |
|
"loss": 2.6163, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 0.6029041797237057, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 3.463965405117723e-05, |
|
"loss": 2.5971, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 0.6037869091230084, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 3.450657960571878e-05, |
|
"loss": 2.628, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.604669638522311, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 3.437362645499787e-05, |
|
"loss": 2.6248, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.6055523679216136, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 3.424079563987688e-05, |
|
"loss": 2.62, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 0.6064350973209163, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 3.4108088200260354e-05, |
|
"loss": 2.621, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 0.607317826720219, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 3.3975505175087017e-05, |
|
"loss": 2.6261, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.6082005561195215, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 3.384304760232151e-05, |
|
"loss": 2.6298, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 0.6090832855188242, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 3.371071651894643e-05, |
|
"loss": 2.6244, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.6099660149181269, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 3.3578512960954044e-05, |
|
"loss": 2.6135, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 0.6108487443174295, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 3.3446437963338255e-05, |
|
"loss": 2.6161, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.6117314737167321, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 3.3314492560086536e-05, |
|
"loss": 2.6346, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 0.6126142031160348, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 3.318267778417175e-05, |
|
"loss": 2.6285, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 0.6134969325153374, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 3.305099466754412e-05, |
|
"loss": 2.6211, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.6143796619146401, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 3.291944424112314e-05, |
|
"loss": 2.6073, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.6152623913139427, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 3.2788027534789504e-05, |
|
"loss": 2.6382, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 0.6161451207132453, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 3.2656745577377025e-05, |
|
"loss": 2.6129, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 0.617027850112548, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 3.2525599396664584e-05, |
|
"loss": 2.6133, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 0.6179105795118507, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 3.2394590019368156e-05, |
|
"loss": 2.6179, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.6187933089111533, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 3.226371847113265e-05, |
|
"loss": 2.6153, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 0.6196760383104559, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 3.2132985776523964e-05, |
|
"loss": 2.6358, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.6205587677097586, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 3.2002392959020896e-05, |
|
"loss": 2.6359, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 0.6214414971090613, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 3.187194104100727e-05, |
|
"loss": 2.6304, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.6223242265083638, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 3.1741631043763766e-05, |
|
"loss": 2.625, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.6232069559076665, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 3.1611463987459984e-05, |
|
"loss": 2.6233, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.6240896853069692, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 3.148144089114653e-05, |
|
"loss": 2.6241, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 0.6249724147062717, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 3.135156277274692e-05, |
|
"loss": 2.6232, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.6258551441055744, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 3.1221830649049725e-05, |
|
"loss": 2.6329, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 0.6267378735048771, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 3.109224553570047e-05, |
|
"loss": 2.6296, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.6276206029041798, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 3.096280844719388e-05, |
|
"loss": 2.6346, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 0.6285033323034823, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 3.0833520396865724e-05, |
|
"loss": 2.624, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.629386061702785, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 3.0704382396885013e-05, |
|
"loss": 2.6119, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 0.6302687911020877, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 3.057539545824609e-05, |
|
"loss": 2.6206, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 0.6311515205013903, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 3.0446560590760627e-05, |
|
"loss": 2.6294, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.6320342499006929, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 3.0317878803049755e-05, |
|
"loss": 2.6323, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 0.6329169792999956, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 3.018935110253619e-05, |
|
"loss": 2.6279, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 0.6337997086992982, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 3.006097849543635e-05, |
|
"loss": 2.6163, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 0.6346824380986009, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 2.9932761986752434e-05, |
|
"loss": 2.6277, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 0.6355651674979035, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 2.9804702580264564e-05, |
|
"loss": 2.6287, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.6364478968972062, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 2.967680127852301e-05, |
|
"loss": 2.6401, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 0.6373306262965088, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 2.95490590828402e-05, |
|
"loss": 2.6298, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 0.6382133556958115, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 2.9421476993282988e-05, |
|
"loss": 2.6035, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 0.6390960850951141, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 2.9294056008664762e-05, |
|
"loss": 2.6071, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 0.6399788144944167, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 2.9166797126537715e-05, |
|
"loss": 2.6279, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.6408615438937194, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 2.903970134318493e-05, |
|
"loss": 2.6355, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 0.641744273293022, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 2.8912769653612605e-05, |
|
"loss": 2.6411, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 0.6426270026923246, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 2.8786003051542304e-05, |
|
"loss": 2.6248, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 0.6435097320916273, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 2.8659402529403196e-05, |
|
"loss": 2.6262, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 0.64439246149093, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 2.8532969078324162e-05, |
|
"loss": 2.6202, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.6452751908902326, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 2.840670368812618e-05, |
|
"loss": 2.6117, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 0.6461579202895352, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 2.8280607347314504e-05, |
|
"loss": 2.6212, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.6470406496888379, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 2.8154681043070946e-05, |
|
"loss": 2.6303, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 0.6479233790881406, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 2.8028925761246084e-05, |
|
"loss": 2.6152, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 0.6488061084874431, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 2.7903342486351647e-05, |
|
"loss": 2.6195, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.6496888378867458, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 2.777793220155277e-05, |
|
"loss": 2.6275, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.6505715672860485, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 2.765269588866023e-05, |
|
"loss": 2.6047, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 0.6514542966853512, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 2.7527634528122847e-05, |
|
"loss": 2.6367, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 0.6523370260846537, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 2.7402749099019797e-05, |
|
"loss": 2.6283, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 0.6532197554839564, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 2.7278040579052876e-05, |
|
"loss": 2.638, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.6541024848832591, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 2.7153509944538923e-05, |
|
"loss": 2.634, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 0.6549852142825617, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 2.702915817040218e-05, |
|
"loss": 2.6214, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 0.6558679436818643, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 2.6904986230166562e-05, |
|
"loss": 2.625, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 0.656750673081167, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 2.6780995095948158e-05, |
|
"loss": 2.6243, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.6576334024804696, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 2.6657185738447493e-05, |
|
"loss": 2.6234, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.6585161318797722, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 2.653355912694213e-05, |
|
"loss": 2.6195, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 0.6593988612790749, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 2.6410116229278815e-05, |
|
"loss": 2.6284, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 0.6602815906783776, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 2.6286858011866104e-05, |
|
"loss": 2.6237, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 0.6611643200776802, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 2.6163785439666753e-05, |
|
"loss": 2.6095, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 0.6620470494769828, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 2.6040899476190062e-05, |
|
"loss": 2.6246, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.6629297788762855, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 2.5918201083484494e-05, |
|
"loss": 2.6307, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 0.6638125082755881, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 2.5795691222129982e-05, |
|
"loss": 2.6286, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 0.6646952376748908, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 2.567337085123055e-05, |
|
"loss": 2.6388, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 0.6655779670741934, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 2.5551240928406735e-05, |
|
"loss": 2.6232, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 0.666460696473496, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 2.542930240978802e-05, |
|
"loss": 2.6161, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.6673434258727987, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 2.530755625000556e-05, |
|
"loss": 2.6233, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.6682261552721014, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 2.5186003402184464e-05, |
|
"loss": 2.6219, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 0.669108884671404, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 2.5064644817936445e-05, |
|
"loss": 2.6193, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 0.6699916140707066, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 2.4943481447352425e-05, |
|
"loss": 2.6268, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 0.6708743434700093, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 2.4822514238995005e-05, |
|
"loss": 2.6233, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.671757072869312, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 2.4701744139891098e-05, |
|
"loss": 2.6255, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 0.6726398022686145, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 2.458117209552443e-05, |
|
"loss": 2.6224, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 0.6735225316679172, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 2.446079904982827e-05, |
|
"loss": 2.6141, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 0.6744052610672199, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 2.4340625945177947e-05, |
|
"loss": 2.612, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 0.6752879904665224, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 2.4220653722383445e-05, |
|
"loss": 2.6168, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.6761707198658251, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 2.410088332068215e-05, |
|
"loss": 2.6232, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 0.6770534492651278, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 2.3981315677731415e-05, |
|
"loss": 2.6173, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 0.6779361786644305, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 2.3861951729601206e-05, |
|
"loss": 2.6193, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.678818908063733, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 2.374279241076684e-05, |
|
"loss": 2.6333, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 0.6797016374630357, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 2.362383865410162e-05, |
|
"loss": 2.6286, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.6805843668623384, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 2.3505091390869576e-05, |
|
"loss": 2.6351, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 0.681467096261641, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 2.33865515507181e-05, |
|
"loss": 2.6419, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 0.6823498256609436, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 2.326822006167068e-05, |
|
"loss": 2.6173, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 0.6832325550602463, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 2.3150097850119802e-05, |
|
"loss": 2.6185, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 0.684115284459549, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 2.3032185840819414e-05, |
|
"loss": 2.629, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.6849980138588516, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 2.2914484956877923e-05, |
|
"loss": 2.6191, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 0.6858807432581542, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 2.279699611975087e-05, |
|
"loss": 2.6239, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 0.6867634726574569, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 2.2679720249233665e-05, |
|
"loss": 2.6025, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 0.6876462020567595, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 2.2562658263454543e-05, |
|
"loss": 2.6218, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 0.6885289314560622, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 2.2445811078867185e-05, |
|
"loss": 2.6176, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.6894116608553648, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 2.2329179610243707e-05, |
|
"loss": 2.6166, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 0.6902943902546674, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 2.2212764770667415e-05, |
|
"loss": 2.6049, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 0.6911771196539701, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 2.2096567471525614e-05, |
|
"loss": 2.6358, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 0.6920598490532727, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 2.1980588622502658e-05, |
|
"loss": 2.6265, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 0.6929425784525753, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 2.1864829131572572e-05, |
|
"loss": 2.6251, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.693825307851878, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 2.174928990499212e-05, |
|
"loss": 2.6321, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 0.6947080372511807, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 2.1633971847293678e-05, |
|
"loss": 2.6235, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 0.6955907666504832, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 2.1518875861278113e-05, |
|
"loss": 2.6185, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 0.6964734960497859, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 2.1404002848007793e-05, |
|
"loss": 2.6147, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 0.6973562254490886, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 2.1289353706799377e-05, |
|
"loss": 2.6092, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.6982389548483913, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 2.117492933521697e-05, |
|
"loss": 2.6363, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 0.6991216842476938, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 2.1060730629064978e-05, |
|
"loss": 2.6392, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.7000044136469965, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 2.0946758482381085e-05, |
|
"loss": 2.63, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 0.7008871430462992, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 2.0833013787429323e-05, |
|
"loss": 2.6227, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 0.7017698724456018, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 2.071949743469307e-05, |
|
"loss": 2.6166, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.7026526018449044, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 2.0606210312867997e-05, |
|
"loss": 2.6349, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 0.7035353312442071, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 2.0493153308855223e-05, |
|
"loss": 2.6306, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 0.7044180606435098, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 2.038032730775432e-05, |
|
"loss": 2.6178, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 0.7053007900428124, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 2.026773319285639e-05, |
|
"loss": 2.6263, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 0.706183519442115, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 2.0155371845637123e-05, |
|
"loss": 2.6339, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.706183519442115, |
|
"eval_accuracy": 0.49892260080939327, |
|
"eval_loss": 2.501890182495117, |
|
"eval_runtime": 7.1805, |
|
"eval_samples_per_second": 44.287, |
|
"eval_steps_per_second": 0.418, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.7070662488414177, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 2.0043244145749896e-05, |
|
"loss": 2.6303, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 0.7079489782407203, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 1.9931350971019008e-05, |
|
"loss": 2.6234, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 0.7088317076400229, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 1.9819693197432603e-05, |
|
"loss": 2.6225, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 0.7097144370393256, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 1.9708271699135977e-05, |
|
"loss": 2.6256, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.7105971664386282, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 1.9597087348424682e-05, |
|
"loss": 2.6243, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.7114798958379309, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 1.9486141015737637e-05, |
|
"loss": 2.6278, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 0.7123626252372335, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 1.9375433569650438e-05, |
|
"loss": 2.6287, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 0.7132453546365362, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 1.9264965876868396e-05, |
|
"loss": 2.6198, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 0.7141280840358388, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 1.9154738802219973e-05, |
|
"loss": 2.6258, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 0.7150108134351415, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 1.904475320864978e-05, |
|
"loss": 2.6311, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.7158935428344441, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 1.8935009957211912e-05, |
|
"loss": 2.6296, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 0.7167762722337467, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 1.8825509907063327e-05, |
|
"loss": 2.6211, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 0.7176590016330494, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 1.87162539154569e-05, |
|
"loss": 2.6147, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 0.7185417310323521, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 1.860724283773489e-05, |
|
"loss": 2.624, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 0.7194244604316546, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 1.8498477527322123e-05, |
|
"loss": 2.6308, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.7203071898309573, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 1.8389958835719413e-05, |
|
"loss": 2.613, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 0.72118991923026, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 1.828168761249684e-05, |
|
"loss": 2.6128, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 0.7220726486295626, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.817366470528707e-05, |
|
"loss": 2.624, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 0.7229553780288652, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.806589095977878e-05, |
|
"loss": 2.6244, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 0.7238381074281679, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.7958367219710047e-05, |
|
"loss": 2.6347, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.7247208368274706, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.785109432686162e-05, |
|
"loss": 2.6195, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 0.7256035662267731, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.774407312105051e-05, |
|
"loss": 2.6074, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 0.7264862956260758, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.7637304440123275e-05, |
|
"loss": 2.6076, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 0.7273690250253785, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.75307891199495e-05, |
|
"loss": 2.626, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 0.7282517544246812, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 1.742452799441528e-05, |
|
"loss": 2.6314, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.7291344838239837, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 1.731852189541669e-05, |
|
"loss": 2.6327, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 0.7300172132232864, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.7212771652853242e-05, |
|
"loss": 2.6265, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 0.7308999426225891, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 1.710727809462137e-05, |
|
"loss": 2.6317, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 0.7317826720218917, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 1.7002042046608017e-05, |
|
"loss": 2.6335, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 0.7326654014211943, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 1.6897064332684153e-05, |
|
"loss": 2.6111, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.733548130820497, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.6792345774698233e-05, |
|
"loss": 2.6238, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 0.7344308602197996, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.6687887192469888e-05, |
|
"loss": 2.6294, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 0.7353135896191023, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 1.6583689403783464e-05, |
|
"loss": 2.6179, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 0.7361963190184049, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.6479753224381554e-05, |
|
"loss": 2.6194, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 0.7370790484177075, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.6376079467958734e-05, |
|
"loss": 2.6321, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.7379617778170102, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 1.627266894615504e-05, |
|
"loss": 2.6303, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 0.7388445072163128, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 1.6169522468549812e-05, |
|
"loss": 2.6058, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 0.7397272366156155, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 1.6066640842655155e-05, |
|
"loss": 2.6188, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 0.7406099660149181, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 1.59640248739097e-05, |
|
"loss": 2.6073, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 0.7414926954142208, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 1.586167536567239e-05, |
|
"loss": 2.626, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.7423754248135234, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 1.575959311921602e-05, |
|
"loss": 2.6275, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 0.743258154212826, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 1.5657778933721102e-05, |
|
"loss": 2.6243, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 0.7441408836121287, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 1.55562336062695e-05, |
|
"loss": 2.6332, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 0.7450236130114314, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 1.545495793183829e-05, |
|
"loss": 2.6268, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 0.745906342410734, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 1.53539527032935e-05, |
|
"loss": 2.6207, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.7467890718100366, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 1.5253218711383844e-05, |
|
"loss": 2.6227, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 0.7476718012093393, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 1.5152756744734615e-05, |
|
"loss": 2.6195, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 0.748554530608642, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 1.5052567589841498e-05, |
|
"loss": 2.6193, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 0.7494372600079445, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 1.4952652031064324e-05, |
|
"loss": 2.6207, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 0.7503199894072472, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 1.4853010850621062e-05, |
|
"loss": 2.6358, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.7512027188065499, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 1.475364482858161e-05, |
|
"loss": 2.627, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 0.7520854482058525, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 1.4654554742861714e-05, |
|
"loss": 2.6251, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 0.7529681776051551, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 1.4555741369216841e-05, |
|
"loss": 2.6306, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 0.7538509070044578, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.445720548123617e-05, |
|
"loss": 2.629, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 0.7547336364037605, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 1.4358947850336512e-05, |
|
"loss": 2.6249, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.755616365803063, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 1.4260969245756218e-05, |
|
"loss": 2.6375, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 0.7564990952023657, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 1.4163270434549237e-05, |
|
"loss": 2.6289, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 0.7573818246016684, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 1.4065852181579075e-05, |
|
"loss": 2.6232, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 0.758264554000971, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.3968715249512775e-05, |
|
"loss": 2.6222, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 0.7591472834002736, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 1.3871860398815001e-05, |
|
"loss": 2.6257, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.7600300127995763, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 1.3775288387742085e-05, |
|
"loss": 2.6222, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 0.7609127421988789, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 1.3678999972335998e-05, |
|
"loss": 2.6421, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 0.7617954715981816, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 1.3582995906418578e-05, |
|
"loss": 2.6145, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 0.7626782009974842, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.348727694158547e-05, |
|
"loss": 2.6128, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.7635609303967869, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.3391843827200423e-05, |
|
"loss": 2.6248, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.7644436597960895, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.3296697310389228e-05, |
|
"loss": 2.6301, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 0.7653263891953922, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 1.3201838136034012e-05, |
|
"loss": 2.6149, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 0.7662091185946948, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.3107267046767363e-05, |
|
"loss": 2.6265, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 0.7670918479939974, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 1.3012984782966464e-05, |
|
"loss": 2.617, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 0.7679745773933001, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.2918992082747417e-05, |
|
"loss": 2.6187, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.7688573067926028, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 1.2825289681959313e-05, |
|
"loss": 2.63, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 0.7697400361919053, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.2731878314178609e-05, |
|
"loss": 2.6196, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 0.770622765591208, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 1.2638758710703308e-05, |
|
"loss": 2.6072, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 0.7715054949905107, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 1.2545931600547218e-05, |
|
"loss": 2.6154, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 0.7723882243898132, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 1.2453397710434356e-05, |
|
"loss": 2.6287, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.7732709537891159, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 1.2361157764793113e-05, |
|
"loss": 2.6299, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.7741536831884186, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 1.2269212485750648e-05, |
|
"loss": 2.6174, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 0.7750364125877213, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 1.2177562593127274e-05, |
|
"loss": 2.6287, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 0.7759191419870238, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.2086208804430776e-05, |
|
"loss": 2.6378, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 0.7768018713863265, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 1.1995151834850815e-05, |
|
"loss": 2.6295, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.7776846007856292, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 1.1904392397253267e-05, |
|
"loss": 2.6436, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 0.7785673301849318, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 1.1813931202174739e-05, |
|
"loss": 2.6316, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 0.7794500595842344, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 1.1723768957816966e-05, |
|
"loss": 2.6201, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 0.7803327889835371, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 1.1633906370041214e-05, |
|
"loss": 2.6255, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 0.7812155183828398, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.154434414236284e-05, |
|
"loss": 2.624, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.7820982477821424, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.1455082975945752e-05, |
|
"loss": 2.6158, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 0.782980977181445, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.1366123569596859e-05, |
|
"loss": 2.6169, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 0.7838637065807477, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 1.1277466619760708e-05, |
|
"loss": 2.6269, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.7847464359800503, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 1.1189112820513952e-05, |
|
"loss": 2.6299, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 0.785629165379353, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 1.1101062863559968e-05, |
|
"loss": 2.6147, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.7865118947786556, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 1.1013317438223381e-05, |
|
"loss": 2.626, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 0.7873946241779582, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 1.0925877231444687e-05, |
|
"loss": 2.6106, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 0.7882773535772609, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.083874292777497e-05, |
|
"loss": 2.6319, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 0.7891600829765635, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 1.0751915209370372e-05, |
|
"loss": 2.604, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 0.7900428123758662, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.0665394755986902e-05, |
|
"loss": 2.63, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.7909255417751688, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 1.0579182244975045e-05, |
|
"loss": 2.6271, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 0.7918082711744715, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 1.0493278351274433e-05, |
|
"loss": 2.6332, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 0.7926910005737741, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 1.040768374740866e-05, |
|
"loss": 2.6309, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 0.7935737299730767, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.0322399103479884e-05, |
|
"loss": 2.6025, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 0.7944564593723794, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.0237425087163715e-05, |
|
"loss": 2.6356, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.7953391887716821, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 1.01527623637039e-05, |
|
"loss": 2.6257, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 0.7962219181709846, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.0068411595907101e-05, |
|
"loss": 2.6319, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 0.7971046475702873, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 9.98437344413784e-06, |
|
"loss": 2.6298, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 0.79798737696959, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 9.900648566313126e-06, |
|
"loss": 2.6312, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 0.7988701063688927, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 9.817237617897495e-06, |
|
"loss": 2.6168, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.7997528357681952, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 9.734141251897733e-06, |
|
"loss": 2.6416, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 0.8006355651674979, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 9.651360118857866e-06, |
|
"loss": 2.6145, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 0.8015182945668006, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 9.568894866854034e-06, |
|
"loss": 2.6228, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 0.8024010239661032, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 9.48674614148936e-06, |
|
"loss": 2.6266, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 0.8032837533654058, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 9.404914585888997e-06, |
|
"loss": 2.6232, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.8041664827647085, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 9.323400840695035e-06, |
|
"loss": 2.6182, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 0.8050492121640112, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 9.24220554406146e-06, |
|
"loss": 2.6254, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 0.8059319415633137, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 9.16132933164922e-06, |
|
"loss": 2.6353, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 0.8068146709626164, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 9.080772836621238e-06, |
|
"loss": 2.6321, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 0.8076974003619191, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 9.000536689637373e-06, |
|
"loss": 2.6406, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.8085801297612217, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 8.920621518849603e-06, |
|
"loss": 2.6226, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 0.8094628591605243, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 8.841027949897034e-06, |
|
"loss": 2.6177, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 0.810345588559827, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 8.761756605901034e-06, |
|
"loss": 2.616, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 0.8112283179591296, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 8.682808107460295e-06, |
|
"loss": 2.6296, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 0.8121110473584323, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 8.604183072646055e-06, |
|
"loss": 2.6248, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.8129937767577349, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 8.52588211699723e-06, |
|
"loss": 2.6157, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 0.8138765061570375, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 8.447905853515547e-06, |
|
"loss": 2.6233, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 0.8147592355563402, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 8.370254892660834e-06, |
|
"loss": 2.6281, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 0.8156419649556429, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 8.292929842346176e-06, |
|
"loss": 2.6195, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 0.8165246943549455, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 8.21593130793314e-06, |
|
"loss": 2.6297, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.8174074237542481, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 8.13925989222713e-06, |
|
"loss": 2.618, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 0.8182901531535508, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 8.06291619547253e-06, |
|
"loss": 2.6237, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 0.8191728825528535, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 7.98690081534818e-06, |
|
"loss": 2.6225, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 0.820055611952156, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 7.911214346962508e-06, |
|
"loss": 2.626, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 0.8209383413514587, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 7.835857382848976e-06, |
|
"loss": 2.6203, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.8218210707507614, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 7.760830512961498e-06, |
|
"loss": 2.6344, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 0.822703800150064, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 7.686134324669652e-06, |
|
"loss": 2.6023, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 0.8235865295493666, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 7.611769402754254e-06, |
|
"loss": 2.6184, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 0.8244692589486693, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 7.5377363294026505e-06, |
|
"loss": 2.6228, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 0.825351988347972, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 7.464035684204251e-06, |
|
"loss": 2.649, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.8262347177472745, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 7.390668044145954e-06, |
|
"loss": 2.6216, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 0.8271174471465772, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 7.317633983607597e-06, |
|
"loss": 2.6286, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 0.8280001765458799, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 7.244934074357529e-06, |
|
"loss": 2.6231, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 0.8288829059451825, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 7.1725688855480975e-06, |
|
"loss": 2.608, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 0.8297656353444851, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 7.100538983711158e-06, |
|
"loss": 2.6203, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.8306483647437878, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 7.028844932753714e-06, |
|
"loss": 2.6287, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 0.8315310941430905, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 6.957487293953452e-06, |
|
"loss": 2.6295, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 0.8324138235423931, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 6.886466625954352e-06, |
|
"loss": 2.6256, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 0.8332965529416957, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 6.815783484762312e-06, |
|
"loss": 2.611, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 0.8341792823409984, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 6.745438423740824e-06, |
|
"loss": 2.6091, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.835062011740301, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 6.675431993606618e-06, |
|
"loss": 2.6366, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 0.8359447411396037, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 6.60576474242533e-06, |
|
"loss": 2.6258, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 0.8368274705389063, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 6.536437215607261e-06, |
|
"loss": 2.6323, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 0.8377101999382089, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 6.467449955903088e-06, |
|
"loss": 2.6278, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 0.8385929293375116, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 6.398803503399564e-06, |
|
"loss": 2.6191, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.8394756587368142, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 6.330498395515377e-06, |
|
"loss": 2.626, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 0.8403583881361169, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 6.262535166996896e-06, |
|
"loss": 2.6308, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 0.8412411175354195, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 6.194914349913955e-06, |
|
"loss": 2.6346, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 0.8421238469347222, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 6.127636473655768e-06, |
|
"loss": 2.6277, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 0.8430065763340248, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 6.060702064926682e-06, |
|
"loss": 2.6341, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.8438893057333274, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 5.994111647742173e-06, |
|
"loss": 2.6315, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 0.8447720351326301, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 5.927865743424621e-06, |
|
"loss": 2.6149, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 0.8456547645319328, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 5.861964870599324e-06, |
|
"loss": 2.6226, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 0.8465374939312353, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 5.79640954519039e-06, |
|
"loss": 2.6231, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 0.847420223330538, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 5.731200280416698e-06, |
|
"loss": 2.6058, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.8483029527298407, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 5.666337586787912e-06, |
|
"loss": 2.6302, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 0.8491856821291434, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 5.601821972100435e-06, |
|
"loss": 2.6232, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 0.8500684115284459, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 5.53765394143349e-06, |
|
"loss": 2.6293, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 0.8509511409277486, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 5.4738339971451336e-06, |
|
"loss": 2.6324, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 0.8518338703270513, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 5.4103626388682885e-06, |
|
"loss": 2.632, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.8527165997263539, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 5.347240363506939e-06, |
|
"loss": 2.625, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 0.8535993291256565, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 5.284467665232123e-06, |
|
"loss": 2.6272, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 0.8544820585249592, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 5.222045035478107e-06, |
|
"loss": 2.6289, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 0.8553647879242618, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 5.159972962938581e-06, |
|
"loss": 2.6234, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 0.8562475173235644, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 5.0982519335627695e-06, |
|
"loss": 2.6188, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.8571302467228671, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 5.036882430551676e-06, |
|
"loss": 2.6205, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 0.8580129761221698, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 4.97586493435424e-06, |
|
"loss": 2.6228, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 0.8588957055214724, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 4.915199922663649e-06, |
|
"loss": 2.6324, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 0.859778434920775, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 4.854887870413566e-06, |
|
"loss": 2.623, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 0.8606611643200777, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 4.794929249774366e-06, |
|
"loss": 2.6247, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.8615438937193803, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 4.735324530149521e-06, |
|
"loss": 2.6182, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 0.862426623118683, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 4.676074178171891e-06, |
|
"loss": 2.625, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 0.8633093525179856, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 4.617178657700027e-06, |
|
"loss": 2.6223, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 0.8641920819172882, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 4.5586384298146105e-06, |
|
"loss": 2.6039, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 0.8650748113165909, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 4.5004539528148035e-06, |
|
"loss": 2.6035, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.8659575407158936, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 4.442625682214679e-06, |
|
"loss": 2.6371, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 0.8668402701151962, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 4.385154070739628e-06, |
|
"loss": 2.6312, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 0.8677229995144988, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 4.328039568322817e-06, |
|
"loss": 2.6235, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 0.8686057289138015, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 4.271282622101735e-06, |
|
"loss": 2.6177, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 0.8694884583131042, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 4.2148836764145826e-06, |
|
"loss": 2.6226, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.8703711877124067, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 4.158843172796884e-06, |
|
"loss": 2.6146, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 0.8712539171117094, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 4.103161549977991e-06, |
|
"loss": 2.6202, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 0.8721366465110121, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 4.047839243877627e-06, |
|
"loss": 2.6351, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 0.8730193759103146, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 3.992876687602537e-06, |
|
"loss": 2.6384, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 0.8739021053096173, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.938274311443019e-06, |
|
"loss": 2.6225, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.87478483470892, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 3.884032542869642e-06, |
|
"loss": 2.6279, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 0.8756675641082227, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 3.83015180652983e-06, |
|
"loss": 2.6227, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 0.8765502935075252, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.7766325242445386e-06, |
|
"loss": 2.6411, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 0.8774330229068279, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.7234751150050374e-06, |
|
"loss": 2.6379, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 0.8783157523061306, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.6706799949695058e-06, |
|
"loss": 2.6377, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.8791984817054332, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 3.618247577459871e-06, |
|
"loss": 2.6256, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 0.8800812111047358, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.5661782729585146e-06, |
|
"loss": 2.6304, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 0.8809639405040385, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 3.514472489105114e-06, |
|
"loss": 2.632, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 0.8818466699033412, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 3.4631306306933954e-06, |
|
"loss": 2.6237, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 0.8827293993026438, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 3.4121530996679917e-06, |
|
"loss": 2.6312, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8827293993026438, |
|
"eval_accuracy": 0.4989594612236122, |
|
"eval_loss": 2.501610279083252, |
|
"eval_runtime": 7.1818, |
|
"eval_samples_per_second": 44.279, |
|
"eval_steps_per_second": 0.418, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8836121287019464, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 3.361540295121307e-06, |
|
"loss": 2.6354, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 0.8844948581012491, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.3112926132903677e-06, |
|
"loss": 2.6213, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 0.8853775875005517, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 3.26141044755372e-06, |
|
"loss": 2.6354, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 0.8862603168998543, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 3.2118941884283825e-06, |
|
"loss": 2.6195, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 0.887143046299157, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 3.162744223566766e-06, |
|
"loss": 2.6348, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.8880257756984596, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 3.113960937753618e-06, |
|
"loss": 2.6094, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 0.8889085050977623, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 3.065544712903062e-06, |
|
"loss": 2.6198, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 0.8897912344970649, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 3.0174959280555526e-06, |
|
"loss": 2.6238, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 0.8906739638963675, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 2.9698149593749557e-06, |
|
"loss": 2.6377, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 0.8915566932956702, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 2.9225021801455586e-06, |
|
"loss": 2.6216, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.8924394226949729, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 2.8755579607691928e-06, |
|
"loss": 2.6272, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 0.8933221520942755, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 2.828982668762309e-06, |
|
"loss": 2.6189, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 0.8942048814935781, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 2.7827766687530787e-06, |
|
"loss": 2.6224, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 0.8950876108928808, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 2.7369403224785926e-06, |
|
"loss": 2.6226, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 0.8959703402921835, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 2.691473988781984e-06, |
|
"loss": 2.6127, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.896853069691486, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 2.6463780236096356e-06, |
|
"loss": 2.6272, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 0.8977357990907887, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 2.6016527800084176e-06, |
|
"loss": 2.6354, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 0.8986185284900914, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 2.557298608122849e-06, |
|
"loss": 2.6421, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 0.8995012578893941, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 2.513315855192461e-06, |
|
"loss": 2.6155, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 0.9003839872886966, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 2.4697048655489883e-06, |
|
"loss": 2.6153, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.9012667166879993, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 2.4264659806137125e-06, |
|
"loss": 2.6227, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 0.902149446087302, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 2.3835995388947986e-06, |
|
"loss": 2.6269, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 0.9030321754866045, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 2.341105875984617e-06, |
|
"loss": 2.6115, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 0.9039149048859072, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 2.2989853245571622e-06, |
|
"loss": 2.6215, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 0.9047976342852099, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 2.257238214365365e-06, |
|
"loss": 2.6167, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.9056803636845125, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 2.2158648722386044e-06, |
|
"loss": 2.6459, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 0.9065630930838151, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 2.1748656220801024e-06, |
|
"loss": 2.6282, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 0.9074458224831178, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 2.1342407848643776e-06, |
|
"loss": 2.6295, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 0.9083285518824205, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 2.093990678634755e-06, |
|
"loss": 2.6164, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 0.9092112812817231, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 2.054115618500879e-06, |
|
"loss": 2.6202, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.9100940106810257, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 2.0146159166362254e-06, |
|
"loss": 2.624, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 0.9109767400803284, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.975491882275665e-06, |
|
"loss": 2.6297, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 0.911859469479631, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 1.9367438217130663e-06, |
|
"loss": 2.6247, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 0.9127421988789337, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 1.8983720382988679e-06, |
|
"loss": 2.6325, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 0.9136249282782363, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 1.860376832437699e-06, |
|
"loss": 2.6241, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.9145076576775389, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.8227585015860526e-06, |
|
"loss": 2.6247, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 0.9153903870768416, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 1.7855173402499703e-06, |
|
"loss": 2.6221, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 0.9162731164761443, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 1.7486536399826615e-06, |
|
"loss": 2.6114, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 0.9171558458754469, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.7121676893823213e-06, |
|
"loss": 2.6173, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 0.9180385752747495, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 1.6760597740897998e-06, |
|
"loss": 2.6083, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.9189213046740522, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.640330176786381e-06, |
|
"loss": 2.6246, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 0.9198040340733548, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.6049791771915957e-06, |
|
"loss": 2.6113, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 0.9206867634726574, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 1.57000705206099e-06, |
|
"loss": 2.6202, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 0.9215694928719601, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 1.535414075184005e-06, |
|
"loss": 2.6192, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 0.9224522222712628, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 1.5012005173818012e-06, |
|
"loss": 2.6357, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.9233349516705653, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 1.467366646505125e-06, |
|
"loss": 2.6198, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 0.924217681069868, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.4339127274322795e-06, |
|
"loss": 2.6285, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 0.9251004104691707, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 1.4008390220669576e-06, |
|
"loss": 2.6169, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 0.9259831398684734, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 1.3681457893362726e-06, |
|
"loss": 2.6392, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 0.9268658692677759, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 1.3358332851886702e-06, |
|
"loss": 2.6182, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.9277485986670786, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 1.3039017625919748e-06, |
|
"loss": 2.6241, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 0.9286313280663813, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.2723514715313856e-06, |
|
"loss": 2.6237, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 0.9295140574656839, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 1.241182659007495e-06, |
|
"loss": 2.6166, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 0.9303967868649865, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.2103955690344003e-06, |
|
"loss": 2.6376, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 0.9312795162642892, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 1.1799904426377794e-06, |
|
"loss": 2.6119, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.9321622456635918, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.149967517852979e-06, |
|
"loss": 2.6263, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 0.9330449750628945, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 1.1203270297231783e-06, |
|
"loss": 2.6154, |
|
"step": 10570 |
|
}, |
|
{ |
|
"epoch": 0.9339277044621971, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 1.091069210297546e-06, |
|
"loss": 2.6363, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 0.9348104338614998, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.0621942886294035e-06, |
|
"loss": 2.6398, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 0.9356931632608024, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 1.0337024907744576e-06, |
|
"loss": 2.6224, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.936575892660105, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.005594039789015e-06, |
|
"loss": 2.6237, |
|
"step": 10610 |
|
}, |
|
{ |
|
"epoch": 0.9374586220594077, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 9.778691557282504e-07, |
|
"loss": 2.6195, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 0.9383413514587103, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 9.505280556444507e-07, |
|
"loss": 2.6307, |
|
"step": 10630 |
|
}, |
|
{ |
|
"epoch": 0.939224080858013, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 9.235709535853565e-07, |
|
"loss": 2.6174, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 0.9401068102573156, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 8.969980605924688e-07, |
|
"loss": 2.6427, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.9409895396566182, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 8.708095846993836e-07, |
|
"loss": 2.6199, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 0.9418722690559209, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 8.450057309301873e-07, |
|
"loss": 2.6155, |
|
"step": 10670 |
|
}, |
|
{ |
|
"epoch": 0.9427549984552236, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 8.19586701297842e-07, |
|
"loss": 2.6205, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 0.9436377278545262, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 7.945526948025917e-07, |
|
"loss": 2.6317, |
|
"step": 10690 |
|
}, |
|
{ |
|
"epoch": 0.9445204572538288, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 7.699039074304249e-07, |
|
"loss": 2.6326, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.9454031866531315, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 7.456405321515203e-07, |
|
"loss": 2.6244, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 0.9462859160524342, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 7.217627589187592e-07, |
|
"loss": 2.6325, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 0.9471686454517367, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 6.982707746662154e-07, |
|
"loss": 2.622, |
|
"step": 10730 |
|
}, |
|
{ |
|
"epoch": 0.9480513748510394, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 6.751647633076952e-07, |
|
"loss": 2.6188, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 0.9489341042503421, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 6.524449057353333e-07, |
|
"loss": 2.6075, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.9498168336496448, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 6.301113798181158e-07, |
|
"loss": 2.632, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 0.9506995630489473, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 6.081643604005315e-07, |
|
"loss": 2.6125, |
|
"step": 10770 |
|
}, |
|
{ |
|
"epoch": 0.95158229244825, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 5.866040193011901e-07, |
|
"loss": 2.6127, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 0.9524650218475527, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 5.654305253114777e-07, |
|
"loss": 2.6327, |
|
"step": 10790 |
|
}, |
|
{ |
|
"epoch": 0.9533477512468552, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 5.44644044194248e-07, |
|
"loss": 2.6144, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.9542304806461579, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 5.242447386824833e-07, |
|
"loss": 2.6269, |
|
"step": 10810 |
|
}, |
|
{ |
|
"epoch": 0.9551132100454606, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 5.042327684780745e-07, |
|
"loss": 2.6296, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 0.9559959394447632, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 4.846082902505433e-07, |
|
"loss": 2.6269, |
|
"step": 10830 |
|
}, |
|
{ |
|
"epoch": 0.9568786688440658, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 4.6537145763579927e-07, |
|
"loss": 2.6213, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 0.9577613982433685, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 4.4652242123497943e-07, |
|
"loss": 2.6245, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.9586441276426712, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 4.280613286132329e-07, |
|
"loss": 2.6293, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 0.9595268570419738, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 4.0998832429857693e-07, |
|
"loss": 2.6313, |
|
"step": 10870 |
|
}, |
|
{ |
|
"epoch": 0.9604095864412764, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 3.9230354978077586e-07, |
|
"loss": 2.6121, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 0.9612923158405791, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 3.750071435102087e-07, |
|
"loss": 2.6228, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 0.9621750452398817, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.5809924089682536e-07, |
|
"loss": 2.6382, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.9630577746391844, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.4157997430904196e-07, |
|
"loss": 2.6157, |
|
"step": 10910 |
|
}, |
|
{ |
|
"epoch": 0.963940504038487, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 3.2544947307272534e-07, |
|
"loss": 2.6239, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 0.9648232334377896, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 3.0970786347018774e-07, |
|
"loss": 2.6143, |
|
"step": 10930 |
|
}, |
|
{ |
|
"epoch": 0.9657059628370923, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 2.943552687391937e-07, |
|
"loss": 2.6276, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 0.966588692236395, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 2.7939180907197717e-07, |
|
"loss": 2.6118, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.9674714216356975, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 2.648176016143367e-07, |
|
"loss": 2.6171, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 0.9683541510350002, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 2.506327604646752e-07, |
|
"loss": 2.6172, |
|
"step": 10970 |
|
}, |
|
{ |
|
"epoch": 0.9692368804343029, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 2.3683739667313965e-07, |
|
"loss": 2.6472, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 0.9701196098336055, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 2.2343161824073256e-07, |
|
"loss": 2.6086, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 0.9710023392329081, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 2.1041553011849624e-07, |
|
"loss": 2.6253, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9718850686322108, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 1.977892342066412e-07, |
|
"loss": 2.6299, |
|
"step": 11010 |
|
}, |
|
{ |
|
"epoch": 0.9727677980315135, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.855528293537856e-07, |
|
"loss": 2.6204, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 0.973650527430816, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.7370641135619481e-07, |
|
"loss": 2.6232, |
|
"step": 11030 |
|
}, |
|
{ |
|
"epoch": 0.9745332568301187, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.6225007295697093e-07, |
|
"loss": 2.6042, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 0.9754159862294214, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 1.5118390384539217e-07, |
|
"loss": 2.6245, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 0.9762987156287241, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.4050799065616905e-07, |
|
"loss": 2.6268, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 0.9771814450280266, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.3022241696878934e-07, |
|
"loss": 2.6356, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 0.9780641744273293, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.203272633068575e-07, |
|
"loss": 2.6168, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 0.978946903826632, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 1.108226071374452e-07, |
|
"loss": 2.6207, |
|
"step": 11090 |
|
}, |
|
{ |
|
"epoch": 0.9798296332259346, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.017085228705139e-07, |
|
"loss": 2.6102, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.9807123626252372, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 9.298508185832666e-08, |
|
"loss": 2.6266, |
|
"step": 11110 |
|
}, |
|
{ |
|
"epoch": 0.9815950920245399, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 8.465235239485947e-08, |
|
"loss": 2.6303, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 0.9824778214238425, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 7.671039971530735e-08, |
|
"loss": 2.6091, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 0.9833605508231452, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 6.915928599555699e-08, |
|
"loss": 2.6227, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 0.9842432802224478, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 6.199907035168706e-08, |
|
"loss": 2.619, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 0.9851260096217505, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 5.522980883952422e-08, |
|
"loss": 2.6245, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 0.9860087390210531, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 4.8851554454204527e-08, |
|
"loss": 2.6249, |
|
"step": 11170 |
|
}, |
|
{ |
|
"epoch": 0.9868914684203557, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 4.286435712973491e-08, |
|
"loss": 2.6296, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 0.9877741978196584, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 3.726826373862679e-08, |
|
"loss": 2.6197, |
|
"step": 11190 |
|
}, |
|
{ |
|
"epoch": 0.988656927218961, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.206331809151864e-08, |
|
"loss": 2.6156, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.9895396566182637, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 2.7249560936848385e-08, |
|
"loss": 2.625, |
|
"step": 11210 |
|
}, |
|
{ |
|
"epoch": 0.9904223860175663, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 2.2827029960509338e-08, |
|
"loss": 2.6255, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 0.9913051154168689, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.8795759785555922e-08, |
|
"loss": 2.625, |
|
"step": 11230 |
|
}, |
|
{ |
|
"epoch": 0.9921878448161716, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.515578197197054e-08, |
|
"loss": 2.6212, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 0.9930705742154743, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.1907125016369369e-08, |
|
"loss": 2.626, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.9939533036147769, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 9.049814351808072e-09, |
|
"loss": 2.6205, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 0.9948360330140795, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 6.583872347570852e-09, |
|
"loss": 2.6325, |
|
"step": 11270 |
|
}, |
|
{ |
|
"epoch": 0.9957187624133822, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 4.5093183089983674e-09, |
|
"loss": 2.6124, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 0.9966014918126849, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 2.826168477343405e-09, |
|
"loss": 2.6257, |
|
"step": 11290 |
|
}, |
|
{ |
|
"epoch": 0.9974842212119874, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 1.5344360296265513e-09, |
|
"loss": 2.6156, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.9983669506112901, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 6.341310785584754e-10, |
|
"loss": 2.6207, |
|
"step": 11310 |
|
}, |
|
{ |
|
"epoch": 0.9992496800105928, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 1.2526067243445737e-10, |
|
"loss": 2.6222, |
|
"step": 11320 |
|
}, |
|
{ |
|
"epoch": 0.9999558635300348, |
|
"step": 11328, |
|
"total_flos": 2.234940544568441e+20, |
|
"train_loss": 2.6672301666211276, |
|
"train_runtime": 28535.6383, |
|
"train_samples_per_second": 101.631, |
|
"train_steps_per_second": 0.397 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 11328, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.234940544568441e+20, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|