{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9999558635300348,
  "eval_steps": 2000,
  "global_step": 11328,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 8.827293993026438e-05,
      "eval_accuracy": 0.31068875219818615,
      "eval_loss": 5.8817362785339355,
      "eval_runtime": 7.092,
      "eval_samples_per_second": 44.839,
      "eval_steps_per_second": 0.423,
      "step": 1
    },
    {
      "epoch": 0.0008827293993026437,
      "grad_norm": 7.5625,
      "learning_rate": 5e-05,
      "loss": 6.1788,
      "step": 10
    },
    {
      "epoch": 0.0017654587986052875,
      "grad_norm": 7.21875,
      "learning_rate": 0.0001,
      "loss": 5.9299,
      "step": 20
    },
    {
      "epoch": 0.0026481881979079315,
      "grad_norm": 2.078125,
      "learning_rate": 0.00015,
      "loss": 5.2333,
      "step": 30
    },
    {
      "epoch": 0.003530917597210575,
      "grad_norm": 1.2890625,
      "learning_rate": 0.0002,
      "loss": 4.6613,
      "step": 40
    },
    {
      "epoch": 0.0044136469965132185,
      "grad_norm": 0.53515625,
      "learning_rate": 0.00025,
      "loss": 4.3038,
      "step": 50
    },
    {
      "epoch": 0.005296376395815863,
      "grad_norm": 0.35546875,
      "learning_rate": 0.0003,
      "loss": 3.9929,
      "step": 60
    },
    {
      "epoch": 0.0061791057951185065,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00035,
      "loss": 3.7479,
      "step": 70
    },
    {
      "epoch": 0.00706183519442115,
      "grad_norm": 0.267578125,
      "learning_rate": 0.0004,
      "loss": 3.5018,
      "step": 80
    },
    {
      "epoch": 0.007944564593723794,
      "grad_norm": 4.59375,
      "learning_rate": 0.00045000000000000004,
      "loss": 3.3363,
      "step": 90
    },
    {
      "epoch": 0.008827293993026437,
      "grad_norm": 0.28125,
      "learning_rate": 0.0005,
      "loss": 3.1974,
      "step": 100
    },
    {
      "epoch": 0.009710023392329082,
      "grad_norm": 0.296875,
      "learning_rate": 0.0004999990214012265,
      "loss": 3.1016,
      "step": 110
    },
    {
      "epoch": 0.010592752791631726,
      "grad_norm": 0.46484375,
      "learning_rate": 0.000499996085612567,
      "loss": 3.0369,
      "step": 120
    },
    {
      "epoch": 0.01147548219093437,
      "grad_norm": 0.2890625,
      "learning_rate": 0.0004999911926570055,
      "loss": 2.9845,
      "step": 130
    },
    {
      "epoch": 0.012358211590237013,
      "grad_norm": 0.3125,
      "learning_rate": 0.0004999843425728476,
      "loss": 2.9364,
      "step": 140
    },
    {
      "epoch": 0.013240940989539656,
      "grad_norm": 0.34375,
      "learning_rate": 0.0004999755354137212,
      "loss": 2.899,
      "step": 150
    },
    {
      "epoch": 0.0141236703888423,
      "grad_norm": 0.26953125,
      "learning_rate": 0.000499964771248576,
      "loss": 2.8838,
      "step": 160
    },
    {
      "epoch": 0.015006399788144944,
      "grad_norm": 0.251953125,
      "learning_rate": 0.000499952050161682,
      "loss": 2.8561,
      "step": 170
    },
    {
      "epoch": 0.015889129187447587,
      "grad_norm": 0.3046875,
      "learning_rate": 0.0004999373722526303,
      "loss": 2.8367,
      "step": 180
    },
    {
      "epoch": 0.016771858586750232,
      "grad_norm": 0.298828125,
      "learning_rate": 0.0004999207376363309,
      "loss": 2.8232,
      "step": 190
    },
    {
      "epoch": 0.017654587986052874,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0004999021464430128,
      "loss": 2.811,
      "step": 200
    },
    {
      "epoch": 0.01853731738535552,
      "grad_norm": 0.2890625,
      "learning_rate": 0.0004998815988182225,
      "loss": 2.8107,
      "step": 210
    },
    {
      "epoch": 0.019420046784658165,
      "grad_norm": 0.455078125,
      "learning_rate": 0.0004998590949228232,
      "loss": 2.7771,
      "step": 220
    },
    {
      "epoch": 0.020302776183960806,
      "grad_norm": 0.29296875,
      "learning_rate": 0.000499834634932993,
      "loss": 2.7739,
      "step": 230
    },
    {
      "epoch": 0.02118550558326345,
      "grad_norm": 0.26171875,
      "learning_rate": 0.0004998082190402241,
      "loss": 2.7691,
      "step": 240
    },
    {
      "epoch": 0.022068234982566094,
      "grad_norm": 0.26171875,
      "learning_rate": 0.0004997798474513211,
      "loss": 2.7592,
      "step": 250
    },
    {
      "epoch": 0.02295096438186874,
      "grad_norm": 0.3203125,
      "learning_rate": 0.000499749520388399,
      "loss": 2.7538,
      "step": 260
    },
    {
      "epoch": 0.02383369378117138,
      "grad_norm": 0.3125,
      "learning_rate": 0.0004997172380888822,
      "loss": 2.7447,
      "step": 270
    },
    {
      "epoch": 0.024716423180474026,
      "grad_norm": 0.283203125,
      "learning_rate": 0.0004996830008055017,
      "loss": 2.729,
      "step": 280
    },
    {
      "epoch": 0.02559915257977667,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.0004996468088062946,
      "loss": 2.7356,
      "step": 290
    },
    {
      "epoch": 0.026481881979079313,
      "grad_norm": 0.24609375,
      "learning_rate": 0.0004996086623746,
      "loss": 2.7239,
      "step": 300
    },
    {
      "epoch": 0.027364611378381958,
      "grad_norm": 0.326171875,
      "learning_rate": 0.0004995685618090584,
      "loss": 2.7162,
      "step": 310
    },
    {
      "epoch": 0.0282473407776846,
      "grad_norm": 0.28125,
      "learning_rate": 0.0004995265074236088,
      "loss": 2.7254,
      "step": 320
    },
    {
      "epoch": 0.029130070176987245,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.0004994824995474863,
      "loss": 2.7169,
      "step": 330
    },
    {
      "epoch": 0.030012799576289887,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0004994365385252189,
      "loss": 2.7328,
      "step": 340
    },
    {
      "epoch": 0.030895528975592532,
      "grad_norm": 0.390625,
      "learning_rate": 0.0004993886247166261,
      "loss": 2.7327,
      "step": 350
    },
    {
      "epoch": 0.031778258374895174,
      "grad_norm": 0.388671875,
      "learning_rate": 0.000499338758496815,
      "loss": 2.7076,
      "step": 360
    },
    {
      "epoch": 0.03266098777419782,
      "grad_norm": 0.29296875,
      "learning_rate": 0.000499286940256178,
      "loss": 2.7214,
      "step": 370
    },
    {
      "epoch": 0.033543717173500465,
      "grad_norm": 0.255859375,
      "learning_rate": 0.0004992331704003889,
      "loss": 2.7024,
      "step": 380
    },
    {
      "epoch": 0.03442644657280311,
      "grad_norm": 0.267578125,
      "learning_rate": 0.0004991774493504007,
      "loss": 2.7097,
      "step": 390
    },
    {
      "epoch": 0.03530917597210575,
      "grad_norm": 0.2890625,
      "learning_rate": 0.0004991197775424418,
      "loss": 2.6817,
      "step": 400
    },
    {
      "epoch": 0.036191905371408394,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0004990601554280128,
      "loss": 2.7019,
      "step": 410
    },
    {
      "epoch": 0.03707463477071104,
      "grad_norm": 0.267578125,
      "learning_rate": 0.0004989985834738824,
      "loss": 2.6888,
      "step": 420
    },
    {
      "epoch": 0.037957364170013684,
      "grad_norm": 0.25,
      "learning_rate": 0.0004989350621620851,
      "loss": 2.6891,
      "step": 430
    },
    {
      "epoch": 0.03884009356931633,
      "grad_norm": 0.220703125,
      "learning_rate": 0.0004988695919899154,
      "loss": 2.7029,
      "step": 440
    },
    {
      "epoch": 0.03972282296861897,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0004988021734699258,
      "loss": 2.6838,
      "step": 450
    },
    {
      "epoch": 0.04060555236792161,
      "grad_norm": 0.255859375,
      "learning_rate": 0.0004987328071299217,
      "loss": 2.6746,
      "step": 460
    },
    {
      "epoch": 0.04148828176722426,
      "grad_norm": 0.337890625,
      "learning_rate": 0.0004986614935129576,
      "loss": 2.6911,
      "step": 470
    },
    {
      "epoch": 0.0423710111665269,
      "grad_norm": 0.265625,
      "learning_rate": 0.0004985882331773328,
      "loss": 2.6699,
      "step": 480
    },
    {
      "epoch": 0.04325374056582954,
      "grad_norm": 0.298828125,
      "learning_rate": 0.0004985130266965871,
      "loss": 2.664,
      "step": 490
    },
    {
      "epoch": 0.04413646996513219,
      "grad_norm": 0.3125,
      "learning_rate": 0.0004984358746594964,
      "loss": 2.6587,
      "step": 500
    },
    {
      "epoch": 0.04501919936443483,
      "grad_norm": 0.294921875,
      "learning_rate": 0.0004983567776700676,
      "loss": 2.6734,
      "step": 510
    },
    {
      "epoch": 0.04590192876373748,
      "grad_norm": 0.291015625,
      "learning_rate": 0.0004982757363475346,
      "loss": 2.6638,
      "step": 520
    },
    {
      "epoch": 0.04678465816304012,
      "grad_norm": 0.25,
      "learning_rate": 0.0004981927513263529,
      "loss": 2.6759,
      "step": 530
    },
    {
      "epoch": 0.04766738756234276,
      "grad_norm": 0.283203125,
      "learning_rate": 0.0004981078232561947,
      "loss": 2.6665,
      "step": 540
    },
    {
      "epoch": 0.048550116961645406,
      "grad_norm": 0.328125,
      "learning_rate": 0.0004980209528019441,
      "loss": 2.6673,
      "step": 550
    },
    {
      "epoch": 0.04943284636094805,
      "grad_norm": 0.263671875,
      "learning_rate": 0.0004979321406436917,
      "loss": 2.6545,
      "step": 560
    },
    {
      "epoch": 0.0503155757602507,
      "grad_norm": 0.232421875,
      "learning_rate": 0.0004978413874767291,
      "loss": 2.6685,
      "step": 570
    },
    {
      "epoch": 0.05119830515955334,
      "grad_norm": 0.341796875,
      "learning_rate": 0.0004977486940115441,
      "loss": 2.6715,
      "step": 580
    },
    {
      "epoch": 0.05208103455885598,
      "grad_norm": 0.275390625,
      "learning_rate": 0.0004976540609738143,
      "loss": 2.6611,
      "step": 590
    },
    {
      "epoch": 0.052963763958158626,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.0004975574891044017,
      "loss": 2.6682,
      "step": 600
    },
    {
      "epoch": 0.05384649335746127,
      "grad_norm": 0.27734375,
      "learning_rate": 0.0004974589791593472,
      "loss": 2.6512,
      "step": 610
    },
    {
      "epoch": 0.054729222756763916,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0004973585319098648,
      "loss": 2.6565,
      "step": 620
    },
    {
      "epoch": 0.055611952156066555,
      "grad_norm": 0.5546875,
      "learning_rate": 0.0004972561481423346,
      "loss": 2.6673,
      "step": 630
    },
    {
      "epoch": 0.0564946815553692,
      "grad_norm": 0.333984375,
      "learning_rate": 0.0004971518286582979,
      "loss": 2.6604,
      "step": 640
    },
    {
      "epoch": 0.057377410954671845,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.0004970455742744499,
      "loss": 2.6483,
      "step": 650
    },
    {
      "epoch": 0.05826014035397449,
      "grad_norm": 0.333984375,
      "learning_rate": 0.0004969373858226341,
      "loss": 2.6532,
      "step": 660
    },
    {
      "epoch": 0.059142869753277136,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0004968272641498349,
      "loss": 2.6505,
      "step": 670
    },
    {
      "epoch": 0.060025599152579774,
      "grad_norm": 0.2392578125,
      "learning_rate": 0.0004967152101181717,
      "loss": 2.6512,
      "step": 680
    },
    {
      "epoch": 0.06090832855188242,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0004966012246048924,
      "loss": 2.6483,
      "step": 690
    },
    {
      "epoch": 0.061791057951185065,
      "grad_norm": 0.2255859375,
      "learning_rate": 0.0004964853085023653,
      "loss": 2.6397,
      "step": 700
    },
    {
      "epoch": 0.0626737873504877,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.0004963674627180735,
      "loss": 2.6535,
      "step": 710
    },
    {
      "epoch": 0.06355651674979035,
      "grad_norm": 0.328125,
      "learning_rate": 0.0004962476881746068,
      "loss": 2.6369,
      "step": 720
    },
    {
      "epoch": 0.064439246149093,
      "grad_norm": 0.2890625,
      "learning_rate": 0.000496125985809655,
      "loss": 2.6288,
      "step": 730
    },
    {
      "epoch": 0.06532197554839564,
      "grad_norm": 0.3359375,
      "learning_rate": 0.0004960023565760003,
      "loss": 2.6421,
      "step": 740
    },
    {
      "epoch": 0.06620470494769828,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0004958768014415103,
      "loss": 2.6378,
      "step": 750
    },
    {
      "epoch": 0.06708743434700093,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0004957493213891295,
      "loss": 2.6562,
      "step": 760
    },
    {
      "epoch": 0.06797016374630357,
      "grad_norm": 0.236328125,
      "learning_rate": 0.0004956199174168725,
      "loss": 2.638,
      "step": 770
    },
    {
      "epoch": 0.06885289314560622,
      "grad_norm": 0.25390625,
      "learning_rate": 0.000495488590537816,
      "loss": 2.6232,
      "step": 780
    },
    {
      "epoch": 0.06973562254490885,
      "grad_norm": 0.26953125,
      "learning_rate": 0.0004953553417800905,
      "loss": 2.6335,
      "step": 790
    },
    {
      "epoch": 0.0706183519442115,
      "grad_norm": 0.255859375,
      "learning_rate": 0.0004952201721868726,
      "loss": 2.636,
      "step": 800
    },
    {
      "epoch": 0.07150108134351414,
      "grad_norm": 0.23046875,
      "learning_rate": 0.0004950830828163767,
      "loss": 2.641,
      "step": 810
    },
    {
      "epoch": 0.07238381074281679,
      "grad_norm": 0.248046875,
      "learning_rate": 0.0004949440747418467,
      "loss": 2.6415,
      "step": 820
    },
    {
      "epoch": 0.07326654014211943,
      "grad_norm": 0.23828125,
      "learning_rate": 0.0004948031490515476,
      "loss": 2.6356,
      "step": 830
    },
    {
      "epoch": 0.07414926954142208,
      "grad_norm": 0.498046875,
      "learning_rate": 0.0004946603068487572,
      "loss": 2.6286,
      "step": 840
    },
    {
      "epoch": 0.07503199894072472,
      "grad_norm": 0.357421875,
      "learning_rate": 0.0004945155492517569,
      "loss": 2.6308,
      "step": 850
    },
    {
      "epoch": 0.07591472834002737,
      "grad_norm": 0.25,
      "learning_rate": 0.0004943688773938237,
      "loss": 2.6379,
      "step": 860
    },
    {
      "epoch": 0.07679745773933001,
      "grad_norm": 0.28125,
      "learning_rate": 0.000494220292423221,
      "loss": 2.6308,
      "step": 870
    },
    {
      "epoch": 0.07768018713863266,
      "grad_norm": 0.259765625,
      "learning_rate": 0.000494069795503189,
      "loss": 2.6325,
      "step": 880
    },
    {
      "epoch": 0.07856291653793529,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0004939173878119366,
      "loss": 2.626,
      "step": 890
    },
    {
      "epoch": 0.07944564593723794,
      "grad_norm": 0.263671875,
      "learning_rate": 0.0004937630705426318,
      "loss": 2.6191,
      "step": 900
    },
    {
      "epoch": 0.08032837533654058,
      "grad_norm": 0.251953125,
      "learning_rate": 0.000493606844903392,
      "loss": 2.6315,
      "step": 910
    },
    {
      "epoch": 0.08121110473584323,
      "grad_norm": 0.24609375,
      "learning_rate": 0.000493448712117275,
      "loss": 2.6306,
      "step": 920
    },
    {
      "epoch": 0.08209383413514587,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0004932886734222693,
      "loss": 2.6096,
      "step": 930
    },
    {
      "epoch": 0.08297656353444852,
      "grad_norm": 0.279296875,
      "learning_rate": 0.000493126730071284,
      "loss": 2.6182,
      "step": 940
    },
    {
      "epoch": 0.08385929293375116,
      "grad_norm": 0.234375,
      "learning_rate": 0.0004929628833321397,
      "loss": 2.63,
      "step": 950
    },
    {
      "epoch": 0.0847420223330538,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0004927971344875585,
      "loss": 2.6271,
      "step": 960
    },
    {
      "epoch": 0.08562475173235645,
      "grad_norm": 0.390625,
      "learning_rate": 0.0004926294848351528,
      "loss": 2.6246,
      "step": 970
    },
    {
      "epoch": 0.08650748113165908,
      "grad_norm": 0.310546875,
      "learning_rate": 0.0004924599356874169,
      "loss": 2.6244,
      "step": 980
    },
    {
      "epoch": 0.08739021053096173,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0004922884883717154,
      "loss": 2.609,
      "step": 990
    },
    {
      "epoch": 0.08827293993026437,
      "grad_norm": 0.328125,
      "learning_rate": 0.0004921151442302732,
      "loss": 2.6245,
      "step": 1000
    },
    {
      "epoch": 0.08915566932956702,
      "grad_norm": 0.2177734375,
      "learning_rate": 0.0004919399046201656,
      "loss": 2.6195,
      "step": 1010
    },
    {
      "epoch": 0.09003839872886966,
      "grad_norm": 0.3203125,
      "learning_rate": 0.0004917627709133064,
      "loss": 2.6149,
      "step": 1020
    },
    {
      "epoch": 0.09092112812817231,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0004915837444964383,
      "loss": 2.6333,
      "step": 1030
    },
    {
      "epoch": 0.09180385752747496,
      "grad_norm": 0.2197265625,
      "learning_rate": 0.0004914028267711217,
      "loss": 2.617,
      "step": 1040
    },
    {
      "epoch": 0.0926865869267776,
      "grad_norm": 0.244140625,
      "learning_rate": 0.0004912200191537233,
      "loss": 2.6324,
      "step": 1050
    },
    {
      "epoch": 0.09356931632608025,
      "grad_norm": 0.4453125,
      "learning_rate": 0.0004910353230754057,
      "loss": 2.619,
      "step": 1060
    },
    {
      "epoch": 0.09445204572538288,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0004908487399821158,
      "loss": 2.6247,
      "step": 1070
    },
    {
      "epoch": 0.09533477512468552,
      "grad_norm": 0.30078125,
      "learning_rate": 0.0004906602713345735,
      "loss": 2.6194,
      "step": 1080
    },
    {
      "epoch": 0.09621750452398817,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.0004904699186082602,
      "loss": 2.6127,
      "step": 1090
    },
    {
      "epoch": 0.09710023392329081,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0004902776832934074,
      "loss": 2.6178,
      "step": 1100
    },
    {
      "epoch": 0.09798296332259346,
      "grad_norm": 0.408203125,
      "learning_rate": 0.0004900835668949852,
      "loss": 2.6088,
      "step": 1110
    },
    {
      "epoch": 0.0988656927218961,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00048988757093269,
      "loss": 2.612,
      "step": 1120
    },
    {
      "epoch": 0.09974842212119875,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.0004896896969409332,
      "loss": 2.6148,
      "step": 1130
    },
    {
      "epoch": 0.1006311515205014,
      "grad_norm": 0.255859375,
      "learning_rate": 0.0004894899464688287,
      "loss": 2.6227,
      "step": 1140
    },
    {
      "epoch": 0.10151388091980404,
      "grad_norm": 0.3046875,
      "learning_rate": 0.000489288321080181,
      "loss": 2.6195,
      "step": 1150
    },
    {
      "epoch": 0.10239661031910668,
      "grad_norm": 0.23046875,
      "learning_rate": 0.0004890848223534732,
      "loss": 2.6363,
      "step": 1160
    },
    {
      "epoch": 0.10327933971840932,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.0004888794518818538,
      "loss": 2.6029,
      "step": 1170
    },
    {
      "epoch": 0.10416206911771196,
      "grad_norm": 0.275390625,
      "learning_rate": 0.0004886722112731253,
      "loss": 2.6123,
      "step": 1180
    },
    {
      "epoch": 0.1050447985170146,
      "grad_norm": 0.306640625,
      "learning_rate": 0.000488463102149731,
      "loss": 2.6176,
      "step": 1190
    },
    {
      "epoch": 0.10592752791631725,
      "grad_norm": 0.291015625,
      "learning_rate": 0.0004882521261487422,
      "loss": 2.6269,
      "step": 1200
    },
    {
      "epoch": 0.1068102573156199,
      "grad_norm": 0.25,
      "learning_rate": 0.0004880392849218459,
      "loss": 2.6292,
      "step": 1210
    },
    {
      "epoch": 0.10769298671492254,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00048782458013533125,
      "loss": 2.6148,
      "step": 1220
    },
    {
      "epoch": 0.10857571611422519,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00048760801347007716,
      "loss": 2.6057,
      "step": 1230
    },
    {
      "epoch": 0.10945844551352783,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.0004873895866215385,
      "loss": 2.6181,
      "step": 1240
    },
    {
      "epoch": 0.11034117491283048,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00048716930129973323,
      "loss": 2.6098,
      "step": 1250
    },
    {
      "epoch": 0.11122390431213311,
      "grad_norm": 0.28125,
      "learning_rate": 0.0004869471592292289,
      "loss": 2.6201,
      "step": 1260
    },
    {
      "epoch": 0.11210663371143575,
      "grad_norm": 0.27734375,
      "learning_rate": 0.0004867231621491293,
      "loss": 2.6141,
      "step": 1270
    },
    {
      "epoch": 0.1129893631107384,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.00048649731181306047,
      "loss": 2.6008,
      "step": 1280
    },
    {
      "epoch": 0.11387209251004105,
      "grad_norm": 0.265625,
      "learning_rate": 0.00048626960998915733,
      "loss": 2.6134,
      "step": 1290
    },
    {
      "epoch": 0.11475482190934369,
      "grad_norm": 0.248046875,
      "learning_rate": 0.0004860400584600496,
      "loss": 2.6197,
      "step": 1300
    },
    {
      "epoch": 0.11563755130864634,
      "grad_norm": 0.283203125,
      "learning_rate": 0.0004858086590228482,
      "loss": 2.6045,
      "step": 1310
    },
    {
      "epoch": 0.11652028070794898,
      "grad_norm": 0.328125,
      "learning_rate": 0.0004855754134891307,
      "loss": 2.6152,
      "step": 1320
    },
    {
      "epoch": 0.11740301010725163,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0004853403236849274,
      "loss": 2.6074,
      "step": 1330
    },
    {
      "epoch": 0.11828573950655427,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0004851033914507071,
      "loss": 2.6143,
      "step": 1340
    },
    {
      "epoch": 0.1191684689058569,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00048486461864136253,
      "loss": 2.6143,
      "step": 1350
    },
    {
      "epoch": 0.12005119830515955,
      "grad_norm": 0.224609375,
      "learning_rate": 0.0004846240071261959,
      "loss": 2.5931,
      "step": 1360
    },
    {
      "epoch": 0.1209339277044622,
      "grad_norm": 0.25,
      "learning_rate": 0.00048438155878890434,
      "loss": 2.594,
      "step": 1370
    },
    {
      "epoch": 0.12181665710376484,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00048413727552756505,
      "loss": 2.6069,
      "step": 1380
    },
    {
      "epoch": 0.12269938650306748,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00048389115925462025,
      "loss": 2.5968,
      "step": 1390
    },
    {
      "epoch": 0.12358211590237013,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00048364321189686276,
      "loss": 2.606,
      "step": 1400
    },
    {
      "epoch": 0.12446484530167277,
      "grad_norm": 0.265625,
      "learning_rate": 0.00048339343539542033,
      "loss": 2.5955,
      "step": 1410
    },
    {
      "epoch": 0.1253475747009754,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.0004831418317057409,
      "loss": 2.5942,
      "step": 1420
    },
    {
      "epoch": 0.12623030410027805,
      "grad_norm": 0.333984375,
      "learning_rate": 0.0004828884027975768,
      "loss": 2.587,
      "step": 1430
    },
    {
      "epoch": 0.1271130334995807,
      "grad_norm": 0.23828125,
      "learning_rate": 0.00048263315065497,
      "loss": 2.6048,
      "step": 1440
    },
    {
      "epoch": 0.12799576289888334,
      "grad_norm": 0.455078125,
      "learning_rate": 0.0004823760772762358,
      "loss": 2.5977,
      "step": 1450
    },
    {
      "epoch": 0.128878492298186,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00048211718467394774,
      "loss": 2.6055,
      "step": 1460
    },
    {
      "epoch": 0.12976122169748863,
      "grad_norm": 0.2578125,
      "learning_rate": 0.0004818564748749218,
      "loss": 2.5919,
      "step": 1470
    },
    {
      "epoch": 0.13064395109679128,
      "grad_norm": 0.255859375,
      "learning_rate": 0.0004815939499202001,
      "loss": 2.6066,
      "step": 1480
    },
    {
      "epoch": 0.13152668049609392,
      "grad_norm": 0.28125,
      "learning_rate": 0.0004813296118650357,
      "loss": 2.6125,
      "step": 1490
    },
    {
      "epoch": 0.13240940989539657,
      "grad_norm": 0.287109375,
      "learning_rate": 0.0004810634627788756,
      "loss": 2.5976,
      "step": 1500
    },
    {
      "epoch": 0.1332921392946992,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0004807955047453452,
      "loss": 2.6044,
      "step": 1510
    },
    {
      "epoch": 0.13417486869400186,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0004805257398622317,
      "loss": 2.6011,
      "step": 1520
    },
    {
      "epoch": 0.1350575980933045,
      "grad_norm": 0.33984375,
      "learning_rate": 0.0004802541702414678,
      "loss": 2.6004,
      "step": 1530
    },
    {
      "epoch": 0.13594032749260715,
      "grad_norm": 0.283203125,
      "learning_rate": 0.000479980798009115,
      "loss": 2.5994,
      "step": 1540
    },
    {
      "epoch": 0.1368230568919098,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00047970562530534724,
      "loss": 2.6054,
      "step": 1550
    },
    {
      "epoch": 0.13770578629121244,
      "grad_norm": 0.2265625,
      "learning_rate": 0.0004794286542844338,
      "loss": 2.5978,
      "step": 1560
    },
    {
      "epoch": 0.13858851569051509,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00047914988711472283,
      "loss": 2.6025,
      "step": 1570
    },
    {
      "epoch": 0.1394712450898177,
      "grad_norm": 0.22265625,
      "learning_rate": 0.00047886932597862396,
      "loss": 2.59,
      "step": 1580
    },
    {
      "epoch": 0.14035397448912035,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0004785869730725914,
      "loss": 2.6018,
      "step": 1590
    },
    {
      "epoch": 0.141236703888423,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0004783028306071069,
      "loss": 2.5972,
      "step": 1600
    },
    {
      "epoch": 0.14211943328772564,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00047801690080666206,
      "loss": 2.5886,
      "step": 1610
    },
    {
      "epoch": 0.14300216268702828,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00047772918590974136,
      "loss": 2.5954,
      "step": 1620
    },
    {
      "epoch": 0.14388489208633093,
      "grad_norm": 0.3515625,
      "learning_rate": 0.00047743968816880446,
      "loss": 2.6028,
      "step": 1630
    },
    {
      "epoch": 0.14476762148563357,
      "grad_norm": 0.37890625,
      "learning_rate": 0.0004771484098502683,
      "loss": 2.5978,
      "step": 1640
    },
    {
      "epoch": 0.14565035088493622,
      "grad_norm": 0.267578125,
      "learning_rate": 0.0004768553532344899,
      "loss": 2.5883,
      "step": 1650
    },
    {
      "epoch": 0.14653308028423886,
      "grad_norm": 0.228515625,
      "learning_rate": 0.0004765605206157478,
      "loss": 2.5949,
      "step": 1660
    },
    {
      "epoch": 0.1474158096835415,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0004762639143022248,
      "loss": 2.6048,
      "step": 1670
    },
    {
      "epoch": 0.14829853908284416,
      "grad_norm": 0.296875,
      "learning_rate": 0.00047596553661598956,
      "loss": 2.5817,
      "step": 1680
    },
    {
      "epoch": 0.1491812684821468,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00047566538989297837,
      "loss": 2.5987,
      "step": 1690
    },
    {
      "epoch": 0.15006399788144945,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00047536347648297685,
      "loss": 2.5991,
      "step": 1700
    },
    {
      "epoch": 0.1509467272807521,
      "grad_norm": 0.330078125,
      "learning_rate": 0.0004750597987496018,
      "loss": 2.6001,
      "step": 1710
    },
    {
      "epoch": 0.15182945668005474,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00047475435907028254,
      "loss": 2.5968,
      "step": 1720
    },
    {
      "epoch": 0.15271218607935738,
      "grad_norm": 0.302734375,
      "learning_rate": 0.0004744471598362421,
      "loss": 2.5941,
      "step": 1730
    },
    {
      "epoch": 0.15359491547866003,
      "grad_norm": 0.2333984375,
      "learning_rate": 0.0004741382034524789,
      "loss": 2.5971,
      "step": 1740
    },
    {
      "epoch": 0.15447764487796267,
      "grad_norm": 0.216796875,
      "learning_rate": 0.0004738274923377478,
      "loss": 2.5867,
      "step": 1750
    },
    {
      "epoch": 0.15536037427726532,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0004735150289245407,
      "loss": 2.5883,
      "step": 1760
    },
    {
      "epoch": 0.15624310367656793,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00047320081565906813,
      "loss": 2.6041,
      "step": 1770
    },
    {
      "epoch": 0.15712583307587058,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0004728848550012399,
      "loss": 2.6029,
      "step": 1780
    },
    {
      "epoch": 0.15800856247517323,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00047256714942464574,
      "loss": 2.5912,
      "step": 1790
    },
    {
      "epoch": 0.15889129187447587,
      "grad_norm": 0.263671875,
      "learning_rate": 0.0004722477014165358,
      "loss": 2.586,
      "step": 1800
    },
    {
      "epoch": 0.15977402127377852,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0004719265134778017,
      "loss": 2.5931,
      "step": 1810
    },
    {
      "epoch": 0.16065675067308116,
      "grad_norm": 0.25,
      "learning_rate": 0.00047160358812295633,
      "loss": 2.5792,
      "step": 1820
    },
    {
      "epoch": 0.1615394800723838,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0004712789278801145,
      "loss": 2.6021,
      "step": 1830
    },
    {
      "epoch": 0.16242220947168645,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00047095253529097313,
      "loss": 2.594,
      "step": 1840
    },
    {
      "epoch": 0.1633049388709891,
      "grad_norm": 0.232421875,
      "learning_rate": 0.0004706244129107914,
      "loss": 2.588,
      "step": 1850
    },
    {
      "epoch": 0.16418766827029174,
      "grad_norm": 0.234375,
      "learning_rate": 0.00047029456330837055,
      "loss": 2.5905,
      "step": 1860
    },
    {
      "epoch": 0.1650703976695944,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.0004699629890660339,
      "loss": 2.592,
      "step": 1870
    },
    {
      "epoch": 0.16595312706889703,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00046962969277960663,
      "loss": 2.6002,
      "step": 1880
    },
    {
      "epoch": 0.16683585646819968,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00046929467705839544,
      "loss": 2.5983,
      "step": 1890
    },
    {
      "epoch": 0.16771858586750232,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0004689579445251681,
      "loss": 2.5974,
      "step": 1900
    },
    {
      "epoch": 0.16860131526680497,
      "grad_norm": 0.248046875,
      "learning_rate": 0.000468619497816133,
      "loss": 2.6035,
      "step": 1910
    },
    {
      "epoch": 0.1694840446661076,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.0004682793395809184,
      "loss": 2.5968,
      "step": 1920
    },
    {
      "epoch": 0.17036677406541026,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.0004679374724825517,
      "loss": 2.5902,
      "step": 1930
    },
    {
      "epoch": 0.1712495034647129,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00046759389919743876,
      "loss": 2.5931,
      "step": 1940
    },
    {
      "epoch": 0.17213223286401555,
      "grad_norm": 0.265625,
      "learning_rate": 0.0004672486224153427,
      "loss": 2.5937,
      "step": 1950
    },
    {
      "epoch": 0.17301496226331817,
      "grad_norm": 0.224609375,
      "learning_rate": 0.0004669016448393631,
      "loss": 2.5863,
      "step": 1960
    },
    {
      "epoch": 0.1738976916626208,
      "grad_norm": 0.5625,
      "learning_rate": 0.0004665529691859144,
      "loss": 2.5893,
      "step": 1970
    },
    {
      "epoch": 0.17478042106192346,
      "grad_norm": 0.2216796875,
      "learning_rate": 0.00046620259818470536,
      "loss": 2.59,
      "step": 1980
    },
    {
      "epoch": 0.1756631504612261,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0004658505345787169,
      "loss": 2.5924,
      "step": 1990
    },
    {
      "epoch": 0.17654587986052875,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.00046549678112418116,
      "loss": 2.6109,
      "step": 2000
    },
    {
      "epoch": 0.17654587986052875,
      "eval_accuracy": 0.4971187442885556,
      "eval_loss": 2.480692148208618,
      "eval_runtime": 7.0837,
      "eval_samples_per_second": 44.892,
      "eval_steps_per_second": 0.424,
      "step": 2000
    },
    {
      "epoch": 0.1774286092598314,
      "grad_norm": 0.279296875,
      "learning_rate": 0.0004651413405905597,
      "loss": 2.5819,
      "step": 2010
    },
    {
      "epoch": 0.17831133865913404,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00046478421576052196,
      "loss": 2.5949,
      "step": 2020
    },
    {
      "epoch": 0.17919406805843668,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00046442540942992315,
      "loss": 2.588,
      "step": 2030
    },
    {
      "epoch": 0.18007679745773933,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00046406492440778294,
      "loss": 2.577,
      "step": 2040
    },
    {
      "epoch": 0.18095952685704197,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.0004637027635162627,
      "loss": 2.5906,
      "step": 2050
    },
    {
      "epoch": 0.18184225625634462,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00046333892959064425,
      "loss": 2.5913,
      "step": 2060
    },
    {
      "epoch": 0.18272498565564727,
      "grad_norm": 0.3671875,
      "learning_rate": 0.0004629734254793071,
      "loss": 2.5859,
      "step": 2070
    },
    {
      "epoch": 0.1836077150549499,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00046260625404370606,
      "loss": 2.6003,
      "step": 2080
    },
    {
      "epoch": 0.18449044445425256,
      "grad_norm": 0.234375,
      "learning_rate": 0.0004622374181583494,
      "loss": 2.5759,
      "step": 2090
    },
    {
      "epoch": 0.1853731738535552,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00046186692071077586,
      "loss": 2.5745,
      "step": 2100
    },
    {
      "epoch": 0.18625590325285785,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.00046149476460153216,
      "loss": 2.586,
      "step": 2110
    },
    {
      "epoch": 0.1871386326521605,
      "grad_norm": 0.328125,
      "learning_rate": 0.0004611209527441504,
      "loss": 2.5893,
      "step": 2120
    },
    {
      "epoch": 0.18802136205146314,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0004607454880651253,
      "loss": 2.5885,
      "step": 2130
    },
    {
      "epoch": 0.18890409145076575,
      "grad_norm": 0.2333984375,
      "learning_rate": 0.0004603683735038909,
      "loss": 2.5912,
      "step": 2140
    },
    {
      "epoch": 0.1897868208500684,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00045998961201279814,
      "loss": 2.5746,
      "step": 2150
    },
    {
      "epoch": 0.19066955024937104,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00045960920655709113,
      "loss": 2.5771,
      "step": 2160
    },
    {
      "epoch": 0.1915522796486737,
      "grad_norm": 0.6796875,
      "learning_rate": 0.0004592271601148844,
      "loss": 2.5671,
      "step": 2170
    },
    {
      "epoch": 0.19243500904797634,
      "grad_norm": 0.3984375,
      "learning_rate": 0.00045884347567713945,
      "loss": 2.5778,
      "step": 2180
    },
    {
      "epoch": 0.19331773844727898,
      "grad_norm": 0.263671875,
      "learning_rate": 0.0004584581562476412,
      "loss": 2.6024,
      "step": 2190
    },
    {
      "epoch": 0.19420046784658163,
      "grad_norm": 0.25,
      "learning_rate": 0.0004580712048429746,
      "loss": 2.5891,
      "step": 2200
    },
    {
      "epoch": 0.19508319724588427,
      "grad_norm": 0.40234375,
      "learning_rate": 0.000457682624492501,
      "loss": 2.573,
      "step": 2210
    },
    {
      "epoch": 0.19596592664518692,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.0004572924182383346,
      "loss": 2.5845,
      "step": 2220
    },
    {
      "epoch": 0.19684865604448956,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.00045690058913531794,
      "loss": 2.5873,
      "step": 2230
    },
    {
      "epoch": 0.1977313854437922,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0004565071402509992,
      "loss": 2.5757,
      "step": 2240
    },
    {
      "epoch": 0.19861411484309485,
      "grad_norm": 0.21484375,
      "learning_rate": 0.000456112074665607,
      "loss": 2.5904,
      "step": 2250
    },
    {
      "epoch": 0.1994968442423975,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.0004557153954720269,
      "loss": 2.5777,
      "step": 2260
    },
    {
      "epoch": 0.20037957364170014,
      "grad_norm": 0.201171875,
      "learning_rate": 0.0004553171057757772,
      "loss": 2.59,
      "step": 2270
    },
    {
      "epoch": 0.2012623030410028,
      "grad_norm": 0.248046875,
      "learning_rate": 0.0004549172086949842,
      "loss": 2.5746,
      "step": 2280
    },
    {
      "epoch": 0.20214503244030543,
      "grad_norm": 0.265625,
      "learning_rate": 0.0004545157073603584,
      "loss": 2.5907,
      "step": 2290
    },
    {
      "epoch": 0.20302776183960808,
      "grad_norm": 0.25,
      "learning_rate": 0.0004541126049151694,
      "loss": 2.6017,
      "step": 2300
    },
    {
      "epoch": 0.20391049123891072,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00045370790451522165,
      "loss": 2.5727,
      "step": 2310
    },
    {
      "epoch": 0.20479322063821337,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0004533016093288298,
      "loss": 2.5668,
      "step": 2320
    },
    {
      "epoch": 0.205675950037516,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0004528937225367935,
      "loss": 2.5869,
      "step": 2330
    },
    {
      "epoch": 0.20655867943681863,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.0004524842473323729,
      "loss": 2.59,
      "step": 2340
    },
    {
      "epoch": 0.20744140883612128,
      "grad_norm": 0.21875,
      "learning_rate": 0.0004520731869212634,
      "loss": 2.5767,
      "step": 2350
    },
    {
      "epoch": 0.20832413823542392,
      "grad_norm": 0.31640625,
      "learning_rate": 0.0004516605445215709,
      "loss": 2.5774,
      "step": 2360
    },
    {
      "epoch": 0.20920686763472657,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00045124632336378603,
      "loss": 2.5753,
      "step": 2370
    },
    {
      "epoch": 0.2100895970340292,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00045083052669075936,
      "loss": 2.5835,
      "step": 2380
    },
    {
      "epoch": 0.21097232643333186,
      "grad_norm": 0.31640625,
      "learning_rate": 0.0004504131577576758,
      "loss": 2.5853,
      "step": 2390
    },
    {
      "epoch": 0.2118550558326345,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00044999421983202905,
      "loss": 2.5831,
      "step": 2400
    },
    {
      "epoch": 0.21273778523193715,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00044957371619359644,
      "loss": 2.5935,
      "step": 2410
    },
    {
      "epoch": 0.2136205146312398,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00044915165013441257,
      "loss": 2.5853,
      "step": 2420
    },
    {
      "epoch": 0.21450324403054244,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0004487280249587441,
      "loss": 2.5908,
      "step": 2430
    },
    {
      "epoch": 0.21538597342984508,
      "grad_norm": 0.25,
      "learning_rate": 0.00044830284398306375,
      "loss": 2.5873,
      "step": 2440
    },
    {
      "epoch": 0.21626870282914773,
      "grad_norm": 0.2333984375,
      "learning_rate": 0.000447876110536024,
      "loss": 2.5863,
      "step": 2450
    },
    {
      "epoch": 0.21715143222845038,
      "grad_norm": 0.3046875,
      "learning_rate": 0.0004474478279584316,
      "loss": 2.5858,
      "step": 2460
    },
    {
      "epoch": 0.21803416162775302,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00044701799960322085,
      "loss": 2.5832,
      "step": 2470
    },
    {
      "epoch": 0.21891689102705567,
      "grad_norm": 0.287109375,
      "learning_rate": 0.000446586628835428,
      "loss": 2.5848,
      "step": 2480
    },
    {
      "epoch": 0.2197996204263583,
      "grad_norm": 0.216796875,
      "learning_rate": 0.00044615371903216407,
      "loss": 2.5662,
      "step": 2490
    },
    {
      "epoch": 0.22068234982566096,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00044571927358258917,
      "loss": 2.5855,
      "step": 2500
    },
    {
      "epoch": 0.22156507922496357,
      "grad_norm": 0.279296875,
      "learning_rate": 0.0004452832958878856,
      "loss": 2.5872,
      "step": 2510
    },
    {
      "epoch": 0.22244780862426622,
      "grad_norm": 0.22265625,
      "learning_rate": 0.0004448457893612311,
      "loss": 2.584,
      "step": 2520
    },
    {
      "epoch": 0.22333053802356886,
      "grad_norm": 0.2421875,
      "learning_rate": 0.0004444067574277727,
      "loss": 2.579,
      "step": 2530
    },
    {
      "epoch": 0.2242132674228715,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00044396620352459915,
      "loss": 2.5757,
      "step": 2540
    },
    {
      "epoch": 0.22509599682217415,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00044352413110071453,
      "loss": 2.5684,
      "step": 2550
    },
    {
      "epoch": 0.2259787262214768,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0004430805436170111,
      "loss": 2.5839,
      "step": 2560
    },
    {
      "epoch": 0.22686145562077945,
      "grad_norm": 0.22265625,
      "learning_rate": 0.00044263544454624224,
      "loss": 2.5779,
      "step": 2570
    },
    {
      "epoch": 0.2277441850200821,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00044218883737299526,
      "loss": 2.573,
      "step": 2580
    },
    {
      "epoch": 0.22862691441938474,
      "grad_norm": 0.28125,
      "learning_rate": 0.00044174072559366386,
      "loss": 2.5703,
      "step": 2590
    },
    {
      "epoch": 0.22950964381868738,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00044129111271642117,
      "loss": 2.5853,
      "step": 2600
    },
    {
      "epoch": 0.23039237321799003,
      "grad_norm": 0.232421875,
      "learning_rate": 0.0004408400022611921,
      "loss": 2.5679,
      "step": 2610
    },
    {
      "epoch": 0.23127510261729267,
      "grad_norm": 0.20703125,
      "learning_rate": 0.00044038739775962584,
      "loss": 2.5662,
      "step": 2620
    },
    {
      "epoch": 0.23215783201659532,
      "grad_norm": 0.2197265625,
      "learning_rate": 0.0004399333027550679,
      "loss": 2.5646,
      "step": 2630
    },
    {
      "epoch": 0.23304056141589796,
      "grad_norm": 0.240234375,
      "learning_rate": 0.000439477720802533,
      "loss": 2.5806,
      "step": 2640
    },
    {
      "epoch": 0.2339232908152006,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00043902065546867655,
      "loss": 2.5744,
      "step": 2650
    },
    {
      "epoch": 0.23480602021450325,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.0004385621103317671,
      "loss": 2.5689,
      "step": 2660
    },
    {
      "epoch": 0.2356887496138059,
      "grad_norm": 0.2138671875,
      "learning_rate": 0.00043810208898165836,
      "loss": 2.5626,
      "step": 2670
    },
    {
      "epoch": 0.23657147901310854,
      "grad_norm": 0.251953125,
      "learning_rate": 0.000437640595019761,
      "loss": 2.5837,
      "step": 2680
    },
    {
      "epoch": 0.2374542084124112,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00043717763205901436,
      "loss": 2.5777,
      "step": 2690
    },
    {
      "epoch": 0.2383369378117138,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00043671320372385834,
      "loss": 2.571,
      "step": 2700
    },
    {
      "epoch": 0.23921966721101645,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00043624731365020505,
      "loss": 2.5759,
      "step": 2710
    },
    {
      "epoch": 0.2401023966103191,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00043577996548541,
      "loss": 2.5723,
      "step": 2720
    },
    {
      "epoch": 0.24098512600962174,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00043531116288824393,
      "loss": 2.5803,
      "step": 2730
    },
    {
      "epoch": 0.2418678554089244,
      "grad_norm": 0.21875,
      "learning_rate": 0.00043484090952886404,
      "loss": 2.5819,
      "step": 2740
    },
    {
      "epoch": 0.24275058480822703,
      "grad_norm": 0.25,
      "learning_rate": 0.0004343692090887852,
      "loss": 2.5608,
      "step": 2750
    },
    {
      "epoch": 0.24363331420752968,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0004338960652608511,
      "loss": 2.5712,
      "step": 2760
    },
    {
      "epoch": 0.24451604360683232,
      "grad_norm": 0.23828125,
      "learning_rate": 0.0004334214817492057,
      "loss": 2.5537,
      "step": 2770
    },
    {
      "epoch": 0.24539877300613497,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.0004329454622692636,
      "loss": 2.566,
      "step": 2780
    },
    {
      "epoch": 0.2462815024054376,
      "grad_norm": 0.23828125,
      "learning_rate": 0.00043246801054768147,
      "loss": 2.5767,
      "step": 2790
    },
    {
      "epoch": 0.24716423180474026,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0004319891303223287,
      "loss": 2.5636,
      "step": 2800
    },
    {
      "epoch": 0.2480469612040429,
      "grad_norm": 0.2890625,
      "learning_rate": 0.000431508825342258,
      "loss": 2.5796,
      "step": 2810
    },
    {
      "epoch": 0.24892969060334555,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0004310270993676764,
      "loss": 2.5804,
      "step": 2820
    },
    {
      "epoch": 0.2498124200026482,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00043054395616991535,
      "loss": 2.5703,
      "step": 2830
    },
    {
      "epoch": 0.2506951494019508,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.0004300593995314017,
      "loss": 2.5692,
      "step": 2840
    },
    {
      "epoch": 0.2515778788012535,
      "grad_norm": 0.248046875,
      "learning_rate": 0.0004295734332456277,
      "loss": 2.5508,
      "step": 2850
    },
    {
      "epoch": 0.2524606082005561,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00042908606111712136,
      "loss": 2.5691,
      "step": 2860
    },
    {
      "epoch": 0.2533433375998588,
      "grad_norm": 0.248046875,
      "learning_rate": 0.0004285972869614169,
      "loss": 2.5741,
      "step": 2870
    },
    {
      "epoch": 0.2542260669991614,
      "grad_norm": 0.228515625,
      "learning_rate": 0.00042810711460502447,
      "loss": 2.5651,
      "step": 2880
    },
    {
      "epoch": 0.25510879639846407,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00042761554788540084,
      "loss": 2.5944,
      "step": 2890
    },
    {
      "epoch": 0.2559915257977667,
      "grad_norm": 0.22265625,
      "learning_rate": 0.0004271225906509186,
      "loss": 2.5719,
      "step": 2900
    },
    {
      "epoch": 0.25687425519706936,
      "grad_norm": 0.265625,
      "learning_rate": 0.0004266282467608365,
      "loss": 2.5665,
      "step": 2910
    },
    {
      "epoch": 0.257756984596372,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00042613252008526914,
      "loss": 2.5864,
      "step": 2920
    },
    {
      "epoch": 0.25863971399567465,
      "grad_norm": 0.244140625,
      "learning_rate": 0.0004256354145051567,
      "loss": 2.5584,
      "step": 2930
    },
    {
      "epoch": 0.25952244339497726,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.0004251369339122344,
      "loss": 2.5835,
      "step": 2940
    },
    {
      "epoch": 0.26040517279427994,
      "grad_norm": 0.37109375,
      "learning_rate": 0.00042463708220900225,
      "loss": 2.5874,
      "step": 2950
    },
    {
      "epoch": 0.26128790219358256,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.00042413586330869446,
      "loss": 2.5944,
      "step": 2960
    },
    {
      "epoch": 0.26217063159288523,
      "grad_norm": 0.220703125,
      "learning_rate": 0.00042363328113524846,
      "loss": 2.579,
      "step": 2970
    },
    {
      "epoch": 0.26305336099218785,
      "grad_norm": 0.2265625,
      "learning_rate": 0.0004231293396232747,
      "loss": 2.5835,
      "step": 2980
    },
    {
      "epoch": 0.26393609039149046,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.00042262404271802565,
      "loss": 2.5732,
      "step": 2990
    },
    {
      "epoch": 0.26481881979079314,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00042211739437536457,
      "loss": 2.58,
      "step": 3000
    },
    {
      "epoch": 0.26570154919009575,
      "grad_norm": 0.2109375,
      "learning_rate": 0.0004216093985617352,
      "loss": 2.5709,
      "step": 3010
    },
    {
      "epoch": 0.2665842785893984,
      "grad_norm": 0.228515625,
      "learning_rate": 0.0004211000592541301,
      "loss": 2.5737,
      "step": 3020
    },
    {
      "epoch": 0.26746700798870104,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0004205893804400599,
      "loss": 2.57,
      "step": 3030
    },
    {
      "epoch": 0.2683497373880037,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.0004200773661175219,
      "loss": 2.5627,
      "step": 3040
    },
    {
      "epoch": 0.26923246678730633,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0004195640202949687,
      "loss": 2.559,
      "step": 3050
    },
    {
      "epoch": 0.270115196186609,
      "grad_norm": 0.318359375,
      "learning_rate": 0.00041904934699127713,
      "loss": 2.5736,
      "step": 3060
    },
    {
      "epoch": 0.2709979255859116,
      "grad_norm": 0.326171875,
      "learning_rate": 0.0004185333502357164,
      "loss": 2.5594,
      "step": 3070
    },
    {
      "epoch": 0.2718806549852143,
      "grad_norm": 0.2119140625,
      "learning_rate": 0.000418016034067917,
      "loss": 2.5649,
      "step": 3080
    },
    {
      "epoch": 0.2727633843845169,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.00041749740253783853,
      "loss": 2.5689,
      "step": 3090
    },
    {
      "epoch": 0.2736461137838196,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.00041697745970573855,
      "loss": 2.5798,
      "step": 3100
    },
    {
      "epoch": 0.2745288431831222,
      "grad_norm": 0.25,
      "learning_rate": 0.00041645620964214023,
      "loss": 2.572,
      "step": 3110
    },
    {
      "epoch": 0.2754115725824249,
      "grad_norm": 0.228515625,
      "learning_rate": 0.0004159336564278012,
      "loss": 2.5933,
      "step": 3120
    },
    {
      "epoch": 0.2762943019817275,
      "grad_norm": 0.265625,
      "learning_rate": 0.0004154098041536807,
      "loss": 2.5831,
      "step": 3130
    },
    {
      "epoch": 0.27717703138103017,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00041488465692090837,
      "loss": 2.5858,
      "step": 3140
    },
    {
      "epoch": 0.2780597607803328,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00041435821884075176,
      "loss": 2.5733,
      "step": 3150
    },
    {
      "epoch": 0.2789424901796354,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00041383049403458403,
      "loss": 2.5785,
      "step": 3160
    },
    {
      "epoch": 0.2798252195789381,
      "grad_norm": 0.2119140625,
      "learning_rate": 0.0004133014866338521,
      "loss": 2.5804,
      "step": 3170
    },
    {
      "epoch": 0.2807079489782407,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.00041277120078004383,
      "loss": 2.5579,
      "step": 3180
    },
    {
      "epoch": 0.28159067837754337,
      "grad_norm": 0.30078125,
      "learning_rate": 0.0004122396406246559,
      "loss": 2.5792,
      "step": 3190
    },
    {
      "epoch": 0.282473407776846,
      "grad_norm": 0.24609375,
      "learning_rate": 0.0004117068103291614,
      "loss": 2.5744,
      "step": 3200
    },
    {
      "epoch": 0.28335613717614866,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.00041117271406497665,
      "loss": 2.5614,
      "step": 3210
    },
    {
      "epoch": 0.2842388665754513,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.00041063735601342934,
      "loss": 2.5693,
      "step": 3220
    },
    {
      "epoch": 0.28512159597475395,
      "grad_norm": 0.30859375,
      "learning_rate": 0.0004101007403657255,
      "loss": 2.5743,
      "step": 3230
    },
    {
      "epoch": 0.28600432537405657,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00040956287132291625,
      "loss": 2.5592,
      "step": 3240
    },
    {
      "epoch": 0.28688705477335924,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00040902375309586557,
      "loss": 2.5735,
      "step": 3250
    },
    {
      "epoch": 0.28776978417266186,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00040848338990521696,
      "loss": 2.5728,
      "step": 3260
    },
    {
      "epoch": 0.28865251357196453,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.00040794178598136033,
      "loss": 2.5648,
      "step": 3270
    },
    {
      "epoch": 0.28953524297126715,
      "grad_norm": 0.20703125,
      "learning_rate": 0.0004073989455643994,
      "loss": 2.5843,
      "step": 3280
    },
    {
      "epoch": 0.2904179723705698,
      "grad_norm": 0.234375,
      "learning_rate": 0.00040685487290411765,
      "loss": 2.5756,
      "step": 3290
    },
    {
      "epoch": 0.29130070176987244,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0004063095722599459,
      "loss": 2.5676,
      "step": 3300
    },
    {
      "epoch": 0.2921834311691751,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00040576304790092857,
      "loss": 2.5653,
      "step": 3310
    },
    {
      "epoch": 0.29306616056847773,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00040521530410569007,
      "loss": 2.5877,
      "step": 3320
    },
    {
      "epoch": 0.2939488899677804,
      "grad_norm": 0.265625,
      "learning_rate": 0.0004046663451624016,
      "loss": 2.5722,
      "step": 3330
    },
    {
      "epoch": 0.294831619367083,
      "grad_norm": 0.2060546875,
      "learning_rate": 0.0004041161753687478,
      "loss": 2.5592,
      "step": 3340
    },
    {
      "epoch": 0.29571434876638564,
      "grad_norm": 0.2265625,
      "learning_rate": 0.00040356479903189233,
      "loss": 2.5817,
      "step": 3350
    },
    {
      "epoch": 0.2965970781656883,
      "grad_norm": 0.23046875,
      "learning_rate": 0.0004030122204684449,
      "loss": 2.5689,
      "step": 3360
    },
    {
      "epoch": 0.29747980756499093,
      "grad_norm": 0.2333984375,
      "learning_rate": 0.0004024584440044271,
      "loss": 2.563,
      "step": 3370
    },
    {
      "epoch": 0.2983625369642936,
      "grad_norm": 0.2216796875,
      "learning_rate": 0.00040190347397523873,
      "loss": 2.5695,
      "step": 3380
    },
    {
      "epoch": 0.2992452663635962,
      "grad_norm": 0.21875,
      "learning_rate": 0.0004013473147256238,
      "loss": 2.5658,
      "step": 3390
    },
    {
      "epoch": 0.3001279957628989,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0004007899706096363,
      "loss": 2.5648,
      "step": 3400
    },
    {
      "epoch": 0.3010107251622015,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00040023144599060623,
      "loss": 2.5534,
      "step": 3410
    },
    {
      "epoch": 0.3018934545615042,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00039967174524110596,
      "loss": 2.585,
      "step": 3420
    },
    {
      "epoch": 0.3027761839608068,
      "grad_norm": 0.205078125,
      "learning_rate": 0.000399110872742915,
      "loss": 2.5641,
      "step": 3430
    },
    {
      "epoch": 0.3036589133601095,
      "grad_norm": 0.2080078125,
      "learning_rate": 0.0003985488328869865,
      "loss": 2.582,
      "step": 3440
    },
    {
      "epoch": 0.3045416427594121,
      "grad_norm": 0.2265625,
      "learning_rate": 0.0003979856300734126,
      "loss": 2.5632,
      "step": 3450
    },
    {
      "epoch": 0.30542437215871476,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.00039742126871138996,
      "loss": 2.5696,
      "step": 3460
    },
    {
      "epoch": 0.3063071015580174,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.0003968557532191852,
      "loss": 2.5784,
      "step": 3470
    },
    {
      "epoch": 0.30718983095732005,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00039628908802410057,
      "loss": 2.5746,
      "step": 3480
    },
    {
      "epoch": 0.30807256035662267,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.00039572127756243904,
      "loss": 2.5684,
      "step": 3490
    },
    {
      "epoch": 0.30895528975592534,
      "grad_norm": 0.236328125,
      "learning_rate": 0.0003951523262794693,
      "loss": 2.5684,
      "step": 3500
    },
    {
      "epoch": 0.30983801915522796,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.00039458223862939184,
      "loss": 2.5781,
      "step": 3510
    },
    {
      "epoch": 0.31072074855453063,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00039401101907530323,
      "loss": 2.571,
      "step": 3520
    },
    {
      "epoch": 0.31160347795383325,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.0003934386720891614,
      "loss": 2.569,
      "step": 3530
    },
    {
      "epoch": 0.31248620735313587,
      "grad_norm": 0.2138671875,
      "learning_rate": 0.00039286520215175085,
      "loss": 2.5527,
      "step": 3540
    },
    {
      "epoch": 0.31336893675243854,
      "grad_norm": 0.2080078125,
      "learning_rate": 0.0003922906137526474,
      "loss": 2.5774,
      "step": 3550
    },
    {
      "epoch": 0.31425166615174116,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.00039171491139018325,
      "loss": 2.572,
      "step": 3560
    },
    {
      "epoch": 0.31513439555104383,
      "grad_norm": 0.2890625,
      "learning_rate": 0.0003911380995714111,
      "loss": 2.5883,
      "step": 3570
    },
    {
      "epoch": 0.31601712495034645,
      "grad_norm": 0.30078125,
      "learning_rate": 0.0003905601828120698,
      "loss": 2.5614,
      "step": 3580
    },
    {
      "epoch": 0.3168998543496491,
      "grad_norm": 0.2197265625,
      "learning_rate": 0.0003899811656365485,
      "loss": 2.574,
      "step": 3590
    },
    {
      "epoch": 0.31778258374895174,
      "grad_norm": 0.2177734375,
      "learning_rate": 0.0003894010525778511,
      "loss": 2.5814,
      "step": 3600
    },
    {
      "epoch": 0.3186653131482544,
      "grad_norm": 0.23046875,
      "learning_rate": 0.000388819848177561,
      "loss": 2.5756,
      "step": 3610
    },
    {
      "epoch": 0.31954804254755703,
      "grad_norm": 0.265625,
      "learning_rate": 0.00038823755698580545,
      "loss": 2.5644,
      "step": 3620
    },
    {
      "epoch": 0.3204307719468597,
      "grad_norm": 0.24609375,
      "learning_rate": 0.0003876541835612202,
      "loss": 2.5813,
      "step": 3630
    },
    {
      "epoch": 0.3213135013461623,
      "grad_norm": 0.26953125,
      "learning_rate": 0.0003870697324709132,
      "loss": 2.5781,
      "step": 3640
    },
    {
      "epoch": 0.322196230745465,
      "grad_norm": 0.22265625,
      "learning_rate": 0.00038648420829042954,
      "loss": 2.5774,
      "step": 3650
    },
    {
      "epoch": 0.3230789601447676,
      "grad_norm": 0.22265625,
      "learning_rate": 0.00038589761560371515,
      "loss": 2.5752,
      "step": 3660
    },
    {
      "epoch": 0.3239616895440703,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00038530995900308107,
      "loss": 2.5554,
      "step": 3670
    },
    {
      "epoch": 0.3248444189433729,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00038472124308916753,
      "loss": 2.5661,
      "step": 3680
    },
    {
      "epoch": 0.3257271483426756,
      "grad_norm": 0.2216796875,
      "learning_rate": 0.00038413147247090795,
      "loss": 2.5818,
      "step": 3690
    },
    {
      "epoch": 0.3266098777419782,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00038354065176549274,
      "loss": 2.583,
      "step": 3700
    },
    {
      "epoch": 0.32749260714128087,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00038294878559833317,
      "loss": 2.5657,
      "step": 3710
    },
    {
      "epoch": 0.3283753365405835,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0003823558786030255,
      "loss": 2.5704,
      "step": 3720
    },
    {
      "epoch": 0.3292580659398861,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00038176193542131386,
      "loss": 2.5747,
      "step": 3730
    },
    {
      "epoch": 0.3301407953391888,
      "grad_norm": 0.208984375,
      "learning_rate": 0.00038116696070305503,
      "loss": 2.5803,
      "step": 3740
    },
    {
      "epoch": 0.3310235247384914,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00038057095910618125,
      "loss": 2.5665,
      "step": 3750
    },
    {
      "epoch": 0.33190625413779407,
      "grad_norm": 0.265625,
      "learning_rate": 0.00037997393529666393,
      "loss": 2.5765,
      "step": 3760
    },
    {
      "epoch": 0.3327889835370967,
      "grad_norm": 0.2060546875,
      "learning_rate": 0.00037937589394847714,
      "loss": 2.5569,
      "step": 3770
    },
    {
      "epoch": 0.33367171293639936,
      "grad_norm": 0.228515625,
      "learning_rate": 0.00037877683974356114,
      "loss": 2.5679,
      "step": 3780
    },
    {
      "epoch": 0.334554442335702,
      "grad_norm": 0.244140625,
      "learning_rate": 0.0003781767773717857,
      "loss": 2.5664,
      "step": 3790
    },
    {
      "epoch": 0.33543717173500465,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00037757571153091324,
      "loss": 2.5706,
      "step": 3800
    },
    {
      "epoch": 0.33631990113430726,
      "grad_norm": 0.26171875,
      "learning_rate": 0.000376973646926562,
      "loss": 2.5694,
      "step": 3810
    },
    {
      "epoch": 0.33720263053360994,
      "grad_norm": 0.2216796875,
      "learning_rate": 0.00037637058827216964,
|
"loss": 2.5567, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.33808535993291255, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 0.00037576654028895554, |
|
"loss": 2.5725, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.3389680893322152, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.00037516150770588487, |
|
"loss": 2.5594, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.33985081873151785, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.00037455549525963066, |
|
"loss": 2.5653, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.3407335481308205, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0003739485076945373, |
|
"loss": 2.5642, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.34161627753012314, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.000373340549762583, |
|
"loss": 2.5428, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.3424990069294258, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.0003727316262233429, |
|
"loss": 2.5701, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.3433817363287284, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.0003721217418439516, |
|
"loss": 2.556, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.3442644657280311, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.00037151090139906593, |
|
"loss": 2.5647, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.3451471951273337, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.00037089910967082765, |
|
"loss": 2.5705, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.34602992452663633, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0003702863714488257, |
|
"loss": 2.5759, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.346912653925939, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.0003696726915300592, |
|
"loss": 2.5727, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.3477953833252416, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.0003690580747188995, |
|
"loss": 2.5742, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.3486781127245443, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.00036844252582705244, |
|
"loss": 2.5529, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.3495608421238469, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.0003678260496735214, |
|
"loss": 2.5697, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.3504435715231496, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0003672086510845687, |
|
"loss": 2.5643, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.3513263009224522, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00036659033489367835, |
|
"loss": 2.5644, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.3522090303217549, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0003659711059415182, |
|
"loss": 2.5698, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.3530917597210575, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.0003653509690759016, |
|
"loss": 2.5789, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3530917597210575, |
|
"eval_accuracy": 0.5002887399113815, |
|
"eval_loss": 2.4561643600463867, |
|
"eval_runtime": 6.9947, |
|
"eval_samples_per_second": 45.463, |
|
"eval_steps_per_second": 0.429, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.35397448912036017, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.00036472992915175017, |
|
"loss": 2.5587, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.3548572185196628, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.00036410799103105503, |
|
"loss": 2.5827, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.35573994791896546, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.0003634851595828393, |
|
"loss": 2.5659, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.3566226773182681, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 0.00036286143968311963, |
|
"loss": 2.5649, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.35750540671757075, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 0.00036223683621486845, |
|
"loss": 2.5683, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.35838813611687337, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.00036161135406797504, |
|
"loss": 2.5568, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.35927086551617604, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.0003609849981392079, |
|
"loss": 2.5601, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.36015359491547866, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0003603577733321764, |
|
"loss": 2.5553, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.3610363243147813, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.0003597296845572917, |
|
"loss": 2.5573, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.36191905371408395, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.00035910073673172933, |
|
"loss": 2.5609, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.36280178311338657, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.00035847093477938953, |
|
"loss": 2.5557, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.36368451251268924, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.00035784028363085985, |
|
"loss": 2.5553, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.36456724191199186, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.00035720878822337576, |
|
"loss": 2.5494, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.36544997131129453, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00035657645350078233, |
|
"loss": 2.5837, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.36633270071059715, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.0003559432844134954, |
|
"loss": 2.5717, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.3672154301098998, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0003553092859184629, |
|
"loss": 2.5629, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.36809815950920244, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.0003546744629791261, |
|
"loss": 2.5562, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.3689808889085051, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00035403882056538044, |
|
"loss": 2.5511, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.36986361830780773, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.00035340236365353724, |
|
"loss": 2.5611, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.3707463477071104, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.000352765097226284, |
|
"loss": 2.5717, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.371629077106413, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.0003521270262726458, |
|
"loss": 2.5666, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.3725118065057157, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00035148815578794635, |
|
"loss": 2.5583, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.3733945359050183, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.0003508484907737687, |
|
"loss": 2.5552, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.374277265304321, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.0003502080362379159, |
|
"loss": 2.5708, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.3751599947036236, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.00034956679719437225, |
|
"loss": 2.5804, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.3760427241029263, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.00034892477866326356, |
|
"loss": 2.5592, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.3769254535022289, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.0003482819856708183, |
|
"loss": 2.5529, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.3778081829015315, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.00034763842324932794, |
|
"loss": 2.579, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.3786909123008342, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.00034699409643710764, |
|
"loss": 2.5711, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.3795736417001368, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00034634901027845677, |
|
"loss": 2.5626, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.38045637109943947, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.0003457031698236196, |
|
"loss": 2.5598, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.3813391004987421, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 0.00034505658012874544, |
|
"loss": 2.5722, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.38222182989804476, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00034440924625584954, |
|
"loss": 2.5619, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.3831045592973474, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.0003437611732727728, |
|
"loss": 2.5547, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.38398728869665005, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.0003431123662531427, |
|
"loss": 2.5654, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.38487001809595267, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0003424628302763332, |
|
"loss": 2.5526, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.38575274749525534, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0003418125704274252, |
|
"loss": 2.5546, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.38663547689455796, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00034116159179716675, |
|
"loss": 2.585, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.38751820629386063, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 0.0003405098994819329, |
|
"loss": 2.572, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.38840093569316325, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.00033985749858368605, |
|
"loss": 2.5571, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.3892836650924659, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.0003392043942099358, |
|
"loss": 2.5717, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.39016639449176854, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.0003385505914736994, |
|
"loss": 2.5652, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.3910491238910712, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.00033789609549346146, |
|
"loss": 2.5583, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.39193185329037383, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0003372409113931334, |
|
"loss": 2.5538, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.3928145826896765, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.0003365850443020142, |
|
"loss": 2.5522, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.3936973120889791, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 0.00033592849935474965, |
|
"loss": 2.5695, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.39458004148828174, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.0003352712816912925, |
|
"loss": 2.57, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.3954627708875844, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.00033461339645686196, |
|
"loss": 2.5631, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.39634550028688703, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 0.0003339548488019033, |
|
"loss": 2.558, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.3972282296861897, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.00033329564388204816, |
|
"loss": 2.5512, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.3981109590854923, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.0003326357868580734, |
|
"loss": 2.5622, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.398993688484795, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.0003319752828958613, |
|
"loss": 2.5679, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.3998764178840976, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.0003313141371663587, |
|
"loss": 2.5745, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.4007591472834003, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.000330652354845537, |
|
"loss": 2.5843, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.4016418766827029, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 0.0003299899411143509, |
|
"loss": 2.5639, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.4025246060820056, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.0003293269011586986, |
|
"loss": 2.5564, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.4034073354813082, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.00032866324016938095, |
|
"loss": 2.5443, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.40429006488061087, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00032799896334206045, |
|
"loss": 2.5623, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.4051727942799135, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 0.000327334075877221, |
|
"loss": 2.5788, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.40605552367921616, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.000326668582980127, |
|
"loss": 2.5764, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.4069382530785188, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 0.00032600248986078295, |
|
"loss": 2.5626, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.40782098247782145, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00032533580173389195, |
|
"loss": 2.5496, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.40870371187712407, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0003246685238188154, |
|
"loss": 2.5608, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.40958644127642674, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00032400066133953225, |
|
"loss": 2.5702, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.41046917067572936, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0003233322195245977, |
|
"loss": 2.567, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.411351900075032, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.00032266320360710237, |
|
"loss": 2.5644, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.41223462947433465, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.0003219936188246317, |
|
"loss": 2.5675, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.41311735887363726, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 0.0003213234704192243, |
|
"loss": 2.5619, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.41400008827293994, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.00032065276363733137, |
|
"loss": 2.5594, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.41488281767224255, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.00031998150372977577, |
|
"loss": 2.5668, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.4157655470715452, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.0003193096959517103, |
|
"loss": 2.5547, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.41664827647084784, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.0003186373455625774, |
|
"loss": 2.5528, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.4175310058701505, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.0003179644578260669, |
|
"loss": 2.5555, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.41841373526945314, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.00031729103801007575, |
|
"loss": 2.5547, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.4192964646687558, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.0003166170913866665, |
|
"loss": 2.5666, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.4201791940680584, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 0.00031594262323202577, |
|
"loss": 2.5587, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.4210619234673611, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.0003152676388264234, |
|
"loss": 2.5577, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.4219446528666637, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.00031459214345417046, |
|
"loss": 2.5362, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.4228273822659664, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.00031391614240357864, |
|
"loss": 2.5542, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.423710111665269, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.00031323964096691825, |
|
"loss": 2.565, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.4245928410645717, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.0003125626444403772, |
|
"loss": 2.5467, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.4254755704638743, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.00031188515812401917, |
|
"loss": 2.5632, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.42635829986317697, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00031120718732174235, |
|
"loss": 2.5587, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.4272410292624796, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.000310528737341238, |
|
"loss": 2.5333, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.4281237586617822, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00030984981349394864, |
|
"loss": 2.561, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.4290064880610849, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 0.00030917042109502663, |
|
"loss": 2.5618, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.4298892174603875, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.00030849056546329253, |
|
"loss": 2.5497, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.43077194685969017, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.0003078102519211933, |
|
"loss": 2.5374, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.4316546762589928, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0003071294857947612, |
|
"loss": 2.5631, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.43253740565829546, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.0003064482724135711, |
|
"loss": 2.575, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.4334201350575981, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.00030576661711069985, |
|
"loss": 2.5525, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.43430286445690075, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.0003050845252226837, |
|
"loss": 2.5718, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.43518559385620337, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.0003044020020894769, |
|
"loss": 2.5601, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.43606832325550604, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.00030371905305441, |
|
"loss": 2.5612, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.43695105265480866, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.0003030356834641476, |
|
"loss": 2.5504, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.43783378205411133, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 0.0003023518986686469, |
|
"loss": 2.5584, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.43871651145341395, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.0003016677040211154, |
|
"loss": 2.5645, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.4395992408527166, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.00030098310487796965, |
|
"loss": 2.5536, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.44048197025201924, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.00030029810659879273, |
|
"loss": 2.5535, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.4413646996513219, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.00029961271454629235, |
|
"loss": 2.565, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.44224742905062453, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 0.0002989269340862591, |
|
"loss": 2.5531, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.44313015844992715, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.0002982407705875243, |
|
"loss": 2.5636, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.4440128878492298, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.00029755422942191805, |
|
"loss": 2.5507, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.44489561724853244, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0002968673159642271, |
|
"loss": 2.5646, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.4457783466478351, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.00029618003559215276, |
|
"loss": 2.5697, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.44666107604713773, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 0.0002954923936862689, |
|
"loss": 2.5557, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.4475438054464404, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.00029480439562997964, |
|
"loss": 2.5661, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.448426534845743, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.00029411604680947755, |
|
"loss": 2.5527, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.4493092642450457, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00029342735261370095, |
|
"loss": 2.5538, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.4501919936443483, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 0.0002927383184342924, |
|
"loss": 2.5503, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.451074723043651, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.00029204894966555577, |
|
"loss": 2.5669, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.4519574524429536, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.00029135925170441457, |
|
"loss": 2.5698, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.4528401818422563, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.0002906692299503694, |
|
"loss": 2.567, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.4537229112415589, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00028997888980545586, |
|
"loss": 2.5538, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.45460564064086156, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.00028928823667420206, |
|
"loss": 2.5495, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.4554883700401642, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00028859727596358643, |
|
"loss": 2.5627, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.45637109943946685, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.00028790601308299545, |
|
"loss": 2.5567, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.45725382883876947, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.0002872144534441812, |
|
"loss": 2.5561, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.45813655823807214, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.0002865226024612189, |
|
"loss": 2.5693, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.45901928763737476, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 0.00028583046555046487, |
|
"loss": 2.5478, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.4599020170366774, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.0002851380481305136, |
|
"loss": 2.5533, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.46078474643598005, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.00028444535562215594, |
|
"loss": 2.5529, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.46166747583528267, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.00028375239344833616, |
|
"loss": 2.5532, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.46255020523458534, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.00028305916703410974, |
|
"loss": 2.566, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.46343293463388796, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00028236568180660073, |
|
"loss": 2.5478, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.46431566403319063, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.0002816719431949596, |
|
"loss": 2.5633, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.46519839343249325, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.0002809779566303203, |
|
"loss": 2.5704, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.4660811228317959, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.00028028372754575805, |
|
"loss": 2.5681, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.46696385223109854, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.0002795892613762467, |
|
"loss": 2.5515, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.4678465816304012, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.00027889456355861635, |
|
"loss": 2.5681, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.46872931102970383, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.00027819963953151024, |
|
"loss": 2.5487, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.4696120404290065, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0002775044947353428, |
|
"loss": 2.5672, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.4704947698283091, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.0002768091346122569, |
|
"loss": 2.562, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.4713774992276118, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 0.000276113564606081, |
|
"loss": 2.5542, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.4722602286269144, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.00027541779016228664, |
|
"loss": 2.5435, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.4731429580262171, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.0002747218167279461, |
|
"loss": 2.5631, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.4740256874255197, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.00027402564975168925, |
|
"loss": 2.5464, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.4749084168248224, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.0002733292946836615, |
|
"loss": 2.5498, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.475791146224125, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.0002726327569754803, |
|
"loss": 2.559, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.4766738756234276, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.00027193604208019346, |
|
"loss": 2.5666, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.4775566050227303, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.0002712391554522355, |
|
"loss": 2.556, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.4784393344220329, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.0002705421025473857, |
|
"loss": 2.559, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.4793220638213356, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0002698448888227251, |
|
"loss": 2.5503, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.4802047932206382, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.0002691475197365936, |
|
"loss": 2.5404, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.48108752261994087, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.00026845000074854754, |
|
"loss": 2.5667, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.4819702520192435, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.0002677523373193165, |
|
"loss": 2.559, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.48285298141854616, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.00026705453491076127, |
|
"loss": 2.5533, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.4837357108178488, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.00026635659898583043, |
|
"loss": 2.5518, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.48461844021715145, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.000265658535008518, |
|
"loss": 2.5682, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.48550116961645406, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.00026496034844382036, |
|
"loss": 2.5576, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.48638389901575674, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 0.0002642620447576935, |
|
"loss": 2.546, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.48726662841505936, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.0002635636294170106, |
|
"loss": 2.5629, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.48814935781436203, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.00026286510788951886, |
|
"loss": 2.5602, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.48903208721366465, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 0.0002621664856437967, |
|
"loss": 2.5532, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.4899148166129673, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00026146776814921105, |
|
"loss": 2.5645, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.49079754601226994, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.0002607689608758746, |
|
"loss": 2.577, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.4916802754115726, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 0.000260070069294603, |
|
"loss": 2.5333, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.4925630048108752, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.00025937109887687164, |
|
"loss": 2.5584, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.49344573421017784, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.00025867205509477335, |
|
"loss": 2.5522, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.4943284636094805, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.0002579729434209752, |
|
"loss": 2.5581, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.49521119300878313, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 0.00025727376932867593, |
|
"loss": 2.5625, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.4960939224080858, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.00025657453829156256, |
|
"loss": 2.5555, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.4969766518073884, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.00025587525578376843, |
|
"loss": 2.5526, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.4978593812066911, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 0.0002551759272798295, |
|
"loss": 2.5501, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.4987421106059937, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.00025447655825464174, |
|
"loss": 2.5728, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.4996248400052964, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.0002537771541834187, |
|
"loss": 2.5491, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.5005075694045991, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.00025307772054164804, |
|
"loss": 2.5658, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.5013902988039016, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 0.000252378262805049, |
|
"loss": 2.5504, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.5022730282032043, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.0002516787864495294, |
|
"loss": 2.5621, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.503155757602507, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.00025097929695114295, |
|
"loss": 2.5526, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.5040384870018096, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.00025027979978604615, |
|
"loss": 2.5535, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.5049212164011122, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.0002495803004304556, |
|
"loss": 2.5489, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.5058039458004149, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.0002488808043606048, |
|
"loss": 2.5585, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.5066866751997176, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 0.0002481813170527019, |
|
"loss": 2.561, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.5075694045990202, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0002474818439828862, |
|
"loss": 2.5538, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.5084521339983228, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0002467823906271856, |
|
"loss": 2.559, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.5093348633976255, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.00024608296246147375, |
|
"loss": 2.5583, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.5102175927969281, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.00024538356496142693, |
|
"loss": 2.5506, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.5111003221962307, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.00024468420360248145, |
|
"loss": 2.5589, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.5119830515955334, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00024398488385979055, |
|
"loss": 2.5531, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.512865780994836, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 0.00024328561120818195, |
|
"loss": 2.5605, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.5137485103941387, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 0.00024258639112211453, |
|
"loss": 2.5698, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.5146312397934413, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.00024188722907563537, |
|
"loss": 2.5531, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.515513969192744, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.00024118813054233774, |
|
"loss": 2.547, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.5163966985920466, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.00024048910099531726, |
|
"loss": 2.5631, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.5172794279913493, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00023979014590712962, |
|
"loss": 2.5436, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.5181621573906519, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.00023909127074974744, |
|
"loss": 2.5586, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.5190448867899545, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.00023839248099451782, |
|
"loss": 2.5524, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.5199276161892572, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 0.00023769378211211916, |
|
"loss": 2.5391, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.5208103455885599, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.00023699517957251825, |
|
"loss": 2.5464, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.5216930749878624, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.00023629667884492799, |
|
"loss": 2.556, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.5225758043871651, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.00023559828539776394, |
|
"loss": 2.5516, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.5234585337864678, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.00023490000469860185, |
|
"loss": 2.5518, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.5243412631857705, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.0002342018422141347, |
|
"loss": 2.5477, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.525223992585073, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 0.00023350380341013034, |
|
"loss": 2.5656, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.5261067219843757, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 0.000232805893751388, |
|
"loss": 2.568, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.5269894513836784, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 0.0002321081187016959, |
|
"loss": 2.5531, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.5278721807829809, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.00023141048372378863, |
|
"loss": 2.5555, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.5287549101822836, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 0.00023071299427930396, |
|
"loss": 2.5531, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.5296376395815863, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 0.00023001565582874046, |
|
"loss": 2.555, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5296376395815863, |
|
"eval_accuracy": 0.5019328679706038, |
|
"eval_loss": 2.4451804161071777, |
|
"eval_runtime": 7.0082, |
|
"eval_samples_per_second": 45.375, |
|
"eval_steps_per_second": 0.428, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.530520368980889, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.00022931847383141446, |
|
"loss": 2.5439, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 0.5314030983801915, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.00022862145374541768, |
|
"loss": 2.553, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.5322858277794942, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 0.00022792460102757407, |
|
"loss": 2.5539, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.5331685571787969, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.00022722792113339722, |
|
"loss": 2.5546, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.5340512865780995, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 0.0002265314195170481, |
|
"loss": 2.5649, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.5349340159774021, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.00022583510163129162, |
|
"loss": 2.5396, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.5358167453767048, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 0.00022513897292745434, |
|
"loss": 2.5698, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 0.5366994747760074, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.00022444303885538178, |
|
"loss": 2.5594, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.5375822041753101, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 0.000223747304863396, |
|
"loss": 2.5539, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.5384649335746127, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.0002230517763982523, |
|
"loss": 2.5658, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.5393476629739153, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 0.0002223564589050971, |
|
"loss": 2.5584, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 0.540230392373218, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 0.00022166135782742525, |
|
"loss": 2.5497, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.5411131217725207, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 0.0002209664786070372, |
|
"loss": 2.5505, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 0.5419958511718233, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.00022027182668399653, |
|
"loss": 2.5513, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.5428785805711259, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 0.0002195774074965874, |
|
"loss": 2.5493, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.5437613099704286, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.00021888322648127206, |
|
"loss": 2.5636, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.5446440393697312, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.0002181892890726479, |
|
"loss": 2.5583, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 0.5455267687690338, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.00021749560070340534, |
|
"loss": 2.5529, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.5464094981683365, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.000216802166804285, |
|
"loss": 2.5515, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 0.5472922275676392, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 0.00021610899280403555, |
|
"loss": 2.5585, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.5481749569669417, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 0.00021541608412937075, |
|
"loss": 2.5432, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.5490576863662444, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.000214723446204927, |
|
"loss": 2.5633, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.5499404157655471, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 0.00021403108445322168, |
|
"loss": 2.5604, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 0.5508231451648498, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.0002133390042946094, |
|
"loss": 2.5477, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.5517058745641523, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 0.00021264721114724064, |
|
"loss": 2.5514, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.552588603963455, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.0002119557104270187, |
|
"loss": 2.5616, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.5534713333627577, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 0.00021126450754755774, |
|
"loss": 2.5491, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.5543540627620603, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.00021057360792014004, |
|
"loss": 2.5473, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.5552367921613629, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.0002098830169536738, |
|
"loss": 2.5478, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 0.5561195215606656, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.00020919274005465083, |
|
"loss": 2.552, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.5570022509599682, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 0.00020850278262710416, |
|
"loss": 2.5571, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 0.5578849803592708, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 0.0002078131500725657, |
|
"loss": 2.5556, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.5587677097585735, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.00020712384779002392, |
|
"loss": 2.552, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.5596504391578762, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.00020643488117588199, |
|
"loss": 2.5512, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.5605331685571788, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 0.00020574625562391494, |
|
"loss": 2.5546, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.5614158979564814, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.00020505797652522751, |
|
"loss": 2.5543, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.5622986273557841, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 0.00020437004926821255, |
|
"loss": 2.5575, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 0.5631813567550867, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.00020368247923850826, |
|
"loss": 2.5547, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.5640640861543894, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 0.00020299527181895602, |
|
"loss": 2.5412, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 0.564946815553692, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 0.00020230843238955854, |
|
"loss": 2.544, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.5658295449529946, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.0002016219663274377, |
|
"loss": 2.5603, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 0.5667122743522973, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.00020093587900679217, |
|
"loss": 2.5474, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.5675950037516, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 0.00020025017579885563, |
|
"loss": 2.565, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 0.5684777331509026, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.00019956486207185477, |
|
"loss": 2.5528, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.5693604625502052, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.0001988799431909668, |
|
"loss": 2.5615, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.5702431919495079, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.00019819542451827808, |
|
"loss": 2.5547, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.5711259213488106, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 0.00019751131141274147, |
|
"loss": 2.5488, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 0.5720086507481131, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0001968276092301352, |
|
"loss": 2.5499, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.5728913801474158, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 0.00019614432332302006, |
|
"loss": 2.5489, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 0.5737741095467185, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.00019546145904069808, |
|
"loss": 2.5497, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.574656838946021, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 0.00019477902172917045, |
|
"loss": 2.5487, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 0.5755395683453237, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.0001940970167310957, |
|
"loss": 2.5668, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.5764222977446264, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 0.0001934154493857479, |
|
"loss": 2.5521, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 0.5773050271439291, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.0001927343250289747, |
|
"loss": 2.5676, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.5781877565432316, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.00019205364899315593, |
|
"loss": 2.5402, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.5790704859425343, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 0.00019137342660716133, |
|
"loss": 2.5538, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.579953215341837, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.00019069366319630923, |
|
"loss": 2.5536, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 0.5808359447411396, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.00019001436408232496, |
|
"loss": 2.5481, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.5817186741404422, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 0.00018933553458329856, |
|
"loss": 2.5494, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 0.5826014035397449, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 0.00018865718001364375, |
|
"loss": 2.5421, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.5834841329390476, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.00018797930568405612, |
|
"loss": 2.5504, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 0.5843668623383502, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.00018730191690147176, |
|
"loss": 2.5459, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.5852495917376528, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.00018662501896902519, |
|
"loss": 2.5339, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 0.5861323211369555, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 0.0001859486171860082, |
|
"loss": 2.5401, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.5870150505362581, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 0.00018527271684782865, |
|
"loss": 2.5508, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.5878977799355608, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.00018459732324596834, |
|
"loss": 2.555, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.5887805093348634, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 0.0001839224416679421, |
|
"loss": 2.5675, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 0.589663238734166, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 0.00018324807739725614, |
|
"loss": 2.5473, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.5905459681334687, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 0.000182574235713367, |
|
"loss": 2.5612, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 0.5914286975327713, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.00018190092189163974, |
|
"loss": 2.5791, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.592311426932074, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.00018122814120330688, |
|
"loss": 2.5439, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 0.5931941563313766, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 0.00018055589891542758, |
|
"loss": 2.5517, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.5940768857306793, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.00017988420029084551, |
|
"loss": 2.5437, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 0.5949596151299819, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 0.00017921305058814818, |
|
"loss": 2.5537, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 0.5958423445292845, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 0.00017854245506162582, |
|
"loss": 2.544, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.5967250739285872, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.00017787241896123024, |
|
"loss": 2.5581, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.5976078033278899, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.00017720294753253345, |
|
"loss": 2.5579, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 0.5984905327271924, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 0.00017653404601668666, |
|
"loss": 2.5429, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.5993732621264951, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 0.00017586571965037966, |
|
"loss": 2.5569, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 0.6002559915257978, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 0.0001751979736657993, |
|
"loss": 2.545, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.6011387209251005, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 0.00017453081329058882, |
|
"loss": 2.5456, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 0.602021450324403, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 0.0001738642437478067, |
|
"loss": 2.5416, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 0.6029041797237057, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 0.00017319827025588614, |
|
"loss": 2.5233, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 0.6037869091230084, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.0001725328980285939, |
|
"loss": 2.5527, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.604669638522311, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 0.00017186813227498937, |
|
"loss": 2.55, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.6055523679216136, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.0001712039781993844, |
|
"loss": 2.5464, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 0.6064350973209163, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.00017054044100130178, |
|
"loss": 2.5457, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 0.607317826720219, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 0.0001698775258754351, |
|
"loss": 2.551, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.6082005561195215, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.00016921523801160756, |
|
"loss": 2.5549, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 0.6090832855188242, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.00016855358259473217, |
|
"loss": 2.5485, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.6099660149181269, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.00016789256480477023, |
|
"loss": 2.5402, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 0.6108487443174295, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.00016723218981669127, |
|
"loss": 2.5418, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.6117314737167321, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 0.00016657246280043266, |
|
"loss": 2.5591, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 0.6126142031160348, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 0.00016591338892085874, |
|
"loss": 2.5536, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 0.6134969325153374, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 0.0001652549733377206, |
|
"loss": 2.5456, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.6143796619146401, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 0.00016459722120561567, |
|
"loss": 2.5326, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.6152623913139427, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 0.0001639401376739475, |
|
"loss": 2.5623, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 0.6161451207132453, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.0001632837278868851, |
|
"loss": 2.5383, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 0.617027850112548, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 0.00016262799698332292, |
|
"loss": 2.5386, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 0.6179105795118507, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.00016197295009684077, |
|
"loss": 2.5427, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.6187933089111533, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 0.00016131859235566325, |
|
"loss": 2.541, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 0.6196760383104559, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 0.00016066492888261983, |
|
"loss": 2.5609, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.6205587677097586, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 0.00016001196479510448, |
|
"loss": 2.5601, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 0.6214414971090613, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 0.00015935970520503638, |
|
"loss": 2.5552, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.6223242265083638, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 0.0001587081552188188, |
|
"loss": 2.5498, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.6232069559076665, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 0.0001580573199372999, |
|
"loss": 2.5479, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.6240896853069692, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 0.00015740720445573262, |
|
"loss": 2.5488, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 0.6249724147062717, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.00015675781386373462, |
|
"loss": 2.5478, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.6258551441055744, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.0001561091532452486, |
|
"loss": 2.5579, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 0.6267378735048771, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 0.00015546122767850232, |
|
"loss": 2.5543, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.6276206029041798, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.00015481404223596939, |
|
"loss": 2.559, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 0.6285033323034823, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 0.0001541676019843286, |
|
"loss": 2.549, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.629386061702785, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 0.00015352191198442507, |
|
"loss": 2.5372, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 0.6302687911020877, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.00015287697729123045, |
|
"loss": 2.5458, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 0.6311515205013903, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 0.0001522328029538031, |
|
"loss": 2.5545, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.6320342499006929, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.00015158939401524877, |
|
"loss": 2.5564, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 0.6329169792999956, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 0.00015094675551268096, |
|
"loss": 2.5528, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 0.6337997086992982, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 0.00015030489247718173, |
|
"loss": 2.5414, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 0.6346824380986009, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 0.00014966380993376217, |
|
"loss": 2.5522, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 0.6355651674979035, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.0001490235129013228, |
|
"loss": 2.5521, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.6364478968972062, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.00014838400639261503, |
|
"loss": 2.5627, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 0.6373306262965088, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.000147745295414201, |
|
"loss": 2.5546, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 0.6382133556958115, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.00014710738496641492, |
|
"loss": 2.5284, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 0.6390960850951141, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 0.0001464702800433238, |
|
"loss": 2.5326, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 0.6399788144944167, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 0.00014583398563268858, |
|
"loss": 2.5522, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.6408615438937194, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 0.00014519850671592467, |
|
"loss": 2.5589, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 0.641744273293022, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.000144563848268063, |
|
"loss": 2.5653, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 0.6426270026923246, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 0.00014393001525771153, |
|
"loss": 2.55, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 0.6435097320916273, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 0.00014329701264701597, |
|
"loss": 2.5498, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 0.64439246149093, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 0.0001426648453916208, |
|
"loss": 2.545, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.6452751908902326, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 0.00014203351844063088, |
|
"loss": 2.537, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 0.6461579202895352, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.0001414030367365725, |
|
"loss": 2.5452, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.6470406496888379, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.00014077340521535472, |
|
"loss": 2.5548, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 0.6479233790881406, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.00014014462880623042, |
|
"loss": 2.5404, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 0.6488061084874431, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 0.00013951671243175824, |
|
"loss": 2.5443, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.6496888378867458, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 0.00013888966100776386, |
|
"loss": 2.5506, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.6505715672860485, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.00013826347944330116, |
|
"loss": 2.5296, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 0.6514542966853512, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 0.00013763817264061425, |
|
"loss": 2.5591, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 0.6523370260846537, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 0.00013701374549509899, |
|
"loss": 2.5541, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 0.6532197554839564, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 0.00013639020289526438, |
|
"loss": 2.5624, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.6541024848832591, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 0.00013576754972269463, |
|
"loss": 2.5578, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 0.6549852142825617, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.0001351457908520109, |
|
"loss": 2.5454, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 0.6558679436818643, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 0.0001345249311508328, |
|
"loss": 2.5486, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 0.656750673081167, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 0.00013390497547974078, |
|
"loss": 2.5484, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.6576334024804696, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 0.00013328592869223747, |
|
"loss": 2.5486, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.6585161318797722, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 0.00013266779563471064, |
|
"loss": 2.5437, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 0.6593988612790749, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 0.00013205058114639407, |
|
"loss": 2.5521, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 0.6602815906783776, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.00013143429005933052, |
|
"loss": 2.5482, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 0.6611643200776802, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 0.00013081892719833378, |
|
"loss": 2.5343, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 0.6620470494769828, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.0001302044973809503, |
|
"loss": 2.5493, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.6629297788762855, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 0.00012959100541742248, |
|
"loss": 2.5553, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 0.6638125082755881, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.0001289784561106499, |
|
"loss": 2.5531, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 0.6646952376748908, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 0.00012836685425615275, |
|
"loss": 2.5634, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 0.6655779670741934, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 0.00012775620464203365, |
|
"loss": 2.547, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 0.666460696473496, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 0.0001271465120489401, |
|
"loss": 2.54, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.6673434258727987, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 0.0001265377812500278, |
|
"loss": 2.548, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.6682261552721014, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 0.00012593001701092233, |
|
"loss": 2.547, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 0.669108884671404, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.00012532322408968221, |
|
"loss": 2.5431, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 0.6699916140707066, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.00012471740723676213, |
|
"loss": 2.5517, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 0.6708743434700093, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 0.000124112571194975, |
|
"loss": 2.5473, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.671757072869312, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 0.00012350872069945547, |
|
"loss": 2.5503, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 0.6726398022686145, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 0.00012290586047762216, |
|
"loss": 2.547, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 0.6735225316679172, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 0.00012230399524914136, |
|
"loss": 2.5385, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 0.6744052610672199, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 0.00012170312972588974, |
|
"loss": 2.5363, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 0.6752879904665224, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 0.00012110326861191722, |
|
"loss": 2.5413, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.6761707198658251, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.00012050441660341074, |
|
"loss": 2.5474, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 0.6770534492651278, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.00011990657838865706, |
|
"loss": 2.5413, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 0.6779361786644305, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 0.00011930975864800603, |
|
"loss": 2.5438, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.678818908063733, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 0.0001187139620538342, |
|
"loss": 2.5575, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 0.6797016374630357, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.0001181191932705081, |
|
"loss": 2.5511, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.6805843668623384, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 0.00011752545695434788, |
|
"loss": 2.5575, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 0.681467096261641, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.00011693275775359049, |
|
"loss": 2.5661, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 0.6823498256609436, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 0.00011634110030835341, |
|
"loss": 2.5405, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 0.6832325550602463, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 0.000115750489250599, |
|
"loss": 2.5429, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 0.684115284459549, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 0.00011516092920409706, |
|
"loss": 2.5527, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.6849980138588516, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 0.00011457242478438962, |
|
"loss": 2.5431, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 0.6858807432581542, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 0.00011398498059875434, |
|
"loss": 2.5475, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 0.6867634726574569, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.00011339860124616833, |
|
"loss": 2.5277, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 0.6876462020567595, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 0.00011281329131727272, |
|
"loss": 2.5447, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 0.6885289314560622, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.00011222905539433593, |
|
"loss": 2.5402, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.6894116608553648, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 0.00011164589805121852, |
|
"loss": 2.5401, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 0.6902943902546674, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 0.00011106382385333708, |
|
"loss": 2.5293, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 0.6911771196539701, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 0.00011048283735762806, |
|
"loss": 2.5591, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 0.6920598490532727, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 0.00010990294311251328, |
|
"loss": 2.5501, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 0.6929425784525753, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.00010932414565786286, |
|
"loss": 2.5488, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.693825307851878, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 0.0001087464495249606, |
|
"loss": 2.5563, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 0.6947080372511807, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 0.00010816985923646838, |
|
"loss": 2.5468, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 0.6955907666504832, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 0.00010759437930639058, |
|
"loss": 2.5426, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 0.6964734960497859, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.00010702001424003896, |
|
"loss": 2.5377, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 0.6973562254490886, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 0.00010644676853399688, |
|
"loss": 2.5323, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.6982389548483913, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 0.00010587464667608484, |
|
"loss": 2.5584, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 0.6991216842476938, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 0.00010530365314532488, |
|
"loss": 2.5627, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.7000044136469965, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 0.00010473379241190542, |
|
"loss": 2.5529, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 0.7008871430462992, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 0.00010416506893714662, |
|
"loss": 2.5464, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 0.7017698724456018, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.00010359748717346534, |
|
"loss": 2.54, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.7026526018449044, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 0.00010303105156433998, |
|
"loss": 2.5576, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 0.7035353312442071, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00010246576654427611, |
|
"loss": 2.5533, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 0.7044180606435098, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 0.0001019016365387716, |
|
"loss": 2.5419, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 0.7053007900428124, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 0.00010133866596428196, |
|
"loss": 2.549, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 0.706183519442115, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 0.0001007768592281856, |
|
"loss": 2.5558, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.706183519442115, |
|
"eval_accuracy": 0.5025226345981063, |
|
"eval_loss": 2.4390876293182373, |
|
"eval_runtime": 7.0517, |
|
"eval_samples_per_second": 45.095, |
|
"eval_steps_per_second": 0.425, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.7070662488414177, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 0.00010021622072874948, |
|
"loss": 2.5533, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 0.7079489782407203, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 9.965675485509504e-05, |
|
"loss": 2.5469, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 0.7088317076400229, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 9.909846598716302e-05, |
|
"loss": 2.5456, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 0.7097144370393256, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 9.854135849567988e-05, |
|
"loss": 2.5486, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.7105971664386282, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 9.79854367421234e-05, |
|
"loss": 2.5466, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.7114798958379309, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 9.743070507868818e-05, |
|
"loss": 2.5508, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 0.7123626252372335, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 9.687716784825218e-05, |
|
"loss": 2.5515, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 0.7132453546365362, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 9.632482938434197e-05, |
|
"loss": 2.5433, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 0.7141280840358388, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 9.577369401109987e-05, |
|
"loss": 2.5499, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 0.7150108134351415, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 9.522376604324889e-05, |
|
"loss": 2.5531, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.7158935428344441, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 9.467504978605956e-05, |
|
"loss": 2.5524, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 0.7167762722337467, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 9.412754953531663e-05, |
|
"loss": 2.5444, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 0.7176590016330494, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 9.35812695772845e-05, |
|
"loss": 2.5384, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 0.7185417310323521, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 9.303621418867444e-05, |
|
"loss": 2.5473, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 0.7194244604316546, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 9.24923876366106e-05, |
|
"loss": 2.5543, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.7203071898309573, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 9.194979417859705e-05, |
|
"loss": 2.5362, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 0.72118991923026, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 9.14084380624842e-05, |
|
"loss": 2.5362, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 0.7220726486295626, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 9.086832352643535e-05, |
|
"loss": 2.5472, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 0.7229553780288652, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 9.032945479889391e-05, |
|
"loss": 2.5464, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 0.7238381074281679, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 8.979183609855024e-05, |
|
"loss": 2.5572, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.7247208368274706, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 8.925547163430812e-05, |
|
"loss": 2.5419, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 0.7256035662267731, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 8.872036560525254e-05, |
|
"loss": 2.5313, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 0.7264862956260758, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 8.818652220061638e-05, |
|
"loss": 2.5315, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 0.7273690250253785, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 8.76539455997475e-05, |
|
"loss": 2.549, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 0.7282517544246812, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 8.71226399720764e-05, |
|
"loss": 2.5549, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.7291344838239837, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 8.659260947708344e-05, |
|
"loss": 2.5558, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 0.7300172132232864, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 8.606385826426621e-05, |
|
"loss": 2.5501, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 0.7308999426225891, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 8.553639047310685e-05, |
|
"loss": 2.5546, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 0.7317826720218917, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 8.50102102330401e-05, |
|
"loss": 2.5545, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 0.7326654014211943, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 8.448532166342077e-05, |
|
"loss": 2.5349, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.733548130820497, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 8.396172887349115e-05, |
|
"loss": 2.5466, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 0.7344308602197996, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 8.343943596234943e-05, |
|
"loss": 2.5521, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 0.7353135896191023, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 8.291844701891732e-05, |
|
"loss": 2.5412, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 0.7361963190184049, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 8.239876612190778e-05, |
|
"loss": 2.5424, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 0.7370790484177075, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 8.188039733979366e-05, |
|
"loss": 2.5543, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.7379617778170102, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 8.136334473077519e-05, |
|
"loss": 2.5527, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 0.7388445072163128, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 8.084761234274906e-05, |
|
"loss": 2.5302, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 0.7397272366156155, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 8.033320421327578e-05, |
|
"loss": 2.5411, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 0.7406099660149181, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 7.982012436954849e-05, |
|
"loss": 2.5302, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 0.7414926954142208, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 7.930837682836195e-05, |
|
"loss": 2.549, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.7423754248135234, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 7.87979655960801e-05, |
|
"loss": 2.5501, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 0.743258154212826, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 7.828889466860551e-05, |
|
"loss": 2.5477, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 0.7441408836121287, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 7.77811680313475e-05, |
|
"loss": 2.5561, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 0.7450236130114314, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 7.727478965919144e-05, |
|
"loss": 2.5498, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 0.745906342410734, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 7.67697635164675e-05, |
|
"loss": 2.5422, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.7467890718100366, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 7.626609355691922e-05, |
|
"loss": 2.5452, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 0.7476718012093393, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 7.576378372367306e-05, |
|
"loss": 2.5422, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 0.748554530608642, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 7.52628379492075e-05, |
|
"loss": 2.5423, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 0.7494372600079445, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 7.476326015532162e-05, |
|
"loss": 2.5439, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 0.7503199894072472, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 7.426505425310531e-05, |
|
"loss": 2.5584, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.7512027188065499, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 7.376822414290804e-05, |
|
"loss": 2.5494, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 0.7520854482058525, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 7.327277371430858e-05, |
|
"loss": 2.5476, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 0.7529681776051551, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 7.27787068460842e-05, |
|
"loss": 2.5534, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 0.7538509070044578, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 7.228602740618085e-05, |
|
"loss": 2.5516, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 0.7547336364037605, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 7.179473925168256e-05, |
|
"loss": 2.5482, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.755616365803063, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 7.130484622878108e-05, |
|
"loss": 2.5597, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 0.7564990952023657, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 7.081635217274617e-05, |
|
"loss": 2.5501, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 0.7573818246016684, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 7.032926090789537e-05, |
|
"loss": 2.5453, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 0.758264554000971, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 6.984357624756388e-05, |
|
"loss": 2.5454, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 0.7591472834002736, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 6.935930199407501e-05, |
|
"loss": 2.5486, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.7600300127995763, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 6.887644193871042e-05, |
|
"loss": 2.5446, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 0.7609127421988789, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 6.839499986167999e-05, |
|
"loss": 2.5639, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 0.7617954715981816, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 6.791497953209289e-05, |
|
"loss": 2.5376, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 0.7626782009974842, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 6.743638470792735e-05, |
|
"loss": 2.5355, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.7635609303967869, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 6.695921913600212e-05, |
|
"loss": 2.5469, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.7644436597960895, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 6.648348655194613e-05, |
|
"loss": 2.5516, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 0.7653263891953922, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 6.600919068017006e-05, |
|
"loss": 2.538, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 0.7662091185946948, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 6.553633523383682e-05, |
|
"loss": 2.5491, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 0.7670918479939974, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 6.506492391483232e-05, |
|
"loss": 2.5383, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 0.7679745773933001, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 6.459496041373708e-05, |
|
"loss": 2.5425, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.7688573067926028, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 6.412644840979656e-05, |
|
"loss": 2.5525, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 0.7697400361919053, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 6.365939157089304e-05, |
|
"loss": 2.5425, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 0.770622765591208, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 6.319379355351653e-05, |
|
"loss": 2.5293, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 0.7715054949905107, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 6.272965800273608e-05, |
|
"loss": 2.5375, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 0.7723882243898132, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 6.226698855217178e-05, |
|
"loss": 2.5502, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.7732709537891159, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 6.180578882396556e-05, |
|
"loss": 2.5518, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.7741536831884186, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 6.134606242875324e-05, |
|
"loss": 2.5396, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 0.7750364125877213, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 6.088781296563636e-05, |
|
"loss": 2.5522, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 0.7759191419870238, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 6.043104402215388e-05, |
|
"loss": 2.5597, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 0.7768018713863265, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 5.9975759174254075e-05, |
|
"loss": 2.5519, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.7776846007856292, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 5.952196198626633e-05, |
|
"loss": 2.5654, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 0.7785673301849318, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 5.906965601087369e-05, |
|
"loss": 2.5543, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 0.7794500595842344, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 5.861884478908483e-05, |
|
"loss": 2.5422, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 0.7803327889835371, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 5.816953185020607e-05, |
|
"loss": 2.5479, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 0.7812155183828398, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 5.7721720711814195e-05, |
|
"loss": 2.5471, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.7820982477821424, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 5.727541487972876e-05, |
|
"loss": 2.5383, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 0.782980977181445, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 5.68306178479843e-05, |
|
"loss": 2.54, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 0.7838637065807477, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 5.638733309880353e-05, |
|
"loss": 2.5504, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.7847464359800503, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 5.5945564102569764e-05, |
|
"loss": 2.5533, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 0.785629165379353, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 5.550531431779984e-05, |
|
"loss": 2.5376, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.7865118947786556, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 5.50665871911169e-05, |
|
"loss": 2.5491, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 0.7873946241779582, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 5.4629386157223434e-05, |
|
"loss": 2.533, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 0.7882773535772609, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 5.4193714638874845e-05, |
|
"loss": 2.5541, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 0.7891600829765635, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 5.375957604685186e-05, |
|
"loss": 2.5261, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 0.7900428123758662, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 5.3326973779934506e-05, |
|
"loss": 2.5527, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.7909255417751688, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 5.289591122487522e-05, |
|
"loss": 2.5499, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 0.7918082711744715, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 5.246639175637216e-05, |
|
"loss": 2.5553, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 0.7926910005737741, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 5.203841873704329e-05, |
|
"loss": 2.5535, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 0.7935737299730767, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 5.161199551739942e-05, |
|
"loss": 2.5253, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 0.7944564593723794, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 5.1187125435818575e-05, |
|
"loss": 2.5568, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.7953391887716821, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 5.0763811818519494e-05, |
|
"loss": 2.5483, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 0.7962219181709846, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 5.0342057979535507e-05, |
|
"loss": 2.5541, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 0.7971046475702873, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 4.99218672206892e-05, |
|
"loss": 2.5512, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 0.79798737696959, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 4.950324283156562e-05, |
|
"loss": 2.5524, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 0.7988701063688927, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 4.908618808948748e-05, |
|
"loss": 2.5388, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.7997528357681952, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 4.867070625948866e-05, |
|
"loss": 2.5634, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 0.8006355651674979, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 4.825680059428933e-05, |
|
"loss": 2.5374, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 0.8015182945668006, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 4.784447433427016e-05, |
|
"loss": 2.5457, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 0.8024010239661032, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 4.7433730707446805e-05, |
|
"loss": 2.5496, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 0.8032837533654058, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 4.702457292944498e-05, |
|
"loss": 2.546, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.8041664827647085, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 4.661700420347517e-05, |
|
"loss": 2.5403, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 0.8050492121640112, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 4.62110277203073e-05, |
|
"loss": 2.5484, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 0.8059319415633137, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 4.5806646658246104e-05, |
|
"loss": 2.5572, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 0.8068146709626164, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 4.5403864183106184e-05, |
|
"loss": 2.555, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 0.8076974003619191, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 4.5002683448186866e-05, |
|
"loss": 2.5622, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.8085801297612217, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 4.460310759424802e-05, |
|
"loss": 2.5454, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 0.8094628591605243, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 4.420513974948517e-05, |
|
"loss": 2.5404, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 0.810345588559827, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 4.3808783029505166e-05, |
|
"loss": 2.5385, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 0.8112283179591296, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 4.341404053730147e-05, |
|
"loss": 2.5515, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 0.8121110473584323, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 4.3020915363230274e-05, |
|
"loss": 2.5482, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.8129937767577349, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 4.262941058498615e-05, |
|
"loss": 2.5382, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 0.8138765061570375, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 4.2239529267577736e-05, |
|
"loss": 2.5462, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 0.8147592355563402, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 4.1851274463304165e-05, |
|
"loss": 2.551, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 0.8156419649556429, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 4.146464921173088e-05, |
|
"loss": 2.542, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 0.8165246943549455, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 4.1079656539665696e-05, |
|
"loss": 2.5525, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.8174074237542481, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 4.069629946113565e-05, |
|
"loss": 2.5403, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 0.8182901531535508, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 4.0314580977362655e-05, |
|
"loss": 2.5468, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 0.8191728825528535, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 3.99345040767409e-05, |
|
"loss": 2.5448, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 0.820055611952156, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 3.955607173481254e-05, |
|
"loss": 2.5475, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 0.8209383413514587, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 3.9179286914244884e-05, |
|
"loss": 2.5421, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.8218210707507614, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 3.880415256480749e-05, |
|
"loss": 2.5562, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 0.822703800150064, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 3.843067162334826e-05, |
|
"loss": 2.5252, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 0.8235865295493666, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 3.805884701377127e-05, |
|
"loss": 2.5409, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 0.8244692589486693, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 3.768868164701325e-05, |
|
"loss": 2.5449, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 0.825351988347972, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 3.732017842102126e-05, |
|
"loss": 2.5703, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.8262347177472745, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 3.695334022072977e-05, |
|
"loss": 2.5449, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 0.8271174471465772, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 3.658816991803798e-05, |
|
"loss": 2.5508, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 0.8280001765458799, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 3.622467037178765e-05, |
|
"loss": 2.5448, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 0.8288829059451825, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 3.586284442774049e-05, |
|
"loss": 2.5299, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 0.8297656353444851, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 3.550269491855579e-05, |
|
"loss": 2.5425, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.8306483647437878, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 3.514422466376857e-05, |
|
"loss": 2.5504, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 0.8315310941430905, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 3.478743646976726e-05, |
|
"loss": 2.551, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 0.8324138235423931, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 3.443233312977176e-05, |
|
"loss": 2.5484, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 0.8332965529416957, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 3.4078917423811556e-05, |
|
"loss": 2.5335, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 0.8341792823409984, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 3.372719211870412e-05, |
|
"loss": 2.5315, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.835062011740301, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 3.3377159968033085e-05, |
|
"loss": 2.5582, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 0.8359447411396037, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 3.302882371212665e-05, |
|
"loss": 2.5467, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 0.8368274705389063, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 3.2682186078036304e-05, |
|
"loss": 2.5539, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 0.8377101999382089, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 3.2337249779515436e-05, |
|
"loss": 2.5506, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 0.8385929293375116, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 3.199401751699782e-05, |
|
"loss": 2.5415, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.8394756587368142, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 3.1652491977576883e-05, |
|
"loss": 2.5471, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 0.8403583881361169, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 3.131267583498448e-05, |
|
"loss": 2.552, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 0.8412411175354195, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 3.097457174956977e-05, |
|
"loss": 2.5561, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 0.8421238469347222, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 3.063818236827884e-05, |
|
"loss": 2.5502, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 0.8430065763340248, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 3.030351032463341e-05, |
|
"loss": 2.5575, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.8438893057333274, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 2.9970558238710865e-05, |
|
"loss": 2.5531, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 0.8447720351326301, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 2.9639328717123104e-05, |
|
"loss": 2.5366, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 0.8456547645319328, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 2.9309824352996618e-05, |
|
"loss": 2.5446, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 0.8465374939312353, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 2.898204772595195e-05, |
|
"loss": 2.5454, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 0.847420223330538, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 2.865600140208349e-05, |
|
"loss": 2.5283, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.8483029527298407, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 2.833168793393956e-05, |
|
"loss": 2.5519, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 0.8491856821291434, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 2.8009109860502174e-05, |
|
"loss": 2.5443, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 0.8500684115284459, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 2.768826970716745e-05, |
|
"loss": 2.55, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 0.8509511409277486, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 2.736916998572567e-05, |
|
"loss": 2.5536, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 0.8518338703270513, |
|
"grad_norm": 0.1396484375, |
|
"learning_rate": 2.705181319434144e-05, |
|
"loss": 2.554, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.8527165997263539, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 2.6736201817534696e-05, |
|
"loss": 2.5469, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 0.8535993291256565, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 2.6422338326160618e-05, |
|
"loss": 2.5496, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 0.8544820585249592, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 2.6110225177390534e-05, |
|
"loss": 2.5509, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 0.8553647879242618, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 2.5799864814692902e-05, |
|
"loss": 2.5452, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 0.8562475173235644, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 2.549125966781385e-05, |
|
"loss": 2.5413, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.8571302467228671, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 2.518441215275838e-05, |
|
"loss": 2.5428, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 0.8580129761221698, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 2.48793246717712e-05, |
|
"loss": 2.545, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 0.8588957055214724, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 2.4575999613318245e-05, |
|
"loss": 2.5541, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 0.859778434920775, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 2.4274439352067828e-05, |
|
"loss": 2.5458, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 0.8606611643200777, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 2.3974646248871827e-05, |
|
"loss": 2.547, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.8615438937193803, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 2.3676622650747603e-05, |
|
"loss": 2.5407, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 0.862426623118683, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 2.3380370890859454e-05, |
|
"loss": 2.5465, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 0.8633093525179856, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 2.3085893288500136e-05, |
|
"loss": 2.5445, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 0.8641920819172882, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 2.279319214907305e-05, |
|
"loss": 2.5268, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 0.8650748113165909, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 2.2502269764074017e-05, |
|
"loss": 2.5262, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.8659575407158936, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 2.2213128411073396e-05, |
|
"loss": 2.5578, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 0.8668402701151962, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 2.1925770353698137e-05, |
|
"loss": 2.5533, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 0.8677229995144988, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 2.1640197841614083e-05, |
|
"loss": 2.5468, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 0.8686057289138015, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 2.1356413110508675e-05, |
|
"loss": 2.5399, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 0.8694884583131042, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 2.1074418382072912e-05, |
|
"loss": 2.5452, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.8703711877124067, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 2.0794215863984417e-05, |
|
"loss": 2.5361, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 0.8712539171117094, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 2.0515807749889954e-05, |
|
"loss": 2.5424, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 0.8721366465110121, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 2.0239196219388133e-05, |
|
"loss": 2.5568, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 0.8730193759103146, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 1.9964383438012685e-05, |
|
"loss": 2.5599, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 0.8739021053096173, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 1.969137155721509e-05, |
|
"loss": 2.5448, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.87478483470892, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 1.942016271434821e-05, |
|
"loss": 2.5507, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 0.8756675641082227, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 1.915075903264915e-05, |
|
"loss": 2.5443, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 0.8765502935075252, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 1.8883162621222693e-05, |
|
"loss": 2.5618, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 0.8774330229068279, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 1.8617375575025186e-05, |
|
"loss": 2.5591, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 0.8783157523061306, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 1.835339997484753e-05, |
|
"loss": 2.5593, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.8791984817054332, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 1.8091237887299357e-05, |
|
"loss": 2.5468, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 0.8800812111047358, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 1.783089136479257e-05, |
|
"loss": 2.5537, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 0.8809639405040385, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 1.757236244552557e-05, |
|
"loss": 2.5536, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 0.8818466699033412, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 1.7315653153466977e-05, |
|
"loss": 2.5452, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 0.8827293993026438, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 1.7060765498339958e-05, |
|
"loss": 2.5535, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8827293993026438, |
|
"eval_accuracy": 0.5028574500272613, |
|
"eval_loss": 2.4378483295440674, |
|
"eval_runtime": 7.0626, |
|
"eval_samples_per_second": 45.026, |
|
"eval_steps_per_second": 0.425, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8836121287019464, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 1.6807701475606534e-05, |
|
"loss": 2.5573, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 0.8844948581012491, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 1.6556463066451837e-05, |
|
"loss": 2.5438, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 0.8853775875005517, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 1.63070522377686e-05, |
|
"loss": 2.5571, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 0.8862603168998543, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 1.6059470942141912e-05, |
|
"loss": 2.5412, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 0.887143046299157, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 1.5813721117833828e-05, |
|
"loss": 2.5566, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.8880257756984596, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 1.5569804688768092e-05, |
|
"loss": 2.5315, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 0.8889085050977623, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 1.532772356451531e-05, |
|
"loss": 2.542, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 0.8897912344970649, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 1.5087479640277763e-05, |
|
"loss": 2.5465, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 0.8906739638963675, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 1.4849074796874779e-05, |
|
"loss": 2.5593, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 0.8915566932956702, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 1.4612510900727794e-05, |
|
"loss": 2.5438, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.8924394226949729, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 1.4377789803845964e-05, |
|
"loss": 2.5491, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 0.8933221520942755, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 1.4144913343811544e-05, |
|
"loss": 2.5414, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 0.8942048814935781, |
|
"grad_norm": 0.1396484375, |
|
"learning_rate": 1.3913883343765394e-05, |
|
"loss": 2.5444, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 0.8950876108928808, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 1.3684701612392963e-05, |
|
"loss": 2.5444, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 0.8959703402921835, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 1.345736994390992e-05, |
|
"loss": 2.5356, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.896853069691486, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 1.3231890118048179e-05, |
|
"loss": 2.5487, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 0.8977357990907887, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 1.300826390004209e-05, |
|
"loss": 2.5567, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 0.8986185284900914, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 1.2786493040614245e-05, |
|
"loss": 2.5631, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 0.8995012578893941, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 1.2566579275962303e-05, |
|
"loss": 2.5384, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 0.9003839872886966, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 1.2348524327744943e-05, |
|
"loss": 2.5369, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.9012667166879993, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 1.2132329903068563e-05, |
|
"loss": 2.5445, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 0.902149446087302, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 1.1917997694473992e-05, |
|
"loss": 2.549, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 0.9030321754866045, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 1.1705529379923085e-05, |
|
"loss": 2.5339, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 0.9039149048859072, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 1.1494926622785811e-05, |
|
"loss": 2.5437, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 0.9047976342852099, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 1.1286191071826823e-05, |
|
"loss": 2.5387, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.9056803636845125, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 1.1079324361193022e-05, |
|
"loss": 2.5676, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 0.9065630930838151, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 1.0874328110400511e-05, |
|
"loss": 2.5503, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 0.9074458224831178, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 1.0671203924321887e-05, |
|
"loss": 2.5516, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 0.9083285518824205, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 1.0469953393173776e-05, |
|
"loss": 2.5399, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 0.9092112812817231, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 1.0270578092504396e-05, |
|
"loss": 2.5427, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.9100940106810257, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 1.0073079583181126e-05, |
|
"loss": 2.5459, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 0.9109767400803284, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 9.877459411378325e-06, |
|
"loss": 2.552, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 0.911859469479631, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 9.683719108565331e-06, |
|
"loss": 2.5469, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 0.9127421988789337, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 9.49186019149434e-06, |
|
"loss": 2.5547, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 0.9136249282782363, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 9.301884162188496e-06, |
|
"loss": 2.5461, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.9145076576775389, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 9.113792507930263e-06, |
|
"loss": 2.5475, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 0.9153903870768416, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 8.927586701249852e-06, |
|
"loss": 2.5437, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 0.9162731164761443, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 8.743268199913307e-06, |
|
"loss": 2.5339, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 0.9171558458754469, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 8.560838446911607e-06, |
|
"loss": 2.539, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 0.9180385752747495, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 8.380298870449e-06, |
|
"loss": 2.5314, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.9189213046740522, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 8.201650883931904e-06, |
|
"loss": 2.5467, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 0.9198040340733548, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 8.024895885957978e-06, |
|
"loss": 2.533, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 0.9206867634726574, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 7.85003526030495e-06, |
|
"loss": 2.5422, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 0.9215694928719601, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 7.677070375920026e-06, |
|
"loss": 2.5415, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 0.9224522222712628, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 7.506002586909006e-06, |
|
"loss": 2.5579, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.9233349516705653, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 7.336833232525625e-06, |
|
"loss": 2.5422, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 0.924217681069868, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 7.169563637161397e-06, |
|
"loss": 2.55, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 0.9251004104691707, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 7.004195110334788e-06, |
|
"loss": 2.5397, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 0.9259831398684734, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 6.840728946681363e-06, |
|
"loss": 2.5606, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 0.9268658692677759, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 6.679166425943351e-06, |
|
"loss": 2.5403, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.9277485986670786, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 6.519508812959873e-06, |
|
"loss": 2.5464, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 0.9286313280663813, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 6.3617573576569274e-06, |
|
"loss": 2.546, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 0.9295140574656839, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 6.205913295037474e-06, |
|
"loss": 2.5394, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 0.9303967868649865, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 6.051977845172002e-06, |
|
"loss": 2.5584, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 0.9312795162642892, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 5.899952213188897e-06, |
|
"loss": 2.5341, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.9321622456635918, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 5.749837589264895e-06, |
|
"loss": 2.5478, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 0.9330449750628945, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 5.601635148615891e-06, |
|
"loss": 2.5387, |
|
"step": 10570 |
|
}, |
|
{ |
|
"epoch": 0.9339277044621971, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 5.4553460514877304e-06, |
|
"loss": 2.5579, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 0.9348104338614998, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 5.3109714431470165e-06, |
|
"loss": 2.5602, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 0.9356931632608024, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 5.168512453872287e-06, |
|
"loss": 2.5453, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.936575892660105, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 5.027970198945076e-06, |
|
"loss": 2.5461, |
|
"step": 10610 |
|
}, |
|
{ |
|
"epoch": 0.9374586220594077, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 4.889345778641252e-06, |
|
"loss": 2.5422, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 0.9383413514587103, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 4.752640278222254e-06, |
|
"loss": 2.5523, |
|
"step": 10630 |
|
}, |
|
{ |
|
"epoch": 0.939224080858013, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 4.617854767926782e-06, |
|
"loss": 2.5384, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 0.9401068102573156, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 4.484990302962344e-06, |
|
"loss": 2.564, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.9409895396566182, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 4.354047923496917e-06, |
|
"loss": 2.5429, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 0.9418722690559209, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 4.2250286546509365e-06, |
|
"loss": 2.5365, |
|
"step": 10670 |
|
}, |
|
{ |
|
"epoch": 0.9427549984552236, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 4.09793350648921e-06, |
|
"loss": 2.543, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 0.9436377278545262, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 3.9727634740129585e-06, |
|
"loss": 2.5527, |
|
"step": 10690 |
|
}, |
|
{ |
|
"epoch": 0.9445204572538288, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 3.849519537152124e-06, |
|
"loss": 2.5534, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.9454031866531315, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 3.7282026607576016e-06, |
|
"loss": 2.5467, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 0.9462859160524342, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 3.608813794593796e-06, |
|
"loss": 2.5537, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 0.9471686454517367, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 3.491353873331077e-06, |
|
"loss": 2.5443, |
|
"step": 10730 |
|
}, |
|
{ |
|
"epoch": 0.9480513748510394, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 3.3758238165384757e-06, |
|
"loss": 2.5409, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 0.9489341042503421, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 3.262224528676666e-06, |
|
"loss": 2.5294, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.9498168336496448, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 3.1505568990905787e-06, |
|
"loss": 2.5535, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 0.9506995630489473, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 3.040821802002658e-06, |
|
"loss": 2.534, |
|
"step": 10770 |
|
}, |
|
{ |
|
"epoch": 0.95158229244825, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 2.9330200965059507e-06, |
|
"loss": 2.5347, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 0.9524650218475527, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 2.827152626557389e-06, |
|
"loss": 2.5541, |
|
"step": 10790 |
|
}, |
|
{ |
|
"epoch": 0.9533477512468552, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 2.72322022097124e-06, |
|
"loss": 2.5358, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.9542304806461579, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 2.621223693412417e-06, |
|
"loss": 2.5485, |
|
"step": 10810 |
|
}, |
|
{ |
|
"epoch": 0.9551132100454606, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 2.5211638423903725e-06, |
|
"loss": 2.5523, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 0.9559959394447632, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 2.4230414512527166e-06, |
|
"loss": 2.5485, |
|
"step": 10830 |
|
}, |
|
{ |
|
"epoch": 0.9568786688440658, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 2.326857288178996e-06, |
|
"loss": 2.5437, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 0.9577613982433685, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 2.232612106174897e-06, |
|
"loss": 2.5459, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.9586441276426712, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 2.1403066430661644e-06, |
|
"loss": 2.5504, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 0.9595268570419738, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 2.0499416214928844e-06, |
|
"loss": 2.5543, |
|
"step": 10870 |
|
}, |
|
{ |
|
"epoch": 0.9604095864412764, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 1.9615177489038792e-06, |
|
"loss": 2.5351, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 0.9612923158405791, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 1.8750357175510435e-06, |
|
"loss": 2.5447, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 0.9621750452398817, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 1.7904962044841266e-06, |
|
"loss": 2.5591, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.9630577746391844, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 1.70789987154521e-06, |
|
"loss": 2.5395, |
|
"step": 10910 |
|
}, |
|
{ |
|
"epoch": 0.963940504038487, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 1.6272473653636266e-06, |
|
"loss": 2.5443, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 0.9648232334377896, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 1.5485393173509388e-06, |
|
"loss": 2.5364, |
|
"step": 10930 |
|
}, |
|
{ |
|
"epoch": 0.9657059628370923, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 1.4717763436959685e-06, |
|
"loss": 2.55, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 0.966588692236395, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 1.3969590453598858e-06, |
|
"loss": 2.5337, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.9674714216356975, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 1.3240880080716832e-06, |
|
"loss": 2.5396, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 0.9683541510350002, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 1.2531638023233761e-06, |
|
"loss": 2.5398, |
|
"step": 10970 |
|
}, |
|
{ |
|
"epoch": 0.9692368804343029, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 1.1841869833656981e-06, |
|
"loss": 2.5688, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 0.9701196098336055, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 1.1171580912036627e-06, |
|
"loss": 2.5305, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 0.9710023392329081, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 1.0520776505924812e-06, |
|
"loss": 2.5474, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9718850686322108, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 9.889461710332059e-07, |
|
"loss": 2.5524, |
|
"step": 11010 |
|
}, |
|
{ |
|
"epoch": 0.9727677980315135, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 9.277641467689279e-07, |
|
"loss": 2.5433, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 0.973650527430816, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 8.685320567809741e-07, |
|
"loss": 2.5445, |
|
"step": 11030 |
|
}, |
|
{ |
|
"epoch": 0.9745332568301187, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 8.112503647848546e-07, |
|
"loss": 2.5276, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 0.9754159862294214, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 7.559195192269608e-07, |
|
"loss": 2.5454, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 0.9762987156287241, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 7.025399532808452e-07, |
|
"loss": 2.5486, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 0.9771814450280266, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 6.511120848439467e-07, |
|
"loss": 2.5565, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 0.9780641744273293, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 6.016363165342875e-07, |
|
"loss": 2.5388, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 0.978946903826632, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 5.54113035687226e-07, |
|
"loss": 2.5419, |
|
"step": 11090 |
|
}, |
|
{ |
|
"epoch": 0.9798296332259346, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 5.085426143525695e-07, |
|
"loss": 2.5327, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.9807123626252372, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 4.649254092916333e-07, |
|
"loss": 2.5482, |
|
"step": 11110 |
|
}, |
|
{ |
|
"epoch": 0.9815950920245399, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 4.2326176197429735e-07, |
|
"loss": 2.5524, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 0.9824778214238425, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 3.835519985765368e-07, |
|
"loss": 2.5317, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 0.9833605508231452, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 3.457964299777849e-07, |
|
"loss": 2.5451, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 0.9842432802224478, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 3.099953517584353e-07, |
|
"loss": 2.5406, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 0.9851260096217505, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 2.761490441976211e-07, |
|
"loss": 2.5455, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 0.9860087390210531, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 2.4425777227102265e-07, |
|
"loss": 2.5483, |
|
"step": 11170 |
|
}, |
|
{ |
|
"epoch": 0.9868914684203557, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 2.1432178564867455e-07, |
|
"loss": 2.5509, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 0.9877741978196584, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 1.8634131869313397e-07, |
|
"loss": 2.5409, |
|
"step": 11190 |
|
}, |
|
{ |
|
"epoch": 0.988656927218961, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 1.6031659045759318e-07, |
|
"loss": 2.537, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.9895396566182637, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 1.3624780468424192e-07, |
|
"loss": 2.5476, |
|
"step": 11210 |
|
}, |
|
{ |
|
"epoch": 0.9904223860175663, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 1.1413514980254669e-07, |
|
"loss": 2.5474, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 0.9913051154168689, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 9.397879892777961e-08, |
|
"loss": 2.5472, |
|
"step": 11230 |
|
}, |
|
{ |
|
"epoch": 0.9921878448161716, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 7.577890985985269e-08, |
|
"loss": 2.5441, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 0.9930705742154743, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 5.953562508184684e-08, |
|
"loss": 2.5474, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.9939533036147769, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 4.524907175904036e-08, |
|
"loss": 2.5428, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 0.9948360330140795, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 3.2919361737854256e-08, |
|
"loss": 2.5553, |
|
"step": 11270 |
|
}, |
|
{ |
|
"epoch": 0.9957187624133822, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 2.2546591544991833e-08, |
|
"loss": 2.5346, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 0.9966014918126849, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 1.4130842386717025e-08, |
|
"loss": 2.548, |
|
"step": 11290 |
|
}, |
|
{ |
|
"epoch": 0.9974842212119874, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 7.672180148132757e-09, |
|
"loss": 2.5376, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.9983669506112901, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 3.1706553927923763e-09, |
|
"loss": 2.5424, |
|
"step": 11310 |
|
}, |
|
{ |
|
"epoch": 0.9992496800105928, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 6.263033621722869e-10, |
|
"loss": 2.5439, |
|
"step": 11320 |
|
}, |
|
{ |
|
"epoch": 0.9999558635300348, |
|
"step": 11328, |
|
"total_flos": 1.9775705361382638e+20, |
|
"train_loss": 2.5878285095516573, |
|
"train_runtime": 23032.3399, |
|
"train_samples_per_second": 125.914, |
|
"train_steps_per_second": 0.492 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 11328, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.9775705361382638e+20, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |