{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.4636336426219696, |
|
"eval_steps": 26, |
|
"global_step": 260, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009577970667464832, |
|
"grad_norm": 10.084560608592307, |
|
"learning_rate": 1.7241379310344828e-07, |
|
"loss": 1.579, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.009577970667464832, |
|
"eval_loss": 2.5250306129455566, |
|
"eval_runtime": 107.4458, |
|
"eval_samples_per_second": 13.16, |
|
"eval_steps_per_second": 3.295, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.019155941334929663, |
|
"grad_norm": 8.306669565661105, |
|
"learning_rate": 3.4482758620689656e-07, |
|
"loss": 1.5724, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.02873391200239449, |
|
"grad_norm": 13.938049536284893, |
|
"learning_rate": 5.172413793103449e-07, |
|
"loss": 1.5871, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.038311882669859326, |
|
"grad_norm": 12.43456292626288, |
|
"learning_rate": 6.896551724137931e-07, |
|
"loss": 1.5681, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04788985333732416, |
|
"grad_norm": 13.870879646573128, |
|
"learning_rate": 8.620689655172415e-07, |
|
"loss": 1.5744, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05746782400478898, |
|
"grad_norm": 15.247654309196745, |
|
"learning_rate": 1.0344827586206898e-06, |
|
"loss": 1.5925, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.06704579467225381, |
|
"grad_norm": 15.680512101057806, |
|
"learning_rate": 1.2068965517241381e-06, |
|
"loss": 1.5704, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.07662376533971865, |
|
"grad_norm": 14.30414461091009, |
|
"learning_rate": 1.3793103448275862e-06, |
|
"loss": 1.5732, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.08620173600718348, |
|
"grad_norm": 11.033868746409794, |
|
"learning_rate": 1.5517241379310346e-06, |
|
"loss": 1.5325, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.09577970667464832, |
|
"grad_norm": 9.293155363204939, |
|
"learning_rate": 1.724137931034483e-06, |
|
"loss": 1.5525, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10535767734211314, |
|
"grad_norm": 10.55909566144827, |
|
"learning_rate": 1.896551724137931e-06, |
|
"loss": 1.5283, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.11493564800957796, |
|
"grad_norm": 7.362528707126726, |
|
"learning_rate": 2.0689655172413796e-06, |
|
"loss": 1.5246, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.1245136186770428, |
|
"grad_norm": 7.368215078656617, |
|
"learning_rate": 2.241379310344828e-06, |
|
"loss": 1.5313, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.13409158934450763, |
|
"grad_norm": 6.065170717786516, |
|
"learning_rate": 2.4137931034482762e-06, |
|
"loss": 1.5027, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.14366956001197245, |
|
"grad_norm": 5.328528823161362, |
|
"learning_rate": 2.5862068965517246e-06, |
|
"loss": 1.481, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1532475306794373, |
|
"grad_norm": 4.425999183762783, |
|
"learning_rate": 2.7586206896551725e-06, |
|
"loss": 1.4494, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.16282550134690213, |
|
"grad_norm": 2.3104583142533675, |
|
"learning_rate": 2.931034482758621e-06, |
|
"loss": 1.4645, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.17240347201436695, |
|
"grad_norm": 1.595394748941364, |
|
"learning_rate": 3.103448275862069e-06, |
|
"loss": 1.4619, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.18198144268183178, |
|
"grad_norm": 1.2488731383034972, |
|
"learning_rate": 3.2758620689655175e-06, |
|
"loss": 1.4641, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.19155941334929663, |
|
"grad_norm": 1.5772662843657, |
|
"learning_rate": 3.448275862068966e-06, |
|
"loss": 1.4029, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.20113738401676146, |
|
"grad_norm": 2.556424014112241, |
|
"learning_rate": 3.620689655172414e-06, |
|
"loss": 1.4453, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.21071535468422628, |
|
"grad_norm": 2.0581192872654483, |
|
"learning_rate": 3.793103448275862e-06, |
|
"loss": 1.4135, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.2202933253516911, |
|
"grad_norm": 1.6613052346475512, |
|
"learning_rate": 3.96551724137931e-06, |
|
"loss": 1.4336, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.22987129601915593, |
|
"grad_norm": 1.2670811596205898, |
|
"learning_rate": 4.137931034482759e-06, |
|
"loss": 1.3898, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.23944926668662078, |
|
"grad_norm": 1.4594637064715403, |
|
"learning_rate": 4.310344827586207e-06, |
|
"loss": 1.392, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2490272373540856, |
|
"grad_norm": 1.6947460151500366, |
|
"learning_rate": 4.482758620689656e-06, |
|
"loss": 1.3967, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2490272373540856, |
|
"eval_loss": 2.319483518600464, |
|
"eval_runtime": 107.1009, |
|
"eval_samples_per_second": 13.202, |
|
"eval_steps_per_second": 3.305, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.25860520802155046, |
|
"grad_norm": 1.4794556532045955, |
|
"learning_rate": 4.655172413793104e-06, |
|
"loss": 1.3882, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.26818317868901526, |
|
"grad_norm": 1.275878657564078, |
|
"learning_rate": 4.8275862068965525e-06, |
|
"loss": 1.4152, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.2777611493564801, |
|
"grad_norm": 1.0273810925450593, |
|
"learning_rate": 5e-06, |
|
"loss": 1.3897, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.2873391200239449, |
|
"grad_norm": 1.3658855156304837, |
|
"learning_rate": 4.9998459603839726e-06, |
|
"loss": 1.3539, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.29691709069140976, |
|
"grad_norm": 1.160650318212732, |
|
"learning_rate": 4.9993838605184505e-06, |
|
"loss": 1.3461, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.3064950613588746, |
|
"grad_norm": 0.9334705830010439, |
|
"learning_rate": 4.998613757348784e-06, |
|
"loss": 1.3575, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.3160730320263394, |
|
"grad_norm": 1.0269221075865582, |
|
"learning_rate": 4.99753574577609e-06, |
|
"loss": 1.3503, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.32565100269380426, |
|
"grad_norm": 0.9951200682896573, |
|
"learning_rate": 4.996149958645559e-06, |
|
"loss": 1.3718, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.33522897336126906, |
|
"grad_norm": 0.8568405246328175, |
|
"learning_rate": 4.994456566730085e-06, |
|
"loss": 1.3515, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.3448069440287339, |
|
"grad_norm": 0.8752926728569858, |
|
"learning_rate": 4.992455778709222e-06, |
|
"loss": 1.3571, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.35438491469619876, |
|
"grad_norm": 0.9195979878575848, |
|
"learning_rate": 4.990147841143462e-06, |
|
"loss": 1.3335, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.36396288536366356, |
|
"grad_norm": 0.8848215909446233, |
|
"learning_rate": 4.98753303844386e-06, |
|
"loss": 1.3093, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3735408560311284, |
|
"grad_norm": 0.8261733197817335, |
|
"learning_rate": 4.984611692836979e-06, |
|
"loss": 1.3376, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.38311882669859326, |
|
"grad_norm": 0.7643849735934586, |
|
"learning_rate": 4.981384164325184e-06, |
|
"loss": 1.3172, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.39269679736605806, |
|
"grad_norm": 0.8302859072234411, |
|
"learning_rate": 4.977850850642275e-06, |
|
"loss": 1.352, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.4022747680335229, |
|
"grad_norm": 0.8019795318623388, |
|
"learning_rate": 4.97401218720448e-06, |
|
"loss": 1.3271, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.4118527387009877, |
|
"grad_norm": 0.7856123291749388, |
|
"learning_rate": 4.969868647056793e-06, |
|
"loss": 1.3302, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.42143070936845256, |
|
"grad_norm": 0.7212471859830762, |
|
"learning_rate": 4.965420740814679e-06, |
|
"loss": 1.3215, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.4310086800359174, |
|
"grad_norm": 0.7660292329930958, |
|
"learning_rate": 4.960669016601155e-06, |
|
"loss": 1.3435, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4405866507033822, |
|
"grad_norm": 0.7247198414191649, |
|
"learning_rate": 4.95561405997924e-06, |
|
"loss": 1.3163, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.45016462137084706, |
|
"grad_norm": 0.7419070442778594, |
|
"learning_rate": 4.950256493879795e-06, |
|
"loss": 1.3209, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.45974259203831186, |
|
"grad_norm": 0.7024643859790418, |
|
"learning_rate": 4.94459697852476e-06, |
|
"loss": 1.2684, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.4693205627057767, |
|
"grad_norm": 0.7208397492740805, |
|
"learning_rate": 4.938636211345792e-06, |
|
"loss": 1.2818, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.47889853337324156, |
|
"grad_norm": 0.7159719760236076, |
|
"learning_rate": 4.932374926898321e-06, |
|
"loss": 1.3094, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.48847650404070636, |
|
"grad_norm": 0.7100286359014379, |
|
"learning_rate": 4.92581389677103e-06, |
|
"loss": 1.3177, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.4980544747081712, |
|
"grad_norm": 0.664062518173294, |
|
"learning_rate": 4.918953929490768e-06, |
|
"loss": 1.2868, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.4980544747081712, |
|
"eval_loss": 2.239407777786255, |
|
"eval_runtime": 107.263, |
|
"eval_samples_per_second": 13.183, |
|
"eval_steps_per_second": 3.3, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.507632445375636, |
|
"grad_norm": 0.8658636506450442, |
|
"learning_rate": 4.911795870422916e-06, |
|
"loss": 1.2904, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.5172104160431009, |
|
"grad_norm": 0.6715121564275828, |
|
"learning_rate": 4.904340601667208e-06, |
|
"loss": 1.326, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5267883867105657, |
|
"grad_norm": 0.8518222183690225, |
|
"learning_rate": 4.896589041949036e-06, |
|
"loss": 1.2757, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5363663573780305, |
|
"grad_norm": 0.6780934729098863, |
|
"learning_rate": 4.888542146506224e-06, |
|
"loss": 1.3027, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5459443280454953, |
|
"grad_norm": 0.8407110074770763, |
|
"learning_rate": 4.880200906971321e-06, |
|
"loss": 1.2965, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5555222987129602, |
|
"grad_norm": 0.654501814705368, |
|
"learning_rate": 4.8715663512493924e-06, |
|
"loss": 1.2764, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.565100269380425, |
|
"grad_norm": 0.7722805216190872, |
|
"learning_rate": 4.8626395433913595e-06, |
|
"loss": 1.2799, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.5746782400478898, |
|
"grad_norm": 0.6575468000608066, |
|
"learning_rate": 4.853421583462866e-06, |
|
"loss": 1.3009, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5842562107153547, |
|
"grad_norm": 0.6919845481307941, |
|
"learning_rate": 4.8439136074087165e-06, |
|
"loss": 1.2885, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.5938341813828195, |
|
"grad_norm": 0.652693683934317, |
|
"learning_rate": 4.834116786912897e-06, |
|
"loss": 1.2564, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6034121520502843, |
|
"grad_norm": 0.6684643483116979, |
|
"learning_rate": 4.82403232925418e-06, |
|
"loss": 1.278, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6129901227177492, |
|
"grad_norm": 0.6735443956477082, |
|
"learning_rate": 4.813661477157355e-06, |
|
"loss": 1.2895, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.622568093385214, |
|
"grad_norm": 0.6574494336528988, |
|
"learning_rate": 4.803005508640083e-06, |
|
"loss": 1.2481, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6321460640526788, |
|
"grad_norm": 0.7061153031772025, |
|
"learning_rate": 4.7920657368554e-06, |
|
"loss": 1.3023, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6417240347201437, |
|
"grad_norm": 0.6609850544647713, |
|
"learning_rate": 4.780843509929905e-06, |
|
"loss": 1.2619, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.6513020053876085, |
|
"grad_norm": 0.6958172104041147, |
|
"learning_rate": 4.769340210797618e-06, |
|
"loss": 1.2633, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.6608799760550733, |
|
"grad_norm": 0.6532872905224688, |
|
"learning_rate": 4.757557257029563e-06, |
|
"loss": 1.2581, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.6704579467225381, |
|
"grad_norm": 0.693714390508834, |
|
"learning_rate": 4.745496100659083e-06, |
|
"loss": 1.2499, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.680035917390003, |
|
"grad_norm": 0.6749996898449282, |
|
"learning_rate": 4.733158228002891e-06, |
|
"loss": 1.2536, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.6896138880574678, |
|
"grad_norm": 0.6753612400656019, |
|
"learning_rate": 4.720545159477921e-06, |
|
"loss": 1.2605, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.6991918587249326, |
|
"grad_norm": 0.6950386791904168, |
|
"learning_rate": 4.707658449413961e-06, |
|
"loss": 1.2489, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7087698293923975, |
|
"grad_norm": 0.6396387112266337, |
|
"learning_rate": 4.694499685862106e-06, |
|
"loss": 1.264, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7183478000598623, |
|
"grad_norm": 0.6809655013846588, |
|
"learning_rate": 4.681070490399064e-06, |
|
"loss": 1.2477, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7279257707273271, |
|
"grad_norm": 0.6814836664342683, |
|
"learning_rate": 4.667372517927323e-06, |
|
"loss": 1.2349, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.737503741394792, |
|
"grad_norm": 0.6502075268222723, |
|
"learning_rate": 4.653407456471222e-06, |
|
"loss": 1.243, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.7470817120622568, |
|
"grad_norm": 0.6579341200451629, |
|
"learning_rate": 4.639177026968924e-06, |
|
"loss": 1.2549, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.7470817120622568, |
|
"eval_loss": 2.2078425884246826, |
|
"eval_runtime": 107.0636, |
|
"eval_samples_per_second": 13.207, |
|
"eval_steps_per_second": 3.306, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.7566596827297216, |
|
"grad_norm": 0.6264741505964025, |
|
"learning_rate": 4.624682983060346e-06, |
|
"loss": 1.2903, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.7662376533971865, |
|
"grad_norm": 0.6533395420906253, |
|
"learning_rate": 4.609927110871053e-06, |
|
"loss": 1.2371, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7758156240646513, |
|
"grad_norm": 0.6366166912748572, |
|
"learning_rate": 4.594911228792156e-06, |
|
"loss": 1.2554, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.7853935947321161, |
|
"grad_norm": 0.6435835690637465, |
|
"learning_rate": 4.579637187256222e-06, |
|
"loss": 1.2855, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.7949715653995809, |
|
"grad_norm": 0.6410872090826751, |
|
"learning_rate": 4.564106868509246e-06, |
|
"loss": 1.232, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.8045495360670458, |
|
"grad_norm": 0.6260242741257913, |
|
"learning_rate": 4.5483221863786965e-06, |
|
"loss": 1.2458, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.8141275067345106, |
|
"grad_norm": 0.6588265965096135, |
|
"learning_rate": 4.5322850860376744e-06, |
|
"loss": 1.2474, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8237054774019754, |
|
"grad_norm": 0.6372013969893753, |
|
"learning_rate": 4.515997543765202e-06, |
|
"loss": 1.2563, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8332834480694403, |
|
"grad_norm": 0.683356686747451, |
|
"learning_rate": 4.499461566702685e-06, |
|
"loss": 1.2447, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.8428614187369051, |
|
"grad_norm": 0.6520958114219059, |
|
"learning_rate": 4.48267919260657e-06, |
|
"loss": 1.2243, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.8524393894043699, |
|
"grad_norm": 0.6468861797594448, |
|
"learning_rate": 4.465652489597226e-06, |
|
"loss": 1.2254, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.8620173600718348, |
|
"grad_norm": 0.6675355862176291, |
|
"learning_rate": 4.4483835559040885e-06, |
|
"loss": 1.2116, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8715953307392996, |
|
"grad_norm": 0.6318507194646, |
|
"learning_rate": 4.430874519607089e-06, |
|
"loss": 1.2634, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.8811733014067644, |
|
"grad_norm": 0.6496099541936005, |
|
"learning_rate": 4.413127538374411e-06, |
|
"loss": 1.2129, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.8907512720742293, |
|
"grad_norm": 0.6026396711785842, |
|
"learning_rate": 4.395144799196593e-06, |
|
"loss": 1.2483, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.9003292427416941, |
|
"grad_norm": 0.6709684350468395, |
|
"learning_rate": 4.376928518117028e-06, |
|
"loss": 1.2193, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.9099072134091589, |
|
"grad_norm": 0.6237262552476821, |
|
"learning_rate": 4.358480939958867e-06, |
|
"loss": 1.218, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.9194851840766237, |
|
"grad_norm": 0.6582242790059232, |
|
"learning_rate": 4.339804338048397e-06, |
|
"loss": 1.229, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.9290631547440886, |
|
"grad_norm": 0.6235719312223321, |
|
"learning_rate": 4.320901013934887e-06, |
|
"loss": 1.2098, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.9386411254115534, |
|
"grad_norm": 0.6295163336318428, |
|
"learning_rate": 4.301773297106968e-06, |
|
"loss": 1.205, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.9482190960790182, |
|
"grad_norm": 0.6250959313071772, |
|
"learning_rate": 4.282423544705564e-06, |
|
"loss": 1.2054, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.9577970667464831, |
|
"grad_norm": 0.6086898991547662, |
|
"learning_rate": 4.262854141233419e-06, |
|
"loss": 1.2118, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9673750374139479, |
|
"grad_norm": 0.5764067645719498, |
|
"learning_rate": 4.243067498261251e-06, |
|
"loss": 1.2372, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.9769530080814127, |
|
"grad_norm": 0.6406315852737573, |
|
"learning_rate": 4.223066054130568e-06, |
|
"loss": 1.2251, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.9865309787488776, |
|
"grad_norm": 0.5834984455673559, |
|
"learning_rate": 4.2028522736531895e-06, |
|
"loss": 1.2258, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.9961089494163424, |
|
"grad_norm": 0.5911139350878512, |
|
"learning_rate": 4.182428647807503e-06, |
|
"loss": 1.2286, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.9961089494163424, |
|
"eval_loss": 2.184576988220215, |
|
"eval_runtime": 107.5576, |
|
"eval_samples_per_second": 13.146, |
|
"eval_steps_per_second": 3.291, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.0056869200838072, |
|
"grad_norm": 0.6299976497698655, |
|
"learning_rate": 4.161797693431493e-06, |
|
"loss": 1.2383, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.002095181083508, |
|
"grad_norm": 0.5986176560633782, |
|
"learning_rate": 4.140961952912594e-06, |
|
"loss": 1.2182, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.0116731517509727, |
|
"grad_norm": 0.7138997909374802, |
|
"learning_rate": 4.11992399387438e-06, |
|
"loss": 1.1894, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.0212511224184375, |
|
"grad_norm": 0.6431525283411005, |
|
"learning_rate": 4.098686408860157e-06, |
|
"loss": 1.1741, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.0308290930859023, |
|
"grad_norm": 0.7490910983392529, |
|
"learning_rate": 4.077251815013477e-06, |
|
"loss": 1.1849, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.0404070637533673, |
|
"grad_norm": 0.6667698353299697, |
|
"learning_rate": 4.055622853755627e-06, |
|
"loss": 1.1833, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.0499850344208321, |
|
"grad_norm": 0.7240102351414811, |
|
"learning_rate": 4.033802190460114e-06, |
|
"loss": 1.1915, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.059563005088297, |
|
"grad_norm": 0.6281393232743739, |
|
"learning_rate": 4.011792514124217e-06, |
|
"loss": 1.1557, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.0691409757557617, |
|
"grad_norm": 0.6735415717178005, |
|
"learning_rate": 3.989596537037608e-06, |
|
"loss": 1.1878, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.0787189464232265, |
|
"grad_norm": 0.5939146155666697, |
|
"learning_rate": 3.967216994448116e-06, |
|
"loss": 1.1639, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.0882969170906913, |
|
"grad_norm": 0.6932505538102671, |
|
"learning_rate": 3.9446566442246615e-06, |
|
"loss": 1.1759, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.0978748877581563, |
|
"grad_norm": 0.5763908483496408, |
|
"learning_rate": 3.921918266517392e-06, |
|
"loss": 1.1781, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.1074528584256211, |
|
"grad_norm": 0.6818836608860367, |
|
"learning_rate": 3.899004663415083e-06, |
|
"loss": 1.1869, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.117030829093086, |
|
"grad_norm": 0.5998154432302447, |
|
"learning_rate": 3.875918658599837e-06, |
|
"loss": 1.1692, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.1266087997605507, |
|
"grad_norm": 0.6596200288243683, |
|
"learning_rate": 3.852663096999104e-06, |
|
"loss": 1.2059, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.1361867704280155, |
|
"grad_norm": 0.5918812335768482, |
|
"learning_rate": 3.829240844435109e-06, |
|
"loss": 1.1798, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.1457647410954803, |
|
"grad_norm": 0.6232580849345692, |
|
"learning_rate": 3.8056547872716865e-06, |
|
"loss": 1.1517, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.1553427117629451, |
|
"grad_norm": 0.5903843042319051, |
|
"learning_rate": 3.7819078320585865e-06, |
|
"loss": 1.1906, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.1649206824304101, |
|
"grad_norm": 0.5896678764206408, |
|
"learning_rate": 3.7580029051732992e-06, |
|
"loss": 1.1832, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.174498653097875, |
|
"grad_norm": 0.5666656027289849, |
|
"learning_rate": 3.733942952460432e-06, |
|
"loss": 1.1911, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.1840766237653397, |
|
"grad_norm": 0.565358825737842, |
|
"learning_rate": 3.7097309388686865e-06, |
|
"loss": 1.1945, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.1936545944328045, |
|
"grad_norm": 0.645159266559964, |
|
"learning_rate": 3.6853698480854853e-06, |
|
"loss": 1.1988, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.2032325651002693, |
|
"grad_norm": 0.5661828443152349, |
|
"learning_rate": 3.660862682169283e-06, |
|
"loss": 1.1683, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.2128105357677341, |
|
"grad_norm": 0.5590652900384634, |
|
"learning_rate": 3.636212461179623e-06, |
|
"loss": 1.1401, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.2223885064351991, |
|
"grad_norm": 0.5772830186331369, |
|
"learning_rate": 3.6114222228049657e-06, |
|
"loss": 1.1457, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.231966477102664, |
|
"grad_norm": 0.5638028162671672, |
|
"learning_rate": 3.5864950219883514e-06, |
|
"loss": 1.1599, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.231966477102664, |
|
"eval_loss": 2.181441068649292, |
|
"eval_runtime": 107.4543, |
|
"eval_samples_per_second": 13.159, |
|
"eval_steps_per_second": 3.294, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.2415444477701287, |
|
"grad_norm": 0.5674264221381613, |
|
"learning_rate": 3.561433930550934e-06, |
|
"loss": 1.1439, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.2511224184375935, |
|
"grad_norm": 0.5548457286136358, |
|
"learning_rate": 3.536242036813436e-06, |
|
"loss": 1.1455, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.2607003891050583, |
|
"grad_norm": 0.5681860545302818, |
|
"learning_rate": 3.510922445215568e-06, |
|
"loss": 1.1619, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.2702783597725231, |
|
"grad_norm": 0.5189655726956113, |
|
"learning_rate": 3.4854782759334625e-06, |
|
"loss": 1.1647, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.279856330439988, |
|
"grad_norm": 0.5482759127528988, |
|
"learning_rate": 3.4599126644951758e-06, |
|
"loss": 1.1963, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.289434301107453, |
|
"grad_norm": 0.5545710145438582, |
|
"learning_rate": 3.4342287613942804e-06, |
|
"loss": 1.1673, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.2990122717749177, |
|
"grad_norm": 0.5616911560631516, |
|
"learning_rate": 3.4084297317016353e-06, |
|
"loss": 1.1482, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.3085902424423825, |
|
"grad_norm": 0.5429625311889626, |
|
"learning_rate": 3.3825187546753426e-06, |
|
"loss": 1.1459, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.3181682131098473, |
|
"grad_norm": 0.5775738090552808, |
|
"learning_rate": 3.3564990233689632e-06, |
|
"loss": 1.1744, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.3277461837773121, |
|
"grad_norm": 0.5422962267277087, |
|
"learning_rate": 3.330373744238033e-06, |
|
"loss": 1.1796, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.3373241544447771, |
|
"grad_norm": 0.5383626495155892, |
|
"learning_rate": 3.3041461367449256e-06, |
|
"loss": 1.1646, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.346902125112242, |
|
"grad_norm": 0.5588657340470299, |
|
"learning_rate": 3.2778194329621104e-06, |
|
"loss": 1.1842, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.3564800957797067, |
|
"grad_norm": 0.5198196148369068, |
|
"learning_rate": 3.2513968771738606e-06, |
|
"loss": 1.1708, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.3660580664471715, |
|
"grad_norm": 0.5453371169769571, |
|
"learning_rate": 3.224881725476456e-06, |
|
"loss": 1.1636, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.3756360371146363, |
|
"grad_norm": 0.5692897944097868, |
|
"learning_rate": 3.198277245376924e-06, |
|
"loss": 1.1273, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.3852140077821011, |
|
"grad_norm": 0.5423704486470122, |
|
"learning_rate": 3.1715867153903844e-06, |
|
"loss": 1.1405, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.394791978449566, |
|
"grad_norm": 0.5819177408649716, |
|
"learning_rate": 3.144813424636031e-06, |
|
"loss": 1.1665, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.4043699491170307, |
|
"grad_norm": 0.554870749454361, |
|
"learning_rate": 3.1179606724318052e-06, |
|
"loss": 1.1872, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.4139479197844955, |
|
"grad_norm": 0.5493659769746441, |
|
"learning_rate": 3.091031767887817e-06, |
|
"loss": 1.1906, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.4235258904519605, |
|
"grad_norm": 0.6008378552179591, |
|
"learning_rate": 3.0640300294985613e-06, |
|
"loss": 1.1635, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.4331038611194253, |
|
"grad_norm": 0.5078261653762177, |
|
"learning_rate": 3.036958784733967e-06, |
|
"loss": 1.1438, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.4426818317868901, |
|
"grad_norm": 0.5559592542323409, |
|
"learning_rate": 3.0098213696293542e-06, |
|
"loss": 1.1642, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.452259802454355, |
|
"grad_norm": 0.5461821050739424, |
|
"learning_rate": 2.982621128374325e-06, |
|
"loss": 1.1725, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.46183777312182, |
|
"grad_norm": 0.5412862095154186, |
|
"learning_rate": 2.9553614129006543e-06, |
|
"loss": 1.1654, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.4714157437892847, |
|
"grad_norm": 0.5658659296771973, |
|
"learning_rate": 2.9280455824692255e-06, |
|
"loss": 1.1655, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.4809937144567495, |
|
"grad_norm": 0.5525850336445564, |
|
"learning_rate": 2.9006770032560637e-06, |
|
"loss": 1.1577, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.4809937144567495, |
|
"eval_loss": 2.1755869388580322, |
|
"eval_runtime": 107.2159, |
|
"eval_samples_per_second": 13.188, |
|
"eval_steps_per_second": 3.302, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.4905716851242143, |
|
"grad_norm": 0.5710362202768258, |
|
"learning_rate": 2.8732590479375167e-06, |
|
"loss": 1.1595, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.5001496557916791, |
|
"grad_norm": 0.5369626897696785, |
|
"learning_rate": 2.8457950952746293e-06, |
|
"loss": 1.1622, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.509727626459144, |
|
"grad_norm": 0.5194143574454793, |
|
"learning_rate": 2.8182885296967833e-06, |
|
"loss": 1.1313, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.5193055971266087, |
|
"grad_norm": 0.5220817246963333, |
|
"learning_rate": 2.7907427408846156e-06, |
|
"loss": 1.1493, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.5288835677940735, |
|
"grad_norm": 0.5307538609855902, |
|
"learning_rate": 2.763161123352314e-06, |
|
"loss": 1.1571, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.5133921578064818, |
|
"learning_rate": 2.735547076029296e-06, |
|
"loss": 1.1398, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.5480395091290033, |
|
"grad_norm": 0.528392253063443, |
|
"learning_rate": 2.7079040018413586e-06, |
|
"loss": 1.169, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.5576174797964681, |
|
"grad_norm": 0.5033775123091357, |
|
"learning_rate": 2.6802353072913307e-06, |
|
"loss": 1.1396, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.567195450463933, |
|
"grad_norm": 0.5429413779707357, |
|
"learning_rate": 2.6525444020392794e-06, |
|
"loss": 1.1558, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.5767734211313977, |
|
"grad_norm": 0.5391198899526514, |
|
"learning_rate": 2.6248346984823325e-06, |
|
"loss": 1.1584, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.5863513917988628, |
|
"grad_norm": 0.5237711725405991, |
|
"learning_rate": 2.5971096113341692e-06, |
|
"loss": 1.1399, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.5959293624663276, |
|
"grad_norm": 0.522431379990406, |
|
"learning_rate": 2.5693725572042135e-06, |
|
"loss": 1.146, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.6055073331337923, |
|
"grad_norm": 0.540451111257001, |
|
"learning_rate": 2.5416269541765963e-06, |
|
"loss": 1.1347, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.6150853038012571, |
|
"grad_norm": 0.542474309771266, |
|
"learning_rate": 2.5138762213889493e-06, |
|
"loss": 1.1507, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.624663274468722, |
|
"grad_norm": 0.5339716680549861, |
|
"learning_rate": 2.486123778611051e-06, |
|
"loss": 1.1428, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.6342412451361867, |
|
"grad_norm": 0.5194346219437855, |
|
"learning_rate": 2.458373045823404e-06, |
|
"loss": 1.1717, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.6438192158036515, |
|
"grad_norm": 0.5486922902738444, |
|
"learning_rate": 2.4306274427957878e-06, |
|
"loss": 1.1405, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.6533971864711163, |
|
"grad_norm": 0.5364703724723029, |
|
"learning_rate": 2.402890388665831e-06, |
|
"loss": 1.1397, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.6629751571385811, |
|
"grad_norm": 0.5151838009534813, |
|
"learning_rate": 2.375165301517668e-06, |
|
"loss": 1.1625, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.6725531278060461, |
|
"grad_norm": 0.5387178228054901, |
|
"learning_rate": 2.3474555979607214e-06, |
|
"loss": 1.1586, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.682131098473511, |
|
"grad_norm": 0.5264984610535657, |
|
"learning_rate": 2.3197646927086697e-06, |
|
"loss": 1.1654, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.6917090691409757, |
|
"grad_norm": 0.5272357155280125, |
|
"learning_rate": 2.2920959981586426e-06, |
|
"loss": 1.1934, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.7012870398084405, |
|
"grad_norm": 0.5252339989768573, |
|
"learning_rate": 2.2644529239707054e-06, |
|
"loss": 1.1426, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.7108650104759056, |
|
"grad_norm": 0.4974185735061034, |
|
"learning_rate": 2.2368388766476875e-06, |
|
"loss": 1.1597, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.7204429811433704, |
|
"grad_norm": 0.5361098970394095, |
|
"learning_rate": 2.2092572591153843e-06, |
|
"loss": 1.1637, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.7300209518108352, |
|
"grad_norm": 0.5305009042993176, |
|
"learning_rate": 2.1817114703032176e-06, |
|
"loss": 1.1637, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.7300209518108352, |
|
"eval_loss": 2.1710658073425293, |
|
"eval_runtime": 107.1212, |
|
"eval_samples_per_second": 13.2, |
|
"eval_steps_per_second": 3.305, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.7395989224783, |
|
"grad_norm": 0.5012187069773779, |
|
"learning_rate": 2.154204904725371e-06, |
|
"loss": 1.1447, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.7491768931457647, |
|
"grad_norm": 0.5763812037469009, |
|
"learning_rate": 2.126740952062484e-06, |
|
"loss": 1.1565, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.7587548638132295, |
|
"grad_norm": 0.5129804478325861, |
|
"learning_rate": 2.099322996743936e-06, |
|
"loss": 1.1798, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.7683328344806943, |
|
"grad_norm": 0.5107704085635135, |
|
"learning_rate": 2.0719544175307754e-06, |
|
"loss": 1.1486, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.7779108051481591, |
|
"grad_norm": 0.5225266432128085, |
|
"learning_rate": 2.044638587099347e-06, |
|
"loss": 1.1457, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.787488775815624, |
|
"grad_norm": 0.48553711881118367, |
|
"learning_rate": 2.0173788716256758e-06, |
|
"loss": 1.1557, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.797066746483089, |
|
"grad_norm": 0.5155245524580911, |
|
"learning_rate": 1.9901786303706466e-06, |
|
"loss": 1.1667, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.8066447171505537, |
|
"grad_norm": 0.5394238331941211, |
|
"learning_rate": 1.9630412152660333e-06, |
|
"loss": 1.1639, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.8162226878180185, |
|
"grad_norm": 0.5208012650775928, |
|
"learning_rate": 1.93596997050144e-06, |
|
"loss": 1.167, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.8258006584854833, |
|
"grad_norm": 0.5084683728452081, |
|
"learning_rate": 1.9089682321121834e-06, |
|
"loss": 1.146, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.8353786291529484, |
|
"grad_norm": 0.5107216674575125, |
|
"learning_rate": 1.8820393275681954e-06, |
|
"loss": 1.1299, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.8449565998204132, |
|
"grad_norm": 0.5037220655522233, |
|
"learning_rate": 1.8551865753639692e-06, |
|
"loss": 1.1705, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.854534570487878, |
|
"grad_norm": 0.5081083073272432, |
|
"learning_rate": 1.8284132846096164e-06, |
|
"loss": 1.1232, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.8641125411553428, |
|
"grad_norm": 0.4960779996118519, |
|
"learning_rate": 1.801722754623077e-06, |
|
"loss": 1.1356, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.8736905118228075, |
|
"grad_norm": 0.5194537399766056, |
|
"learning_rate": 1.775118274523545e-06, |
|
"loss": 1.1321, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.8832684824902723, |
|
"grad_norm": 0.5149057994299137, |
|
"learning_rate": 1.74860312282614e-06, |
|
"loss": 1.1306, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.8928464531577371, |
|
"grad_norm": 0.5061127962699723, |
|
"learning_rate": 1.72218056703789e-06, |
|
"loss": 1.1302, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.902424423825202, |
|
"grad_norm": 0.49704736224795454, |
|
"learning_rate": 1.6958538632550753e-06, |
|
"loss": 1.1479, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.9120023944926667, |
|
"grad_norm": 0.4976492539596855, |
|
"learning_rate": 1.6696262557619677e-06, |
|
"loss": 1.135, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.9215803651601315, |
|
"grad_norm": 0.5438558597014863, |
|
"learning_rate": 1.6435009766310372e-06, |
|
"loss": 1.1677, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.9311583358275966, |
|
"grad_norm": 0.49386649254244525, |
|
"learning_rate": 1.6174812453246582e-06, |
|
"loss": 1.1396, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.9407363064950613, |
|
"grad_norm": 0.5039832884638089, |
|
"learning_rate": 1.5915702682983657e-06, |
|
"loss": 1.1857, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.9503142771625261, |
|
"grad_norm": 0.4892382263387271, |
|
"learning_rate": 1.5657712386057202e-06, |
|
"loss": 1.15, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.9598922478299912, |
|
"grad_norm": 0.5084631284159544, |
|
"learning_rate": 1.5400873355048248e-06, |
|
"loss": 1.1572, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.969470218497456, |
|
"grad_norm": 0.5008750617477549, |
|
"learning_rate": 1.5145217240665373e-06, |
|
"loss": 1.1326, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.9790481891649208, |
|
"grad_norm": 0.4980386882470781, |
|
"learning_rate": 1.489077554784432e-06, |
|
"loss": 1.143, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.9790481891649208, |
|
"eval_loss": 2.1687815189361572, |
|
"eval_runtime": 107.1708, |
|
"eval_samples_per_second": 13.194, |
|
"eval_steps_per_second": 3.303, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.9886261598323856, |
|
"grad_norm": 0.4895688225344272, |
|
"learning_rate": 1.4637579631865645e-06, |
|
"loss": 1.1171, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.9982041304998504, |
|
"grad_norm": 0.49262081512228967, |
|
"learning_rate": 1.4385660694490667e-06, |
|
"loss": 1.1449, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.007782101167315, |
|
"grad_norm": 0.5057383346810608, |
|
"learning_rate": 1.4135049780116496e-06, |
|
"loss": 1.1394, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.0038910505836576, |
|
"grad_norm": 0.5387584817585892, |
|
"learning_rate": 1.388577777195035e-06, |
|
"loss": 1.1306, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.0134690212511224, |
|
"grad_norm": 0.5623404364476285, |
|
"learning_rate": 1.3637875388203784e-06, |
|
"loss": 1.0952, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.023046991918587, |
|
"grad_norm": 0.5743034832238124, |
|
"learning_rate": 1.3391373178307182e-06, |
|
"loss": 1.1261, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.032624962586052, |
|
"grad_norm": 0.5461858778537674, |
|
"learning_rate": 1.3146301519145153e-06, |
|
"loss": 1.1328, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.0422029332535168, |
|
"grad_norm": 0.5528333288756201, |
|
"learning_rate": 1.2902690611313135e-06, |
|
"loss": 1.1249, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.0517809039209816, |
|
"grad_norm": 0.5258934842101934, |
|
"learning_rate": 1.2660570475395684e-06, |
|
"loss": 1.1109, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.0613588745884464, |
|
"grad_norm": 0.5524292613274455, |
|
"learning_rate": 1.2419970948267014e-06, |
|
"loss": 1.1135, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.0709368452559116, |
|
"grad_norm": 0.5405228294413486, |
|
"learning_rate": 1.2180921679414143e-06, |
|
"loss": 1.1287, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.0805148159233764, |
|
"grad_norm": 0.5298775138689613, |
|
"learning_rate": 1.1943452127283145e-06, |
|
"loss": 1.124, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.090092786590841, |
|
"grad_norm": 0.514214942457388, |
|
"learning_rate": 1.1707591555648905e-06, |
|
"loss": 1.1059, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.099670757258306, |
|
"grad_norm": 0.5329396149825425, |
|
"learning_rate": 1.1473369030008974e-06, |
|
"loss": 1.1201, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.109248727925771, |
|
"grad_norm": 0.5564862124718808, |
|
"learning_rate": 1.124081341400165e-06, |
|
"loss": 1.1032, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.1188266985932356, |
|
"grad_norm": 0.5244468629630417, |
|
"learning_rate": 1.1009953365849168e-06, |
|
"loss": 1.1433, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.1284046692607004, |
|
"grad_norm": 0.5087349968174719, |
|
"learning_rate": 1.078081733482609e-06, |
|
"loss": 1.1286, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.137982639928165, |
|
"grad_norm": 0.522473732751717, |
|
"learning_rate": 1.055343355775339e-06, |
|
"loss": 1.084, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.14756061059563, |
|
"grad_norm": 0.5213841410982886, |
|
"learning_rate": 1.0327830055518843e-06, |
|
"loss": 1.0778, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.1571385812630948, |
|
"grad_norm": 0.5211792543694728, |
|
"learning_rate": 1.0104034629623933e-06, |
|
"loss": 1.0892, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.1667165519305596, |
|
"grad_norm": 0.5366639328996056, |
|
"learning_rate": 9.88207485875784e-07, |
|
"loss": 1.1129, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.1762945225980244, |
|
"grad_norm": 0.5072189940995689, |
|
"learning_rate": 9.661978095398854e-07, |
|
"loss": 1.1124, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.1858724932654896, |
|
"grad_norm": 0.5273739329980739, |
|
"learning_rate": 9.443771462443743e-07, |
|
"loss": 1.0966, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.1954504639329544, |
|
"grad_norm": 0.530434300883332, |
|
"learning_rate": 9.227481849865236e-07, |
|
"loss": 1.121, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.205028434600419, |
|
"grad_norm": 0.49620125772094664, |
|
"learning_rate": 9.013135911398435e-07, |
|
"loss": 1.1227, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.214606405267884, |
|
"grad_norm": 0.48930931831635505, |
|
"learning_rate": 8.800760061256205e-07, |
|
"loss": 1.1249, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.214606405267884, |
|
"eval_loss": 2.177833318710327, |
|
"eval_runtime": 106.9928, |
|
"eval_samples_per_second": 13.216, |
|
"eval_steps_per_second": 3.309, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.224184375935349, |
|
"grad_norm": 0.5117030753774101, |
|
"learning_rate": 8.590380470874066e-07, |
|
"loss": 1.0983, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.2337623466028136, |
|
"grad_norm": 0.5334281898363374, |
|
"learning_rate": 8.382023065685071e-07, |
|
"loss": 1.1058, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.2433403172702784, |
|
"grad_norm": 0.4997549069918058, |
|
"learning_rate": 8.175713521924977e-07, |
|
"loss": 1.1205, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.252918287937743, |
|
"grad_norm": 0.4903764233470244, |
|
"learning_rate": 7.971477263468108e-07, |
|
"loss": 1.1166, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.262496258605208, |
|
"grad_norm": 0.5111886828961109, |
|
"learning_rate": 7.769339458694319e-07, |
|
"loss": 1.1296, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.2720742292726728, |
|
"grad_norm": 0.5046245576610761, |
|
"learning_rate": 7.569325017387502e-07, |
|
"loss": 1.1214, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.2816521999401376, |
|
"grad_norm": 0.5012727372502416, |
|
"learning_rate": 7.371458587665822e-07, |
|
"loss": 1.1282, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.2912301706076024, |
|
"grad_norm": 0.5089746600647955, |
|
"learning_rate": 7.175764552944368e-07, |
|
"loss": 1.1228, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.300808141275067, |
|
"grad_norm": 0.49011501775043553, |
|
"learning_rate": 6.982267028930326e-07, |
|
"loss": 1.1019, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.310386111942532, |
|
"grad_norm": 0.5062866494664521, |
|
"learning_rate": 6.790989860651143e-07, |
|
"loss": 1.1237, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.3199640826099968, |
|
"grad_norm": 0.48914824725834716, |
|
"learning_rate": 6.601956619516037e-07, |
|
"loss": 1.1228, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.329542053277462, |
|
"grad_norm": 0.500095846054479, |
|
"learning_rate": 6.41519060041134e-07, |
|
"loss": 1.0725, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.339120023944927, |
|
"grad_norm": 0.48427264883155136, |
|
"learning_rate": 6.230714818829733e-07, |
|
"loss": 1.116, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.3486979946123916, |
|
"grad_norm": 0.5009855645248527, |
|
"learning_rate": 6.048552008034073e-07, |
|
"loss": 1.1158, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.3582759652798564, |
|
"grad_norm": 0.4895310310383359, |
|
"learning_rate": 5.868724616255899e-07, |
|
"loss": 1.1134, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.367853935947321, |
|
"grad_norm": 0.49721503448947285, |
|
"learning_rate": 5.691254803929117e-07, |
|
"loss": 1.1178, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.377431906614786, |
|
"grad_norm": 0.4908749278467018, |
|
"learning_rate": 5.516164440959118e-07, |
|
"loss": 1.0965, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.387009877282251, |
|
"grad_norm": 0.49404150582673295, |
|
"learning_rate": 5.343475104027743e-07, |
|
"loss": 1.1299, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.3965878479497156, |
|
"grad_norm": 0.4824591396857287, |
|
"learning_rate": 5.17320807393431e-07, |
|
"loss": 1.0795, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.4061658186171804, |
|
"grad_norm": 0.49129116007089907, |
|
"learning_rate": 5.005384332973154e-07, |
|
"loss": 1.1193, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.415743789284645, |
|
"grad_norm": 0.49733333626674653, |
|
"learning_rate": 4.840024562347987e-07, |
|
"loss": 1.11, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.42532175995211, |
|
"grad_norm": 0.4755146663348369, |
|
"learning_rate": 4.67714913962326e-07, |
|
"loss": 1.1091, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.434899730619575, |
|
"grad_norm": 0.49183045855158936, |
|
"learning_rate": 4.5167781362130374e-07, |
|
"loss": 1.1247, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.44447770128704, |
|
"grad_norm": 0.48090622566109875, |
|
"learning_rate": 4.3589313149075495e-07, |
|
"loss": 1.0957, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.454055671954505, |
|
"grad_norm": 0.49785336365870875, |
|
"learning_rate": 4.2036281274377865e-07, |
|
"loss": 1.1139, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.4636336426219696, |
|
"grad_norm": 0.49485716633378346, |
|
"learning_rate": 4.050887712078444e-07, |
|
"loss": 1.1298, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.4636336426219696, |
|
"eval_loss": 2.177307367324829, |
|
"eval_runtime": 107.0849, |
|
"eval_samples_per_second": 13.204, |
|
"eval_steps_per_second": 3.306, |
|
"step": 260 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 312, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 52, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.246866760855716e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |