{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9766839378238341,
  "eval_steps": 97,
  "global_step": 772,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0025906735751295338,
      "grad_norm": 0.37030652165412903,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.4535,
      "step": 1
    },
    {
      "epoch": 0.0025906735751295338,
      "eval_loss": 1.3420950174331665,
      "eval_runtime": 98.9896,
      "eval_samples_per_second": 2.526,
      "eval_steps_per_second": 0.323,
      "step": 1
    },
    {
      "epoch": 0.0051813471502590676,
      "grad_norm": 0.37870243191719055,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.3744,
      "step": 2
    },
    {
      "epoch": 0.007772020725388601,
      "grad_norm": 0.3809736669063568,
      "learning_rate": 3e-06,
      "loss": 1.413,
      "step": 3
    },
    {
      "epoch": 0.010362694300518135,
      "grad_norm": 0.3608606457710266,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.4169,
      "step": 4
    },
    {
      "epoch": 0.012953367875647668,
      "grad_norm": 0.3465673625469208,
      "learning_rate": 5e-06,
      "loss": 1.3166,
      "step": 5
    },
    {
      "epoch": 0.015544041450777202,
      "grad_norm": 0.36601942777633667,
      "learning_rate": 6e-06,
      "loss": 1.4029,
      "step": 6
    },
    {
      "epoch": 0.018134715025906734,
      "grad_norm": 0.374077171087265,
      "learning_rate": 7e-06,
      "loss": 1.4351,
      "step": 7
    },
    {
      "epoch": 0.02072538860103627,
      "grad_norm": 0.3928413391113281,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.3023,
      "step": 8
    },
    {
      "epoch": 0.023316062176165803,
      "grad_norm": 0.35695162415504456,
      "learning_rate": 9e-06,
      "loss": 1.4251,
      "step": 9
    },
    {
      "epoch": 0.025906735751295335,
      "grad_norm": 0.38057687878608704,
      "learning_rate": 1e-05,
      "loss": 1.4864,
      "step": 10
    },
    {
      "epoch": 0.02849740932642487,
      "grad_norm": 0.3801506757736206,
      "learning_rate": 9.999957505845144e-06,
      "loss": 1.4257,
      "step": 11
    },
    {
      "epoch": 0.031088082901554404,
      "grad_norm": 0.3910161852836609,
      "learning_rate": 9.999830024102874e-06,
      "loss": 1.456,
      "step": 12
    },
    {
      "epoch": 0.03367875647668394,
      "grad_norm": 0.37710297107696533,
      "learning_rate": 9.999617556940085e-06,
      "loss": 1.3441,
      "step": 13
    },
    {
      "epoch": 0.03626943005181347,
      "grad_norm": 0.4258323311805725,
      "learning_rate": 9.99932010796822e-06,
      "loss": 1.3589,
      "step": 14
    },
    {
      "epoch": 0.038860103626943004,
      "grad_norm": 0.42138487100601196,
      "learning_rate": 9.998937682243216e-06,
      "loss": 1.4402,
      "step": 15
    },
    {
      "epoch": 0.04145077720207254,
      "grad_norm": 0.45730850100517273,
      "learning_rate": 9.998470286265415e-06,
      "loss": 1.3916,
      "step": 16
    },
    {
      "epoch": 0.04404145077720207,
      "grad_norm": 0.4608759582042694,
      "learning_rate": 9.99791792797946e-06,
      "loss": 1.4034,
      "step": 17
    },
    {
      "epoch": 0.046632124352331605,
      "grad_norm": 0.45245274901390076,
      "learning_rate": 9.997280616774147e-06,
      "loss": 1.3311,
      "step": 18
    },
    {
      "epoch": 0.04922279792746114,
      "grad_norm": 0.462851881980896,
      "learning_rate": 9.996558363482277e-06,
      "loss": 1.4076,
      "step": 19
    },
    {
      "epoch": 0.05181347150259067,
      "grad_norm": 0.5080270767211914,
      "learning_rate": 9.995751180380468e-06,
      "loss": 1.4116,
      "step": 20
    },
    {
      "epoch": 0.054404145077720206,
      "grad_norm": 0.44085511565208435,
      "learning_rate": 9.994859081188944e-06,
      "loss": 1.399,
      "step": 21
    },
    {
      "epoch": 0.05699481865284974,
      "grad_norm": 0.40889567136764526,
      "learning_rate": 9.993882081071307e-06,
      "loss": 1.3666,
      "step": 22
    },
    {
      "epoch": 0.05958549222797927,
      "grad_norm": 0.42577946186065674,
      "learning_rate": 9.992820196634274e-06,
      "loss": 1.3906,
      "step": 23
    },
    {
      "epoch": 0.06217616580310881,
      "grad_norm": 0.46674349904060364,
      "learning_rate": 9.991673445927399e-06,
      "loss": 1.3446,
      "step": 24
    },
    {
      "epoch": 0.06476683937823834,
      "grad_norm": 0.48815953731536865,
      "learning_rate": 9.99044184844276e-06,
      "loss": 1.3156,
      "step": 25
    },
    {
      "epoch": 0.06735751295336788,
      "grad_norm": 0.4857616424560547,
      "learning_rate": 9.989125425114639e-06,
      "loss": 1.3713,
      "step": 26
    },
    {
      "epoch": 0.06994818652849741,
      "grad_norm": 0.4946143925189972,
      "learning_rate": 9.98772419831915e-06,
      "loss": 1.4438,
      "step": 27
    },
    {
      "epoch": 0.07253886010362694,
      "grad_norm": 0.47251901030540466,
      "learning_rate": 9.986238191873874e-06,
      "loss": 1.3626,
      "step": 28
    },
    {
      "epoch": 0.07512953367875648,
      "grad_norm": 0.5448423027992249,
      "learning_rate": 9.984667431037448e-06,
      "loss": 1.3201,
      "step": 29
    },
    {
      "epoch": 0.07772020725388601,
      "grad_norm": 0.4412786364555359,
      "learning_rate": 9.983011942509131e-06,
      "loss": 1.3581,
      "step": 30
    },
    {
      "epoch": 0.08031088082901554,
      "grad_norm": 0.44406625628471375,
      "learning_rate": 9.981271754428361e-06,
      "loss": 1.2458,
      "step": 31
    },
    {
      "epoch": 0.08290155440414508,
      "grad_norm": 0.5035629272460938,
      "learning_rate": 9.979446896374264e-06,
      "loss": 1.3699,
      "step": 32
    },
    {
      "epoch": 0.08549222797927461,
      "grad_norm": 0.47425153851509094,
      "learning_rate": 9.977537399365159e-06,
      "loss": 1.3558,
      "step": 33
    },
    {
      "epoch": 0.08808290155440414,
      "grad_norm": 0.5448233485221863,
      "learning_rate": 9.975543295858035e-06,
      "loss": 1.3342,
      "step": 34
    },
    {
      "epoch": 0.09067357512953368,
      "grad_norm": 0.49746477603912354,
      "learning_rate": 9.973464619747983e-06,
      "loss": 1.3138,
      "step": 35
    },
    {
      "epoch": 0.09326424870466321,
      "grad_norm": 0.48222672939300537,
      "learning_rate": 9.971301406367644e-06,
      "loss": 1.3991,
      "step": 36
    },
    {
      "epoch": 0.09585492227979274,
      "grad_norm": 0.5209602117538452,
      "learning_rate": 9.969053692486582e-06,
      "loss": 1.2775,
      "step": 37
    },
    {
      "epoch": 0.09844559585492228,
      "grad_norm": 0.4517224133014679,
      "learning_rate": 9.966721516310683e-06,
      "loss": 1.3207,
      "step": 38
    },
    {
      "epoch": 0.10103626943005181,
      "grad_norm": 0.5346600413322449,
      "learning_rate": 9.964304917481482e-06,
      "loss": 1.2108,
      "step": 39
    },
    {
      "epoch": 0.10362694300518134,
      "grad_norm": 0.5563952326774597,
      "learning_rate": 9.961803937075516e-06,
      "loss": 1.2934,
      "step": 40
    },
    {
      "epoch": 0.10621761658031088,
      "grad_norm": 0.5306499004364014,
      "learning_rate": 9.959218617603601e-06,
      "loss": 1.2202,
      "step": 41
    },
    {
      "epoch": 0.10880829015544041,
      "grad_norm": 0.4907849133014679,
      "learning_rate": 9.956549003010122e-06,
      "loss": 1.2261,
      "step": 42
    },
    {
      "epoch": 0.11139896373056994,
      "grad_norm": 0.5063201189041138,
      "learning_rate": 9.953795138672291e-06,
      "loss": 1.2863,
      "step": 43
    },
    {
      "epoch": 0.11398963730569948,
      "grad_norm": 0.5338860750198364,
      "learning_rate": 9.950957071399357e-06,
      "loss": 1.2388,
      "step": 44
    },
    {
      "epoch": 0.11658031088082901,
      "grad_norm": 0.5024104714393616,
      "learning_rate": 9.948034849431831e-06,
      "loss": 1.2573,
      "step": 45
    },
    {
      "epoch": 0.11917098445595854,
      "grad_norm": 0.5430374145507812,
      "learning_rate": 9.945028522440654e-06,
      "loss": 1.1556,
      "step": 46
    },
    {
      "epoch": 0.12176165803108809,
      "grad_norm": 0.4689851701259613,
      "learning_rate": 9.941938141526355e-06,
      "loss": 1.2576,
      "step": 47
    },
    {
      "epoch": 0.12435233160621761,
      "grad_norm": 0.4983338415622711,
      "learning_rate": 9.938763759218186e-06,
      "loss": 1.2247,
      "step": 48
    },
    {
      "epoch": 0.12694300518134716,
      "grad_norm": 0.47061145305633545,
      "learning_rate": 9.935505429473221e-06,
      "loss": 1.2173,
      "step": 49
    },
    {
      "epoch": 0.12953367875647667,
      "grad_norm": 0.5208876729011536,
      "learning_rate": 9.93216320767545e-06,
      "loss": 1.1284,
      "step": 50
    },
    {
      "epoch": 0.13212435233160622,
      "grad_norm": 0.5343345999717712,
      "learning_rate": 9.92873715063483e-06,
      "loss": 1.2766,
      "step": 51
    },
    {
      "epoch": 0.13471502590673576,
      "grad_norm": 0.5376512408256531,
      "learning_rate": 9.925227316586316e-06,
      "loss": 1.3082,
      "step": 52
    },
    {
      "epoch": 0.13730569948186527,
      "grad_norm": 0.49544206261634827,
      "learning_rate": 9.921633765188887e-06,
      "loss": 1.2488,
      "step": 53
    },
    {
      "epoch": 0.13989637305699482,
      "grad_norm": 0.5503423810005188,
      "learning_rate": 9.917956557524511e-06,
      "loss": 1.2837,
      "step": 54
    },
    {
      "epoch": 0.14248704663212436,
      "grad_norm": 0.6678252816200256,
      "learning_rate": 9.91419575609712e-06,
      "loss": 1.2592,
      "step": 55
    },
    {
      "epoch": 0.14507772020725387,
      "grad_norm": 0.5491769313812256,
      "learning_rate": 9.910351424831545e-06,
      "loss": 1.2528,
      "step": 56
    },
    {
      "epoch": 0.14766839378238342,
      "grad_norm": 0.5200887322425842,
      "learning_rate": 9.906423629072435e-06,
      "loss": 1.1433,
      "step": 57
    },
    {
      "epoch": 0.15025906735751296,
      "grad_norm": 0.5037204623222351,
      "learning_rate": 9.902412435583127e-06,
      "loss": 1.204,
      "step": 58
    },
    {
      "epoch": 0.15284974093264247,
      "grad_norm": 0.62909996509552,
      "learning_rate": 9.898317912544537e-06,
      "loss": 1.1012,
      "step": 59
    },
    {
      "epoch": 0.15544041450777202,
      "grad_norm": 0.5567560791969299,
      "learning_rate": 9.89414012955398e-06,
      "loss": 1.2056,
      "step": 60
    },
    {
      "epoch": 0.15803108808290156,
      "grad_norm": 0.4932163655757904,
      "learning_rate": 9.889879157624003e-06,
      "loss": 1.1295,
      "step": 61
    },
    {
      "epoch": 0.16062176165803108,
      "grad_norm": 0.49116799235343933,
      "learning_rate": 9.885535069181163e-06,
      "loss": 1.1767,
      "step": 62
    },
    {
      "epoch": 0.16321243523316062,
      "grad_norm": 0.5316625833511353,
      "learning_rate": 9.881107938064806e-06,
      "loss": 1.2038,
      "step": 63
    },
    {
      "epoch": 0.16580310880829016,
      "grad_norm": 0.5318853259086609,
      "learning_rate": 9.876597839525814e-06,
      "loss": 1.1865,
      "step": 64
    },
    {
      "epoch": 0.16839378238341968,
      "grad_norm": 0.535534143447876,
      "learning_rate": 9.872004850225313e-06,
      "loss": 1.2335,
      "step": 65
    },
    {
      "epoch": 0.17098445595854922,
      "grad_norm": 0.6383189558982849,
      "learning_rate": 9.867329048233387e-06,
      "loss": 1.1985,
      "step": 66
    },
    {
      "epoch": 0.17357512953367876,
      "grad_norm": 0.5795570611953735,
      "learning_rate": 9.862570513027736e-06,
      "loss": 1.1598,
      "step": 67
    },
    {
      "epoch": 0.17616580310880828,
      "grad_norm": 0.5034987330436707,
      "learning_rate": 9.857729325492329e-06,
      "loss": 1.1798,
      "step": 68
    },
    {
      "epoch": 0.17875647668393782,
      "grad_norm": 0.5522433519363403,
      "learning_rate": 9.85280556791604e-06,
      "loss": 1.1093,
      "step": 69
    },
    {
      "epoch": 0.18134715025906736,
      "grad_norm": 0.5660341382026672,
      "learning_rate": 9.847799323991234e-06,
      "loss": 1.1477,
      "step": 70
    },
    {
      "epoch": 0.18393782383419688,
      "grad_norm": 0.529484748840332,
      "learning_rate": 9.842710678812352e-06,
      "loss": 1.1028,
      "step": 71
    },
    {
      "epoch": 0.18652849740932642,
      "grad_norm": 0.4907700717449188,
      "learning_rate": 9.837539718874466e-06,
      "loss": 1.2247,
      "step": 72
    },
    {
      "epoch": 0.18911917098445596,
      "grad_norm": 0.5624697804450989,
      "learning_rate": 9.832286532071802e-06,
      "loss": 1.1997,
      "step": 73
    },
    {
      "epoch": 0.19170984455958548,
      "grad_norm": 0.5344734787940979,
      "learning_rate": 9.826951207696258e-06,
      "loss": 1.1389,
      "step": 74
    },
    {
      "epoch": 0.19430051813471502,
      "grad_norm": 0.5770263671875,
      "learning_rate": 9.82153383643587e-06,
      "loss": 1.2063,
      "step": 75
    },
    {
      "epoch": 0.19689119170984457,
      "grad_norm": 0.5622824430465698,
      "learning_rate": 9.816034510373287e-06,
      "loss": 1.1715,
      "step": 76
    },
    {
      "epoch": 0.19948186528497408,
      "grad_norm": 0.5856156945228577,
      "learning_rate": 9.81045332298419e-06,
      "loss": 1.155,
      "step": 77
    },
    {
      "epoch": 0.20207253886010362,
      "grad_norm": 0.5373365879058838,
      "learning_rate": 9.804790369135719e-06,
      "loss": 1.1258,
      "step": 78
    },
    {
      "epoch": 0.20466321243523317,
      "grad_norm": 0.5527685284614563,
      "learning_rate": 9.799045745084848e-06,
      "loss": 1.1948,
      "step": 79
    },
    {
      "epoch": 0.20725388601036268,
      "grad_norm": 0.5555474162101746,
      "learning_rate": 9.793219548476754e-06,
      "loss": 1.1871,
      "step": 80
    },
    {
      "epoch": 0.20984455958549222,
      "grad_norm": 0.5365428328514099,
      "learning_rate": 9.787311878343158e-06,
      "loss": 1.1591,
      "step": 81
    },
    {
      "epoch": 0.21243523316062177,
      "grad_norm": 0.5377835631370544,
      "learning_rate": 9.781322835100639e-06,
      "loss": 1.1238,
      "step": 82
    },
    {
      "epoch": 0.21502590673575128,
      "grad_norm": 0.5349002480506897,
      "learning_rate": 9.77525252054893e-06,
      "loss": 1.181,
      "step": 83
    },
    {
      "epoch": 0.21761658031088082,
      "grad_norm": 0.6802361607551575,
      "learning_rate": 9.769101037869187e-06,
      "loss": 1.2205,
      "step": 84
    },
    {
      "epoch": 0.22020725388601037,
      "grad_norm": 0.5526275038719177,
      "learning_rate": 9.762868491622229e-06,
      "loss": 1.1173,
      "step": 85
    },
    {
      "epoch": 0.22279792746113988,
      "grad_norm": 0.6218620538711548,
      "learning_rate": 9.756554987746777e-06,
      "loss": 1.1103,
      "step": 86
    },
    {
      "epoch": 0.22538860103626943,
      "grad_norm": 0.6305224299430847,
      "learning_rate": 9.750160633557626e-06,
      "loss": 1.132,
      "step": 87
    },
    {
      "epoch": 0.22797927461139897,
      "grad_norm": 0.6259211301803589,
      "learning_rate": 9.743685537743856e-06,
      "loss": 1.1183,
      "step": 88
    },
    {
      "epoch": 0.23056994818652848,
      "grad_norm": 0.866215169429779,
      "learning_rate": 9.737129810366952e-06,
      "loss": 1.1117,
      "step": 89
    },
    {
      "epoch": 0.23316062176165803,
      "grad_norm": 0.700805127620697,
      "learning_rate": 9.730493562858954e-06,
      "loss": 1.0444,
      "step": 90
    },
    {
      "epoch": 0.23575129533678757,
      "grad_norm": 0.630706250667572,
      "learning_rate": 9.72377690802055e-06,
      "loss": 1.2306,
      "step": 91
    },
    {
      "epoch": 0.23834196891191708,
      "grad_norm": 0.5979788899421692,
      "learning_rate": 9.716979960019173e-06,
      "loss": 1.1418,
      "step": 92
    },
    {
      "epoch": 0.24093264248704663,
      "grad_norm": 0.496480792760849,
      "learning_rate": 9.710102834387043e-06,
      "loss": 1.0891,
      "step": 93
    },
    {
      "epoch": 0.24352331606217617,
      "grad_norm": 0.5865656137466431,
      "learning_rate": 9.70314564801922e-06,
      "loss": 1.0926,
      "step": 94
    },
    {
      "epoch": 0.24611398963730569,
      "grad_norm": 0.6677915453910828,
      "learning_rate": 9.696108519171605e-06,
      "loss": 1.0742,
      "step": 95
    },
    {
      "epoch": 0.24870466321243523,
      "grad_norm": 0.6176425218582153,
      "learning_rate": 9.688991567458934e-06,
      "loss": 1.114,
      "step": 96
    },
    {
      "epoch": 0.25129533678756477,
      "grad_norm": 0.6433624625205994,
      "learning_rate": 9.681794913852747e-06,
      "loss": 1.1304,
      "step": 97
    },
    {
      "epoch": 0.25129533678756477,
      "eval_loss": 1.0442166328430176,
      "eval_runtime": 99.046,
      "eval_samples_per_second": 2.524,
      "eval_steps_per_second": 0.323,
      "step": 97
    },
    {
      "epoch": 0.2538860103626943,
      "grad_norm": 0.634186863899231,
      "learning_rate": 9.67451868067933e-06,
      "loss": 1.0791,
      "step": 98
    },
    {
      "epoch": 0.25647668393782386,
      "grad_norm": 0.6801970601081848,
      "learning_rate": 9.667162991617633e-06,
      "loss": 1.1209,
      "step": 99
    },
    {
      "epoch": 0.25906735751295334,
      "grad_norm": 0.5541910529136658,
      "learning_rate": 9.659727971697173e-06,
      "loss": 1.1145,
      "step": 100
    },
    {
      "epoch": 0.2616580310880829,
      "grad_norm": 0.600925624370575,
      "learning_rate": 9.652213747295906e-06,
      "loss": 1.1005,
      "step": 101
    },
    {
      "epoch": 0.26424870466321243,
      "grad_norm": 0.6611819863319397,
      "learning_rate": 9.644620446138078e-06,
      "loss": 1.1321,
      "step": 102
    },
    {
      "epoch": 0.266839378238342,
      "grad_norm": 0.6193287372589111,
      "learning_rate": 9.636948197292051e-06,
      "loss": 1.069,
      "step": 103
    },
    {
      "epoch": 0.2694300518134715,
      "grad_norm": 0.5518503189086914,
      "learning_rate": 9.629197131168125e-06,
      "loss": 1.092,
      "step": 104
    },
    {
      "epoch": 0.27202072538860106,
      "grad_norm": 0.6933501362800598,
      "learning_rate": 9.621367379516294e-06,
      "loss": 1.0589,
      "step": 105
    },
    {
      "epoch": 0.27461139896373055,
      "grad_norm": 0.7061232924461365,
      "learning_rate": 9.613459075424033e-06,
      "loss": 1.094,
      "step": 106
    },
    {
      "epoch": 0.2772020725388601,
      "grad_norm": 0.7254324555397034,
      "learning_rate": 9.605472353314024e-06,
      "loss": 1.0386,
      "step": 107
    },
    {
      "epoch": 0.27979274611398963,
      "grad_norm": 0.5736768841743469,
      "learning_rate": 9.597407348941865e-06,
      "loss": 1.0804,
      "step": 108
    },
    {
      "epoch": 0.2823834196891192,
      "grad_norm": 0.5374739170074463,
      "learning_rate": 9.589264199393776e-06,
      "loss": 1.0699,
      "step": 109
    },
    {
      "epoch": 0.2849740932642487,
      "grad_norm": 0.7706698775291443,
      "learning_rate": 9.58104304308426e-06,
      "loss": 1.0498,
      "step": 110
    },
    {
      "epoch": 0.28756476683937826,
      "grad_norm": 0.7128419876098633,
      "learning_rate": 9.572744019753753e-06,
      "loss": 1.1212,
      "step": 111
    },
    {
      "epoch": 0.29015544041450775,
      "grad_norm": 0.57721346616745,
      "learning_rate": 9.564367270466247e-06,
      "loss": 1.0665,
      "step": 112
    },
    {
      "epoch": 0.2927461139896373,
      "grad_norm": 0.6171419024467468,
      "learning_rate": 9.555912937606896e-06,
      "loss": 1.1146,
      "step": 113
    },
    {
      "epoch": 0.29533678756476683,
      "grad_norm": 0.575703501701355,
      "learning_rate": 9.54738116487959e-06,
      "loss": 1.1132,
      "step": 114
    },
    {
      "epoch": 0.2979274611398964,
      "grad_norm": 0.6700958609580994,
      "learning_rate": 9.53877209730452e-06,
      "loss": 1.1184,
      "step": 115
    },
    {
      "epoch": 0.3005181347150259,
      "grad_norm": 0.7100483179092407,
      "learning_rate": 9.530085881215705e-06,
      "loss": 1.1258,
      "step": 116
    },
    {
      "epoch": 0.30310880829015546,
      "grad_norm": 0.6098718047142029,
      "learning_rate": 9.52132266425851e-06,
      "loss": 1.1547,
      "step": 117
    },
    {
      "epoch": 0.30569948186528495,
      "grad_norm": 0.6828567981719971,
      "learning_rate": 9.512482595387131e-06,
      "loss": 1.0376,
      "step": 118
    },
    {
      "epoch": 0.3082901554404145,
      "grad_norm": 0.6913807988166809,
      "learning_rate": 9.503565824862076e-06,
      "loss": 1.1741,
      "step": 119
    },
    {
      "epoch": 0.31088082901554404,
      "grad_norm": 0.6839683651924133,
      "learning_rate": 9.494572504247593e-06,
      "loss": 1.1419,
      "step": 120
    },
    {
      "epoch": 0.3134715025906736,
      "grad_norm": 0.6861181855201721,
      "learning_rate": 9.485502786409107e-06,
      "loss": 1.1121,
      "step": 121
    },
    {
      "epoch": 0.3160621761658031,
      "grad_norm": 0.7473586201667786,
      "learning_rate": 9.476356825510613e-06,
      "loss": 1.1247,
      "step": 122
    },
    {
      "epoch": 0.31865284974093266,
      "grad_norm": 0.7107284665107727,
      "learning_rate": 9.467134777012063e-06,
      "loss": 1.0763,
      "step": 123
    },
    {
      "epoch": 0.32124352331606215,
      "grad_norm": 0.7920987010002136,
      "learning_rate": 9.457836797666722e-06,
      "loss": 1.0304,
      "step": 124
    },
    {
      "epoch": 0.3238341968911917,
      "grad_norm": 0.9145889282226562,
      "learning_rate": 9.448463045518498e-06,
      "loss": 1.2063,
      "step": 125
    },
    {
      "epoch": 0.32642487046632124,
      "grad_norm": 0.5906431674957275,
      "learning_rate": 9.439013679899263e-06,
      "loss": 1.0248,
      "step": 126
    },
    {
      "epoch": 0.3290155440414508,
      "grad_norm": 0.7043807506561279,
      "learning_rate": 9.429488861426137e-06,
      "loss": 1.1032,
      "step": 127
    },
    {
      "epoch": 0.3316062176165803,
      "grad_norm": 0.6444696187973022,
      "learning_rate": 9.419888751998768e-06,
      "loss": 1.0548,
      "step": 128
    },
    {
      "epoch": 0.33419689119170987,
      "grad_norm": 0.7440561652183533,
      "learning_rate": 9.410213514796565e-06,
      "loss": 1.0761,
      "step": 129
    },
    {
      "epoch": 0.33678756476683935,
      "grad_norm": 0.6268182992935181,
      "learning_rate": 9.400463314275942e-06,
      "loss": 1.0308,
      "step": 130
    },
    {
      "epoch": 0.3393782383419689,
      "grad_norm": 0.7046973705291748,
      "learning_rate": 9.390638316167513e-06,
      "loss": 1.061,
      "step": 131
    },
    {
      "epoch": 0.34196891191709844,
      "grad_norm": 0.6304768323898315,
      "learning_rate": 9.380738687473274e-06,
      "loss": 1.0291,
      "step": 132
    },
    {
      "epoch": 0.344559585492228,
      "grad_norm": 0.7118530869483948,
      "learning_rate": 9.370764596463764e-06,
      "loss": 1.0132,
      "step": 133
    },
    {
      "epoch": 0.3471502590673575,
      "grad_norm": 0.6919021010398865,
      "learning_rate": 9.360716212675213e-06,
      "loss": 1.0753,
      "step": 134
    },
    {
      "epoch": 0.34974093264248707,
      "grad_norm": 0.6322518587112427,
      "learning_rate": 9.350593706906653e-06,
      "loss": 1.0591,
      "step": 135
    },
    {
      "epoch": 0.35233160621761656,
      "grad_norm": 0.9555734395980835,
      "learning_rate": 9.340397251217009e-06,
      "loss": 1.0681,
      "step": 136
    },
    {
      "epoch": 0.3549222797927461,
      "grad_norm": 0.7190225720405579,
      "learning_rate": 9.330127018922195e-06,
      "loss": 1.1177,
      "step": 137
    },
    {
      "epoch": 0.35751295336787564,
      "grad_norm": 0.6554561853408813,
      "learning_rate": 9.319783184592142e-06,
      "loss": 1.0129,
      "step": 138
    },
    {
      "epoch": 0.3601036269430052,
      "grad_norm": 0.6203082203865051,
      "learning_rate": 9.309365924047853e-06,
      "loss": 1.0762,
      "step": 139
    },
    {
      "epoch": 0.3626943005181347,
      "grad_norm": 0.8245846033096313,
      "learning_rate": 9.298875414358399e-06,
      "loss": 1.0528,
      "step": 140
    },
    {
      "epoch": 0.36528497409326427,
      "grad_norm": 0.7231378555297852,
      "learning_rate": 9.288311833837918e-06,
      "loss": 1.0407,
      "step": 141
    },
    {
      "epoch": 0.36787564766839376,
      "grad_norm": 0.7020997405052185,
      "learning_rate": 9.27767536204258e-06,
      "loss": 1.1054,
      "step": 142
    },
    {
      "epoch": 0.3704663212435233,
      "grad_norm": 0.6527167558670044,
      "learning_rate": 9.266966179767539e-06,
      "loss": 1.0096,
      "step": 143
    },
    {
      "epoch": 0.37305699481865284,
      "grad_norm": 0.6411340236663818,
      "learning_rate": 9.256184469043852e-06,
      "loss": 1.0547,
      "step": 144
    },
    {
      "epoch": 0.3756476683937824,
      "grad_norm": 1.0810225009918213,
      "learning_rate": 9.245330413135395e-06,
      "loss": 1.0752,
      "step": 145
    },
    {
      "epoch": 0.37823834196891193,
      "grad_norm": 0.9295507669448853,
      "learning_rate": 9.23440419653574e-06,
      "loss": 1.0341,
      "step": 146
    },
    {
      "epoch": 0.38082901554404147,
      "grad_norm": 0.7355513572692871,
      "learning_rate": 9.223406004965023e-06,
      "loss": 1.0998,
      "step": 147
    },
    {
      "epoch": 0.38341968911917096,
      "grad_norm": 0.8607478141784668,
      "learning_rate": 9.212336025366789e-06,
      "loss": 1.026,
      "step": 148
    },
    {
      "epoch": 0.3860103626943005,
      "grad_norm": 0.7150827646255493,
      "learning_rate": 9.201194445904804e-06,
      "loss": 1.071,
      "step": 149
    },
    {
      "epoch": 0.38860103626943004,
      "grad_norm": 0.688491940498352,
      "learning_rate": 9.189981455959873e-06,
      "loss": 0.994,
      "step": 150
    },
    {
      "epoch": 0.3911917098445596,
      "grad_norm": 0.8984132409095764,
      "learning_rate": 9.178697246126608e-06,
      "loss": 1.0318,
      "step": 151
    },
    {
      "epoch": 0.39378238341968913,
      "grad_norm": 0.6616289019584656,
      "learning_rate": 9.167342008210191e-06,
      "loss": 1.0407,
      "step": 152
    },
    {
      "epoch": 0.3963730569948187,
      "grad_norm": 0.7567489147186279,
      "learning_rate": 9.15591593522312e-06,
      "loss": 1.0323,
      "step": 153
    },
    {
      "epoch": 0.39896373056994816,
      "grad_norm": 0.6091617345809937,
      "learning_rate": 9.144419221381919e-06,
      "loss": 1.0361,
      "step": 154
    },
    {
      "epoch": 0.4015544041450777,
      "grad_norm": 0.721120297908783,
      "learning_rate": 9.132852062103845e-06,
      "loss": 1.101,
      "step": 155
    },
    {
      "epoch": 0.40414507772020725,
      "grad_norm": 0.7634301781654358,
      "learning_rate": 9.121214654003561e-06,
      "loss": 1.0285,
      "step": 156
    },
    {
      "epoch": 0.4067357512953368,
      "grad_norm": 0.6629643440246582,
      "learning_rate": 9.109507194889793e-06,
      "loss": 1.0858,
      "step": 157
    },
    {
      "epoch": 0.40932642487046633,
      "grad_norm": 0.6693721413612366,
      "learning_rate": 9.097729883761977e-06,
      "loss": 1.0461,
      "step": 158
    },
    {
      "epoch": 0.4119170984455959,
      "grad_norm": 0.6852269768714905,
      "learning_rate": 9.085882920806862e-06,
      "loss": 1.0298,
      "step": 159
    },
    {
      "epoch": 0.41450777202072536,
      "grad_norm": 0.6539198756217957,
      "learning_rate": 9.073966507395123e-06,
      "loss": 1.0298,
      "step": 160
    },
    {
      "epoch": 0.4170984455958549,
      "grad_norm": 0.9533804655075073,
      "learning_rate": 9.061980846077925e-06,
      "loss": 1.062,
      "step": 161
    },
    {
      "epoch": 0.41968911917098445,
      "grad_norm": 0.6733927130699158,
      "learning_rate": 9.049926140583487e-06,
      "loss": 1.0268,
      "step": 162
    },
    {
      "epoch": 0.422279792746114,
      "grad_norm": 0.7857198715209961,
      "learning_rate": 9.037802595813621e-06,
      "loss": 1.052,
      "step": 163
    },
    {
      "epoch": 0.42487046632124353,
      "grad_norm": 0.7269684076309204,
      "learning_rate": 9.025610417840238e-06,
      "loss": 1.0026,
      "step": 164
    },
    {
      "epoch": 0.4274611398963731,
      "grad_norm": 0.8214792013168335,
      "learning_rate": 9.01334981390186e-06,
      "loss": 1.0533,
      "step": 165
    },
    {
      "epoch": 0.43005181347150256,
      "grad_norm": 0.6824995875358582,
      "learning_rate": 9.001020992400086e-06,
      "loss": 1.0437,
      "step": 166
    },
    {
      "epoch": 0.4326424870466321,
      "grad_norm": 0.6578338742256165,
      "learning_rate": 8.988624162896058e-06,
      "loss": 1.097,
      "step": 167
    },
    {
      "epoch": 0.43523316062176165,
      "grad_norm": 0.7836469411849976,
      "learning_rate": 8.976159536106895e-06,
      "loss": 0.977,
      "step": 168
    },
    {
      "epoch": 0.4378238341968912,
      "grad_norm": 0.7745511531829834,
      "learning_rate": 8.963627323902105e-06,
      "loss": 0.9812,
      "step": 169
    },
    {
      "epoch": 0.44041450777202074,
      "grad_norm": 0.712700366973877,
      "learning_rate": 8.951027739299996e-06,
      "loss": 1.0043,
      "step": 170
    },
    {
      "epoch": 0.4430051813471503,
      "grad_norm": 0.6669776439666748,
      "learning_rate": 8.938360996464048e-06,
      "loss": 1.0269,
      "step": 171
    },
    {
      "epoch": 0.44559585492227977,
      "grad_norm": 0.693252444267273,
      "learning_rate": 8.925627310699275e-06,
      "loss": 0.9963,
      "step": 172
    },
    {
      "epoch": 0.4481865284974093,
      "grad_norm": 0.741417407989502,
      "learning_rate": 8.91282689844856e-06,
      "loss": 1.0292,
      "step": 173
    },
    {
      "epoch": 0.45077720207253885,
      "grad_norm": 0.8754829168319702,
      "learning_rate": 8.899959977288988e-06,
      "loss": 1.0089,
      "step": 174
    },
    {
      "epoch": 0.4533678756476684,
      "grad_norm": 0.6871668100357056,
      "learning_rate": 8.887026765928129e-06,
      "loss": 0.9792,
      "step": 175
    },
    {
      "epoch": 0.45595854922279794,
      "grad_norm": 0.7657473683357239,
      "learning_rate": 8.874027484200342e-06,
      "loss": 1.0297,
      "step": 176
    },
    {
      "epoch": 0.4585492227979275,
      "grad_norm": 0.7051308751106262,
      "learning_rate": 8.860962353063022e-06,
      "loss": 1.0033,
      "step": 177
    },
    {
      "epoch": 0.46113989637305697,
      "grad_norm": 0.7424786686897278,
      "learning_rate": 8.847831594592851e-06,
      "loss": 1.0776,
      "step": 178
    },
    {
      "epoch": 0.4637305699481865,
      "grad_norm": 0.9562065601348877,
      "learning_rate": 8.834635431982022e-06,
      "loss": 0.893,
      "step": 179
    },
    {
      "epoch": 0.46632124352331605,
      "grad_norm": 0.7324496507644653,
      "learning_rate": 8.821374089534446e-06,
      "loss": 0.9691,
      "step": 180
    },
    {
      "epoch": 0.4689119170984456,
      "grad_norm": 0.6958107948303223,
      "learning_rate": 8.808047792661941e-06,
      "loss": 1.0113,
      "step": 181
    },
    {
      "epoch": 0.47150259067357514,
      "grad_norm": 0.7926040291786194,
      "learning_rate": 8.794656767880394e-06,
      "loss": 1.054,
      "step": 182
    },
    {
      "epoch": 0.4740932642487047,
      "grad_norm": 0.7949718236923218,
      "learning_rate": 8.781201242805917e-06,
      "loss": 1.0385,
      "step": 183
    },
    {
      "epoch": 0.47668393782383417,
      "grad_norm": 0.7993385195732117,
      "learning_rate": 8.767681446150977e-06,
      "loss": 0.987,
      "step": 184
    },
    {
      "epoch": 0.4792746113989637,
      "grad_norm": 0.8635035157203674,
      "learning_rate": 8.754097607720512e-06,
      "loss": 1.1012,
      "step": 185
    },
    {
      "epoch": 0.48186528497409326,
      "grad_norm": 0.7778316140174866,
      "learning_rate": 8.740449958408006e-06,
      "loss": 1.0547,
      "step": 186
    },
    {
      "epoch": 0.4844559585492228,
      "grad_norm": 0.7622816562652588,
      "learning_rate": 8.726738730191596e-06,
      "loss": 1.057,
      "step": 187
    },
    {
      "epoch": 0.48704663212435234,
      "grad_norm": 0.8010302782058716,
      "learning_rate": 8.7129641561301e-06,
      "loss": 1.06,
      "step": 188
    },
    {
      "epoch": 0.4896373056994819,
      "grad_norm": 0.8265953063964844,
      "learning_rate": 8.699126470359073e-06,
      "loss": 0.9859,
      "step": 189
    },
    {
      "epoch": 0.49222797927461137,
      "grad_norm": 0.7856041193008423,
      "learning_rate": 8.68522590808682e-06,
      "loss": 1.05,
      "step": 190
    },
    {
      "epoch": 0.4948186528497409,
      "grad_norm": 0.7639081478118896,
      "learning_rate": 8.671262705590399e-06,
      "loss": 1.0018,
      "step": 191
    },
    {
      "epoch": 0.49740932642487046,
      "grad_norm": 0.8716084361076355,
      "learning_rate": 8.657237100211604e-06,
      "loss": 1.0073,
      "step": 192
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.7365914583206177,
      "learning_rate": 8.643149330352939e-06,
      "loss": 1.0448,
      "step": 193
    },
    {
      "epoch": 0.5025906735751295,
      "grad_norm": 0.7660321593284607,
      "learning_rate": 8.628999635473547e-06,
      "loss": 1.0415,
      "step": 194
    },
    {
      "epoch": 0.5025906735751295,
      "eval_loss": 0.9808682203292847,
      "eval_runtime": 99.1803,
      "eval_samples_per_second": 2.521,
      "eval_steps_per_second": 0.323,
      "step": 194
    },
    {
      "epoch": 0.5051813471502591,
      "grad_norm": 0.8740248680114746,
      "learning_rate": 8.61478825608516e-06,
      "loss": 1.08,
      "step": 195
    },
    {
      "epoch": 0.5077720207253886,
      "grad_norm": 0.8985514640808105,
      "learning_rate": 8.600515433748003e-06,
      "loss": 1.0248,
      "step": 196
    },
    {
      "epoch": 0.5103626943005182,
      "grad_norm": 0.6852197647094727,
      "learning_rate": 8.586181411066684e-06,
      "loss": 1.0513,
      "step": 197
    },
    {
      "epoch": 0.5129533678756477,
      "grad_norm": 0.8582538366317749,
      "learning_rate": 8.571786431686074e-06,
      "loss": 1.0453,
      "step": 198
    },
    {
      "epoch": 0.5155440414507773,
      "grad_norm": 0.7389397621154785,
      "learning_rate": 8.557330740287166e-06,
      "loss": 1.0516,
      "step": 199
    },
    {
      "epoch": 0.5181347150259067,
      "grad_norm": 0.7529143691062927,
      "learning_rate": 8.542814582582917e-06,
      "loss": 1.0294,
      "step": 200
    },
    {
      "epoch": 0.5207253886010362,
      "grad_norm": 0.828758180141449,
      "learning_rate": 8.528238205314067e-06,
      "loss": 1.0357,
      "step": 201
    },
    {
      "epoch": 0.5233160621761658,
      "grad_norm": 0.841716468334198,
      "learning_rate": 8.513601856244951e-06,
      "loss": 1.0827,
      "step": 202
    },
    {
      "epoch": 0.5259067357512953,
      "grad_norm": 0.807770848274231,
      "learning_rate": 8.498905784159282e-06,
      "loss": 1.0939,
      "step": 203
    },
    {
      "epoch": 0.5284974093264249,
      "grad_norm": 0.7827705144882202,
      "learning_rate": 8.484150238855921e-06,
      "loss": 1.0735,
      "step": 204
    },
    {
      "epoch": 0.5310880829015544,
      "grad_norm": 0.7675580382347107,
      "learning_rate": 8.469335471144646e-06,
      "loss": 1.0268,
      "step": 205
    },
    {
      "epoch": 0.533678756476684,
      "grad_norm": 0.767455518245697,
      "learning_rate": 8.454461732841864e-06,
      "loss": 1.0624,
      "step": 206
    },
    {
      "epoch": 0.5362694300518135,
      "grad_norm": 0.6222347617149353,
      "learning_rate": 8.439529276766354e-06,
      "loss": 1.0263,
      "step": 207
    },
    {
      "epoch": 0.538860103626943,
      "grad_norm": 0.7709267735481262,
      "learning_rate": 8.424538356734957e-06,
      "loss": 1.0703,
      "step": 208
    },
    {
      "epoch": 0.5414507772020726,
      "grad_norm": 0.9357248544692993,
      "learning_rate": 8.40948922755826e-06,
      "loss": 1.0299,
      "step": 209
    },
    {
      "epoch": 0.5440414507772021,
      "grad_norm": 0.7862836718559265,
      "learning_rate": 8.394382145036277e-06,
      "loss": 1.017,
      "step": 210
    },
    {
      "epoch": 0.5466321243523317,
      "grad_norm": 0.8551638722419739,
      "learning_rate": 8.379217365954089e-06,
      "loss": 0.9445,
      "step": 211
    },
    {
      "epoch": 0.5492227979274611,
      "grad_norm": 0.7895774841308594,
      "learning_rate": 8.363995148077481e-06,
      "loss": 1.1145,
      "step": 212
    },
    {
      "epoch": 0.5518134715025906,
      "grad_norm": 0.684273362159729,
      "learning_rate": 8.348715750148571e-06,
      "loss": 1.0439,
      "step": 213
    },
    {
      "epoch": 0.5544041450777202,
      "grad_norm": 0.748660147190094,
      "learning_rate": 8.333379431881398e-06,
      "loss": 1.034,
      "step": 214
    },
    {
      "epoch": 0.5569948186528497,
      "grad_norm": 0.8681594133377075,
      "learning_rate": 8.317986453957514e-06,
      "loss": 1.0523,
      "step": 215
    },
    {
      "epoch": 0.5595854922279793,
      "grad_norm": 0.8759576678276062,
      "learning_rate": 8.302537078021555e-06,
      "loss": 1.0167,
      "step": 216
    },
    {
      "epoch": 0.5621761658031088,
      "grad_norm": 0.8709188103675842,
      "learning_rate": 8.28703156667679e-06,
      "loss": 1.0947,
      "step": 217
    },
    {
      "epoch": 0.5647668393782384,
      "grad_norm": 0.7970831990242004,
      "learning_rate": 8.271470183480664e-06,
      "loss": 1.0404,
      "step": 218
    },
    {
      "epoch": 0.5673575129533679,
      "grad_norm": 0.8916209936141968,
      "learning_rate": 8.2558531929403e-06,
      "loss": 1.0681,
      "step": 219
    },
    {
      "epoch": 0.5699481865284974,
      "grad_norm": 0.7815295457839966,
      "learning_rate": 8.240180860508027e-06,
      "loss": 1.0273,
      "step": 220
    },
    {
      "epoch": 0.572538860103627,
      "grad_norm": 0.8419135808944702,
      "learning_rate": 8.224453452576855e-06,
      "loss": 0.974,
      "step": 221
    },
    {
      "epoch": 0.5751295336787565,
      "grad_norm": 1.0326911211013794,
      "learning_rate": 8.208671236475945e-06,
      "loss": 1.0307,
      "step": 222
    },
    {
      "epoch": 0.5777202072538861,
      "grad_norm": 0.8720448017120361,
      "learning_rate": 8.192834480466072e-06,
      "loss": 1.0061,
      "step": 223
    },
    {
      "epoch": 0.5803108808290155,
      "grad_norm": 0.7192279100418091,
      "learning_rate": 8.176943453735062e-06,
      "loss": 0.9383,
      "step": 224
    },
    {
      "epoch": 0.582901554404145,
      "grad_norm": 0.7928858995437622,
      "learning_rate": 8.160998426393214e-06,
      "loss": 1.0538,
      "step": 225
    },
    {
      "epoch": 0.5854922279792746,
      "grad_norm": 0.9114733934402466,
      "learning_rate": 8.144999669468714e-06,
      "loss": 1.0704,
      "step": 226
    },
    {
      "epoch": 0.5880829015544041,
      "grad_norm": 0.8045042753219604,
      "learning_rate": 8.12894745490302e-06,
      "loss": 1.0213,
      "step": 227
    },
    {
      "epoch": 0.5906735751295337,
      "grad_norm": 0.8414791226387024,
      "learning_rate": 8.112842055546254e-06,
      "loss": 1.1058,
      "step": 228
    },
    {
      "epoch": 0.5932642487046632,
      "grad_norm": 0.8457399606704712,
      "learning_rate": 8.096683745152544e-06,
      "loss": 1.0407,
      "step": 229
    },
    {
      "epoch": 0.5958549222797928,
      "grad_norm": 0.7906298637390137,
      "learning_rate": 8.080472798375392e-06,
      "loss": 1.0335,
      "step": 230
    },
    {
      "epoch": 0.5984455958549223,
      "grad_norm": 0.8662751913070679,
      "learning_rate": 8.064209490762988e-06,
      "loss": 1.0809,
      "step": 231
    },
    {
      "epoch": 0.6010362694300518,
      "grad_norm": 0.8119012117385864,
      "learning_rate": 8.04789409875354e-06,
      "loss": 1.0132,
      "step": 232
    },
    {
      "epoch": 0.6036269430051814,
      "grad_norm": 0.7348366975784302,
      "learning_rate": 8.031526899670563e-06,
      "loss": 1.0383,
      "step": 233
    },
    {
      "epoch": 0.6062176165803109,
      "grad_norm": 0.7896949648857117,
      "learning_rate": 8.015108171718177e-06,
      "loss": 1.0656,
      "step": 234
    },
    {
      "epoch": 0.6088082901554405,
      "grad_norm": 0.8619694113731384,
      "learning_rate": 7.998638193976366e-06,
      "loss": 1.0881,
      "step": 235
    },
    {
      "epoch": 0.6113989637305699,
      "grad_norm": 0.7404602766036987,
      "learning_rate": 7.982117246396246e-06,
      "loss": 1.0613,
      "step": 236
    },
    {
      "epoch": 0.6139896373056994,
      "grad_norm": 0.6466916799545288,
      "learning_rate": 7.965545609795297e-06,
      "loss": 1.0333,
      "step": 237
    },
    {
      "epoch": 0.616580310880829,
      "grad_norm": 0.6092397570610046,
      "learning_rate": 7.948923565852597e-06,
      "loss": 1.0161,
      "step": 238
    },
    {
      "epoch": 0.6191709844559585,
      "grad_norm": 0.7040543556213379,
      "learning_rate": 7.932251397104031e-06,
      "loss": 0.9819,
      "step": 239
    },
    {
      "epoch": 0.6217616580310881,
      "grad_norm": 1.0263625383377075,
      "learning_rate": 7.915529386937486e-06,
      "loss": 0.9477,
      "step": 240
    },
    {
      "epoch": 0.6243523316062176,
      "grad_norm": 0.6552597284317017,
      "learning_rate": 7.898757819588038e-06,
      "loss": 0.989,
      "step": 241
    },
    {
      "epoch": 0.6269430051813472,
      "grad_norm": 0.7315415143966675,
      "learning_rate": 7.881936980133118e-06,
      "loss": 1.0219,
      "step": 242
    },
    {
      "epoch": 0.6295336787564767,
      "grad_norm": 0.8863290548324585,
      "learning_rate": 7.86506715448767e-06,
      "loss": 1.0347,
      "step": 243
    },
    {
      "epoch": 0.6321243523316062,
      "grad_norm": 0.6123449206352234,
      "learning_rate": 7.848148629399287e-06,
      "loss": 1.0001,
      "step": 244
    },
    {
      "epoch": 0.6347150259067358,
      "grad_norm": 0.9395535588264465,
      "learning_rate": 7.831181692443338e-06,
      "loss": 1.008,
      "step": 245
    },
    {
      "epoch": 0.6373056994818653,
      "grad_norm": 0.6824455261230469,
      "learning_rate": 7.814166632018083e-06,
      "loss": 1.0588,
      "step": 246
    },
    {
      "epoch": 0.6398963730569949,
      "grad_norm": 0.79184889793396,
      "learning_rate": 7.797103737339767e-06,
      "loss": 1.0284,
      "step": 247
    },
    {
      "epoch": 0.6424870466321243,
      "grad_norm": 0.8167986273765564,
      "learning_rate": 7.779993298437704e-06,
      "loss": 0.9941,
      "step": 248
    },
    {
      "epoch": 0.6450777202072538,
      "grad_norm": 0.8785682916641235,
      "learning_rate": 7.762835606149352e-06,
      "loss": 1.0335,
      "step": 249
    },
    {
      "epoch": 0.6476683937823834,
      "grad_norm": 0.6932119727134705,
      "learning_rate": 7.745630952115365e-06,
      "loss": 1.002,
      "step": 250
    },
    {
      "epoch": 0.6502590673575129,
      "grad_norm": 0.8167779445648193,
      "learning_rate": 7.728379628774632e-06,
      "loss": 0.9496,
      "step": 251
    },
    {
      "epoch": 0.6528497409326425,
      "grad_norm": 0.819342315196991,
      "learning_rate": 7.711081929359316e-06,
      "loss": 1.0228,
      "step": 252
    },
    {
      "epoch": 0.655440414507772,
      "grad_norm": 0.8480210900306702,
      "learning_rate": 7.693738147889868e-06,
      "loss": 1.0217,
      "step": 253
    },
    {
      "epoch": 0.6580310880829016,
      "grad_norm": 0.8472158908843994,
      "learning_rate": 7.67634857917002e-06,
      "loss": 1.0264,
      "step": 254
    },
    {
      "epoch": 0.6606217616580311,
      "grad_norm": 0.7778662443161011,
      "learning_rate": 7.658913518781782e-06,
      "loss": 0.9766,
      "step": 255
    },
    {
      "epoch": 0.6632124352331606,
      "grad_norm": 0.8037154078483582,
      "learning_rate": 7.641433263080418e-06,
      "loss": 1.0129,
      "step": 256
    },
    {
      "epoch": 0.6658031088082902,
      "grad_norm": 0.778190016746521,
      "learning_rate": 7.623908109189404e-06,
      "loss": 1.0401,
      "step": 257
    },
    {
      "epoch": 0.6683937823834197,
      "grad_norm": 1.0345908403396606,
      "learning_rate": 7.606338354995381e-06,
      "loss": 1.0601,
      "step": 258
    },
    {
      "epoch": 0.6709844559585493,
      "grad_norm": 0.7248339653015137,
      "learning_rate": 7.588724299143091e-06,
      "loss": 1.0098,
      "step": 259
    },
    {
      "epoch": 0.6735751295336787,
      "grad_norm": 0.6873196959495544,
      "learning_rate": 7.571066241030302e-06,
      "loss": 1.0574,
      "step": 260
    },
    {
      "epoch": 0.6761658031088082,
      "grad_norm": 0.7845073938369751,
      "learning_rate": 7.553364480802715e-06,
      "loss": 1.0041,
      "step": 261
    },
    {
      "epoch": 0.6787564766839378,
      "grad_norm": 0.7601717114448547,
      "learning_rate": 7.5356193193488655e-06,
      "loss": 0.9021,
      "step": 262
    },
    {
      "epoch": 0.6813471502590673,
      "grad_norm": 0.8694692850112915,
      "learning_rate": 7.517831058295013e-06,
      "loss": 1.0321,
      "step": 263
    },
    {
      "epoch": 0.6839378238341969,
      "grad_norm": 0.8670944571495056,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.9876,
      "step": 264
    },
    {
      "epoch": 0.6865284974093264,
      "grad_norm": 0.9318599700927734,
      "learning_rate": 7.4821264475501325e-06,
      "loss": 1.0528,
      "step": 265
    },
    {
      "epoch": 0.689119170984456,
      "grad_norm": 0.9383565187454224,
      "learning_rate": 7.464210704754009e-06,
      "loss": 0.9914,
      "step": 266
    },
    {
      "epoch": 0.6917098445595855,
      "grad_norm": 0.8553853034973145,
      "learning_rate": 7.446253076137372e-06,
      "loss": 1.0909,
      "step": 267
    },
    {
      "epoch": 0.694300518134715,
      "grad_norm": 0.8563231229782104,
      "learning_rate": 7.4282538669379186e-06,
      "loss": 1.0639,
      "step": 268
    },
    {
      "epoch": 0.6968911917098446,
      "grad_norm": 0.8212496042251587,
      "learning_rate": 7.410213383100126e-06,
      "loss": 0.9786,
      "step": 269
    },
    {
      "epoch": 0.6994818652849741,
      "grad_norm": 0.9418748617172241,
      "learning_rate": 7.3921319312700365e-06,
      "loss": 1.0663,
      "step": 270
    },
    {
      "epoch": 0.7020725388601037,
      "grad_norm": 0.7147770524024963,
      "learning_rate": 7.374009818790058e-06,
      "loss": 1.0187,
      "step": 271
    },
    {
      "epoch": 0.7046632124352331,
      "grad_norm": 0.8993127346038818,
      "learning_rate": 7.355847353693729e-06,
      "loss": 0.9813,
      "step": 272
    },
    {
      "epoch": 0.7072538860103627,
      "grad_norm": 0.8212840557098389,
      "learning_rate": 7.337644844700494e-06,
      "loss": 0.9602,
      "step": 273
    },
    {
      "epoch": 0.7098445595854922,
      "grad_norm": 0.8086667656898499,
      "learning_rate": 7.319402601210448e-06,
      "loss": 1.1018,
      "step": 274
    },
    {
      "epoch": 0.7124352331606217,
      "grad_norm": 0.8487491607666016,
      "learning_rate": 7.301120933299076e-06,
      "loss": 0.9453,
      "step": 275
    },
    {
      "epoch": 0.7150259067357513,
      "grad_norm": 1.0112347602844238,
      "learning_rate": 7.282800151711991e-06,
      "loss": 0.9753,
      "step": 276
    },
    {
      "epoch": 0.7176165803108808,
      "grad_norm": 0.7847304940223694,
      "learning_rate": 7.264440567859645e-06,
      "loss": 1.0121,
      "step": 277
    },
    {
      "epoch": 0.7202072538860104,
      "grad_norm": 0.7814239263534546,
      "learning_rate": 7.246042493812036e-06,
      "loss": 1.0152,
      "step": 278
    },
    {
      "epoch": 0.7227979274611399,
      "grad_norm": 0.9602167010307312,
      "learning_rate": 7.227606242293409e-06,
      "loss": 1.057,
      "step": 279
    },
    {
      "epoch": 0.7253886010362695,
      "grad_norm": 0.8330225944519043,
      "learning_rate": 7.209132126676934e-06,
      "loss": 1.0834,
      "step": 280
    },
    {
      "epoch": 0.727979274611399,
      "grad_norm": 0.8452211618423462,
      "learning_rate": 7.190620460979384e-06,
      "loss": 1.0382,
      "step": 281
    },
    {
      "epoch": 0.7305699481865285,
      "grad_norm": 0.7918412089347839,
      "learning_rate": 7.172071559855792e-06,
      "loss": 0.9387,
      "step": 282
    },
    {
      "epoch": 0.7331606217616581,
      "grad_norm": 0.8554849624633789,
      "learning_rate": 7.153485738594111e-06,
      "loss": 1.0589,
      "step": 283
    },
    {
      "epoch": 0.7357512953367875,
      "grad_norm": 0.906253457069397,
      "learning_rate": 7.134863313109847e-06,
      "loss": 1.001,
      "step": 284
    },
    {
      "epoch": 0.7383419689119171,
      "grad_norm": 0.9388474225997925,
      "learning_rate": 7.116204599940693e-06,
      "loss": 1.0046,
      "step": 285
    },
    {
      "epoch": 0.7409326424870466,
      "grad_norm": 0.8858338594436646,
      "learning_rate": 7.097509916241145e-06,
      "loss": 1.0284,
      "step": 286
    },
    {
      "epoch": 0.7435233160621761,
      "grad_norm": 0.8751007914543152,
      "learning_rate": 7.078779579777122e-06,
      "loss": 1.0375,
      "step": 287
    },
    {
      "epoch": 0.7461139896373057,
      "grad_norm": 0.9136319160461426,
      "learning_rate": 7.060013908920549e-06,
      "loss": 1.0254,
      "step": 288
    },
    {
      "epoch": 0.7487046632124352,
      "grad_norm": 0.8126935958862305,
      "learning_rate": 7.041213222643952e-06,
      "loss": 1.0091,
      "step": 289
    },
    {
      "epoch": 0.7512953367875648,
      "grad_norm": 0.8925020098686218,
      "learning_rate": 7.022377840515047e-06,
      "loss": 0.9436,
      "step": 290
    },
    {
      "epoch": 0.7538860103626943,
      "grad_norm": 0.7228291630744934,
      "learning_rate": 7.003508082691286e-06,
      "loss": 0.9353,
      "step": 291
    },
    {
      "epoch": 0.7538860103626943,
      "eval_loss": 0.9649054408073425,
      "eval_runtime": 99.1248,
      "eval_samples_per_second": 2.522,
      "eval_steps_per_second": 0.323,
      "step": 291
    },
    {
      "epoch": 0.7564766839378239,
      "grad_norm": 0.7875744104385376,
      "learning_rate": 6.984604269914437e-06,
      "loss": 1.0115,
      "step": 292
    },
    {
      "epoch": 0.7590673575129534,
      "grad_norm": 0.8201349377632141,
      "learning_rate": 6.965666723505118e-06,
      "loss": 1.0683,
      "step": 293
    },
    {
      "epoch": 0.7616580310880829,
      "grad_norm": 0.9053636193275452,
      "learning_rate": 6.94669576535734e-06,
      "loss": 0.9764,
      "step": 294
    },
    {
      "epoch": 0.7642487046632125,
      "grad_norm": 0.7280160784721375,
      "learning_rate": 6.927691717933038e-06,
      "loss": 1.0246,
      "step": 295
    },
    {
      "epoch": 0.7668393782383419,
      "grad_norm": 0.756608784198761,
      "learning_rate": 6.908654904256584e-06,
      "loss": 1.0057,
      "step": 296
    },
    {
      "epoch": 0.7694300518134715,
      "grad_norm": 0.8075028657913208,
      "learning_rate": 6.889585647909303e-06,
      "loss": 1.0235,
      "step": 297
    },
    {
      "epoch": 0.772020725388601,
      "grad_norm": 0.7520319223403931,
      "learning_rate": 6.870484273023967e-06,
      "loss": 1.007,
      "step": 298
    },
    {
      "epoch": 0.7746113989637305,
      "grad_norm": 0.8978586792945862,
      "learning_rate": 6.8513511042792895e-06,
      "loss": 0.9499,
      "step": 299
    },
    {
      "epoch": 0.7772020725388601,
      "grad_norm": 0.8076562881469727,
      "learning_rate": 6.832186466894402e-06,
      "loss": 0.9844,
      "step": 300
    },
    {
      "epoch": 0.7797927461139896,
      "grad_norm": 0.9009307026863098,
      "learning_rate": 6.812990686623335e-06,
      "loss": 1.0019,
      "step": 301
    },
    {
      "epoch": 0.7823834196891192,
      "grad_norm": 0.7848314046859741,
      "learning_rate": 6.793764089749473e-06,
      "loss": 1.0229,
      "step": 302
    },
    {
      "epoch": 0.7849740932642487,
      "grad_norm": 0.8111161589622498,
      "learning_rate": 6.7745070030800075e-06,
      "loss": 1.0046,
      "step": 303
    },
    {
      "epoch": 0.7875647668393783,
      "grad_norm": 0.8512140512466431,
      "learning_rate": 6.755219753940389e-06,
      "loss": 0.9784,
      "step": 304
    },
    {
      "epoch": 0.7901554404145078,
      "grad_norm": 0.7778370380401611,
      "learning_rate": 6.735902670168758e-06,
      "loss": 1.0444,
      "step": 305
    },
    {
      "epoch": 0.7927461139896373,
      "grad_norm": 0.8520141839981079,
      "learning_rate": 6.716556080110374e-06,
      "loss": 1.036,
      "step": 306
    },
    {
      "epoch": 0.7953367875647669,
      "grad_norm": 0.9404623508453369,
      "learning_rate": 6.6971803126120336e-06,
      "loss": 1.1203,
      "step": 307
    },
    {
      "epoch": 0.7979274611398963,
      "grad_norm": 0.9082056283950806,
      "learning_rate": 6.677775697016484e-06,
      "loss": 1.0798,
      "step": 308
    },
    {
      "epoch": 0.8005181347150259,
      "grad_norm": 0.7969542145729065,
      "learning_rate": 6.658342563156821e-06,
      "loss": 0.9632,
      "step": 309
    },
    {
      "epoch": 0.8031088082901554,
      "grad_norm": 0.7916974425315857,
      "learning_rate": 6.638881241350884e-06,
      "loss": 1.0198,
      "step": 310
    },
    {
      "epoch": 0.805699481865285,
      "grad_norm": 0.9639979600906372,
      "learning_rate": 6.619392062395643e-06,
      "loss": 1.0056,
      "step": 311
    },
    {
      "epoch": 0.8082901554404145,
      "grad_norm": 0.9673221707344055,
      "learning_rate": 6.599875357561572e-06,
      "loss": 0.9875,
      "step": 312
    },
    {
      "epoch": 0.810880829015544,
      "grad_norm": 1.0038220882415771,
      "learning_rate": 6.5803314585870225e-06,
      "loss": 1.1161,
      "step": 313
    },
    {
      "epoch": 0.8134715025906736,
      "grad_norm": 0.9063149094581604,
      "learning_rate": 6.560760697672583e-06,
      "loss": 0.9699,
      "step": 314
    },
    {
      "epoch": 0.8160621761658031,
      "grad_norm": 0.9444339871406555,
      "learning_rate": 6.541163407475433e-06,
      "loss": 1.0099,
      "step": 315
    },
    {
      "epoch": 0.8186528497409327,
      "grad_norm": 0.869704008102417,
      "learning_rate": 6.5215399211036815e-06,
      "loss": 1.0076,
      "step": 316
    },
    {
      "epoch": 0.8212435233160622,
      "grad_norm": 0.7657658457756042,
      "learning_rate": 6.50189057211072e-06,
      "loss": 1.0471,
      "step": 317
    },
    {
      "epoch": 0.8238341968911918,
      "grad_norm": 0.8859752416610718,
      "learning_rate": 6.4822156944895375e-06,
      "loss": 0.9323,
      "step": 318
    },
    {
      "epoch": 0.8264248704663213,
      "grad_norm": 0.7193804979324341,
      "learning_rate": 6.462515622667056e-06,
      "loss": 1.042,
      "step": 319
    },
    {
      "epoch": 0.8290155440414507,
      "grad_norm": 0.7999783754348755,
      "learning_rate": 6.442790691498433e-06,
      "loss": 0.967,
      "step": 320
    },
    {
      "epoch": 0.8316062176165803,
      "grad_norm": 0.8761709928512573,
      "learning_rate": 6.423041236261381e-06,
      "loss": 1.0732,
      "step": 321
    },
    {
      "epoch": 0.8341968911917098,
      "grad_norm": 1.0846220254898071,
      "learning_rate": 6.403267592650466e-06,
      "loss": 0.9741,
      "step": 322
    },
    {
      "epoch": 0.8367875647668394,
      "grad_norm": 0.6978609561920166,
      "learning_rate": 6.383470096771396e-06,
      "loss": 1.0009,
      "step": 323
    },
    {
      "epoch": 0.8393782383419689,
      "grad_norm": 0.8032934069633484,
      "learning_rate": 6.363649085135311e-06,
      "loss": 0.9546,
      "step": 324
    },
    {
      "epoch": 0.8419689119170984,
      "grad_norm": 1.0093727111816406,
      "learning_rate": 6.343804894653072e-06,
      "loss": 1.0592,
      "step": 325
    },
    {
      "epoch": 0.844559585492228,
      "grad_norm": 0.8561227917671204,
      "learning_rate": 6.323937862629513e-06,
      "loss": 1.0551,
      "step": 326
    },
    {
      "epoch": 0.8471502590673575,
      "grad_norm": 0.7876784801483154,
      "learning_rate": 6.304048326757735e-06,
      "loss": 1.0564,
      "step": 327
    },
    {
      "epoch": 0.8497409326424871,
      "grad_norm": 0.7641052603721619,
      "learning_rate": 6.2841366251133405e-06,
      "loss": 0.9427,
      "step": 328
    },
    {
      "epoch": 0.8523316062176166,
      "grad_norm": 0.8268167972564697,
      "learning_rate": 6.2642030961487046e-06,
      "loss": 1.0171,
      "step": 329
    },
    {
      "epoch": 0.8549222797927462,
      "grad_norm": 0.7793816328048706,
      "learning_rate": 6.244248078687213e-06,
      "loss": 0.9475,
      "step": 330
    },
    {
      "epoch": 0.8575129533678757,
      "grad_norm": 0.7105771899223328,
      "learning_rate": 6.224271911917508e-06,
      "loss": 1.0326,
      "step": 331
    },
    {
      "epoch": 0.8601036269430051,
      "grad_norm": 0.8429663777351379,
      "learning_rate": 6.204274935387716e-06,
      "loss": 1.0257,
      "step": 332
    },
    {
      "epoch": 0.8626943005181347,
      "grad_norm": 0.8630242347717285,
      "learning_rate": 6.184257488999688e-06,
      "loss": 1.0037,
      "step": 333
    },
    {
      "epoch": 0.8652849740932642,
      "grad_norm": 1.0064729452133179,
      "learning_rate": 6.164219913003208e-06,
      "loss": 1.0468,
      "step": 334
    },
    {
      "epoch": 0.8678756476683938,
      "grad_norm": 0.9719647765159607,
      "learning_rate": 6.14416254799022e-06,
      "loss": 1.0582,
      "step": 335
    },
    {
      "epoch": 0.8704663212435233,
      "grad_norm": 0.8877604007720947,
      "learning_rate": 6.124085734889034e-06,
      "loss": 0.9288,
      "step": 336
    },
    {
      "epoch": 0.8730569948186528,
      "grad_norm": 0.8161412477493286,
      "learning_rate": 6.1039898149585305e-06,
      "loss": 0.9929,
      "step": 337
    },
    {
      "epoch": 0.8756476683937824,
      "grad_norm": 0.7888549566268921,
      "learning_rate": 6.083875129782366e-06,
      "loss": 1.022,
      "step": 338
    },
    {
      "epoch": 0.8782383419689119,
      "grad_norm": 0.8845784664154053,
      "learning_rate": 6.063742021263157e-06,
      "loss": 0.9448,
      "step": 339
    },
    {
      "epoch": 0.8808290155440415,
      "grad_norm": 0.791270911693573,
      "learning_rate": 6.043590831616677e-06,
      "loss": 0.9783,
      "step": 340
    },
    {
      "epoch": 0.883419689119171,
      "grad_norm": 0.8304092288017273,
      "learning_rate": 6.023421903366034e-06,
      "loss": 1.0105,
      "step": 341
    },
    {
      "epoch": 0.8860103626943006,
      "grad_norm": 0.7290758490562439,
      "learning_rate": 6.003235579335851e-06,
      "loss": 1.011,
      "step": 342
    },
    {
      "epoch": 0.8886010362694301,
      "grad_norm": 0.9048756957054138,
      "learning_rate": 5.9830322026464435e-06,
      "loss": 1.0133,
      "step": 343
    },
    {
      "epoch": 0.8911917098445595,
      "grad_norm": 0.8403552770614624,
      "learning_rate": 5.962812116707977e-06,
      "loss": 1.0031,
      "step": 344
    },
    {
      "epoch": 0.8937823834196891,
      "grad_norm": 0.8173761963844299,
      "learning_rate": 5.942575665214634e-06,
      "loss": 1.007,
      "step": 345
    },
    {
      "epoch": 0.8963730569948186,
      "grad_norm": 0.8470476865768433,
      "learning_rate": 5.92232319213878e-06,
      "loss": 0.9703,
      "step": 346
    },
    {
      "epoch": 0.8989637305699482,
      "grad_norm": 0.816389262676239,
      "learning_rate": 5.902055041725105e-06,
      "loss": 0.9642,
      "step": 347
    },
    {
      "epoch": 0.9015544041450777,
      "grad_norm": 0.7632498741149902,
      "learning_rate": 5.8817715584847744e-06,
      "loss": 1.0025,
      "step": 348
    },
    {
      "epoch": 0.9041450777202072,
      "grad_norm": 0.8325628638267517,
      "learning_rate": 5.861473087189584e-06,
      "loss": 1.0194,
      "step": 349
    },
    {
      "epoch": 0.9067357512953368,
      "grad_norm": 0.921424150466919,
      "learning_rate": 5.841159972866085e-06,
      "loss": 0.9659,
      "step": 350
    },
    {
      "epoch": 0.9093264248704663,
      "grad_norm": 0.8112065196037292,
      "learning_rate": 5.820832560789727e-06,
      "loss": 1.0551,
      "step": 351
    },
    {
      "epoch": 0.9119170984455959,
      "grad_norm": 0.9384793043136597,
      "learning_rate": 5.800491196478989e-06,
      "loss": 0.9915,
      "step": 352
    },
    {
      "epoch": 0.9145077720207254,
      "grad_norm": 0.6935743689537048,
      "learning_rate": 5.780136225689505e-06,
      "loss": 1.0039,
      "step": 353
    },
    {
      "epoch": 0.917098445595855,
      "grad_norm": 0.7410438060760498,
      "learning_rate": 5.759767994408188e-06,
      "loss": 0.8865,
      "step": 354
    },
    {
      "epoch": 0.9196891191709845,
      "grad_norm": 0.6849777102470398,
      "learning_rate": 5.739386848847346e-06,
      "loss": 0.9646,
      "step": 355
    },
    {
      "epoch": 0.9222797927461139,
      "grad_norm": 0.8872363567352295,
      "learning_rate": 5.718993135438803e-06,
      "loss": 1.0218,
      "step": 356
    },
    {
      "epoch": 0.9248704663212435,
      "grad_norm": 0.8674810528755188,
      "learning_rate": 5.6985872008280045e-06,
      "loss": 1.0209,
      "step": 357
    },
    {
      "epoch": 0.927461139896373,
      "grad_norm": 0.9334598779678345,
      "learning_rate": 5.678169391868128e-06,
      "loss": 0.9841,
      "step": 358
    },
    {
      "epoch": 0.9300518134715026,
      "grad_norm": 0.7986560463905334,
      "learning_rate": 5.6577400556141906e-06,
      "loss": 0.9821,
      "step": 359
    },
    {
      "epoch": 0.9326424870466321,
      "grad_norm": 0.7242634296417236,
      "learning_rate": 5.637299539317141e-06,
      "loss": 1.0017,
      "step": 360
    },
    {
      "epoch": 0.9352331606217616,
      "grad_norm": 0.9153129458427429,
      "learning_rate": 5.616848190417965e-06,
      "loss": 0.9975,
      "step": 361
    },
    {
      "epoch": 0.9378238341968912,
      "grad_norm": 1.0346280336380005,
      "learning_rate": 5.596386356541779e-06,
      "loss": 1.0298,
      "step": 362
    },
    {
      "epoch": 0.9404145077720207,
      "grad_norm": 0.7707669734954834,
      "learning_rate": 5.575914385491917e-06,
      "loss": 0.9551,
      "step": 363
    },
    {
      "epoch": 0.9430051813471503,
      "grad_norm": 0.7947671413421631,
      "learning_rate": 5.555432625244024e-06,
      "loss": 0.9803,
      "step": 364
    },
    {
      "epoch": 0.9455958549222798,
      "grad_norm": 0.8810315132141113,
      "learning_rate": 5.534941423940135e-06,
      "loss": 0.9941,
      "step": 365
    },
    {
      "epoch": 0.9481865284974094,
      "grad_norm": 0.848483681678772,
|
"learning_rate": 5.51444112988276e-06, |
|
"loss": 1.0864, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9507772020725389, |
|
"grad_norm": 0.816373884677887, |
|
"learning_rate": 5.493932091528972e-06, |
|
"loss": 0.9821, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.9533678756476683, |
|
"grad_norm": 0.8828781843185425, |
|
"learning_rate": 5.473414657484468e-06, |
|
"loss": 0.9929, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9559585492227979, |
|
"grad_norm": 0.8907682299613953, |
|
"learning_rate": 5.452889176497659e-06, |
|
"loss": 0.9888, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9585492227979274, |
|
"grad_norm": 0.9210544228553772, |
|
"learning_rate": 5.432355997453729e-06, |
|
"loss": 0.9757, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.961139896373057, |
|
"grad_norm": 0.9528911709785461, |
|
"learning_rate": 5.4118154693687165e-06, |
|
"loss": 0.9014, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.9637305699481865, |
|
"grad_norm": 0.7835102677345276, |
|
"learning_rate": 5.391267941383572e-06, |
|
"loss": 0.9736, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.966321243523316, |
|
"grad_norm": 0.9772538542747498, |
|
"learning_rate": 5.3707137627582315e-06, |
|
"loss": 1.0071, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9689119170984456, |
|
"grad_norm": 0.8421518802642822, |
|
"learning_rate": 5.350153282865674e-06, |
|
"loss": 1.0501, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9715025906735751, |
|
"grad_norm": 0.7764169573783875, |
|
"learning_rate": 5.329586851185987e-06, |
|
"loss": 1.0491, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9740932642487047, |
|
"grad_norm": 0.7631876468658447, |
|
"learning_rate": 5.309014817300422e-06, |
|
"loss": 1.0018, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9766839378238342, |
|
"grad_norm": 0.816758394241333, |
|
"learning_rate": 5.2884375308854565e-06, |
|
"loss": 1.0032, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9792746113989638, |
|
"grad_norm": 0.7835342288017273, |
|
"learning_rate": 5.26785534170685e-06, |
|
"loss": 0.9686, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9818652849740933, |
|
"grad_norm": 0.8044567704200745, |
|
"learning_rate": 5.247268599613696e-06, |
|
"loss": 0.9695, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.9844559585492227, |
|
"grad_norm": 0.883346438407898, |
|
"learning_rate": 5.226677654532476e-06, |
|
"loss": 0.9709, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9870466321243523, |
|
"grad_norm": 0.9801338911056519, |
|
"learning_rate": 5.206082856461115e-06, |
|
"loss": 0.9887, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.9896373056994818, |
|
"grad_norm": 0.8825283646583557, |
|
"learning_rate": 5.185484555463026e-06, |
|
"loss": 0.9649, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9922279792746114, |
|
"grad_norm": 1.0225670337677002, |
|
"learning_rate": 5.16488310166117e-06, |
|
"loss": 1.0526, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.9948186528497409, |
|
"grad_norm": 1.0191495418548584, |
|
"learning_rate": 5.1442788452320915e-06, |
|
"loss": 0.9138, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9974093264248705, |
|
"grad_norm": 0.8653509616851807, |
|
"learning_rate": 5.123672136399975e-06, |
|
"loss": 0.9498, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.9232073426246643, |
|
"learning_rate": 5.1030633254306935e-06, |
|
"loss": 0.9828, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.0025906735751295, |
|
"grad_norm": 0.8782436847686768, |
|
"learning_rate": 5.082452762625848e-06, |
|
"loss": 1.0139, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.005181347150259, |
|
"grad_norm": 0.9029590487480164, |
|
"learning_rate": 5.061840798316815e-06, |
|
"loss": 1.0483, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.005181347150259, |
|
"eval_loss": 0.9548383951187134, |
|
"eval_runtime": 99.0137, |
|
"eval_samples_per_second": 2.525, |
|
"eval_steps_per_second": 0.323, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.0077720207253886, |
|
"grad_norm": 0.9202731847763062, |
|
"learning_rate": 5.041227782858799e-06, |
|
"loss": 1.0369, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.0103626943005182, |
|
"grad_norm": 0.8768073320388794, |
|
"learning_rate": 5.020614066624868e-06, |
|
"loss": 1.0371, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.0129533678756477, |
|
"grad_norm": 1.0436216592788696, |
|
"learning_rate": 5e-06, |
|
"loss": 1.0636, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.0155440414507773, |
|
"grad_norm": 0.8364196419715881, |
|
"learning_rate": 4.979385933375133e-06, |
|
"loss": 0.9556, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.0181347150259068, |
|
"grad_norm": 0.8371309041976929, |
|
"learning_rate": 4.958772217141203e-06, |
|
"loss": 0.9272, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.0207253886010363, |
|
"grad_norm": 0.7747754454612732, |
|
"learning_rate": 4.9381592016831856e-06, |
|
"loss": 1.0231, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.0233160621761659, |
|
"grad_norm": 0.7783029079437256, |
|
"learning_rate": 4.917547237374153e-06, |
|
"loss": 0.9855, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.0025906735751295, |
|
"grad_norm": 0.9176076650619507, |
|
"learning_rate": 4.896936674569309e-06, |
|
"loss": 1.0037, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.005181347150259, |
|
"grad_norm": 0.9758632183074951, |
|
"learning_rate": 4.876327863600026e-06, |
|
"loss": 0.987, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.0077720207253886, |
|
"grad_norm": 0.8266388177871704, |
|
"learning_rate": 4.85572115476791e-06, |
|
"loss": 1.0135, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.0103626943005182, |
|
"grad_norm": 0.8124467730522156, |
|
"learning_rate": 4.83511689833883e-06, |
|
"loss": 0.9964, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.0129533678756477, |
|
"grad_norm": 1.061855673789978, |
|
"learning_rate": 4.814515444536975e-06, |
|
"loss": 1.0406, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0155440414507773, |
|
"grad_norm": 0.89823317527771, |
|
"learning_rate": 4.793917143538887e-06, |
|
"loss": 1.0009, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.0181347150259068, |
|
"grad_norm": 0.7810267806053162, |
|
"learning_rate": 4.773322345467525e-06, |
|
"loss": 1.0269, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.0207253886010363, |
|
"grad_norm": 1.0242594480514526, |
|
"learning_rate": 4.752731400386306e-06, |
|
"loss": 0.9579, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.0233160621761659, |
|
"grad_norm": 0.8448682427406311, |
|
"learning_rate": 4.732144658293151e-06, |
|
"loss": 1.0156, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.0259067357512954, |
|
"grad_norm": 1.009443998336792, |
|
"learning_rate": 4.711562469114544e-06, |
|
"loss": 1.0358, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.028497409326425, |
|
"grad_norm": 0.9292247891426086, |
|
"learning_rate": 4.690985182699581e-06, |
|
"loss": 0.9328, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.0310880829015545, |
|
"grad_norm": 0.7662960886955261, |
|
"learning_rate": 4.670413148814015e-06, |
|
"loss": 0.9747, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.0336787564766838, |
|
"grad_norm": 0.8195393681526184, |
|
"learning_rate": 4.649846717134327e-06, |
|
"loss": 0.9467, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.0362694300518134, |
|
"grad_norm": 0.8635261058807373, |
|
"learning_rate": 4.62928623724177e-06, |
|
"loss": 0.9653, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.038860103626943, |
|
"grad_norm": 0.929542064666748, |
|
"learning_rate": 4.6087320586164296e-06, |
|
"loss": 0.9214, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.0414507772020725, |
|
"grad_norm": 0.8770561814308167, |
|
"learning_rate": 4.588184530631284e-06, |
|
"loss": 0.9743, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.044041450777202, |
|
"grad_norm": 0.7577627301216125, |
|
"learning_rate": 4.567644002546273e-06, |
|
"loss": 1.016, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.0466321243523315, |
|
"grad_norm": 0.888802707195282, |
|
"learning_rate": 4.547110823502343e-06, |
|
"loss": 0.9667, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.049222797927461, |
|
"grad_norm": 0.7431952357292175, |
|
"learning_rate": 4.526585342515533e-06, |
|
"loss": 1.0157, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.0518134715025906, |
|
"grad_norm": 0.9357666969299316, |
|
"learning_rate": 4.506067908471029e-06, |
|
"loss": 1.0433, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.0544041450777202, |
|
"grad_norm": 0.9243563413619995, |
|
"learning_rate": 4.485558870117241e-06, |
|
"loss": 0.9251, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.0569948186528497, |
|
"grad_norm": 0.9315341711044312, |
|
"learning_rate": 4.465058576059868e-06, |
|
"loss": 0.9785, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.0595854922279793, |
|
"grad_norm": 1.0324779748916626, |
|
"learning_rate": 4.444567374755978e-06, |
|
"loss": 0.994, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.0621761658031088, |
|
"grad_norm": 0.784785270690918, |
|
"learning_rate": 4.424085614508084e-06, |
|
"loss": 1.0312, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.0647668393782384, |
|
"grad_norm": 0.9518358111381531, |
|
"learning_rate": 4.403613643458222e-06, |
|
"loss": 1.0396, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.067357512953368, |
|
"grad_norm": 1.0345162153244019, |
|
"learning_rate": 4.383151809582035e-06, |
|
"loss": 1.0877, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.0699481865284974, |
|
"grad_norm": 0.8149455189704895, |
|
"learning_rate": 4.362700460682861e-06, |
|
"loss": 0.9827, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.072538860103627, |
|
"grad_norm": 0.8805851936340332, |
|
"learning_rate": 4.342259944385811e-06, |
|
"loss": 0.9838, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.0751295336787565, |
|
"grad_norm": 0.8283563852310181, |
|
"learning_rate": 4.321830608131872e-06, |
|
"loss": 1.0128, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.077720207253886, |
|
"grad_norm": 1.1321141719818115, |
|
"learning_rate": 4.301412799171998e-06, |
|
"loss": 0.8987, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.0803108808290156, |
|
"grad_norm": 0.9323210120201111, |
|
"learning_rate": 4.281006864561199e-06, |
|
"loss": 0.9618, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.0829015544041452, |
|
"grad_norm": 0.7723137140274048, |
|
"learning_rate": 4.260613151152655e-06, |
|
"loss": 0.9636, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.0854922279792747, |
|
"grad_norm": 0.9840835928916931, |
|
"learning_rate": 4.240232005591816e-06, |
|
"loss": 0.9963, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.0880829015544042, |
|
"grad_norm": 0.8220058679580688, |
|
"learning_rate": 4.219863774310497e-06, |
|
"loss": 0.9901, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.0906735751295338, |
|
"grad_norm": 0.9598246216773987, |
|
"learning_rate": 4.1995088035210126e-06, |
|
"loss": 0.9561, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.093264248704663, |
|
"grad_norm": 0.7539969682693481, |
|
"learning_rate": 4.179167439210275e-06, |
|
"loss": 0.9904, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.0958549222797926, |
|
"grad_norm": 0.8430355191230774, |
|
"learning_rate": 4.158840027133917e-06, |
|
"loss": 1.0678, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.0984455958549222, |
|
"grad_norm": 0.9641113877296448, |
|
"learning_rate": 4.138526912810418e-06, |
|
"loss": 0.9488, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.1010362694300517, |
|
"grad_norm": 0.8497964143753052, |
|
"learning_rate": 4.1182284415152255e-06, |
|
"loss": 1.0023, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.1036269430051813, |
|
"grad_norm": 0.8309699892997742, |
|
"learning_rate": 4.097944958274898e-06, |
|
"loss": 0.9936, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.1062176165803108, |
|
"grad_norm": 1.0693992376327515, |
|
"learning_rate": 4.077676807861221e-06, |
|
"loss": 0.9811, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.1088082901554404, |
|
"grad_norm": 0.8500522375106812, |
|
"learning_rate": 4.057424334785366e-06, |
|
"loss": 0.9753, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.11139896373057, |
|
"grad_norm": 0.7945443391799927, |
|
"learning_rate": 4.037187883292027e-06, |
|
"loss": 0.9833, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.1139896373056994, |
|
"grad_norm": 0.7603841423988342, |
|
"learning_rate": 4.016967797353558e-06, |
|
"loss": 0.9531, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.116580310880829, |
|
"grad_norm": 0.9688854217529297, |
|
"learning_rate": 3.996764420664149e-06, |
|
"loss": 0.9785, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.1191709844559585, |
|
"grad_norm": 1.072771668434143, |
|
"learning_rate": 3.976578096633969e-06, |
|
"loss": 0.9511, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.121761658031088, |
|
"grad_norm": 0.8664917349815369, |
|
"learning_rate": 3.956409168383325e-06, |
|
"loss": 0.9434, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.1243523316062176, |
|
"grad_norm": 0.8430099487304688, |
|
"learning_rate": 3.936257978736845e-06, |
|
"loss": 0.9397, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.1269430051813472, |
|
"grad_norm": 0.9500722885131836, |
|
"learning_rate": 3.916124870217635e-06, |
|
"loss": 1.0198, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.1295336787564767, |
|
"grad_norm": 1.0132358074188232, |
|
"learning_rate": 3.89601018504147e-06, |
|
"loss": 1.0048, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.1321243523316062, |
|
"grad_norm": 0.8455902338027954, |
|
"learning_rate": 3.875914265110967e-06, |
|
"loss": 0.9248, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.1347150259067358, |
|
"grad_norm": 0.8625523447990417, |
|
"learning_rate": 3.85583745200978e-06, |
|
"loss": 1.0251, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.1373056994818653, |
|
"grad_norm": 0.9898774027824402, |
|
"learning_rate": 3.835780086996794e-06, |
|
"loss": 0.998, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.1398963730569949, |
|
"grad_norm": 1.021031141281128, |
|
"learning_rate": 3.815742511000313e-06, |
|
"loss": 1.0321, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.1424870466321244, |
|
"grad_norm": 0.8724047541618347, |
|
"learning_rate": 3.7957250646122843e-06, |
|
"loss": 0.9535, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.145077720207254, |
|
"grad_norm": 0.820254921913147, |
|
"learning_rate": 3.7757280880824946e-06, |
|
"loss": 0.9489, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.1476683937823835, |
|
"grad_norm": 1.034761667251587, |
|
"learning_rate": 3.755751921312788e-06, |
|
"loss": 0.9903, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.150259067357513, |
|
"grad_norm": 0.8738400936126709, |
|
"learning_rate": 3.735796903851297e-06, |
|
"loss": 0.9504, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.1528497409326426, |
|
"grad_norm": 0.8705572485923767, |
|
"learning_rate": 3.715863374886661e-06, |
|
"loss": 0.9571, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.1554404145077721, |
|
"grad_norm": 0.8942792415618896, |
|
"learning_rate": 3.695951673242267e-06, |
|
"loss": 0.932, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.1580310880829017, |
|
"grad_norm": 0.9116060733795166, |
|
"learning_rate": 3.6760621373704867e-06, |
|
"loss": 0.9602, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.160621761658031, |
|
"grad_norm": 0.9232982993125916, |
|
"learning_rate": 3.6561951053469313e-06, |
|
"loss": 0.9944, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.1632124352331605, |
|
"grad_norm": 0.6849638223648071, |
|
"learning_rate": 3.636350914864689e-06, |
|
"loss": 1.014, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.16580310880829, |
|
"grad_norm": 1.0841094255447388, |
|
"learning_rate": 3.6165299032286055e-06, |
|
"loss": 0.8914, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.1683937823834196, |
|
"grad_norm": 0.8563059568405151, |
|
"learning_rate": 3.5967324073495363e-06, |
|
"loss": 0.972, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.1709844559585492, |
|
"grad_norm": 1.0777771472930908, |
|
"learning_rate": 3.5769587637386206e-06, |
|
"loss": 1.0476, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.1735751295336787, |
|
"grad_norm": 0.790111780166626, |
|
"learning_rate": 3.5572093085015683e-06, |
|
"loss": 0.959, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.1761658031088082, |
|
"grad_norm": 0.8875803351402283, |
|
"learning_rate": 3.537484377332945e-06, |
|
"loss": 1.0109, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.1787564766839378, |
|
"grad_norm": 0.932518720626831, |
|
"learning_rate": 3.5177843055104633e-06, |
|
"loss": 1.0012, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.1813471502590673, |
|
"grad_norm": 0.776736855506897, |
|
"learning_rate": 3.4981094278892813e-06, |
|
"loss": 0.9852, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.1839378238341969, |
|
"grad_norm": 0.8646918535232544, |
|
"learning_rate": 3.4784600788963197e-06, |
|
"loss": 1.0376, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.1865284974093264, |
|
"grad_norm": 0.9904392957687378, |
|
"learning_rate": 3.458836592524569e-06, |
|
"loss": 0.9692, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.189119170984456, |
|
"grad_norm": 0.8158794641494751, |
|
"learning_rate": 3.4392393023274173e-06, |
|
"loss": 1.0176, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.1917098445595855, |
|
"grad_norm": 0.8880725502967834, |
|
"learning_rate": 3.419668541412977e-06, |
|
"loss": 0.9946, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.194300518134715, |
|
"grad_norm": 1.0934456586837769, |
|
"learning_rate": 3.4001246424384294e-06, |
|
"loss": 0.9071, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.1968911917098446, |
|
"grad_norm": 0.9213566780090332, |
|
"learning_rate": 3.380607937604358e-06, |
|
"loss": 0.9489, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.1994818652849741, |
|
"grad_norm": 0.8582881689071655, |
|
"learning_rate": 3.361118758649116e-06, |
|
"loss": 0.9756, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.2020725388601037, |
|
"grad_norm": 0.9744678735733032, |
|
"learning_rate": 3.341657436843181e-06, |
|
"loss": 1.0436, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.2046632124352332, |
|
"grad_norm": 0.9584829807281494, |
|
"learning_rate": 3.322224302983517e-06, |
|
"loss": 1.0492, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.2072538860103628, |
|
"grad_norm": 0.8227071166038513, |
|
"learning_rate": 3.302819687387967e-06, |
|
"loss": 0.9419, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.2098445595854923, |
|
"grad_norm": 1.1721601486206055, |
|
"learning_rate": 3.2834439198896285e-06, |
|
"loss": 1.0105, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.2124352331606219, |
|
"grad_norm": 0.96018385887146, |
|
"learning_rate": 3.264097329831244e-06, |
|
"loss": 1.0335, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.2150259067357512, |
|
"grad_norm": 0.8224446177482605, |
|
"learning_rate": 3.2447802460596124e-06, |
|
"loss": 0.9914, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.2176165803108807, |
|
"grad_norm": 0.9198357462882996, |
|
"learning_rate": 3.2254929969199933e-06, |
|
"loss": 0.957, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.2202072538860103, |
|
"grad_norm": 1.0417511463165283, |
|
"learning_rate": 3.206235910250529e-06, |
|
"loss": 1.1192, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.2227979274611398, |
|
"grad_norm": 0.9127503037452698, |
|
"learning_rate": 3.1870093133766653e-06, |
|
"loss": 0.9585, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.2253886010362693, |
|
"grad_norm": 0.8656659722328186, |
|
"learning_rate": 3.167813533105598e-06, |
|
"loss": 0.9106, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.2279792746113989, |
|
"grad_norm": 0.9434940218925476, |
|
"learning_rate": 3.148648895720714e-06, |
|
"loss": 0.9955, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.2305699481865284, |
|
"grad_norm": 0.8094484806060791, |
|
"learning_rate": 3.1295157269760347e-06, |
|
"loss": 1.007, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.233160621761658, |
|
"grad_norm": 1.175736904144287, |
|
"learning_rate": 3.1104143520906976e-06, |
|
"loss": 0.9983, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.233160621761658, |
|
"eval_loss": 0.9479498267173767, |
|
"eval_runtime": 99.1167, |
|
"eval_samples_per_second": 2.522, |
|
"eval_steps_per_second": 0.323, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.2357512953367875, |
|
"grad_norm": 0.872238278388977, |
|
"learning_rate": 3.0913450957434177e-06, |
|
"loss": 1.0085, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.238341968911917, |
|
"grad_norm": 0.8121694326400757, |
|
"learning_rate": 3.0723082820669634e-06, |
|
"loss": 0.9683, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.2409326424870466, |
|
"grad_norm": 0.8866750597953796, |
|
"learning_rate": 3.0533042346426612e-06, |
|
"loss": 0.9324, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.2435233160621761, |
|
"grad_norm": 0.8889084458351135, |
|
"learning_rate": 3.034333276494884e-06, |
|
"loss": 0.9622, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.2461139896373057, |
|
"grad_norm": 0.9424455761909485, |
|
"learning_rate": 3.015395730085565e-06, |
|
"loss": 0.9475, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.2487046632124352, |
|
"grad_norm": 0.8178656101226807, |
|
"learning_rate": 2.9964919173087154e-06, |
|
"loss": 0.9323, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.2512953367875648, |
|
"grad_norm": 0.8948029279708862, |
|
"learning_rate": 2.9776221594849565e-06, |
|
"loss": 0.9929, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.2538860103626943, |
|
"grad_norm": 0.8116942048072815, |
|
"learning_rate": 2.9587867773560488e-06, |
|
"loss": 1.0207, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.2564766839378239, |
|
"grad_norm": 0.8729721903800964, |
|
"learning_rate": 2.9399860910794532e-06, |
|
"loss": 1.0275, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.2590673575129534, |
|
"grad_norm": 0.977170467376709, |
|
"learning_rate": 2.921220420222878e-06, |
|
"loss": 1.0432, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.261658031088083, |
|
"grad_norm": 0.8280233144760132, |
|
"learning_rate": 2.902490083758856e-06, |
|
"loss": 0.9642, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.2642487046632125, |
|
"grad_norm": 0.9376110434532166, |
|
"learning_rate": 2.8837954000593106e-06, |
|
"loss": 0.9649, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.266839378238342, |
|
"grad_norm": 0.7053350210189819, |
|
"learning_rate": 2.8651366868901543e-06, |
|
"loss": 0.9851, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.2694300518134716, |
|
"grad_norm": 0.8334206938743591, |
|
"learning_rate": 2.8465142614058916e-06, |
|
"loss": 0.9808, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.2720207253886011, |
|
"grad_norm": 0.8525857329368591, |
|
"learning_rate": 2.8279284401442085e-06, |
|
"loss": 0.9964, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2746113989637307, |
|
"grad_norm": 0.8159149289131165, |
|
"learning_rate": 2.809379539020618e-06, |
|
"loss": 1.0053, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.2772020725388602, |
|
"grad_norm": 0.9699737429618835, |
|
"learning_rate": 2.790867873323067e-06, |
|
"loss": 0.9218, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.2797927461139897, |
|
"grad_norm": 0.8307886123657227, |
|
"learning_rate": 2.7723937577065924e-06, |
|
"loss": 0.9667, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.2823834196891193, |
|
"grad_norm": 0.9945038557052612, |
|
"learning_rate": 2.753957506187964e-06, |
|
"loss": 1.0314, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.2849740932642488, |
|
"grad_norm": 1.1403707265853882, |
|
"learning_rate": 2.735559432140358e-06, |
|
"loss": 0.9839, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.2875647668393784, |
|
"grad_norm": 1.1950939893722534, |
|
"learning_rate": 2.7171998482880093e-06, |
|
"loss": 1.0106, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.2901554404145077, |
|
"grad_norm": 0.9658251404762268, |
|
"learning_rate": 2.6988790667009246e-06, |
|
"loss": 0.9977, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.2927461139896372, |
|
"grad_norm": 0.8657079935073853, |
|
"learning_rate": 2.680597398789554e-06, |
|
"loss": 0.9296, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.2953367875647668, |
|
"grad_norm": 0.9752938151359558, |
|
"learning_rate": 2.6623551552995076e-06, |
|
"loss": 1.0082, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.2979274611398963, |
|
"grad_norm": 0.9197008609771729, |
|
"learning_rate": 2.6441526463062727e-06, |
|
"loss": 0.9655, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.3005181347150259, |
|
"grad_norm": 1.0991517305374146, |
|
"learning_rate": 2.6259901812099432e-06, |
|
"loss": 0.8592, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.3031088082901554, |
|
"grad_norm": 0.9919476509094238, |
|
"learning_rate": 2.607868068729966e-06, |
|
"loss": 1.014, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.305699481865285, |
|
"grad_norm": 0.8860031366348267, |
|
"learning_rate": 2.5897866168998754e-06, |
|
"loss": 0.9493, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.3082901554404145, |
|
"grad_norm": 0.9551827907562256, |
|
"learning_rate": 2.571746133062082e-06, |
|
"loss": 0.9784, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.310880829015544, |
|
"grad_norm": 0.9395473003387451, |
|
"learning_rate": 2.5537469238626296e-06, |
|
"loss": 0.9847, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.3134715025906736, |
|
"grad_norm": 0.9098738431930542, |
|
"learning_rate": 2.5357892952459917e-06, |
|
"loss": 0.9757, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.3160621761658031, |
|
"grad_norm": 1.0057542324066162, |
|
"learning_rate": 2.517873552449869e-06, |
|
"loss": 1.0406, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.3186528497409327, |
|
"grad_norm": 0.9436793327331543, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": 0.9948, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.3212435233160622, |
|
"grad_norm": 0.9765656590461731, |
|
"learning_rate": 2.4821689417049898e-06, |
|
"loss": 0.9242, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.3238341968911918, |
|
"grad_norm": 0.9807612895965576, |
|
"learning_rate": 2.4643806806511344e-06, |
|
"loss": 0.979, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.3264248704663213, |
|
"grad_norm": 1.051147699356079, |
|
"learning_rate": 2.4466355191972886e-06, |
|
"loss": 0.905, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.3290155440414508, |
|
"grad_norm": 0.975245475769043, |
|
"learning_rate": 2.4289337589697e-06, |
|
"loss": 1.0436, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.3316062176165804, |
|
"grad_norm": 1.053765892982483, |
|
"learning_rate": 2.4112757008569102e-06, |
|
"loss": 1.0155, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.33419689119171, |
|
"grad_norm": 0.7797523736953735, |
|
"learning_rate": 2.3936616450046207e-06, |
|
"loss": 1.0052, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.3367875647668392, |
|
"grad_norm": 0.8835865259170532, |
|
"learning_rate": 2.376091890810598e-06, |
|
"loss": 0.9177, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.3393782383419688, |
|
"grad_norm": 0.8106747269630432, |
|
"learning_rate": 2.3585667369195815e-06, |
|
"loss": 0.9525, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.3419689119170983, |
|
"grad_norm": 1.0181241035461426, |
|
"learning_rate": 2.341086481218217e-06, |
|
"loss": 0.9024, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.3445595854922279, |
|
"grad_norm": 0.9752720594406128, |
|
"learning_rate": 2.32365142082998e-06, |
|
"loss": 1.0024, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.3471502590673574, |
|
"grad_norm": 0.8057308793067932, |
|
"learning_rate": 2.306261852110132e-06, |
|
"loss": 0.9294, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.349740932642487, |
|
"grad_norm": 1.02779221534729, |
|
"learning_rate": 2.288918070640684e-06, |
|
"loss": 0.9539, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.3523316062176165, |
|
"grad_norm": 0.8755698800086975, |
|
"learning_rate": 2.2716203712253708e-06, |
|
"loss": 1.0245, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.354922279792746, |
|
"grad_norm": 0.8415625691413879, |
|
"learning_rate": 2.254369047884639e-06, |
|
"loss": 0.9994, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.3575129533678756, |
|
"grad_norm": 1.01762056350708, |
|
"learning_rate": 2.2371643938506488e-06, |
|
"loss": 0.9763, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.3601036269430051, |
|
"grad_norm": 1.0196937322616577, |
|
"learning_rate": 2.2200067015622986e-06, |
|
"loss": 0.9745, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.3626943005181347, |
|
"grad_norm": 1.0955042839050293, |
|
"learning_rate": 2.2028962626602346e-06, |
|
"loss": 0.9457, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.3652849740932642, |
|
"grad_norm": 0.8075233697891235, |
|
"learning_rate": 2.185833367981918e-06, |
|
"loss": 0.9042, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.3678756476683938, |
|
"grad_norm": 0.8643060922622681, |
|
"learning_rate": 2.168818307556663e-06, |
|
"loss": 0.9541, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.3704663212435233, |
|
"grad_norm": 0.7847241759300232, |
|
"learning_rate": 2.1518513706007154e-06, |
|
"loss": 0.9874, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.3730569948186528, |
|
"grad_norm": 0.8477650880813599, |
|
"learning_rate": 2.13493284551233e-06, |
|
"loss": 1.0541, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.3756476683937824, |
|
"grad_norm": 0.993405818939209, |
|
"learning_rate": 2.118063019866884e-06, |
|
"loss": 1.0024, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.378238341968912, |
|
"grad_norm": 0.9656562805175781, |
|
"learning_rate": 2.101242180411963e-06, |
|
"loss": 0.9929, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.3808290155440415, |
|
"grad_norm": 0.88680100440979, |
|
"learning_rate": 2.0844706130625146e-06, |
|
"loss": 1.0225, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.383419689119171, |
|
"grad_norm": 0.9172679781913757, |
|
"learning_rate": 2.067748602895969e-06, |
|
"loss": 0.9504, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.3860103626943006, |
|
"grad_norm": 0.9063828587532043, |
|
"learning_rate": 2.0510764341474032e-06, |
|
"loss": 0.9287, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.38860103626943, |
|
"grad_norm": 0.9309386014938354, |
|
"learning_rate": 2.0344543902047043e-06, |
|
"loss": 1.015, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.3911917098445596, |
|
"grad_norm": 0.9050936102867126, |
|
"learning_rate": 2.0178827536037547e-06, |
|
"loss": 0.9952, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.3937823834196892, |
|
"grad_norm": 1.3205598592758179, |
|
"learning_rate": 2.001361806023636e-06, |
|
"loss": 0.911, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.3963730569948187, |
|
"grad_norm": 0.8794770240783691, |
|
"learning_rate": 1.9848918282818242e-06, |
|
"loss": 0.9863, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.3989637305699483, |
|
"grad_norm": 0.9719308614730835, |
|
"learning_rate": 1.968473100329437e-06, |
|
"loss": 0.9862, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.4015544041450778, |
|
"grad_norm": 0.8657988905906677, |
|
"learning_rate": 1.952105901246461e-06, |
|
"loss": 1.0129, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.4041450777202074, |
|
"grad_norm": 1.1228150129318237, |
|
"learning_rate": 1.935790509237013e-06, |
|
"loss": 0.9855, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.406735751295337, |
|
"grad_norm": 0.838809072971344, |
|
"learning_rate": 1.9195272016246105e-06, |
|
"loss": 0.9796, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.4093264248704664, |
|
"grad_norm": 0.9168699383735657, |
|
"learning_rate": 1.9033162548474577e-06, |
|
"loss": 0.9694, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.411917098445596, |
|
"grad_norm": 0.9007121324539185, |
|
"learning_rate": 1.887157944453749e-06, |
|
"loss": 0.9737, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.4145077720207253, |
|
"grad_norm": 0.8796411156654358, |
|
"learning_rate": 1.8710525450969803e-06, |
|
"loss": 0.9293, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.4170984455958548, |
|
"grad_norm": 0.9988073110580444, |
|
"learning_rate": 1.855000330531289e-06, |
|
"loss": 0.9632, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.4196891191709844, |
|
"grad_norm": 0.9262337684631348, |
|
"learning_rate": 1.8390015736067869e-06, |
|
"loss": 0.9679, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.422279792746114, |
|
"grad_norm": 0.9995079040527344, |
|
"learning_rate": 1.823056546264939e-06, |
|
"loss": 0.9558, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.4248704663212435, |
|
"grad_norm": 0.855469286441803, |
|
"learning_rate": 1.8071655195339272e-06, |
|
"loss": 1.0535, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.427461139896373, |
|
"grad_norm": 0.760111927986145, |
|
"learning_rate": 1.7913287635240573e-06, |
|
"loss": 0.9714, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.4300518134715026, |
|
"grad_norm": 0.9862337708473206, |
|
"learning_rate": 1.7755465474231465e-06, |
|
"loss": 1.0004, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.432642487046632, |
|
"grad_norm": 0.7655313611030579, |
|
"learning_rate": 1.7598191394919738e-06, |
|
"loss": 0.9812, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.4352331606217616, |
|
"grad_norm": 1.0747240781784058, |
|
"learning_rate": 1.7441468070597017e-06, |
|
"loss": 1.0208, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.4378238341968912, |
|
"grad_norm": 0.9578352570533752, |
|
"learning_rate": 1.7285298165193388e-06, |
|
"loss": 0.9637, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.4404145077720207, |
|
"grad_norm": 0.9221131801605225, |
|
"learning_rate": 1.7129684333232095e-06, |
|
"loss": 0.952, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.4430051813471503, |
|
"grad_norm": 1.1166445016860962, |
|
"learning_rate": 1.697462921978446e-06, |
|
"loss": 1.0394, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.4455958549222798, |
|
"grad_norm": 0.8609073758125305, |
|
"learning_rate": 1.682013546042488e-06, |
|
"loss": 1.0053, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.4481865284974094, |
|
"grad_norm": 0.8035356998443604, |
|
"learning_rate": 1.6666205681186032e-06, |
|
"loss": 1.0051, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.450777202072539, |
|
"grad_norm": 0.8815962076187134, |
|
"learning_rate": 1.6512842498514315e-06, |
|
"loss": 0.9821, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.4533678756476685, |
|
"grad_norm": 0.9989993572235107, |
|
"learning_rate": 1.6360048519225197e-06, |
|
"loss": 0.9804, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.455958549222798, |
|
"grad_norm": 0.9338482618331909, |
|
"learning_rate": 1.6207826340459131e-06, |
|
"loss": 0.9514, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.4585492227979275, |
|
"grad_norm": 0.7817990183830261, |
|
"learning_rate": 1.6056178549637248e-06, |
|
"loss": 1.0398, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.4611398963730569, |
|
"grad_norm": 0.9715204834938049, |
|
"learning_rate": 1.5905107724417412e-06, |
|
"loss": 0.9859, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.4637305699481864, |
|
"grad_norm": 0.8073526620864868, |
|
"learning_rate": 1.5754616432650443e-06, |
|
"loss": 0.9364, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.466321243523316, |
|
"grad_norm": 0.908467710018158, |
|
"learning_rate": 1.5604707232336457e-06, |
|
"loss": 0.97, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.4689119170984455, |
|
"grad_norm": 0.7307873964309692, |
|
"learning_rate": 1.5455382671581365e-06, |
|
"loss": 0.9081, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.471502590673575, |
|
"grad_norm": 0.9267300367355347, |
|
"learning_rate": 1.5306645288553556e-06, |
|
"loss": 0.9719, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.4740932642487046, |
|
"grad_norm": 1.200758695602417, |
|
"learning_rate": 1.5158497611440792e-06, |
|
"loss": 1.0408, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.4766839378238341, |
|
"grad_norm": 0.8580562472343445, |
|
"learning_rate": 1.5010942158407204e-06, |
|
"loss": 0.8879, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.4792746113989637, |
|
"grad_norm": 0.7956061959266663, |
|
"learning_rate": 1.48639814375505e-06, |
|
"loss": 1.0052, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.4818652849740932, |
|
"grad_norm": 1.047351360321045, |
|
"learning_rate": 1.4717617946859319e-06, |
|
"loss": 1.0048, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.4844559585492227, |
|
"grad_norm": 0.9025893211364746, |
|
"learning_rate": 1.4571854174170847e-06, |
|
"loss": 0.9976, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.4844559585492227, |
|
"eval_loss": 0.9445139765739441, |
|
"eval_runtime": 99.1624, |
|
"eval_samples_per_second": 2.521, |
|
"eval_steps_per_second": 0.323, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.4870466321243523, |
|
"grad_norm": 0.8686254620552063, |
|
"learning_rate": 1.4426692597128339e-06, |
|
"loss": 0.9141, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.4896373056994818, |
|
"grad_norm": 0.8144710659980774, |
|
"learning_rate": 1.428213568313927e-06, |
|
"loss": 0.9429, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.4922279792746114, |
|
"grad_norm": 0.8153760433197021, |
|
"learning_rate": 1.4138185889333172e-06, |
|
"loss": 0.9564, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.494818652849741, |
|
"grad_norm": 0.8345690369606018, |
|
"learning_rate": 1.3994845662519985e-06, |
|
"loss": 0.9794, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.4974093264248705, |
|
"grad_norm": 1.0408412218093872, |
|
"learning_rate": 1.3852117439148416e-06, |
|
"loss": 0.9961, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.9831898212432861, |
|
"learning_rate": 1.3710003645264559e-06, |
|
"loss": 1.0319, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.5025906735751295, |
|
"grad_norm": 0.9798739552497864, |
|
"learning_rate": 1.3568506696470645e-06, |
|
"loss": 1.0054, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.505181347150259, |
|
"grad_norm": 0.741041898727417, |
|
"learning_rate": 1.3427628997883957e-06, |
|
"loss": 0.9952, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.5077720207253886, |
|
"grad_norm": 0.7825758457183838, |
|
"learning_rate": 1.3287372944096021e-06, |
|
"loss": 0.993, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.5103626943005182, |
|
"grad_norm": 0.8112067580223083, |
|
"learning_rate": 1.3147740919131814e-06, |
|
"loss": 1.0047, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.5129533678756477, |
|
"grad_norm": 0.8506227135658264, |
|
"learning_rate": 1.3008735296409281e-06, |
|
"loss": 0.9148, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.5155440414507773, |
|
"grad_norm": 0.8730190396308899, |
|
"learning_rate": 1.2870358438699005e-06, |
|
"loss": 0.9805, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.5181347150259068, |
|
"grad_norm": 0.8171835541725159, |
|
"learning_rate": 1.2732612698084067e-06, |
|
"loss": 0.9846, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.5207253886010363, |
|
"grad_norm": 0.8341667652130127, |
|
"learning_rate": 1.2595500415919948e-06, |
|
"loss": 0.9912, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.5233160621761659, |
|
"grad_norm": 0.8373621106147766, |
|
"learning_rate": 1.245902392279491e-06, |
|
"loss": 1.0312, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.5259067357512954, |
|
"grad_norm": 1.0364506244659424, |
|
"learning_rate": 1.232318553849023e-06, |
|
"loss": 0.9417, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.528497409326425, |
|
"grad_norm": 0.8765987157821655, |
|
"learning_rate": 1.2187987571940846e-06, |
|
"loss": 1.0039, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.5310880829015545, |
|
"grad_norm": 0.8896673917770386, |
|
"learning_rate": 1.2053432321196085e-06, |
|
"loss": 0.9867, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.533678756476684, |
|
"grad_norm": 0.7940327525138855, |
|
"learning_rate": 1.1919522073380614e-06, |
|
"loss": 1.0547, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.5362694300518136, |
|
"grad_norm": 0.903992772102356, |
|
"learning_rate": 1.1786259104655562e-06, |
|
"loss": 0.9451, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.5388601036269431, |
|
"grad_norm": 0.8643820285797119, |
|
"learning_rate": 1.1653645680179792e-06, |
|
"loss": 0.8786, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.5414507772020727, |
|
"grad_norm": 0.7279991507530212, |
|
"learning_rate": 1.1521684054071524e-06, |
|
"loss": 0.9964, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.5440414507772022, |
|
"grad_norm": 0.8498103022575378, |
|
"learning_rate": 1.1390376469369796e-06, |
|
"loss": 0.9547, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.5466321243523318, |
|
"grad_norm": 0.800202488899231, |
|
"learning_rate": 1.1259725157996593e-06, |
|
"loss": 0.9599, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.549222797927461, |
|
"grad_norm": 0.9869511127471924, |
|
"learning_rate": 1.1129732340718702e-06, |
|
"loss": 1.0126, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 1.5518134715025906, |
|
"grad_norm": 0.9484280347824097, |
|
"learning_rate": 1.1000400227110142e-06, |
|
"loss": 0.9711, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.5544041450777202, |
|
"grad_norm": 0.9410507082939148, |
|
"learning_rate": 1.087173101551439e-06, |
|
"loss": 0.9845, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 1.5569948186528497, |
|
"grad_norm": 0.7972487807273865, |
|
"learning_rate": 1.0743726893007257e-06, |
|
"loss": 0.9653, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.5595854922279793, |
|
"grad_norm": 0.9086237549781799, |
|
"learning_rate": 1.061639003535952e-06, |
|
"loss": 1.002, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 1.5621761658031088, |
|
"grad_norm": 0.9138615131378174, |
|
"learning_rate": 1.0489722607000052e-06, |
|
"loss": 0.9809, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.5647668393782384, |
|
"grad_norm": 0.9875264167785645, |
|
"learning_rate": 1.036372676097897e-06, |
|
"loss": 0.9163, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 1.567357512953368, |
|
"grad_norm": 0.9866282939910889, |
|
"learning_rate": 1.0238404638931077e-06, |
|
"loss": 1.067, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.5699481865284974, |
|
"grad_norm": 0.9424428939819336, |
|
"learning_rate": 1.0113758371039429e-06, |
|
"loss": 0.9418, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.572538860103627, |
|
"grad_norm": 0.9368731379508972, |
|
"learning_rate": 9.989790075999145e-07, |
|
"loss": 0.9741, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.5751295336787565, |
|
"grad_norm": 0.8245809674263, |
|
"learning_rate": 9.866501860981431e-07, |
|
"loss": 0.9837, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 1.577720207253886, |
|
"grad_norm": 0.964534342288971, |
|
"learning_rate": 9.743895821597638e-07, |
|
"loss": 0.961, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.5803108808290154, |
|
"grad_norm": 0.9792007803916931, |
|
"learning_rate": 9.621974041863813e-07, |
|
"loss": 0.898, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 1.582901554404145, |
|
"grad_norm": 0.8622314929962158, |
|
"learning_rate": 9.500738594165132e-07, |
|
"loss": 1.0085, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.5854922279792745, |
|
"grad_norm": 0.936686098575592, |
|
"learning_rate": 9.380191539220762e-07, |
|
"loss": 0.9979, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.588082901554404, |
|
"grad_norm": 1.0908080339431763, |
|
"learning_rate": 9.260334926048787e-07, |
|
"loss": 0.9885, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.5906735751295336, |
|
"grad_norm": 0.8463078737258911, |
|
"learning_rate": 9.141170791931386e-07, |
|
"loss": 0.9314, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 1.593264248704663, |
|
"grad_norm": 1.073156714439392, |
|
"learning_rate": 9.022701162380259e-07, |
|
"loss": 1.0001, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.5958549222797926, |
|
"grad_norm": 0.8691221475601196, |
|
"learning_rate": 8.904928051102074e-07, |
|
"loss": 0.9589, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.5984455958549222, |
|
"grad_norm": 0.9988768696784973, |
|
"learning_rate": 8.787853459964407e-07, |
|
"loss": 1.0873, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.6010362694300517, |
|
"grad_norm": 1.0085712671279907, |
|
"learning_rate": 8.671479378961556e-07, |
|
"loss": 0.9401, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.6036269430051813, |
|
"grad_norm": 1.0235176086425781, |
|
"learning_rate": 8.555807786180814e-07, |
|
"loss": 1.0291, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.6062176165803108, |
|
"grad_norm": 0.9803045988082886, |
|
"learning_rate": 8.4408406477688e-07, |
|
"loss": 0.9434, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.6088082901554404, |
|
"grad_norm": 0.9053346514701843, |
|
"learning_rate": 8.326579917898098e-07, |
|
"loss": 0.9941, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.61139896373057, |
|
"grad_norm": 0.9022249579429626, |
|
"learning_rate": 8.21302753873393e-07, |
|
"loss": 0.947, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.6139896373056994, |
|
"grad_norm": 1.0404874086380005, |
|
"learning_rate": 8.100185440401276e-07, |
|
"loss": 1.0749, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.616580310880829, |
|
"grad_norm": 0.9517078399658203, |
|
"learning_rate": 7.988055540951967e-07, |
|
"loss": 0.9732, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.6191709844559585, |
|
"grad_norm": 0.9991124868392944, |
|
"learning_rate": 7.876639746332132e-07, |
|
"loss": 1.0463, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.621761658031088, |
|
"grad_norm": 0.9260480403900146, |
|
"learning_rate": 7.765939950349776e-07, |
|
"loss": 0.9353, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.6243523316062176, |
|
"grad_norm": 0.9367355704307556, |
|
"learning_rate": 7.655958034642619e-07, |
|
"loss": 0.9758, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.6269430051813472, |
|
"grad_norm": 0.777481734752655, |
|
"learning_rate": 7.54669586864607e-07, |
|
"loss": 0.927, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.6295336787564767, |
|
"grad_norm": 0.9329550266265869, |
|
"learning_rate": 7.43815530956149e-07, |
|
"loss": 0.9997, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.6321243523316062, |
|
"grad_norm": 0.9558778405189514, |
|
"learning_rate": 7.330338202324621e-07, |
|
"loss": 0.9486, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.6347150259067358, |
|
"grad_norm": 0.9044818878173828, |
|
"learning_rate": 7.223246379574206e-07, |
|
"loss": 0.9678, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.6373056994818653, |
|
"grad_norm": 1.0184038877487183, |
|
"learning_rate": 7.116881661620833e-07, |
|
"loss": 0.9467, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.6398963730569949, |
|
"grad_norm": 0.82332843542099, |
|
"learning_rate": 7.011245856416016e-07, |
|
"loss": 1.0119, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.6424870466321244, |
|
"grad_norm": 1.0882319211959839, |
|
"learning_rate": 6.90634075952149e-07, |
|
"loss": 1.0505, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.645077720207254, |
|
"grad_norm": 0.8696717619895935, |
|
"learning_rate": 6.802168154078586e-07, |
|
"loss": 1.0155, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.6476683937823835, |
|
"grad_norm": 0.8515833616256714, |
|
"learning_rate": 6.698729810778065e-07, |
|
"loss": 1.0446, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.650259067357513, |
|
"grad_norm": 0.9765702486038208, |
|
"learning_rate": 6.596027487829915e-07, |
|
"loss": 0.9914, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.6528497409326426, |
|
"grad_norm": 0.96961510181427, |
|
"learning_rate": 6.494062930933497e-07, |
|
"loss": 0.9454, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.6554404145077721, |
|
"grad_norm": 0.9544724822044373, |
|
"learning_rate": 6.392837873247876e-07, |
|
"loss": 0.953, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.6580310880829017, |
|
"grad_norm": 1.0494675636291504, |
|
"learning_rate": 6.292354035362369e-07, |
|
"loss": 1.0195, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 1.6606217616580312, |
|
"grad_norm": 1.032488465309143, |
|
"learning_rate": 6.192613125267283e-07, |
|
"loss": 1.0027, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.6632124352331608, |
|
"grad_norm": 1.010940432548523, |
|
"learning_rate": 6.093616838324872e-07, |
|
"loss": 0.9949, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 1.6658031088082903, |
|
"grad_norm": 0.7320314645767212, |
|
"learning_rate": 5.995366857240592e-07, |
|
"loss": 0.9881, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.6683937823834198, |
|
"grad_norm": 1.1827306747436523, |
|
"learning_rate": 5.897864852034368e-07, |
|
"loss": 0.9126, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 1.6709844559585494, |
|
"grad_norm": 0.8718465566635132, |
|
"learning_rate": 5.801112480012344e-07, |
|
"loss": 0.8972, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.6735751295336787, |
|
"grad_norm": 0.8429136276245117, |
|
"learning_rate": 5.705111385738638e-07, |
|
"loss": 1.0019, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.6761658031088082, |
|
"grad_norm": 0.9650006294250488, |
|
"learning_rate": 5.609863201007382e-07, |
|
"loss": 0.992, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.6787564766839378, |
|
"grad_norm": 1.091964840888977, |
|
"learning_rate": 5.515369544815025e-07, |
|
"loss": 1.0321, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.6813471502590673, |
|
"grad_norm": 0.7537882924079895, |
|
"learning_rate": 5.421632023332779e-07, |
|
"loss": 0.9601, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.6839378238341969, |
|
"grad_norm": 0.9340682029724121, |
|
"learning_rate": 5.328652229879383e-07, |
|
"loss": 0.9789, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 1.6865284974093264, |
|
"grad_norm": 0.7913133502006531, |
|
"learning_rate": 5.236431744893883e-07, |
|
"loss": 0.9062, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.689119170984456, |
|
"grad_norm": 0.9730626344680786, |
|
"learning_rate": 5.144972135908949e-07, |
|
"loss": 0.938, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 1.6917098445595855, |
|
"grad_norm": 0.8744672536849976, |
|
"learning_rate": 5.054274957524075e-07, |
|
"loss": 1.0014, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.694300518134715, |
|
"grad_norm": 0.9841236472129822, |
|
"learning_rate": 4.964341751379248e-07, |
|
"loss": 0.9094, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 1.6968911917098446, |
|
"grad_norm": 1.0411112308502197, |
|
"learning_rate": 4.875174046128684e-07, |
|
"loss": 1.011, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.6994818652849741, |
|
"grad_norm": 0.9232543706893921, |
|
"learning_rate": 4.786773357414926e-07, |
|
"loss": 0.9188, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.7020725388601037, |
|
"grad_norm": 0.9863471984863281, |
|
"learning_rate": 4.6991411878429593e-07, |
|
"loss": 0.9577, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.704663212435233, |
|
"grad_norm": 0.8515871167182922, |
|
"learning_rate": 4.612279026954808e-07, |
|
"loss": 0.9712, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 1.7072538860103625, |
|
"grad_norm": 1.0452429056167603, |
|
"learning_rate": 4.526188351204103e-07, |
|
"loss": 0.9525, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.709844559585492, |
|
"grad_norm": 1.039813756942749, |
|
"learning_rate": 4.440870623931054e-07, |
|
"loss": 0.9677, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 1.7124352331606216, |
|
"grad_norm": 1.0061904191970825, |
|
"learning_rate": 4.3563272953375426e-07, |
|
"loss": 0.9249, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.7150259067357512, |
|
"grad_norm": 0.9659146666526794, |
|
"learning_rate": 4.2725598024624783e-07, |
|
"loss": 0.932, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.7176165803108807, |
|
"grad_norm": 1.100063443183899, |
|
"learning_rate": 4.1895695691574146e-07, |
|
"loss": 0.9303, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.7202072538860103, |
|
"grad_norm": 0.9187989830970764, |
|
"learning_rate": 4.1073580060622455e-07, |
|
"loss": 0.984, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 1.7227979274611398, |
|
"grad_norm": 0.8058050870895386, |
|
"learning_rate": 4.025926510581357e-07, |
|
"loss": 0.9469, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.7253886010362693, |
|
"grad_norm": 0.9303697347640991, |
|
"learning_rate": 3.9452764668597764e-07, |
|
"loss": 1.0155, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.7279792746113989, |
|
"grad_norm": 0.875150740146637, |
|
"learning_rate": 3.8654092457596714e-07, |
|
"loss": 0.9498, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.7305699481865284, |
|
"grad_norm": 1.206532597541809, |
|
"learning_rate": 3.786326204837065e-07, |
|
"loss": 1.0035, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 1.733160621761658, |
|
"grad_norm": 0.9007678627967834, |
|
"learning_rate": 3.7080286883187713e-07, |
|
"loss": 0.9713, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.7357512953367875, |
|
"grad_norm": 1.0224071741104126, |
|
"learning_rate": 3.6305180270794827e-07, |
|
"loss": 0.9525, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 1.7357512953367875, |
|
"eval_loss": 0.9424797892570496, |
|
"eval_runtime": 98.9403, |
|
"eval_samples_per_second": 2.527, |
|
"eval_steps_per_second": 0.323, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 1.738341968911917, |
|
"grad_norm": 0.8955622315406799, |
|
"learning_rate": 3.553795538619237e-07, |
|
"loss": 0.9495, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.7409326424870466, |
|
"grad_norm": 0.9500125050544739, |
|
"learning_rate": 3.4778625270409484e-07, |
|
"loss": 0.9822, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 1.7435233160621761, |
|
"grad_norm": 0.9673233032226562, |
|
"learning_rate": 3.402720283028277e-07, |
|
"loss": 0.9392, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.7461139896373057, |
|
"grad_norm": 1.0081723928451538, |
|
"learning_rate": 3.328370083823679e-07, |
|
"loss": 0.9768, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 1.7487046632124352, |
|
"grad_norm": 1.021333932876587, |
|
"learning_rate": 3.2548131932067184e-07, |
|
"loss": 0.9438, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.7512953367875648, |
|
"grad_norm": 0.8333784341812134, |
|
"learning_rate": 3.182050861472541e-07, |
|
"loss": 1.0067, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.7538860103626943, |
|
"grad_norm": 1.0099414587020874, |
|
"learning_rate": 3.110084325410667e-07, |
|
"loss": 1.0327, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.7564766839378239, |
|
"grad_norm": 1.2605355978012085, |
|
"learning_rate": 3.03891480828396e-07, |
|
"loss": 0.9497, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 1.7590673575129534, |
|
"grad_norm": 0.9966846704483032, |
|
"learning_rate": 2.9685435198078095e-07, |
|
"loss": 1.0268, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.761658031088083, |
|
"grad_norm": 0.920387327671051, |
|
"learning_rate": 2.898971656129573e-07, |
|
"loss": 0.9526, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 1.7642487046632125, |
|
"grad_norm": 0.9313926100730896, |
|
"learning_rate": 2.830200399808286e-07, |
|
"loss": 0.9874, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.766839378238342, |
|
"grad_norm": 0.9481755495071411, |
|
"learning_rate": 2.762230919794506e-07, |
|
"loss": 0.9602, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 1.7694300518134716, |
|
"grad_norm": 0.9870350360870361, |
|
"learning_rate": 2.6950643714104774e-07, |
|
"loss": 0.9758, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.7720207253886011, |
|
"grad_norm": 0.8995896577835083, |
|
"learning_rate": 2.628701896330482e-07, |
|
"loss": 0.9692, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 1.7746113989637307, |
|
"grad_norm": 0.8859801888465881, |
|
"learning_rate": 2.563144622561453e-07, |
|
"loss": 0.978, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 1.7772020725388602, |
|
"grad_norm": 0.8300227522850037, |
|
"learning_rate": 2.498393664423737e-07, |
|
"loss": 0.9199, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.7797927461139897, |
|
"grad_norm": 0.9782614707946777, |
|
"learning_rate": 2.4344501225322557e-07, |
|
"loss": 0.9437, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.7823834196891193, |
|
"grad_norm": 0.9227774739265442, |
|
"learning_rate": 2.3713150837777142e-07, |
|
"loss": 0.9153, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 1.7849740932642488, |
|
"grad_norm": 0.854295551776886, |
|
"learning_rate": 2.3089896213081553e-07, |
|
"loss": 0.8942, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 1.7875647668393784, |
|
"grad_norm": 0.761452853679657, |
|
"learning_rate": 2.2474747945107068e-07, |
|
"loss": 1.0242, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 1.790155440414508, |
|
"grad_norm": 0.7350905537605286, |
|
"learning_rate": 2.1867716489936297e-07, |
|
"loss": 1.0056, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.7927461139896375, |
|
"grad_norm": 0.9781407117843628, |
|
"learning_rate": 2.1268812165684382e-07, |
|
"loss": 0.9554, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 1.795336787564767, |
|
"grad_norm": 0.7397287487983704, |
|
"learning_rate": 2.0678045152324798e-07, |
|
"loss": 1.0214, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.7979274611398963, |
|
"grad_norm": 0.9220542311668396, |
|
"learning_rate": 2.0095425491515386e-07, |
|
"loss": 0.9359, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 1.8005181347150259, |
|
"grad_norm": 0.8612248301506042, |
|
"learning_rate": 1.9520963086428258e-07, |
|
"loss": 0.9994, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.8031088082901554, |
|
"grad_norm": 0.8949776887893677, |
|
"learning_rate": 1.8954667701581108e-07, |
|
"loss": 0.9721, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.805699481865285, |
|
"grad_norm": 1.0004165172576904, |
|
"learning_rate": 1.8396548962671456e-07, |
|
"loss": 0.9811, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.8082901554404145, |
|
"grad_norm": 0.8982537388801575, |
|
"learning_rate": 1.7846616356413105e-07, |
|
"loss": 0.9457, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 1.810880829015544, |
|
"grad_norm": 0.9139662981033325, |
|
"learning_rate": 1.7304879230374328e-07, |
|
"loss": 1.0623, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.8134715025906736, |
|
"grad_norm": 0.9726191163063049, |
|
"learning_rate": 1.677134679281983e-07, |
|
"loss": 1.0008, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 1.8160621761658031, |
|
"grad_norm": 0.9855490326881409, |
|
"learning_rate": 1.6246028112553603e-07, |
|
"loss": 1.0302, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.8186528497409327, |
|
"grad_norm": 0.9363789558410645, |
|
"learning_rate": 1.5728932118764916e-07, |
|
"loss": 0.9876, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 1.8212435233160622, |
|
"grad_norm": 1.0399082899093628, |
|
"learning_rate": 1.5220067600876686e-07, |
|
"loss": 0.8963, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.8238341968911918, |
|
"grad_norm": 0.8987425565719604, |
|
"learning_rate": 1.4719443208396078e-07, |
|
"loss": 0.9535, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 1.8264248704663213, |
|
"grad_norm": 0.8786448240280151, |
|
"learning_rate": 1.422706745076713e-07, |
|
"loss": 1.0302, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.8290155440414506, |
|
"grad_norm": 1.0369514226913452, |
|
"learning_rate": 1.3742948697226533e-07, |
|
"loss": 0.9518, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.8316062176165802, |
|
"grad_norm": 0.9875585436820984, |
|
"learning_rate": 1.3267095176661304e-07, |
|
"loss": 0.9448, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.8341968911917097, |
|
"grad_norm": 1.0473564863204956, |
|
"learning_rate": 1.2799514977468618e-07, |
|
"loss": 0.9618, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 1.8367875647668392, |
|
"grad_norm": 1.106512427330017, |
|
"learning_rate": 1.2340216047418697e-07, |
|
"loss": 1.02, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.8393782383419688, |
|
"grad_norm": 0.8493651747703552, |
|
"learning_rate": 1.188920619351941e-07, |
|
"loss": 0.9718, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 1.8419689119170983, |
|
"grad_norm": 0.8997021317481995, |
|
"learning_rate": 1.1446493081883891e-07, |
|
"loss": 0.9508, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.8445595854922279, |
|
"grad_norm": 0.8545548319816589, |
|
"learning_rate": 1.1012084237599808e-07, |
|
"loss": 1.0185, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 1.8471502590673574, |
|
"grad_norm": 0.8686257600784302, |
|
"learning_rate": 1.0585987044602009e-07, |
|
"loss": 0.9757, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.849740932642487, |
|
"grad_norm": 0.8913059234619141, |
|
"learning_rate": 1.0168208745546416e-07, |
|
"loss": 0.9635, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 1.8523316062176165, |
|
"grad_norm": 0.8280147910118103, |
|
"learning_rate": 9.758756441687333e-08, |
|
"loss": 1.0558, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.854922279792746, |
|
"grad_norm": 0.9251644611358643, |
|
"learning_rate": 9.357637092756667e-08, |
|
"loss": 0.9797, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.8575129533678756, |
|
"grad_norm": 0.8681091666221619, |
|
"learning_rate": 8.964857516845449e-08, |
|
"loss": 0.98, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.8601036269430051, |
|
"grad_norm": 0.858220636844635, |
|
"learning_rate": 8.580424390288167e-08, |
|
"loss": 0.9679, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 1.8626943005181347, |
|
"grad_norm": 1.0083863735198975, |
|
"learning_rate": 8.204344247549067e-08, |
|
"loss": 0.9765, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.8652849740932642, |
|
"grad_norm": 0.7215631604194641, |
|
"learning_rate": 7.836623481111416e-08, |
|
"loss": 0.9826, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 1.8678756476683938, |
|
"grad_norm": 0.8494441509246826, |
|
"learning_rate": 7.47726834136836e-08, |
|
"loss": 0.9878, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.8704663212435233, |
|
"grad_norm": 0.9821386337280273, |
|
"learning_rate": 7.126284936517125e-08, |
|
"loss": 0.98, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 1.8730569948186528, |
|
"grad_norm": 0.887163519859314, |
|
"learning_rate": 6.783679232455043e-08, |
|
"loss": 0.9275, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.8756476683937824, |
|
"grad_norm": 0.9071056246757507, |
|
"learning_rate": 6.449457052677965e-08, |
|
"loss": 0.9906, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 1.878238341968912, |
|
"grad_norm": 0.9382151961326599, |
|
"learning_rate": 6.123624078181512e-08, |
|
"loss": 1.0063, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 1.8808290155440415, |
|
"grad_norm": 0.8040828704833984, |
|
"learning_rate": 5.8061858473645315e-08, |
|
"loss": 0.9858, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.883419689119171, |
|
"grad_norm": 0.9526659250259399, |
|
"learning_rate": 5.4971477559346286e-08, |
|
"loss": 0.9893, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.8860103626943006, |
|
"grad_norm": 0.8950933218002319, |
|
"learning_rate": 5.196515056816898e-08, |
|
"loss": 1.0216, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 1.88860103626943, |
|
"grad_norm": 0.8529356718063354, |
|
"learning_rate": 4.90429286006433e-08, |
|
"loss": 1.0495, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.8911917098445596, |
|
"grad_norm": 0.8746633529663086, |
|
"learning_rate": 4.620486132770996e-08, |
|
"loss": 0.9701, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 1.8937823834196892, |
|
"grad_norm": 0.7848332524299622, |
|
"learning_rate": 4.34509969898772e-08, |
|
"loss": 0.9551, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.8963730569948187, |
|
"grad_norm": 0.844379186630249, |
|
"learning_rate": 4.078138239639984e-08, |
|
"loss": 0.9392, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 1.8989637305699483, |
|
"grad_norm": 0.9659252762794495, |
|
"learning_rate": 3.819606292448541e-08, |
|
"loss": 0.9676, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 1.9015544041450778, |
|
"grad_norm": 1.0913461446762085, |
|
"learning_rate": 3.569508251851816e-08, |
|
"loss": 1.0477, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 1.9041450777202074, |
|
"grad_norm": 1.0511360168457031, |
|
"learning_rate": 3.327848368931907e-08, |
|
"loss": 0.9345, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.906735751295337, |
|
"grad_norm": 0.958997905254364, |
|
"learning_rate": 3.0946307513418095e-08, |
|
"loss": 1.0417, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.9093264248704664, |
|
"grad_norm": 0.8062818050384521, |
|
"learning_rate": 2.8698593632357496e-08, |
|
"loss": 0.9683, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.911917098445596, |
|
"grad_norm": 0.8307934403419495, |
|
"learning_rate": 2.653538025201685e-08, |
|
"loss": 0.9277, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 1.9145077720207255, |
|
"grad_norm": 0.9197670221328735, |
|
"learning_rate": 2.4456704141967437e-08, |
|
"loss": 0.9539, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.917098445595855, |
|
"grad_norm": 0.885750949382782, |
|
"learning_rate": 2.2462600634841624e-08, |
|
"loss": 0.9429, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 1.9196891191709846, |
|
"grad_norm": 0.8606031537055969, |
|
"learning_rate": 2.0553103625737813e-08, |
|
"loss": 1.0091, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.922279792746114, |
|
"grad_norm": 0.8774232268333435, |
|
"learning_rate": 1.8728245571640324e-08, |
|
"loss": 0.958, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 1.9248704663212435, |
|
"grad_norm": 0.9953173398971558, |
|
"learning_rate": 1.6988057490868736e-08, |
|
"loss": 1.0015, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.927461139896373, |
|
"grad_norm": 1.022383213043213, |
|
"learning_rate": 1.5332568962553306e-08, |
|
"loss": 0.9427, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 1.9300518134715026, |
|
"grad_norm": 1.0359011888504028, |
|
"learning_rate": 1.3761808126126486e-08, |
|
"loss": 0.9499, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 1.932642487046632, |
|
"grad_norm": 0.8054367899894714, |
|
"learning_rate": 1.2275801680851629e-08, |
|
"loss": 1.0467, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.9352331606217616, |
|
"grad_norm": 0.921036422252655, |
|
"learning_rate": 1.0874574885362809e-08, |
|
"loss": 0.9384, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.9378238341968912, |
|
"grad_norm": 0.8518050909042358, |
|
"learning_rate": 9.558151557240158e-09, |
|
"loss": 0.9375, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 1.9404145077720207, |
|
"grad_norm": 0.9647573232650757, |
|
"learning_rate": 8.32655407260241e-09, |
|
"loss": 1.0186, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 1.9430051813471503, |
|
"grad_norm": 0.97606360912323, |
|
"learning_rate": 7.179803365726656e-09, |
|
"loss": 0.9963, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 1.9455958549222798, |
|
"grad_norm": 1.1201107501983643, |
|
"learning_rate": 6.117918928693623e-09, |
|
"loss": 0.983, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.9481865284974094, |
|
"grad_norm": 0.9616057276725769, |
|
"learning_rate": 5.140918811056827e-09, |
|
"loss": 0.9794, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 1.950777202072539, |
|
"grad_norm": 0.8792617917060852, |
|
"learning_rate": 4.248819619533384e-09, |
|
"loss": 0.9646, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 1.9533678756476682, |
|
"grad_norm": 0.8594529032707214, |
|
"learning_rate": 3.4416365177236675e-09, |
|
"loss": 1.0342, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 1.9559585492227978, |
|
"grad_norm": 0.9270037412643433, |
|
"learning_rate": 2.7193832258537447e-09, |
|
"loss": 1.0157, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.9585492227979273, |
|
"grad_norm": 0.9513561725616455, |
|
"learning_rate": 2.08207202054056e-09, |
|
"loss": 1.0843, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.9611398963730569, |
|
"grad_norm": 0.9862794876098633, |
|
"learning_rate": 1.5297137345843261e-09, |
|
"loss": 0.9374, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 1.9637305699481864, |
|
"grad_norm": 0.889864444732666, |
|
"learning_rate": 1.0623177567847809e-09, |
|
"loss": 1.014, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 1.966321243523316, |
|
"grad_norm": 0.850560188293457, |
|
"learning_rate": 6.798920317807601e-10, |
|
"loss": 1.0113, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.9689119170984455, |
|
"grad_norm": 1.0084043741226196, |
|
"learning_rate": 3.8244305991530504e-10, |
|
"loss": 0.9921, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 1.971502590673575, |
|
"grad_norm": 0.9245330691337585, |
|
"learning_rate": 1.6997589712575145e-10, |
|
"loss": 1.0003, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.9740932642487046, |
|
"grad_norm": 0.9151455163955688, |
|
"learning_rate": 4.249415485657604e-11, |
|
"loss": 1.0092, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 1.9766839378238341, |
|
"grad_norm": 0.8460503220558167, |
|
"learning_rate": 0.0, |
|
"loss": 0.9456, |
|
"step": 772 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 772, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 193, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.408535403456889e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|