|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9896907216494846, |
|
"eval_steps": 500, |
|
"global_step": 435, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006872852233676976, |
|
"grad_norm": 0.9486322019687681, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 1.3163, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.013745704467353952, |
|
"grad_norm": 0.9581819297739079, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 1.3039, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.020618556701030927, |
|
"grad_norm": 1.0054838528858523, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 1.3912, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.027491408934707903, |
|
"grad_norm": 0.973313664446148, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 1.3176, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03436426116838488, |
|
"grad_norm": 0.9112881771285355, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 1.2813, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.041237113402061855, |
|
"grad_norm": 0.868853801792406, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 1.2778, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.048109965635738834, |
|
"grad_norm": 0.7999023078891502, |
|
"learning_rate": 3.181818181818182e-05, |
|
"loss": 1.2184, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.054982817869415807, |
|
"grad_norm": 0.6175826046713091, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 1.1459, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.061855670103092786, |
|
"grad_norm": 0.5099365471572147, |
|
"learning_rate": 4.0909090909090915e-05, |
|
"loss": 1.0493, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06872852233676977, |
|
"grad_norm": 0.5444792063302974, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 1.0206, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07560137457044673, |
|
"grad_norm": 0.612097075909489, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9556, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.08247422680412371, |
|
"grad_norm": 0.5965134047415941, |
|
"learning_rate": 5.4545454545454546e-05, |
|
"loss": 0.8832, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.08934707903780069, |
|
"grad_norm": 0.6321048435692244, |
|
"learning_rate": 5.90909090909091e-05, |
|
"loss": 0.8113, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.09621993127147767, |
|
"grad_norm": 0.5433713534011196, |
|
"learning_rate": 6.363636363636364e-05, |
|
"loss": 0.7699, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.10309278350515463, |
|
"grad_norm": 0.5641051016325467, |
|
"learning_rate": 6.818181818181818e-05, |
|
"loss": 0.6987, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.10996563573883161, |
|
"grad_norm": 0.4741800529459741, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 0.6418, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.11683848797250859, |
|
"grad_norm": 0.31919675330271313, |
|
"learning_rate": 7.727272727272727e-05, |
|
"loss": 0.5883, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.12371134020618557, |
|
"grad_norm": 0.2704850432116006, |
|
"learning_rate": 8.181818181818183e-05, |
|
"loss": 0.576, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.13058419243986255, |
|
"grad_norm": 0.26062797028012374, |
|
"learning_rate": 8.636363636363637e-05, |
|
"loss": 0.5267, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.13745704467353953, |
|
"grad_norm": 0.18740805347521763, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 0.5469, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.14432989690721648, |
|
"grad_norm": 0.22017175307824505, |
|
"learning_rate": 9.545454545454546e-05, |
|
"loss": 0.5226, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.15120274914089346, |
|
"grad_norm": 0.1887039063728806, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4873, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.15807560137457044, |
|
"grad_norm": 0.1793474824943474, |
|
"learning_rate": 0.00010454545454545455, |
|
"loss": 0.4978, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.16494845360824742, |
|
"grad_norm": 0.23697858977104094, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 0.4814, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.1718213058419244, |
|
"grad_norm": 0.17636901545890651, |
|
"learning_rate": 0.00011363636363636365, |
|
"loss": 0.5103, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.17869415807560138, |
|
"grad_norm": 0.16142348168311232, |
|
"learning_rate": 0.0001181818181818182, |
|
"loss": 0.4773, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.18556701030927836, |
|
"grad_norm": 0.1425385061824693, |
|
"learning_rate": 0.00012272727272727272, |
|
"loss": 0.492, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.19243986254295534, |
|
"grad_norm": 0.1265888154640839, |
|
"learning_rate": 0.00012727272727272728, |
|
"loss": 0.4854, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.19931271477663232, |
|
"grad_norm": 0.11449596495934233, |
|
"learning_rate": 0.0001318181818181818, |
|
"loss": 0.4615, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"grad_norm": 0.12461604075605497, |
|
"learning_rate": 0.00013636363636363637, |
|
"loss": 0.4703, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.21305841924398625, |
|
"grad_norm": 0.12169420338653658, |
|
"learning_rate": 0.00014090909090909093, |
|
"loss": 0.4433, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.21993127147766323, |
|
"grad_norm": 0.10581158171701577, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 0.4397, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2268041237113402, |
|
"grad_norm": 0.11163612722377497, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.4605, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.23367697594501718, |
|
"grad_norm": 0.11309459248751377, |
|
"learning_rate": 0.00015454545454545454, |
|
"loss": 0.4495, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.24054982817869416, |
|
"grad_norm": 0.11367215299047806, |
|
"learning_rate": 0.0001590909090909091, |
|
"loss": 0.4308, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.24742268041237114, |
|
"grad_norm": 0.10809464118167192, |
|
"learning_rate": 0.00016363636363636366, |
|
"loss": 0.4368, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.2542955326460481, |
|
"grad_norm": 0.10502574406388546, |
|
"learning_rate": 0.0001681818181818182, |
|
"loss": 0.4196, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2611683848797251, |
|
"grad_norm": 0.10320182075757336, |
|
"learning_rate": 0.00017272727272727275, |
|
"loss": 0.42, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.26804123711340205, |
|
"grad_norm": 0.11138418873828733, |
|
"learning_rate": 0.00017727272727272728, |
|
"loss": 0.4514, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.27491408934707906, |
|
"grad_norm": 0.11111081000023773, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 0.4162, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.281786941580756, |
|
"grad_norm": 0.10773385531295475, |
|
"learning_rate": 0.00018636363636363636, |
|
"loss": 0.4058, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.28865979381443296, |
|
"grad_norm": 0.10631032605354059, |
|
"learning_rate": 0.00019090909090909092, |
|
"loss": 0.4029, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.29553264604810997, |
|
"grad_norm": 0.10836378456827221, |
|
"learning_rate": 0.00019545454545454548, |
|
"loss": 0.4096, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.3024054982817869, |
|
"grad_norm": 0.10880423378093391, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4196, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.30927835051546393, |
|
"grad_norm": 0.1066915228080529, |
|
"learning_rate": 0.00019999677214588312, |
|
"loss": 0.4049, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3161512027491409, |
|
"grad_norm": 0.1009132049783852, |
|
"learning_rate": 0.00019998708879191335, |
|
"loss": 0.4015, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3230240549828179, |
|
"grad_norm": 0.10423644166095698, |
|
"learning_rate": 0.00019997095056321971, |
|
"loss": 0.3989, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.32989690721649484, |
|
"grad_norm": 0.10222126638135792, |
|
"learning_rate": 0.00019994835850163924, |
|
"loss": 0.3911, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.33676975945017185, |
|
"grad_norm": 0.09539411299428839, |
|
"learning_rate": 0.00019991931406564944, |
|
"loss": 0.3861, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.3436426116838488, |
|
"grad_norm": 0.10040146458411044, |
|
"learning_rate": 0.00019988381913027442, |
|
"loss": 0.3932, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.35051546391752575, |
|
"grad_norm": 0.10573179360993629, |
|
"learning_rate": 0.00019984187598696363, |
|
"loss": 0.3936, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.35738831615120276, |
|
"grad_norm": 0.10564577769908695, |
|
"learning_rate": 0.00019979348734344398, |
|
"loss": 0.4071, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3642611683848797, |
|
"grad_norm": 0.10695511977009023, |
|
"learning_rate": 0.00019973865632354516, |
|
"loss": 0.3882, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3711340206185567, |
|
"grad_norm": 0.10064589165597464, |
|
"learning_rate": 0.0001996773864669978, |
|
"loss": 0.3785, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.37800687285223367, |
|
"grad_norm": 0.09791457914129995, |
|
"learning_rate": 0.00019960968172920516, |
|
"loss": 0.3811, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3848797250859107, |
|
"grad_norm": 0.1118566682283669, |
|
"learning_rate": 0.00019953554648098748, |
|
"loss": 0.3938, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3917525773195876, |
|
"grad_norm": 0.10214824698553887, |
|
"learning_rate": 0.0001994549855083001, |
|
"loss": 0.3925, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.39862542955326463, |
|
"grad_norm": 0.1014103487175917, |
|
"learning_rate": 0.0001993680040119244, |
|
"loss": 0.3802, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.4054982817869416, |
|
"grad_norm": 0.0993499098682824, |
|
"learning_rate": 0.00019927460760713197, |
|
"loss": 0.381, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.41237113402061853, |
|
"grad_norm": 0.10206073076536111, |
|
"learning_rate": 0.00019917480232332224, |
|
"loss": 0.3657, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.41924398625429554, |
|
"grad_norm": 0.09828556302343387, |
|
"learning_rate": 0.00019906859460363307, |
|
"loss": 0.3761, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4261168384879725, |
|
"grad_norm": 0.10139688359355604, |
|
"learning_rate": 0.00019895599130452505, |
|
"loss": 0.3749, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.4329896907216495, |
|
"grad_norm": 0.10711125411772042, |
|
"learning_rate": 0.0001988369996953386, |
|
"loss": 0.3706, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.43986254295532645, |
|
"grad_norm": 0.09724017597982272, |
|
"learning_rate": 0.00019871162745782478, |
|
"loss": 0.3705, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.44673539518900346, |
|
"grad_norm": 0.1012279008931452, |
|
"learning_rate": 0.00019857988268564953, |
|
"loss": 0.3712, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4536082474226804, |
|
"grad_norm": 0.09806011793458508, |
|
"learning_rate": 0.0001984417738838709, |
|
"loss": 0.355, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.46048109965635736, |
|
"grad_norm": 0.10463912089089605, |
|
"learning_rate": 0.0001982973099683902, |
|
"loss": 0.3812, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.46735395189003437, |
|
"grad_norm": 0.10616651630870087, |
|
"learning_rate": 0.0001981465002653763, |
|
"loss": 0.3671, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.4742268041237113, |
|
"grad_norm": 0.11658375430444518, |
|
"learning_rate": 0.00019798935451066361, |
|
"loss": 0.3822, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.48109965635738833, |
|
"grad_norm": 0.10686011983880458, |
|
"learning_rate": 0.0001978258828491236, |
|
"loss": 0.3564, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4879725085910653, |
|
"grad_norm": 0.10160970818733071, |
|
"learning_rate": 0.00019765609583400977, |
|
"loss": 0.3656, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.4948453608247423, |
|
"grad_norm": 0.10845572916815471, |
|
"learning_rate": 0.0001974800044262764, |
|
"loss": 0.3623, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5017182130584192, |
|
"grad_norm": 0.10898867605088586, |
|
"learning_rate": 0.00019729761999387103, |
|
"loss": 0.3689, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.5085910652920962, |
|
"grad_norm": 0.10030807679917116, |
|
"learning_rate": 0.00019710895431100046, |
|
"loss": 0.3743, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.5154639175257731, |
|
"grad_norm": 0.10640353541210322, |
|
"learning_rate": 0.00019691401955737072, |
|
"loss": 0.3664, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5223367697594502, |
|
"grad_norm": 0.10268077724904813, |
|
"learning_rate": 0.00019671282831740076, |
|
"loss": 0.3446, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.5292096219931272, |
|
"grad_norm": 0.1026968446802754, |
|
"learning_rate": 0.00019650539357941003, |
|
"loss": 0.3524, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.5360824742268041, |
|
"grad_norm": 0.10265145337121352, |
|
"learning_rate": 0.00019629172873477995, |
|
"loss": 0.3615, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5429553264604811, |
|
"grad_norm": 0.10234137978544197, |
|
"learning_rate": 0.00019607184757708951, |
|
"loss": 0.348, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5498281786941581, |
|
"grad_norm": 0.10414003831964998, |
|
"learning_rate": 0.00019584576430122473, |
|
"loss": 0.3455, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5567010309278351, |
|
"grad_norm": 0.10371223386491027, |
|
"learning_rate": 0.00019561349350246226, |
|
"loss": 0.3378, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.563573883161512, |
|
"grad_norm": 0.10200261115662425, |
|
"learning_rate": 0.00019537505017552716, |
|
"loss": 0.3424, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.570446735395189, |
|
"grad_norm": 0.10679690224389146, |
|
"learning_rate": 0.00019513044971362494, |
|
"loss": 0.3381, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5773195876288659, |
|
"grad_norm": 0.11747209193015294, |
|
"learning_rate": 0.00019487970790744774, |
|
"loss": 0.3608, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.584192439862543, |
|
"grad_norm": 0.10455385108750616, |
|
"learning_rate": 0.000194622840944155, |
|
"loss": 0.3513, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5910652920962199, |
|
"grad_norm": 0.11103687968369377, |
|
"learning_rate": 0.00019435986540632843, |
|
"loss": 0.3322, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5979381443298969, |
|
"grad_norm": 0.1044628007637211, |
|
"learning_rate": 0.00019409079827090145, |
|
"loss": 0.3452, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.6048109965635738, |
|
"grad_norm": 0.1062512518470538, |
|
"learning_rate": 0.00019381565690806328, |
|
"loss": 0.3295, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.6116838487972509, |
|
"grad_norm": 0.10737195453497204, |
|
"learning_rate": 0.00019353445908013755, |
|
"loss": 0.3322, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.6185567010309279, |
|
"grad_norm": 0.10702988631259788, |
|
"learning_rate": 0.00019324722294043558, |
|
"loss": 0.3436, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6254295532646048, |
|
"grad_norm": 0.10765269601948493, |
|
"learning_rate": 0.00019295396703208453, |
|
"loss": 0.3628, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.6323024054982818, |
|
"grad_norm": 0.1002631772076248, |
|
"learning_rate": 0.00019265471028683014, |
|
"loss": 0.337, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6391752577319587, |
|
"grad_norm": 0.10607703170572558, |
|
"learning_rate": 0.00019234947202381486, |
|
"loss": 0.3312, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6460481099656358, |
|
"grad_norm": 0.10229636934864904, |
|
"learning_rate": 0.00019203827194833026, |
|
"loss": 0.3613, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6529209621993127, |
|
"grad_norm": 0.10916337452983911, |
|
"learning_rate": 0.00019172113015054532, |
|
"loss": 0.3402, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6597938144329897, |
|
"grad_norm": 0.10094159423939311, |
|
"learning_rate": 0.00019139806710420914, |
|
"loss": 0.3308, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.10420707581199606, |
|
"learning_rate": 0.00019106910366532942, |
|
"loss": 0.3297, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.6735395189003437, |
|
"grad_norm": 0.11033214518450266, |
|
"learning_rate": 0.000190734261070826, |
|
"loss": 0.3368, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6804123711340206, |
|
"grad_norm": 0.10648307368306249, |
|
"learning_rate": 0.00019039356093715975, |
|
"loss": 0.3436, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6872852233676976, |
|
"grad_norm": 0.1023047486718038, |
|
"learning_rate": 0.00019004702525893732, |
|
"loss": 0.3389, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6941580756013745, |
|
"grad_norm": 0.10884134971992637, |
|
"learning_rate": 0.000189694676407491, |
|
"loss": 0.3343, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.7010309278350515, |
|
"grad_norm": 0.10624545430750702, |
|
"learning_rate": 0.0001893365371294346, |
|
"loss": 0.334, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.7079037800687286, |
|
"grad_norm": 0.10418100794965049, |
|
"learning_rate": 0.00018897263054519498, |
|
"loss": 0.3332, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.7147766323024055, |
|
"grad_norm": 0.10442261089390925, |
|
"learning_rate": 0.00018860298014751944, |
|
"loss": 0.3231, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.7216494845360825, |
|
"grad_norm": 0.11510206196708461, |
|
"learning_rate": 0.0001882276097999592, |
|
"loss": 0.3331, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.7285223367697594, |
|
"grad_norm": 0.10447026389489032, |
|
"learning_rate": 0.00018784654373532866, |
|
"loss": 0.3454, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.7353951890034365, |
|
"grad_norm": 0.10590015702307645, |
|
"learning_rate": 0.00018745980655414114, |
|
"loss": 0.3283, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.7422680412371134, |
|
"grad_norm": 0.1019688644295514, |
|
"learning_rate": 0.00018706742322302064, |
|
"loss": 0.3368, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7491408934707904, |
|
"grad_norm": 0.11013306581811545, |
|
"learning_rate": 0.00018666941907309026, |
|
"loss": 0.3371, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.7560137457044673, |
|
"grad_norm": 0.09950125567962896, |
|
"learning_rate": 0.0001862658197983366, |
|
"loss": 0.3206, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7628865979381443, |
|
"grad_norm": 0.09961853222606298, |
|
"learning_rate": 0.0001858566514539513, |
|
"loss": 0.3245, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.7697594501718213, |
|
"grad_norm": 0.10822873754218582, |
|
"learning_rate": 0.00018544194045464886, |
|
"loss": 0.336, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.7766323024054983, |
|
"grad_norm": 0.10256746162360054, |
|
"learning_rate": 0.00018502171357296144, |
|
"loss": 0.3279, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7835051546391752, |
|
"grad_norm": 0.09750927062180448, |
|
"learning_rate": 0.0001845959979375104, |
|
"loss": 0.3388, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7903780068728522, |
|
"grad_norm": 0.09793726368683904, |
|
"learning_rate": 0.00018416482103125506, |
|
"loss": 0.3303, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7972508591065293, |
|
"grad_norm": 0.10598098121402388, |
|
"learning_rate": 0.0001837282106897185, |
|
"loss": 0.3216, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.8041237113402062, |
|
"grad_norm": 0.10184730892936761, |
|
"learning_rate": 0.00018328619509919044, |
|
"loss": 0.3341, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.8109965635738832, |
|
"grad_norm": 0.10057188453324177, |
|
"learning_rate": 0.0001828388027949078, |
|
"loss": 0.3377, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.8178694158075601, |
|
"grad_norm": 0.10702981460753135, |
|
"learning_rate": 0.00018238606265921238, |
|
"loss": 0.3408, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.8247422680412371, |
|
"grad_norm": 0.10861206391221787, |
|
"learning_rate": 0.00018192800391968642, |
|
"loss": 0.3413, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8316151202749141, |
|
"grad_norm": 0.10665429392095056, |
|
"learning_rate": 0.00018146465614726567, |
|
"loss": 0.3234, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.8384879725085911, |
|
"grad_norm": 0.10180559686021746, |
|
"learning_rate": 0.00018099604925433043, |
|
"loss": 0.3263, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.845360824742268, |
|
"grad_norm": 0.11025637152512564, |
|
"learning_rate": 0.00018052221349277442, |
|
"loss": 0.3295, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.852233676975945, |
|
"grad_norm": 0.10266709615387361, |
|
"learning_rate": 0.00018004317945205197, |
|
"loss": 0.3275, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.8591065292096219, |
|
"grad_norm": 0.10117796525027944, |
|
"learning_rate": 0.0001795589780572031, |
|
"loss": 0.3541, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.865979381443299, |
|
"grad_norm": 0.10526178050004703, |
|
"learning_rate": 0.00017906964056685706, |
|
"loss": 0.3312, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.872852233676976, |
|
"grad_norm": 0.1063992770321544, |
|
"learning_rate": 0.00017857519857121458, |
|
"loss": 0.3312, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.8797250859106529, |
|
"grad_norm": 0.10996856445056626, |
|
"learning_rate": 0.00017807568399000822, |
|
"loss": 0.3527, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.8865979381443299, |
|
"grad_norm": 0.09753881684843836, |
|
"learning_rate": 0.000177571129070442, |
|
"loss": 0.3117, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.8934707903780069, |
|
"grad_norm": 0.10008693889485847, |
|
"learning_rate": 0.0001770615663851093, |
|
"loss": 0.3188, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9003436426116839, |
|
"grad_norm": 0.10344071862676044, |
|
"learning_rate": 0.0001765470288298905, |
|
"loss": 0.3252, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.9072164948453608, |
|
"grad_norm": 0.09310502474075653, |
|
"learning_rate": 0.0001760275496218288, |
|
"loss": 0.3058, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.9140893470790378, |
|
"grad_norm": 0.10039282316295665, |
|
"learning_rate": 0.0001755031622969862, |
|
"loss": 0.3468, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.9209621993127147, |
|
"grad_norm": 0.09871302822261965, |
|
"learning_rate": 0.00017497390070827848, |
|
"loss": 0.3241, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.9278350515463918, |
|
"grad_norm": 0.10019109474446979, |
|
"learning_rate": 0.00017443979902328956, |
|
"loss": 0.3247, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.9347079037800687, |
|
"grad_norm": 0.09742365798894928, |
|
"learning_rate": 0.00017390089172206592, |
|
"loss": 0.3268, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.9415807560137457, |
|
"grad_norm": 0.09613939400204266, |
|
"learning_rate": 0.00017335721359489057, |
|
"loss": 0.3017, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.9484536082474226, |
|
"grad_norm": 0.11015286314690642, |
|
"learning_rate": 0.00017280879974003707, |
|
"loss": 0.3369, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.9553264604810997, |
|
"grad_norm": 0.09848269873542065, |
|
"learning_rate": 0.0001722556855615039, |
|
"loss": 0.3215, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.9621993127147767, |
|
"grad_norm": 0.10356185439767568, |
|
"learning_rate": 0.00017169790676672858, |
|
"loss": 0.3209, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9690721649484536, |
|
"grad_norm": 0.10200587363931662, |
|
"learning_rate": 0.0001711354993642827, |
|
"loss": 0.3156, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.9759450171821306, |
|
"grad_norm": 0.10631936681232042, |
|
"learning_rate": 0.0001705684996615472, |
|
"loss": 0.3139, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.9828178694158075, |
|
"grad_norm": 0.09891246233686207, |
|
"learning_rate": 0.0001699969442623686, |
|
"loss": 0.3252, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.9896907216494846, |
|
"grad_norm": 0.09631426655006751, |
|
"learning_rate": 0.00016942087006469592, |
|
"loss": 0.3092, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.9965635738831615, |
|
"grad_norm": 0.1007256123012449, |
|
"learning_rate": 0.00016884031425819853, |
|
"loss": 0.3214, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9965635738831615, |
|
"eval_loss": 0.31896448135375977, |
|
"eval_runtime": 31.3575, |
|
"eval_samples_per_second": 31.157, |
|
"eval_steps_per_second": 0.989, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.0034364261168385, |
|
"grad_norm": 0.09742513564214555, |
|
"learning_rate": 0.00016825531432186543, |
|
"loss": 0.3131, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.0103092783505154, |
|
"grad_norm": 0.1000843163672815, |
|
"learning_rate": 0.00016766590802158566, |
|
"loss": 0.308, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.0171821305841924, |
|
"grad_norm": 0.1018182338551042, |
|
"learning_rate": 0.0001670721334077103, |
|
"loss": 0.3077, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.0240549828178693, |
|
"grad_norm": 0.10928234134401811, |
|
"learning_rate": 0.00016647402881259598, |
|
"loss": 0.3137, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.0309278350515463, |
|
"grad_norm": 0.10482761572794695, |
|
"learning_rate": 0.00016587163284813032, |
|
"loss": 0.2969, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0378006872852235, |
|
"grad_norm": 0.10755010011949057, |
|
"learning_rate": 0.00016526498440323914, |
|
"loss": 0.3027, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.0446735395189004, |
|
"grad_norm": 0.09961868507020075, |
|
"learning_rate": 0.0001646541226413761, |
|
"loss": 0.2918, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.0515463917525774, |
|
"grad_norm": 0.10275553359018949, |
|
"learning_rate": 0.00016403908699799425, |
|
"loss": 0.2935, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.0584192439862543, |
|
"grad_norm": 0.10314122954609171, |
|
"learning_rate": 0.00016341991717800023, |
|
"loss": 0.3107, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.0652920962199313, |
|
"grad_norm": 0.10201304020136209, |
|
"learning_rate": 0.00016279665315319114, |
|
"loss": 0.2925, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.0721649484536082, |
|
"grad_norm": 0.11295490175270977, |
|
"learning_rate": 0.0001621693351596739, |
|
"loss": 0.3165, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.0790378006872852, |
|
"grad_norm": 0.11187236691946645, |
|
"learning_rate": 0.00016153800369526788, |
|
"loss": 0.3008, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.0859106529209621, |
|
"grad_norm": 0.10649522892041895, |
|
"learning_rate": 0.0001609026995168904, |
|
"loss": 0.3072, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.0927835051546393, |
|
"grad_norm": 0.10138792178292377, |
|
"learning_rate": 0.00016026346363792567, |
|
"loss": 0.2968, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.0996563573883162, |
|
"grad_norm": 0.10490079005967201, |
|
"learning_rate": 0.00015962033732557686, |
|
"loss": 0.3074, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.1065292096219932, |
|
"grad_norm": 0.10339230680715779, |
|
"learning_rate": 0.00015897336209820239, |
|
"loss": 0.2904, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.1134020618556701, |
|
"grad_norm": 0.11480233498020057, |
|
"learning_rate": 0.00015832257972263523, |
|
"loss": 0.31, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.120274914089347, |
|
"grad_norm": 0.10620125747115913, |
|
"learning_rate": 0.00015766803221148673, |
|
"loss": 0.3041, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.127147766323024, |
|
"grad_norm": 0.10787426446881714, |
|
"learning_rate": 0.0001570097618204345, |
|
"loss": 0.3142, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.134020618556701, |
|
"grad_norm": 0.10375711742292576, |
|
"learning_rate": 0.00015634781104549442, |
|
"loss": 0.2916, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.140893470790378, |
|
"grad_norm": 0.1085251854270091, |
|
"learning_rate": 0.00015568222262027717, |
|
"loss": 0.2969, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.147766323024055, |
|
"grad_norm": 0.10620158303898176, |
|
"learning_rate": 0.00015501303951322943, |
|
"loss": 0.3069, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.1546391752577319, |
|
"grad_norm": 0.10221091343876472, |
|
"learning_rate": 0.00015434030492486023, |
|
"loss": 0.3015, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.161512027491409, |
|
"grad_norm": 0.11025720771873336, |
|
"learning_rate": 0.00015366406228495172, |
|
"loss": 0.2923, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.168384879725086, |
|
"grad_norm": 0.11019089511672239, |
|
"learning_rate": 0.00015298435524975572, |
|
"loss": 0.3088, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.175257731958763, |
|
"grad_norm": 0.1066893326930603, |
|
"learning_rate": 0.00015230122769917527, |
|
"loss": 0.3006, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.1821305841924399, |
|
"grad_norm": 0.11028825839961288, |
|
"learning_rate": 0.00015161472373393186, |
|
"loss": 0.3028, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.1890034364261168, |
|
"grad_norm": 0.10136590795578818, |
|
"learning_rate": 0.00015092488767271857, |
|
"loss": 0.2789, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.1958762886597938, |
|
"grad_norm": 0.1032678246834342, |
|
"learning_rate": 0.00015023176404933874, |
|
"loss": 0.2975, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.2027491408934707, |
|
"grad_norm": 0.11056648590020815, |
|
"learning_rate": 0.00014953539760983122, |
|
"loss": 0.3013, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.2096219931271477, |
|
"grad_norm": 0.09978579785299047, |
|
"learning_rate": 0.0001488358333095816, |
|
"loss": 0.2955, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.2164948453608249, |
|
"grad_norm": 0.10699681452783022, |
|
"learning_rate": 0.00014813311631041995, |
|
"loss": 0.2993, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.2233676975945018, |
|
"grad_norm": 0.10408363973129214, |
|
"learning_rate": 0.00014742729197770552, |
|
"loss": 0.2954, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.2302405498281788, |
|
"grad_norm": 0.1016249945819849, |
|
"learning_rate": 0.00014671840587739783, |
|
"loss": 0.2904, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.2371134020618557, |
|
"grad_norm": 0.10519072387735487, |
|
"learning_rate": 0.00014600650377311522, |
|
"loss": 0.2916, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.2439862542955327, |
|
"grad_norm": 0.1058597383518849, |
|
"learning_rate": 0.0001452916316231805, |
|
"loss": 0.301, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.2508591065292096, |
|
"grad_norm": 0.10402401073301179, |
|
"learning_rate": 0.00014457383557765386, |
|
"loss": 0.2875, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.2577319587628866, |
|
"grad_norm": 0.11052274366476848, |
|
"learning_rate": 0.00014385316197535372, |
|
"loss": 0.3033, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.2646048109965635, |
|
"grad_norm": 0.11220133909982184, |
|
"learning_rate": 0.00014312965734086518, |
|
"loss": 0.3048, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.2714776632302405, |
|
"grad_norm": 0.10111031103821645, |
|
"learning_rate": 0.0001424033683815365, |
|
"loss": 0.2883, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.2783505154639174, |
|
"grad_norm": 0.108844384730101, |
|
"learning_rate": 0.00014167434198446383, |
|
"loss": 0.3156, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.2852233676975944, |
|
"grad_norm": 0.11772254803070373, |
|
"learning_rate": 0.00014094262521346427, |
|
"loss": 0.2894, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.2920962199312716, |
|
"grad_norm": 0.10084875638276648, |
|
"learning_rate": 0.00014020826530603776, |
|
"loss": 0.2941, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.2989690721649485, |
|
"grad_norm": 0.10560762521845199, |
|
"learning_rate": 0.00013947130967031717, |
|
"loss": 0.2974, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.3058419243986255, |
|
"grad_norm": 0.1112230814008033, |
|
"learning_rate": 0.00013873180588200827, |
|
"loss": 0.3051, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.3127147766323024, |
|
"grad_norm": 0.10899405854558633, |
|
"learning_rate": 0.00013798980168131794, |
|
"loss": 0.3167, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.3195876288659794, |
|
"grad_norm": 0.10459030666604682, |
|
"learning_rate": 0.00013724534496987247, |
|
"loss": 0.2944, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.3264604810996563, |
|
"grad_norm": 0.10751960916175171, |
|
"learning_rate": 0.00013649848380762513, |
|
"loss": 0.2749, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.10402996897559527, |
|
"learning_rate": 0.0001357492664097534, |
|
"loss": 0.2972, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.3402061855670104, |
|
"grad_norm": 0.10280900537193306, |
|
"learning_rate": 0.00013499774114354655, |
|
"loss": 0.3093, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.3470790378006874, |
|
"grad_norm": 0.10262166065058195, |
|
"learning_rate": 0.0001342439565252831, |
|
"loss": 0.3023, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.3539518900343643, |
|
"grad_norm": 0.11035322484519891, |
|
"learning_rate": 0.00013348796121709862, |
|
"loss": 0.283, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.3608247422680413, |
|
"grad_norm": 0.10183546777840774, |
|
"learning_rate": 0.0001327298040238446, |
|
"loss": 0.2895, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.3676975945017182, |
|
"grad_norm": 0.09846452794645016, |
|
"learning_rate": 0.00013196953388993726, |
|
"loss": 0.2775, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.3745704467353952, |
|
"grad_norm": 0.10082546961742879, |
|
"learning_rate": 0.00013120719989619833, |
|
"loss": 0.2927, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.3814432989690721, |
|
"grad_norm": 0.10338270125162533, |
|
"learning_rate": 0.00013044285125668614, |
|
"loss": 0.3022, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.388316151202749, |
|
"grad_norm": 0.10489686085507173, |
|
"learning_rate": 0.0001296765373155188, |
|
"loss": 0.2886, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.395189003436426, |
|
"grad_norm": 0.10826965142841156, |
|
"learning_rate": 0.00012890830754368855, |
|
"loss": 0.2839, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.402061855670103, |
|
"grad_norm": 0.1095657642687174, |
|
"learning_rate": 0.0001281382115358679, |
|
"loss": 0.2974, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.40893470790378, |
|
"grad_norm": 0.10290443785837475, |
|
"learning_rate": 0.0001273662990072083, |
|
"loss": 0.2916, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.4158075601374571, |
|
"grad_norm": 0.10488572942462458, |
|
"learning_rate": 0.00012659261979013043, |
|
"loss": 0.2896, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.422680412371134, |
|
"grad_norm": 0.0991511111540952, |
|
"learning_rate": 0.00012581722383110718, |
|
"loss": 0.2815, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.429553264604811, |
|
"grad_norm": 0.11084616014817153, |
|
"learning_rate": 0.00012504016118743935, |
|
"loss": 0.2857, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.436426116838488, |
|
"grad_norm": 0.10755359121093663, |
|
"learning_rate": 0.00012426148202402404, |
|
"loss": 0.2784, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.443298969072165, |
|
"grad_norm": 0.10341376660828379, |
|
"learning_rate": 0.00012348123661011601, |
|
"loss": 0.2947, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.4501718213058419, |
|
"grad_norm": 0.10964970839444545, |
|
"learning_rate": 0.00012269947531608276, |
|
"loss": 0.3056, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.4570446735395188, |
|
"grad_norm": 0.10241977399698814, |
|
"learning_rate": 0.00012191624861015254, |
|
"loss": 0.3102, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.463917525773196, |
|
"grad_norm": 0.10337164620385511, |
|
"learning_rate": 0.00012113160705515625, |
|
"loss": 0.3122, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.470790378006873, |
|
"grad_norm": 0.10002116602053045, |
|
"learning_rate": 0.0001203456013052634, |
|
"loss": 0.2905, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.47766323024055, |
|
"grad_norm": 0.1005828345016487, |
|
"learning_rate": 0.00011955828210271187, |
|
"loss": 0.2897, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.4845360824742269, |
|
"grad_norm": 0.10315190703705018, |
|
"learning_rate": 0.00011876970027453222, |
|
"loss": 0.2896, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.4914089347079038, |
|
"grad_norm": 0.10209266642811851, |
|
"learning_rate": 0.00011797990672926652, |
|
"loss": 0.2862, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.4982817869415808, |
|
"grad_norm": 0.10350382211989749, |
|
"learning_rate": 0.00011718895245368167, |
|
"loss": 0.2993, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.5051546391752577, |
|
"grad_norm": 0.09959069681120952, |
|
"learning_rate": 0.00011639688850947799, |
|
"loss": 0.279, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.5120274914089347, |
|
"grad_norm": 0.1017699606831777, |
|
"learning_rate": 0.00011560376602999272, |
|
"loss": 0.2924, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.5189003436426116, |
|
"grad_norm": 0.10199010748060407, |
|
"learning_rate": 0.00011480963621689905, |
|
"loss": 0.3007, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.5257731958762886, |
|
"grad_norm": 0.10438569087355797, |
|
"learning_rate": 0.00011401455033690076, |
|
"loss": 0.2828, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.5326460481099655, |
|
"grad_norm": 0.106590652554821, |
|
"learning_rate": 0.00011321855971842243, |
|
"loss": 0.299, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.5395189003436425, |
|
"grad_norm": 0.10067806688032917, |
|
"learning_rate": 0.00011242171574829599, |
|
"loss": 0.2897, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.5463917525773194, |
|
"grad_norm": 0.11078489209589217, |
|
"learning_rate": 0.00011162406986844323, |
|
"loss": 0.2791, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.5532646048109966, |
|
"grad_norm": 0.10748644329087202, |
|
"learning_rate": 0.00011082567357255484, |
|
"loss": 0.3004, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.5601374570446735, |
|
"grad_norm": 0.09232201373607851, |
|
"learning_rate": 0.00011002657840276627, |
|
"loss": 0.2647, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.5670103092783505, |
|
"grad_norm": 0.10419820524514652, |
|
"learning_rate": 0.00010922683594633021, |
|
"loss": 0.2897, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.5738831615120275, |
|
"grad_norm": 0.11387366232726741, |
|
"learning_rate": 0.00010842649783228624, |
|
"loss": 0.3071, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.5807560137457046, |
|
"grad_norm": 0.1112209148078402, |
|
"learning_rate": 0.00010762561572812788, |
|
"loss": 0.2791, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.5876288659793816, |
|
"grad_norm": 0.1027329268180551, |
|
"learning_rate": 0.0001068242413364671, |
|
"loss": 0.2962, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.5945017182130585, |
|
"grad_norm": 0.10377377307338428, |
|
"learning_rate": 0.00010602242639169648, |
|
"loss": 0.2905, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.6013745704467355, |
|
"grad_norm": 0.10333591660725541, |
|
"learning_rate": 0.0001052202226566494, |
|
"loss": 0.294, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.6082474226804124, |
|
"grad_norm": 0.10653830908113968, |
|
"learning_rate": 0.00010441768191925847, |
|
"loss": 0.3023, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.6151202749140894, |
|
"grad_norm": 0.10203788732177198, |
|
"learning_rate": 0.00010361485598921212, |
|
"loss": 0.2961, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.6219931271477663, |
|
"grad_norm": 0.105467450842672, |
|
"learning_rate": 0.00010281179669461005, |
|
"loss": 0.2992, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.6288659793814433, |
|
"grad_norm": 0.10232600415297333, |
|
"learning_rate": 0.00010200855587861724, |
|
"loss": 0.2894, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.6357388316151202, |
|
"grad_norm": 0.1000078228085237, |
|
"learning_rate": 0.0001012051853961172, |
|
"loss": 0.2924, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.6426116838487972, |
|
"grad_norm": 0.10443130272760445, |
|
"learning_rate": 0.00010040173711036431, |
|
"loss": 0.2987, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.6494845360824741, |
|
"grad_norm": 0.10490297286528327, |
|
"learning_rate": 9.959826288963571e-05, |
|
"loss": 0.2859, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.656357388316151, |
|
"grad_norm": 0.10350833328466466, |
|
"learning_rate": 9.879481460388282e-05, |
|
"loss": 0.2931, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.663230240549828, |
|
"grad_norm": 0.10225492666499335, |
|
"learning_rate": 9.799144412138275e-05, |
|
"loss": 0.2878, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.670103092783505, |
|
"grad_norm": 0.10508397146202163, |
|
"learning_rate": 9.718820330538998e-05, |
|
"loss": 0.3004, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.6769759450171822, |
|
"grad_norm": 0.10283137687490299, |
|
"learning_rate": 9.638514401078788e-05, |
|
"loss": 0.2714, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.6838487972508591, |
|
"grad_norm": 0.10397776989480932, |
|
"learning_rate": 9.558231808074156e-05, |
|
"loss": 0.2771, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.690721649484536, |
|
"grad_norm": 0.10092959577494873, |
|
"learning_rate": 9.477977734335061e-05, |
|
"loss": 0.2899, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.697594501718213, |
|
"grad_norm": 0.09948236816567041, |
|
"learning_rate": 9.397757360830353e-05, |
|
"loss": 0.2805, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.7044673539518902, |
|
"grad_norm": 0.10097655488635103, |
|
"learning_rate": 9.317575866353292e-05, |
|
"loss": 0.296, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.7113402061855671, |
|
"grad_norm": 0.09936217630195925, |
|
"learning_rate": 9.23743842718721e-05, |
|
"loss": 0.2973, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.718213058419244, |
|
"grad_norm": 0.09913343698008316, |
|
"learning_rate": 9.157350216771378e-05, |
|
"loss": 0.2899, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.725085910652921, |
|
"grad_norm": 0.09873019613022356, |
|
"learning_rate": 9.077316405366981e-05, |
|
"loss": 0.2827, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.731958762886598, |
|
"grad_norm": 0.09795703366595764, |
|
"learning_rate": 8.997342159723371e-05, |
|
"loss": 0.2833, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.738831615120275, |
|
"grad_norm": 0.0971884033549596, |
|
"learning_rate": 8.917432642744518e-05, |
|
"loss": 0.2813, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.745704467353952, |
|
"grad_norm": 0.10306201837901838, |
|
"learning_rate": 8.83759301315568e-05, |
|
"loss": 0.2924, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.7525773195876289, |
|
"grad_norm": 0.10374979571562502, |
|
"learning_rate": 8.757828425170404e-05, |
|
"loss": 0.2838, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.7594501718213058, |
|
"grad_norm": 0.1048687957670358, |
|
"learning_rate": 8.678144028157759e-05, |
|
"loss": 0.2886, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.7663230240549828, |
|
"grad_norm": 0.10256152550776314, |
|
"learning_rate": 8.598544966309925e-05, |
|
"loss": 0.2879, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.7731958762886597, |
|
"grad_norm": 0.10130883507858145, |
|
"learning_rate": 8.519036378310096e-05, |
|
"loss": 0.2909, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.7800687285223367, |
|
"grad_norm": 0.10467386983378474, |
|
"learning_rate": 8.43962339700073e-05, |
|
"loss": 0.2828, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.7869415807560136, |
|
"grad_norm": 0.09820302182298171, |
|
"learning_rate": 8.360311149052205e-05, |
|
"loss": 0.2844, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.7938144329896906, |
|
"grad_norm": 0.10102934299035139, |
|
"learning_rate": 8.281104754631835e-05, |
|
"loss": 0.2898, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.8006872852233677, |
|
"grad_norm": 0.0958354586875355, |
|
"learning_rate": 8.20200932707335e-05, |
|
"loss": 0.28, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.8075601374570447, |
|
"grad_norm": 0.09957495231488761, |
|
"learning_rate": 8.123029972546781e-05, |
|
"loss": 0.284, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.8144329896907216, |
|
"grad_norm": 0.10693182349883178, |
|
"learning_rate": 8.044171789728816e-05, |
|
"loss": 0.2892, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.8213058419243986, |
|
"grad_norm": 0.10646064359160178, |
|
"learning_rate": 7.965439869473664e-05, |
|
"loss": 0.2958, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.8281786941580758, |
|
"grad_norm": 0.10085406622127646, |
|
"learning_rate": 7.886839294484377e-05, |
|
"loss": 0.2818, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.8350515463917527, |
|
"grad_norm": 0.10500151086048896, |
|
"learning_rate": 7.808375138984745e-05, |
|
"loss": 0.2733, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.8419243986254297, |
|
"grad_norm": 0.1002635668475348, |
|
"learning_rate": 7.730052468391725e-05, |
|
"loss": 0.285, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.8487972508591066, |
|
"grad_norm": 0.09642728830623505, |
|
"learning_rate": 7.6518763389884e-05, |
|
"loss": 0.2728, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.8556701030927836, |
|
"grad_norm": 0.10268576835416232, |
|
"learning_rate": 7.573851797597602e-05, |
|
"loss": 0.2795, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.8625429553264605, |
|
"grad_norm": 0.10795408269996427, |
|
"learning_rate": 7.495983881256067e-05, |
|
"loss": 0.295, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.8694158075601375, |
|
"grad_norm": 0.10121463339523644, |
|
"learning_rate": 7.418277616889282e-05, |
|
"loss": 0.2787, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.8762886597938144, |
|
"grad_norm": 0.0982041211411212, |
|
"learning_rate": 7.340738020986961e-05, |
|
"loss": 0.2913, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.8831615120274914, |
|
"grad_norm": 0.09689663299295423, |
|
"learning_rate": 7.263370099279172e-05, |
|
"loss": 0.2678, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.8900343642611683, |
|
"grad_norm": 0.09890680768599139, |
|
"learning_rate": 7.186178846413214e-05, |
|
"loss": 0.2622, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.8969072164948453, |
|
"grad_norm": 0.10218531402065698, |
|
"learning_rate": 7.109169245631149e-05, |
|
"loss": 0.2948, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.9037800687285222, |
|
"grad_norm": 0.09970764344361568, |
|
"learning_rate": 7.032346268448118e-05, |
|
"loss": 0.2936, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.9106529209621992, |
|
"grad_norm": 0.10646182283668124, |
|
"learning_rate": 6.955714874331387e-05, |
|
"loss": 0.2841, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.9175257731958761, |
|
"grad_norm": 0.1005308262146528, |
|
"learning_rate": 6.87928001038017e-05, |
|
"loss": 0.2741, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.9243986254295533, |
|
"grad_norm": 0.09801297561347891, |
|
"learning_rate": 6.803046611006278e-05, |
|
"loss": 0.2812, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.9312714776632303, |
|
"grad_norm": 0.10072370231430908, |
|
"learning_rate": 6.727019597615545e-05, |
|
"loss": 0.2861, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.9381443298969072, |
|
"grad_norm": 0.10002866089004976, |
|
"learning_rate": 6.651203878290139e-05, |
|
"loss": 0.2747, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.9450171821305842, |
|
"grad_norm": 0.09520289893088804, |
|
"learning_rate": 6.575604347471695e-05, |
|
"loss": 0.275, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.9518900343642611, |
|
"grad_norm": 0.10328917600068767, |
|
"learning_rate": 6.500225885645346e-05, |
|
"loss": 0.2836, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.9587628865979383, |
|
"grad_norm": 0.09800249344342209, |
|
"learning_rate": 6.425073359024663e-05, |
|
"loss": 0.2904, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.9656357388316152, |
|
"grad_norm": 0.09952065394630734, |
|
"learning_rate": 6.350151619237488e-05, |
|
"loss": 0.2804, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.9725085910652922, |
|
"grad_norm": 0.10493099048201263, |
|
"learning_rate": 6.275465503012751e-05, |
|
"loss": 0.2818, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.9793814432989691, |
|
"grad_norm": 0.09722127539298636, |
|
"learning_rate": 6.201019831868208e-05, |
|
"loss": 0.2839, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.986254295532646, |
|
"grad_norm": 0.09609058787958422, |
|
"learning_rate": 6.126819411799175e-05, |
|
"loss": 0.2725, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.993127147766323, |
|
"grad_norm": 0.09821072360835045, |
|
"learning_rate": 6.052869032968285e-05, |
|
"loss": 0.2811, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.10288780904807039, |
|
"learning_rate": 5.979173469396227e-05, |
|
"loss": 0.2761, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.2949652373790741, |
|
"eval_runtime": 27.5209, |
|
"eval_samples_per_second": 35.5, |
|
"eval_steps_per_second": 1.126, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.006872852233677, |
|
"grad_norm": 0.097538336458536, |
|
"learning_rate": 5.905737478653572e-05, |
|
"loss": 0.2679, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.013745704467354, |
|
"grad_norm": 0.09988881705573123, |
|
"learning_rate": 5.83256580155362e-05, |
|
"loss": 0.2525, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.020618556701031, |
|
"grad_norm": 0.0964482303899947, |
|
"learning_rate": 5.7596631618463514e-05, |
|
"loss": 0.2425, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.027491408934708, |
|
"grad_norm": 0.10864263032849888, |
|
"learning_rate": 5.687034265913485e-05, |
|
"loss": 0.27, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.0343642611683848, |
|
"grad_norm": 0.11734835455675438, |
|
"learning_rate": 5.614683802464631e-05, |
|
"loss": 0.2624, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.0412371134020617, |
|
"grad_norm": 0.11289186335789302, |
|
"learning_rate": 5.542616442234618e-05, |
|
"loss": 0.259, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.0481099656357387, |
|
"grad_norm": 0.11224803307996767, |
|
"learning_rate": 5.470836837681954e-05, |
|
"loss": 0.2607, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.0549828178694156, |
|
"grad_norm": 0.10867092848715432, |
|
"learning_rate": 5.399349622688479e-05, |
|
"loss": 0.2703, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.0618556701030926, |
|
"grad_norm": 0.10624834548230039, |
|
"learning_rate": 5.32815941226022e-05, |
|
"loss": 0.2654, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.06872852233677, |
|
"grad_norm": 0.10758062626317398, |
|
"learning_rate": 5.2572708022294504e-05, |
|
"loss": 0.2698, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.075601374570447, |
|
"grad_norm": 0.1071415029446744, |
|
"learning_rate": 5.1866883689580056e-05, |
|
"loss": 0.2632, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.082474226804124, |
|
"grad_norm": 0.10125455614344606, |
|
"learning_rate": 5.116416669041843e-05, |
|
"loss": 0.2538, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.089347079037801, |
|
"grad_norm": 0.10522800373863611, |
|
"learning_rate": 5.046460239016879e-05, |
|
"loss": 0.2693, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.0962199312714778, |
|
"grad_norm": 0.10672396619233848, |
|
"learning_rate": 4.976823595066128e-05, |
|
"loss": 0.2575, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.1030927835051547, |
|
"grad_norm": 0.10982835933289055, |
|
"learning_rate": 4.907511232728145e-05, |
|
"loss": 0.2684, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.1099656357388317, |
|
"grad_norm": 0.10787287180280963, |
|
"learning_rate": 4.8385276266068146e-05, |
|
"loss": 0.258, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.1168384879725086, |
|
"grad_norm": 0.11168112585189982, |
|
"learning_rate": 4.7698772300824756e-05, |
|
"loss": 0.2665, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.1237113402061856, |
|
"grad_norm": 0.10887273811279173, |
|
"learning_rate": 4.7015644750244306e-05, |
|
"loss": 0.2609, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.1305841924398625, |
|
"grad_norm": 0.10561230206084926, |
|
"learning_rate": 4.6335937715048306e-05, |
|
"loss": 0.2569, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.1374570446735395, |
|
"grad_norm": 0.1085500386167761, |
|
"learning_rate": 4.565969507513981e-05, |
|
"loss": 0.2636, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.1443298969072164, |
|
"grad_norm": 0.1012240249829896, |
|
"learning_rate": 4.498696048677059e-05, |
|
"loss": 0.2547, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.1512027491408934, |
|
"grad_norm": 0.1030596147725605, |
|
"learning_rate": 4.4317777379722866e-05, |
|
"loss": 0.2531, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.1580756013745703, |
|
"grad_norm": 0.10613673774187961, |
|
"learning_rate": 4.365218895450558e-05, |
|
"loss": 0.2608, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.1649484536082473, |
|
"grad_norm": 0.10496672072120719, |
|
"learning_rate": 4.29902381795655e-05, |
|
"loss": 0.2711, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.1718213058419242, |
|
"grad_norm": 0.1017243256571039, |
|
"learning_rate": 4.2331967788513295e-05, |
|
"loss": 0.2506, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.178694158075601, |
|
"grad_norm": 0.1083774959415722, |
|
"learning_rate": 4.167742027736482e-05, |
|
"loss": 0.2706, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.1855670103092786, |
|
"grad_norm": 0.10375985752479969, |
|
"learning_rate": 4.102663790179764e-05, |
|
"loss": 0.2446, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.1924398625429555, |
|
"grad_norm": 0.10866264016891114, |
|
"learning_rate": 4.037966267442315e-05, |
|
"loss": 0.2782, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.1993127147766325, |
|
"grad_norm": 0.10829606781249333, |
|
"learning_rate": 3.973653636207437e-05, |
|
"loss": 0.2628, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.2061855670103094, |
|
"grad_norm": 0.11273324743023601, |
|
"learning_rate": 3.909730048310962e-05, |
|
"loss": 0.2661, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.2130584192439864, |
|
"grad_norm": 0.10783971891161509, |
|
"learning_rate": 3.846199630473216e-05, |
|
"loss": 0.257, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.2199312714776633, |
|
"grad_norm": 0.10744882559462787, |
|
"learning_rate": 3.7830664840326145e-05, |
|
"loss": 0.2431, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.2268041237113403, |
|
"grad_norm": 0.10847966130290622, |
|
"learning_rate": 3.720334684680889e-05, |
|
"loss": 0.2645, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.2336769759450172, |
|
"grad_norm": 0.10687099898167607, |
|
"learning_rate": 3.6580082821999786e-05, |
|
"loss": 0.2558, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.240549828178694, |
|
"grad_norm": 0.10744277727601985, |
|
"learning_rate": 3.596091300200578e-05, |
|
"loss": 0.2648, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.247422680412371, |
|
"grad_norm": 0.1075527766804997, |
|
"learning_rate": 3.534587735862391e-05, |
|
"loss": 0.2662, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.254295532646048, |
|
"grad_norm": 0.10693314058206296, |
|
"learning_rate": 3.473501559676088e-05, |
|
"loss": 0.2691, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.261168384879725, |
|
"grad_norm": 0.10977366989631175, |
|
"learning_rate": 3.4128367151869714e-05, |
|
"loss": 0.2721, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.268041237113402, |
|
"grad_norm": 0.10791580217848652, |
|
"learning_rate": 3.352597118740404e-05, |
|
"loss": 0.2574, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.274914089347079, |
|
"grad_norm": 0.1117741340523688, |
|
"learning_rate": 3.292786659228973e-05, |
|
"loss": 0.2701, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.281786941580756, |
|
"grad_norm": 0.10832840646361684, |
|
"learning_rate": 3.233409197841437e-05, |
|
"loss": 0.2533, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.288659793814433, |
|
"grad_norm": 0.11021992972158987, |
|
"learning_rate": 3.174468567813461e-05, |
|
"loss": 0.2506, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.29553264604811, |
|
"grad_norm": 0.11244737426877117, |
|
"learning_rate": 3.115968574180149e-05, |
|
"loss": 0.2741, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.3024054982817868, |
|
"grad_norm": 0.10439400399730109, |
|
"learning_rate": 3.0579129935304066e-05, |
|
"loss": 0.2423, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.3092783505154637, |
|
"grad_norm": 0.11154784823612464, |
|
"learning_rate": 3.0003055737631403e-05, |
|
"loss": 0.2606, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.3161512027491407, |
|
"grad_norm": 0.11618867786187727, |
|
"learning_rate": 2.9431500338452832e-05, |
|
"loss": 0.2705, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.323024054982818, |
|
"grad_norm": 0.10953255344278685, |
|
"learning_rate": 2.886450063571735e-05, |
|
"loss": 0.2597, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.329896907216495, |
|
"grad_norm": 0.10670783300766909, |
|
"learning_rate": 2.8302093233271453e-05, |
|
"loss": 0.2451, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.336769759450172, |
|
"grad_norm": 0.10795627839159605, |
|
"learning_rate": 2.7744314438496088e-05, |
|
"loss": 0.2571, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.343642611683849, |
|
"grad_norm": 0.10876660642511277, |
|
"learning_rate": 2.7191200259962934e-05, |
|
"loss": 0.2646, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.350515463917526, |
|
"grad_norm": 0.10862804831060525, |
|
"learning_rate": 2.6642786405109475e-05, |
|
"loss": 0.2679, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.357388316151203, |
|
"grad_norm": 0.1068062333293375, |
|
"learning_rate": 2.6099108277934103e-05, |
|
"loss": 0.2676, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.3642611683848798, |
|
"grad_norm": 0.10420269958200333, |
|
"learning_rate": 2.556020097671046e-05, |
|
"loss": 0.2669, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.3711340206185567, |
|
"grad_norm": 0.10951228936587228, |
|
"learning_rate": 2.5026099291721516e-05, |
|
"loss": 0.2519, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.3780068728522337, |
|
"grad_norm": 0.11129513461474132, |
|
"learning_rate": 2.449683770301382e-05, |
|
"loss": 0.2571, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.3848797250859106, |
|
"grad_norm": 0.10763529170641503, |
|
"learning_rate": 2.397245037817125e-05, |
|
"loss": 0.2567, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.3917525773195876, |
|
"grad_norm": 0.10496278836242602, |
|
"learning_rate": 2.345297117010954e-05, |
|
"loss": 0.2459, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.3986254295532645, |
|
"grad_norm": 0.11015993294501161, |
|
"learning_rate": 2.2938433614890697e-05, |
|
"loss": 0.2702, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.4054982817869415, |
|
"grad_norm": 0.10738471004067172, |
|
"learning_rate": 2.242887092955801e-05, |
|
"loss": 0.2585, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.4123711340206184, |
|
"grad_norm": 0.11218254036958039, |
|
"learning_rate": 2.1924316009991787e-05, |
|
"loss": 0.2607, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.4192439862542954, |
|
"grad_norm": 0.10844975577221563, |
|
"learning_rate": 2.1424801428785447e-05, |
|
"loss": 0.2561, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.4261168384879723, |
|
"grad_norm": 0.10599271281183413, |
|
"learning_rate": 2.0930359433142932e-05, |
|
"loss": 0.2501, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.4329896907216497, |
|
"grad_norm": 0.11017719350663994, |
|
"learning_rate": 2.0441021942796944e-05, |
|
"loss": 0.2618, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.4398625429553267, |
|
"grad_norm": 0.10668690539385935, |
|
"learning_rate": 1.995682054794803e-05, |
|
"loss": 0.264, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.4467353951890036, |
|
"grad_norm": 0.10450005811019492, |
|
"learning_rate": 1.9477786507225616e-05, |
|
"loss": 0.2553, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.4536082474226806, |
|
"grad_norm": 0.11220780349903246, |
|
"learning_rate": 1.900395074566962e-05, |
|
"loss": 0.2654, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.4604810996563575, |
|
"grad_norm": 0.108477409950911, |
|
"learning_rate": 1.8535343852734332e-05, |
|
"loss": 0.2569, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.4673539518900345, |
|
"grad_norm": 0.10460682785615502, |
|
"learning_rate": 1.8071996080313602e-05, |
|
"loss": 0.2535, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.4742268041237114, |
|
"grad_norm": 0.1076833252048245, |
|
"learning_rate": 1.76139373407876e-05, |
|
"loss": 0.2651, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.4810996563573884, |
|
"grad_norm": 0.10826152402956611, |
|
"learning_rate": 1.7161197205092216e-05, |
|
"loss": 0.2676, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.4879725085910653, |
|
"grad_norm": 0.10750093502253717, |
|
"learning_rate": 1.6713804900809582e-05, |
|
"loss": 0.2559, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.4948453608247423, |
|
"grad_norm": 0.1038001972682153, |
|
"learning_rate": 1.6271789310281517e-05, |
|
"loss": 0.2423, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.5017182130584192, |
|
"grad_norm": 0.10672842152395061, |
|
"learning_rate": 1.583517896874498e-05, |
|
"loss": 0.2593, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.508591065292096, |
|
"grad_norm": 0.10761120219562789, |
|
"learning_rate": 1.540400206248963e-05, |
|
"loss": 0.2595, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.515463917525773, |
|
"grad_norm": 0.10816060382518869, |
|
"learning_rate": 1.4978286427038601e-05, |
|
"loss": 0.2539, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.52233676975945, |
|
"grad_norm": 0.10455085413886135, |
|
"learning_rate": 1.4558059545351143e-05, |
|
"loss": 0.2564, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.529209621993127, |
|
"grad_norm": 0.10969390724809498, |
|
"learning_rate": 1.4143348546048707e-05, |
|
"loss": 0.2552, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.536082474226804, |
|
"grad_norm": 0.11070585821467833, |
|
"learning_rate": 1.3734180201663439e-05, |
|
"loss": 0.2561, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.542955326460481, |
|
"grad_norm": 0.11065650967955783, |
|
"learning_rate": 1.3330580926909763e-05, |
|
"loss": 0.2552, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.549828178694158, |
|
"grad_norm": 0.10753458918108932, |
|
"learning_rate": 1.2932576776979377e-05, |
|
"loss": 0.2599, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.556701030927835, |
|
"grad_norm": 0.11215812047440889, |
|
"learning_rate": 1.2540193445858883e-05, |
|
"loss": 0.26, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.563573883161512, |
|
"grad_norm": 0.11029227457181179, |
|
"learning_rate": 1.2153456264671337e-05, |
|
"loss": 0.2529, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.5704467353951888, |
|
"grad_norm": 0.1095036155268631, |
|
"learning_rate": 1.1772390200040817e-05, |
|
"loss": 0.2503, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.5773195876288657, |
|
"grad_norm": 0.10487179029197675, |
|
"learning_rate": 1.139701985248055e-05, |
|
"loss": 0.2421, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.584192439862543, |
|
"grad_norm": 0.10299175954283599, |
|
"learning_rate": 1.1027369454805058e-05, |
|
"loss": 0.2502, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.59106529209622, |
|
"grad_norm": 0.10855354801856847, |
|
"learning_rate": 1.0663462870565411e-05, |
|
"loss": 0.2608, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.597938144329897, |
|
"grad_norm": 0.10315061729781284, |
|
"learning_rate": 1.0305323592509009e-05, |
|
"loss": 0.2528, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.604810996563574, |
|
"grad_norm": 0.10626039326093263, |
|
"learning_rate": 9.952974741062703e-06, |
|
"loss": 0.264, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.611683848797251, |
|
"grad_norm": 0.10745271675212126, |
|
"learning_rate": 9.606439062840256e-06, |
|
"loss": 0.2594, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.618556701030928, |
|
"grad_norm": 0.10974406713413064, |
|
"learning_rate": 9.265738929174051e-06, |
|
"loss": 0.2582, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.625429553264605, |
|
"grad_norm": 0.10537263197622973, |
|
"learning_rate": 8.93089633467058e-06, |
|
"loss": 0.2513, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.6323024054982818, |
|
"grad_norm": 0.10396472858049301, |
|
"learning_rate": 8.601932895790877e-06, |
|
"loss": 0.2583, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.6391752577319587, |
|
"grad_norm": 0.10276278086397593, |
|
"learning_rate": 8.278869849454718e-06, |
|
"loss": 0.2529, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.6460481099656357, |
|
"grad_norm": 0.10899885081403418, |
|
"learning_rate": 7.961728051669737e-06, |
|
"loss": 0.2637, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.6529209621993126, |
|
"grad_norm": 0.10582666731745977, |
|
"learning_rate": 7.650527976185173e-06, |
|
"loss": 0.2535, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.6597938144329896, |
|
"grad_norm": 0.10725397582281598, |
|
"learning_rate": 7.3452897131698564e-06, |
|
"loss": 0.2603, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.10749509280278831, |
|
"learning_rate": 7.046032967915483e-06, |
|
"loss": 0.2489, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.673539518900344, |
|
"grad_norm": 0.1087201328812968, |
|
"learning_rate": 6.75277705956443e-06, |
|
"loss": 0.2604, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.680412371134021, |
|
"grad_norm": 0.10783575017518213, |
|
"learning_rate": 6.465540919862456e-06, |
|
"loss": 0.2557, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.687285223367698, |
|
"grad_norm": 0.10758942906584705, |
|
"learning_rate": 6.184343091936751e-06, |
|
"loss": 0.2526, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.6941580756013748, |
|
"grad_norm": 0.10528296367950161, |
|
"learning_rate": 5.909201729098579e-06, |
|
"loss": 0.2487, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.7010309278350517, |
|
"grad_norm": 0.11256201944946768, |
|
"learning_rate": 5.640134593671598e-06, |
|
"loss": 0.2714, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.7079037800687287, |
|
"grad_norm": 0.10863241059875908, |
|
"learning_rate": 5.3771590558450265e-06, |
|
"loss": 0.2574, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.7147766323024056, |
|
"grad_norm": 0.10549349184272533, |
|
"learning_rate": 5.12029209255227e-06, |
|
"loss": 0.2554, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.7216494845360826, |
|
"grad_norm": 0.10847174602209737, |
|
"learning_rate": 4.869550286375091e-06, |
|
"loss": 0.268, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.7285223367697595, |
|
"grad_norm": 0.10488356613015705, |
|
"learning_rate": 4.624949824472858e-06, |
|
"loss": 0.2671, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.7353951890034365, |
|
"grad_norm": 0.10475860935600055, |
|
"learning_rate": 4.386506497537757e-06, |
|
"loss": 0.2514, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.7422680412371134, |
|
"grad_norm": 0.10597713121898167, |
|
"learning_rate": 4.154235698775277e-06, |
|
"loss": 0.2597, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.7491408934707904, |
|
"grad_norm": 0.10595800588015507, |
|
"learning_rate": 3.928152422910491e-06, |
|
"loss": 0.2602, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.7560137457044673, |
|
"grad_norm": 0.11154644583398933, |
|
"learning_rate": 3.7082712652200867e-06, |
|
"loss": 0.264, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.7628865979381443, |
|
"grad_norm": 0.10694150624625907, |
|
"learning_rate": 3.4946064205899965e-06, |
|
"loss": 0.25, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.7697594501718212, |
|
"grad_norm": 0.10318772901644747, |
|
"learning_rate": 3.287171682599255e-06, |
|
"loss": 0.2493, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.776632302405498, |
|
"grad_norm": 0.1061914310952031, |
|
"learning_rate": 3.085980442629288e-06, |
|
"loss": 0.2591, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.783505154639175, |
|
"grad_norm": 0.10446042630271005, |
|
"learning_rate": 2.8910456889995498e-06, |
|
"loss": 0.254, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.790378006872852, |
|
"grad_norm": 0.10518845803888338, |
|
"learning_rate": 2.7023800061289907e-06, |
|
"loss": 0.2533, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.797250859106529, |
|
"grad_norm": 0.10867396165520546, |
|
"learning_rate": 2.5199955737236104e-06, |
|
"loss": 0.2724, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.804123711340206, |
|
"grad_norm": 0.10694114528768675, |
|
"learning_rate": 2.3439041659902407e-06, |
|
"loss": 0.2564, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.810996563573883, |
|
"grad_norm": 0.10518421192748285, |
|
"learning_rate": 2.174117150876398e-06, |
|
"loss": 0.2609, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.81786941580756, |
|
"grad_norm": 0.10729647762632918, |
|
"learning_rate": 2.010645489336382e-06, |
|
"loss": 0.2587, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.824742268041237, |
|
"grad_norm": 0.10754314349393188, |
|
"learning_rate": 1.8534997346237093e-06, |
|
"loss": 0.2632, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.8316151202749142, |
|
"grad_norm": 0.10802289125290783, |
|
"learning_rate": 1.7026900316098215e-06, |
|
"loss": 0.2566, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.838487972508591, |
|
"grad_norm": 0.10931075718242897, |
|
"learning_rate": 1.5582261161291245e-06, |
|
"loss": 0.2626, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.845360824742268, |
|
"grad_norm": 0.10501355229769045, |
|
"learning_rate": 1.4201173143504888e-06, |
|
"loss": 0.2621, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.852233676975945, |
|
"grad_norm": 0.10753436065183695, |
|
"learning_rate": 1.2883725421752201e-06, |
|
"loss": 0.2475, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.859106529209622, |
|
"grad_norm": 0.10463768922989454, |
|
"learning_rate": 1.1630003046614323e-06, |
|
"loss": 0.2409, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.865979381443299, |
|
"grad_norm": 0.10543898222201153, |
|
"learning_rate": 1.0440086954749517e-06, |
|
"loss": 0.2505, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.872852233676976, |
|
"grad_norm": 0.10892369128650758, |
|
"learning_rate": 9.314053963669245e-07, |
|
"loss": 0.2456, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.879725085910653, |
|
"grad_norm": 0.1062255349453702, |
|
"learning_rate": 8.251976766777913e-07, |
|
"loss": 0.2649, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.88659793814433, |
|
"grad_norm": 0.10869941184854195, |
|
"learning_rate": 7.253923928680406e-07, |
|
"loss": 0.2559, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.893470790378007, |
|
"grad_norm": 0.1109190039810706, |
|
"learning_rate": 6.319959880756177e-07, |
|
"loss": 0.2529, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.9003436426116838, |
|
"grad_norm": 0.10656654333503314, |
|
"learning_rate": 5.450144916999134e-07, |
|
"loss": 0.2566, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.9072164948453607, |
|
"grad_norm": 0.10681708087523223, |
|
"learning_rate": 4.644535190125421e-07, |
|
"loss": 0.2526, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.9140893470790377, |
|
"grad_norm": 0.11101267026333346, |
|
"learning_rate": 3.903182707948649e-07, |
|
"loss": 0.2545, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.9209621993127146, |
|
"grad_norm": 0.10449251320574354, |
|
"learning_rate": 3.2261353300219176e-07, |
|
"loss": 0.2397, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.927835051546392, |
|
"grad_norm": 0.10614426653162999, |
|
"learning_rate": 2.613436764548505e-07, |
|
"loss": 0.2451, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.934707903780069, |
|
"grad_norm": 0.10344408619271417, |
|
"learning_rate": 2.0651265655603492e-07, |
|
"loss": 0.231, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.941580756013746, |
|
"grad_norm": 0.10751538816500814, |
|
"learning_rate": 1.5812401303639813e-07, |
|
"loss": 0.253, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.948453608247423, |
|
"grad_norm": 0.10902510633946422, |
|
"learning_rate": 1.1618086972559062e-07, |
|
"loss": 0.2534, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.9553264604811, |
|
"grad_norm": 0.10961740281781805, |
|
"learning_rate": 8.068593435055505e-08, |
|
"loss": 0.2611, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.9621993127147768, |
|
"grad_norm": 0.10667412761426893, |
|
"learning_rate": 5.164149836077714e-08, |
|
"loss": 0.2566, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.9690721649484537, |
|
"grad_norm": 0.10469145515742652, |
|
"learning_rate": 2.9049436780281825e-08, |
|
"loss": 0.245, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.9759450171821307, |
|
"grad_norm": 0.10866002746514136, |
|
"learning_rate": 1.2911208086663351e-08, |
|
"loss": 0.258, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.9828178694158076, |
|
"grad_norm": 0.10779386109678486, |
|
"learning_rate": 3.2278541168717646e-09, |
|
"loss": 0.2668, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.9896907216494846, |
|
"grad_norm": 0.11009984251752782, |
|
"learning_rate": 0.0, |
|
"loss": 0.2592, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.9896907216494846, |
|
"eval_loss": 0.29278308153152466, |
|
"eval_runtime": 27.5656, |
|
"eval_samples_per_second": 35.443, |
|
"eval_steps_per_second": 1.125, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.9896907216494846, |
|
"step": 435, |
|
"total_flos": 1.330526714897367e+17, |
|
"train_loss": 0.3336494640714821, |
|
"train_runtime": 4477.4048, |
|
"train_samples_per_second": 12.436, |
|
"train_steps_per_second": 0.097 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 435, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.330526714897367e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|