Luxe_4B / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.4636336426219696,
"eval_steps": 26,
"global_step": 260,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009577970667464832,
"grad_norm": 10.084560608592307,
"learning_rate": 1.7241379310344828e-07,
"loss": 1.579,
"step": 1
},
{
"epoch": 0.009577970667464832,
"eval_loss": 2.5250306129455566,
"eval_runtime": 107.4458,
"eval_samples_per_second": 13.16,
"eval_steps_per_second": 3.295,
"step": 1
},
{
"epoch": 0.019155941334929663,
"grad_norm": 8.306669565661105,
"learning_rate": 3.4482758620689656e-07,
"loss": 1.5724,
"step": 2
},
{
"epoch": 0.02873391200239449,
"grad_norm": 13.938049536284893,
"learning_rate": 5.172413793103449e-07,
"loss": 1.5871,
"step": 3
},
{
"epoch": 0.038311882669859326,
"grad_norm": 12.43456292626288,
"learning_rate": 6.896551724137931e-07,
"loss": 1.5681,
"step": 4
},
{
"epoch": 0.04788985333732416,
"grad_norm": 13.870879646573128,
"learning_rate": 8.620689655172415e-07,
"loss": 1.5744,
"step": 5
},
{
"epoch": 0.05746782400478898,
"grad_norm": 15.247654309196745,
"learning_rate": 1.0344827586206898e-06,
"loss": 1.5925,
"step": 6
},
{
"epoch": 0.06704579467225381,
"grad_norm": 15.680512101057806,
"learning_rate": 1.2068965517241381e-06,
"loss": 1.5704,
"step": 7
},
{
"epoch": 0.07662376533971865,
"grad_norm": 14.30414461091009,
"learning_rate": 1.3793103448275862e-06,
"loss": 1.5732,
"step": 8
},
{
"epoch": 0.08620173600718348,
"grad_norm": 11.033868746409794,
"learning_rate": 1.5517241379310346e-06,
"loss": 1.5325,
"step": 9
},
{
"epoch": 0.09577970667464832,
"grad_norm": 9.293155363204939,
"learning_rate": 1.724137931034483e-06,
"loss": 1.5525,
"step": 10
},
{
"epoch": 0.10535767734211314,
"grad_norm": 10.55909566144827,
"learning_rate": 1.896551724137931e-06,
"loss": 1.5283,
"step": 11
},
{
"epoch": 0.11493564800957796,
"grad_norm": 7.362528707126726,
"learning_rate": 2.0689655172413796e-06,
"loss": 1.5246,
"step": 12
},
{
"epoch": 0.1245136186770428,
"grad_norm": 7.368215078656617,
"learning_rate": 2.241379310344828e-06,
"loss": 1.5313,
"step": 13
},
{
"epoch": 0.13409158934450763,
"grad_norm": 6.065170717786516,
"learning_rate": 2.4137931034482762e-06,
"loss": 1.5027,
"step": 14
},
{
"epoch": 0.14366956001197245,
"grad_norm": 5.328528823161362,
"learning_rate": 2.5862068965517246e-06,
"loss": 1.481,
"step": 15
},
{
"epoch": 0.1532475306794373,
"grad_norm": 4.425999183762783,
"learning_rate": 2.7586206896551725e-06,
"loss": 1.4494,
"step": 16
},
{
"epoch": 0.16282550134690213,
"grad_norm": 2.3104583142533675,
"learning_rate": 2.931034482758621e-06,
"loss": 1.4645,
"step": 17
},
{
"epoch": 0.17240347201436695,
"grad_norm": 1.595394748941364,
"learning_rate": 3.103448275862069e-06,
"loss": 1.4619,
"step": 18
},
{
"epoch": 0.18198144268183178,
"grad_norm": 1.2488731383034972,
"learning_rate": 3.2758620689655175e-06,
"loss": 1.4641,
"step": 19
},
{
"epoch": 0.19155941334929663,
"grad_norm": 1.5772662843657,
"learning_rate": 3.448275862068966e-06,
"loss": 1.4029,
"step": 20
},
{
"epoch": 0.20113738401676146,
"grad_norm": 2.556424014112241,
"learning_rate": 3.620689655172414e-06,
"loss": 1.4453,
"step": 21
},
{
"epoch": 0.21071535468422628,
"grad_norm": 2.0581192872654483,
"learning_rate": 3.793103448275862e-06,
"loss": 1.4135,
"step": 22
},
{
"epoch": 0.2202933253516911,
"grad_norm": 1.6613052346475512,
"learning_rate": 3.96551724137931e-06,
"loss": 1.4336,
"step": 23
},
{
"epoch": 0.22987129601915593,
"grad_norm": 1.2670811596205898,
"learning_rate": 4.137931034482759e-06,
"loss": 1.3898,
"step": 24
},
{
"epoch": 0.23944926668662078,
"grad_norm": 1.4594637064715403,
"learning_rate": 4.310344827586207e-06,
"loss": 1.392,
"step": 25
},
{
"epoch": 0.2490272373540856,
"grad_norm": 1.6947460151500366,
"learning_rate": 4.482758620689656e-06,
"loss": 1.3967,
"step": 26
},
{
"epoch": 0.2490272373540856,
"eval_loss": 2.319483518600464,
"eval_runtime": 107.1009,
"eval_samples_per_second": 13.202,
"eval_steps_per_second": 3.305,
"step": 26
},
{
"epoch": 0.25860520802155046,
"grad_norm": 1.4794556532045955,
"learning_rate": 4.655172413793104e-06,
"loss": 1.3882,
"step": 27
},
{
"epoch": 0.26818317868901526,
"grad_norm": 1.275878657564078,
"learning_rate": 4.8275862068965525e-06,
"loss": 1.4152,
"step": 28
},
{
"epoch": 0.2777611493564801,
"grad_norm": 1.0273810925450593,
"learning_rate": 5e-06,
"loss": 1.3897,
"step": 29
},
{
"epoch": 0.2873391200239449,
"grad_norm": 1.3658855156304837,
"learning_rate": 4.9998459603839726e-06,
"loss": 1.3539,
"step": 30
},
{
"epoch": 0.29691709069140976,
"grad_norm": 1.160650318212732,
"learning_rate": 4.9993838605184505e-06,
"loss": 1.3461,
"step": 31
},
{
"epoch": 0.3064950613588746,
"grad_norm": 0.9334705830010439,
"learning_rate": 4.998613757348784e-06,
"loss": 1.3575,
"step": 32
},
{
"epoch": 0.3160730320263394,
"grad_norm": 1.0269221075865582,
"learning_rate": 4.99753574577609e-06,
"loss": 1.3503,
"step": 33
},
{
"epoch": 0.32565100269380426,
"grad_norm": 0.9951200682896573,
"learning_rate": 4.996149958645559e-06,
"loss": 1.3718,
"step": 34
},
{
"epoch": 0.33522897336126906,
"grad_norm": 0.8568405246328175,
"learning_rate": 4.994456566730085e-06,
"loss": 1.3515,
"step": 35
},
{
"epoch": 0.3448069440287339,
"grad_norm": 0.8752926728569858,
"learning_rate": 4.992455778709222e-06,
"loss": 1.3571,
"step": 36
},
{
"epoch": 0.35438491469619876,
"grad_norm": 0.9195979878575848,
"learning_rate": 4.990147841143462e-06,
"loss": 1.3335,
"step": 37
},
{
"epoch": 0.36396288536366356,
"grad_norm": 0.8848215909446233,
"learning_rate": 4.98753303844386e-06,
"loss": 1.3093,
"step": 38
},
{
"epoch": 0.3735408560311284,
"grad_norm": 0.8261733197817335,
"learning_rate": 4.984611692836979e-06,
"loss": 1.3376,
"step": 39
},
{
"epoch": 0.38311882669859326,
"grad_norm": 0.7643849735934586,
"learning_rate": 4.981384164325184e-06,
"loss": 1.3172,
"step": 40
},
{
"epoch": 0.39269679736605806,
"grad_norm": 0.8302859072234411,
"learning_rate": 4.977850850642275e-06,
"loss": 1.352,
"step": 41
},
{
"epoch": 0.4022747680335229,
"grad_norm": 0.8019795318623388,
"learning_rate": 4.97401218720448e-06,
"loss": 1.3271,
"step": 42
},
{
"epoch": 0.4118527387009877,
"grad_norm": 0.7856123291749388,
"learning_rate": 4.969868647056793e-06,
"loss": 1.3302,
"step": 43
},
{
"epoch": 0.42143070936845256,
"grad_norm": 0.7212471859830762,
"learning_rate": 4.965420740814679e-06,
"loss": 1.3215,
"step": 44
},
{
"epoch": 0.4310086800359174,
"grad_norm": 0.7660292329930958,
"learning_rate": 4.960669016601155e-06,
"loss": 1.3435,
"step": 45
},
{
"epoch": 0.4405866507033822,
"grad_norm": 0.7247198414191649,
"learning_rate": 4.95561405997924e-06,
"loss": 1.3163,
"step": 46
},
{
"epoch": 0.45016462137084706,
"grad_norm": 0.7419070442778594,
"learning_rate": 4.950256493879795e-06,
"loss": 1.3209,
"step": 47
},
{
"epoch": 0.45974259203831186,
"grad_norm": 0.7024643859790418,
"learning_rate": 4.94459697852476e-06,
"loss": 1.2684,
"step": 48
},
{
"epoch": 0.4693205627057767,
"grad_norm": 0.7208397492740805,
"learning_rate": 4.938636211345792e-06,
"loss": 1.2818,
"step": 49
},
{
"epoch": 0.47889853337324156,
"grad_norm": 0.7159719760236076,
"learning_rate": 4.932374926898321e-06,
"loss": 1.3094,
"step": 50
},
{
"epoch": 0.48847650404070636,
"grad_norm": 0.7100286359014379,
"learning_rate": 4.92581389677103e-06,
"loss": 1.3177,
"step": 51
},
{
"epoch": 0.4980544747081712,
"grad_norm": 0.664062518173294,
"learning_rate": 4.918953929490768e-06,
"loss": 1.2868,
"step": 52
},
{
"epoch": 0.4980544747081712,
"eval_loss": 2.239407777786255,
"eval_runtime": 107.263,
"eval_samples_per_second": 13.183,
"eval_steps_per_second": 3.3,
"step": 52
},
{
"epoch": 0.507632445375636,
"grad_norm": 0.8658636506450442,
"learning_rate": 4.911795870422916e-06,
"loss": 1.2904,
"step": 53
},
{
"epoch": 0.5172104160431009,
"grad_norm": 0.6715121564275828,
"learning_rate": 4.904340601667208e-06,
"loss": 1.326,
"step": 54
},
{
"epoch": 0.5267883867105657,
"grad_norm": 0.8518222183690225,
"learning_rate": 4.896589041949036e-06,
"loss": 1.2757,
"step": 55
},
{
"epoch": 0.5363663573780305,
"grad_norm": 0.6780934729098863,
"learning_rate": 4.888542146506224e-06,
"loss": 1.3027,
"step": 56
},
{
"epoch": 0.5459443280454953,
"grad_norm": 0.8407110074770763,
"learning_rate": 4.880200906971321e-06,
"loss": 1.2965,
"step": 57
},
{
"epoch": 0.5555222987129602,
"grad_norm": 0.654501814705368,
"learning_rate": 4.8715663512493924e-06,
"loss": 1.2764,
"step": 58
},
{
"epoch": 0.565100269380425,
"grad_norm": 0.7722805216190872,
"learning_rate": 4.8626395433913595e-06,
"loss": 1.2799,
"step": 59
},
{
"epoch": 0.5746782400478898,
"grad_norm": 0.6575468000608066,
"learning_rate": 4.853421583462866e-06,
"loss": 1.3009,
"step": 60
},
{
"epoch": 0.5842562107153547,
"grad_norm": 0.6919845481307941,
"learning_rate": 4.8439136074087165e-06,
"loss": 1.2885,
"step": 61
},
{
"epoch": 0.5938341813828195,
"grad_norm": 0.652693683934317,
"learning_rate": 4.834116786912897e-06,
"loss": 1.2564,
"step": 62
},
{
"epoch": 0.6034121520502843,
"grad_norm": 0.6684643483116979,
"learning_rate": 4.82403232925418e-06,
"loss": 1.278,
"step": 63
},
{
"epoch": 0.6129901227177492,
"grad_norm": 0.6735443956477082,
"learning_rate": 4.813661477157355e-06,
"loss": 1.2895,
"step": 64
},
{
"epoch": 0.622568093385214,
"grad_norm": 0.6574494336528988,
"learning_rate": 4.803005508640083e-06,
"loss": 1.2481,
"step": 65
},
{
"epoch": 0.6321460640526788,
"grad_norm": 0.7061153031772025,
"learning_rate": 4.7920657368554e-06,
"loss": 1.3023,
"step": 66
},
{
"epoch": 0.6417240347201437,
"grad_norm": 0.6609850544647713,
"learning_rate": 4.780843509929905e-06,
"loss": 1.2619,
"step": 67
},
{
"epoch": 0.6513020053876085,
"grad_norm": 0.6958172104041147,
"learning_rate": 4.769340210797618e-06,
"loss": 1.2633,
"step": 68
},
{
"epoch": 0.6608799760550733,
"grad_norm": 0.6532872905224688,
"learning_rate": 4.757557257029563e-06,
"loss": 1.2581,
"step": 69
},
{
"epoch": 0.6704579467225381,
"grad_norm": 0.693714390508834,
"learning_rate": 4.745496100659083e-06,
"loss": 1.2499,
"step": 70
},
{
"epoch": 0.680035917390003,
"grad_norm": 0.6749996898449282,
"learning_rate": 4.733158228002891e-06,
"loss": 1.2536,
"step": 71
},
{
"epoch": 0.6896138880574678,
"grad_norm": 0.6753612400656019,
"learning_rate": 4.720545159477921e-06,
"loss": 1.2605,
"step": 72
},
{
"epoch": 0.6991918587249326,
"grad_norm": 0.6950386791904168,
"learning_rate": 4.707658449413961e-06,
"loss": 1.2489,
"step": 73
},
{
"epoch": 0.7087698293923975,
"grad_norm": 0.6396387112266337,
"learning_rate": 4.694499685862106e-06,
"loss": 1.264,
"step": 74
},
{
"epoch": 0.7183478000598623,
"grad_norm": 0.6809655013846588,
"learning_rate": 4.681070490399064e-06,
"loss": 1.2477,
"step": 75
},
{
"epoch": 0.7279257707273271,
"grad_norm": 0.6814836664342683,
"learning_rate": 4.667372517927323e-06,
"loss": 1.2349,
"step": 76
},
{
"epoch": 0.737503741394792,
"grad_norm": 0.6502075268222723,
"learning_rate": 4.653407456471222e-06,
"loss": 1.243,
"step": 77
},
{
"epoch": 0.7470817120622568,
"grad_norm": 0.6579341200451629,
"learning_rate": 4.639177026968924e-06,
"loss": 1.2549,
"step": 78
},
{
"epoch": 0.7470817120622568,
"eval_loss": 2.2078425884246826,
"eval_runtime": 107.0636,
"eval_samples_per_second": 13.207,
"eval_steps_per_second": 3.306,
"step": 78
},
{
"epoch": 0.7566596827297216,
"grad_norm": 0.6264741505964025,
"learning_rate": 4.624682983060346e-06,
"loss": 1.2903,
"step": 79
},
{
"epoch": 0.7662376533971865,
"grad_norm": 0.6533395420906253,
"learning_rate": 4.609927110871053e-06,
"loss": 1.2371,
"step": 80
},
{
"epoch": 0.7758156240646513,
"grad_norm": 0.6366166912748572,
"learning_rate": 4.594911228792156e-06,
"loss": 1.2554,
"step": 81
},
{
"epoch": 0.7853935947321161,
"grad_norm": 0.6435835690637465,
"learning_rate": 4.579637187256222e-06,
"loss": 1.2855,
"step": 82
},
{
"epoch": 0.7949715653995809,
"grad_norm": 0.6410872090826751,
"learning_rate": 4.564106868509246e-06,
"loss": 1.232,
"step": 83
},
{
"epoch": 0.8045495360670458,
"grad_norm": 0.6260242741257913,
"learning_rate": 4.5483221863786965e-06,
"loss": 1.2458,
"step": 84
},
{
"epoch": 0.8141275067345106,
"grad_norm": 0.6588265965096135,
"learning_rate": 4.5322850860376744e-06,
"loss": 1.2474,
"step": 85
},
{
"epoch": 0.8237054774019754,
"grad_norm": 0.6372013969893753,
"learning_rate": 4.515997543765202e-06,
"loss": 1.2563,
"step": 86
},
{
"epoch": 0.8332834480694403,
"grad_norm": 0.683356686747451,
"learning_rate": 4.499461566702685e-06,
"loss": 1.2447,
"step": 87
},
{
"epoch": 0.8428614187369051,
"grad_norm": 0.6520958114219059,
"learning_rate": 4.48267919260657e-06,
"loss": 1.2243,
"step": 88
},
{
"epoch": 0.8524393894043699,
"grad_norm": 0.6468861797594448,
"learning_rate": 4.465652489597226e-06,
"loss": 1.2254,
"step": 89
},
{
"epoch": 0.8620173600718348,
"grad_norm": 0.6675355862176291,
"learning_rate": 4.4483835559040885e-06,
"loss": 1.2116,
"step": 90
},
{
"epoch": 0.8715953307392996,
"grad_norm": 0.6318507194646,
"learning_rate": 4.430874519607089e-06,
"loss": 1.2634,
"step": 91
},
{
"epoch": 0.8811733014067644,
"grad_norm": 0.6496099541936005,
"learning_rate": 4.413127538374411e-06,
"loss": 1.2129,
"step": 92
},
{
"epoch": 0.8907512720742293,
"grad_norm": 0.6026396711785842,
"learning_rate": 4.395144799196593e-06,
"loss": 1.2483,
"step": 93
},
{
"epoch": 0.9003292427416941,
"grad_norm": 0.6709684350468395,
"learning_rate": 4.376928518117028e-06,
"loss": 1.2193,
"step": 94
},
{
"epoch": 0.9099072134091589,
"grad_norm": 0.6237262552476821,
"learning_rate": 4.358480939958867e-06,
"loss": 1.218,
"step": 95
},
{
"epoch": 0.9194851840766237,
"grad_norm": 0.6582242790059232,
"learning_rate": 4.339804338048397e-06,
"loss": 1.229,
"step": 96
},
{
"epoch": 0.9290631547440886,
"grad_norm": 0.6235719312223321,
"learning_rate": 4.320901013934887e-06,
"loss": 1.2098,
"step": 97
},
{
"epoch": 0.9386411254115534,
"grad_norm": 0.6295163336318428,
"learning_rate": 4.301773297106968e-06,
"loss": 1.205,
"step": 98
},
{
"epoch": 0.9482190960790182,
"grad_norm": 0.6250959313071772,
"learning_rate": 4.282423544705564e-06,
"loss": 1.2054,
"step": 99
},
{
"epoch": 0.9577970667464831,
"grad_norm": 0.6086898991547662,
"learning_rate": 4.262854141233419e-06,
"loss": 1.2118,
"step": 100
},
{
"epoch": 0.9673750374139479,
"grad_norm": 0.5764067645719498,
"learning_rate": 4.243067498261251e-06,
"loss": 1.2372,
"step": 101
},
{
"epoch": 0.9769530080814127,
"grad_norm": 0.6406315852737573,
"learning_rate": 4.223066054130568e-06,
"loss": 1.2251,
"step": 102
},
{
"epoch": 0.9865309787488776,
"grad_norm": 0.5834984455673559,
"learning_rate": 4.2028522736531895e-06,
"loss": 1.2258,
"step": 103
},
{
"epoch": 0.9961089494163424,
"grad_norm": 0.5911139350878512,
"learning_rate": 4.182428647807503e-06,
"loss": 1.2286,
"step": 104
},
{
"epoch": 0.9961089494163424,
"eval_loss": 2.184576988220215,
"eval_runtime": 107.5576,
"eval_samples_per_second": 13.146,
"eval_steps_per_second": 3.291,
"step": 104
},
{
"epoch": 1.0056869200838072,
"grad_norm": 0.6299976497698655,
"learning_rate": 4.161797693431493e-06,
"loss": 1.2383,
"step": 105
},
{
"epoch": 1.002095181083508,
"grad_norm": 0.5986176560633782,
"learning_rate": 4.140961952912594e-06,
"loss": 1.2182,
"step": 106
},
{
"epoch": 1.0116731517509727,
"grad_norm": 0.7138997909374802,
"learning_rate": 4.11992399387438e-06,
"loss": 1.1894,
"step": 107
},
{
"epoch": 1.0212511224184375,
"grad_norm": 0.6431525283411005,
"learning_rate": 4.098686408860157e-06,
"loss": 1.1741,
"step": 108
},
{
"epoch": 1.0308290930859023,
"grad_norm": 0.7490910983392529,
"learning_rate": 4.077251815013477e-06,
"loss": 1.1849,
"step": 109
},
{
"epoch": 1.0404070637533673,
"grad_norm": 0.6667698353299697,
"learning_rate": 4.055622853755627e-06,
"loss": 1.1833,
"step": 110
},
{
"epoch": 1.0499850344208321,
"grad_norm": 0.7240102351414811,
"learning_rate": 4.033802190460114e-06,
"loss": 1.1915,
"step": 111
},
{
"epoch": 1.059563005088297,
"grad_norm": 0.6281393232743739,
"learning_rate": 4.011792514124217e-06,
"loss": 1.1557,
"step": 112
},
{
"epoch": 1.0691409757557617,
"grad_norm": 0.6735415717178005,
"learning_rate": 3.989596537037608e-06,
"loss": 1.1878,
"step": 113
},
{
"epoch": 1.0787189464232265,
"grad_norm": 0.5939146155666697,
"learning_rate": 3.967216994448116e-06,
"loss": 1.1639,
"step": 114
},
{
"epoch": 1.0882969170906913,
"grad_norm": 0.6932505538102671,
"learning_rate": 3.9446566442246615e-06,
"loss": 1.1759,
"step": 115
},
{
"epoch": 1.0978748877581563,
"grad_norm": 0.5763908483496408,
"learning_rate": 3.921918266517392e-06,
"loss": 1.1781,
"step": 116
},
{
"epoch": 1.1074528584256211,
"grad_norm": 0.6818836608860367,
"learning_rate": 3.899004663415083e-06,
"loss": 1.1869,
"step": 117
},
{
"epoch": 1.117030829093086,
"grad_norm": 0.5998154432302447,
"learning_rate": 3.875918658599837e-06,
"loss": 1.1692,
"step": 118
},
{
"epoch": 1.1266087997605507,
"grad_norm": 0.6596200288243683,
"learning_rate": 3.852663096999104e-06,
"loss": 1.2059,
"step": 119
},
{
"epoch": 1.1361867704280155,
"grad_norm": 0.5918812335768482,
"learning_rate": 3.829240844435109e-06,
"loss": 1.1798,
"step": 120
},
{
"epoch": 1.1457647410954803,
"grad_norm": 0.6232580849345692,
"learning_rate": 3.8056547872716865e-06,
"loss": 1.1517,
"step": 121
},
{
"epoch": 1.1553427117629451,
"grad_norm": 0.5903843042319051,
"learning_rate": 3.7819078320585865e-06,
"loss": 1.1906,
"step": 122
},
{
"epoch": 1.1649206824304101,
"grad_norm": 0.5896678764206408,
"learning_rate": 3.7580029051732992e-06,
"loss": 1.1832,
"step": 123
},
{
"epoch": 1.174498653097875,
"grad_norm": 0.5666656027289849,
"learning_rate": 3.733942952460432e-06,
"loss": 1.1911,
"step": 124
},
{
"epoch": 1.1840766237653397,
"grad_norm": 0.565358825737842,
"learning_rate": 3.7097309388686865e-06,
"loss": 1.1945,
"step": 125
},
{
"epoch": 1.1936545944328045,
"grad_norm": 0.645159266559964,
"learning_rate": 3.6853698480854853e-06,
"loss": 1.1988,
"step": 126
},
{
"epoch": 1.2032325651002693,
"grad_norm": 0.5661828443152349,
"learning_rate": 3.660862682169283e-06,
"loss": 1.1683,
"step": 127
},
{
"epoch": 1.2128105357677341,
"grad_norm": 0.5590652900384634,
"learning_rate": 3.636212461179623e-06,
"loss": 1.1401,
"step": 128
},
{
"epoch": 1.2223885064351991,
"grad_norm": 0.5772830186331369,
"learning_rate": 3.6114222228049657e-06,
"loss": 1.1457,
"step": 129
},
{
"epoch": 1.231966477102664,
"grad_norm": 0.5638028162671672,
"learning_rate": 3.5864950219883514e-06,
"loss": 1.1599,
"step": 130
},
{
"epoch": 1.231966477102664,
"eval_loss": 2.181441068649292,
"eval_runtime": 107.4543,
"eval_samples_per_second": 13.159,
"eval_steps_per_second": 3.294,
"step": 130
},
{
"epoch": 1.2415444477701287,
"grad_norm": 0.5674264221381613,
"learning_rate": 3.561433930550934e-06,
"loss": 1.1439,
"step": 131
},
{
"epoch": 1.2511224184375935,
"grad_norm": 0.5548457286136358,
"learning_rate": 3.536242036813436e-06,
"loss": 1.1455,
"step": 132
},
{
"epoch": 1.2607003891050583,
"grad_norm": 0.5681860545302818,
"learning_rate": 3.510922445215568e-06,
"loss": 1.1619,
"step": 133
},
{
"epoch": 1.2702783597725231,
"grad_norm": 0.5189655726956113,
"learning_rate": 3.4854782759334625e-06,
"loss": 1.1647,
"step": 134
},
{
"epoch": 1.279856330439988,
"grad_norm": 0.5482759127528988,
"learning_rate": 3.4599126644951758e-06,
"loss": 1.1963,
"step": 135
},
{
"epoch": 1.289434301107453,
"grad_norm": 0.5545710145438582,
"learning_rate": 3.4342287613942804e-06,
"loss": 1.1673,
"step": 136
},
{
"epoch": 1.2990122717749177,
"grad_norm": 0.5616911560631516,
"learning_rate": 3.4084297317016353e-06,
"loss": 1.1482,
"step": 137
},
{
"epoch": 1.3085902424423825,
"grad_norm": 0.5429625311889626,
"learning_rate": 3.3825187546753426e-06,
"loss": 1.1459,
"step": 138
},
{
"epoch": 1.3181682131098473,
"grad_norm": 0.5775738090552808,
"learning_rate": 3.3564990233689632e-06,
"loss": 1.1744,
"step": 139
},
{
"epoch": 1.3277461837773121,
"grad_norm": 0.5422962267277087,
"learning_rate": 3.330373744238033e-06,
"loss": 1.1796,
"step": 140
},
{
"epoch": 1.3373241544447771,
"grad_norm": 0.5383626495155892,
"learning_rate": 3.3041461367449256e-06,
"loss": 1.1646,
"step": 141
},
{
"epoch": 1.346902125112242,
"grad_norm": 0.5588657340470299,
"learning_rate": 3.2778194329621104e-06,
"loss": 1.1842,
"step": 142
},
{
"epoch": 1.3564800957797067,
"grad_norm": 0.5198196148369068,
"learning_rate": 3.2513968771738606e-06,
"loss": 1.1708,
"step": 143
},
{
"epoch": 1.3660580664471715,
"grad_norm": 0.5453371169769571,
"learning_rate": 3.224881725476456e-06,
"loss": 1.1636,
"step": 144
},
{
"epoch": 1.3756360371146363,
"grad_norm": 0.5692897944097868,
"learning_rate": 3.198277245376924e-06,
"loss": 1.1273,
"step": 145
},
{
"epoch": 1.3852140077821011,
"grad_norm": 0.5423704486470122,
"learning_rate": 3.1715867153903844e-06,
"loss": 1.1405,
"step": 146
},
{
"epoch": 1.394791978449566,
"grad_norm": 0.5819177408649716,
"learning_rate": 3.144813424636031e-06,
"loss": 1.1665,
"step": 147
},
{
"epoch": 1.4043699491170307,
"grad_norm": 0.554870749454361,
"learning_rate": 3.1179606724318052e-06,
"loss": 1.1872,
"step": 148
},
{
"epoch": 1.4139479197844955,
"grad_norm": 0.5493659769746441,
"learning_rate": 3.091031767887817e-06,
"loss": 1.1906,
"step": 149
},
{
"epoch": 1.4235258904519605,
"grad_norm": 0.6008378552179591,
"learning_rate": 3.0640300294985613e-06,
"loss": 1.1635,
"step": 150
},
{
"epoch": 1.4331038611194253,
"grad_norm": 0.5078261653762177,
"learning_rate": 3.036958784733967e-06,
"loss": 1.1438,
"step": 151
},
{
"epoch": 1.4426818317868901,
"grad_norm": 0.5559592542323409,
"learning_rate": 3.0098213696293542e-06,
"loss": 1.1642,
"step": 152
},
{
"epoch": 1.452259802454355,
"grad_norm": 0.5461821050739424,
"learning_rate": 2.982621128374325e-06,
"loss": 1.1725,
"step": 153
},
{
"epoch": 1.46183777312182,
"grad_norm": 0.5412862095154186,
"learning_rate": 2.9553614129006543e-06,
"loss": 1.1654,
"step": 154
},
{
"epoch": 1.4714157437892847,
"grad_norm": 0.5658659296771973,
"learning_rate": 2.9280455824692255e-06,
"loss": 1.1655,
"step": 155
},
{
"epoch": 1.4809937144567495,
"grad_norm": 0.5525850336445564,
"learning_rate": 2.9006770032560637e-06,
"loss": 1.1577,
"step": 156
},
{
"epoch": 1.4809937144567495,
"eval_loss": 2.1755869388580322,
"eval_runtime": 107.2159,
"eval_samples_per_second": 13.188,
"eval_steps_per_second": 3.302,
"step": 156
},
{
"epoch": 1.4905716851242143,
"grad_norm": 0.5710362202768258,
"learning_rate": 2.8732590479375167e-06,
"loss": 1.1595,
"step": 157
},
{
"epoch": 1.5001496557916791,
"grad_norm": 0.5369626897696785,
"learning_rate": 2.8457950952746293e-06,
"loss": 1.1622,
"step": 158
},
{
"epoch": 1.509727626459144,
"grad_norm": 0.5194143574454793,
"learning_rate": 2.8182885296967833e-06,
"loss": 1.1313,
"step": 159
},
{
"epoch": 1.5193055971266087,
"grad_norm": 0.5220817246963333,
"learning_rate": 2.7907427408846156e-06,
"loss": 1.1493,
"step": 160
},
{
"epoch": 1.5288835677940735,
"grad_norm": 0.5307538609855902,
"learning_rate": 2.763161123352314e-06,
"loss": 1.1571,
"step": 161
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.5133921578064818,
"learning_rate": 2.735547076029296e-06,
"loss": 1.1398,
"step": 162
},
{
"epoch": 1.5480395091290033,
"grad_norm": 0.528392253063443,
"learning_rate": 2.7079040018413586e-06,
"loss": 1.169,
"step": 163
},
{
"epoch": 1.5576174797964681,
"grad_norm": 0.5033775123091357,
"learning_rate": 2.6802353072913307e-06,
"loss": 1.1396,
"step": 164
},
{
"epoch": 1.567195450463933,
"grad_norm": 0.5429413779707357,
"learning_rate": 2.6525444020392794e-06,
"loss": 1.1558,
"step": 165
},
{
"epoch": 1.5767734211313977,
"grad_norm": 0.5391198899526514,
"learning_rate": 2.6248346984823325e-06,
"loss": 1.1584,
"step": 166
},
{
"epoch": 1.5863513917988628,
"grad_norm": 0.5237711725405991,
"learning_rate": 2.5971096113341692e-06,
"loss": 1.1399,
"step": 167
},
{
"epoch": 1.5959293624663276,
"grad_norm": 0.522431379990406,
"learning_rate": 2.5693725572042135e-06,
"loss": 1.146,
"step": 168
},
{
"epoch": 1.6055073331337923,
"grad_norm": 0.540451111257001,
"learning_rate": 2.5416269541765963e-06,
"loss": 1.1347,
"step": 169
},
{
"epoch": 1.6150853038012571,
"grad_norm": 0.542474309771266,
"learning_rate": 2.5138762213889493e-06,
"loss": 1.1507,
"step": 170
},
{
"epoch": 1.624663274468722,
"grad_norm": 0.5339716680549861,
"learning_rate": 2.486123778611051e-06,
"loss": 1.1428,
"step": 171
},
{
"epoch": 1.6342412451361867,
"grad_norm": 0.5194346219437855,
"learning_rate": 2.458373045823404e-06,
"loss": 1.1717,
"step": 172
},
{
"epoch": 1.6438192158036515,
"grad_norm": 0.5486922902738444,
"learning_rate": 2.4306274427957878e-06,
"loss": 1.1405,
"step": 173
},
{
"epoch": 1.6533971864711163,
"grad_norm": 0.5364703724723029,
"learning_rate": 2.402890388665831e-06,
"loss": 1.1397,
"step": 174
},
{
"epoch": 1.6629751571385811,
"grad_norm": 0.5151838009534813,
"learning_rate": 2.375165301517668e-06,
"loss": 1.1625,
"step": 175
},
{
"epoch": 1.6725531278060461,
"grad_norm": 0.5387178228054901,
"learning_rate": 2.3474555979607214e-06,
"loss": 1.1586,
"step": 176
},
{
"epoch": 1.682131098473511,
"grad_norm": 0.5264984610535657,
"learning_rate": 2.3197646927086697e-06,
"loss": 1.1654,
"step": 177
},
{
"epoch": 1.6917090691409757,
"grad_norm": 0.5272357155280125,
"learning_rate": 2.2920959981586426e-06,
"loss": 1.1934,
"step": 178
},
{
"epoch": 1.7012870398084405,
"grad_norm": 0.5252339989768573,
"learning_rate": 2.2644529239707054e-06,
"loss": 1.1426,
"step": 179
},
{
"epoch": 1.7108650104759056,
"grad_norm": 0.4974185735061034,
"learning_rate": 2.2368388766476875e-06,
"loss": 1.1597,
"step": 180
},
{
"epoch": 1.7204429811433704,
"grad_norm": 0.5361098970394095,
"learning_rate": 2.2092572591153843e-06,
"loss": 1.1637,
"step": 181
},
{
"epoch": 1.7300209518108352,
"grad_norm": 0.5305009042993176,
"learning_rate": 2.1817114703032176e-06,
"loss": 1.1637,
"step": 182
},
{
"epoch": 1.7300209518108352,
"eval_loss": 2.1710658073425293,
"eval_runtime": 107.1212,
"eval_samples_per_second": 13.2,
"eval_steps_per_second": 3.305,
"step": 182
},
{
"epoch": 1.7395989224783,
"grad_norm": 0.5012187069773779,
"learning_rate": 2.154204904725371e-06,
"loss": 1.1447,
"step": 183
},
{
"epoch": 1.7491768931457647,
"grad_norm": 0.5763812037469009,
"learning_rate": 2.126740952062484e-06,
"loss": 1.1565,
"step": 184
},
{
"epoch": 1.7587548638132295,
"grad_norm": 0.5129804478325861,
"learning_rate": 2.099322996743936e-06,
"loss": 1.1798,
"step": 185
},
{
"epoch": 1.7683328344806943,
"grad_norm": 0.5107704085635135,
"learning_rate": 2.0719544175307754e-06,
"loss": 1.1486,
"step": 186
},
{
"epoch": 1.7779108051481591,
"grad_norm": 0.5225266432128085,
"learning_rate": 2.044638587099347e-06,
"loss": 1.1457,
"step": 187
},
{
"epoch": 1.787488775815624,
"grad_norm": 0.48553711881118367,
"learning_rate": 2.0173788716256758e-06,
"loss": 1.1557,
"step": 188
},
{
"epoch": 1.797066746483089,
"grad_norm": 0.5155245524580911,
"learning_rate": 1.9901786303706466e-06,
"loss": 1.1667,
"step": 189
},
{
"epoch": 1.8066447171505537,
"grad_norm": 0.5394238331941211,
"learning_rate": 1.9630412152660333e-06,
"loss": 1.1639,
"step": 190
},
{
"epoch": 1.8162226878180185,
"grad_norm": 0.5208012650775928,
"learning_rate": 1.93596997050144e-06,
"loss": 1.167,
"step": 191
},
{
"epoch": 1.8258006584854833,
"grad_norm": 0.5084683728452081,
"learning_rate": 1.9089682321121834e-06,
"loss": 1.146,
"step": 192
},
{
"epoch": 1.8353786291529484,
"grad_norm": 0.5107216674575125,
"learning_rate": 1.8820393275681954e-06,
"loss": 1.1299,
"step": 193
},
{
"epoch": 1.8449565998204132,
"grad_norm": 0.5037220655522233,
"learning_rate": 1.8551865753639692e-06,
"loss": 1.1705,
"step": 194
},
{
"epoch": 1.854534570487878,
"grad_norm": 0.5081083073272432,
"learning_rate": 1.8284132846096164e-06,
"loss": 1.1232,
"step": 195
},
{
"epoch": 1.8641125411553428,
"grad_norm": 0.4960779996118519,
"learning_rate": 1.801722754623077e-06,
"loss": 1.1356,
"step": 196
},
{
"epoch": 1.8736905118228075,
"grad_norm": 0.5194537399766056,
"learning_rate": 1.775118274523545e-06,
"loss": 1.1321,
"step": 197
},
{
"epoch": 1.8832684824902723,
"grad_norm": 0.5149057994299137,
"learning_rate": 1.74860312282614e-06,
"loss": 1.1306,
"step": 198
},
{
"epoch": 1.8928464531577371,
"grad_norm": 0.5061127962699723,
"learning_rate": 1.72218056703789e-06,
"loss": 1.1302,
"step": 199
},
{
"epoch": 1.902424423825202,
"grad_norm": 0.49704736224795454,
"learning_rate": 1.6958538632550753e-06,
"loss": 1.1479,
"step": 200
},
{
"epoch": 1.9120023944926667,
"grad_norm": 0.4976492539596855,
"learning_rate": 1.6696262557619677e-06,
"loss": 1.135,
"step": 201
},
{
"epoch": 1.9215803651601315,
"grad_norm": 0.5438558597014863,
"learning_rate": 1.6435009766310372e-06,
"loss": 1.1677,
"step": 202
},
{
"epoch": 1.9311583358275966,
"grad_norm": 0.49386649254244525,
"learning_rate": 1.6174812453246582e-06,
"loss": 1.1396,
"step": 203
},
{
"epoch": 1.9407363064950613,
"grad_norm": 0.5039832884638089,
"learning_rate": 1.5915702682983657e-06,
"loss": 1.1857,
"step": 204
},
{
"epoch": 1.9503142771625261,
"grad_norm": 0.4892382263387271,
"learning_rate": 1.5657712386057202e-06,
"loss": 1.15,
"step": 205
},
{
"epoch": 1.9598922478299912,
"grad_norm": 0.5084631284159544,
"learning_rate": 1.5400873355048248e-06,
"loss": 1.1572,
"step": 206
},
{
"epoch": 1.969470218497456,
"grad_norm": 0.5008750617477549,
"learning_rate": 1.5145217240665373e-06,
"loss": 1.1326,
"step": 207
},
{
"epoch": 1.9790481891649208,
"grad_norm": 0.4980386882470781,
"learning_rate": 1.489077554784432e-06,
"loss": 1.143,
"step": 208
},
{
"epoch": 1.9790481891649208,
"eval_loss": 2.1687815189361572,
"eval_runtime": 107.1708,
"eval_samples_per_second": 13.194,
"eval_steps_per_second": 3.303,
"step": 208
},
{
"epoch": 1.9886261598323856,
"grad_norm": 0.4895688225344272,
"learning_rate": 1.4637579631865645e-06,
"loss": 1.1171,
"step": 209
},
{
"epoch": 1.9982041304998504,
"grad_norm": 0.49262081512228967,
"learning_rate": 1.4385660694490667e-06,
"loss": 1.1449,
"step": 210
},
{
"epoch": 2.007782101167315,
"grad_norm": 0.5057383346810608,
"learning_rate": 1.4135049780116496e-06,
"loss": 1.1394,
"step": 211
},
{
"epoch": 2.0038910505836576,
"grad_norm": 0.5387584817585892,
"learning_rate": 1.388577777195035e-06,
"loss": 1.1306,
"step": 212
},
{
"epoch": 2.0134690212511224,
"grad_norm": 0.5623404364476285,
"learning_rate": 1.3637875388203784e-06,
"loss": 1.0952,
"step": 213
},
{
"epoch": 2.023046991918587,
"grad_norm": 0.5743034832238124,
"learning_rate": 1.3391373178307182e-06,
"loss": 1.1261,
"step": 214
},
{
"epoch": 2.032624962586052,
"grad_norm": 0.5461858778537674,
"learning_rate": 1.3146301519145153e-06,
"loss": 1.1328,
"step": 215
},
{
"epoch": 2.0422029332535168,
"grad_norm": 0.5528333288756201,
"learning_rate": 1.2902690611313135e-06,
"loss": 1.1249,
"step": 216
},
{
"epoch": 2.0517809039209816,
"grad_norm": 0.5258934842101934,
"learning_rate": 1.2660570475395684e-06,
"loss": 1.1109,
"step": 217
},
{
"epoch": 2.0613588745884464,
"grad_norm": 0.5524292613274455,
"learning_rate": 1.2419970948267014e-06,
"loss": 1.1135,
"step": 218
},
{
"epoch": 2.0709368452559116,
"grad_norm": 0.5405228294413486,
"learning_rate": 1.2180921679414143e-06,
"loss": 1.1287,
"step": 219
},
{
"epoch": 2.0805148159233764,
"grad_norm": 0.5298775138689613,
"learning_rate": 1.1943452127283145e-06,
"loss": 1.124,
"step": 220
},
{
"epoch": 2.090092786590841,
"grad_norm": 0.514214942457388,
"learning_rate": 1.1707591555648905e-06,
"loss": 1.1059,
"step": 221
},
{
"epoch": 2.099670757258306,
"grad_norm": 0.5329396149825425,
"learning_rate": 1.1473369030008974e-06,
"loss": 1.1201,
"step": 222
},
{
"epoch": 2.109248727925771,
"grad_norm": 0.5564862124718808,
"learning_rate": 1.124081341400165e-06,
"loss": 1.1032,
"step": 223
},
{
"epoch": 2.1188266985932356,
"grad_norm": 0.5244468629630417,
"learning_rate": 1.1009953365849168e-06,
"loss": 1.1433,
"step": 224
},
{
"epoch": 2.1284046692607004,
"grad_norm": 0.5087349968174719,
"learning_rate": 1.078081733482609e-06,
"loss": 1.1286,
"step": 225
},
{
"epoch": 2.137982639928165,
"grad_norm": 0.522473732751717,
"learning_rate": 1.055343355775339e-06,
"loss": 1.084,
"step": 226
},
{
"epoch": 2.14756061059563,
"grad_norm": 0.5213841410982886,
"learning_rate": 1.0327830055518843e-06,
"loss": 1.0778,
"step": 227
},
{
"epoch": 2.1571385812630948,
"grad_norm": 0.5211792543694728,
"learning_rate": 1.0104034629623933e-06,
"loss": 1.0892,
"step": 228
},
{
"epoch": 2.1667165519305596,
"grad_norm": 0.5366639328996056,
"learning_rate": 9.88207485875784e-07,
"loss": 1.1129,
"step": 229
},
{
"epoch": 2.1762945225980244,
"grad_norm": 0.5072189940995689,
"learning_rate": 9.661978095398854e-07,
"loss": 1.1124,
"step": 230
},
{
"epoch": 2.1858724932654896,
"grad_norm": 0.5273739329980739,
"learning_rate": 9.443771462443743e-07,
"loss": 1.0966,
"step": 231
},
{
"epoch": 2.1954504639329544,
"grad_norm": 0.530434300883332,
"learning_rate": 9.227481849865236e-07,
"loss": 1.121,
"step": 232
},
{
"epoch": 2.205028434600419,
"grad_norm": 0.49620125772094664,
"learning_rate": 9.013135911398435e-07,
"loss": 1.1227,
"step": 233
},
{
"epoch": 2.214606405267884,
"grad_norm": 0.48930931831635505,
"learning_rate": 8.800760061256205e-07,
"loss": 1.1249,
"step": 234
},
{
"epoch": 2.214606405267884,
"eval_loss": 2.177833318710327,
"eval_runtime": 106.9928,
"eval_samples_per_second": 13.216,
"eval_steps_per_second": 3.309,
"step": 234
},
{
"epoch": 2.224184375935349,
"grad_norm": 0.5117030753774101,
"learning_rate": 8.590380470874066e-07,
"loss": 1.0983,
"step": 235
},
{
"epoch": 2.2337623466028136,
"grad_norm": 0.5334281898363374,
"learning_rate": 8.382023065685071e-07,
"loss": 1.1058,
"step": 236
},
{
"epoch": 2.2433403172702784,
"grad_norm": 0.4997549069918058,
"learning_rate": 8.175713521924977e-07,
"loss": 1.1205,
"step": 237
},
{
"epoch": 2.252918287937743,
"grad_norm": 0.4903764233470244,
"learning_rate": 7.971477263468108e-07,
"loss": 1.1166,
"step": 238
},
{
"epoch": 2.262496258605208,
"grad_norm": 0.5111886828961109,
"learning_rate": 7.769339458694319e-07,
"loss": 1.1296,
"step": 239
},
{
"epoch": 2.2720742292726728,
"grad_norm": 0.5046245576610761,
"learning_rate": 7.569325017387502e-07,
"loss": 1.1214,
"step": 240
},
{
"epoch": 2.2816521999401376,
"grad_norm": 0.5012727372502416,
"learning_rate": 7.371458587665822e-07,
"loss": 1.1282,
"step": 241
},
{
"epoch": 2.2912301706076024,
"grad_norm": 0.5089746600647955,
"learning_rate": 7.175764552944368e-07,
"loss": 1.1228,
"step": 242
},
{
"epoch": 2.300808141275067,
"grad_norm": 0.49011501775043553,
"learning_rate": 6.982267028930326e-07,
"loss": 1.1019,
"step": 243
},
{
"epoch": 2.310386111942532,
"grad_norm": 0.5062866494664521,
"learning_rate": 6.790989860651143e-07,
"loss": 1.1237,
"step": 244
},
{
"epoch": 2.3199640826099968,
"grad_norm": 0.48914824725834716,
"learning_rate": 6.601956619516037e-07,
"loss": 1.1228,
"step": 245
},
{
"epoch": 2.329542053277462,
"grad_norm": 0.500095846054479,
"learning_rate": 6.41519060041134e-07,
"loss": 1.0725,
"step": 246
},
{
"epoch": 2.339120023944927,
"grad_norm": 0.48427264883155136,
"learning_rate": 6.230714818829733e-07,
"loss": 1.116,
"step": 247
},
{
"epoch": 2.3486979946123916,
"grad_norm": 0.5009855645248527,
"learning_rate": 6.048552008034073e-07,
"loss": 1.1158,
"step": 248
},
{
"epoch": 2.3582759652798564,
"grad_norm": 0.4895310310383359,
"learning_rate": 5.868724616255899e-07,
"loss": 1.1134,
"step": 249
},
{
"epoch": 2.367853935947321,
"grad_norm": 0.49721503448947285,
"learning_rate": 5.691254803929117e-07,
"loss": 1.1178,
"step": 250
},
{
"epoch": 2.377431906614786,
"grad_norm": 0.4908749278467018,
"learning_rate": 5.516164440959118e-07,
"loss": 1.0965,
"step": 251
},
{
"epoch": 2.387009877282251,
"grad_norm": 0.49404150582673295,
"learning_rate": 5.343475104027743e-07,
"loss": 1.1299,
"step": 252
},
{
"epoch": 2.3965878479497156,
"grad_norm": 0.4824591396857287,
"learning_rate": 5.17320807393431e-07,
"loss": 1.0795,
"step": 253
},
{
"epoch": 2.4061658186171804,
"grad_norm": 0.49129116007089907,
"learning_rate": 5.005384332973154e-07,
"loss": 1.1193,
"step": 254
},
{
"epoch": 2.415743789284645,
"grad_norm": 0.49733333626674653,
"learning_rate": 4.840024562347987e-07,
"loss": 1.11,
"step": 255
},
{
"epoch": 2.42532175995211,
"grad_norm": 0.4755146663348369,
"learning_rate": 4.67714913962326e-07,
"loss": 1.1091,
"step": 256
},
{
"epoch": 2.434899730619575,
"grad_norm": 0.49183045855158936,
"learning_rate": 4.5167781362130374e-07,
"loss": 1.1247,
"step": 257
},
{
"epoch": 2.44447770128704,
"grad_norm": 0.48090622566109875,
"learning_rate": 4.3589313149075495e-07,
"loss": 1.0957,
"step": 258
},
{
"epoch": 2.454055671954505,
"grad_norm": 0.49785336365870875,
"learning_rate": 4.2036281274377865e-07,
"loss": 1.1139,
"step": 259
},
{
"epoch": 2.4636336426219696,
"grad_norm": 0.49485716633378346,
"learning_rate": 4.050887712078444e-07,
"loss": 1.1298,
"step": 260
},
{
"epoch": 2.4636336426219696,
"eval_loss": 2.177307367324829,
"eval_runtime": 107.0849,
"eval_samples_per_second": 13.204,
"eval_steps_per_second": 3.306,
"step": 260
}
],
"logging_steps": 1,
"max_steps": 312,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 52,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.246866760855716e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
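The log_history above interleaves per-step training entries (loss, grad_norm, learning_rate) with periodic evaluation entries (eval_loss, logged every 26 steps per eval_steps). A minimal sketch of how this file could be read and the two loss curves plotted, assuming Python with matplotlib installed and the file saved locally as trainer_state.json (the filename and output path are illustrative, not part of the checkpoint itself):

import json

import matplotlib.pyplot as plt

# Load the trainer state and split log_history into training-loss and
# evaluation-loss entries, keyed by which field each record carries.
with open("trainer_state.json") as f:
    state = json.load(f)

train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

# Plot both series against global step; eval points are sparse, so mark them.
plt.plot(*zip(*train), label="train loss")
plt.plot(*zip(*evals), marker="o", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curves.png")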