{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.589743589743589,
"eval_steps": 5,
"global_step": 133,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05128205128205128,
"grad_norm": 30.696048736572266,
"learning_rate": 2e-07,
"loss": 2.6145,
"step": 1
},
{
"epoch": 0.05128205128205128,
"eval_loss": 2.721662998199463,
"eval_runtime": 0.1874,
"eval_samples_per_second": 165.427,
"eval_steps_per_second": 26.682,
"step": 1
},
{
"epoch": 0.10256410256410256,
"grad_norm": 31.234418869018555,
"learning_rate": 4e-07,
"loss": 2.839,
"step": 2
},
{
"epoch": 0.15384615384615385,
"grad_norm": 26.09066390991211,
"learning_rate": 6e-07,
"loss": 2.804,
"step": 3
},
{
"epoch": 0.20512820512820512,
"grad_norm": 25.11672019958496,
"learning_rate": 8e-07,
"loss": 2.7178,
"step": 4
},
{
"epoch": 0.2564102564102564,
"grad_norm": 25.194042205810547,
"learning_rate": 1e-06,
"loss": 2.7668,
"step": 5
},
{
"epoch": 0.2564102564102564,
"eval_loss": 2.701810359954834,
"eval_runtime": 0.1865,
"eval_samples_per_second": 166.189,
"eval_steps_per_second": 26.805,
"step": 5
},
{
"epoch": 0.3076923076923077,
"grad_norm": 31.70111656188965,
"learning_rate": 1.2e-06,
"loss": 2.5639,
"step": 6
},
{
"epoch": 0.358974358974359,
"grad_norm": 25.10308837890625,
"learning_rate": 1.4e-06,
"loss": 2.6011,
"step": 7
},
{
"epoch": 0.41025641025641024,
"grad_norm": 25.298452377319336,
"learning_rate": 1.6e-06,
"loss": 2.6779,
"step": 8
},
{
"epoch": 0.46153846153846156,
"grad_norm": 22.12431526184082,
"learning_rate": 1.8e-06,
"loss": 2.5438,
"step": 9
},
{
"epoch": 0.5128205128205128,
"grad_norm": 17.181961059570312,
"learning_rate": 2e-06,
"loss": 2.6304,
"step": 10
},
{
"epoch": 0.5128205128205128,
"eval_loss": 2.5064780712127686,
"eval_runtime": 0.1877,
"eval_samples_per_second": 165.144,
"eval_steps_per_second": 26.636,
"step": 10
},
{
"epoch": 0.5641025641025641,
"grad_norm": 15.064467430114746,
"learning_rate": 1.9998476951563913e-06,
"loss": 2.6119,
"step": 11
},
{
"epoch": 0.6153846153846154,
"grad_norm": 15.15453815460205,
"learning_rate": 1.9993908270190957e-06,
"loss": 2.5618,
"step": 12
},
{
"epoch": 0.6666666666666666,
"grad_norm": 14.976338386535645,
"learning_rate": 1.998629534754574e-06,
"loss": 2.5799,
"step": 13
},
{
"epoch": 0.717948717948718,
"grad_norm": 16.855302810668945,
"learning_rate": 1.997564050259824e-06,
"loss": 2.4803,
"step": 14
},
{
"epoch": 0.7692307692307693,
"grad_norm": 14.893013954162598,
"learning_rate": 1.9961946980917456e-06,
"loss": 2.3635,
"step": 15
},
{
"epoch": 0.7692307692307693,
"eval_loss": 2.3580050468444824,
"eval_runtime": 0.1876,
"eval_samples_per_second": 165.285,
"eval_steps_per_second": 26.659,
"step": 15
},
{
"epoch": 0.8205128205128205,
"grad_norm": 12.848993301391602,
"learning_rate": 1.994521895368273e-06,
"loss": 2.4411,
"step": 16
},
{
"epoch": 0.8717948717948718,
"grad_norm": 15.440024375915527,
"learning_rate": 1.992546151641322e-06,
"loss": 2.4781,
"step": 17
},
{
"epoch": 0.9230769230769231,
"grad_norm": 13.695003509521484,
"learning_rate": 1.99026806874157e-06,
"loss": 2.4198,
"step": 18
},
{
"epoch": 0.9743589743589743,
"grad_norm": 13.504029273986816,
"learning_rate": 1.9876883405951377e-06,
"loss": 2.4088,
"step": 19
},
{
"epoch": 1.0256410256410255,
"grad_norm": 16.270732879638672,
"learning_rate": 1.984807753012208e-06,
"loss": 2.4553,
"step": 20
},
{
"epoch": 1.0256410256410255,
"eval_loss": 2.281332015991211,
"eval_runtime": 0.1875,
"eval_samples_per_second": 165.302,
"eval_steps_per_second": 26.662,
"step": 20
},
{
"epoch": 1.0384615384615385,
"grad_norm": 13.558752059936523,
"learning_rate": 1.981627183447664e-06,
"loss": 2.3328,
"step": 21
},
{
"epoch": 1.0897435897435896,
"grad_norm": 13.454627990722656,
"learning_rate": 1.9781476007338054e-06,
"loss": 2.3366,
"step": 22
},
{
"epoch": 1.141025641025641,
"grad_norm": 14.2904052734375,
"learning_rate": 1.9743700647852355e-06,
"loss": 2.174,
"step": 23
},
{
"epoch": 1.1923076923076923,
"grad_norm": 13.595693588256836,
"learning_rate": 1.9702957262759963e-06,
"loss": 2.2358,
"step": 24
},
{
"epoch": 1.2435897435897436,
"grad_norm": 12.418634414672852,
"learning_rate": 1.965925826289068e-06,
"loss": 2.2344,
"step": 25
},
{
"epoch": 1.2435897435897436,
"eval_loss": 2.233912467956543,
"eval_runtime": 0.1904,
"eval_samples_per_second": 162.822,
"eval_steps_per_second": 26.262,
"step": 25
},
{
"epoch": 1.294871794871795,
"grad_norm": 15.914401054382324,
"learning_rate": 1.9612616959383188e-06,
"loss": 2.259,
"step": 26
},
{
"epoch": 1.3461538461538463,
"grad_norm": 12.605673789978027,
"learning_rate": 1.9563047559630356e-06,
"loss": 2.0799,
"step": 27
},
{
"epoch": 1.3974358974358974,
"grad_norm": 13.526497840881348,
"learning_rate": 1.9510565162951534e-06,
"loss": 2.1993,
"step": 28
},
{
"epoch": 1.4487179487179487,
"grad_norm": 12.563177108764648,
"learning_rate": 1.945518575599317e-06,
"loss": 2.2513,
"step": 29
},
{
"epoch": 1.5,
"grad_norm": 12.170258522033691,
"learning_rate": 1.9396926207859082e-06,
"loss": 2.4562,
"step": 30
},
{
"epoch": 1.5,
"eval_loss": 2.2017483711242676,
"eval_runtime": 0.1881,
"eval_samples_per_second": 164.78,
"eval_steps_per_second": 26.577,
"step": 30
},
{
"epoch": 1.5512820512820513,
"grad_norm": 13.118155479431152,
"learning_rate": 1.9335804264972015e-06,
"loss": 2.1825,
"step": 31
},
{
"epoch": 1.6025641025641026,
"grad_norm": 13.182004928588867,
"learning_rate": 1.9271838545667875e-06,
"loss": 2.2352,
"step": 32
},
{
"epoch": 1.6538461538461537,
"grad_norm": 14.191438674926758,
"learning_rate": 1.9205048534524403e-06,
"loss": 2.2883,
"step": 33
},
{
"epoch": 1.7051282051282053,
"grad_norm": 13.125994682312012,
"learning_rate": 1.9135454576426007e-06,
"loss": 2.204,
"step": 34
},
{
"epoch": 1.7564102564102564,
"grad_norm": 13.099204063415527,
"learning_rate": 1.9063077870366499e-06,
"loss": 2.0943,
"step": 35
},
{
"epoch": 1.7564102564102564,
"eval_loss": 2.1725800037384033,
"eval_runtime": 0.1875,
"eval_samples_per_second": 165.319,
"eval_steps_per_second": 26.664,
"step": 35
},
{
"epoch": 1.8076923076923077,
"grad_norm": 12.349153518676758,
"learning_rate": 1.8987940462991669e-06,
"loss": 2.2073,
"step": 36
},
{
"epoch": 1.858974358974359,
"grad_norm": 12.74866008758545,
"learning_rate": 1.8910065241883678e-06,
"loss": 2.2062,
"step": 37
},
{
"epoch": 1.9102564102564101,
"grad_norm": 10.330320358276367,
"learning_rate": 1.8829475928589268e-06,
"loss": 2.0004,
"step": 38
},
{
"epoch": 1.9615384615384617,
"grad_norm": 13.375683784484863,
"learning_rate": 1.8746197071393956e-06,
"loss": 1.9728,
"step": 39
},
{
"epoch": 2.0128205128205128,
"grad_norm": 13.092984199523926,
"learning_rate": 1.8660254037844386e-06,
"loss": 2.0695,
"step": 40
},
{
"epoch": 2.0128205128205128,
"eval_loss": 2.1425397396087646,
"eval_runtime": 0.1884,
"eval_samples_per_second": 164.566,
"eval_steps_per_second": 26.543,
"step": 40
},
{
"epoch": 2.0256410256410255,
"grad_norm": 12.576122283935547,
"learning_rate": 1.8571673007021123e-06,
"loss": 2.0414,
"step": 41
},
{
"epoch": 2.076923076923077,
"grad_norm": 13.123306274414062,
"learning_rate": 1.8480480961564257e-06,
"loss": 2.1836,
"step": 42
},
{
"epoch": 2.128205128205128,
"grad_norm": 11.772199630737305,
"learning_rate": 1.838670567945424e-06,
"loss": 2.0555,
"step": 43
},
{
"epoch": 2.1794871794871793,
"grad_norm": 12.407557487487793,
"learning_rate": 1.8290375725550415e-06,
"loss": 1.9841,
"step": 44
},
{
"epoch": 2.230769230769231,
"grad_norm": 10.64401626586914,
"learning_rate": 1.8191520442889917e-06,
"loss": 1.8616,
"step": 45
},
{
"epoch": 2.230769230769231,
"eval_loss": 2.117149591445923,
"eval_runtime": 0.1871,
"eval_samples_per_second": 165.686,
"eval_steps_per_second": 26.724,
"step": 45
},
{
"epoch": 2.282051282051282,
"grad_norm": 11.632575035095215,
"learning_rate": 1.8090169943749474e-06,
"loss": 1.9493,
"step": 46
},
{
"epoch": 2.3333333333333335,
"grad_norm": 13.22929573059082,
"learning_rate": 1.7986355100472927e-06,
"loss": 1.9483,
"step": 47
},
{
"epoch": 2.3846153846153846,
"grad_norm": 13.824577331542969,
"learning_rate": 1.7880107536067217e-06,
"loss": 2.0555,
"step": 48
},
{
"epoch": 2.435897435897436,
"grad_norm": 10.910252571105957,
"learning_rate": 1.7771459614569707e-06,
"loss": 2.1374,
"step": 49
},
{
"epoch": 2.4871794871794872,
"grad_norm": 13.26654052734375,
"learning_rate": 1.766044443118978e-06,
"loss": 2.0498,
"step": 50
},
{
"epoch": 2.4871794871794872,
"eval_loss": 2.1040406227111816,
"eval_runtime": 0.1886,
"eval_samples_per_second": 164.403,
"eval_steps_per_second": 26.517,
"step": 50
},
{
"epoch": 2.5384615384615383,
"grad_norm": 11.703288078308105,
"learning_rate": 1.7547095802227721e-06,
"loss": 1.9002,
"step": 51
},
{
"epoch": 2.58974358974359,
"grad_norm": 13.835978507995605,
"learning_rate": 1.743144825477394e-06,
"loss": 1.988,
"step": 52
},
{
"epoch": 2.641025641025641,
"grad_norm": 14.295548439025879,
"learning_rate": 1.7313537016191704e-06,
"loss": 2.0443,
"step": 53
},
{
"epoch": 2.6923076923076925,
"grad_norm": 11.679184913635254,
"learning_rate": 1.719339800338651e-06,
"loss": 1.9208,
"step": 54
},
{
"epoch": 2.7435897435897436,
"grad_norm": 15.062151908874512,
"learning_rate": 1.7071067811865474e-06,
"loss": 1.9028,
"step": 55
},
{
"epoch": 2.7435897435897436,
"eval_loss": 2.098405361175537,
"eval_runtime": 0.186,
"eval_samples_per_second": 166.69,
"eval_steps_per_second": 26.886,
"step": 55
},
{
"epoch": 2.7948717948717947,
"grad_norm": 14.257363319396973,
"learning_rate": 1.6946583704589972e-06,
"loss": 1.9604,
"step": 56
},
{
"epoch": 2.8461538461538463,
"grad_norm": 12.327591896057129,
"learning_rate": 1.6819983600624985e-06,
"loss": 1.9919,
"step": 57
},
{
"epoch": 2.8974358974358974,
"grad_norm": 14.447932243347168,
"learning_rate": 1.669130606358858e-06,
"loss": 1.9196,
"step": 58
},
{
"epoch": 2.948717948717949,
"grad_norm": 12.253332138061523,
"learning_rate": 1.6560590289905071e-06,
"loss": 1.8955,
"step": 59
},
{
"epoch": 3.0,
"grad_norm": 14.021129608154297,
"learning_rate": 1.6427876096865393e-06,
"loss": 1.9057,
"step": 60
},
{
"epoch": 3.0,
"eval_loss": 2.084063768386841,
"eval_runtime": 0.1878,
"eval_samples_per_second": 165.031,
"eval_steps_per_second": 26.618,
"step": 60
},
{
"epoch": 3.0128205128205128,
"grad_norm": 12.585602760314941,
"learning_rate": 1.6293203910498375e-06,
"loss": 1.9736,
"step": 61
},
{
"epoch": 3.064102564102564,
"grad_norm": 12.412880897521973,
"learning_rate": 1.615661475325658e-06,
"loss": 1.906,
"step": 62
},
{
"epoch": 3.1153846153846154,
"grad_norm": 12.772639274597168,
"learning_rate": 1.6018150231520484e-06,
"loss": 1.8674,
"step": 63
},
{
"epoch": 3.1666666666666665,
"grad_norm": 9.931306838989258,
"learning_rate": 1.587785252292473e-06,
"loss": 1.8862,
"step": 64
},
{
"epoch": 3.217948717948718,
"grad_norm": 13.5899658203125,
"learning_rate": 1.573576436351046e-06,
"loss": 1.7464,
"step": 65
},
{
"epoch": 3.217948717948718,
"eval_loss": 2.078381061553955,
"eval_runtime": 0.1867,
"eval_samples_per_second": 166.085,
"eval_steps_per_second": 26.788,
"step": 65
},
{
"epoch": 3.269230769230769,
"grad_norm": 11.722041130065918,
"learning_rate": 1.5591929034707466e-06,
"loss": 1.8595,
"step": 66
},
{
"epoch": 3.3205128205128207,
"grad_norm": 12.511164665222168,
"learning_rate": 1.544639035015027e-06,
"loss": 1.8445,
"step": 67
},
{
"epoch": 3.371794871794872,
"grad_norm": 15.670218467712402,
"learning_rate": 1.5299192642332049e-06,
"loss": 1.8044,
"step": 68
},
{
"epoch": 3.423076923076923,
"grad_norm": 12.341389656066895,
"learning_rate": 1.5150380749100543e-06,
"loss": 1.811,
"step": 69
},
{
"epoch": 3.4743589743589745,
"grad_norm": 13.361737251281738,
"learning_rate": 1.5e-06,
"loss": 1.8284,
"step": 70
},
{
"epoch": 3.4743589743589745,
"eval_loss": 2.078845500946045,
"eval_runtime": 0.1879,
"eval_samples_per_second": 164.946,
"eval_steps_per_second": 26.604,
"step": 70
},
{
"epoch": 3.5256410256410255,
"grad_norm": 14.585214614868164,
"learning_rate": 1.4848096202463372e-06,
"loss": 1.7391,
"step": 71
},
{
"epoch": 3.5769230769230766,
"grad_norm": 11.4587984085083,
"learning_rate": 1.4694715627858908e-06,
"loss": 1.8459,
"step": 72
},
{
"epoch": 3.628205128205128,
"grad_norm": 14.638727188110352,
"learning_rate": 1.4539904997395467e-06,
"loss": 1.814,
"step": 73
},
{
"epoch": 3.6794871794871797,
"grad_norm": 15.081775665283203,
"learning_rate": 1.4383711467890773e-06,
"loss": 1.9079,
"step": 74
},
{
"epoch": 3.730769230769231,
"grad_norm": 12.757416725158691,
"learning_rate": 1.4226182617406994e-06,
"loss": 1.8866,
"step": 75
},
{
"epoch": 3.730769230769231,
"eval_loss": 2.0760610103607178,
"eval_runtime": 0.1867,
"eval_samples_per_second": 166.063,
"eval_steps_per_second": 26.784,
"step": 75
},
{
"epoch": 3.782051282051282,
"grad_norm": 14.678832054138184,
"learning_rate": 1.4067366430758004e-06,
"loss": 1.7503,
"step": 76
},
{
"epoch": 3.8333333333333335,
"grad_norm": 15.981603622436523,
"learning_rate": 1.3907311284892735e-06,
"loss": 1.7984,
"step": 77
},
{
"epoch": 3.8846153846153846,
"grad_norm": 14.856511116027832,
"learning_rate": 1.374606593415912e-06,
"loss": 1.7843,
"step": 78
},
{
"epoch": 3.935897435897436,
"grad_norm": 14.275514602661133,
"learning_rate": 1.3583679495453e-06,
"loss": 1.7888,
"step": 79
},
{
"epoch": 3.9871794871794872,
"grad_norm": 12.734882354736328,
"learning_rate": 1.3420201433256689e-06,
"loss": 1.8927,
"step": 80
},
{
"epoch": 3.9871794871794872,
"eval_loss": 2.067340135574341,
"eval_runtime": 0.1861,
"eval_samples_per_second": 166.583,
"eval_steps_per_second": 26.868,
"step": 80
},
{
"epoch": 4.038461538461538,
"grad_norm": 14.663799285888672,
"learning_rate": 1.3255681544571566e-06,
"loss": 1.7531,
"step": 81
},
{
"epoch": 4.051282051282051,
"grad_norm": 12.570903778076172,
"learning_rate": 1.3090169943749473e-06,
"loss": 1.7588,
"step": 82
},
{
"epoch": 4.102564102564102,
"grad_norm": 11.108199119567871,
"learning_rate": 1.2923717047227368e-06,
"loss": 1.6173,
"step": 83
},
{
"epoch": 4.153846153846154,
"grad_norm": 14.328954696655273,
"learning_rate": 1.275637355816999e-06,
"loss": 1.7411,
"step": 84
},
{
"epoch": 4.205128205128205,
"grad_norm": 14.140481948852539,
"learning_rate": 1.2588190451025207e-06,
"loss": 1.5778,
"step": 85
},
{
"epoch": 4.205128205128205,
"eval_loss": 2.0778791904449463,
"eval_runtime": 0.1891,
"eval_samples_per_second": 163.917,
"eval_steps_per_second": 26.438,
"step": 85
},
{
"epoch": 4.256410256410256,
"grad_norm": 13.933786392211914,
"learning_rate": 1.2419218955996676e-06,
"loss": 1.5578,
"step": 86
},
{
"epoch": 4.3076923076923075,
"grad_norm": 16.1457462310791,
"learning_rate": 1.2249510543438651e-06,
"loss": 1.6873,
"step": 87
},
{
"epoch": 4.358974358974359,
"grad_norm": 16.26984977722168,
"learning_rate": 1.207911690817759e-06,
"loss": 1.6605,
"step": 88
},
{
"epoch": 4.410256410256411,
"grad_norm": 19.391223907470703,
"learning_rate": 1.1908089953765447e-06,
"loss": 1.6272,
"step": 89
},
{
"epoch": 4.461538461538462,
"grad_norm": 19.38517951965332,
"learning_rate": 1.1736481776669305e-06,
"loss": 1.7274,
"step": 90
},
{
"epoch": 4.461538461538462,
"eval_loss": 2.0934271812438965,
"eval_runtime": 0.1874,
"eval_samples_per_second": 165.396,
"eval_steps_per_second": 26.677,
"step": 90
},
{
"epoch": 4.512820512820513,
"grad_norm": 16.367389678955078,
"learning_rate": 1.156434465040231e-06,
"loss": 1.8406,
"step": 91
},
{
"epoch": 4.564102564102564,
"grad_norm": 18.22227668762207,
"learning_rate": 1.1391731009600653e-06,
"loss": 1.7469,
"step": 92
},
{
"epoch": 4.615384615384615,
"grad_norm": 14.44421100616455,
"learning_rate": 1.1218693434051474e-06,
"loss": 1.5867,
"step": 93
},
{
"epoch": 4.666666666666667,
"grad_norm": 13.295368194580078,
"learning_rate": 1.1045284632676535e-06,
"loss": 1.7081,
"step": 94
},
{
"epoch": 4.717948717948718,
"grad_norm": 15.499272346496582,
"learning_rate": 1.0871557427476583e-06,
"loss": 1.7431,
"step": 95
},
{
"epoch": 4.717948717948718,
"eval_loss": 2.065159559249878,
"eval_runtime": 0.1863,
"eval_samples_per_second": 166.408,
"eval_steps_per_second": 26.84,
"step": 95
},
{
"epoch": 4.769230769230769,
"grad_norm": 15.949275016784668,
"learning_rate": 1.069756473744125e-06,
"loss": 1.6641,
"step": 96
},
{
"epoch": 4.82051282051282,
"grad_norm": 13.781301498413086,
"learning_rate": 1.052335956242944e-06,
"loss": 1.5421,
"step": 97
},
{
"epoch": 4.871794871794872,
"grad_norm": 16.268604278564453,
"learning_rate": 1.034899496702501e-06,
"loss": 1.7906,
"step": 98
},
{
"epoch": 4.923076923076923,
"grad_norm": 12.881053924560547,
"learning_rate": 1.0174524064372837e-06,
"loss": 1.7359,
"step": 99
},
{
"epoch": 4.9743589743589745,
"grad_norm": 15.596150398254395,
"learning_rate": 1e-06,
"loss": 1.8728,
"step": 100
},
{
"epoch": 4.9743589743589745,
"eval_loss": 2.0617754459381104,
"eval_runtime": 0.1875,
"eval_samples_per_second": 165.345,
"eval_steps_per_second": 26.668,
"step": 100
},
{
"epoch": 5.0256410256410255,
"grad_norm": 16.61153221130371,
"learning_rate": 9.825475935627165e-07,
"loss": 1.6729,
"step": 101
},
{
"epoch": 5.038461538461538,
"grad_norm": 13.130430221557617,
"learning_rate": 9.651005032974993e-07,
"loss": 1.6707,
"step": 102
},
{
"epoch": 5.089743589743589,
"grad_norm": 14.977300643920898,
"learning_rate": 9.476640437570561e-07,
"loss": 1.5516,
"step": 103
},
{
"epoch": 5.141025641025641,
"grad_norm": 17.314029693603516,
"learning_rate": 9.302435262558747e-07,
"loss": 1.6449,
"step": 104
},
{
"epoch": 5.1923076923076925,
"grad_norm": 15.75112247467041,
"learning_rate": 9.128442572523417e-07,
"loss": 1.5729,
"step": 105
},
{
"epoch": 5.1923076923076925,
"eval_loss": 2.083660125732422,
"eval_runtime": 0.187,
"eval_samples_per_second": 165.747,
"eval_steps_per_second": 26.733,
"step": 105
},
{
"epoch": 5.243589743589744,
"grad_norm": 19.511394500732422,
"learning_rate": 8.954715367323466e-07,
"loss": 1.5756,
"step": 106
},
{
"epoch": 5.294871794871795,
"grad_norm": 16.741764068603516,
"learning_rate": 8.781306565948526e-07,
"loss": 1.6627,
"step": 107
},
{
"epoch": 5.346153846153846,
"grad_norm": 16.6429443359375,
"learning_rate": 8.608268990399348e-07,
"loss": 1.6097,
"step": 108
},
{
"epoch": 5.397435897435898,
"grad_norm": 22.457843780517578,
"learning_rate": 8.435655349597689e-07,
"loss": 1.6192,
"step": 109
},
{
"epoch": 5.448717948717949,
"grad_norm": 13.546624183654785,
"learning_rate": 8.263518223330696e-07,
"loss": 1.4631,
"step": 110
},
{
"epoch": 5.448717948717949,
"eval_loss": 2.087294816970825,
"eval_runtime": 0.1887,
"eval_samples_per_second": 164.276,
"eval_steps_per_second": 26.496,
"step": 110
},
{
"epoch": 5.5,
"grad_norm": 16.943618774414062,
"learning_rate": 8.091910046234551e-07,
"loss": 1.5529,
"step": 111
},
{
"epoch": 5.551282051282051,
"grad_norm": 17.719892501831055,
"learning_rate": 7.920883091822408e-07,
"loss": 1.7165,
"step": 112
},
{
"epoch": 5.602564102564102,
"grad_norm": 14.0659818649292,
"learning_rate": 7.750489456561351e-07,
"loss": 1.5024,
"step": 113
},
{
"epoch": 5.653846153846154,
"grad_norm": 17.86212921142578,
"learning_rate": 7.580781044003324e-07,
"loss": 1.5745,
"step": 114
},
{
"epoch": 5.705128205128205,
"grad_norm": 17.252527236938477,
"learning_rate": 7.411809548974791e-07,
"loss": 1.4758,
"step": 115
},
{
"epoch": 5.705128205128205,
"eval_loss": 2.074392557144165,
"eval_runtime": 0.1875,
"eval_samples_per_second": 165.36,
"eval_steps_per_second": 26.671,
"step": 115
},
{
"epoch": 5.756410256410256,
"grad_norm": 18.326730728149414,
"learning_rate": 7.243626441830009e-07,
"loss": 1.5874,
"step": 116
},
{
"epoch": 5.8076923076923075,
"grad_norm": 14.133539199829102,
"learning_rate": 7.076282952772633e-07,
"loss": 1.4556,
"step": 117
},
{
"epoch": 5.858974358974359,
"grad_norm": 16.187454223632812,
"learning_rate": 6.909830056250526e-07,
"loss": 1.5353,
"step": 118
},
{
"epoch": 5.910256410256411,
"grad_norm": 18.15951919555664,
"learning_rate": 6.744318455428435e-07,
"loss": 1.6346,
"step": 119
},
{
"epoch": 5.961538461538462,
"grad_norm": 14.860916137695312,
"learning_rate": 6.579798566743313e-07,
"loss": 1.5289,
"step": 120
},
{
"epoch": 5.961538461538462,
"eval_loss": 2.0899431705474854,
"eval_runtime": 0.1896,
"eval_samples_per_second": 163.49,
"eval_steps_per_second": 26.369,
"step": 120
},
{
"epoch": 6.012820512820513,
"grad_norm": 23.091646194458008,
"learning_rate": 6.416320504546997e-07,
"loss": 1.6633,
"step": 121
},
{
"epoch": 6.0256410256410255,
"grad_norm": 19.409482955932617,
"learning_rate": 6.253934065840879e-07,
"loss": 1.6998,
"step": 122
},
{
"epoch": 6.076923076923077,
"grad_norm": 15.723928451538086,
"learning_rate": 6.092688715107263e-07,
"loss": 1.5407,
"step": 123
},
{
"epoch": 6.128205128205128,
"grad_norm": 17.410001754760742,
"learning_rate": 5.932633569241999e-07,
"loss": 1.4682,
"step": 124
},
{
"epoch": 6.17948717948718,
"grad_norm": 15.949166297912598,
"learning_rate": 5.773817382593007e-07,
"loss": 1.515,
"step": 125
},
{
"epoch": 6.17948717948718,
"eval_loss": 2.091871500015259,
"eval_runtime": 0.1884,
"eval_samples_per_second": 164.579,
"eval_steps_per_second": 26.545,
"step": 125
},
{
"epoch": 6.230769230769231,
"grad_norm": 19.262935638427734,
"learning_rate": 5.616288532109224e-07,
"loss": 1.4557,
"step": 126
},
{
"epoch": 6.282051282051282,
"grad_norm": 18.071447372436523,
"learning_rate": 5.460095002604532e-07,
"loss": 1.4763,
"step": 127
},
{
"epoch": 6.333333333333333,
"grad_norm": 14.22094440460205,
"learning_rate": 5.305284372141095e-07,
"loss": 1.3375,
"step": 128
},
{
"epoch": 6.384615384615385,
"grad_norm": 19.112789154052734,
"learning_rate": 5.15190379753663e-07,
"loss": 1.5896,
"step": 129
},
{
"epoch": 6.435897435897436,
"grad_norm": 19.069456100463867,
"learning_rate": 5.000000000000002e-07,
"loss": 1.5757,
"step": 130
},
{
"epoch": 6.435897435897436,
"eval_loss": 2.0978188514709473,
"eval_runtime": 0.1888,
"eval_samples_per_second": 164.22,
"eval_steps_per_second": 26.487,
"step": 130
},
{
"epoch": 6.487179487179487,
"grad_norm": 16.8870792388916,
"learning_rate": 4.849619250899458e-07,
"loss": 1.4204,
"step": 131
},
{
"epoch": 6.538461538461538,
"grad_norm": 20.033496856689453,
"learning_rate": 4.700807357667952e-07,
"loss": 1.6698,
"step": 132
},
{
"epoch": 6.589743589743589,
"grad_norm": 18.386215209960938,
"learning_rate": 4.5536096498497287e-07,
"loss": 1.4692,
"step": 133
}
],
"logging_steps": 1,
"max_steps": 190,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 19,
"total_flos": 1.733580238744453e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}