{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.589743589743589, "eval_steps": 5, "global_step": 133, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05128205128205128, "grad_norm": 30.696048736572266, "learning_rate": 2e-07, "loss": 2.6145, "step": 1 }, { "epoch": 0.05128205128205128, "eval_loss": 2.721662998199463, "eval_runtime": 0.1874, "eval_samples_per_second": 165.427, "eval_steps_per_second": 26.682, "step": 1 }, { "epoch": 0.10256410256410256, "grad_norm": 31.234418869018555, "learning_rate": 4e-07, "loss": 2.839, "step": 2 }, { "epoch": 0.15384615384615385, "grad_norm": 26.09066390991211, "learning_rate": 6e-07, "loss": 2.804, "step": 3 }, { "epoch": 0.20512820512820512, "grad_norm": 25.11672019958496, "learning_rate": 8e-07, "loss": 2.7178, "step": 4 }, { "epoch": 0.2564102564102564, "grad_norm": 25.194042205810547, "learning_rate": 1e-06, "loss": 2.7668, "step": 5 }, { "epoch": 0.2564102564102564, "eval_loss": 2.701810359954834, "eval_runtime": 0.1865, "eval_samples_per_second": 166.189, "eval_steps_per_second": 26.805, "step": 5 }, { "epoch": 0.3076923076923077, "grad_norm": 31.70111656188965, "learning_rate": 1.2e-06, "loss": 2.5639, "step": 6 }, { "epoch": 0.358974358974359, "grad_norm": 25.10308837890625, "learning_rate": 1.4e-06, "loss": 2.6011, "step": 7 }, { "epoch": 0.41025641025641024, "grad_norm": 25.298452377319336, "learning_rate": 1.6e-06, "loss": 2.6779, "step": 8 }, { "epoch": 0.46153846153846156, "grad_norm": 22.12431526184082, "learning_rate": 1.8e-06, "loss": 2.5438, "step": 9 }, { "epoch": 0.5128205128205128, "grad_norm": 17.181961059570312, "learning_rate": 2e-06, "loss": 2.6304, "step": 10 }, { "epoch": 0.5128205128205128, "eval_loss": 2.5064780712127686, "eval_runtime": 0.1877, "eval_samples_per_second": 165.144, "eval_steps_per_second": 26.636, "step": 10 }, { "epoch": 0.5641025641025641, "grad_norm": 15.064467430114746, "learning_rate": 1.9998476951563913e-06, "loss": 2.6119, "step": 11 }, { "epoch": 0.6153846153846154, "grad_norm": 15.15453815460205, "learning_rate": 1.9993908270190957e-06, "loss": 2.5618, "step": 12 }, { "epoch": 0.6666666666666666, "grad_norm": 14.976338386535645, "learning_rate": 1.998629534754574e-06, "loss": 2.5799, "step": 13 }, { "epoch": 0.717948717948718, "grad_norm": 16.855302810668945, "learning_rate": 1.997564050259824e-06, "loss": 2.4803, "step": 14 }, { "epoch": 0.7692307692307693, "grad_norm": 14.893013954162598, "learning_rate": 1.9961946980917456e-06, "loss": 2.3635, "step": 15 }, { "epoch": 0.7692307692307693, "eval_loss": 2.3580050468444824, "eval_runtime": 0.1876, "eval_samples_per_second": 165.285, "eval_steps_per_second": 26.659, "step": 15 }, { "epoch": 0.8205128205128205, "grad_norm": 12.848993301391602, "learning_rate": 1.994521895368273e-06, "loss": 2.4411, "step": 16 }, { "epoch": 0.8717948717948718, "grad_norm": 15.440024375915527, "learning_rate": 1.992546151641322e-06, "loss": 2.4781, "step": 17 }, { "epoch": 0.9230769230769231, "grad_norm": 13.695003509521484, "learning_rate": 1.99026806874157e-06, "loss": 2.4198, "step": 18 }, { "epoch": 0.9743589743589743, "grad_norm": 13.504029273986816, "learning_rate": 1.9876883405951377e-06, "loss": 2.4088, "step": 19 }, { "epoch": 1.0256410256410255, "grad_norm": 16.270732879638672, "learning_rate": 1.984807753012208e-06, "loss": 2.4553, "step": 20 }, { "epoch": 1.0256410256410255, "eval_loss": 2.281332015991211, "eval_runtime": 0.1875, "eval_samples_per_second": 165.302, "eval_steps_per_second": 26.662, "step": 20 }, { "epoch": 1.0384615384615385, "grad_norm": 13.558752059936523, "learning_rate": 1.981627183447664e-06, "loss": 2.3328, "step": 21 }, { "epoch": 1.0897435897435896, "grad_norm": 13.454627990722656, "learning_rate": 1.9781476007338054e-06, "loss": 2.3366, "step": 22 }, { "epoch": 1.141025641025641, "grad_norm": 14.2904052734375, "learning_rate": 1.9743700647852355e-06, "loss": 2.174, "step": 23 }, { "epoch": 1.1923076923076923, "grad_norm": 13.595693588256836, "learning_rate": 1.9702957262759963e-06, "loss": 2.2358, "step": 24 }, { "epoch": 1.2435897435897436, "grad_norm": 12.418634414672852, "learning_rate": 1.965925826289068e-06, "loss": 2.2344, "step": 25 }, { "epoch": 1.2435897435897436, "eval_loss": 2.233912467956543, "eval_runtime": 0.1904, "eval_samples_per_second": 162.822, "eval_steps_per_second": 26.262, "step": 25 }, { "epoch": 1.294871794871795, "grad_norm": 15.914401054382324, "learning_rate": 1.9612616959383188e-06, "loss": 2.259, "step": 26 }, { "epoch": 1.3461538461538463, "grad_norm": 12.605673789978027, "learning_rate": 1.9563047559630356e-06, "loss": 2.0799, "step": 27 }, { "epoch": 1.3974358974358974, "grad_norm": 13.526497840881348, "learning_rate": 1.9510565162951534e-06, "loss": 2.1993, "step": 28 }, { "epoch": 1.4487179487179487, "grad_norm": 12.563177108764648, "learning_rate": 1.945518575599317e-06, "loss": 2.2513, "step": 29 }, { "epoch": 1.5, "grad_norm": 12.170258522033691, "learning_rate": 1.9396926207859082e-06, "loss": 2.4562, "step": 30 }, { "epoch": 1.5, "eval_loss": 2.2017483711242676, "eval_runtime": 0.1881, "eval_samples_per_second": 164.78, "eval_steps_per_second": 26.577, "step": 30 }, { "epoch": 1.5512820512820513, "grad_norm": 13.118155479431152, "learning_rate": 1.9335804264972015e-06, "loss": 2.1825, "step": 31 }, { "epoch": 1.6025641025641026, "grad_norm": 13.182004928588867, "learning_rate": 1.9271838545667875e-06, "loss": 2.2352, "step": 32 }, { "epoch": 1.6538461538461537, "grad_norm": 14.191438674926758, "learning_rate": 1.9205048534524403e-06, "loss": 2.2883, "step": 33 }, { "epoch": 1.7051282051282053, "grad_norm": 13.125994682312012, "learning_rate": 1.9135454576426007e-06, "loss": 2.204, "step": 34 }, { "epoch": 1.7564102564102564, "grad_norm": 13.099204063415527, "learning_rate": 1.9063077870366499e-06, "loss": 2.0943, "step": 35 }, { "epoch": 1.7564102564102564, "eval_loss": 2.1725800037384033, "eval_runtime": 0.1875, "eval_samples_per_second": 165.319, "eval_steps_per_second": 26.664, "step": 35 }, { "epoch": 1.8076923076923077, "grad_norm": 12.349153518676758, "learning_rate": 1.8987940462991669e-06, "loss": 2.2073, "step": 36 }, { "epoch": 1.858974358974359, "grad_norm": 12.74866008758545, "learning_rate": 1.8910065241883678e-06, "loss": 2.2062, "step": 37 }, { "epoch": 1.9102564102564101, "grad_norm": 10.330320358276367, "learning_rate": 1.8829475928589268e-06, "loss": 2.0004, "step": 38 }, { "epoch": 1.9615384615384617, "grad_norm": 13.375683784484863, "learning_rate": 1.8746197071393956e-06, "loss": 1.9728, "step": 39 }, { "epoch": 2.0128205128205128, "grad_norm": 13.092984199523926, "learning_rate": 1.8660254037844386e-06, "loss": 2.0695, "step": 40 }, { "epoch": 2.0128205128205128, "eval_loss": 2.1425397396087646, "eval_runtime": 0.1884, "eval_samples_per_second": 164.566, "eval_steps_per_second": 26.543, "step": 40 }, { "epoch": 2.0256410256410255, "grad_norm": 12.576122283935547, "learning_rate": 1.8571673007021123e-06, "loss": 2.0414, "step": 41 }, { "epoch": 2.076923076923077, "grad_norm": 13.123306274414062, "learning_rate": 1.8480480961564257e-06, "loss": 2.1836, "step": 42 }, { "epoch": 2.128205128205128, "grad_norm": 11.772199630737305, "learning_rate": 1.838670567945424e-06, "loss": 2.0555, "step": 43 }, { "epoch": 2.1794871794871793, "grad_norm": 12.407557487487793, "learning_rate": 1.8290375725550415e-06, "loss": 1.9841, "step": 44 }, { "epoch": 2.230769230769231, "grad_norm": 10.64401626586914, "learning_rate": 1.8191520442889917e-06, "loss": 1.8616, "step": 45 }, { "epoch": 2.230769230769231, "eval_loss": 2.117149591445923, "eval_runtime": 0.1871, "eval_samples_per_second": 165.686, "eval_steps_per_second": 26.724, "step": 45 }, { "epoch": 2.282051282051282, "grad_norm": 11.632575035095215, "learning_rate": 1.8090169943749474e-06, "loss": 1.9493, "step": 46 }, { "epoch": 2.3333333333333335, "grad_norm": 13.22929573059082, "learning_rate": 1.7986355100472927e-06, "loss": 1.9483, "step": 47 }, { "epoch": 2.3846153846153846, "grad_norm": 13.824577331542969, "learning_rate": 1.7880107536067217e-06, "loss": 2.0555, "step": 48 }, { "epoch": 2.435897435897436, "grad_norm": 10.910252571105957, "learning_rate": 1.7771459614569707e-06, "loss": 2.1374, "step": 49 }, { "epoch": 2.4871794871794872, "grad_norm": 13.26654052734375, "learning_rate": 1.766044443118978e-06, "loss": 2.0498, "step": 50 }, { "epoch": 2.4871794871794872, "eval_loss": 2.1040406227111816, "eval_runtime": 0.1886, "eval_samples_per_second": 164.403, "eval_steps_per_second": 26.517, "step": 50 }, { "epoch": 2.5384615384615383, "grad_norm": 11.703288078308105, "learning_rate": 1.7547095802227721e-06, "loss": 1.9002, "step": 51 }, { "epoch": 2.58974358974359, "grad_norm": 13.835978507995605, "learning_rate": 1.743144825477394e-06, "loss": 1.988, "step": 52 }, { "epoch": 2.641025641025641, "grad_norm": 14.295548439025879, "learning_rate": 1.7313537016191704e-06, "loss": 2.0443, "step": 53 }, { "epoch": 2.6923076923076925, "grad_norm": 11.679184913635254, "learning_rate": 1.719339800338651e-06, "loss": 1.9208, "step": 54 }, { "epoch": 2.7435897435897436, "grad_norm": 15.062151908874512, "learning_rate": 1.7071067811865474e-06, "loss": 1.9028, "step": 55 }, { "epoch": 2.7435897435897436, "eval_loss": 2.098405361175537, "eval_runtime": 0.186, "eval_samples_per_second": 166.69, "eval_steps_per_second": 26.886, "step": 55 }, { "epoch": 2.7948717948717947, "grad_norm": 14.257363319396973, "learning_rate": 1.6946583704589972e-06, "loss": 1.9604, "step": 56 }, { "epoch": 2.8461538461538463, "grad_norm": 12.327591896057129, "learning_rate": 1.6819983600624985e-06, "loss": 1.9919, "step": 57 }, { "epoch": 2.8974358974358974, "grad_norm": 14.447932243347168, "learning_rate": 1.669130606358858e-06, "loss": 1.9196, "step": 58 }, { "epoch": 2.948717948717949, "grad_norm": 12.253332138061523, "learning_rate": 1.6560590289905071e-06, "loss": 1.8955, "step": 59 }, { "epoch": 3.0, "grad_norm": 14.021129608154297, "learning_rate": 1.6427876096865393e-06, "loss": 1.9057, "step": 60 }, { "epoch": 3.0, "eval_loss": 2.084063768386841, "eval_runtime": 0.1878, "eval_samples_per_second": 165.031, "eval_steps_per_second": 26.618, "step": 60 }, { "epoch": 3.0128205128205128, "grad_norm": 12.585602760314941, "learning_rate": 1.6293203910498375e-06, "loss": 1.9736, "step": 61 }, { "epoch": 3.064102564102564, "grad_norm": 12.412880897521973, "learning_rate": 1.615661475325658e-06, "loss": 1.906, "step": 62 }, { "epoch": 3.1153846153846154, "grad_norm": 12.772639274597168, "learning_rate": 1.6018150231520484e-06, "loss": 1.8674, "step": 63 }, { "epoch": 3.1666666666666665, "grad_norm": 9.931306838989258, "learning_rate": 1.587785252292473e-06, "loss": 1.8862, "step": 64 }, { "epoch": 3.217948717948718, "grad_norm": 13.5899658203125, "learning_rate": 1.573576436351046e-06, "loss": 1.7464, "step": 65 }, { "epoch": 3.217948717948718, "eval_loss": 2.078381061553955, "eval_runtime": 0.1867, "eval_samples_per_second": 166.085, "eval_steps_per_second": 26.788, "step": 65 }, { "epoch": 3.269230769230769, "grad_norm": 11.722041130065918, "learning_rate": 1.5591929034707466e-06, "loss": 1.8595, "step": 66 }, { "epoch": 3.3205128205128207, "grad_norm": 12.511164665222168, "learning_rate": 1.544639035015027e-06, "loss": 1.8445, "step": 67 }, { "epoch": 3.371794871794872, "grad_norm": 15.670218467712402, "learning_rate": 1.5299192642332049e-06, "loss": 1.8044, "step": 68 }, { "epoch": 3.423076923076923, "grad_norm": 12.341389656066895, "learning_rate": 1.5150380749100543e-06, "loss": 1.811, "step": 69 }, { "epoch": 3.4743589743589745, "grad_norm": 13.361737251281738, "learning_rate": 1.5e-06, "loss": 1.8284, "step": 70 }, { "epoch": 3.4743589743589745, "eval_loss": 2.078845500946045, "eval_runtime": 0.1879, "eval_samples_per_second": 164.946, "eval_steps_per_second": 26.604, "step": 70 }, { "epoch": 3.5256410256410255, "grad_norm": 14.585214614868164, "learning_rate": 1.4848096202463372e-06, "loss": 1.7391, "step": 71 }, { "epoch": 3.5769230769230766, "grad_norm": 11.4587984085083, "learning_rate": 1.4694715627858908e-06, "loss": 1.8459, "step": 72 }, { "epoch": 3.628205128205128, "grad_norm": 14.638727188110352, "learning_rate": 1.4539904997395467e-06, "loss": 1.814, "step": 73 }, { "epoch": 3.6794871794871797, "grad_norm": 15.081775665283203, "learning_rate": 1.4383711467890773e-06, "loss": 1.9079, "step": 74 }, { "epoch": 3.730769230769231, "grad_norm": 12.757416725158691, "learning_rate": 1.4226182617406994e-06, "loss": 1.8866, "step": 75 }, { "epoch": 3.730769230769231, "eval_loss": 2.0760610103607178, "eval_runtime": 0.1867, "eval_samples_per_second": 166.063, "eval_steps_per_second": 26.784, "step": 75 }, { "epoch": 3.782051282051282, "grad_norm": 14.678832054138184, "learning_rate": 1.4067366430758004e-06, "loss": 1.7503, "step": 76 }, { "epoch": 3.8333333333333335, "grad_norm": 15.981603622436523, "learning_rate": 1.3907311284892735e-06, "loss": 1.7984, "step": 77 }, { "epoch": 3.8846153846153846, "grad_norm": 14.856511116027832, "learning_rate": 1.374606593415912e-06, "loss": 1.7843, "step": 78 }, { "epoch": 3.935897435897436, "grad_norm": 14.275514602661133, "learning_rate": 1.3583679495453e-06, "loss": 1.7888, "step": 79 }, { "epoch": 3.9871794871794872, "grad_norm": 12.734882354736328, "learning_rate": 1.3420201433256689e-06, "loss": 1.8927, "step": 80 }, { "epoch": 3.9871794871794872, "eval_loss": 2.067340135574341, "eval_runtime": 0.1861, "eval_samples_per_second": 166.583, "eval_steps_per_second": 26.868, "step": 80 }, { "epoch": 4.038461538461538, "grad_norm": 14.663799285888672, "learning_rate": 1.3255681544571566e-06, "loss": 1.7531, "step": 81 }, { "epoch": 4.051282051282051, "grad_norm": 12.570903778076172, "learning_rate": 1.3090169943749473e-06, "loss": 1.7588, "step": 82 }, { "epoch": 4.102564102564102, "grad_norm": 11.108199119567871, "learning_rate": 1.2923717047227368e-06, "loss": 1.6173, "step": 83 }, { "epoch": 4.153846153846154, "grad_norm": 14.328954696655273, "learning_rate": 1.275637355816999e-06, "loss": 1.7411, "step": 84 }, { "epoch": 4.205128205128205, "grad_norm": 14.140481948852539, "learning_rate": 1.2588190451025207e-06, "loss": 1.5778, "step": 85 }, { "epoch": 4.205128205128205, "eval_loss": 2.0778791904449463, "eval_runtime": 0.1891, "eval_samples_per_second": 163.917, "eval_steps_per_second": 26.438, "step": 85 }, { "epoch": 4.256410256410256, "grad_norm": 13.933786392211914, "learning_rate": 1.2419218955996676e-06, "loss": 1.5578, "step": 86 }, { "epoch": 4.3076923076923075, "grad_norm": 16.1457462310791, "learning_rate": 1.2249510543438651e-06, "loss": 1.6873, "step": 87 }, { "epoch": 4.358974358974359, "grad_norm": 16.26984977722168, "learning_rate": 1.207911690817759e-06, "loss": 1.6605, "step": 88 }, { "epoch": 4.410256410256411, "grad_norm": 19.391223907470703, "learning_rate": 1.1908089953765447e-06, "loss": 1.6272, "step": 89 }, { "epoch": 4.461538461538462, "grad_norm": 19.38517951965332, "learning_rate": 1.1736481776669305e-06, "loss": 1.7274, "step": 90 }, { "epoch": 4.461538461538462, "eval_loss": 2.0934271812438965, "eval_runtime": 0.1874, "eval_samples_per_second": 165.396, "eval_steps_per_second": 26.677, "step": 90 }, { "epoch": 4.512820512820513, "grad_norm": 16.367389678955078, "learning_rate": 1.156434465040231e-06, "loss": 1.8406, "step": 91 }, { "epoch": 4.564102564102564, "grad_norm": 18.22227668762207, "learning_rate": 1.1391731009600653e-06, "loss": 1.7469, "step": 92 }, { "epoch": 4.615384615384615, "grad_norm": 14.44421100616455, "learning_rate": 1.1218693434051474e-06, "loss": 1.5867, "step": 93 }, { "epoch": 4.666666666666667, "grad_norm": 13.295368194580078, "learning_rate": 1.1045284632676535e-06, "loss": 1.7081, "step": 94 }, { "epoch": 4.717948717948718, "grad_norm": 15.499272346496582, "learning_rate": 1.0871557427476583e-06, "loss": 1.7431, "step": 95 }, { "epoch": 4.717948717948718, "eval_loss": 2.065159559249878, "eval_runtime": 0.1863, "eval_samples_per_second": 166.408, "eval_steps_per_second": 26.84, "step": 95 }, { "epoch": 4.769230769230769, "grad_norm": 15.949275016784668, "learning_rate": 1.069756473744125e-06, "loss": 1.6641, "step": 96 }, { "epoch": 4.82051282051282, "grad_norm": 13.781301498413086, "learning_rate": 1.052335956242944e-06, "loss": 1.5421, "step": 97 }, { "epoch": 4.871794871794872, "grad_norm": 16.268604278564453, "learning_rate": 1.034899496702501e-06, "loss": 1.7906, "step": 98 }, { "epoch": 4.923076923076923, "grad_norm": 12.881053924560547, "learning_rate": 1.0174524064372837e-06, "loss": 1.7359, "step": 99 }, { "epoch": 4.9743589743589745, "grad_norm": 15.596150398254395, "learning_rate": 1e-06, "loss": 1.8728, "step": 100 }, { "epoch": 4.9743589743589745, "eval_loss": 2.0617754459381104, "eval_runtime": 0.1875, "eval_samples_per_second": 165.345, "eval_steps_per_second": 26.668, "step": 100 }, { "epoch": 5.0256410256410255, "grad_norm": 16.61153221130371, "learning_rate": 9.825475935627165e-07, "loss": 1.6729, "step": 101 }, { "epoch": 5.038461538461538, "grad_norm": 13.130430221557617, "learning_rate": 9.651005032974993e-07, "loss": 1.6707, "step": 102 }, { "epoch": 5.089743589743589, "grad_norm": 14.977300643920898, "learning_rate": 9.476640437570561e-07, "loss": 1.5516, "step": 103 }, { "epoch": 5.141025641025641, "grad_norm": 17.314029693603516, "learning_rate": 9.302435262558747e-07, "loss": 1.6449, "step": 104 }, { "epoch": 5.1923076923076925, "grad_norm": 15.75112247467041, "learning_rate": 9.128442572523417e-07, "loss": 1.5729, "step": 105 }, { "epoch": 5.1923076923076925, "eval_loss": 2.083660125732422, "eval_runtime": 0.187, "eval_samples_per_second": 165.747, "eval_steps_per_second": 26.733, "step": 105 }, { "epoch": 5.243589743589744, "grad_norm": 19.511394500732422, "learning_rate": 8.954715367323466e-07, "loss": 1.5756, "step": 106 }, { "epoch": 5.294871794871795, "grad_norm": 16.741764068603516, "learning_rate": 8.781306565948526e-07, "loss": 1.6627, "step": 107 }, { "epoch": 5.346153846153846, "grad_norm": 16.6429443359375, "learning_rate": 8.608268990399348e-07, "loss": 1.6097, "step": 108 }, { "epoch": 5.397435897435898, "grad_norm": 22.457843780517578, "learning_rate": 8.435655349597689e-07, "loss": 1.6192, "step": 109 }, { "epoch": 5.448717948717949, "grad_norm": 13.546624183654785, "learning_rate": 8.263518223330696e-07, "loss": 1.4631, "step": 110 }, { "epoch": 5.448717948717949, "eval_loss": 2.087294816970825, "eval_runtime": 0.1887, "eval_samples_per_second": 164.276, "eval_steps_per_second": 26.496, "step": 110 }, { "epoch": 5.5, "grad_norm": 16.943618774414062, "learning_rate": 8.091910046234551e-07, "loss": 1.5529, "step": 111 }, { "epoch": 5.551282051282051, "grad_norm": 17.719892501831055, "learning_rate": 7.920883091822408e-07, "loss": 1.7165, "step": 112 }, { "epoch": 5.602564102564102, "grad_norm": 14.0659818649292, "learning_rate": 7.750489456561351e-07, "loss": 1.5024, "step": 113 }, { "epoch": 5.653846153846154, "grad_norm": 17.86212921142578, "learning_rate": 7.580781044003324e-07, "loss": 1.5745, "step": 114 }, { "epoch": 5.705128205128205, "grad_norm": 17.252527236938477, "learning_rate": 7.411809548974791e-07, "loss": 1.4758, "step": 115 }, { "epoch": 5.705128205128205, "eval_loss": 2.074392557144165, "eval_runtime": 0.1875, "eval_samples_per_second": 165.36, "eval_steps_per_second": 26.671, "step": 115 }, { "epoch": 5.756410256410256, "grad_norm": 18.326730728149414, "learning_rate": 7.243626441830009e-07, "loss": 1.5874, "step": 116 }, { "epoch": 5.8076923076923075, "grad_norm": 14.133539199829102, "learning_rate": 7.076282952772633e-07, "loss": 1.4556, "step": 117 }, { "epoch": 5.858974358974359, "grad_norm": 16.187454223632812, "learning_rate": 6.909830056250526e-07, "loss": 1.5353, "step": 118 }, { "epoch": 5.910256410256411, "grad_norm": 18.15951919555664, "learning_rate": 6.744318455428435e-07, "loss": 1.6346, "step": 119 }, { "epoch": 5.961538461538462, "grad_norm": 14.860916137695312, "learning_rate": 6.579798566743313e-07, "loss": 1.5289, "step": 120 }, { "epoch": 5.961538461538462, "eval_loss": 2.0899431705474854, "eval_runtime": 0.1896, "eval_samples_per_second": 163.49, "eval_steps_per_second": 26.369, "step": 120 }, { "epoch": 6.012820512820513, "grad_norm": 23.091646194458008, "learning_rate": 6.416320504546997e-07, "loss": 1.6633, "step": 121 }, { "epoch": 6.0256410256410255, "grad_norm": 19.409482955932617, "learning_rate": 6.253934065840879e-07, "loss": 1.6998, "step": 122 }, { "epoch": 6.076923076923077, "grad_norm": 15.723928451538086, "learning_rate": 6.092688715107263e-07, "loss": 1.5407, "step": 123 }, { "epoch": 6.128205128205128, "grad_norm": 17.410001754760742, "learning_rate": 5.932633569241999e-07, "loss": 1.4682, "step": 124 }, { "epoch": 6.17948717948718, "grad_norm": 15.949166297912598, "learning_rate": 5.773817382593007e-07, "loss": 1.515, "step": 125 }, { "epoch": 6.17948717948718, "eval_loss": 2.091871500015259, "eval_runtime": 0.1884, "eval_samples_per_second": 164.579, "eval_steps_per_second": 26.545, "step": 125 }, { "epoch": 6.230769230769231, "grad_norm": 19.262935638427734, "learning_rate": 5.616288532109224e-07, "loss": 1.4557, "step": 126 }, { "epoch": 6.282051282051282, "grad_norm": 18.071447372436523, "learning_rate": 5.460095002604532e-07, "loss": 1.4763, "step": 127 }, { "epoch": 6.333333333333333, "grad_norm": 14.22094440460205, "learning_rate": 5.305284372141095e-07, "loss": 1.3375, "step": 128 }, { "epoch": 6.384615384615385, "grad_norm": 19.112789154052734, "learning_rate": 5.15190379753663e-07, "loss": 1.5896, "step": 129 }, { "epoch": 6.435897435897436, "grad_norm": 19.069456100463867, "learning_rate": 5.000000000000002e-07, "loss": 1.5757, "step": 130 }, { "epoch": 6.435897435897436, "eval_loss": 2.0978188514709473, "eval_runtime": 0.1888, "eval_samples_per_second": 164.22, "eval_steps_per_second": 26.487, "step": 130 }, { "epoch": 6.487179487179487, "grad_norm": 16.8870792388916, "learning_rate": 4.849619250899458e-07, "loss": 1.4204, "step": 131 }, { "epoch": 6.538461538461538, "grad_norm": 20.033496856689453, "learning_rate": 4.700807357667952e-07, "loss": 1.6698, "step": 132 }, { "epoch": 6.589743589743589, "grad_norm": 18.386215209960938, "learning_rate": 4.5536096498497287e-07, "loss": 1.4692, "step": 133 } ], "logging_steps": 1, "max_steps": 190, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 19, "total_flos": 1.733580238744453e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }