QingyiSi's picture
Upload 1268 files
4697198
{
"best_metric": 2.3293075561523438,
"best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_bloomfirefly/checkpoint-19000",
"epoch": 2.952531261381571,
"global_step": 19000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 5.9999999999999995e-05,
"loss": 2.9733,
"step": 20
},
{
"epoch": 0.01,
"learning_rate": 0.00011999999999999999,
"loss": 2.7809,
"step": 40
},
{
"epoch": 0.01,
"learning_rate": 0.00017999999999999998,
"loss": 2.6052,
"step": 60
},
{
"epoch": 0.01,
"learning_rate": 0.00023999999999999998,
"loss": 2.4925,
"step": 80
},
{
"epoch": 0.02,
"learning_rate": 0.0003,
"loss": 2.458,
"step": 100
},
{
"epoch": 0.02,
"learning_rate": 0.00029968758135902107,
"loss": 2.4281,
"step": 120
},
{
"epoch": 0.02,
"learning_rate": 0.00029937516271804216,
"loss": 2.4178,
"step": 140
},
{
"epoch": 0.02,
"learning_rate": 0.00029906274407706326,
"loss": 2.3839,
"step": 160
},
{
"epoch": 0.03,
"learning_rate": 0.0002987503254360843,
"loss": 2.3521,
"step": 180
},
{
"epoch": 0.03,
"learning_rate": 0.00029843790679510545,
"loss": 2.338,
"step": 200
},
{
"epoch": 0.03,
"eval_loss": 2.510117292404175,
"eval_runtime": 69.1765,
"eval_samples_per_second": 28.912,
"eval_steps_per_second": 1.807,
"step": 200
},
{
"epoch": 0.03,
"learning_rate": 0.0002981254881541265,
"loss": 2.3401,
"step": 220
},
{
"epoch": 0.04,
"learning_rate": 0.0002978130695131476,
"loss": 2.3665,
"step": 240
},
{
"epoch": 0.04,
"learning_rate": 0.0002975006508721687,
"loss": 2.3691,
"step": 260
},
{
"epoch": 0.04,
"learning_rate": 0.0002971882322311898,
"loss": 2.3514,
"step": 280
},
{
"epoch": 0.05,
"learning_rate": 0.0002968758135902109,
"loss": 2.3203,
"step": 300
},
{
"epoch": 0.05,
"learning_rate": 0.00029656339494923197,
"loss": 2.3393,
"step": 320
},
{
"epoch": 0.05,
"learning_rate": 0.000296250976308253,
"loss": 2.3289,
"step": 340
},
{
"epoch": 0.06,
"learning_rate": 0.00029593855766727416,
"loss": 2.3407,
"step": 360
},
{
"epoch": 0.06,
"learning_rate": 0.0002956261390262952,
"loss": 2.3163,
"step": 380
},
{
"epoch": 0.06,
"learning_rate": 0.0002953137203853163,
"loss": 2.3212,
"step": 400
},
{
"epoch": 0.06,
"eval_loss": 2.473245620727539,
"eval_runtime": 69.0219,
"eval_samples_per_second": 28.976,
"eval_steps_per_second": 1.811,
"step": 400
},
{
"epoch": 0.07,
"learning_rate": 0.0002950013017443374,
"loss": 2.2927,
"step": 420
},
{
"epoch": 0.07,
"learning_rate": 0.0002946888831033585,
"loss": 2.2927,
"step": 440
},
{
"epoch": 0.07,
"learning_rate": 0.0002943764644623796,
"loss": 2.29,
"step": 460
},
{
"epoch": 0.07,
"learning_rate": 0.0002940640458214007,
"loss": 2.3099,
"step": 480
},
{
"epoch": 0.08,
"learning_rate": 0.0002937516271804217,
"loss": 2.3286,
"step": 500
},
{
"epoch": 0.08,
"learning_rate": 0.0002934392085394428,
"loss": 2.2928,
"step": 520
},
{
"epoch": 0.08,
"learning_rate": 0.0002931267898984639,
"loss": 2.2956,
"step": 540
},
{
"epoch": 0.09,
"learning_rate": 0.000292814371257485,
"loss": 2.2627,
"step": 560
},
{
"epoch": 0.09,
"learning_rate": 0.0002925019526165061,
"loss": 2.2897,
"step": 580
},
{
"epoch": 0.09,
"learning_rate": 0.0002921895339755272,
"loss": 2.2994,
"step": 600
},
{
"epoch": 0.09,
"eval_loss": 2.455402374267578,
"eval_runtime": 69.1315,
"eval_samples_per_second": 28.93,
"eval_steps_per_second": 1.808,
"step": 600
},
{
"epoch": 0.1,
"learning_rate": 0.00029187711533454824,
"loss": 2.3232,
"step": 620
},
{
"epoch": 0.1,
"learning_rate": 0.0002915646966935694,
"loss": 2.2515,
"step": 640
},
{
"epoch": 0.1,
"learning_rate": 0.00029125227805259043,
"loss": 2.2856,
"step": 660
},
{
"epoch": 0.11,
"learning_rate": 0.0002909398594116115,
"loss": 2.252,
"step": 680
},
{
"epoch": 0.11,
"learning_rate": 0.0002906274407706326,
"loss": 2.2891,
"step": 700
},
{
"epoch": 0.11,
"learning_rate": 0.0002903150221296537,
"loss": 2.2769,
"step": 720
},
{
"epoch": 0.11,
"learning_rate": 0.0002900026034886748,
"loss": 2.2763,
"step": 740
},
{
"epoch": 0.12,
"learning_rate": 0.0002896901848476959,
"loss": 2.278,
"step": 760
},
{
"epoch": 0.12,
"learning_rate": 0.00028937776620671695,
"loss": 2.3126,
"step": 780
},
{
"epoch": 0.12,
"learning_rate": 0.0002890653475657381,
"loss": 2.2698,
"step": 800
},
{
"epoch": 0.12,
"eval_loss": 2.4434444904327393,
"eval_runtime": 69.7211,
"eval_samples_per_second": 28.686,
"eval_steps_per_second": 1.793,
"step": 800
},
{
"epoch": 0.13,
"learning_rate": 0.00028875292892475914,
"loss": 2.2587,
"step": 820
},
{
"epoch": 0.13,
"learning_rate": 0.00028844051028378023,
"loss": 2.2954,
"step": 840
},
{
"epoch": 0.13,
"learning_rate": 0.00028812809164280133,
"loss": 2.3102,
"step": 860
},
{
"epoch": 0.14,
"learning_rate": 0.0002878156730018224,
"loss": 2.2918,
"step": 880
},
{
"epoch": 0.14,
"learning_rate": 0.0002875032543608435,
"loss": 2.2698,
"step": 900
},
{
"epoch": 0.14,
"learning_rate": 0.0002871908357198646,
"loss": 2.2514,
"step": 920
},
{
"epoch": 0.15,
"learning_rate": 0.00028687841707888566,
"loss": 2.2684,
"step": 940
},
{
"epoch": 0.15,
"learning_rate": 0.00028656599843790675,
"loss": 2.2833,
"step": 960
},
{
"epoch": 0.15,
"learning_rate": 0.00028625357979692785,
"loss": 2.2709,
"step": 980
},
{
"epoch": 0.16,
"learning_rate": 0.00028594116115594894,
"loss": 2.2596,
"step": 1000
},
{
"epoch": 0.16,
"eval_loss": 2.436037302017212,
"eval_runtime": 69.727,
"eval_samples_per_second": 28.683,
"eval_steps_per_second": 1.793,
"step": 1000
},
{
"epoch": 0.16,
"learning_rate": 0.00028562874251497004,
"loss": 2.2743,
"step": 1020
},
{
"epoch": 0.16,
"learning_rate": 0.00028531632387399113,
"loss": 2.23,
"step": 1040
},
{
"epoch": 0.16,
"learning_rate": 0.0002850039052330122,
"loss": 2.2723,
"step": 1060
},
{
"epoch": 0.17,
"learning_rate": 0.0002846914865920333,
"loss": 2.2585,
"step": 1080
},
{
"epoch": 0.17,
"learning_rate": 0.00028437906795105437,
"loss": 2.2463,
"step": 1100
},
{
"epoch": 0.17,
"learning_rate": 0.00028406664931007546,
"loss": 2.2264,
"step": 1120
},
{
"epoch": 0.18,
"learning_rate": 0.00028375423066909656,
"loss": 2.223,
"step": 1140
},
{
"epoch": 0.18,
"learning_rate": 0.00028344181202811765,
"loss": 2.2412,
"step": 1160
},
{
"epoch": 0.18,
"learning_rate": 0.00028312939338713875,
"loss": 2.2714,
"step": 1180
},
{
"epoch": 0.19,
"learning_rate": 0.00028281697474615984,
"loss": 2.2638,
"step": 1200
},
{
"epoch": 0.19,
"eval_loss": 2.4272871017456055,
"eval_runtime": 69.3748,
"eval_samples_per_second": 28.829,
"eval_steps_per_second": 1.802,
"step": 1200
},
{
"epoch": 0.19,
"learning_rate": 0.0002825045561051809,
"loss": 2.2303,
"step": 1220
},
{
"epoch": 0.19,
"learning_rate": 0.000282192137464202,
"loss": 2.2491,
"step": 1240
},
{
"epoch": 0.2,
"learning_rate": 0.00028187971882322313,
"loss": 2.2598,
"step": 1260
},
{
"epoch": 0.2,
"learning_rate": 0.00028156730018224417,
"loss": 2.2566,
"step": 1280
},
{
"epoch": 0.2,
"learning_rate": 0.00028125488154126527,
"loss": 2.2642,
"step": 1300
},
{
"epoch": 0.21,
"learning_rate": 0.00028094246290028636,
"loss": 2.2976,
"step": 1320
},
{
"epoch": 0.21,
"learning_rate": 0.00028063004425930746,
"loss": 2.2144,
"step": 1340
},
{
"epoch": 0.21,
"learning_rate": 0.00028031762561832855,
"loss": 2.2618,
"step": 1360
},
{
"epoch": 0.21,
"learning_rate": 0.00028000520697734965,
"loss": 2.2232,
"step": 1380
},
{
"epoch": 0.22,
"learning_rate": 0.0002796927883363707,
"loss": 2.2349,
"step": 1400
},
{
"epoch": 0.22,
"eval_loss": 2.422177314758301,
"eval_runtime": 69.7796,
"eval_samples_per_second": 28.662,
"eval_steps_per_second": 1.791,
"step": 1400
},
{
"epoch": 0.22,
"learning_rate": 0.00027938036969539184,
"loss": 2.2655,
"step": 1420
},
{
"epoch": 0.22,
"learning_rate": 0.0002790679510544129,
"loss": 2.265,
"step": 1440
},
{
"epoch": 0.23,
"learning_rate": 0.000278755532413434,
"loss": 2.2552,
"step": 1460
},
{
"epoch": 0.23,
"learning_rate": 0.00027844311377245507,
"loss": 2.252,
"step": 1480
},
{
"epoch": 0.23,
"learning_rate": 0.00027813069513147617,
"loss": 2.255,
"step": 1500
},
{
"epoch": 0.24,
"learning_rate": 0.00027781827649049726,
"loss": 2.1869,
"step": 1520
},
{
"epoch": 0.24,
"learning_rate": 0.00027750585784951836,
"loss": 2.2601,
"step": 1540
},
{
"epoch": 0.24,
"learning_rate": 0.0002771934392085394,
"loss": 2.2607,
"step": 1560
},
{
"epoch": 0.25,
"learning_rate": 0.0002768810205675605,
"loss": 2.2245,
"step": 1580
},
{
"epoch": 0.25,
"learning_rate": 0.0002765686019265816,
"loss": 2.2561,
"step": 1600
},
{
"epoch": 0.25,
"eval_loss": 2.4173202514648438,
"eval_runtime": 69.7813,
"eval_samples_per_second": 28.661,
"eval_steps_per_second": 1.791,
"step": 1600
},
{
"epoch": 0.25,
"learning_rate": 0.0002762561832856027,
"loss": 2.2472,
"step": 1620
},
{
"epoch": 0.25,
"learning_rate": 0.0002759437646446238,
"loss": 2.2952,
"step": 1640
},
{
"epoch": 0.26,
"learning_rate": 0.0002756313460036449,
"loss": 2.1941,
"step": 1660
},
{
"epoch": 0.26,
"learning_rate": 0.0002753189273626659,
"loss": 2.2396,
"step": 1680
},
{
"epoch": 0.26,
"learning_rate": 0.00027500650872168707,
"loss": 2.2325,
"step": 1700
},
{
"epoch": 0.27,
"learning_rate": 0.0002746940900807081,
"loss": 2.2458,
"step": 1720
},
{
"epoch": 0.27,
"learning_rate": 0.0002743816714397292,
"loss": 2.2464,
"step": 1740
},
{
"epoch": 0.27,
"learning_rate": 0.0002740692527987503,
"loss": 2.2487,
"step": 1760
},
{
"epoch": 0.28,
"learning_rate": 0.0002737568341577714,
"loss": 2.2609,
"step": 1780
},
{
"epoch": 0.28,
"learning_rate": 0.0002734444155167925,
"loss": 2.3016,
"step": 1800
},
{
"epoch": 0.28,
"eval_loss": 2.4146716594696045,
"eval_runtime": 69.513,
"eval_samples_per_second": 28.772,
"eval_steps_per_second": 1.798,
"step": 1800
},
{
"epoch": 0.28,
"learning_rate": 0.0002731319968758136,
"loss": 2.2415,
"step": 1820
},
{
"epoch": 0.29,
"learning_rate": 0.0002728195782348346,
"loss": 2.2512,
"step": 1840
},
{
"epoch": 0.29,
"learning_rate": 0.0002725071595938558,
"loss": 2.2186,
"step": 1860
},
{
"epoch": 0.29,
"learning_rate": 0.0002721947409528768,
"loss": 2.1982,
"step": 1880
},
{
"epoch": 0.3,
"learning_rate": 0.0002718823223118979,
"loss": 2.2358,
"step": 1900
},
{
"epoch": 0.3,
"learning_rate": 0.000271569903670919,
"loss": 2.2359,
"step": 1920
},
{
"epoch": 0.3,
"learning_rate": 0.0002712574850299401,
"loss": 2.2367,
"step": 1940
},
{
"epoch": 0.3,
"learning_rate": 0.0002709450663889612,
"loss": 2.2209,
"step": 1960
},
{
"epoch": 0.31,
"learning_rate": 0.0002706326477479823,
"loss": 2.2026,
"step": 1980
},
{
"epoch": 0.31,
"learning_rate": 0.00027032022910700333,
"loss": 2.2302,
"step": 2000
},
{
"epoch": 0.31,
"eval_loss": 2.4096806049346924,
"eval_runtime": 69.8744,
"eval_samples_per_second": 28.623,
"eval_steps_per_second": 1.789,
"step": 2000
},
{
"epoch": 0.31,
"learning_rate": 0.00027000781046602443,
"loss": 2.2516,
"step": 2020
},
{
"epoch": 0.32,
"learning_rate": 0.0002696953918250455,
"loss": 2.2173,
"step": 2040
},
{
"epoch": 0.32,
"learning_rate": 0.0002693829731840666,
"loss": 2.2414,
"step": 2060
},
{
"epoch": 0.32,
"learning_rate": 0.0002690705545430877,
"loss": 2.1922,
"step": 2080
},
{
"epoch": 0.33,
"learning_rate": 0.0002687581359021088,
"loss": 2.2396,
"step": 2100
},
{
"epoch": 0.33,
"learning_rate": 0.00026844571726112985,
"loss": 2.2602,
"step": 2120
},
{
"epoch": 0.33,
"learning_rate": 0.000268133298620151,
"loss": 2.2263,
"step": 2140
},
{
"epoch": 0.34,
"learning_rate": 0.00026782087997917204,
"loss": 2.2082,
"step": 2160
},
{
"epoch": 0.34,
"learning_rate": 0.00026750846133819314,
"loss": 2.2144,
"step": 2180
},
{
"epoch": 0.34,
"learning_rate": 0.00026719604269721423,
"loss": 2.2066,
"step": 2200
},
{
"epoch": 0.34,
"eval_loss": 2.4065375328063965,
"eval_runtime": 69.933,
"eval_samples_per_second": 28.599,
"eval_steps_per_second": 1.787,
"step": 2200
},
{
"epoch": 0.34,
"learning_rate": 0.00026688362405623533,
"loss": 2.2494,
"step": 2220
},
{
"epoch": 0.35,
"learning_rate": 0.0002665712054152564,
"loss": 2.2471,
"step": 2240
},
{
"epoch": 0.35,
"learning_rate": 0.0002662587867742775,
"loss": 2.2512,
"step": 2260
},
{
"epoch": 0.35,
"learning_rate": 0.00026594636813329856,
"loss": 2.2249,
"step": 2280
},
{
"epoch": 0.36,
"learning_rate": 0.0002656339494923197,
"loss": 2.2526,
"step": 2300
},
{
"epoch": 0.36,
"learning_rate": 0.00026532153085134075,
"loss": 2.2375,
"step": 2320
},
{
"epoch": 0.36,
"learning_rate": 0.00026500911221036185,
"loss": 2.169,
"step": 2340
},
{
"epoch": 0.37,
"learning_rate": 0.00026469669356938294,
"loss": 2.2206,
"step": 2360
},
{
"epoch": 0.37,
"learning_rate": 0.00026438427492840404,
"loss": 2.2284,
"step": 2380
},
{
"epoch": 0.37,
"learning_rate": 0.00026407185628742513,
"loss": 2.2116,
"step": 2400
},
{
"epoch": 0.37,
"eval_loss": 2.402400255203247,
"eval_runtime": 70.6508,
"eval_samples_per_second": 28.308,
"eval_steps_per_second": 1.769,
"step": 2400
},
{
"epoch": 0.38,
"learning_rate": 0.00026375943764644623,
"loss": 2.2228,
"step": 2420
},
{
"epoch": 0.38,
"learning_rate": 0.0002634470190054673,
"loss": 2.2264,
"step": 2440
},
{
"epoch": 0.38,
"learning_rate": 0.00026313460036448837,
"loss": 2.2212,
"step": 2460
},
{
"epoch": 0.39,
"learning_rate": 0.0002628221817235095,
"loss": 2.2164,
"step": 2480
},
{
"epoch": 0.39,
"learning_rate": 0.00026250976308253056,
"loss": 2.2523,
"step": 2500
},
{
"epoch": 0.39,
"learning_rate": 0.00026219734444155165,
"loss": 2.2272,
"step": 2520
},
{
"epoch": 0.39,
"learning_rate": 0.00026188492580057275,
"loss": 2.2381,
"step": 2540
},
{
"epoch": 0.4,
"learning_rate": 0.00026157250715959384,
"loss": 2.2149,
"step": 2560
},
{
"epoch": 0.4,
"learning_rate": 0.00026126008851861494,
"loss": 2.228,
"step": 2580
},
{
"epoch": 0.4,
"learning_rate": 0.00026094766987763603,
"loss": 2.2145,
"step": 2600
},
{
"epoch": 0.4,
"eval_loss": 2.399576425552368,
"eval_runtime": 69.9194,
"eval_samples_per_second": 28.604,
"eval_steps_per_second": 1.788,
"step": 2600
},
{
"epoch": 0.41,
"learning_rate": 0.0002606352512366571,
"loss": 2.18,
"step": 2620
},
{
"epoch": 0.41,
"learning_rate": 0.0002603228325956782,
"loss": 2.1965,
"step": 2640
},
{
"epoch": 0.41,
"learning_rate": 0.00026001041395469927,
"loss": 2.178,
"step": 2660
},
{
"epoch": 0.42,
"learning_rate": 0.00025969799531372036,
"loss": 2.194,
"step": 2680
},
{
"epoch": 0.42,
"learning_rate": 0.00025938557667274146,
"loss": 2.2024,
"step": 2700
},
{
"epoch": 0.42,
"learning_rate": 0.00025907315803176255,
"loss": 2.2427,
"step": 2720
},
{
"epoch": 0.43,
"learning_rate": 0.00025876073939078365,
"loss": 2.2246,
"step": 2740
},
{
"epoch": 0.43,
"learning_rate": 0.00025844832074980474,
"loss": 2.2169,
"step": 2760
},
{
"epoch": 0.43,
"learning_rate": 0.0002581359021088258,
"loss": 2.2154,
"step": 2780
},
{
"epoch": 0.44,
"learning_rate": 0.0002578234834678469,
"loss": 2.1732,
"step": 2800
},
{
"epoch": 0.44,
"eval_loss": 2.3982491493225098,
"eval_runtime": 70.2191,
"eval_samples_per_second": 28.482,
"eval_steps_per_second": 1.78,
"step": 2800
},
{
"epoch": 0.44,
"learning_rate": 0.000257511064826868,
"loss": 2.1951,
"step": 2820
},
{
"epoch": 0.44,
"learning_rate": 0.00025719864618588907,
"loss": 2.2139,
"step": 2840
},
{
"epoch": 0.44,
"learning_rate": 0.00025688622754491017,
"loss": 2.197,
"step": 2860
},
{
"epoch": 0.45,
"learning_rate": 0.00025657380890393126,
"loss": 2.2317,
"step": 2880
},
{
"epoch": 0.45,
"learning_rate": 0.0002562613902629523,
"loss": 2.2107,
"step": 2900
},
{
"epoch": 0.45,
"learning_rate": 0.00025594897162197345,
"loss": 2.2087,
"step": 2920
},
{
"epoch": 0.46,
"learning_rate": 0.0002556365529809945,
"loss": 2.2124,
"step": 2940
},
{
"epoch": 0.46,
"learning_rate": 0.0002553241343400156,
"loss": 2.1762,
"step": 2960
},
{
"epoch": 0.46,
"learning_rate": 0.0002550117156990367,
"loss": 2.2488,
"step": 2980
},
{
"epoch": 0.47,
"learning_rate": 0.0002546992970580578,
"loss": 2.2316,
"step": 3000
},
{
"epoch": 0.47,
"eval_loss": 2.394296646118164,
"eval_runtime": 70.2494,
"eval_samples_per_second": 28.47,
"eval_steps_per_second": 1.779,
"step": 3000
},
{
"epoch": 0.47,
"learning_rate": 0.0002543868784170789,
"loss": 2.2386,
"step": 3020
},
{
"epoch": 0.47,
"learning_rate": 0.00025407445977609997,
"loss": 2.224,
"step": 3040
},
{
"epoch": 0.48,
"learning_rate": 0.000253762041135121,
"loss": 2.2479,
"step": 3060
},
{
"epoch": 0.48,
"learning_rate": 0.0002534496224941421,
"loss": 2.2396,
"step": 3080
},
{
"epoch": 0.48,
"learning_rate": 0.0002531372038531632,
"loss": 2.2405,
"step": 3100
},
{
"epoch": 0.48,
"learning_rate": 0.0002528247852121843,
"loss": 2.1969,
"step": 3120
},
{
"epoch": 0.49,
"learning_rate": 0.0002525123665712054,
"loss": 2.2095,
"step": 3140
},
{
"epoch": 0.49,
"learning_rate": 0.0002521999479302265,
"loss": 2.2202,
"step": 3160
},
{
"epoch": 0.49,
"learning_rate": 0.0002518875292892476,
"loss": 2.2088,
"step": 3180
},
{
"epoch": 0.5,
"learning_rate": 0.0002515751106482687,
"loss": 2.2075,
"step": 3200
},
{
"epoch": 0.5,
"eval_loss": 2.3918581008911133,
"eval_runtime": 69.2896,
"eval_samples_per_second": 28.864,
"eval_steps_per_second": 1.804,
"step": 3200
},
{
"epoch": 0.5,
"learning_rate": 0.0002512626920072897,
"loss": 2.1993,
"step": 3220
},
{
"epoch": 0.5,
"learning_rate": 0.0002509502733663108,
"loss": 2.2406,
"step": 3240
},
{
"epoch": 0.51,
"learning_rate": 0.0002506378547253319,
"loss": 2.2352,
"step": 3260
},
{
"epoch": 0.51,
"learning_rate": 0.000250325436084353,
"loss": 2.236,
"step": 3280
},
{
"epoch": 0.51,
"learning_rate": 0.0002500130174433741,
"loss": 2.1805,
"step": 3300
},
{
"epoch": 0.52,
"learning_rate": 0.0002497005988023952,
"loss": 2.2249,
"step": 3320
},
{
"epoch": 0.52,
"learning_rate": 0.00024938818016141624,
"loss": 2.2153,
"step": 3340
},
{
"epoch": 0.52,
"learning_rate": 0.0002490757615204374,
"loss": 2.2115,
"step": 3360
},
{
"epoch": 0.53,
"learning_rate": 0.00024876334287945843,
"loss": 2.2284,
"step": 3380
},
{
"epoch": 0.53,
"learning_rate": 0.0002484509242384795,
"loss": 2.184,
"step": 3400
},
{
"epoch": 0.53,
"eval_loss": 2.3887791633605957,
"eval_runtime": 69.2387,
"eval_samples_per_second": 28.886,
"eval_steps_per_second": 1.805,
"step": 3400
},
{
"epoch": 0.53,
"learning_rate": 0.0002481385055975006,
"loss": 2.2172,
"step": 3420
},
{
"epoch": 0.53,
"learning_rate": 0.0002478260869565217,
"loss": 2.2347,
"step": 3440
},
{
"epoch": 0.54,
"learning_rate": 0.0002475136683155428,
"loss": 2.2213,
"step": 3460
},
{
"epoch": 0.54,
"learning_rate": 0.0002472012496745639,
"loss": 2.2215,
"step": 3480
},
{
"epoch": 0.54,
"learning_rate": 0.00024688883103358495,
"loss": 2.2058,
"step": 3500
},
{
"epoch": 0.55,
"learning_rate": 0.00024657641239260604,
"loss": 2.1918,
"step": 3520
},
{
"epoch": 0.55,
"learning_rate": 0.0002462639937516272,
"loss": 2.2021,
"step": 3540
},
{
"epoch": 0.55,
"learning_rate": 0.00024595157511064824,
"loss": 2.1832,
"step": 3560
},
{
"epoch": 0.56,
"learning_rate": 0.00024563915646966933,
"loss": 2.2199,
"step": 3580
},
{
"epoch": 0.56,
"learning_rate": 0.0002453267378286904,
"loss": 2.1997,
"step": 3600
},
{
"epoch": 0.56,
"eval_loss": 2.386540412902832,
"eval_runtime": 69.2123,
"eval_samples_per_second": 28.897,
"eval_steps_per_second": 1.806,
"step": 3600
},
{
"epoch": 0.56,
"learning_rate": 0.0002450143191877115,
"loss": 2.2009,
"step": 3620
},
{
"epoch": 0.57,
"learning_rate": 0.0002447019005467326,
"loss": 2.2045,
"step": 3640
},
{
"epoch": 0.57,
"learning_rate": 0.0002443894819057537,
"loss": 2.2231,
"step": 3660
},
{
"epoch": 0.57,
"learning_rate": 0.00024407706326477478,
"loss": 2.211,
"step": 3680
},
{
"epoch": 0.57,
"learning_rate": 0.00024376464462379588,
"loss": 2.1904,
"step": 3700
},
{
"epoch": 0.58,
"learning_rate": 0.00024345222598281694,
"loss": 2.1492,
"step": 3720
},
{
"epoch": 0.58,
"learning_rate": 0.00024313980734183807,
"loss": 2.2368,
"step": 3740
},
{
"epoch": 0.58,
"learning_rate": 0.00024282738870085914,
"loss": 2.1753,
"step": 3760
},
{
"epoch": 0.59,
"learning_rate": 0.00024251497005988023,
"loss": 2.179,
"step": 3780
},
{
"epoch": 0.59,
"learning_rate": 0.0002422025514189013,
"loss": 2.1811,
"step": 3800
},
{
"epoch": 0.59,
"eval_loss": 2.3864212036132812,
"eval_runtime": 69.2951,
"eval_samples_per_second": 28.862,
"eval_steps_per_second": 1.804,
"step": 3800
},
{
"epoch": 0.59,
"learning_rate": 0.0002418901327779224,
"loss": 2.1496,
"step": 3820
},
{
"epoch": 0.6,
"learning_rate": 0.0002415777141369435,
"loss": 2.2071,
"step": 3840
},
{
"epoch": 0.6,
"learning_rate": 0.00024126529549596459,
"loss": 2.189,
"step": 3860
},
{
"epoch": 0.6,
"learning_rate": 0.00024095287685498565,
"loss": 2.1838,
"step": 3880
},
{
"epoch": 0.61,
"learning_rate": 0.00024064045821400675,
"loss": 2.2292,
"step": 3900
},
{
"epoch": 0.61,
"learning_rate": 0.00024032803957302782,
"loss": 2.1931,
"step": 3920
},
{
"epoch": 0.61,
"learning_rate": 0.00024001562093204894,
"loss": 2.2293,
"step": 3940
},
{
"epoch": 0.62,
"learning_rate": 0.00023970320229107,
"loss": 2.2112,
"step": 3960
},
{
"epoch": 0.62,
"learning_rate": 0.0002393907836500911,
"loss": 2.1479,
"step": 3980
},
{
"epoch": 0.62,
"learning_rate": 0.00023907836500911217,
"loss": 2.1661,
"step": 4000
},
{
"epoch": 0.62,
"eval_loss": 2.383505344390869,
"eval_runtime": 69.2876,
"eval_samples_per_second": 28.865,
"eval_steps_per_second": 1.804,
"step": 4000
},
{
"epoch": 0.62,
"learning_rate": 0.0002387659463681333,
"loss": 2.1783,
"step": 4020
},
{
"epoch": 0.63,
"learning_rate": 0.00023845352772715436,
"loss": 2.1975,
"step": 4040
},
{
"epoch": 0.63,
"learning_rate": 0.00023814110908617546,
"loss": 2.2268,
"step": 4060
},
{
"epoch": 0.63,
"learning_rate": 0.00023782869044519653,
"loss": 2.1815,
"step": 4080
},
{
"epoch": 0.64,
"learning_rate": 0.00023751627180421765,
"loss": 2.2305,
"step": 4100
},
{
"epoch": 0.64,
"learning_rate": 0.00023720385316323872,
"loss": 2.2087,
"step": 4120
},
{
"epoch": 0.64,
"learning_rate": 0.0002368914345222598,
"loss": 2.2204,
"step": 4140
},
{
"epoch": 0.65,
"learning_rate": 0.00023657901588128088,
"loss": 2.2138,
"step": 4160
},
{
"epoch": 0.65,
"learning_rate": 0.000236266597240302,
"loss": 2.2071,
"step": 4180
},
{
"epoch": 0.65,
"learning_rate": 0.00023595417859932307,
"loss": 2.1728,
"step": 4200
},
{
"epoch": 0.65,
"eval_loss": 2.3820013999938965,
"eval_runtime": 69.3049,
"eval_samples_per_second": 28.858,
"eval_steps_per_second": 1.804,
"step": 4200
},
{
"epoch": 0.66,
"learning_rate": 0.00023564175995834417,
"loss": 2.182,
"step": 4220
},
{
"epoch": 0.66,
"learning_rate": 0.00023532934131736524,
"loss": 2.1948,
"step": 4240
},
{
"epoch": 0.66,
"learning_rate": 0.00023501692267638633,
"loss": 2.2178,
"step": 4260
},
{
"epoch": 0.67,
"learning_rate": 0.00023470450403540743,
"loss": 2.1979,
"step": 4280
},
{
"epoch": 0.67,
"learning_rate": 0.00023439208539442852,
"loss": 2.222,
"step": 4300
},
{
"epoch": 0.67,
"learning_rate": 0.0002340796667534496,
"loss": 2.221,
"step": 4320
},
{
"epoch": 0.67,
"learning_rate": 0.00023376724811247069,
"loss": 2.208,
"step": 4340
},
{
"epoch": 0.68,
"learning_rate": 0.00023345482947149175,
"loss": 2.1502,
"step": 4360
},
{
"epoch": 0.68,
"learning_rate": 0.00023314241083051288,
"loss": 2.1628,
"step": 4380
},
{
"epoch": 0.68,
"learning_rate": 0.00023282999218953395,
"loss": 2.1933,
"step": 4400
},
{
"epoch": 0.68,
"eval_loss": 2.380128860473633,
"eval_runtime": 69.2864,
"eval_samples_per_second": 28.866,
"eval_steps_per_second": 1.804,
"step": 4400
},
{
"epoch": 0.69,
"learning_rate": 0.00023251757354855504,
"loss": 2.2204,
"step": 4420
},
{
"epoch": 0.69,
"learning_rate": 0.0002322051549075761,
"loss": 2.218,
"step": 4440
},
{
"epoch": 0.69,
"learning_rate": 0.00023189273626659723,
"loss": 2.199,
"step": 4460
},
{
"epoch": 0.7,
"learning_rate": 0.0002315803176256183,
"loss": 2.1826,
"step": 4480
},
{
"epoch": 0.7,
"learning_rate": 0.0002312678989846394,
"loss": 2.174,
"step": 4500
},
{
"epoch": 0.7,
"learning_rate": 0.00023095548034366046,
"loss": 2.2011,
"step": 4520
},
{
"epoch": 0.71,
"learning_rate": 0.00023064306170268159,
"loss": 2.1951,
"step": 4540
},
{
"epoch": 0.71,
"learning_rate": 0.00023033064306170265,
"loss": 2.2189,
"step": 4560
},
{
"epoch": 0.71,
"learning_rate": 0.00023001822442072375,
"loss": 2.1891,
"step": 4580
},
{
"epoch": 0.71,
"learning_rate": 0.00022970580577974482,
"loss": 2.1873,
"step": 4600
},
{
"epoch": 0.71,
"eval_loss": 2.379713296890259,
"eval_runtime": 69.3005,
"eval_samples_per_second": 28.86,
"eval_steps_per_second": 1.804,
"step": 4600
},
{
"epoch": 0.72,
"learning_rate": 0.00022939338713876591,
"loss": 2.2191,
"step": 4620
},
{
"epoch": 0.72,
"learning_rate": 0.000229080968497787,
"loss": 2.1966,
"step": 4640
},
{
"epoch": 0.72,
"learning_rate": 0.0002287685498568081,
"loss": 2.2062,
"step": 4660
},
{
"epoch": 0.73,
"learning_rate": 0.00022845613121582917,
"loss": 2.1888,
"step": 4680
},
{
"epoch": 0.73,
"learning_rate": 0.00022814371257485027,
"loss": 2.1938,
"step": 4700
},
{
"epoch": 0.73,
"learning_rate": 0.0002278312939338714,
"loss": 2.206,
"step": 4720
},
{
"epoch": 0.74,
"learning_rate": 0.00022751887529289246,
"loss": 2.1584,
"step": 4740
},
{
"epoch": 0.74,
"learning_rate": 0.00022720645665191355,
"loss": 2.1933,
"step": 4760
},
{
"epoch": 0.74,
"learning_rate": 0.00022689403801093462,
"loss": 2.2087,
"step": 4780
},
{
"epoch": 0.75,
"learning_rate": 0.00022658161936995575,
"loss": 2.2239,
"step": 4800
},
{
"epoch": 0.75,
"eval_loss": 2.3774757385253906,
"eval_runtime": 69.3137,
"eval_samples_per_second": 28.854,
"eval_steps_per_second": 1.803,
"step": 4800
},
{
"epoch": 0.75,
"learning_rate": 0.00022626920072897681,
"loss": 2.2136,
"step": 4820
},
{
"epoch": 0.75,
"learning_rate": 0.0002259567820879979,
"loss": 2.2046,
"step": 4840
},
{
"epoch": 0.76,
"learning_rate": 0.00022564436344701898,
"loss": 2.2031,
"step": 4860
},
{
"epoch": 0.76,
"learning_rate": 0.0002253319448060401,
"loss": 2.171,
"step": 4880
},
{
"epoch": 0.76,
"learning_rate": 0.00022501952616506117,
"loss": 2.2101,
"step": 4900
},
{
"epoch": 0.76,
"learning_rate": 0.00022470710752408226,
"loss": 2.1306,
"step": 4920
},
{
"epoch": 0.77,
"learning_rate": 0.00022439468888310333,
"loss": 2.1754,
"step": 4940
},
{
"epoch": 0.77,
"learning_rate": 0.00022408227024212443,
"loss": 2.1972,
"step": 4960
},
{
"epoch": 0.77,
"learning_rate": 0.00022376985160114552,
"loss": 2.2175,
"step": 4980
},
{
"epoch": 0.78,
"learning_rate": 0.00022345743296016662,
"loss": 2.139,
"step": 5000
},
{
"epoch": 0.78,
"eval_loss": 2.3760337829589844,
"eval_runtime": 69.3092,
"eval_samples_per_second": 28.856,
"eval_steps_per_second": 1.804,
"step": 5000
},
{
"epoch": 0.78,
"learning_rate": 0.0002231450143191877,
"loss": 2.1912,
"step": 5020
},
{
"epoch": 0.78,
"learning_rate": 0.00022283259567820878,
"loss": 2.2036,
"step": 5040
},
{
"epoch": 0.79,
"learning_rate": 0.00022252017703722985,
"loss": 2.1852,
"step": 5060
},
{
"epoch": 0.79,
"learning_rate": 0.00022220775839625097,
"loss": 2.1672,
"step": 5080
},
{
"epoch": 0.79,
"learning_rate": 0.00022189533975527204,
"loss": 2.1828,
"step": 5100
},
{
"epoch": 0.8,
"learning_rate": 0.00022158292111429314,
"loss": 2.1875,
"step": 5120
},
{
"epoch": 0.8,
"learning_rate": 0.0002212705024733142,
"loss": 2.1997,
"step": 5140
},
{
"epoch": 0.8,
"learning_rate": 0.00022095808383233533,
"loss": 2.2162,
"step": 5160
},
{
"epoch": 0.8,
"learning_rate": 0.0002206456651913564,
"loss": 2.2213,
"step": 5180
},
{
"epoch": 0.81,
"learning_rate": 0.0002203332465503775,
"loss": 2.1972,
"step": 5200
},
{
"epoch": 0.81,
"eval_loss": 2.374734878540039,
"eval_runtime": 69.2582,
"eval_samples_per_second": 28.877,
"eval_steps_per_second": 1.805,
"step": 5200
},
{
"epoch": 0.81,
"learning_rate": 0.00022002082790939856,
"loss": 2.175,
"step": 5220
},
{
"epoch": 0.81,
"learning_rate": 0.00021970840926841968,
"loss": 2.1951,
"step": 5240
},
{
"epoch": 0.82,
"learning_rate": 0.00021939599062744075,
"loss": 2.1493,
"step": 5260
},
{
"epoch": 0.82,
"learning_rate": 0.00021908357198646185,
"loss": 2.1611,
"step": 5280
},
{
"epoch": 0.82,
"learning_rate": 0.00021877115334548291,
"loss": 2.1621,
"step": 5300
},
{
"epoch": 0.83,
"learning_rate": 0.00021845873470450404,
"loss": 2.1875,
"step": 5320
},
{
"epoch": 0.83,
"learning_rate": 0.0002181463160635251,
"loss": 2.1733,
"step": 5340
},
{
"epoch": 0.83,
"learning_rate": 0.0002178338974225462,
"loss": 2.242,
"step": 5360
},
{
"epoch": 0.84,
"learning_rate": 0.00021752147878156727,
"loss": 2.2154,
"step": 5380
},
{
"epoch": 0.84,
"learning_rate": 0.00021720906014058836,
"loss": 2.1969,
"step": 5400
},
{
"epoch": 0.84,
"eval_loss": 2.372680902481079,
"eval_runtime": 69.283,
"eval_samples_per_second": 28.867,
"eval_steps_per_second": 1.804,
"step": 5400
},
{
"epoch": 0.84,
"learning_rate": 0.00021689664149960946,
"loss": 2.1245,
"step": 5420
},
{
"epoch": 0.85,
"learning_rate": 0.00021658422285863056,
"loss": 2.2049,
"step": 5440
},
{
"epoch": 0.85,
"learning_rate": 0.00021627180421765162,
"loss": 2.1716,
"step": 5460
},
{
"epoch": 0.85,
"learning_rate": 0.00021595938557667272,
"loss": 2.1891,
"step": 5480
},
{
"epoch": 0.85,
"learning_rate": 0.0002156469669356938,
"loss": 2.1963,
"step": 5500
},
{
"epoch": 0.86,
"learning_rate": 0.0002153345482947149,
"loss": 2.1946,
"step": 5520
},
{
"epoch": 0.86,
"learning_rate": 0.00021502212965373598,
"loss": 2.1982,
"step": 5540
},
{
"epoch": 0.86,
"learning_rate": 0.00021470971101275707,
"loss": 2.1759,
"step": 5560
},
{
"epoch": 0.87,
"learning_rate": 0.00021439729237177814,
"loss": 2.1661,
"step": 5580
},
{
"epoch": 0.87,
"learning_rate": 0.00021408487373079926,
"loss": 2.2051,
"step": 5600
},
{
"epoch": 0.87,
"eval_loss": 2.3719565868377686,
"eval_runtime": 69.321,
"eval_samples_per_second": 28.851,
"eval_steps_per_second": 1.803,
"step": 5600
},
{
"epoch": 0.87,
"learning_rate": 0.00021377245508982033,
"loss": 2.1605,
"step": 5620
},
{
"epoch": 0.88,
"learning_rate": 0.00021346003644884143,
"loss": 2.1375,
"step": 5640
},
{
"epoch": 0.88,
"learning_rate": 0.0002131476178078625,
"loss": 2.1293,
"step": 5660
},
{
"epoch": 0.88,
"learning_rate": 0.00021283519916688362,
"loss": 2.2189,
"step": 5680
},
{
"epoch": 0.89,
"learning_rate": 0.0002125227805259047,
"loss": 2.1784,
"step": 5700
},
{
"epoch": 0.89,
"learning_rate": 0.00021221036188492578,
"loss": 2.1764,
"step": 5720
},
{
"epoch": 0.89,
"learning_rate": 0.00021189794324394685,
"loss": 2.1569,
"step": 5740
},
{
"epoch": 0.9,
"learning_rate": 0.00021158552460296795,
"loss": 2.1704,
"step": 5760
},
{
"epoch": 0.9,
"learning_rate": 0.00021127310596198904,
"loss": 2.1614,
"step": 5780
},
{
"epoch": 0.9,
"learning_rate": 0.00021096068732101014,
"loss": 2.2078,
"step": 5800
},
{
"epoch": 0.9,
"eval_loss": 2.370939016342163,
"eval_runtime": 69.2728,
"eval_samples_per_second": 28.871,
"eval_steps_per_second": 1.804,
"step": 5800
},
{
"epoch": 0.9,
"learning_rate": 0.0002106482686800312,
"loss": 2.198,
"step": 5820
},
{
"epoch": 0.91,
"learning_rate": 0.0002103358500390523,
"loss": 2.1735,
"step": 5840
},
{
"epoch": 0.91,
"learning_rate": 0.00021002343139807342,
"loss": 2.1936,
"step": 5860
},
{
"epoch": 0.91,
"learning_rate": 0.0002097110127570945,
"loss": 2.1559,
"step": 5880
},
{
"epoch": 0.92,
"learning_rate": 0.0002093985941161156,
"loss": 2.1856,
"step": 5900
},
{
"epoch": 0.92,
"learning_rate": 0.00020908617547513666,
"loss": 2.194,
"step": 5920
},
{
"epoch": 0.92,
"learning_rate": 0.00020877375683415778,
"loss": 2.1983,
"step": 5940
},
{
"epoch": 0.93,
"learning_rate": 0.00020846133819317885,
"loss": 2.1788,
"step": 5960
},
{
"epoch": 0.93,
"learning_rate": 0.00020814891955219994,
"loss": 2.2126,
"step": 5980
},
{
"epoch": 0.93,
"learning_rate": 0.000207836500911221,
"loss": 2.1454,
"step": 6000
},
{
"epoch": 0.93,
"eval_loss": 2.369137763977051,
"eval_runtime": 69.3036,
"eval_samples_per_second": 28.859,
"eval_steps_per_second": 1.804,
"step": 6000
},
{
"epoch": 0.94,
"learning_rate": 0.00020752408227024213,
"loss": 2.1603,
"step": 6020
},
{
"epoch": 0.94,
"learning_rate": 0.0002072116636292632,
"loss": 2.2075,
"step": 6040
},
{
"epoch": 0.94,
"learning_rate": 0.0002068992449882843,
"loss": 2.1817,
"step": 6060
},
{
"epoch": 0.94,
"learning_rate": 0.00020658682634730537,
"loss": 2.1917,
"step": 6080
},
{
"epoch": 0.95,
"learning_rate": 0.00020627440770632646,
"loss": 2.1727,
"step": 6100
},
{
"epoch": 0.95,
"learning_rate": 0.00020596198906534756,
"loss": 2.1985,
"step": 6120
},
{
"epoch": 0.95,
"learning_rate": 0.00020564957042436865,
"loss": 2.1888,
"step": 6140
},
{
"epoch": 0.96,
"learning_rate": 0.00020533715178338972,
"loss": 2.1425,
"step": 6160
},
{
"epoch": 0.96,
"learning_rate": 0.00020502473314241082,
"loss": 2.1659,
"step": 6180
},
{
"epoch": 0.96,
"learning_rate": 0.00020471231450143188,
"loss": 2.1768,
"step": 6200
},
{
"epoch": 0.96,
"eval_loss": 2.368589162826538,
"eval_runtime": 69.4033,
"eval_samples_per_second": 28.817,
"eval_steps_per_second": 1.801,
"step": 6200
},
{
"epoch": 0.97,
"learning_rate": 0.000204399895860453,
"loss": 2.1744,
"step": 6220
},
{
"epoch": 0.97,
"learning_rate": 0.00020408747721947407,
"loss": 2.1484,
"step": 6240
},
{
"epoch": 0.97,
"learning_rate": 0.00020377505857849517,
"loss": 2.2154,
"step": 6260
},
{
"epoch": 0.98,
"learning_rate": 0.00020346263993751624,
"loss": 2.1358,
"step": 6280
},
{
"epoch": 0.98,
"learning_rate": 0.00020315022129653736,
"loss": 2.1809,
"step": 6300
},
{
"epoch": 0.98,
"learning_rate": 0.00020283780265555843,
"loss": 2.1813,
"step": 6320
},
{
"epoch": 0.99,
"learning_rate": 0.00020252538401457952,
"loss": 2.1903,
"step": 6340
},
{
"epoch": 0.99,
"learning_rate": 0.0002022129653736006,
"loss": 2.1971,
"step": 6360
},
{
"epoch": 0.99,
"learning_rate": 0.00020190054673262172,
"loss": 2.2041,
"step": 6380
},
{
"epoch": 0.99,
"learning_rate": 0.00020158812809164278,
"loss": 2.2169,
"step": 6400
},
{
"epoch": 0.99,
"eval_loss": 2.3672330379486084,
"eval_runtime": 69.3516,
"eval_samples_per_second": 28.839,
"eval_steps_per_second": 1.802,
"step": 6400
},
{
"epoch": 1.0,
"learning_rate": 0.00020127570945066388,
"loss": 2.2101,
"step": 6420
},
{
"epoch": 1.0,
"learning_rate": 0.00020096329080968495,
"loss": 2.1739,
"step": 6440
},
{
"epoch": 1.0,
"learning_rate": 0.00020065087216870604,
"loss": 2.1764,
"step": 6460
},
{
"epoch": 1.01,
"learning_rate": 0.00020033845352772714,
"loss": 2.1718,
"step": 6480
},
{
"epoch": 1.01,
"learning_rate": 0.00020002603488674823,
"loss": 2.1688,
"step": 6500
},
{
"epoch": 1.01,
"learning_rate": 0.0001997136162457693,
"loss": 2.1322,
"step": 6520
},
{
"epoch": 1.02,
"learning_rate": 0.0001994011976047904,
"loss": 2.1593,
"step": 6540
},
{
"epoch": 1.02,
"learning_rate": 0.0001990887789638115,
"loss": 2.179,
"step": 6560
},
{
"epoch": 1.02,
"learning_rate": 0.0001987763603228326,
"loss": 2.139,
"step": 6580
},
{
"epoch": 1.03,
"learning_rate": 0.00019846394168185366,
"loss": 2.1594,
"step": 6600
},
{
"epoch": 1.03,
"eval_loss": 2.367051839828491,
"eval_runtime": 69.3473,
"eval_samples_per_second": 28.84,
"eval_steps_per_second": 1.803,
"step": 6600
},
{
"epoch": 1.03,
"learning_rate": 0.00019815152304087475,
"loss": 2.2033,
"step": 6620
},
{
"epoch": 1.03,
"learning_rate": 0.00019783910439989582,
"loss": 2.183,
"step": 6640
},
{
"epoch": 1.03,
"learning_rate": 0.00019752668575891694,
"loss": 2.1517,
"step": 6660
},
{
"epoch": 1.04,
"learning_rate": 0.000197214267117938,
"loss": 2.183,
"step": 6680
},
{
"epoch": 1.04,
"learning_rate": 0.0001969018484769591,
"loss": 2.197,
"step": 6700
},
{
"epoch": 1.04,
"learning_rate": 0.00019658942983598017,
"loss": 2.1778,
"step": 6720
},
{
"epoch": 1.05,
"learning_rate": 0.0001962770111950013,
"loss": 2.1745,
"step": 6740
},
{
"epoch": 1.05,
"learning_rate": 0.00019596459255402237,
"loss": 2.1585,
"step": 6760
},
{
"epoch": 1.05,
"learning_rate": 0.00019565217391304346,
"loss": 2.1708,
"step": 6780
},
{
"epoch": 1.06,
"learning_rate": 0.00019533975527206453,
"loss": 2.1649,
"step": 6800
},
{
"epoch": 1.06,
"eval_loss": 2.363710880279541,
"eval_runtime": 69.2642,
"eval_samples_per_second": 28.875,
"eval_steps_per_second": 1.805,
"step": 6800
},
{
"epoch": 1.06,
"learning_rate": 0.00019502733663108565,
"loss": 2.1391,
"step": 6820
},
{
"epoch": 1.06,
"learning_rate": 0.00019471491799010672,
"loss": 2.1939,
"step": 6840
},
{
"epoch": 1.07,
"learning_rate": 0.00019440249934912782,
"loss": 2.1558,
"step": 6860
},
{
"epoch": 1.07,
"learning_rate": 0.00019409008070814888,
"loss": 2.173,
"step": 6880
},
{
"epoch": 1.07,
"learning_rate": 0.00019377766206716998,
"loss": 2.1821,
"step": 6900
},
{
"epoch": 1.08,
"learning_rate": 0.00019346524342619107,
"loss": 2.16,
"step": 6920
},
{
"epoch": 1.08,
"learning_rate": 0.00019315282478521217,
"loss": 2.1808,
"step": 6940
},
{
"epoch": 1.08,
"learning_rate": 0.00019284040614423324,
"loss": 2.1355,
"step": 6960
},
{
"epoch": 1.08,
"learning_rate": 0.00019252798750325433,
"loss": 2.1813,
"step": 6980
},
{
"epoch": 1.09,
"learning_rate": 0.00019221556886227546,
"loss": 2.1677,
"step": 7000
},
{
"epoch": 1.09,
"eval_loss": 2.3648109436035156,
"eval_runtime": 69.3675,
"eval_samples_per_second": 28.832,
"eval_steps_per_second": 1.802,
"step": 7000
},
{
"epoch": 1.09,
"learning_rate": 0.00019190315022129652,
"loss": 2.1479,
"step": 7020
},
{
"epoch": 1.09,
"learning_rate": 0.00019159073158031762,
"loss": 2.1852,
"step": 7040
},
{
"epoch": 1.1,
"learning_rate": 0.0001912783129393387,
"loss": 2.14,
"step": 7060
},
{
"epoch": 1.1,
"learning_rate": 0.0001909658942983598,
"loss": 2.1332,
"step": 7080
},
{
"epoch": 1.1,
"learning_rate": 0.00019065347565738088,
"loss": 2.178,
"step": 7100
},
{
"epoch": 1.11,
"learning_rate": 0.00019034105701640197,
"loss": 2.1661,
"step": 7120
},
{
"epoch": 1.11,
"learning_rate": 0.00019002863837542304,
"loss": 2.1902,
"step": 7140
},
{
"epoch": 1.11,
"learning_rate": 0.00018971621973444417,
"loss": 2.1775,
"step": 7160
},
{
"epoch": 1.12,
"learning_rate": 0.00018940380109346523,
"loss": 2.2007,
"step": 7180
},
{
"epoch": 1.12,
"learning_rate": 0.00018909138245248633,
"loss": 2.2078,
"step": 7200
},
{
"epoch": 1.12,
"eval_loss": 2.3642289638519287,
"eval_runtime": 69.5476,
"eval_samples_per_second": 28.757,
"eval_steps_per_second": 1.797,
"step": 7200
},
{
"epoch": 1.12,
"learning_rate": 0.0001887789638115074,
"loss": 2.185,
"step": 7220
},
{
"epoch": 1.13,
"learning_rate": 0.0001884665451705285,
"loss": 2.1856,
"step": 7240
},
{
"epoch": 1.13,
"learning_rate": 0.0001881541265295496,
"loss": 2.2049,
"step": 7260
},
{
"epoch": 1.13,
"learning_rate": 0.00018784170788857068,
"loss": 2.1376,
"step": 7280
},
{
"epoch": 1.13,
"learning_rate": 0.00018752928924759175,
"loss": 2.1693,
"step": 7300
},
{
"epoch": 1.14,
"learning_rate": 0.00018721687060661285,
"loss": 2.1825,
"step": 7320
},
{
"epoch": 1.14,
"learning_rate": 0.00018690445196563392,
"loss": 2.1649,
"step": 7340
},
{
"epoch": 1.14,
"learning_rate": 0.00018659203332465504,
"loss": 2.1936,
"step": 7360
},
{
"epoch": 1.15,
"learning_rate": 0.0001862796146836761,
"loss": 2.143,
"step": 7380
},
{
"epoch": 1.15,
"learning_rate": 0.0001859671960426972,
"loss": 2.1617,
"step": 7400
},
{
"epoch": 1.15,
"eval_loss": 2.362150192260742,
"eval_runtime": 69.3218,
"eval_samples_per_second": 28.851,
"eval_steps_per_second": 1.803,
"step": 7400
},
{
"epoch": 1.15,
"learning_rate": 0.00018565477740171827,
"loss": 2.1555,
"step": 7420
},
{
"epoch": 1.16,
"learning_rate": 0.0001853423587607394,
"loss": 2.1639,
"step": 7440
},
{
"epoch": 1.16,
"learning_rate": 0.00018502994011976046,
"loss": 2.1678,
"step": 7460
},
{
"epoch": 1.16,
"learning_rate": 0.00018471752147878156,
"loss": 2.1775,
"step": 7480
},
{
"epoch": 1.17,
"learning_rate": 0.00018440510283780263,
"loss": 2.1784,
"step": 7500
},
{
"epoch": 1.17,
"learning_rate": 0.00018409268419682375,
"loss": 2.1499,
"step": 7520
},
{
"epoch": 1.17,
"learning_rate": 0.00018378026555584482,
"loss": 2.154,
"step": 7540
},
{
"epoch": 1.17,
"learning_rate": 0.0001834678469148659,
"loss": 2.1793,
"step": 7560
},
{
"epoch": 1.18,
"learning_rate": 0.00018315542827388698,
"loss": 2.2292,
"step": 7580
},
{
"epoch": 1.18,
"learning_rate": 0.00018284300963290808,
"loss": 2.1578,
"step": 7600
},
{
"epoch": 1.18,
"eval_loss": 2.3628857135772705,
"eval_runtime": 69.2564,
"eval_samples_per_second": 28.878,
"eval_steps_per_second": 1.805,
"step": 7600
},
{
"epoch": 1.18,
"learning_rate": 0.00018253059099192917,
"loss": 2.1494,
"step": 7620
},
{
"epoch": 1.19,
"learning_rate": 0.00018221817235095027,
"loss": 2.1669,
"step": 7640
},
{
"epoch": 1.19,
"learning_rate": 0.00018190575370997133,
"loss": 2.1447,
"step": 7660
},
{
"epoch": 1.19,
"learning_rate": 0.00018159333506899243,
"loss": 2.1663,
"step": 7680
},
{
"epoch": 1.2,
"learning_rate": 0.0001812809164280135,
"loss": 2.1871,
"step": 7700
},
{
"epoch": 1.2,
"learning_rate": 0.00018096849778703462,
"loss": 2.1338,
"step": 7720
},
{
"epoch": 1.2,
"learning_rate": 0.0001806560791460557,
"loss": 2.1767,
"step": 7740
},
{
"epoch": 1.21,
"learning_rate": 0.00018034366050507678,
"loss": 2.1694,
"step": 7760
},
{
"epoch": 1.21,
"learning_rate": 0.00018003124186409785,
"loss": 2.1674,
"step": 7780
},
{
"epoch": 1.21,
"learning_rate": 0.00017971882322311898,
"loss": 2.1863,
"step": 7800
},
{
"epoch": 1.21,
"eval_loss": 2.3613035678863525,
"eval_runtime": 69.2881,
"eval_samples_per_second": 28.865,
"eval_steps_per_second": 1.804,
"step": 7800
},
{
"epoch": 1.22,
"learning_rate": 0.00017940640458214004,
"loss": 2.1441,
"step": 7820
},
{
"epoch": 1.22,
"learning_rate": 0.00017909398594116114,
"loss": 2.1885,
"step": 7840
},
{
"epoch": 1.22,
"learning_rate": 0.0001787815673001822,
"loss": 2.1514,
"step": 7860
},
{
"epoch": 1.22,
"learning_rate": 0.00017846914865920333,
"loss": 2.2002,
"step": 7880
},
{
"epoch": 1.23,
"learning_rate": 0.0001781567300182244,
"loss": 2.1759,
"step": 7900
},
{
"epoch": 1.23,
"learning_rate": 0.0001778443113772455,
"loss": 2.1611,
"step": 7920
},
{
"epoch": 1.23,
"learning_rate": 0.00017753189273626656,
"loss": 2.1667,
"step": 7940
},
{
"epoch": 1.24,
"learning_rate": 0.00017721947409528768,
"loss": 2.1717,
"step": 7960
},
{
"epoch": 1.24,
"learning_rate": 0.00017690705545430875,
"loss": 2.1983,
"step": 7980
},
{
"epoch": 1.24,
"learning_rate": 0.00017659463681332985,
"loss": 2.2092,
"step": 8000
},
{
"epoch": 1.24,
"eval_loss": 2.3608274459838867,
"eval_runtime": 69.3364,
"eval_samples_per_second": 28.845,
"eval_steps_per_second": 1.803,
"step": 8000
},
{
"epoch": 1.25,
"learning_rate": 0.00017628221817235092,
"loss": 2.1305,
"step": 8020
},
{
"epoch": 1.25,
"learning_rate": 0.000175969799531372,
"loss": 2.1431,
"step": 8040
},
{
"epoch": 1.25,
"learning_rate": 0.0001756573808903931,
"loss": 2.1384,
"step": 8060
},
{
"epoch": 1.26,
"learning_rate": 0.0001753449622494142,
"loss": 2.2093,
"step": 8080
},
{
"epoch": 1.26,
"learning_rate": 0.00017503254360843527,
"loss": 2.1271,
"step": 8100
},
{
"epoch": 1.26,
"learning_rate": 0.00017472012496745637,
"loss": 2.1466,
"step": 8120
},
{
"epoch": 1.26,
"learning_rate": 0.0001744077063264775,
"loss": 2.1578,
"step": 8140
},
{
"epoch": 1.27,
"learning_rate": 0.00017409528768549856,
"loss": 2.1632,
"step": 8160
},
{
"epoch": 1.27,
"learning_rate": 0.00017378286904451965,
"loss": 2.1465,
"step": 8180
},
{
"epoch": 1.27,
"learning_rate": 0.00017347045040354072,
"loss": 2.2226,
"step": 8200
},
{
"epoch": 1.27,
"eval_loss": 2.35835599899292,
"eval_runtime": 69.2657,
"eval_samples_per_second": 28.874,
"eval_steps_per_second": 1.805,
"step": 8200
},
{
"epoch": 1.28,
"learning_rate": 0.00017315803176256184,
"loss": 2.1585,
"step": 8220
},
{
"epoch": 1.28,
"learning_rate": 0.0001728456131215829,
"loss": 2.1529,
"step": 8240
},
{
"epoch": 1.28,
"learning_rate": 0.000172533194480604,
"loss": 2.1663,
"step": 8260
},
{
"epoch": 1.29,
"learning_rate": 0.00017222077583962508,
"loss": 2.1422,
"step": 8280
},
{
"epoch": 1.29,
"learning_rate": 0.00017190835719864617,
"loss": 2.158,
"step": 8300
},
{
"epoch": 1.29,
"learning_rate": 0.00017159593855766727,
"loss": 2.1984,
"step": 8320
},
{
"epoch": 1.3,
"learning_rate": 0.00017128351991668836,
"loss": 2.1395,
"step": 8340
},
{
"epoch": 1.3,
"learning_rate": 0.00017097110127570943,
"loss": 2.14,
"step": 8360
},
{
"epoch": 1.3,
"learning_rate": 0.00017065868263473053,
"loss": 2.1657,
"step": 8380
},
{
"epoch": 1.31,
"learning_rate": 0.00017036188492580056,
"loss": 2.167,
"step": 8400
},
{
"epoch": 1.31,
"eval_loss": 2.35697603225708,
"eval_runtime": 69.2685,
"eval_samples_per_second": 28.873,
"eval_steps_per_second": 1.805,
"step": 8400
},
{
"epoch": 1.31,
"learning_rate": 0.00017004946628482165,
"loss": 2.1396,
"step": 8420
},
{
"epoch": 1.31,
"learning_rate": 0.00016973704764384272,
"loss": 2.1777,
"step": 8440
},
{
"epoch": 1.31,
"learning_rate": 0.00016942462900286384,
"loss": 2.1366,
"step": 8460
},
{
"epoch": 1.32,
"learning_rate": 0.0001691122103618849,
"loss": 2.1625,
"step": 8480
},
{
"epoch": 1.32,
"learning_rate": 0.000168799791720906,
"loss": 2.1859,
"step": 8500
},
{
"epoch": 1.32,
"learning_rate": 0.00016848737307992707,
"loss": 2.1705,
"step": 8520
},
{
"epoch": 1.33,
"learning_rate": 0.0001681749544389482,
"loss": 2.1971,
"step": 8540
},
{
"epoch": 1.33,
"learning_rate": 0.00016786253579796927,
"loss": 2.1937,
"step": 8560
},
{
"epoch": 1.33,
"learning_rate": 0.00016755011715699036,
"loss": 2.1436,
"step": 8580
},
{
"epoch": 1.34,
"learning_rate": 0.00016723769851601143,
"loss": 2.1592,
"step": 8600
},
{
"epoch": 1.34,
"eval_loss": 2.3576247692108154,
"eval_runtime": 69.277,
"eval_samples_per_second": 28.87,
"eval_steps_per_second": 1.804,
"step": 8600
},
{
"epoch": 1.34,
"learning_rate": 0.00016692527987503252,
"loss": 2.1745,
"step": 8620
},
{
"epoch": 1.34,
"learning_rate": 0.00016661286123405362,
"loss": 2.1517,
"step": 8640
},
{
"epoch": 1.35,
"learning_rate": 0.00016630044259307472,
"loss": 2.1921,
"step": 8660
},
{
"epoch": 1.35,
"learning_rate": 0.00016598802395209578,
"loss": 2.1703,
"step": 8680
},
{
"epoch": 1.35,
"learning_rate": 0.00016567560531111688,
"loss": 2.1223,
"step": 8700
},
{
"epoch": 1.36,
"learning_rate": 0.00016536318667013795,
"loss": 2.1748,
"step": 8720
},
{
"epoch": 1.36,
"learning_rate": 0.00016505076802915907,
"loss": 2.145,
"step": 8740
},
{
"epoch": 1.36,
"learning_rate": 0.00016473834938818014,
"loss": 2.1077,
"step": 8760
},
{
"epoch": 1.36,
"learning_rate": 0.00016442593074720123,
"loss": 2.1571,
"step": 8780
},
{
"epoch": 1.37,
"learning_rate": 0.0001641135121062223,
"loss": 2.1946,
"step": 8800
},
{
"epoch": 1.37,
"eval_loss": 2.3559648990631104,
"eval_runtime": 69.3886,
"eval_samples_per_second": 28.823,
"eval_steps_per_second": 1.801,
"step": 8800
},
{
"epoch": 1.37,
"learning_rate": 0.00016380109346524342,
"loss": 2.1635,
"step": 8820
},
{
"epoch": 1.37,
"learning_rate": 0.0001634886748242645,
"loss": 2.1546,
"step": 8840
},
{
"epoch": 1.38,
"learning_rate": 0.0001631762561832856,
"loss": 2.1359,
"step": 8860
},
{
"epoch": 1.38,
"learning_rate": 0.00016286383754230666,
"loss": 2.1741,
"step": 8880
},
{
"epoch": 1.38,
"learning_rate": 0.00016255141890132778,
"loss": 2.1382,
"step": 8900
},
{
"epoch": 1.39,
"learning_rate": 0.00016223900026034885,
"loss": 2.1514,
"step": 8920
},
{
"epoch": 1.39,
"learning_rate": 0.00016192658161936994,
"loss": 2.17,
"step": 8940
},
{
"epoch": 1.39,
"learning_rate": 0.000161614162978391,
"loss": 2.1784,
"step": 8960
},
{
"epoch": 1.4,
"learning_rate": 0.0001613017443374121,
"loss": 2.1869,
"step": 8980
},
{
"epoch": 1.4,
"learning_rate": 0.0001609893256964332,
"loss": 2.155,
"step": 9000
},
{
"epoch": 1.4,
"eval_loss": 2.3562612533569336,
"eval_runtime": 70.7208,
"eval_samples_per_second": 28.28,
"eval_steps_per_second": 1.768,
"step": 9000
},
{
"epoch": 1.4,
"learning_rate": 0.0001606769070554543,
"loss": 2.1467,
"step": 9020
},
{
"epoch": 1.4,
"learning_rate": 0.00016036448841447537,
"loss": 2.1662,
"step": 9040
},
{
"epoch": 1.41,
"learning_rate": 0.00016005206977349646,
"loss": 2.1928,
"step": 9060
},
{
"epoch": 1.41,
"learning_rate": 0.00015973965113251756,
"loss": 2.1084,
"step": 9080
},
{
"epoch": 1.41,
"learning_rate": 0.00015942723249153865,
"loss": 2.182,
"step": 9100
},
{
"epoch": 1.42,
"learning_rate": 0.00015911481385055975,
"loss": 2.1502,
"step": 9120
},
{
"epoch": 1.42,
"learning_rate": 0.00015880239520958082,
"loss": 2.1645,
"step": 9140
},
{
"epoch": 1.42,
"learning_rate": 0.00015848997656860194,
"loss": 2.1246,
"step": 9160
},
{
"epoch": 1.43,
"learning_rate": 0.000158177557927623,
"loss": 2.1769,
"step": 9180
},
{
"epoch": 1.43,
"learning_rate": 0.0001578651392866441,
"loss": 2.1772,
"step": 9200
},
{
"epoch": 1.43,
"eval_loss": 2.354128360748291,
"eval_runtime": 70.4883,
"eval_samples_per_second": 28.374,
"eval_steps_per_second": 1.773,
"step": 9200
},
{
"epoch": 1.43,
"learning_rate": 0.00015755272064566517,
"loss": 2.1777,
"step": 9220
},
{
"epoch": 1.44,
"learning_rate": 0.0001572403020046863,
"loss": 2.1749,
"step": 9240
},
{
"epoch": 1.44,
"learning_rate": 0.00015692788336370736,
"loss": 2.1861,
"step": 9260
},
{
"epoch": 1.44,
"learning_rate": 0.00015661546472272846,
"loss": 2.1567,
"step": 9280
},
{
"epoch": 1.45,
"learning_rate": 0.00015630304608174952,
"loss": 2.1426,
"step": 9300
},
{
"epoch": 1.45,
"learning_rate": 0.00015599062744077062,
"loss": 2.1658,
"step": 9320
},
{
"epoch": 1.45,
"learning_rate": 0.00015567820879979172,
"loss": 2.1639,
"step": 9340
},
{
"epoch": 1.45,
"learning_rate": 0.0001553657901588128,
"loss": 2.1897,
"step": 9360
},
{
"epoch": 1.46,
"learning_rate": 0.00015505337151783388,
"loss": 2.1439,
"step": 9380
},
{
"epoch": 1.46,
"learning_rate": 0.00015474095287685497,
"loss": 2.1326,
"step": 9400
},
{
"epoch": 1.46,
"eval_loss": 2.352673292160034,
"eval_runtime": 69.2871,
"eval_samples_per_second": 28.865,
"eval_steps_per_second": 1.804,
"step": 9400
},
{
"epoch": 1.46,
"learning_rate": 0.00015442853423587604,
"loss": 2.139,
"step": 9420
},
{
"epoch": 1.47,
"learning_rate": 0.00015411611559489717,
"loss": 2.1087,
"step": 9440
},
{
"epoch": 1.47,
"learning_rate": 0.00015380369695391823,
"loss": 2.1528,
"step": 9460
},
{
"epoch": 1.47,
"learning_rate": 0.00015349127831293933,
"loss": 2.1866,
"step": 9480
},
{
"epoch": 1.48,
"learning_rate": 0.0001531788596719604,
"loss": 2.1436,
"step": 9500
},
{
"epoch": 1.48,
"learning_rate": 0.00015286644103098152,
"loss": 2.1699,
"step": 9520
},
{
"epoch": 1.48,
"learning_rate": 0.0001525540223900026,
"loss": 2.1415,
"step": 9540
},
{
"epoch": 1.49,
"learning_rate": 0.00015224160374902368,
"loss": 2.1092,
"step": 9560
},
{
"epoch": 1.49,
"learning_rate": 0.00015192918510804475,
"loss": 2.1422,
"step": 9580
},
{
"epoch": 1.49,
"learning_rate": 0.00015161676646706587,
"loss": 2.1677,
"step": 9600
},
{
"epoch": 1.49,
"eval_loss": 2.3518292903900146,
"eval_runtime": 69.3029,
"eval_samples_per_second": 28.859,
"eval_steps_per_second": 1.804,
"step": 9600
},
{
"epoch": 1.49,
"learning_rate": 0.00015130434782608694,
"loss": 2.1594,
"step": 9620
},
{
"epoch": 1.5,
"learning_rate": 0.00015099192918510804,
"loss": 2.1539,
"step": 9640
},
{
"epoch": 1.5,
"learning_rate": 0.0001506795105441291,
"loss": 2.1343,
"step": 9660
},
{
"epoch": 1.5,
"learning_rate": 0.00015036709190315023,
"loss": 2.1386,
"step": 9680
},
{
"epoch": 1.51,
"learning_rate": 0.0001500546732621713,
"loss": 2.1512,
"step": 9700
},
{
"epoch": 1.51,
"learning_rate": 0.0001497422546211924,
"loss": 2.1669,
"step": 9720
},
{
"epoch": 1.51,
"learning_rate": 0.0001494298359802135,
"loss": 2.158,
"step": 9740
},
{
"epoch": 1.52,
"learning_rate": 0.00014911741733923456,
"loss": 2.1643,
"step": 9760
},
{
"epoch": 1.52,
"learning_rate": 0.00014880499869825565,
"loss": 2.1612,
"step": 9780
},
{
"epoch": 1.52,
"learning_rate": 0.00014849258005727675,
"loss": 2.1441,
"step": 9800
},
{
"epoch": 1.52,
"eval_loss": 2.35211181640625,
"eval_runtime": 69.2821,
"eval_samples_per_second": 28.867,
"eval_steps_per_second": 1.804,
"step": 9800
},
{
"epoch": 1.53,
"learning_rate": 0.00014818016141629784,
"loss": 2.1704,
"step": 9820
},
{
"epoch": 1.53,
"learning_rate": 0.0001478677427753189,
"loss": 2.1546,
"step": 9840
},
{
"epoch": 1.53,
"learning_rate": 0.00014755532413434,
"loss": 2.1909,
"step": 9860
},
{
"epoch": 1.54,
"learning_rate": 0.0001472429054933611,
"loss": 2.149,
"step": 9880
},
{
"epoch": 1.54,
"learning_rate": 0.00014693048685238217,
"loss": 2.1419,
"step": 9900
},
{
"epoch": 1.54,
"learning_rate": 0.00014661806821140327,
"loss": 2.1465,
"step": 9920
},
{
"epoch": 1.54,
"learning_rate": 0.00014630564957042436,
"loss": 2.1551,
"step": 9940
},
{
"epoch": 1.55,
"learning_rate": 0.00014599323092944546,
"loss": 2.1526,
"step": 9960
},
{
"epoch": 1.55,
"learning_rate": 0.00014568081228846653,
"loss": 2.1437,
"step": 9980
},
{
"epoch": 1.55,
"learning_rate": 0.00014536839364748762,
"loss": 2.1659,
"step": 10000
},
{
"epoch": 1.55,
"eval_loss": 2.3507654666900635,
"eval_runtime": 69.2997,
"eval_samples_per_second": 28.86,
"eval_steps_per_second": 1.804,
"step": 10000
},
{
"epoch": 1.56,
"learning_rate": 0.00014505597500650872,
"loss": 2.14,
"step": 10020
},
{
"epoch": 1.56,
"learning_rate": 0.0001447435563655298,
"loss": 2.1289,
"step": 10040
},
{
"epoch": 1.56,
"learning_rate": 0.00014443113772455088,
"loss": 2.1226,
"step": 10060
},
{
"epoch": 1.57,
"learning_rate": 0.00014411871908357198,
"loss": 2.1627,
"step": 10080
},
{
"epoch": 1.57,
"learning_rate": 0.00014380630044259307,
"loss": 2.1759,
"step": 10100
},
{
"epoch": 1.57,
"learning_rate": 0.00014349388180161414,
"loss": 2.1511,
"step": 10120
},
{
"epoch": 1.58,
"learning_rate": 0.00014318146316063523,
"loss": 2.1275,
"step": 10140
},
{
"epoch": 1.58,
"learning_rate": 0.00014286904451965633,
"loss": 2.1638,
"step": 10160
},
{
"epoch": 1.58,
"learning_rate": 0.00014255662587867743,
"loss": 2.1494,
"step": 10180
},
{
"epoch": 1.59,
"learning_rate": 0.0001422442072376985,
"loss": 2.1554,
"step": 10200
},
{
"epoch": 1.59,
"eval_loss": 2.349271059036255,
"eval_runtime": 69.2627,
"eval_samples_per_second": 28.876,
"eval_steps_per_second": 1.805,
"step": 10200
},
{
"epoch": 1.59,
"learning_rate": 0.0001419317885967196,
"loss": 2.133,
"step": 10220
},
{
"epoch": 1.59,
"learning_rate": 0.00014161936995574068,
"loss": 2.1515,
"step": 10240
},
{
"epoch": 1.59,
"learning_rate": 0.00014130695131476178,
"loss": 2.1262,
"step": 10260
},
{
"epoch": 1.6,
"learning_rate": 0.00014099453267378285,
"loss": 2.142,
"step": 10280
},
{
"epoch": 1.6,
"learning_rate": 0.00014068211403280394,
"loss": 2.1578,
"step": 10300
},
{
"epoch": 1.6,
"learning_rate": 0.00014036969539182504,
"loss": 2.1583,
"step": 10320
},
{
"epoch": 1.61,
"learning_rate": 0.0001400572767508461,
"loss": 2.1043,
"step": 10340
},
{
"epoch": 1.61,
"learning_rate": 0.0001397448581098672,
"loss": 2.1539,
"step": 10360
},
{
"epoch": 1.61,
"learning_rate": 0.0001394324394688883,
"loss": 2.1189,
"step": 10380
},
{
"epoch": 1.62,
"learning_rate": 0.0001391200208279094,
"loss": 2.1484,
"step": 10400
},
{
"epoch": 1.62,
"eval_loss": 2.3479487895965576,
"eval_runtime": 69.2625,
"eval_samples_per_second": 28.876,
"eval_steps_per_second": 1.805,
"step": 10400
},
{
"epoch": 1.62,
"learning_rate": 0.00013880760218693046,
"loss": 2.1993,
"step": 10420
},
{
"epoch": 1.62,
"learning_rate": 0.00013849518354595156,
"loss": 2.1869,
"step": 10440
},
{
"epoch": 1.63,
"learning_rate": 0.00013818276490497265,
"loss": 2.1644,
"step": 10460
},
{
"epoch": 1.63,
"learning_rate": 0.00013787034626399375,
"loss": 2.1751,
"step": 10480
},
{
"epoch": 1.63,
"learning_rate": 0.00013755792762301482,
"loss": 2.1416,
"step": 10500
},
{
"epoch": 1.63,
"learning_rate": 0.0001372455089820359,
"loss": 2.1809,
"step": 10520
},
{
"epoch": 1.64,
"learning_rate": 0.000136933090341057,
"loss": 2.1653,
"step": 10540
},
{
"epoch": 1.64,
"learning_rate": 0.00013662067170007808,
"loss": 2.1026,
"step": 10560
},
{
"epoch": 1.64,
"learning_rate": 0.00013630825305909917,
"loss": 2.1503,
"step": 10580
},
{
"epoch": 1.65,
"learning_rate": 0.00013599583441812027,
"loss": 2.1289,
"step": 10600
},
{
"epoch": 1.65,
"eval_loss": 2.3468515872955322,
"eval_runtime": 69.2274,
"eval_samples_per_second": 28.89,
"eval_steps_per_second": 1.806,
"step": 10600
},
{
"epoch": 1.65,
"learning_rate": 0.00013568341577714136,
"loss": 2.1929,
"step": 10620
},
{
"epoch": 1.65,
"learning_rate": 0.00013537099713616243,
"loss": 2.1547,
"step": 10640
},
{
"epoch": 1.66,
"learning_rate": 0.00013505857849518353,
"loss": 2.1571,
"step": 10660
},
{
"epoch": 1.66,
"learning_rate": 0.00013474615985420462,
"loss": 2.1649,
"step": 10680
},
{
"epoch": 1.66,
"learning_rate": 0.00013443374121322572,
"loss": 2.1647,
"step": 10700
},
{
"epoch": 1.67,
"learning_rate": 0.00013412132257224679,
"loss": 2.206,
"step": 10720
},
{
"epoch": 1.67,
"learning_rate": 0.00013380890393126788,
"loss": 2.1377,
"step": 10740
},
{
"epoch": 1.67,
"learning_rate": 0.00013349648529028898,
"loss": 2.1347,
"step": 10760
},
{
"epoch": 1.68,
"learning_rate": 0.00013318406664931004,
"loss": 2.1948,
"step": 10780
},
{
"epoch": 1.68,
"learning_rate": 0.00013287164800833114,
"loss": 2.1844,
"step": 10800
},
{
"epoch": 1.68,
"eval_loss": 2.347837209701538,
"eval_runtime": 69.2425,
"eval_samples_per_second": 28.884,
"eval_steps_per_second": 1.805,
"step": 10800
},
{
"epoch": 1.68,
"learning_rate": 0.00013255922936735224,
"loss": 2.1515,
"step": 10820
},
{
"epoch": 1.68,
"learning_rate": 0.00013224681072637333,
"loss": 2.1885,
"step": 10840
},
{
"epoch": 1.69,
"learning_rate": 0.00013193439208539443,
"loss": 2.143,
"step": 10860
},
{
"epoch": 1.69,
"learning_rate": 0.00013162197344441552,
"loss": 2.1671,
"step": 10880
},
{
"epoch": 1.69,
"learning_rate": 0.0001313095548034366,
"loss": 2.1426,
"step": 10900
},
{
"epoch": 1.7,
"learning_rate": 0.00013099713616245769,
"loss": 2.1653,
"step": 10920
},
{
"epoch": 1.7,
"learning_rate": 0.00013068471752147878,
"loss": 2.1774,
"step": 10940
},
{
"epoch": 1.7,
"learning_rate": 0.00013037229888049988,
"loss": 2.1344,
"step": 10960
},
{
"epoch": 1.71,
"learning_rate": 0.00013005988023952094,
"loss": 2.1217,
"step": 10980
},
{
"epoch": 1.71,
"learning_rate": 0.00012974746159854204,
"loss": 2.1281,
"step": 11000
},
{
"epoch": 1.71,
"eval_loss": 2.345808982849121,
"eval_runtime": 69.2499,
"eval_samples_per_second": 28.881,
"eval_steps_per_second": 1.805,
"step": 11000
},
{
"epoch": 1.71,
"learning_rate": 0.00012943504295756314,
"loss": 2.1459,
"step": 11020
},
{
"epoch": 1.72,
"learning_rate": 0.0001291226243165842,
"loss": 2.1294,
"step": 11040
},
{
"epoch": 1.72,
"learning_rate": 0.0001288102056756053,
"loss": 2.1455,
"step": 11060
},
{
"epoch": 1.72,
"learning_rate": 0.0001284977870346264,
"loss": 2.1219,
"step": 11080
},
{
"epoch": 1.72,
"learning_rate": 0.0001281853683936475,
"loss": 2.1696,
"step": 11100
},
{
"epoch": 1.73,
"learning_rate": 0.00012787294975266856,
"loss": 2.1474,
"step": 11120
},
{
"epoch": 1.73,
"learning_rate": 0.00012756053111168965,
"loss": 2.1436,
"step": 11140
},
{
"epoch": 1.73,
"learning_rate": 0.00012724811247071075,
"loss": 2.1785,
"step": 11160
},
{
"epoch": 1.74,
"learning_rate": 0.00012693569382973184,
"loss": 2.1677,
"step": 11180
},
{
"epoch": 1.74,
"learning_rate": 0.0001266232751887529,
"loss": 2.1564,
"step": 11200
},
{
"epoch": 1.74,
"eval_loss": 2.3451294898986816,
"eval_runtime": 69.2454,
"eval_samples_per_second": 28.883,
"eval_steps_per_second": 1.805,
"step": 11200
},
{
"epoch": 1.74,
"learning_rate": 0.000126310856547774,
"loss": 2.1793,
"step": 11220
},
{
"epoch": 1.75,
"learning_rate": 0.0001259984379067951,
"loss": 2.1583,
"step": 11240
},
{
"epoch": 1.75,
"learning_rate": 0.00012568601926581617,
"loss": 2.1482,
"step": 11260
},
{
"epoch": 1.75,
"learning_rate": 0.00012537360062483727,
"loss": 2.1393,
"step": 11280
},
{
"epoch": 1.76,
"learning_rate": 0.00012506118198385836,
"loss": 2.1586,
"step": 11300
},
{
"epoch": 1.76,
"learning_rate": 0.00012474876334287946,
"loss": 2.1533,
"step": 11320
},
{
"epoch": 1.76,
"learning_rate": 0.00012443634470190053,
"loss": 2.1516,
"step": 11340
},
{
"epoch": 1.77,
"learning_rate": 0.00012412392606092162,
"loss": 2.1184,
"step": 11360
},
{
"epoch": 1.77,
"learning_rate": 0.00012381150741994272,
"loss": 2.1162,
"step": 11380
},
{
"epoch": 1.77,
"learning_rate": 0.0001234990887789638,
"loss": 2.1588,
"step": 11400
},
{
"epoch": 1.77,
"eval_loss": 2.3451669216156006,
"eval_runtime": 69.2383,
"eval_samples_per_second": 28.886,
"eval_steps_per_second": 1.805,
"step": 11400
},
{
"epoch": 1.77,
"learning_rate": 0.00012318667013798488,
"loss": 2.1588,
"step": 11420
},
{
"epoch": 1.78,
"learning_rate": 0.00012287425149700598,
"loss": 2.1463,
"step": 11440
},
{
"epoch": 1.78,
"learning_rate": 0.00012256183285602707,
"loss": 2.1498,
"step": 11460
},
{
"epoch": 1.78,
"learning_rate": 0.00012224941421504814,
"loss": 2.1663,
"step": 11480
},
{
"epoch": 1.79,
"learning_rate": 0.00012193699557406924,
"loss": 2.1306,
"step": 11500
},
{
"epoch": 1.79,
"learning_rate": 0.00012162457693309033,
"loss": 2.1542,
"step": 11520
},
{
"epoch": 1.79,
"learning_rate": 0.00012131215829211141,
"loss": 2.1513,
"step": 11540
},
{
"epoch": 1.8,
"learning_rate": 0.00012099973965113251,
"loss": 2.2031,
"step": 11560
},
{
"epoch": 1.8,
"learning_rate": 0.00012068732101015359,
"loss": 2.1438,
"step": 11580
},
{
"epoch": 1.8,
"learning_rate": 0.00012037490236917469,
"loss": 2.1431,
"step": 11600
},
{
"epoch": 1.8,
"eval_loss": 2.3447554111480713,
"eval_runtime": 69.2865,
"eval_samples_per_second": 28.866,
"eval_steps_per_second": 1.804,
"step": 11600
},
{
"epoch": 1.81,
"learning_rate": 0.00012006248372819577,
"loss": 2.1272,
"step": 11620
},
{
"epoch": 1.81,
"learning_rate": 0.00011975006508721686,
"loss": 2.1584,
"step": 11640
},
{
"epoch": 1.81,
"learning_rate": 0.00011943764644623794,
"loss": 2.128,
"step": 11660
},
{
"epoch": 1.82,
"learning_rate": 0.00011912522780525903,
"loss": 2.1461,
"step": 11680
},
{
"epoch": 1.82,
"learning_rate": 0.00011881280916428012,
"loss": 2.1411,
"step": 11700
},
{
"epoch": 1.82,
"learning_rate": 0.0001185003905233012,
"loss": 2.1592,
"step": 11720
},
{
"epoch": 1.82,
"learning_rate": 0.0001181879718823223,
"loss": 2.1642,
"step": 11740
},
{
"epoch": 1.83,
"learning_rate": 0.00011787555324134338,
"loss": 2.1914,
"step": 11760
},
{
"epoch": 1.83,
"learning_rate": 0.00011756313460036448,
"loss": 2.1612,
"step": 11780
},
{
"epoch": 1.83,
"learning_rate": 0.00011725071595938556,
"loss": 2.1452,
"step": 11800
},
{
"epoch": 1.83,
"eval_loss": 2.3442630767822266,
"eval_runtime": 69.2459,
"eval_samples_per_second": 28.883,
"eval_steps_per_second": 1.805,
"step": 11800
},
{
"epoch": 1.84,
"learning_rate": 0.00011693829731840665,
"loss": 2.1453,
"step": 11820
},
{
"epoch": 1.84,
"learning_rate": 0.00011662587867742774,
"loss": 2.1251,
"step": 11840
},
{
"epoch": 1.84,
"learning_rate": 0.00011631346003644882,
"loss": 2.1412,
"step": 11860
},
{
"epoch": 1.85,
"learning_rate": 0.00011600104139546991,
"loss": 2.1033,
"step": 11880
},
{
"epoch": 1.85,
"learning_rate": 0.000115688622754491,
"loss": 2.1219,
"step": 11900
},
{
"epoch": 1.85,
"learning_rate": 0.00011537620411351209,
"loss": 2.1831,
"step": 11920
},
{
"epoch": 1.86,
"learning_rate": 0.00011506378547253317,
"loss": 2.1434,
"step": 11940
},
{
"epoch": 1.86,
"learning_rate": 0.00011475136683155427,
"loss": 2.1439,
"step": 11960
},
{
"epoch": 1.86,
"learning_rate": 0.00011443894819057536,
"loss": 2.1377,
"step": 11980
},
{
"epoch": 1.86,
"learning_rate": 0.00011412652954959646,
"loss": 2.1345,
"step": 12000
},
{
"epoch": 1.86,
"eval_loss": 2.342855453491211,
"eval_runtime": 69.2714,
"eval_samples_per_second": 28.872,
"eval_steps_per_second": 1.804,
"step": 12000
},
{
"epoch": 1.87,
"learning_rate": 0.00011381411090861754,
"loss": 2.1527,
"step": 12020
},
{
"epoch": 1.87,
"learning_rate": 0.00011350169226763864,
"loss": 2.1737,
"step": 12040
},
{
"epoch": 1.87,
"learning_rate": 0.00011318927362665972,
"loss": 2.137,
"step": 12060
},
{
"epoch": 1.88,
"learning_rate": 0.00011287685498568081,
"loss": 2.1616,
"step": 12080
},
{
"epoch": 1.88,
"learning_rate": 0.0001125644363447019,
"loss": 2.1688,
"step": 12100
},
{
"epoch": 1.88,
"learning_rate": 0.00011225201770372299,
"loss": 2.1746,
"step": 12120
},
{
"epoch": 1.89,
"learning_rate": 0.00011193959906274407,
"loss": 2.1552,
"step": 12140
},
{
"epoch": 1.89,
"learning_rate": 0.00011162718042176515,
"loss": 2.1643,
"step": 12160
},
{
"epoch": 1.89,
"learning_rate": 0.00011131476178078625,
"loss": 2.1494,
"step": 12180
},
{
"epoch": 1.9,
"learning_rate": 0.00011100234313980733,
"loss": 2.1112,
"step": 12200
},
{
"epoch": 1.9,
"eval_loss": 2.34304141998291,
"eval_runtime": 72.1422,
"eval_samples_per_second": 27.723,
"eval_steps_per_second": 1.733,
"step": 12200
},
{
"epoch": 1.9,
"learning_rate": 0.00011068992449882843,
"loss": 2.1505,
"step": 12220
},
{
"epoch": 1.9,
"learning_rate": 0.00011037750585784951,
"loss": 2.1722,
"step": 12240
},
{
"epoch": 1.91,
"learning_rate": 0.0001100650872168706,
"loss": 2.1582,
"step": 12260
},
{
"epoch": 1.91,
"learning_rate": 0.00010975266857589169,
"loss": 2.1806,
"step": 12280
},
{
"epoch": 1.91,
"learning_rate": 0.00010944024993491278,
"loss": 2.1508,
"step": 12300
},
{
"epoch": 1.91,
"learning_rate": 0.00010912783129393386,
"loss": 2.1654,
"step": 12320
},
{
"epoch": 1.92,
"learning_rate": 0.00010881541265295496,
"loss": 2.131,
"step": 12340
},
{
"epoch": 1.92,
"learning_rate": 0.00010850299401197604,
"loss": 2.1301,
"step": 12360
},
{
"epoch": 1.92,
"learning_rate": 0.00010819057537099712,
"loss": 2.1312,
"step": 12380
},
{
"epoch": 1.93,
"learning_rate": 0.00010787815673001822,
"loss": 2.1301,
"step": 12400
},
{
"epoch": 1.93,
"eval_loss": 2.3404922485351562,
"eval_runtime": 71.3367,
"eval_samples_per_second": 28.036,
"eval_steps_per_second": 1.752,
"step": 12400
},
{
"epoch": 1.93,
"learning_rate": 0.00010758135902108825,
"loss": 2.1398,
"step": 12420
},
{
"epoch": 1.93,
"learning_rate": 0.00010726894038010933,
"loss": 2.1449,
"step": 12440
},
{
"epoch": 1.94,
"learning_rate": 0.00010695652173913043,
"loss": 2.1498,
"step": 12460
},
{
"epoch": 1.94,
"learning_rate": 0.00010664410309815151,
"loss": 2.1484,
"step": 12480
},
{
"epoch": 1.94,
"learning_rate": 0.0001063316844571726,
"loss": 2.1705,
"step": 12500
},
{
"epoch": 1.95,
"learning_rate": 0.00010601926581619368,
"loss": 2.1236,
"step": 12520
},
{
"epoch": 1.95,
"learning_rate": 0.00010570684717521478,
"loss": 2.1435,
"step": 12540
},
{
"epoch": 1.95,
"learning_rate": 0.00010539442853423586,
"loss": 2.1656,
"step": 12560
},
{
"epoch": 1.95,
"learning_rate": 0.00010508200989325696,
"loss": 2.1459,
"step": 12580
},
{
"epoch": 1.96,
"learning_rate": 0.00010476959125227804,
"loss": 2.1392,
"step": 12600
},
{
"epoch": 1.96,
"eval_loss": 2.3410892486572266,
"eval_runtime": 72.1407,
"eval_samples_per_second": 27.724,
"eval_steps_per_second": 1.733,
"step": 12600
},
{
"epoch": 1.96,
"learning_rate": 0.00010445717261129913,
"loss": 2.1399,
"step": 12620
},
{
"epoch": 1.96,
"learning_rate": 0.00010414475397032022,
"loss": 2.1979,
"step": 12640
},
{
"epoch": 1.97,
"learning_rate": 0.0001038323353293413,
"loss": 2.1596,
"step": 12660
},
{
"epoch": 1.97,
"learning_rate": 0.0001035199166883624,
"loss": 2.1817,
"step": 12680
},
{
"epoch": 1.97,
"learning_rate": 0.00010320749804738348,
"loss": 2.0972,
"step": 12700
},
{
"epoch": 1.98,
"learning_rate": 0.00010289507940640457,
"loss": 2.1293,
"step": 12720
},
{
"epoch": 1.98,
"learning_rate": 0.00010258266076542565,
"loss": 2.1362,
"step": 12740
},
{
"epoch": 1.98,
"learning_rate": 0.00010227024212444675,
"loss": 2.1474,
"step": 12760
},
{
"epoch": 1.99,
"learning_rate": 0.00010195782348346783,
"loss": 2.2004,
"step": 12780
},
{
"epoch": 1.99,
"learning_rate": 0.00010164540484248893,
"loss": 2.1221,
"step": 12800
},
{
"epoch": 1.99,
"eval_loss": 2.340029716491699,
"eval_runtime": 72.0796,
"eval_samples_per_second": 27.747,
"eval_steps_per_second": 1.734,
"step": 12800
},
{
"epoch": 1.99,
"learning_rate": 0.00010133298620151001,
"loss": 2.1782,
"step": 12820
},
{
"epoch": 2.0,
"learning_rate": 0.00010102056756053109,
"loss": 2.1358,
"step": 12840
},
{
"epoch": 2.0,
"learning_rate": 0.00010070814891955218,
"loss": 2.122,
"step": 12860
},
{
"epoch": 2.0,
"learning_rate": 0.00010039573027857327,
"loss": 2.1494,
"step": 12880
},
{
"epoch": 2.0,
"learning_rate": 0.00010008331163759436,
"loss": 2.1522,
"step": 12900
},
{
"epoch": 2.01,
"learning_rate": 9.977089299661544e-05,
"loss": 2.1241,
"step": 12920
},
{
"epoch": 2.01,
"learning_rate": 9.945847435563654e-05,
"loss": 2.1456,
"step": 12940
},
{
"epoch": 2.01,
"learning_rate": 9.914605571465763e-05,
"loss": 2.1495,
"step": 12960
},
{
"epoch": 2.02,
"learning_rate": 9.883363707367873e-05,
"loss": 2.1734,
"step": 12980
},
{
"epoch": 2.02,
"learning_rate": 9.852121843269981e-05,
"loss": 2.1711,
"step": 13000
},
{
"epoch": 2.02,
"eval_loss": 2.339312791824341,
"eval_runtime": 69.2994,
"eval_samples_per_second": 28.86,
"eval_steps_per_second": 1.804,
"step": 13000
},
{
"epoch": 2.02,
"learning_rate": 9.820879979172091e-05,
"loss": 2.1483,
"step": 13020
},
{
"epoch": 2.03,
"learning_rate": 9.789638115074199e-05,
"loss": 2.124,
"step": 13040
},
{
"epoch": 2.03,
"learning_rate": 9.758396250976308e-05,
"loss": 2.1337,
"step": 13060
},
{
"epoch": 2.03,
"learning_rate": 9.727154386878417e-05,
"loss": 2.137,
"step": 13080
},
{
"epoch": 2.04,
"learning_rate": 9.695912522780526e-05,
"loss": 2.1225,
"step": 13100
},
{
"epoch": 2.04,
"learning_rate": 9.664670658682634e-05,
"loss": 2.1384,
"step": 13120
},
{
"epoch": 2.04,
"learning_rate": 9.633428794584743e-05,
"loss": 2.1052,
"step": 13140
},
{
"epoch": 2.05,
"learning_rate": 9.602186930486852e-05,
"loss": 2.1489,
"step": 13160
},
{
"epoch": 2.05,
"learning_rate": 9.57094506638896e-05,
"loss": 2.1154,
"step": 13180
},
{
"epoch": 2.05,
"learning_rate": 9.53970320229107e-05,
"loss": 2.1476,
"step": 13200
},
{
"epoch": 2.05,
"eval_loss": 2.3396096229553223,
"eval_runtime": 69.2833,
"eval_samples_per_second": 28.867,
"eval_steps_per_second": 1.804,
"step": 13200
},
{
"epoch": 2.05,
"learning_rate": 9.508461338193178e-05,
"loss": 2.1109,
"step": 13220
},
{
"epoch": 2.06,
"learning_rate": 9.477219474095288e-05,
"loss": 2.0973,
"step": 13240
},
{
"epoch": 2.06,
"learning_rate": 9.445977609997396e-05,
"loss": 2.1281,
"step": 13260
},
{
"epoch": 2.06,
"learning_rate": 9.414735745899505e-05,
"loss": 2.1216,
"step": 13280
},
{
"epoch": 2.07,
"learning_rate": 9.383493881801614e-05,
"loss": 2.1323,
"step": 13300
},
{
"epoch": 2.07,
"learning_rate": 9.352252017703723e-05,
"loss": 2.1477,
"step": 13320
},
{
"epoch": 2.07,
"learning_rate": 9.321010153605831e-05,
"loss": 2.1309,
"step": 13340
},
{
"epoch": 2.08,
"learning_rate": 9.28976828950794e-05,
"loss": 2.0899,
"step": 13360
},
{
"epoch": 2.08,
"learning_rate": 9.258526425410049e-05,
"loss": 2.1402,
"step": 13380
},
{
"epoch": 2.08,
"learning_rate": 9.227284561312157e-05,
"loss": 2.0768,
"step": 13400
},
{
"epoch": 2.08,
"eval_loss": 2.3376858234405518,
"eval_runtime": 69.4568,
"eval_samples_per_second": 28.795,
"eval_steps_per_second": 1.8,
"step": 13400
},
{
"epoch": 2.09,
"learning_rate": 9.196042697214267e-05,
"loss": 2.1405,
"step": 13420
},
{
"epoch": 2.09,
"learning_rate": 9.164800833116375e-05,
"loss": 2.1118,
"step": 13440
},
{
"epoch": 2.09,
"learning_rate": 9.133558969018484e-05,
"loss": 2.1525,
"step": 13460
},
{
"epoch": 2.09,
"learning_rate": 9.102317104920593e-05,
"loss": 2.1369,
"step": 13480
},
{
"epoch": 2.1,
"learning_rate": 9.071075240822702e-05,
"loss": 2.1683,
"step": 13500
},
{
"epoch": 2.1,
"learning_rate": 9.03983337672481e-05,
"loss": 2.1193,
"step": 13520
},
{
"epoch": 2.1,
"learning_rate": 9.00859151262692e-05,
"loss": 2.1222,
"step": 13540
},
{
"epoch": 2.11,
"learning_rate": 8.977349648529028e-05,
"loss": 2.1461,
"step": 13560
},
{
"epoch": 2.11,
"learning_rate": 8.946107784431136e-05,
"loss": 2.1106,
"step": 13580
},
{
"epoch": 2.11,
"learning_rate": 8.914865920333246e-05,
"loss": 2.1307,
"step": 13600
},
{
"epoch": 2.11,
"eval_loss": 2.3381118774414062,
"eval_runtime": 69.5609,
"eval_samples_per_second": 28.752,
"eval_steps_per_second": 1.797,
"step": 13600
},
{
"epoch": 2.12,
"learning_rate": 8.883624056235354e-05,
"loss": 2.1679,
"step": 13620
},
{
"epoch": 2.12,
"learning_rate": 8.852382192137464e-05,
"loss": 2.1418,
"step": 13640
},
{
"epoch": 2.12,
"learning_rate": 8.821140328039572e-05,
"loss": 2.1238,
"step": 13660
},
{
"epoch": 2.13,
"learning_rate": 8.789898463941681e-05,
"loss": 2.0995,
"step": 13680
},
{
"epoch": 2.13,
"learning_rate": 8.75865659984379e-05,
"loss": 2.1596,
"step": 13700
},
{
"epoch": 2.13,
"learning_rate": 8.727414735745899e-05,
"loss": 2.1478,
"step": 13720
},
{
"epoch": 2.14,
"learning_rate": 8.696172871648007e-05,
"loss": 2.1299,
"step": 13740
},
{
"epoch": 2.14,
"learning_rate": 8.664931007550115e-05,
"loss": 2.1405,
"step": 13760
},
{
"epoch": 2.14,
"learning_rate": 8.633689143452225e-05,
"loss": 2.174,
"step": 13780
},
{
"epoch": 2.14,
"learning_rate": 8.602447279354333e-05,
"loss": 2.129,
"step": 13800
},
{
"epoch": 2.14,
"eval_loss": 2.337769031524658,
"eval_runtime": 69.7472,
"eval_samples_per_second": 28.675,
"eval_steps_per_second": 1.792,
"step": 13800
},
{
"epoch": 2.15,
"learning_rate": 8.571205415256443e-05,
"loss": 2.1368,
"step": 13820
},
{
"epoch": 2.15,
"learning_rate": 8.539963551158551e-05,
"loss": 2.1573,
"step": 13840
},
{
"epoch": 2.15,
"learning_rate": 8.50872168706066e-05,
"loss": 2.1132,
"step": 13860
},
{
"epoch": 2.16,
"learning_rate": 8.477479822962769e-05,
"loss": 2.1131,
"step": 13880
},
{
"epoch": 2.16,
"learning_rate": 8.446237958864878e-05,
"loss": 2.1351,
"step": 13900
},
{
"epoch": 2.16,
"learning_rate": 8.414996094766986e-05,
"loss": 2.1738,
"step": 13920
},
{
"epoch": 2.17,
"learning_rate": 8.383754230669096e-05,
"loss": 2.1551,
"step": 13940
},
{
"epoch": 2.17,
"learning_rate": 8.352512366571204e-05,
"loss": 2.1195,
"step": 13960
},
{
"epoch": 2.17,
"learning_rate": 8.321270502473312e-05,
"loss": 2.1125,
"step": 13980
},
{
"epoch": 2.18,
"learning_rate": 8.290028638375422e-05,
"loss": 2.1549,
"step": 14000
},
{
"epoch": 2.18,
"eval_loss": 2.337301731109619,
"eval_runtime": 69.7462,
"eval_samples_per_second": 28.675,
"eval_steps_per_second": 1.792,
"step": 14000
},
{
"epoch": 2.18,
"learning_rate": 8.25878677427753e-05,
"loss": 2.1573,
"step": 14020
},
{
"epoch": 2.18,
"learning_rate": 8.22754491017964e-05,
"loss": 2.1125,
"step": 14040
},
{
"epoch": 2.18,
"learning_rate": 8.196303046081748e-05,
"loss": 2.161,
"step": 14060
},
{
"epoch": 2.19,
"learning_rate": 8.165061181983857e-05,
"loss": 2.1511,
"step": 14080
},
{
"epoch": 2.19,
"learning_rate": 8.133819317885967e-05,
"loss": 2.1737,
"step": 14100
},
{
"epoch": 2.19,
"learning_rate": 8.102577453788076e-05,
"loss": 2.1158,
"step": 14120
},
{
"epoch": 2.2,
"learning_rate": 8.071335589690184e-05,
"loss": 2.1398,
"step": 14140
},
{
"epoch": 2.2,
"learning_rate": 8.040093725592294e-05,
"loss": 2.1183,
"step": 14160
},
{
"epoch": 2.2,
"learning_rate": 8.008851861494402e-05,
"loss": 2.1295,
"step": 14180
},
{
"epoch": 2.21,
"learning_rate": 7.977609997396512e-05,
"loss": 2.1416,
"step": 14200
},
{
"epoch": 2.21,
"eval_loss": 2.336796760559082,
"eval_runtime": 69.3578,
"eval_samples_per_second": 28.836,
"eval_steps_per_second": 1.802,
"step": 14200
},
{
"epoch": 2.21,
"learning_rate": 7.94636813329862e-05,
"loss": 2.1461,
"step": 14220
},
{
"epoch": 2.21,
"learning_rate": 7.91512626920073e-05,
"loss": 2.0931,
"step": 14240
},
{
"epoch": 2.22,
"learning_rate": 7.883884405102838e-05,
"loss": 2.1341,
"step": 14260
},
{
"epoch": 2.22,
"learning_rate": 7.852642541004946e-05,
"loss": 2.1369,
"step": 14280
},
{
"epoch": 2.22,
"learning_rate": 7.821400676907055e-05,
"loss": 2.1431,
"step": 14300
},
{
"epoch": 2.23,
"learning_rate": 7.790158812809164e-05,
"loss": 2.1508,
"step": 14320
},
{
"epoch": 2.23,
"learning_rate": 7.758916948711273e-05,
"loss": 2.1456,
"step": 14340
},
{
"epoch": 2.23,
"learning_rate": 7.727675084613381e-05,
"loss": 2.1448,
"step": 14360
},
{
"epoch": 2.23,
"learning_rate": 7.696433220515491e-05,
"loss": 2.1637,
"step": 14380
},
{
"epoch": 2.24,
"learning_rate": 7.665191356417599e-05,
"loss": 2.114,
"step": 14400
},
{
"epoch": 2.24,
"eval_loss": 2.3362655639648438,
"eval_runtime": 69.5792,
"eval_samples_per_second": 28.744,
"eval_steps_per_second": 1.797,
"step": 14400
},
{
"epoch": 2.24,
"learning_rate": 7.633949492319709e-05,
"loss": 2.1222,
"step": 14420
},
{
"epoch": 2.24,
"learning_rate": 7.602707628221817e-05,
"loss": 2.1776,
"step": 14440
},
{
"epoch": 2.25,
"learning_rate": 7.57302785732882e-05,
"loss": 2.1414,
"step": 14460
},
{
"epoch": 2.25,
"learning_rate": 7.541785993230929e-05,
"loss": 2.1231,
"step": 14480
},
{
"epoch": 2.25,
"learning_rate": 7.510544129133038e-05,
"loss": 2.1345,
"step": 14500
},
{
"epoch": 2.26,
"learning_rate": 7.479302265035147e-05,
"loss": 2.1339,
"step": 14520
},
{
"epoch": 2.26,
"learning_rate": 7.448060400937255e-05,
"loss": 2.1562,
"step": 14540
},
{
"epoch": 2.26,
"learning_rate": 7.416818536839363e-05,
"loss": 2.1649,
"step": 14560
},
{
"epoch": 2.27,
"learning_rate": 7.385576672741473e-05,
"loss": 2.1339,
"step": 14580
},
{
"epoch": 2.27,
"learning_rate": 7.354334808643581e-05,
"loss": 2.1347,
"step": 14600
},
{
"epoch": 2.27,
"eval_loss": 2.335818290710449,
"eval_runtime": 69.5131,
"eval_samples_per_second": 28.772,
"eval_steps_per_second": 1.798,
"step": 14600
},
{
"epoch": 2.27,
"learning_rate": 7.323092944545691e-05,
"loss": 2.1078,
"step": 14620
},
{
"epoch": 2.28,
"learning_rate": 7.291851080447799e-05,
"loss": 2.1446,
"step": 14640
},
{
"epoch": 2.28,
"learning_rate": 7.260609216349908e-05,
"loss": 2.1076,
"step": 14660
},
{
"epoch": 2.28,
"learning_rate": 7.229367352252017e-05,
"loss": 2.1548,
"step": 14680
},
{
"epoch": 2.28,
"learning_rate": 7.198125488154126e-05,
"loss": 2.1317,
"step": 14700
},
{
"epoch": 2.29,
"learning_rate": 7.166883624056234e-05,
"loss": 2.0991,
"step": 14720
},
{
"epoch": 2.29,
"learning_rate": 7.135641759958343e-05,
"loss": 2.1507,
"step": 14740
},
{
"epoch": 2.29,
"learning_rate": 7.104399895860452e-05,
"loss": 2.1173,
"step": 14760
},
{
"epoch": 2.3,
"learning_rate": 7.073158031762562e-05,
"loss": 2.104,
"step": 14780
},
{
"epoch": 2.3,
"learning_rate": 7.043478260869565e-05,
"loss": 2.1118,
"step": 14800
},
{
"epoch": 2.3,
"eval_loss": 2.334048271179199,
"eval_runtime": 69.3816,
"eval_samples_per_second": 28.826,
"eval_steps_per_second": 1.802,
"step": 14800
},
{
"epoch": 2.3,
"learning_rate": 7.012236396771674e-05,
"loss": 2.0738,
"step": 14820
},
{
"epoch": 2.31,
"learning_rate": 6.980994532673782e-05,
"loss": 2.1221,
"step": 14840
},
{
"epoch": 2.31,
"learning_rate": 6.94975266857589e-05,
"loss": 2.1531,
"step": 14860
},
{
"epoch": 2.31,
"learning_rate": 6.918510804478e-05,
"loss": 2.1318,
"step": 14880
},
{
"epoch": 2.32,
"learning_rate": 6.887268940380108e-05,
"loss": 2.1251,
"step": 14900
},
{
"epoch": 2.32,
"learning_rate": 6.856027076282218e-05,
"loss": 2.1212,
"step": 14920
},
{
"epoch": 2.32,
"learning_rate": 6.824785212184326e-05,
"loss": 2.0927,
"step": 14940
},
{
"epoch": 2.32,
"learning_rate": 6.793543348086436e-05,
"loss": 2.1277,
"step": 14960
},
{
"epoch": 2.33,
"learning_rate": 6.762301483988544e-05,
"loss": 2.156,
"step": 14980
},
{
"epoch": 2.33,
"learning_rate": 6.731059619890653e-05,
"loss": 2.1276,
"step": 15000
},
{
"epoch": 2.33,
"eval_loss": 2.3340351581573486,
"eval_runtime": 69.2926,
"eval_samples_per_second": 28.863,
"eval_steps_per_second": 1.804,
"step": 15000
},
{
"epoch": 2.33,
"learning_rate": 6.699817755792761e-05,
"loss": 2.1313,
"step": 15020
},
{
"epoch": 2.34,
"learning_rate": 6.668575891694871e-05,
"loss": 2.1452,
"step": 15040
},
{
"epoch": 2.34,
"learning_rate": 6.637334027596979e-05,
"loss": 2.1148,
"step": 15060
},
{
"epoch": 2.34,
"learning_rate": 6.606092163499087e-05,
"loss": 2.1193,
"step": 15080
},
{
"epoch": 2.35,
"learning_rate": 6.574850299401197e-05,
"loss": 2.1672,
"step": 15100
},
{
"epoch": 2.35,
"learning_rate": 6.543608435303305e-05,
"loss": 2.0789,
"step": 15120
},
{
"epoch": 2.35,
"learning_rate": 6.512366571205415e-05,
"loss": 2.1438,
"step": 15140
},
{
"epoch": 2.36,
"learning_rate": 6.481124707107523e-05,
"loss": 2.1597,
"step": 15160
},
{
"epoch": 2.36,
"learning_rate": 6.449882843009632e-05,
"loss": 2.11,
"step": 15180
},
{
"epoch": 2.36,
"learning_rate": 6.418640978911742e-05,
"loss": 2.1279,
"step": 15200
},
{
"epoch": 2.36,
"eval_loss": 2.3344008922576904,
"eval_runtime": 69.3363,
"eval_samples_per_second": 28.845,
"eval_steps_per_second": 1.803,
"step": 15200
},
{
"epoch": 2.37,
"learning_rate": 6.38739911481385e-05,
"loss": 2.1459,
"step": 15220
},
{
"epoch": 2.37,
"learning_rate": 6.35615725071596e-05,
"loss": 2.1702,
"step": 15240
},
{
"epoch": 2.37,
"learning_rate": 6.324915386618068e-05,
"loss": 2.1262,
"step": 15260
},
{
"epoch": 2.37,
"learning_rate": 6.293673522520177e-05,
"loss": 2.0988,
"step": 15280
},
{
"epoch": 2.38,
"learning_rate": 6.262431658422286e-05,
"loss": 2.1224,
"step": 15300
},
{
"epoch": 2.38,
"learning_rate": 6.231189794324394e-05,
"loss": 2.1102,
"step": 15320
},
{
"epoch": 2.38,
"learning_rate": 6.199947930226503e-05,
"loss": 2.1168,
"step": 15340
},
{
"epoch": 2.39,
"learning_rate": 6.168706066128611e-05,
"loss": 2.1205,
"step": 15360
},
{
"epoch": 2.39,
"learning_rate": 6.137464202030721e-05,
"loss": 2.0855,
"step": 15380
},
{
"epoch": 2.39,
"learning_rate": 6.106222337932829e-05,
"loss": 2.1548,
"step": 15400
},
{
"epoch": 2.39,
"eval_loss": 2.333451271057129,
"eval_runtime": 69.3334,
"eval_samples_per_second": 28.846,
"eval_steps_per_second": 1.803,
"step": 15400
},
{
"epoch": 2.4,
"learning_rate": 6.074980473834938e-05,
"loss": 2.1433,
"step": 15420
},
{
"epoch": 2.4,
"learning_rate": 6.043738609737047e-05,
"loss": 2.123,
"step": 15440
},
{
"epoch": 2.4,
"learning_rate": 6.012496745639156e-05,
"loss": 2.0965,
"step": 15460
},
{
"epoch": 2.41,
"learning_rate": 5.9812548815412647e-05,
"loss": 2.1498,
"step": 15480
},
{
"epoch": 2.41,
"learning_rate": 5.9500130174433735e-05,
"loss": 2.1456,
"step": 15500
},
{
"epoch": 2.41,
"learning_rate": 5.9187711533454824e-05,
"loss": 2.1295,
"step": 15520
},
{
"epoch": 2.41,
"learning_rate": 5.887529289247591e-05,
"loss": 2.108,
"step": 15540
},
{
"epoch": 2.42,
"learning_rate": 5.8562874251497e-05,
"loss": 2.1592,
"step": 15560
},
{
"epoch": 2.42,
"learning_rate": 5.825045561051809e-05,
"loss": 2.1214,
"step": 15580
},
{
"epoch": 2.42,
"learning_rate": 5.793803696953918e-05,
"loss": 2.1561,
"step": 15600
},
{
"epoch": 2.42,
"eval_loss": 2.3329403400421143,
"eval_runtime": 69.6034,
"eval_samples_per_second": 28.734,
"eval_steps_per_second": 1.796,
"step": 15600
},
{
"epoch": 2.43,
"learning_rate": 5.762561832856026e-05,
"loss": 2.1382,
"step": 15620
},
{
"epoch": 2.43,
"learning_rate": 5.731319968758135e-05,
"loss": 2.109,
"step": 15640
},
{
"epoch": 2.43,
"learning_rate": 5.700078104660244e-05,
"loss": 2.1283,
"step": 15660
},
{
"epoch": 2.44,
"learning_rate": 5.6688362405623526e-05,
"loss": 2.15,
"step": 15680
},
{
"epoch": 2.44,
"learning_rate": 5.6375943764644615e-05,
"loss": 2.1125,
"step": 15700
},
{
"epoch": 2.44,
"learning_rate": 5.6063525123665704e-05,
"loss": 2.1709,
"step": 15720
},
{
"epoch": 2.45,
"learning_rate": 5.575110648268679e-05,
"loss": 2.1622,
"step": 15740
},
{
"epoch": 2.45,
"learning_rate": 5.543868784170789e-05,
"loss": 2.0769,
"step": 15760
},
{
"epoch": 2.45,
"learning_rate": 5.5126269200728976e-05,
"loss": 2.137,
"step": 15780
},
{
"epoch": 2.46,
"learning_rate": 5.4813850559750065e-05,
"loss": 2.1294,
"step": 15800
},
{
"epoch": 2.46,
"eval_loss": 2.3324475288391113,
"eval_runtime": 69.559,
"eval_samples_per_second": 28.753,
"eval_steps_per_second": 1.797,
"step": 15800
},
{
"epoch": 2.46,
"learning_rate": 5.4501431918771154e-05,
"loss": 2.1425,
"step": 15820
},
{
"epoch": 2.46,
"learning_rate": 5.418901327779224e-05,
"loss": 2.128,
"step": 15840
},
{
"epoch": 2.46,
"learning_rate": 5.387659463681333e-05,
"loss": 2.1553,
"step": 15860
},
{
"epoch": 2.47,
"learning_rate": 5.356417599583441e-05,
"loss": 2.1339,
"step": 15880
},
{
"epoch": 2.47,
"learning_rate": 5.32517573548555e-05,
"loss": 2.1536,
"step": 15900
},
{
"epoch": 2.47,
"learning_rate": 5.293933871387659e-05,
"loss": 2.1669,
"step": 15920
},
{
"epoch": 2.48,
"learning_rate": 5.262692007289768e-05,
"loss": 2.122,
"step": 15940
},
{
"epoch": 2.48,
"learning_rate": 5.231450143191877e-05,
"loss": 2.1435,
"step": 15960
},
{
"epoch": 2.48,
"learning_rate": 5.2002082790939856e-05,
"loss": 2.1406,
"step": 15980
},
{
"epoch": 2.49,
"learning_rate": 5.1689664149960945e-05,
"loss": 2.1174,
"step": 16000
},
{
"epoch": 2.49,
"eval_loss": 2.332836866378784,
"eval_runtime": 69.3739,
"eval_samples_per_second": 28.829,
"eval_steps_per_second": 1.802,
"step": 16000
},
{
"epoch": 2.49,
"learning_rate": 5.137724550898203e-05,
"loss": 2.1286,
"step": 16020
},
{
"epoch": 2.49,
"learning_rate": 5.106482686800312e-05,
"loss": 2.1343,
"step": 16040
},
{
"epoch": 2.5,
"learning_rate": 5.075240822702421e-05,
"loss": 2.1134,
"step": 16060
},
{
"epoch": 2.5,
"learning_rate": 5.043998958604529e-05,
"loss": 2.1633,
"step": 16080
},
{
"epoch": 2.5,
"learning_rate": 5.012757094506638e-05,
"loss": 2.1473,
"step": 16100
},
{
"epoch": 2.5,
"learning_rate": 4.981515230408747e-05,
"loss": 2.1535,
"step": 16120
},
{
"epoch": 2.51,
"learning_rate": 4.950273366310856e-05,
"loss": 2.112,
"step": 16140
},
{
"epoch": 2.51,
"learning_rate": 4.919031502212965e-05,
"loss": 2.1399,
"step": 16160
},
{
"epoch": 2.51,
"learning_rate": 4.8877896381150736e-05,
"loss": 2.0913,
"step": 16180
},
{
"epoch": 2.52,
"learning_rate": 4.8565477740171824e-05,
"loss": 2.1179,
"step": 16200
},
{
"epoch": 2.52,
"eval_loss": 2.332409143447876,
"eval_runtime": 69.3294,
"eval_samples_per_second": 28.848,
"eval_steps_per_second": 1.803,
"step": 16200
},
{
"epoch": 2.52,
"learning_rate": 4.825305909919291e-05,
"loss": 2.1756,
"step": 16220
},
{
"epoch": 2.52,
"learning_rate": 4.7940640458214e-05,
"loss": 2.1466,
"step": 16240
},
{
"epoch": 2.53,
"learning_rate": 4.762822181723509e-05,
"loss": 2.1443,
"step": 16260
},
{
"epoch": 2.53,
"learning_rate": 4.731580317625618e-05,
"loss": 2.1207,
"step": 16280
},
{
"epoch": 2.53,
"learning_rate": 4.700338453527726e-05,
"loss": 2.1275,
"step": 16300
},
{
"epoch": 2.54,
"learning_rate": 4.669096589429835e-05,
"loss": 2.1305,
"step": 16320
},
{
"epoch": 2.54,
"learning_rate": 4.6378547253319445e-05,
"loss": 2.134,
"step": 16340
},
{
"epoch": 2.54,
"learning_rate": 4.6066128612340534e-05,
"loss": 2.1681,
"step": 16360
},
{
"epoch": 2.55,
"learning_rate": 4.575370997136162e-05,
"loss": 2.1627,
"step": 16380
},
{
"epoch": 2.55,
"learning_rate": 4.544129133038271e-05,
"loss": 2.1421,
"step": 16400
},
{
"epoch": 2.55,
"eval_loss": 2.3318614959716797,
"eval_runtime": 69.3251,
"eval_samples_per_second": 28.85,
"eval_steps_per_second": 1.803,
"step": 16400
},
{
"epoch": 2.55,
"learning_rate": 4.51288726894038e-05,
"loss": 2.1225,
"step": 16420
},
{
"epoch": 2.55,
"learning_rate": 4.481645404842489e-05,
"loss": 2.156,
"step": 16440
},
{
"epoch": 2.56,
"learning_rate": 4.450403540744598e-05,
"loss": 2.1573,
"step": 16460
},
{
"epoch": 2.56,
"learning_rate": 4.4191616766467066e-05,
"loss": 2.1295,
"step": 16480
},
{
"epoch": 2.56,
"learning_rate": 4.3879198125488154e-05,
"loss": 2.14,
"step": 16500
},
{
"epoch": 2.57,
"learning_rate": 4.356677948450924e-05,
"loss": 2.1046,
"step": 16520
},
{
"epoch": 2.57,
"learning_rate": 4.3254360843530325e-05,
"loss": 2.1201,
"step": 16540
},
{
"epoch": 2.57,
"learning_rate": 4.2941942202551413e-05,
"loss": 2.1767,
"step": 16560
},
{
"epoch": 2.58,
"learning_rate": 4.26295235615725e-05,
"loss": 2.1244,
"step": 16580
},
{
"epoch": 2.58,
"learning_rate": 4.231710492059359e-05,
"loss": 2.1301,
"step": 16600
},
{
"epoch": 2.58,
"eval_loss": 2.331899881362915,
"eval_runtime": 69.3398,
"eval_samples_per_second": 28.843,
"eval_steps_per_second": 1.803,
"step": 16600
},
{
"epoch": 2.58,
"learning_rate": 4.200468627961468e-05,
"loss": 2.1022,
"step": 16620
},
{
"epoch": 2.59,
"learning_rate": 4.169226763863577e-05,
"loss": 2.1121,
"step": 16640
},
{
"epoch": 2.59,
"learning_rate": 4.137984899765686e-05,
"loss": 2.1014,
"step": 16660
},
{
"epoch": 2.59,
"learning_rate": 4.1067430356677945e-05,
"loss": 2.1867,
"step": 16680
},
{
"epoch": 2.6,
"learning_rate": 4.0755011715699034e-05,
"loss": 2.1055,
"step": 16700
},
{
"epoch": 2.6,
"learning_rate": 4.044259307472012e-05,
"loss": 2.1435,
"step": 16720
},
{
"epoch": 2.6,
"learning_rate": 4.013017443374121e-05,
"loss": 2.09,
"step": 16740
},
{
"epoch": 2.6,
"learning_rate": 3.981775579276229e-05,
"loss": 2.1317,
"step": 16760
},
{
"epoch": 2.61,
"learning_rate": 3.950533715178338e-05,
"loss": 2.0683,
"step": 16780
},
{
"epoch": 2.61,
"learning_rate": 3.919291851080447e-05,
"loss": 2.1249,
"step": 16800
},
{
"epoch": 2.61,
"eval_loss": 2.331566572189331,
"eval_runtime": 69.3154,
"eval_samples_per_second": 28.854,
"eval_steps_per_second": 1.803,
"step": 16800
},
{
"epoch": 2.61,
"learning_rate": 3.888049986982556e-05,
"loss": 2.164,
"step": 16820
},
{
"epoch": 2.62,
"learning_rate": 3.856808122884665e-05,
"loss": 2.16,
"step": 16840
},
{
"epoch": 2.62,
"learning_rate": 3.8255662587867736e-05,
"loss": 2.1603,
"step": 16860
},
{
"epoch": 2.62,
"learning_rate": 3.7943243946888825e-05,
"loss": 2.1346,
"step": 16880
},
{
"epoch": 2.63,
"learning_rate": 3.7630825305909914e-05,
"loss": 2.1082,
"step": 16900
},
{
"epoch": 2.63,
"learning_rate": 3.7318406664931e-05,
"loss": 2.1014,
"step": 16920
},
{
"epoch": 2.63,
"learning_rate": 3.700598802395209e-05,
"loss": 2.1088,
"step": 16940
},
{
"epoch": 2.64,
"learning_rate": 3.669356938297318e-05,
"loss": 2.0975,
"step": 16960
},
{
"epoch": 2.64,
"learning_rate": 3.638115074199427e-05,
"loss": 2.1212,
"step": 16980
},
{
"epoch": 2.64,
"learning_rate": 3.606873210101536e-05,
"loss": 2.1226,
"step": 17000
},
{
"epoch": 2.64,
"eval_loss": 2.3310983180999756,
"eval_runtime": 69.3945,
"eval_samples_per_second": 28.821,
"eval_steps_per_second": 1.801,
"step": 17000
},
{
"epoch": 2.64,
"learning_rate": 3.5756313460036446e-05,
"loss": 2.1318,
"step": 17020
},
{
"epoch": 2.65,
"learning_rate": 3.5443894819057534e-05,
"loss": 2.1073,
"step": 17040
},
{
"epoch": 2.65,
"learning_rate": 3.513147617807862e-05,
"loss": 2.1411,
"step": 17060
},
{
"epoch": 2.65,
"learning_rate": 3.481905753709971e-05,
"loss": 2.0959,
"step": 17080
},
{
"epoch": 2.66,
"learning_rate": 3.45066388961208e-05,
"loss": 2.0858,
"step": 17100
},
{
"epoch": 2.66,
"learning_rate": 3.419422025514189e-05,
"loss": 2.1174,
"step": 17120
},
{
"epoch": 2.66,
"learning_rate": 3.388180161416298e-05,
"loss": 2.1459,
"step": 17140
},
{
"epoch": 2.67,
"learning_rate": 3.3569382973184066e-05,
"loss": 2.1425,
"step": 17160
},
{
"epoch": 2.67,
"learning_rate": 3.3256964332205155e-05,
"loss": 2.0971,
"step": 17180
},
{
"epoch": 2.67,
"learning_rate": 3.2944545691226243e-05,
"loss": 2.1176,
"step": 17200
},
{
"epoch": 2.67,
"eval_loss": 2.330962896347046,
"eval_runtime": 69.3407,
"eval_samples_per_second": 28.843,
"eval_steps_per_second": 1.803,
"step": 17200
},
{
"epoch": 2.68,
"learning_rate": 3.2632127050247325e-05,
"loss": 2.1471,
"step": 17220
},
{
"epoch": 2.68,
"learning_rate": 3.2319708409268414e-05,
"loss": 2.1064,
"step": 17240
},
{
"epoch": 2.68,
"learning_rate": 3.20072897682895e-05,
"loss": 2.1347,
"step": 17260
},
{
"epoch": 2.69,
"learning_rate": 3.169487112731059e-05,
"loss": 2.142,
"step": 17280
},
{
"epoch": 2.69,
"learning_rate": 3.138245248633168e-05,
"loss": 2.1773,
"step": 17300
},
{
"epoch": 2.69,
"learning_rate": 3.107003384535277e-05,
"loss": 2.1489,
"step": 17320
},
{
"epoch": 2.69,
"learning_rate": 3.075761520437386e-05,
"loss": 2.1257,
"step": 17340
},
{
"epoch": 2.7,
"learning_rate": 3.044519656339495e-05,
"loss": 2.1288,
"step": 17360
},
{
"epoch": 2.7,
"learning_rate": 3.0132777922416038e-05,
"loss": 2.1258,
"step": 17380
},
{
"epoch": 2.7,
"learning_rate": 2.9820359281437123e-05,
"loss": 2.1322,
"step": 17400
},
{
"epoch": 2.7,
"eval_loss": 2.3309593200683594,
"eval_runtime": 69.3923,
"eval_samples_per_second": 28.822,
"eval_steps_per_second": 1.801,
"step": 17400
},
{
"epoch": 2.71,
"learning_rate": 2.9507940640458212e-05,
"loss": 2.1495,
"step": 17420
},
{
"epoch": 2.71,
"learning_rate": 2.91955219994793e-05,
"loss": 2.0843,
"step": 17440
},
{
"epoch": 2.71,
"learning_rate": 2.888310335850039e-05,
"loss": 2.11,
"step": 17460
},
{
"epoch": 2.72,
"learning_rate": 2.8570684717521478e-05,
"loss": 2.1005,
"step": 17480
},
{
"epoch": 2.72,
"learning_rate": 2.827388700859151e-05,
"loss": 2.1302,
"step": 17500
},
{
"epoch": 2.72,
"learning_rate": 2.79614683676126e-05,
"loss": 2.1086,
"step": 17520
},
{
"epoch": 2.73,
"learning_rate": 2.7649049726633688e-05,
"loss": 2.1302,
"step": 17540
},
{
"epoch": 2.73,
"learning_rate": 2.7336631085654777e-05,
"loss": 2.1417,
"step": 17560
},
{
"epoch": 2.73,
"learning_rate": 2.7024212444675862e-05,
"loss": 2.1369,
"step": 17580
},
{
"epoch": 2.73,
"learning_rate": 2.671179380369695e-05,
"loss": 2.1384,
"step": 17600
},
{
"epoch": 2.73,
"eval_loss": 2.33089017868042,
"eval_runtime": 69.3747,
"eval_samples_per_second": 28.829,
"eval_steps_per_second": 1.802,
"step": 17600
},
{
"epoch": 2.74,
"learning_rate": 2.639937516271804e-05,
"loss": 2.1243,
"step": 17620
},
{
"epoch": 2.74,
"learning_rate": 2.6086956521739128e-05,
"loss": 2.1161,
"step": 17640
},
{
"epoch": 2.74,
"learning_rate": 2.5774537880760217e-05,
"loss": 2.1051,
"step": 17660
},
{
"epoch": 2.75,
"learning_rate": 2.5462119239781302e-05,
"loss": 2.0762,
"step": 17680
},
{
"epoch": 2.75,
"learning_rate": 2.514970059880239e-05,
"loss": 2.1105,
"step": 17700
},
{
"epoch": 2.75,
"learning_rate": 2.483728195782348e-05,
"loss": 2.1535,
"step": 17720
},
{
"epoch": 2.76,
"learning_rate": 2.452486331684457e-05,
"loss": 2.1706,
"step": 17740
},
{
"epoch": 2.76,
"learning_rate": 2.421244467586566e-05,
"loss": 2.0857,
"step": 17760
},
{
"epoch": 2.76,
"learning_rate": 2.390002603488675e-05,
"loss": 2.1553,
"step": 17780
},
{
"epoch": 2.77,
"learning_rate": 2.3587607393907834e-05,
"loss": 2.0983,
"step": 17800
},
{
"epoch": 2.77,
"eval_loss": 2.3304569721221924,
"eval_runtime": 69.35,
"eval_samples_per_second": 28.839,
"eval_steps_per_second": 1.802,
"step": 17800
},
{
"epoch": 2.77,
"learning_rate": 2.3275188752928923e-05,
"loss": 2.1212,
"step": 17820
},
{
"epoch": 2.77,
"learning_rate": 2.296277011195001e-05,
"loss": 2.0816,
"step": 17840
},
{
"epoch": 2.78,
"learning_rate": 2.26503514709711e-05,
"loss": 2.0935,
"step": 17860
},
{
"epoch": 2.78,
"learning_rate": 2.233793282999219e-05,
"loss": 2.1576,
"step": 17880
},
{
"epoch": 2.78,
"learning_rate": 2.2025514189013274e-05,
"loss": 2.1076,
"step": 17900
},
{
"epoch": 2.78,
"learning_rate": 2.1713095548034362e-05,
"loss": 2.1184,
"step": 17920
},
{
"epoch": 2.79,
"learning_rate": 2.140067690705545e-05,
"loss": 2.1169,
"step": 17940
},
{
"epoch": 2.79,
"learning_rate": 2.108825826607654e-05,
"loss": 2.1442,
"step": 17960
},
{
"epoch": 2.79,
"learning_rate": 2.077583962509763e-05,
"loss": 2.1332,
"step": 17980
},
{
"epoch": 2.8,
"learning_rate": 2.0463420984118717e-05,
"loss": 2.1553,
"step": 18000
},
{
"epoch": 2.8,
"eval_loss": 2.330599069595337,
"eval_runtime": 69.346,
"eval_samples_per_second": 28.841,
"eval_steps_per_second": 1.803,
"step": 18000
},
{
"epoch": 2.8,
"learning_rate": 2.0151002343139802e-05,
"loss": 2.1055,
"step": 18020
},
{
"epoch": 2.8,
"learning_rate": 1.9838583702160894e-05,
"loss": 2.0778,
"step": 18040
},
{
"epoch": 2.81,
"learning_rate": 1.9526165061181983e-05,
"loss": 2.143,
"step": 18060
},
{
"epoch": 2.81,
"learning_rate": 1.921374642020307e-05,
"loss": 2.0886,
"step": 18080
},
{
"epoch": 2.81,
"learning_rate": 1.890132777922416e-05,
"loss": 2.1236,
"step": 18100
},
{
"epoch": 2.82,
"learning_rate": 1.858890913824525e-05,
"loss": 2.1307,
"step": 18120
},
{
"epoch": 2.82,
"learning_rate": 1.8276490497266334e-05,
"loss": 2.1192,
"step": 18140
},
{
"epoch": 2.82,
"learning_rate": 1.7964071856287423e-05,
"loss": 2.0999,
"step": 18160
},
{
"epoch": 2.83,
"learning_rate": 1.765165321530851e-05,
"loss": 2.0792,
"step": 18180
},
{
"epoch": 2.83,
"learning_rate": 1.73392345743296e-05,
"loss": 2.1015,
"step": 18200
},
{
"epoch": 2.83,
"eval_loss": 2.330050230026245,
"eval_runtime": 69.3278,
"eval_samples_per_second": 28.848,
"eval_steps_per_second": 1.803,
"step": 18200
},
{
"epoch": 2.83,
"learning_rate": 1.702681593335069e-05,
"loss": 2.1226,
"step": 18220
},
{
"epoch": 2.83,
"learning_rate": 1.6714397292371778e-05,
"loss": 2.0924,
"step": 18240
},
{
"epoch": 2.84,
"learning_rate": 1.6401978651392866e-05,
"loss": 2.1272,
"step": 18260
},
{
"epoch": 2.84,
"learning_rate": 1.6089560010413955e-05,
"loss": 2.1175,
"step": 18280
},
{
"epoch": 2.84,
"learning_rate": 1.577714136943504e-05,
"loss": 2.1396,
"step": 18300
},
{
"epoch": 2.85,
"learning_rate": 1.546472272845613e-05,
"loss": 2.1514,
"step": 18320
},
{
"epoch": 2.85,
"learning_rate": 1.5152304087477217e-05,
"loss": 2.1257,
"step": 18340
},
{
"epoch": 2.85,
"learning_rate": 1.4839885446498306e-05,
"loss": 2.1459,
"step": 18360
},
{
"epoch": 2.86,
"learning_rate": 1.4527466805519396e-05,
"loss": 2.09,
"step": 18380
},
{
"epoch": 2.86,
"learning_rate": 1.4215048164540483e-05,
"loss": 2.1442,
"step": 18400
},
{
"epoch": 2.86,
"eval_loss": 2.330048084259033,
"eval_runtime": 69.2975,
"eval_samples_per_second": 28.861,
"eval_steps_per_second": 1.804,
"step": 18400
},
{
"epoch": 2.86,
"learning_rate": 1.3902629523561572e-05,
"loss": 2.1816,
"step": 18420
},
{
"epoch": 2.87,
"learning_rate": 1.3590210882582659e-05,
"loss": 2.0965,
"step": 18440
},
{
"epoch": 2.87,
"learning_rate": 1.3277792241603748e-05,
"loss": 2.1178,
"step": 18460
},
{
"epoch": 2.87,
"learning_rate": 1.2965373600624836e-05,
"loss": 2.1562,
"step": 18480
},
{
"epoch": 2.87,
"learning_rate": 1.2652954959645923e-05,
"loss": 2.095,
"step": 18500
},
{
"epoch": 2.88,
"learning_rate": 1.2340536318667012e-05,
"loss": 2.1522,
"step": 18520
},
{
"epoch": 2.88,
"learning_rate": 1.2028117677688102e-05,
"loss": 2.1729,
"step": 18540
},
{
"epoch": 2.88,
"learning_rate": 1.1715699036709189e-05,
"loss": 2.141,
"step": 18560
},
{
"epoch": 2.89,
"learning_rate": 1.1403280395730278e-05,
"loss": 2.148,
"step": 18580
},
{
"epoch": 2.89,
"learning_rate": 1.1090861754751366e-05,
"loss": 2.1619,
"step": 18600
},
{
"epoch": 2.89,
"eval_loss": 2.329728603363037,
"eval_runtime": 69.3412,
"eval_samples_per_second": 28.843,
"eval_steps_per_second": 1.803,
"step": 18600
},
{
"epoch": 2.89,
"learning_rate": 1.0778443113772453e-05,
"loss": 2.1199,
"step": 18620
},
{
"epoch": 2.9,
"learning_rate": 1.0466024472793542e-05,
"loss": 2.131,
"step": 18640
},
{
"epoch": 2.9,
"learning_rate": 1.0153605831814629e-05,
"loss": 2.1512,
"step": 18660
},
{
"epoch": 2.9,
"learning_rate": 9.84118719083572e-06,
"loss": 2.1292,
"step": 18680
},
{
"epoch": 2.91,
"learning_rate": 9.528768549856808e-06,
"loss": 2.0928,
"step": 18700
},
{
"epoch": 2.91,
"learning_rate": 9.216349908877897e-06,
"loss": 2.1168,
"step": 18720
},
{
"epoch": 2.91,
"learning_rate": 8.903931267898984e-06,
"loss": 2.1316,
"step": 18740
},
{
"epoch": 2.92,
"learning_rate": 8.591512626920072e-06,
"loss": 2.1198,
"step": 18760
},
{
"epoch": 2.92,
"learning_rate": 8.279093985941161e-06,
"loss": 2.1226,
"step": 18780
},
{
"epoch": 2.92,
"learning_rate": 7.96667534496225e-06,
"loss": 2.1234,
"step": 18800
},
{
"epoch": 2.92,
"eval_loss": 2.3294034004211426,
"eval_runtime": 69.3303,
"eval_samples_per_second": 28.847,
"eval_steps_per_second": 1.803,
"step": 18800
},
{
"epoch": 2.92,
"learning_rate": 7.654256703983337e-06,
"loss": 2.1251,
"step": 18820
},
{
"epoch": 2.93,
"learning_rate": 7.341838063004425e-06,
"loss": 2.1278,
"step": 18840
},
{
"epoch": 2.93,
"learning_rate": 7.029419422025514e-06,
"loss": 2.1115,
"step": 18860
},
{
"epoch": 2.93,
"learning_rate": 6.717000781046602e-06,
"loss": 2.1468,
"step": 18880
},
{
"epoch": 2.94,
"learning_rate": 6.4045821400676894e-06,
"loss": 2.0903,
"step": 18900
},
{
"epoch": 2.94,
"learning_rate": 6.092163499088779e-06,
"loss": 2.1271,
"step": 18920
},
{
"epoch": 2.94,
"learning_rate": 5.779744858109867e-06,
"loss": 2.1253,
"step": 18940
},
{
"epoch": 2.95,
"learning_rate": 5.4673262171309545e-06,
"loss": 2.0903,
"step": 18960
},
{
"epoch": 2.95,
"learning_rate": 5.154907576152043e-06,
"loss": 2.1566,
"step": 18980
},
{
"epoch": 2.95,
"learning_rate": 4.842488935173132e-06,
"loss": 2.1477,
"step": 19000
},
{
"epoch": 2.95,
"eval_loss": 2.3293075561523438,
"eval_runtime": 69.6518,
"eval_samples_per_second": 28.714,
"eval_steps_per_second": 1.795,
"step": 19000
}
],
"max_steps": 19305,
"num_train_epochs": 3,
"total_flos": 5.3158443458154725e+19,
"trial_name": null,
"trial_params": null
}