{
"best_metric": 1.2385461330413818,
"best_model_checkpoint": "./Biggie-SmoLlm-0.15B-GrokAdam/checkpoint-1000",
"epoch": 3.0,
"eval_steps": 200,
"global_step": 1017,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0029498525073746312,
"grad_norm": 2.4375,
"learning_rate": 6.666666666666667e-06,
"loss": 2.1014,
"step": 1
},
{
"epoch": 0.0058997050147492625,
"grad_norm": 2.1875,
"learning_rate": 1.3333333333333333e-05,
"loss": 2.1526,
"step": 2
},
{
"epoch": 0.008849557522123894,
"grad_norm": 2.765625,
"learning_rate": 2e-05,
"loss": 2.0123,
"step": 3
},
{
"epoch": 0.011799410029498525,
"grad_norm": 1.6640625,
"learning_rate": 2.6666666666666667e-05,
"loss": 1.8556,
"step": 4
},
{
"epoch": 0.014749262536873156,
"grad_norm": 0.9453125,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.7952,
"step": 5
},
{
"epoch": 0.017699115044247787,
"grad_norm": 1.046875,
"learning_rate": 4e-05,
"loss": 1.7612,
"step": 6
},
{
"epoch": 0.02064896755162242,
"grad_norm": 2.171875,
"learning_rate": 4.666666666666667e-05,
"loss": 1.9708,
"step": 7
},
{
"epoch": 0.02359882005899705,
"grad_norm": 1.0546875,
"learning_rate": 5.333333333333333e-05,
"loss": 1.951,
"step": 8
},
{
"epoch": 0.02654867256637168,
"grad_norm": 1.796875,
"learning_rate": 6e-05,
"loss": 1.8394,
"step": 9
},
{
"epoch": 0.029498525073746312,
"grad_norm": 1.140625,
"learning_rate": 6.666666666666667e-05,
"loss": 1.6955,
"step": 10
},
{
"epoch": 0.032448377581120944,
"grad_norm": 2.421875,
"learning_rate": 7.333333333333333e-05,
"loss": 2.033,
"step": 11
},
{
"epoch": 0.035398230088495575,
"grad_norm": 1.1640625,
"learning_rate": 8e-05,
"loss": 1.8387,
"step": 12
},
{
"epoch": 0.038348082595870206,
"grad_norm": 3.3125,
"learning_rate": 8.666666666666667e-05,
"loss": 1.8568,
"step": 13
},
{
"epoch": 0.04129793510324484,
"grad_norm": 0.6953125,
"learning_rate": 9.333333333333334e-05,
"loss": 1.8152,
"step": 14
},
{
"epoch": 0.04424778761061947,
"grad_norm": 2.015625,
"learning_rate": 0.0001,
"loss": 1.8376,
"step": 15
},
{
"epoch": 0.0471976401179941,
"grad_norm": 1.796875,
"learning_rate": 0.00010666666666666667,
"loss": 1.8792,
"step": 16
},
{
"epoch": 0.05014749262536873,
"grad_norm": 1.8515625,
"learning_rate": 0.00011333333333333334,
"loss": 1.8259,
"step": 17
},
{
"epoch": 0.05309734513274336,
"grad_norm": 1.9140625,
"learning_rate": 0.00012,
"loss": 1.8056,
"step": 18
},
{
"epoch": 0.05604719764011799,
"grad_norm": 1.703125,
"learning_rate": 0.00012666666666666666,
"loss": 1.709,
"step": 19
},
{
"epoch": 0.058997050147492625,
"grad_norm": 1.8828125,
"learning_rate": 0.00013333333333333334,
"loss": 1.8277,
"step": 20
},
{
"epoch": 0.061946902654867256,
"grad_norm": 2.046875,
"learning_rate": 0.00014,
"loss": 1.9101,
"step": 21
},
{
"epoch": 0.06489675516224189,
"grad_norm": 1.0859375,
"learning_rate": 0.00014666666666666666,
"loss": 1.8073,
"step": 22
},
{
"epoch": 0.06784660766961652,
"grad_norm": 1.8359375,
"learning_rate": 0.00015333333333333334,
"loss": 1.9572,
"step": 23
},
{
"epoch": 0.07079646017699115,
"grad_norm": 1.3046875,
"learning_rate": 0.00016,
"loss": 1.7057,
"step": 24
},
{
"epoch": 0.07374631268436578,
"grad_norm": 2.0625,
"learning_rate": 0.0001666666666666667,
"loss": 2.0607,
"step": 25
},
{
"epoch": 0.07669616519174041,
"grad_norm": 2.375,
"learning_rate": 0.00017333333333333334,
"loss": 1.833,
"step": 26
},
{
"epoch": 0.07964601769911504,
"grad_norm": 1.5078125,
"learning_rate": 0.00018,
"loss": 1.8073,
"step": 27
},
{
"epoch": 0.08259587020648967,
"grad_norm": 2.640625,
"learning_rate": 0.0001866666666666667,
"loss": 1.9572,
"step": 28
},
{
"epoch": 0.0855457227138643,
"grad_norm": 1.171875,
"learning_rate": 0.00019333333333333333,
"loss": 1.9372,
"step": 29
},
{
"epoch": 0.08849557522123894,
"grad_norm": 2.328125,
"learning_rate": 0.0002,
"loss": 2.1545,
"step": 30
},
{
"epoch": 0.09144542772861357,
"grad_norm": 1.625,
"learning_rate": 0.00019999949343511917,
"loss": 1.887,
"step": 31
},
{
"epoch": 0.0943952802359882,
"grad_norm": 1.5546875,
"learning_rate": 0.00019999797374560874,
"loss": 1.8879,
"step": 32
},
{
"epoch": 0.09734513274336283,
"grad_norm": 1.8515625,
"learning_rate": 0.0001999954409468652,
"loss": 1.841,
"step": 33
},
{
"epoch": 0.10029498525073746,
"grad_norm": 1.46875,
"learning_rate": 0.00019999189506454904,
"loss": 1.8498,
"step": 34
},
{
"epoch": 0.10324483775811209,
"grad_norm": 2.03125,
"learning_rate": 0.0001999873361345847,
"loss": 1.9648,
"step": 35
},
{
"epoch": 0.10619469026548672,
"grad_norm": 1.2421875,
"learning_rate": 0.00019998176420316002,
"loss": 1.8605,
"step": 36
},
{
"epoch": 0.10914454277286136,
"grad_norm": 1.6796875,
"learning_rate": 0.0001999751793267259,
"loss": 1.9236,
"step": 37
},
{
"epoch": 0.11209439528023599,
"grad_norm": 1.4609375,
"learning_rate": 0.00019996758157199573,
"loss": 1.978,
"step": 38
},
{
"epoch": 0.11504424778761062,
"grad_norm": 1.203125,
"learning_rate": 0.00019995897101594454,
"loss": 1.8624,
"step": 39
},
{
"epoch": 0.11799410029498525,
"grad_norm": 1.53125,
"learning_rate": 0.00019994934774580851,
"loss": 2.0178,
"step": 40
},
{
"epoch": 0.12094395280235988,
"grad_norm": 1.25,
"learning_rate": 0.00019993871185908381,
"loss": 1.9454,
"step": 41
},
{
"epoch": 0.12389380530973451,
"grad_norm": 1.4375,
"learning_rate": 0.00019992706346352577,
"loss": 2.2231,
"step": 42
},
{
"epoch": 0.12684365781710916,
"grad_norm": 1.140625,
"learning_rate": 0.00019991440267714782,
"loss": 1.9269,
"step": 43
},
{
"epoch": 0.12979351032448377,
"grad_norm": 0.98828125,
"learning_rate": 0.00019990072962822007,
"loss": 1.8905,
"step": 44
},
{
"epoch": 0.13274336283185842,
"grad_norm": 1.3671875,
"learning_rate": 0.00019988604445526827,
"loss": 1.9673,
"step": 45
},
{
"epoch": 0.13569321533923304,
"grad_norm": 1.5625,
"learning_rate": 0.00019987034730707234,
"loss": 1.8198,
"step": 46
},
{
"epoch": 0.13864306784660768,
"grad_norm": 1.1015625,
"learning_rate": 0.0001998536383426647,
"loss": 1.7608,
"step": 47
},
{
"epoch": 0.1415929203539823,
"grad_norm": 1.046875,
"learning_rate": 0.00019983591773132882,
"loss": 2.0364,
"step": 48
},
{
"epoch": 0.14454277286135694,
"grad_norm": 1.140625,
"learning_rate": 0.00019981718565259752,
"loss": 1.7861,
"step": 49
},
{
"epoch": 0.14749262536873156,
"grad_norm": 1.5234375,
"learning_rate": 0.0001997974422962511,
"loss": 1.9331,
"step": 50
},
{
"epoch": 0.1504424778761062,
"grad_norm": 1.1015625,
"learning_rate": 0.00019977668786231534,
"loss": 1.7964,
"step": 51
},
{
"epoch": 0.15339233038348082,
"grad_norm": 1.21875,
"learning_rate": 0.00019975492256105957,
"loss": 1.7878,
"step": 52
},
{
"epoch": 0.15634218289085547,
"grad_norm": 1.1953125,
"learning_rate": 0.00019973214661299455,
"loss": 1.8113,
"step": 53
},
{
"epoch": 0.1592920353982301,
"grad_norm": 0.98046875,
"learning_rate": 0.0001997083602488702,
"loss": 1.8453,
"step": 54
},
{
"epoch": 0.16224188790560473,
"grad_norm": 1.15625,
"learning_rate": 0.00019968356370967327,
"loss": 1.8191,
"step": 55
},
{
"epoch": 0.16519174041297935,
"grad_norm": 0.99609375,
"learning_rate": 0.00019965775724662484,
"loss": 1.811,
"step": 56
},
{
"epoch": 0.168141592920354,
"grad_norm": 1.3203125,
"learning_rate": 0.00019963094112117785,
"loss": 1.9565,
"step": 57
},
{
"epoch": 0.1710914454277286,
"grad_norm": 0.66015625,
"learning_rate": 0.00019960311560501454,
"loss": 1.8266,
"step": 58
},
{
"epoch": 0.17404129793510326,
"grad_norm": 0.46875,
"learning_rate": 0.00019957428098004343,
"loss": 1.887,
"step": 59
},
{
"epoch": 0.17699115044247787,
"grad_norm": 0.388671875,
"learning_rate": 0.00019954443753839667,
"loss": 1.8017,
"step": 60
},
{
"epoch": 0.17994100294985252,
"grad_norm": 0.64453125,
"learning_rate": 0.00019951358558242707,
"loss": 2.0268,
"step": 61
},
{
"epoch": 0.18289085545722714,
"grad_norm": 0.56640625,
"learning_rate": 0.000199481725424705,
"loss": 2.046,
"step": 62
},
{
"epoch": 0.18584070796460178,
"grad_norm": 0.48828125,
"learning_rate": 0.00019944885738801518,
"loss": 1.7961,
"step": 63
},
{
"epoch": 0.1887905604719764,
"grad_norm": 0.59375,
"learning_rate": 0.00019941498180535346,
"loss": 1.8586,
"step": 64
},
{
"epoch": 0.19174041297935104,
"grad_norm": 0.72265625,
"learning_rate": 0.0001993800990199235,
"loss": 1.913,
"step": 65
},
{
"epoch": 0.19469026548672566,
"grad_norm": 0.69140625,
"learning_rate": 0.00019934420938513313,
"loss": 1.7897,
"step": 66
},
{
"epoch": 0.1976401179941003,
"grad_norm": 0.625,
"learning_rate": 0.00019930731326459088,
"loss": 1.63,
"step": 67
},
{
"epoch": 0.20058997050147492,
"grad_norm": 0.376953125,
"learning_rate": 0.00019926941103210247,
"loss": 1.8577,
"step": 68
},
{
"epoch": 0.20353982300884957,
"grad_norm": 0.3359375,
"learning_rate": 0.00019923050307166655,
"loss": 1.6372,
"step": 69
},
{
"epoch": 0.20648967551622419,
"grad_norm": 1.0234375,
"learning_rate": 0.00019919058977747135,
"loss": 1.995,
"step": 70
},
{
"epoch": 0.20943952802359883,
"grad_norm": 1.3046875,
"learning_rate": 0.00019914967155389027,
"loss": 1.9321,
"step": 71
},
{
"epoch": 0.21238938053097345,
"grad_norm": 0.6015625,
"learning_rate": 0.000199107748815478,
"loss": 1.9429,
"step": 72
},
{
"epoch": 0.2153392330383481,
"grad_norm": 0.494140625,
"learning_rate": 0.00019906482198696636,
"loss": 1.8391,
"step": 73
},
{
"epoch": 0.2182890855457227,
"grad_norm": 0.5859375,
"learning_rate": 0.00019902089150325978,
"loss": 1.8633,
"step": 74
},
{
"epoch": 0.22123893805309736,
"grad_norm": 0.73828125,
"learning_rate": 0.00019897595780943102,
"loss": 1.9238,
"step": 75
},
{
"epoch": 0.22418879056047197,
"grad_norm": 0.640625,
"learning_rate": 0.0001989300213607168,
"loss": 1.7452,
"step": 76
},
{
"epoch": 0.22713864306784662,
"grad_norm": 0.58984375,
"learning_rate": 0.00019888308262251285,
"loss": 1.7617,
"step": 77
},
{
"epoch": 0.23008849557522124,
"grad_norm": 0.62890625,
"learning_rate": 0.00019883514207036956,
"loss": 1.7566,
"step": 78
},
{
"epoch": 0.23303834808259588,
"grad_norm": 0.59765625,
"learning_rate": 0.00019878620018998696,
"loss": 1.6552,
"step": 79
},
{
"epoch": 0.2359882005899705,
"grad_norm": 0.62890625,
"learning_rate": 0.00019873625747720972,
"loss": 1.7515,
"step": 80
},
{
"epoch": 0.23893805309734514,
"grad_norm": 0.62890625,
"learning_rate": 0.0001986853144380224,
"loss": 1.6423,
"step": 81
},
{
"epoch": 0.24188790560471976,
"grad_norm": 0.5859375,
"learning_rate": 0.00019863337158854404,
"loss": 2.1023,
"step": 82
},
{
"epoch": 0.2448377581120944,
"grad_norm": 0.60546875,
"learning_rate": 0.00019858042945502318,
"loss": 1.8227,
"step": 83
},
{
"epoch": 0.24778761061946902,
"grad_norm": 0.404296875,
"learning_rate": 0.00019852648857383222,
"loss": 1.807,
"step": 84
},
{
"epoch": 0.25073746312684364,
"grad_norm": 0.267578125,
"learning_rate": 0.00019847154949146237,
"loss": 1.7736,
"step": 85
},
{
"epoch": 0.2536873156342183,
"grad_norm": 0.291015625,
"learning_rate": 0.0001984156127645178,
"loss": 1.7114,
"step": 86
},
{
"epoch": 0.25663716814159293,
"grad_norm": 0.3203125,
"learning_rate": 0.00019835867895971014,
"loss": 1.9551,
"step": 87
},
{
"epoch": 0.25958702064896755,
"grad_norm": 0.365234375,
"learning_rate": 0.00019830074865385272,
"loss": 1.6919,
"step": 88
},
{
"epoch": 0.26253687315634217,
"grad_norm": 0.3515625,
"learning_rate": 0.00019824182243385465,
"loss": 1.6129,
"step": 89
},
{
"epoch": 0.26548672566371684,
"grad_norm": 0.42578125,
"learning_rate": 0.00019818190089671508,
"loss": 1.8377,
"step": 90
},
{
"epoch": 0.26843657817109146,
"grad_norm": 0.41015625,
"learning_rate": 0.0001981209846495169,
"loss": 1.7187,
"step": 91
},
{
"epoch": 0.2713864306784661,
"grad_norm": 0.359375,
"learning_rate": 0.00019805907430942075,
"loss": 1.7452,
"step": 92
},
{
"epoch": 0.2743362831858407,
"grad_norm": 0.388671875,
"learning_rate": 0.0001979961705036587,
"loss": 1.7984,
"step": 93
},
{
"epoch": 0.27728613569321536,
"grad_norm": 0.35546875,
"learning_rate": 0.00019793227386952794,
"loss": 1.6951,
"step": 94
},
{
"epoch": 0.28023598820059,
"grad_norm": 0.279296875,
"learning_rate": 0.00019786738505438427,
"loss": 1.8082,
"step": 95
},
{
"epoch": 0.2831858407079646,
"grad_norm": 0.28125,
"learning_rate": 0.00019780150471563558,
"loss": 1.6958,
"step": 96
},
{
"epoch": 0.2861356932153392,
"grad_norm": 0.3671875,
"learning_rate": 0.00019773463352073525,
"loss": 1.5338,
"step": 97
},
{
"epoch": 0.2890855457227139,
"grad_norm": 0.302734375,
"learning_rate": 0.0001976667721471752,
"loss": 1.7828,
"step": 98
},
{
"epoch": 0.2920353982300885,
"grad_norm": 0.32421875,
"learning_rate": 0.00019759792128247922,
"loss": 1.768,
"step": 99
},
{
"epoch": 0.2949852507374631,
"grad_norm": 0.3515625,
"learning_rate": 0.0001975280816241959,
"loss": 1.8709,
"step": 100
},
{
"epoch": 0.29793510324483774,
"grad_norm": 0.328125,
"learning_rate": 0.00019745725387989164,
"loss": 1.899,
"step": 101
},
{
"epoch": 0.3008849557522124,
"grad_norm": 0.388671875,
"learning_rate": 0.00019738543876714334,
"loss": 1.8527,
"step": 102
},
{
"epoch": 0.30383480825958703,
"grad_norm": 0.369140625,
"learning_rate": 0.00019731263701353133,
"loss": 1.8852,
"step": 103
},
{
"epoch": 0.30678466076696165,
"grad_norm": 0.3515625,
"learning_rate": 0.00019723884935663182,
"loss": 1.6662,
"step": 104
},
{
"epoch": 0.30973451327433627,
"grad_norm": 0.3203125,
"learning_rate": 0.00019716407654400952,
"loss": 1.7199,
"step": 105
},
{
"epoch": 0.31268436578171094,
"grad_norm": 0.275390625,
"learning_rate": 0.00019708831933321004,
"loss": 1.8138,
"step": 106
},
{
"epoch": 0.31563421828908556,
"grad_norm": 0.275390625,
"learning_rate": 0.00019701157849175228,
"loss": 1.7826,
"step": 107
},
{
"epoch": 0.3185840707964602,
"grad_norm": 0.3125,
"learning_rate": 0.00019693385479712048,
"loss": 1.5911,
"step": 108
},
{
"epoch": 0.3215339233038348,
"grad_norm": 0.314453125,
"learning_rate": 0.00019685514903675655,
"loss": 1.6684,
"step": 109
},
{
"epoch": 0.32448377581120946,
"grad_norm": 0.314453125,
"learning_rate": 0.00019677546200805196,
"loss": 1.7101,
"step": 110
},
{
"epoch": 0.3274336283185841,
"grad_norm": 0.306640625,
"learning_rate": 0.00019669479451833976,
"loss": 1.7109,
"step": 111
},
{
"epoch": 0.3303834808259587,
"grad_norm": 0.314453125,
"learning_rate": 0.00019661314738488627,
"loss": 1.8087,
"step": 112
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.3203125,
"learning_rate": 0.00019653052143488287,
"loss": 1.728,
"step": 113
},
{
"epoch": 0.336283185840708,
"grad_norm": 0.3125,
"learning_rate": 0.00019644691750543767,
"loss": 1.891,
"step": 114
},
{
"epoch": 0.3392330383480826,
"grad_norm": 0.427734375,
"learning_rate": 0.00019636233644356696,
"loss": 1.9241,
"step": 115
},
{
"epoch": 0.3421828908554572,
"grad_norm": 0.37109375,
"learning_rate": 0.00019627677910618666,
"loss": 1.8242,
"step": 116
},
{
"epoch": 0.34513274336283184,
"grad_norm": 0.330078125,
"learning_rate": 0.00019619024636010363,
"loss": 1.62,
"step": 117
},
{
"epoch": 0.3480825958702065,
"grad_norm": 0.369140625,
"learning_rate": 0.00019610273908200683,
"loss": 1.7547,
"step": 118
},
{
"epoch": 0.35103244837758113,
"grad_norm": 0.359375,
"learning_rate": 0.00019601425815845858,
"loss": 1.7753,
"step": 119
},
{
"epoch": 0.35398230088495575,
"grad_norm": 0.328125,
"learning_rate": 0.00019592480448588542,
"loss": 1.7662,
"step": 120
},
{
"epoch": 0.35693215339233036,
"grad_norm": 0.392578125,
"learning_rate": 0.00019583437897056915,
"loss": 1.6668,
"step": 121
},
{
"epoch": 0.35988200589970504,
"grad_norm": 0.39453125,
"learning_rate": 0.00019574298252863758,
"loss": 1.7956,
"step": 122
},
{
"epoch": 0.36283185840707965,
"grad_norm": 0.34765625,
"learning_rate": 0.00019565061608605526,
"loss": 1.8266,
"step": 123
},
{
"epoch": 0.36578171091445427,
"grad_norm": 0.333984375,
"learning_rate": 0.0001955572805786141,
"loss": 1.7884,
"step": 124
},
{
"epoch": 0.3687315634218289,
"grad_norm": 0.29296875,
"learning_rate": 0.00019546297695192388,
"loss": 1.9548,
"step": 125
},
{
"epoch": 0.37168141592920356,
"grad_norm": 0.318359375,
"learning_rate": 0.00019536770616140276,
"loss": 1.7491,
"step": 126
},
{
"epoch": 0.3746312684365782,
"grad_norm": 0.314453125,
"learning_rate": 0.00019527146917226745,
"loss": 1.8154,
"step": 127
},
{
"epoch": 0.3775811209439528,
"grad_norm": 0.2890625,
"learning_rate": 0.00019517426695952358,
"loss": 1.6983,
"step": 128
},
{
"epoch": 0.3805309734513274,
"grad_norm": 0.29296875,
"learning_rate": 0.00019507610050795558,
"loss": 2.03,
"step": 129
},
{
"epoch": 0.3834808259587021,
"grad_norm": 0.26953125,
"learning_rate": 0.00019497697081211708,
"loss": 1.7542,
"step": 130
},
{
"epoch": 0.3864306784660767,
"grad_norm": 0.2734375,
"learning_rate": 0.00019487687887632048,
"loss": 1.7357,
"step": 131
},
{
"epoch": 0.3893805309734513,
"grad_norm": 0.271484375,
"learning_rate": 0.00019477582571462705,
"loss": 1.8159,
"step": 132
},
{
"epoch": 0.39233038348082594,
"grad_norm": 0.267578125,
"learning_rate": 0.00019467381235083634,
"loss": 1.6541,
"step": 133
},
{
"epoch": 0.3952802359882006,
"grad_norm": 0.255859375,
"learning_rate": 0.00019457083981847616,
"loss": 1.6379,
"step": 134
},
{
"epoch": 0.39823008849557523,
"grad_norm": 0.248046875,
"learning_rate": 0.0001944669091607919,
"loss": 1.7683,
"step": 135
},
{
"epoch": 0.40117994100294985,
"grad_norm": 0.279296875,
"learning_rate": 0.0001943620214307359,
"loss": 1.6556,
"step": 136
},
{
"epoch": 0.40412979351032446,
"grad_norm": 0.2890625,
"learning_rate": 0.0001942561776909571,
"loss": 1.8947,
"step": 137
},
{
"epoch": 0.40707964601769914,
"grad_norm": 0.298828125,
"learning_rate": 0.00019414937901378982,
"loss": 1.7431,
"step": 138
},
{
"epoch": 0.41002949852507375,
"grad_norm": 0.26953125,
"learning_rate": 0.0001940416264812433,
"loss": 1.7434,
"step": 139
},
{
"epoch": 0.41297935103244837,
"grad_norm": 0.263671875,
"learning_rate": 0.00019393292118499044,
"loss": 1.7497,
"step": 140
},
{
"epoch": 0.415929203539823,
"grad_norm": 0.353515625,
"learning_rate": 0.00019382326422635705,
"loss": 1.9276,
"step": 141
},
{
"epoch": 0.41887905604719766,
"grad_norm": 0.396484375,
"learning_rate": 0.00019371265671631037,
"loss": 1.6744,
"step": 142
},
{
"epoch": 0.4218289085545723,
"grad_norm": 0.36328125,
"learning_rate": 0.00019360109977544795,
"loss": 1.8112,
"step": 143
},
{
"epoch": 0.4247787610619469,
"grad_norm": 0.3828125,
"learning_rate": 0.00019348859453398646,
"loss": 1.5409,
"step": 144
},
{
"epoch": 0.4277286135693215,
"grad_norm": 0.306640625,
"learning_rate": 0.00019337514213174992,
"loss": 1.9247,
"step": 145
},
{
"epoch": 0.4306784660766962,
"grad_norm": 0.267578125,
"learning_rate": 0.0001932607437181584,
"loss": 1.706,
"step": 146
},
{
"epoch": 0.4336283185840708,
"grad_norm": 0.302734375,
"learning_rate": 0.00019314540045221626,
"loss": 1.6808,
"step": 147
},
{
"epoch": 0.4365781710914454,
"grad_norm": 0.294921875,
"learning_rate": 0.0001930291135025005,
"loss": 1.5996,
"step": 148
},
{
"epoch": 0.43952802359882004,
"grad_norm": 0.328125,
"learning_rate": 0.00019291188404714878,
"loss": 1.7956,
"step": 149
},
{
"epoch": 0.4424778761061947,
"grad_norm": 0.275390625,
"learning_rate": 0.0001927937132738476,
"loss": 1.7493,
"step": 150
},
{
"epoch": 0.44542772861356933,
"grad_norm": 0.2890625,
"learning_rate": 0.0001926746023798202,
"loss": 1.69,
"step": 151
},
{
"epoch": 0.44837758112094395,
"grad_norm": 0.341796875,
"learning_rate": 0.00019255455257181456,
"loss": 1.5992,
"step": 152
},
{
"epoch": 0.45132743362831856,
"grad_norm": 0.37890625,
"learning_rate": 0.000192433565066091,
"loss": 1.7004,
"step": 153
},
{
"epoch": 0.45427728613569324,
"grad_norm": 0.34765625,
"learning_rate": 0.00019231164108840995,
"loss": 1.9138,
"step": 154
},
{
"epoch": 0.45722713864306785,
"grad_norm": 0.263671875,
"learning_rate": 0.00019218878187401948,
"loss": 1.7336,
"step": 155
},
{
"epoch": 0.46017699115044247,
"grad_norm": 0.271484375,
"learning_rate": 0.00019206498866764288,
"loss": 1.5959,
"step": 156
},
{
"epoch": 0.4631268436578171,
"grad_norm": 0.29296875,
"learning_rate": 0.00019194026272346596,
"loss": 1.6,
"step": 157
},
{
"epoch": 0.46607669616519176,
"grad_norm": 0.326171875,
"learning_rate": 0.00019181460530512441,
"loss": 1.5815,
"step": 158
},
{
"epoch": 0.4690265486725664,
"grad_norm": 0.28515625,
"learning_rate": 0.0001916880176856909,
"loss": 1.6702,
"step": 159
},
{
"epoch": 0.471976401179941,
"grad_norm": 0.25390625,
"learning_rate": 0.0001915605011476623,
"loss": 1.6265,
"step": 160
},
{
"epoch": 0.4749262536873156,
"grad_norm": 0.291015625,
"learning_rate": 0.0001914320569829466,
"loss": 1.7662,
"step": 161
},
{
"epoch": 0.4778761061946903,
"grad_norm": 0.302734375,
"learning_rate": 0.0001913026864928498,
"loss": 1.6086,
"step": 162
},
{
"epoch": 0.4808259587020649,
"grad_norm": 0.318359375,
"learning_rate": 0.00019117239098806295,
"loss": 1.8047,
"step": 163
},
{
"epoch": 0.4837758112094395,
"grad_norm": 0.353515625,
"learning_rate": 0.00019104117178864852,
"loss": 1.5078,
"step": 164
},
{
"epoch": 0.48672566371681414,
"grad_norm": 0.2734375,
"learning_rate": 0.00019090903022402729,
"loss": 1.6856,
"step": 165
},
{
"epoch": 0.4896755162241888,
"grad_norm": 0.310546875,
"learning_rate": 0.00019077596763296474,
"loss": 1.7237,
"step": 166
},
{
"epoch": 0.49262536873156343,
"grad_norm": 0.287109375,
"learning_rate": 0.00019064198536355761,
"loss": 1.6482,
"step": 167
},
{
"epoch": 0.49557522123893805,
"grad_norm": 0.349609375,
"learning_rate": 0.00019050708477322018,
"loss": 1.8672,
"step": 168
},
{
"epoch": 0.49852507374631266,
"grad_norm": 0.28515625,
"learning_rate": 0.00019037126722867045,
"loss": 1.8568,
"step": 169
},
{
"epoch": 0.5014749262536873,
"grad_norm": 0.267578125,
"learning_rate": 0.00019023453410591635,
"loss": 1.587,
"step": 170
},
{
"epoch": 0.504424778761062,
"grad_norm": 0.275390625,
"learning_rate": 0.0001900968867902419,
"loss": 1.7017,
"step": 171
},
{
"epoch": 0.5073746312684366,
"grad_norm": 0.26171875,
"learning_rate": 0.000189958326676193,
"loss": 1.638,
"step": 172
},
{
"epoch": 0.5103244837758112,
"grad_norm": 0.306640625,
"learning_rate": 0.0001898188551675634,
"loss": 1.7855,
"step": 173
},
{
"epoch": 0.5132743362831859,
"grad_norm": 0.318359375,
"learning_rate": 0.00018967847367738048,
"loss": 1.5708,
"step": 174
},
{
"epoch": 0.5162241887905604,
"grad_norm": 0.359375,
"learning_rate": 0.00018953718362789085,
"loss": 1.6798,
"step": 175
},
{
"epoch": 0.5191740412979351,
"grad_norm": 0.419921875,
"learning_rate": 0.0001893949864505461,
"loss": 1.8502,
"step": 176
},
{
"epoch": 0.5221238938053098,
"grad_norm": 0.287109375,
"learning_rate": 0.00018925188358598813,
"loss": 1.7015,
"step": 177
},
{
"epoch": 0.5250737463126843,
"grad_norm": 0.291015625,
"learning_rate": 0.00018910787648403465,
"loss": 1.5544,
"step": 178
},
{
"epoch": 0.528023598820059,
"grad_norm": 0.279296875,
"learning_rate": 0.00018896296660366447,
"loss": 1.667,
"step": 179
},
{
"epoch": 0.5309734513274337,
"grad_norm": 0.265625,
"learning_rate": 0.00018881715541300276,
"loss": 1.7048,
"step": 180
},
{
"epoch": 0.5339233038348082,
"grad_norm": 0.31640625,
"learning_rate": 0.000188670444389306,
"loss": 1.5393,
"step": 181
},
{
"epoch": 0.5368731563421829,
"grad_norm": 0.26953125,
"learning_rate": 0.00018852283501894732,
"loss": 1.5774,
"step": 182
},
{
"epoch": 0.5398230088495575,
"grad_norm": 0.26171875,
"learning_rate": 0.00018837432879740114,
"loss": 1.6746,
"step": 183
},
{
"epoch": 0.5427728613569321,
"grad_norm": 0.337890625,
"learning_rate": 0.0001882249272292282,
"loss": 1.8244,
"step": 184
},
{
"epoch": 0.5457227138643068,
"grad_norm": 0.337890625,
"learning_rate": 0.0001880746318280602,
"loss": 1.6168,
"step": 185
},
{
"epoch": 0.5486725663716814,
"grad_norm": 0.314453125,
"learning_rate": 0.00018792344411658468,
"loss": 1.5374,
"step": 186
},
{
"epoch": 0.551622418879056,
"grad_norm": 0.357421875,
"learning_rate": 0.00018777136562652929,
"loss": 1.515,
"step": 187
},
{
"epoch": 0.5545722713864307,
"grad_norm": 0.26953125,
"learning_rate": 0.00018761839789864645,
"loss": 1.7637,
"step": 188
},
{
"epoch": 0.5575221238938053,
"grad_norm": 0.29296875,
"learning_rate": 0.00018746454248269777,
"loss": 1.5899,
"step": 189
},
{
"epoch": 0.56047197640118,
"grad_norm": 0.333984375,
"learning_rate": 0.00018730980093743823,
"loss": 1.611,
"step": 190
},
{
"epoch": 0.5634218289085545,
"grad_norm": 0.30859375,
"learning_rate": 0.0001871541748306005,
"loss": 1.694,
"step": 191
},
{
"epoch": 0.5663716814159292,
"grad_norm": 0.251953125,
"learning_rate": 0.000186997665738879,
"loss": 1.6232,
"step": 192
},
{
"epoch": 0.5693215339233039,
"grad_norm": 0.296875,
"learning_rate": 0.00018684027524791386,
"loss": 1.5747,
"step": 193
},
{
"epoch": 0.5722713864306784,
"grad_norm": 0.328125,
"learning_rate": 0.00018668200495227505,
"loss": 1.694,
"step": 194
},
{
"epoch": 0.5752212389380531,
"grad_norm": 0.306640625,
"learning_rate": 0.00018652285645544603,
"loss": 1.6196,
"step": 195
},
{
"epoch": 0.5781710914454278,
"grad_norm": 0.2373046875,
"learning_rate": 0.00018636283136980758,
"loss": 1.7641,
"step": 196
},
{
"epoch": 0.5811209439528023,
"grad_norm": 0.24609375,
"learning_rate": 0.0001862019313166214,
"loss": 1.7193,
"step": 197
},
{
"epoch": 0.584070796460177,
"grad_norm": 0.33203125,
"learning_rate": 0.00018604015792601396,
"loss": 1.623,
"step": 198
},
{
"epoch": 0.5870206489675516,
"grad_norm": 0.2734375,
"learning_rate": 0.00018587751283695949,
"loss": 1.6029,
"step": 199
},
{
"epoch": 0.5899705014749262,
"grad_norm": 0.275390625,
"learning_rate": 0.00018571399769726386,
"loss": 1.8552,
"step": 200
},
{
"epoch": 0.5899705014749262,
"eval_loss": 1.4902458190917969,
"eval_runtime": 31.6624,
"eval_samples_per_second": 31.583,
"eval_steps_per_second": 3.948,
"step": 200
},
{
"epoch": 0.5929203539823009,
"grad_norm": 0.296875,
"learning_rate": 0.0001855496141635476,
"loss": 1.5889,
"step": 201
},
{
"epoch": 0.5958702064896755,
"grad_norm": 0.287109375,
"learning_rate": 0.0001853843639012292,
"loss": 1.6736,
"step": 202
},
{
"epoch": 0.5988200589970502,
"grad_norm": 0.271484375,
"learning_rate": 0.00018521824858450827,
"loss": 1.7336,
"step": 203
},
{
"epoch": 0.6017699115044248,
"grad_norm": 0.255859375,
"learning_rate": 0.0001850512698963485,
"loss": 1.7503,
"step": 204
},
{
"epoch": 0.6047197640117994,
"grad_norm": 0.259765625,
"learning_rate": 0.00018488342952846073,
"loss": 1.5787,
"step": 205
},
{
"epoch": 0.6076696165191741,
"grad_norm": 0.2734375,
"learning_rate": 0.00018471472918128563,
"loss": 1.6003,
"step": 206
},
{
"epoch": 0.6106194690265486,
"grad_norm": 0.26953125,
"learning_rate": 0.00018454517056397661,
"loss": 1.7586,
"step": 207
},
{
"epoch": 0.6135693215339233,
"grad_norm": 0.25390625,
"learning_rate": 0.00018437475539438255,
"loss": 1.929,
"step": 208
},
{
"epoch": 0.616519174041298,
"grad_norm": 0.259765625,
"learning_rate": 0.00018420348539903019,
"loss": 1.6259,
"step": 209
},
{
"epoch": 0.6194690265486725,
"grad_norm": 0.275390625,
"learning_rate": 0.00018403136231310684,
"loss": 1.5523,
"step": 210
},
{
"epoch": 0.6224188790560472,
"grad_norm": 0.263671875,
"learning_rate": 0.00018385838788044273,
"loss": 1.6995,
"step": 211
},
{
"epoch": 0.6253687315634219,
"grad_norm": 0.283203125,
"learning_rate": 0.00018368456385349334,
"loss": 1.5545,
"step": 212
},
{
"epoch": 0.6283185840707964,
"grad_norm": 0.296875,
"learning_rate": 0.00018350989199332154,
"loss": 1.5912,
"step": 213
},
{
"epoch": 0.6312684365781711,
"grad_norm": 0.291015625,
"learning_rate": 0.00018333437406957995,
"loss": 1.7984,
"step": 214
},
{
"epoch": 0.6342182890855457,
"grad_norm": 0.24609375,
"learning_rate": 0.000183158011860493,
"loss": 1.7138,
"step": 215
},
{
"epoch": 0.6371681415929203,
"grad_norm": 0.259765625,
"learning_rate": 0.00018298080715283858,
"loss": 1.6766,
"step": 216
},
{
"epoch": 0.640117994100295,
"grad_norm": 0.271484375,
"learning_rate": 0.0001828027617419304,
"loss": 1.5929,
"step": 217
},
{
"epoch": 0.6430678466076696,
"grad_norm": 0.28515625,
"learning_rate": 0.0001826238774315995,
"loss": 1.7106,
"step": 218
},
{
"epoch": 0.6460176991150443,
"grad_norm": 0.333984375,
"learning_rate": 0.00018244415603417603,
"loss": 1.5583,
"step": 219
},
{
"epoch": 0.6489675516224189,
"grad_norm": 0.31640625,
"learning_rate": 0.000182263599370471,
"loss": 1.7153,
"step": 220
},
{
"epoch": 0.6519174041297935,
"grad_norm": 0.291015625,
"learning_rate": 0.0001820822092697577,
"loss": 1.7741,
"step": 221
},
{
"epoch": 0.6548672566371682,
"grad_norm": 0.287109375,
"learning_rate": 0.00018189998756975318,
"loss": 1.7517,
"step": 222
},
{
"epoch": 0.6578171091445427,
"grad_norm": 0.279296875,
"learning_rate": 0.00018171693611659977,
"loss": 1.8076,
"step": 223
},
{
"epoch": 0.6607669616519174,
"grad_norm": 0.30078125,
"learning_rate": 0.00018153305676484619,
"loss": 1.6618,
"step": 224
},
{
"epoch": 0.6637168141592921,
"grad_norm": 0.32421875,
"learning_rate": 0.0001813483513774289,
"loss": 1.7275,
"step": 225
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.28125,
"learning_rate": 0.00018116282182565311,
"loss": 1.681,
"step": 226
},
{
"epoch": 0.6696165191740413,
"grad_norm": 0.287109375,
"learning_rate": 0.00018097646998917397,
"loss": 1.5952,
"step": 227
},
{
"epoch": 0.672566371681416,
"grad_norm": 0.2890625,
"learning_rate": 0.0001807892977559774,
"loss": 1.6065,
"step": 228
},
{
"epoch": 0.6755162241887905,
"grad_norm": 0.263671875,
"learning_rate": 0.00018060130702236097,
"loss": 1.7203,
"step": 229
},
{
"epoch": 0.6784660766961652,
"grad_norm": 0.26953125,
"learning_rate": 0.00018041249969291475,
"loss": 1.6751,
"step": 230
},
{
"epoch": 0.6814159292035398,
"grad_norm": 0.236328125,
"learning_rate": 0.00018022287768050202,
"loss": 1.6015,
"step": 231
},
{
"epoch": 0.6843657817109144,
"grad_norm": 0.25390625,
"learning_rate": 0.0001800324429062398,
"loss": 1.6382,
"step": 232
},
{
"epoch": 0.6873156342182891,
"grad_norm": 0.3046875,
"learning_rate": 0.00017984119729947944,
"loss": 1.5951,
"step": 233
},
{
"epoch": 0.6902654867256637,
"grad_norm": 0.353515625,
"learning_rate": 0.00017964914279778715,
"loss": 1.6952,
"step": 234
},
{
"epoch": 0.6932153392330384,
"grad_norm": 0.271484375,
"learning_rate": 0.00017945628134692423,
"loss": 1.6119,
"step": 235
},
{
"epoch": 0.696165191740413,
"grad_norm": 0.2373046875,
"learning_rate": 0.0001792626149008274,
"loss": 1.5447,
"step": 236
},
{
"epoch": 0.6991150442477876,
"grad_norm": 0.255859375,
"learning_rate": 0.0001790681454215891,
"loss": 1.5155,
"step": 237
},
{
"epoch": 0.7020648967551623,
"grad_norm": 0.306640625,
"learning_rate": 0.00017887287487943754,
"loss": 1.6247,
"step": 238
},
{
"epoch": 0.7050147492625368,
"grad_norm": 0.27734375,
"learning_rate": 0.00017867680525271662,
"loss": 1.8957,
"step": 239
},
{
"epoch": 0.7079646017699115,
"grad_norm": 0.2734375,
"learning_rate": 0.0001784799385278661,
"loss": 1.6773,
"step": 240
},
{
"epoch": 0.7109144542772862,
"grad_norm": 0.291015625,
"learning_rate": 0.0001782822766994014,
"loss": 1.8234,
"step": 241
},
{
"epoch": 0.7138643067846607,
"grad_norm": 0.275390625,
"learning_rate": 0.0001780838217698933,
"loss": 1.7428,
"step": 242
},
{
"epoch": 0.7168141592920354,
"grad_norm": 0.2890625,
"learning_rate": 0.00017788457574994778,
"loss": 1.5677,
"step": 243
},
{
"epoch": 0.7197640117994101,
"grad_norm": 0.26953125,
"learning_rate": 0.00017768454065818551,
"loss": 1.675,
"step": 244
},
{
"epoch": 0.7227138643067846,
"grad_norm": 0.255859375,
"learning_rate": 0.00017748371852122164,
"loss": 1.6499,
"step": 245
},
{
"epoch": 0.7256637168141593,
"grad_norm": 0.26171875,
"learning_rate": 0.00017728211137364489,
"loss": 1.6106,
"step": 246
},
{
"epoch": 0.7286135693215339,
"grad_norm": 0.263671875,
"learning_rate": 0.00017707972125799735,
"loss": 1.9783,
"step": 247
},
{
"epoch": 0.7315634218289085,
"grad_norm": 0.2314453125,
"learning_rate": 0.0001768765502247535,
"loss": 1.6949,
"step": 248
},
{
"epoch": 0.7345132743362832,
"grad_norm": 0.28515625,
"learning_rate": 0.00017667260033229953,
"loss": 1.8032,
"step": 249
},
{
"epoch": 0.7374631268436578,
"grad_norm": 0.314453125,
"learning_rate": 0.00017646787364691252,
"loss": 1.4228,
"step": 250
},
{
"epoch": 0.7404129793510325,
"grad_norm": 0.271484375,
"learning_rate": 0.00017626237224273945,
"loss": 1.6913,
"step": 251
},
{
"epoch": 0.7433628318584071,
"grad_norm": 0.2421875,
"learning_rate": 0.00017605609820177617,
"loss": 1.7456,
"step": 252
},
{
"epoch": 0.7463126843657817,
"grad_norm": 0.267578125,
"learning_rate": 0.00017584905361384644,
"loss": 1.7311,
"step": 253
},
{
"epoch": 0.7492625368731564,
"grad_norm": 0.322265625,
"learning_rate": 0.00017564124057658056,
"loss": 1.6269,
"step": 254
},
{
"epoch": 0.7522123893805309,
"grad_norm": 0.29296875,
"learning_rate": 0.00017543266119539422,
"loss": 1.6027,
"step": 255
},
{
"epoch": 0.7551622418879056,
"grad_norm": 0.251953125,
"learning_rate": 0.0001752233175834673,
"loss": 1.7218,
"step": 256
},
{
"epoch": 0.7581120943952803,
"grad_norm": 0.291015625,
"learning_rate": 0.00017501321186172216,
"loss": 1.462,
"step": 257
},
{
"epoch": 0.7610619469026548,
"grad_norm": 0.2353515625,
"learning_rate": 0.00017480234615880247,
"loss": 1.5678,
"step": 258
},
{
"epoch": 0.7640117994100295,
"grad_norm": 0.251953125,
"learning_rate": 0.00017459072261105137,
"loss": 1.6304,
"step": 259
},
{
"epoch": 0.7669616519174042,
"grad_norm": 0.267578125,
"learning_rate": 0.00017437834336249001,
"loss": 1.6032,
"step": 260
},
{
"epoch": 0.7699115044247787,
"grad_norm": 0.248046875,
"learning_rate": 0.00017416521056479577,
"loss": 1.6544,
"step": 261
},
{
"epoch": 0.7728613569321534,
"grad_norm": 0.2578125,
"learning_rate": 0.00017395132637728047,
"loss": 1.6324,
"step": 262
},
{
"epoch": 0.775811209439528,
"grad_norm": 0.375,
"learning_rate": 0.00017373669296686842,
"loss": 1.6961,
"step": 263
},
{
"epoch": 0.7787610619469026,
"grad_norm": 0.375,
"learning_rate": 0.00017352131250807467,
"loss": 1.6144,
"step": 264
},
{
"epoch": 0.7817109144542773,
"grad_norm": 0.2890625,
"learning_rate": 0.00017330518718298264,
"loss": 1.5837,
"step": 265
},
{
"epoch": 0.7846607669616519,
"grad_norm": 0.32421875,
"learning_rate": 0.00017308831918122242,
"loss": 1.5924,
"step": 266
},
{
"epoch": 0.7876106194690266,
"grad_norm": 0.27734375,
"learning_rate": 0.0001728707106999482,
"loss": 1.7389,
"step": 267
},
{
"epoch": 0.7905604719764012,
"grad_norm": 0.2255859375,
"learning_rate": 0.00017265236394381633,
"loss": 1.6675,
"step": 268
},
{
"epoch": 0.7935103244837758,
"grad_norm": 0.298828125,
"learning_rate": 0.00017243328112496277,
"loss": 1.4277,
"step": 269
},
{
"epoch": 0.7964601769911505,
"grad_norm": 0.2158203125,
"learning_rate": 0.0001722134644629807,
"loss": 1.6197,
"step": 270
},
{
"epoch": 0.799410029498525,
"grad_norm": 0.275390625,
"learning_rate": 0.0001719929161848982,
"loss": 1.7374,
"step": 271
},
{
"epoch": 0.8023598820058997,
"grad_norm": 0.3203125,
"learning_rate": 0.00017177163852515553,
"loss": 1.8338,
"step": 272
},
{
"epoch": 0.8053097345132744,
"grad_norm": 0.25390625,
"learning_rate": 0.00017154963372558246,
"loss": 1.769,
"step": 273
},
{
"epoch": 0.8082595870206489,
"grad_norm": 0.31640625,
"learning_rate": 0.0001713269040353757,
"loss": 1.8441,
"step": 274
},
{
"epoch": 0.8112094395280236,
"grad_norm": 0.24609375,
"learning_rate": 0.0001711034517110761,
"loss": 1.5906,
"step": 275
},
{
"epoch": 0.8141592920353983,
"grad_norm": 0.306640625,
"learning_rate": 0.00017087927901654557,
"loss": 1.7079,
"step": 276
},
{
"epoch": 0.8171091445427728,
"grad_norm": 0.2431640625,
"learning_rate": 0.00017065438822294447,
"loss": 1.539,
"step": 277
},
{
"epoch": 0.8200589970501475,
"grad_norm": 0.26171875,
"learning_rate": 0.00017042878160870829,
"loss": 1.792,
"step": 278
},
{
"epoch": 0.8230088495575221,
"grad_norm": 0.25390625,
"learning_rate": 0.00017020246145952477,
"loss": 1.6786,
"step": 279
},
{
"epoch": 0.8259587020648967,
"grad_norm": 0.27734375,
"learning_rate": 0.00016997543006831082,
"loss": 1.5944,
"step": 280
},
{
"epoch": 0.8289085545722714,
"grad_norm": 0.2265625,
"learning_rate": 0.00016974768973518893,
"loss": 1.6119,
"step": 281
},
{
"epoch": 0.831858407079646,
"grad_norm": 0.2578125,
"learning_rate": 0.00016951924276746425,
"loss": 1.5559,
"step": 282
},
{
"epoch": 0.8348082595870207,
"grad_norm": 0.2236328125,
"learning_rate": 0.00016929009147960094,
"loss": 1.8518,
"step": 283
},
{
"epoch": 0.8377581120943953,
"grad_norm": 0.2333984375,
"learning_rate": 0.00016906023819319893,
"loss": 1.6339,
"step": 284
},
{
"epoch": 0.8407079646017699,
"grad_norm": 0.263671875,
"learning_rate": 0.00016882968523697028,
"loss": 1.5756,
"step": 285
},
{
"epoch": 0.8436578171091446,
"grad_norm": 0.25390625,
"learning_rate": 0.0001685984349467156,
"loss": 1.5814,
"step": 286
},
{
"epoch": 0.8466076696165191,
"grad_norm": 0.2431640625,
"learning_rate": 0.0001683664896653004,
"loss": 1.5515,
"step": 287
},
{
"epoch": 0.8495575221238938,
"grad_norm": 0.287109375,
"learning_rate": 0.00016813385174263137,
"loss": 1.7889,
"step": 288
},
{
"epoch": 0.8525073746312685,
"grad_norm": 0.3828125,
"learning_rate": 0.00016790052353563253,
"loss": 1.4825,
"step": 289
},
{
"epoch": 0.855457227138643,
"grad_norm": 0.302734375,
"learning_rate": 0.00016766650740822136,
"loss": 1.7253,
"step": 290
},
{
"epoch": 0.8584070796460177,
"grad_norm": 0.267578125,
"learning_rate": 0.00016743180573128495,
"loss": 1.5323,
"step": 291
},
{
"epoch": 0.8613569321533924,
"grad_norm": 0.392578125,
"learning_rate": 0.00016719642088265578,
"loss": 1.6399,
"step": 292
},
{
"epoch": 0.8643067846607669,
"grad_norm": 0.341796875,
"learning_rate": 0.0001669603552470879,
"loss": 1.605,
"step": 293
},
{
"epoch": 0.8672566371681416,
"grad_norm": 0.2734375,
"learning_rate": 0.00016672361121623238,
"loss": 1.6963,
"step": 294
},
{
"epoch": 0.8702064896755162,
"grad_norm": 0.2890625,
"learning_rate": 0.0001664861911886136,
"loss": 1.5511,
"step": 295
},
{
"epoch": 0.8731563421828908,
"grad_norm": 0.275390625,
"learning_rate": 0.00016624809756960444,
"loss": 1.6273,
"step": 296
},
{
"epoch": 0.8761061946902655,
"grad_norm": 0.26171875,
"learning_rate": 0.0001660093327714022,
"loss": 1.73,
"step": 297
},
{
"epoch": 0.8790560471976401,
"grad_norm": 0.2392578125,
"learning_rate": 0.00016576989921300418,
"loss": 1.7924,
"step": 298
},
{
"epoch": 0.8820058997050148,
"grad_norm": 0.25390625,
"learning_rate": 0.00016552979932018297,
"loss": 1.6071,
"step": 299
},
{
"epoch": 0.8849557522123894,
"grad_norm": 0.2578125,
"learning_rate": 0.00016528903552546207,
"loss": 1.5692,
"step": 300
},
{
"epoch": 0.887905604719764,
"grad_norm": 0.255859375,
"learning_rate": 0.0001650476102680911,
"loss": 1.5778,
"step": 301
},
{
"epoch": 0.8908554572271387,
"grad_norm": 0.2578125,
"learning_rate": 0.0001648055259940212,
"loss": 1.6021,
"step": 302
},
{
"epoch": 0.8938053097345132,
"grad_norm": 0.25,
"learning_rate": 0.00016456278515588024,
"loss": 1.6191,
"step": 303
},
{
"epoch": 0.8967551622418879,
"grad_norm": 0.271484375,
"learning_rate": 0.00016431939021294787,
"loss": 1.6039,
"step": 304
},
{
"epoch": 0.8997050147492626,
"grad_norm": 0.31640625,
"learning_rate": 0.00016407534363113073,
"loss": 1.7331,
"step": 305
},
{
"epoch": 0.9026548672566371,
"grad_norm": 0.291015625,
"learning_rate": 0.0001638306478829373,
"loss": 1.5992,
"step": 306
},
{
"epoch": 0.9056047197640118,
"grad_norm": 0.267578125,
"learning_rate": 0.00016358530544745308,
"loss": 1.5992,
"step": 307
},
{
"epoch": 0.9085545722713865,
"grad_norm": 0.267578125,
"learning_rate": 0.00016333931881031528,
"loss": 1.6499,
"step": 308
},
{
"epoch": 0.911504424778761,
"grad_norm": 0.27734375,
"learning_rate": 0.00016309269046368776,
"loss": 1.6018,
"step": 309
},
{
"epoch": 0.9144542772861357,
"grad_norm": 0.271484375,
"learning_rate": 0.00016284542290623567,
"loss": 1.5417,
"step": 310
},
{
"epoch": 0.9174041297935103,
"grad_norm": 0.263671875,
"learning_rate": 0.00016259751864310026,
"loss": 1.5422,
"step": 311
},
{
"epoch": 0.9203539823008849,
"grad_norm": 0.251953125,
"learning_rate": 0.00016234898018587337,
"loss": 1.6849,
"step": 312
},
{
"epoch": 0.9233038348082596,
"grad_norm": 0.26953125,
"learning_rate": 0.00016209981005257208,
"loss": 1.7059,
"step": 313
},
{
"epoch": 0.9262536873156342,
"grad_norm": 0.271484375,
"learning_rate": 0.0001618500107676132,
"loss": 1.6114,
"step": 314
},
{
"epoch": 0.9292035398230089,
"grad_norm": 0.443359375,
"learning_rate": 0.0001615995848617876,
"loss": 1.7133,
"step": 315
},
{
"epoch": 0.9321533923303835,
"grad_norm": 0.275390625,
"learning_rate": 0.00016134853487223465,
"loss": 1.6795,
"step": 316
},
{
"epoch": 0.9351032448377581,
"grad_norm": 0.255859375,
"learning_rate": 0.00016109686334241655,
"loss": 1.582,
"step": 317
},
{
"epoch": 0.9380530973451328,
"grad_norm": 0.271484375,
"learning_rate": 0.00016084457282209243,
"loss": 1.6309,
"step": 318
},
{
"epoch": 0.9410029498525073,
"grad_norm": 0.251953125,
"learning_rate": 0.0001605916658672927,
"loss": 1.5242,
"step": 319
},
{
"epoch": 0.943952802359882,
"grad_norm": 0.271484375,
"learning_rate": 0.00016033814504029292,
"loss": 1.5703,
"step": 320
},
{
"epoch": 0.9469026548672567,
"grad_norm": 0.2412109375,
"learning_rate": 0.00016008401290958807,
"loss": 1.6363,
"step": 321
},
{
"epoch": 0.9498525073746312,
"grad_norm": 0.279296875,
"learning_rate": 0.0001598292720498664,
"loss": 1.6733,
"step": 322
},
{
"epoch": 0.9528023598820059,
"grad_norm": 0.26953125,
"learning_rate": 0.00015957392504198337,
"loss": 1.8078,
"step": 323
},
{
"epoch": 0.9557522123893806,
"grad_norm": 0.2890625,
"learning_rate": 0.00015931797447293552,
"loss": 1.4982,
"step": 324
},
{
"epoch": 0.9587020648967551,
"grad_norm": 0.28125,
"learning_rate": 0.00015906142293583425,
"loss": 1.4663,
"step": 325
},
{
"epoch": 0.9616519174041298,
"grad_norm": 0.2314453125,
"learning_rate": 0.00015880427302987951,
"loss": 1.7379,
"step": 326
},
{
"epoch": 0.9646017699115044,
"grad_norm": 0.23046875,
"learning_rate": 0.00015854652736033354,
"loss": 1.5133,
"step": 327
},
{
"epoch": 0.967551622418879,
"grad_norm": 0.291015625,
"learning_rate": 0.00015828818853849444,
"loss": 1.5689,
"step": 328
},
{
"epoch": 0.9705014749262537,
"grad_norm": 0.27734375,
"learning_rate": 0.0001580292591816697,
"loss": 1.7375,
"step": 329
},
{
"epoch": 0.9734513274336283,
"grad_norm": 0.263671875,
"learning_rate": 0.0001577697419131497,
"loss": 1.7474,
"step": 330
},
{
"epoch": 0.976401179941003,
"grad_norm": 0.26171875,
"learning_rate": 0.00015750963936218105,
"loss": 1.5596,
"step": 331
},
{
"epoch": 0.9793510324483776,
"grad_norm": 0.46484375,
"learning_rate": 0.00015724895416394018,
"loss": 1.6763,
"step": 332
},
{
"epoch": 0.9823008849557522,
"grad_norm": 0.25390625,
"learning_rate": 0.00015698768895950642,
"loss": 1.609,
"step": 333
},
{
"epoch": 0.9852507374631269,
"grad_norm": 0.28515625,
"learning_rate": 0.00015672584639583528,
"loss": 1.5961,
"step": 334
},
{
"epoch": 0.9882005899705014,
"grad_norm": 0.2421875,
"learning_rate": 0.00015646342912573177,
"loss": 1.7245,
"step": 335
},
{
"epoch": 0.9911504424778761,
"grad_norm": 0.259765625,
"learning_rate": 0.00015620043980782327,
"loss": 1.4791,
"step": 336
},
{
"epoch": 0.9941002949852508,
"grad_norm": 0.25,
"learning_rate": 0.00015593688110653283,
"loss": 1.3877,
"step": 337
},
{
"epoch": 0.9970501474926253,
"grad_norm": 0.236328125,
"learning_rate": 0.00015567275569205218,
"loss": 1.6525,
"step": 338
},
{
"epoch": 1.0,
"grad_norm": 0.333984375,
"learning_rate": 0.00015540806624031442,
"loss": 1.523,
"step": 339
},
{
"epoch": 1.0029498525073746,
"grad_norm": 0.56640625,
"learning_rate": 0.00015514281543296715,
"loss": 1.3025,
"step": 340
},
{
"epoch": 1.0058997050147493,
"grad_norm": 0.296875,
"learning_rate": 0.00015487700595734536,
"loss": 1.3286,
"step": 341
},
{
"epoch": 1.008849557522124,
"grad_norm": 0.28515625,
"learning_rate": 0.0001546106405064438,
"loss": 1.2663,
"step": 342
},
{
"epoch": 1.0117994100294985,
"grad_norm": 0.26171875,
"learning_rate": 0.00015434372177889022,
"loss": 1.3448,
"step": 343
},
{
"epoch": 1.0147492625368733,
"grad_norm": 0.2451171875,
"learning_rate": 0.00015407625247891772,
"loss": 1.2221,
"step": 344
},
{
"epoch": 1.0176991150442478,
"grad_norm": 0.255859375,
"learning_rate": 0.00015380823531633729,
"loss": 1.2915,
"step": 345
},
{
"epoch": 1.0206489675516224,
"grad_norm": 0.2470703125,
"learning_rate": 0.0001535396730065106,
"loss": 1.1861,
"step": 346
},
{
"epoch": 1.023598820058997,
"grad_norm": 0.251953125,
"learning_rate": 0.0001532705682703224,
"loss": 1.2679,
"step": 347
},
{
"epoch": 1.0265486725663717,
"grad_norm": 0.2216796875,
"learning_rate": 0.00015300092383415282,
"loss": 1.226,
"step": 348
},
{
"epoch": 1.0294985250737463,
"grad_norm": 0.2265625,
"learning_rate": 0.00015273074242984987,
"loss": 1.4833,
"step": 349
},
{
"epoch": 1.0324483775811208,
"grad_norm": 0.26953125,
"learning_rate": 0.00015246002679470175,
"loss": 1.3283,
"step": 350
},
{
"epoch": 1.0353982300884956,
"grad_norm": 0.251953125,
"learning_rate": 0.0001521887796714092,
"loss": 1.4032,
"step": 351
},
{
"epoch": 1.0383480825958702,
"grad_norm": 0.23046875,
"learning_rate": 0.00015191700380805752,
"loss": 1.371,
"step": 352
},
{
"epoch": 1.0412979351032448,
"grad_norm": 0.2373046875,
"learning_rate": 0.0001516447019580889,
"loss": 1.3903,
"step": 353
},
{
"epoch": 1.0442477876106195,
"grad_norm": 0.30078125,
"learning_rate": 0.00015137187688027436,
"loss": 1.3544,
"step": 354
},
{
"epoch": 1.047197640117994,
"grad_norm": 0.31640625,
"learning_rate": 0.000151098531338686,
"loss": 1.3134,
"step": 355
},
{
"epoch": 1.0501474926253687,
"grad_norm": 0.23046875,
"learning_rate": 0.00015082466810266884,
"loss": 1.4383,
"step": 356
},
{
"epoch": 1.0530973451327434,
"grad_norm": 0.28125,
"learning_rate": 0.00015055028994681284,
"loss": 1.3968,
"step": 357
},
{
"epoch": 1.056047197640118,
"grad_norm": 0.283203125,
"learning_rate": 0.00015027539965092477,
"loss": 1.2906,
"step": 358
},
{
"epoch": 1.0589970501474926,
"grad_norm": 0.26171875,
"learning_rate": 0.00015000000000000001,
"loss": 1.3332,
"step": 359
},
{
"epoch": 1.0619469026548674,
"grad_norm": 0.271484375,
"learning_rate": 0.0001497240937841944,
"loss": 1.2405,
"step": 360
},
{
"epoch": 1.064896755162242,
"grad_norm": 0.234375,
"learning_rate": 0.00014944768379879591,
"loss": 1.4423,
"step": 361
},
{
"epoch": 1.0678466076696165,
"grad_norm": 0.251953125,
"learning_rate": 0.00014917077284419634,
"loss": 1.2753,
"step": 362
},
{
"epoch": 1.0707964601769913,
"grad_norm": 0.2412109375,
"learning_rate": 0.00014889336372586305,
"loss": 1.2715,
"step": 363
},
{
"epoch": 1.0737463126843658,
"grad_norm": 0.2578125,
"learning_rate": 0.00014861545925431036,
"loss": 1.2921,
"step": 364
},
{
"epoch": 1.0766961651917404,
"grad_norm": 0.296875,
"learning_rate": 0.00014833706224507114,
"loss": 1.421,
"step": 365
},
{
"epoch": 1.079646017699115,
"grad_norm": 0.240234375,
"learning_rate": 0.00014805817551866838,
"loss": 1.23,
"step": 366
},
{
"epoch": 1.0825958702064897,
"grad_norm": 0.263671875,
"learning_rate": 0.00014777880190058654,
"loss": 1.3333,
"step": 367
},
{
"epoch": 1.0855457227138643,
"grad_norm": 0.310546875,
"learning_rate": 0.0001474989442212428,
"loss": 1.3625,
"step": 368
},
{
"epoch": 1.0884955752212389,
"grad_norm": 0.25390625,
"learning_rate": 0.0001472186053159587,
"loss": 1.4502,
"step": 369
},
{
"epoch": 1.0914454277286136,
"grad_norm": 0.283203125,
"learning_rate": 0.00014693778802493104,
"loss": 1.4113,
"step": 370
},
{
"epoch": 1.0943952802359882,
"grad_norm": 0.240234375,
"learning_rate": 0.00014665649519320342,
"loss": 1.2237,
"step": 371
},
{
"epoch": 1.0973451327433628,
"grad_norm": 0.2373046875,
"learning_rate": 0.0001463747296706372,
"loss": 1.35,
"step": 372
},
{
"epoch": 1.1002949852507375,
"grad_norm": 0.27734375,
"learning_rate": 0.00014609249431188278,
"loss": 1.3157,
"step": 373
},
{
"epoch": 1.103244837758112,
"grad_norm": 0.283203125,
"learning_rate": 0.0001458097919763506,
"loss": 1.3981,
"step": 374
},
{
"epoch": 1.1061946902654867,
"grad_norm": 0.2109375,
"learning_rate": 0.0001455266255281821,
"loss": 1.2643,
"step": 375
},
{
"epoch": 1.1091445427728615,
"grad_norm": 0.26171875,
"learning_rate": 0.0001452429978362209,
"loss": 1.3088,
"step": 376
},
{
"epoch": 1.112094395280236,
"grad_norm": 0.2412109375,
"learning_rate": 0.00014495891177398353,
"loss": 1.3186,
"step": 377
},
{
"epoch": 1.1150442477876106,
"grad_norm": 0.240234375,
"learning_rate": 0.0001446743702196304,
"loss": 1.4,
"step": 378
},
{
"epoch": 1.1179941002949851,
"grad_norm": 0.2373046875,
"learning_rate": 0.0001443893760559367,
"loss": 1.3251,
"step": 379
},
{
"epoch": 1.12094395280236,
"grad_norm": 0.33203125,
"learning_rate": 0.00014410393217026318,
"loss": 1.3166,
"step": 380
},
{
"epoch": 1.1238938053097345,
"grad_norm": 0.2275390625,
"learning_rate": 0.0001438180414545267,
"loss": 1.2156,
"step": 381
},
{
"epoch": 1.1268436578171093,
"grad_norm": 0.2255859375,
"learning_rate": 0.00014353170680517132,
"loss": 1.4028,
"step": 382
},
{
"epoch": 1.1297935103244838,
"grad_norm": 0.2314453125,
"learning_rate": 0.00014324493112313844,
"loss": 1.5111,
"step": 383
},
{
"epoch": 1.1327433628318584,
"grad_norm": 0.244140625,
"learning_rate": 0.00014295771731383797,
"loss": 1.4437,
"step": 384
},
{
"epoch": 1.135693215339233,
"grad_norm": 0.271484375,
"learning_rate": 0.0001426700682871184,
"loss": 1.3703,
"step": 385
},
{
"epoch": 1.1386430678466077,
"grad_norm": 0.265625,
"learning_rate": 0.0001423819869572377,
"loss": 1.3076,
"step": 386
},
{
"epoch": 1.1415929203539823,
"grad_norm": 0.255859375,
"learning_rate": 0.0001420934762428335,
"loss": 1.2632,
"step": 387
},
{
"epoch": 1.1445427728613569,
"grad_norm": 0.482421875,
"learning_rate": 0.00014180453906689378,
"loss": 1.3056,
"step": 388
},
{
"epoch": 1.1474926253687316,
"grad_norm": 0.263671875,
"learning_rate": 0.00014151517835672697,
"loss": 1.2406,
"step": 389
},
{
"epoch": 1.1504424778761062,
"grad_norm": 0.240234375,
"learning_rate": 0.00014122539704393265,
"loss": 1.3779,
"step": 390
},
{
"epoch": 1.1533923303834808,
"grad_norm": 0.2734375,
"learning_rate": 0.00014093519806437148,
"loss": 1.3458,
"step": 391
},
{
"epoch": 1.1563421828908556,
"grad_norm": 0.265625,
"learning_rate": 0.00014064458435813566,
"loss": 1.2948,
"step": 392
},
{
"epoch": 1.1592920353982301,
"grad_norm": 0.314453125,
"learning_rate": 0.00014035355886951923,
"loss": 1.2493,
"step": 393
},
{
"epoch": 1.1622418879056047,
"grad_norm": 0.21484375,
"learning_rate": 0.00014006212454698797,
"loss": 1.2889,
"step": 394
},
{
"epoch": 1.1651917404129795,
"grad_norm": 0.5546875,
"learning_rate": 0.00013977028434314975,
"loss": 1.3204,
"step": 395
},
{
"epoch": 1.168141592920354,
"grad_norm": 0.330078125,
"learning_rate": 0.0001394780412147245,
"loss": 1.3548,
"step": 396
},
{
"epoch": 1.1710914454277286,
"grad_norm": 0.28515625,
"learning_rate": 0.0001391853981225144,
"loss": 1.5902,
"step": 397
},
{
"epoch": 1.1740412979351031,
"grad_norm": 0.2392578125,
"learning_rate": 0.00013889235803137364,
"loss": 1.2234,
"step": 398
},
{
"epoch": 1.176991150442478,
"grad_norm": 0.2578125,
"learning_rate": 0.00013859892391017865,
"loss": 1.199,
"step": 399
},
{
"epoch": 1.1799410029498525,
"grad_norm": 0.2373046875,
"learning_rate": 0.00013830509873179785,
"loss": 1.4765,
"step": 400
},
{
"epoch": 1.1799410029498525,
"eval_loss": 1.3133705854415894,
"eval_runtime": 32.5065,
"eval_samples_per_second": 30.763,
"eval_steps_per_second": 3.845,
"step": 400
},
{
"epoch": 1.182890855457227,
"grad_norm": 0.25390625,
"learning_rate": 0.00013801088547306148,
"loss": 1.2451,
"step": 401
},
{
"epoch": 1.1858407079646018,
"grad_norm": 0.2578125,
"learning_rate": 0.00013771628711473172,
"loss": 1.1576,
"step": 402
},
{
"epoch": 1.1887905604719764,
"grad_norm": 0.28515625,
"learning_rate": 0.00013742130664147218,
"loss": 1.1916,
"step": 403
},
{
"epoch": 1.191740412979351,
"grad_norm": 0.26171875,
"learning_rate": 0.00013712594704181784,
"loss": 1.2373,
"step": 404
},
{
"epoch": 1.1946902654867257,
"grad_norm": 0.25390625,
"learning_rate": 0.0001368302113081447,
"loss": 1.1996,
"step": 405
},
{
"epoch": 1.1976401179941003,
"grad_norm": 0.2421875,
"learning_rate": 0.00013653410243663952,
"loss": 1.5402,
"step": 406
},
{
"epoch": 1.2005899705014749,
"grad_norm": 0.259765625,
"learning_rate": 0.00013623762342726935,
"loss": 1.4833,
"step": 407
},
{
"epoch": 1.2035398230088497,
"grad_norm": 0.27734375,
"learning_rate": 0.00013594077728375128,
"loss": 1.318,
"step": 408
},
{
"epoch": 1.2064896755162242,
"grad_norm": 0.25,
"learning_rate": 0.00013564356701352198,
"loss": 1.3453,
"step": 409
},
{
"epoch": 1.2094395280235988,
"grad_norm": 0.2392578125,
"learning_rate": 0.00013534599562770714,
"loss": 1.2933,
"step": 410
},
{
"epoch": 1.2123893805309733,
"grad_norm": 0.267578125,
"learning_rate": 0.00013504806614109098,
"loss": 1.5392,
"step": 411
},
{
"epoch": 1.2153392330383481,
"grad_norm": 0.263671875,
"learning_rate": 0.00013474978157208592,
"loss": 1.3248,
"step": 412
},
{
"epoch": 1.2182890855457227,
"grad_norm": 0.2314453125,
"learning_rate": 0.00013445114494270154,
"loss": 1.3162,
"step": 413
},
{
"epoch": 1.2212389380530975,
"grad_norm": 0.25,
"learning_rate": 0.0001341521592785145,
"loss": 1.2872,
"step": 414
},
{
"epoch": 1.224188790560472,
"grad_norm": 0.248046875,
"learning_rate": 0.00013385282760863758,
"loss": 1.3864,
"step": 415
},
{
"epoch": 1.2271386430678466,
"grad_norm": 0.26171875,
"learning_rate": 0.00013355315296568893,
"loss": 1.2721,
"step": 416
},
{
"epoch": 1.2300884955752212,
"grad_norm": 0.2412109375,
"learning_rate": 0.0001332531383857616,
"loss": 1.3663,
"step": 417
},
{
"epoch": 1.233038348082596,
"grad_norm": 0.24609375,
"learning_rate": 0.0001329527869083926,
"loss": 1.2824,
"step": 418
},
{
"epoch": 1.2359882005899705,
"grad_norm": 0.259765625,
"learning_rate": 0.00013265210157653213,
"loss": 1.375,
"step": 419
},
{
"epoch": 1.238938053097345,
"grad_norm": 0.267578125,
"learning_rate": 0.00013235108543651272,
"loss": 1.5471,
"step": 420
},
{
"epoch": 1.2418879056047198,
"grad_norm": 0.267578125,
"learning_rate": 0.0001320497415380185,
"loss": 1.3125,
"step": 421
},
{
"epoch": 1.2448377581120944,
"grad_norm": 0.22265625,
"learning_rate": 0.00013174807293405428,
"loss": 1.3632,
"step": 422
},
{
"epoch": 1.247787610619469,
"grad_norm": 0.2578125,
"learning_rate": 0.00013144608268091435,
"loss": 1.3598,
"step": 423
},
{
"epoch": 1.2507374631268435,
"grad_norm": 0.2392578125,
"learning_rate": 0.00013114377383815188,
"loss": 1.3247,
"step": 424
},
{
"epoch": 1.2536873156342183,
"grad_norm": 0.287109375,
"learning_rate": 0.00013084114946854776,
"loss": 1.4319,
"step": 425
},
{
"epoch": 1.2566371681415929,
"grad_norm": 0.26171875,
"learning_rate": 0.00013053821263807946,
"loss": 1.2733,
"step": 426
},
{
"epoch": 1.2595870206489677,
"grad_norm": 0.296875,
"learning_rate": 0.00013023496641589025,
"loss": 1.3617,
"step": 427
},
{
"epoch": 1.2625368731563422,
"grad_norm": 0.244140625,
"learning_rate": 0.00012993141387425788,
"loss": 1.447,
"step": 428
},
{
"epoch": 1.2654867256637168,
"grad_norm": 0.2490234375,
"learning_rate": 0.00012962755808856342,
"loss": 1.1819,
"step": 429
},
{
"epoch": 1.2684365781710913,
"grad_norm": 0.2412109375,
"learning_rate": 0.0001293234021372603,
"loss": 1.3935,
"step": 430
},
{
"epoch": 1.2713864306784661,
"grad_norm": 0.298828125,
"learning_rate": 0.00012901894910184297,
"loss": 1.3463,
"step": 431
},
{
"epoch": 1.2743362831858407,
"grad_norm": 0.228515625,
"learning_rate": 0.00012871420206681571,
"loss": 1.2792,
"step": 432
},
{
"epoch": 1.2772861356932155,
"grad_norm": 0.392578125,
"learning_rate": 0.00012840916411966153,
"loss": 1.4517,
"step": 433
},
{
"epoch": 1.28023598820059,
"grad_norm": 0.232421875,
"learning_rate": 0.00012810383835081058,
"loss": 1.2504,
"step": 434
},
{
"epoch": 1.2831858407079646,
"grad_norm": 0.23046875,
"learning_rate": 0.00012779822785360912,
"loss": 1.246,
"step": 435
},
{
"epoch": 1.2861356932153392,
"grad_norm": 0.314453125,
"learning_rate": 0.00012749233572428804,
"loss": 1.3399,
"step": 436
},
{
"epoch": 1.289085545722714,
"grad_norm": 0.267578125,
"learning_rate": 0.00012718616506193151,
"loss": 1.3291,
"step": 437
},
{
"epoch": 1.2920353982300885,
"grad_norm": 0.263671875,
"learning_rate": 0.00012687971896844575,
"loss": 1.3622,
"step": 438
},
{
"epoch": 1.294985250737463,
"grad_norm": 0.248046875,
"learning_rate": 0.00012657300054852718,
"loss": 1.221,
"step": 439
},
{
"epoch": 1.2979351032448379,
"grad_norm": 0.26953125,
"learning_rate": 0.0001262660129096315,
"loss": 1.3629,
"step": 440
},
{
"epoch": 1.3008849557522124,
"grad_norm": 0.255859375,
"learning_rate": 0.00012595875916194188,
"loss": 1.3578,
"step": 441
},
{
"epoch": 1.303834808259587,
"grad_norm": 0.31640625,
"learning_rate": 0.0001256512424183373,
"loss": 1.2829,
"step": 442
},
{
"epoch": 1.3067846607669615,
"grad_norm": 0.2734375,
"learning_rate": 0.0001253434657943616,
"loss": 1.1864,
"step": 443
},
{
"epoch": 1.3097345132743363,
"grad_norm": 0.2431640625,
"learning_rate": 0.00012503543240819127,
"loss": 1.3679,
"step": 444
},
{
"epoch": 1.3126843657817109,
"grad_norm": 0.271484375,
"learning_rate": 0.00012472714538060422,
"loss": 1.445,
"step": 445
},
{
"epoch": 1.3156342182890857,
"grad_norm": 0.2431640625,
"learning_rate": 0.0001244186078349481,
"loss": 1.3589,
"step": 446
},
{
"epoch": 1.3185840707964602,
"grad_norm": 0.2373046875,
"learning_rate": 0.00012410982289710865,
"loss": 1.3212,
"step": 447
},
{
"epoch": 1.3215339233038348,
"grad_norm": 0.2431640625,
"learning_rate": 0.0001238007936954779,
"loss": 1.3721,
"step": 448
},
{
"epoch": 1.3244837758112094,
"grad_norm": 0.26953125,
"learning_rate": 0.0001234915233609227,
"loss": 1.2942,
"step": 449
},
{
"epoch": 1.3274336283185841,
"grad_norm": 0.296875,
"learning_rate": 0.00012318201502675285,
"loss": 1.2459,
"step": 450
},
{
"epoch": 1.3303834808259587,
"grad_norm": 0.255859375,
"learning_rate": 0.00012287227182868938,
"loss": 1.3887,
"step": 451
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.2431640625,
"learning_rate": 0.00012256229690483283,
"loss": 1.2222,
"step": 452
},
{
"epoch": 1.336283185840708,
"grad_norm": 0.3203125,
"learning_rate": 0.00012225209339563145,
"loss": 1.2916,
"step": 453
},
{
"epoch": 1.3392330383480826,
"grad_norm": 0.283203125,
"learning_rate": 0.00012194166444384927,
"loss": 1.324,
"step": 454
},
{
"epoch": 1.3421828908554572,
"grad_norm": 0.2734375,
"learning_rate": 0.00012163101319453436,
"loss": 1.4622,
"step": 455
},
{
"epoch": 1.3451327433628317,
"grad_norm": 0.23828125,
"learning_rate": 0.00012132014279498703,
"loss": 1.2422,
"step": 456
},
{
"epoch": 1.3480825958702065,
"grad_norm": 0.294921875,
"learning_rate": 0.00012100905639472779,
"loss": 1.3972,
"step": 457
},
{
"epoch": 1.351032448377581,
"grad_norm": 0.255859375,
"learning_rate": 0.00012069775714546559,
"loss": 1.3365,
"step": 458
},
{
"epoch": 1.3539823008849559,
"grad_norm": 0.240234375,
"learning_rate": 0.00012038624820106572,
"loss": 1.3369,
"step": 459
},
{
"epoch": 1.3569321533923304,
"grad_norm": 0.2236328125,
"learning_rate": 0.00012007453271751805,
"loss": 1.3419,
"step": 460
},
{
"epoch": 1.359882005899705,
"grad_norm": 0.2255859375,
"learning_rate": 0.00011976261385290486,
"loss": 1.3154,
"step": 461
},
{
"epoch": 1.3628318584070795,
"grad_norm": 0.232421875,
"learning_rate": 0.00011945049476736905,
"loss": 1.3711,
"step": 462
},
{
"epoch": 1.3657817109144543,
"grad_norm": 0.259765625,
"learning_rate": 0.00011913817862308194,
"loss": 1.2754,
"step": 463
},
{
"epoch": 1.368731563421829,
"grad_norm": 0.26171875,
"learning_rate": 0.00011882566858421135,
"loss": 1.31,
"step": 464
},
{
"epoch": 1.3716814159292037,
"grad_norm": 0.291015625,
"learning_rate": 0.00011851296781688952,
"loss": 1.2913,
"step": 465
},
{
"epoch": 1.3746312684365782,
"grad_norm": 0.291015625,
"learning_rate": 0.00011820007948918093,
"loss": 1.2989,
"step": 466
},
{
"epoch": 1.3775811209439528,
"grad_norm": 0.26171875,
"learning_rate": 0.00011788700677105037,
"loss": 1.3031,
"step": 467
},
{
"epoch": 1.3805309734513274,
"grad_norm": 0.259765625,
"learning_rate": 0.00011757375283433076,
"loss": 1.4073,
"step": 468
},
{
"epoch": 1.3834808259587021,
"grad_norm": 0.251953125,
"learning_rate": 0.00011726032085269092,
"loss": 1.3147,
"step": 469
},
{
"epoch": 1.3864306784660767,
"grad_norm": 0.25,
"learning_rate": 0.00011694671400160356,
"loss": 1.3423,
"step": 470
},
{
"epoch": 1.3893805309734513,
"grad_norm": 0.28515625,
"learning_rate": 0.00011663293545831302,
"loss": 1.3822,
"step": 471
},
{
"epoch": 1.392330383480826,
"grad_norm": 0.2265625,
"learning_rate": 0.00011631898840180309,
"loss": 1.3223,
"step": 472
},
{
"epoch": 1.3952802359882006,
"grad_norm": 0.25390625,
"learning_rate": 0.00011600487601276486,
"loss": 1.4917,
"step": 473
},
{
"epoch": 1.3982300884955752,
"grad_norm": 0.240234375,
"learning_rate": 0.00011569060147356441,
"loss": 1.4362,
"step": 474
},
{
"epoch": 1.4011799410029497,
"grad_norm": 0.2578125,
"learning_rate": 0.00011537616796821064,
"loss": 1.3115,
"step": 475
},
{
"epoch": 1.4041297935103245,
"grad_norm": 0.2470703125,
"learning_rate": 0.000115061578682323,
"loss": 1.2961,
"step": 476
},
{
"epoch": 1.407079646017699,
"grad_norm": 0.3046875,
"learning_rate": 0.00011474683680309912,
"loss": 1.5856,
"step": 477
},
{
"epoch": 1.4100294985250739,
"grad_norm": 0.283203125,
"learning_rate": 0.00011443194551928266,
"loss": 1.1359,
"step": 478
},
{
"epoch": 1.4129793510324484,
"grad_norm": 0.326171875,
"learning_rate": 0.000114116908021131,
"loss": 1.3312,
"step": 479
},
{
"epoch": 1.415929203539823,
"grad_norm": 0.265625,
"learning_rate": 0.00011380172750038269,
"loss": 1.2309,
"step": 480
},
{
"epoch": 1.4188790560471976,
"grad_norm": 0.251953125,
"learning_rate": 0.00011348640715022544,
"loss": 1.5125,
"step": 481
},
{
"epoch": 1.4218289085545723,
"grad_norm": 0.2294921875,
"learning_rate": 0.00011317095016526362,
"loss": 1.283,
"step": 482
},
{
"epoch": 1.424778761061947,
"grad_norm": 0.2412109375,
"learning_rate": 0.00011285535974148576,
"loss": 1.3261,
"step": 483
},
{
"epoch": 1.4277286135693215,
"grad_norm": 0.23828125,
"learning_rate": 0.00011253963907623235,
"loss": 1.1964,
"step": 484
},
{
"epoch": 1.4306784660766962,
"grad_norm": 0.2255859375,
"learning_rate": 0.00011222379136816345,
"loss": 1.2966,
"step": 485
},
{
"epoch": 1.4336283185840708,
"grad_norm": 0.21875,
"learning_rate": 0.00011190781981722623,
"loss": 1.321,
"step": 486
},
{
"epoch": 1.4365781710914454,
"grad_norm": 0.2392578125,
"learning_rate": 0.00011159172762462242,
"loss": 1.379,
"step": 487
},
{
"epoch": 1.43952802359882,
"grad_norm": 0.2490234375,
"learning_rate": 0.00011127551799277616,
"loss": 1.4737,
"step": 488
},
{
"epoch": 1.4424778761061947,
"grad_norm": 0.2265625,
"learning_rate": 0.00011095919412530136,
"loss": 1.4863,
"step": 489
},
{
"epoch": 1.4454277286135693,
"grad_norm": 0.29296875,
"learning_rate": 0.0001106427592269692,
"loss": 1.4278,
"step": 490
},
{
"epoch": 1.448377581120944,
"grad_norm": 0.27734375,
"learning_rate": 0.00011032621650367585,
"loss": 1.3267,
"step": 491
},
{
"epoch": 1.4513274336283186,
"grad_norm": 0.255859375,
"learning_rate": 0.00011000956916240985,
"loss": 1.3016,
"step": 492
},
{
"epoch": 1.4542772861356932,
"grad_norm": 0.26953125,
"learning_rate": 0.00010969282041121962,
"loss": 1.3081,
"step": 493
},
{
"epoch": 1.4572271386430677,
"grad_norm": 0.2216796875,
"learning_rate": 0.00010937597345918109,
"loss": 1.2362,
"step": 494
},
{
"epoch": 1.4601769911504425,
"grad_norm": 0.251953125,
"learning_rate": 0.00010905903151636501,
"loss": 1.3696,
"step": 495
},
{
"epoch": 1.463126843657817,
"grad_norm": 0.2412109375,
"learning_rate": 0.00010874199779380446,
"loss": 1.2046,
"step": 496
},
{
"epoch": 1.4660766961651919,
"grad_norm": 0.486328125,
"learning_rate": 0.0001084248755034625,
"loss": 1.3333,
"step": 497
},
{
"epoch": 1.4690265486725664,
"grad_norm": 0.287109375,
"learning_rate": 0.00010810766785819946,
"loss": 1.3915,
"step": 498
},
{
"epoch": 1.471976401179941,
"grad_norm": 0.25390625,
"learning_rate": 0.00010779037807174033,
"loss": 1.4227,
"step": 499
},
{
"epoch": 1.4749262536873156,
"grad_norm": 0.283203125,
"learning_rate": 0.00010747300935864243,
"loss": 1.3156,
"step": 500
},
{
"epoch": 1.4778761061946903,
"grad_norm": 0.2578125,
"learning_rate": 0.00010715556493426262,
"loss": 1.4177,
"step": 501
},
{
"epoch": 1.480825958702065,
"grad_norm": 0.302734375,
"learning_rate": 0.00010683804801472486,
"loss": 1.5573,
"step": 502
},
{
"epoch": 1.4837758112094395,
"grad_norm": 0.271484375,
"learning_rate": 0.00010652046181688751,
"loss": 1.5251,
"step": 503
},
{
"epoch": 1.4867256637168142,
"grad_norm": 0.271484375,
"learning_rate": 0.00010620280955831087,
"loss": 1.2344,
"step": 504
},
{
"epoch": 1.4896755162241888,
"grad_norm": 0.244140625,
"learning_rate": 0.00010588509445722459,
"loss": 1.3975,
"step": 505
},
{
"epoch": 1.4926253687315634,
"grad_norm": 0.263671875,
"learning_rate": 0.00010556731973249485,
"loss": 1.3213,
"step": 506
},
{
"epoch": 1.495575221238938,
"grad_norm": 0.263671875,
"learning_rate": 0.00010524948860359193,
"loss": 1.2601,
"step": 507
},
{
"epoch": 1.4985250737463127,
"grad_norm": 0.228515625,
"learning_rate": 0.00010493160429055766,
"loss": 1.3863,
"step": 508
},
{
"epoch": 1.5014749262536873,
"grad_norm": 0.30859375,
"learning_rate": 0.00010461367001397258,
"loss": 1.3607,
"step": 509
},
{
"epoch": 1.504424778761062,
"grad_norm": 0.2216796875,
"learning_rate": 0.00010429568899492348,
"loss": 1.3195,
"step": 510
},
{
"epoch": 1.5073746312684366,
"grad_norm": 0.244140625,
"learning_rate": 0.00010397766445497072,
"loss": 1.3834,
"step": 511
},
{
"epoch": 1.5103244837758112,
"grad_norm": 0.228515625,
"learning_rate": 0.00010365959961611553,
"loss": 1.3142,
"step": 512
},
{
"epoch": 1.5132743362831858,
"grad_norm": 0.27734375,
"learning_rate": 0.00010334149770076747,
"loss": 1.3302,
"step": 513
},
{
"epoch": 1.5162241887905603,
"grad_norm": 0.2490234375,
"learning_rate": 0.00010302336193171174,
"loss": 1.4499,
"step": 514
},
{
"epoch": 1.519174041297935,
"grad_norm": 0.2275390625,
"learning_rate": 0.00010270519553207642,
"loss": 1.1871,
"step": 515
},
{
"epoch": 1.5221238938053099,
"grad_norm": 0.27734375,
"learning_rate": 0.00010238700172530009,
"loss": 1.2039,
"step": 516
},
{
"epoch": 1.5250737463126844,
"grad_norm": 0.271484375,
"learning_rate": 0.00010206878373509887,
"loss": 1.2359,
"step": 517
},
{
"epoch": 1.528023598820059,
"grad_norm": 0.259765625,
"learning_rate": 0.00010175054478543393,
"loss": 1.2697,
"step": 518
},
{
"epoch": 1.5309734513274336,
"grad_norm": 0.2373046875,
"learning_rate": 0.00010143228810047875,
"loss": 1.3916,
"step": 519
},
{
"epoch": 1.5339233038348081,
"grad_norm": 0.2275390625,
"learning_rate": 0.00010111401690458654,
"loss": 1.339,
"step": 520
},
{
"epoch": 1.536873156342183,
"grad_norm": 0.2392578125,
"learning_rate": 0.00010079573442225759,
"loss": 1.3517,
"step": 521
},
{
"epoch": 1.5398230088495575,
"grad_norm": 0.2119140625,
"learning_rate": 0.00010047744387810632,
"loss": 1.3383,
"step": 522
},
{
"epoch": 1.5427728613569323,
"grad_norm": 0.2421875,
"learning_rate": 0.00010015914849682904,
"loss": 1.2532,
"step": 523
},
{
"epoch": 1.5457227138643068,
"grad_norm": 0.2451171875,
"learning_rate": 9.9840851503171e-05,
"loss": 1.2877,
"step": 524
},
{
"epoch": 1.5486725663716814,
"grad_norm": 0.2451171875,
"learning_rate": 9.952255612189368e-05,
"loss": 1.5834,
"step": 525
},
{
"epoch": 1.551622418879056,
"grad_norm": 0.23828125,
"learning_rate": 9.920426557774245e-05,
"loss": 1.3853,
"step": 526
},
{
"epoch": 1.5545722713864307,
"grad_norm": 0.240234375,
"learning_rate": 9.888598309541347e-05,
"loss": 1.3814,
"step": 527
},
{
"epoch": 1.5575221238938053,
"grad_norm": 0.2294921875,
"learning_rate": 9.856771189952126e-05,
"loss": 1.5154,
"step": 528
},
{
"epoch": 1.56047197640118,
"grad_norm": 0.25390625,
"learning_rate": 9.824945521456612e-05,
"loss": 1.3156,
"step": 529
},
{
"epoch": 1.5634218289085546,
"grad_norm": 0.263671875,
"learning_rate": 9.793121626490115e-05,
"loss": 1.3556,
"step": 530
},
{
"epoch": 1.5663716814159292,
"grad_norm": 0.2431640625,
"learning_rate": 9.761299827469992e-05,
"loss": 1.4218,
"step": 531
},
{
"epoch": 1.5693215339233038,
"grad_norm": 0.2333984375,
"learning_rate": 9.729480446792357e-05,
"loss": 1.3565,
"step": 532
},
{
"epoch": 1.5722713864306783,
"grad_norm": 0.2255859375,
"learning_rate": 9.69766380682883e-05,
"loss": 1.3742,
"step": 533
},
{
"epoch": 1.575221238938053,
"grad_norm": 0.2216796875,
"learning_rate": 9.665850229923258e-05,
"loss": 1.4162,
"step": 534
},
{
"epoch": 1.5781710914454279,
"grad_norm": 0.2373046875,
"learning_rate": 9.634040038388448e-05,
"loss": 1.3142,
"step": 535
},
{
"epoch": 1.5811209439528024,
"grad_norm": 0.2470703125,
"learning_rate": 9.602233554502931e-05,
"loss": 1.2675,
"step": 536
},
{
"epoch": 1.584070796460177,
"grad_norm": 0.2265625,
"learning_rate": 9.570431100507651e-05,
"loss": 1.336,
"step": 537
},
{
"epoch": 1.5870206489675516,
"grad_norm": 0.23046875,
"learning_rate": 9.538632998602745e-05,
"loss": 1.3952,
"step": 538
},
{
"epoch": 1.5899705014749261,
"grad_norm": 0.23046875,
"learning_rate": 9.506839570944238e-05,
"loss": 1.3981,
"step": 539
},
{
"epoch": 1.592920353982301,
"grad_norm": 0.216796875,
"learning_rate": 9.475051139640809e-05,
"loss": 1.2525,
"step": 540
},
{
"epoch": 1.5958702064896755,
"grad_norm": 0.251953125,
"learning_rate": 9.44326802675052e-05,
"loss": 1.2681,
"step": 541
},
{
"epoch": 1.5988200589970503,
"grad_norm": 0.2353515625,
"learning_rate": 9.411490554277541e-05,
"loss": 1.2269,
"step": 542
},
{
"epoch": 1.6017699115044248,
"grad_norm": 0.2197265625,
"learning_rate": 9.379719044168915e-05,
"loss": 1.1903,
"step": 543
},
{
"epoch": 1.6047197640117994,
"grad_norm": 0.2275390625,
"learning_rate": 9.34795381831125e-05,
"loss": 1.1826,
"step": 544
},
{
"epoch": 1.607669616519174,
"grad_norm": 0.2392578125,
"learning_rate": 9.316195198527518e-05,
"loss": 1.4403,
"step": 545
},
{
"epoch": 1.6106194690265485,
"grad_norm": 0.326171875,
"learning_rate": 9.28444350657374e-05,
"loss": 1.3918,
"step": 546
},
{
"epoch": 1.6135693215339233,
"grad_norm": 0.2734375,
"learning_rate": 9.252699064135758e-05,
"loss": 1.3245,
"step": 547
},
{
"epoch": 1.616519174041298,
"grad_norm": 0.2294921875,
"learning_rate": 9.220962192825968e-05,
"loss": 1.3868,
"step": 548
},
{
"epoch": 1.6194690265486726,
"grad_norm": 0.29296875,
"learning_rate": 9.189233214180056e-05,
"loss": 1.3523,
"step": 549
},
{
"epoch": 1.6224188790560472,
"grad_norm": 0.25,
"learning_rate": 9.157512449653751e-05,
"loss": 1.3435,
"step": 550
},
{
"epoch": 1.6253687315634218,
"grad_norm": 0.248046875,
"learning_rate": 9.125800220619558e-05,
"loss": 1.422,
"step": 551
},
{
"epoch": 1.6283185840707963,
"grad_norm": 0.2490234375,
"learning_rate": 9.094096848363502e-05,
"loss": 1.3882,
"step": 552
},
{
"epoch": 1.631268436578171,
"grad_norm": 0.248046875,
"learning_rate": 9.062402654081895e-05,
"loss": 1.334,
"step": 553
},
{
"epoch": 1.6342182890855457,
"grad_norm": 0.2412109375,
"learning_rate": 9.030717958878037e-05,
"loss": 1.504,
"step": 554
},
{
"epoch": 1.6371681415929205,
"grad_norm": 0.267578125,
"learning_rate": 8.999043083759017e-05,
"loss": 1.2428,
"step": 555
},
{
"epoch": 1.640117994100295,
"grad_norm": 0.2392578125,
"learning_rate": 8.967378349632415e-05,
"loss": 1.2815,
"step": 556
},
{
"epoch": 1.6430678466076696,
"grad_norm": 0.25,
"learning_rate": 8.935724077303083e-05,
"loss": 1.2689,
"step": 557
},
{
"epoch": 1.6460176991150441,
"grad_norm": 0.2314453125,
"learning_rate": 8.904080587469868e-05,
"loss": 1.3773,
"step": 558
},
{
"epoch": 1.648967551622419,
"grad_norm": 0.32421875,
"learning_rate": 8.872448200722385e-05,
"loss": 1.5014,
"step": 559
},
{
"epoch": 1.6519174041297935,
"grad_norm": 0.2158203125,
"learning_rate": 8.840827237537761e-05,
"loss": 1.3485,
"step": 560
},
{
"epoch": 1.6548672566371683,
"grad_norm": 0.255859375,
"learning_rate": 8.809218018277378e-05,
"loss": 1.2552,
"step": 561
},
{
"epoch": 1.6578171091445428,
"grad_norm": 0.244140625,
"learning_rate": 8.777620863183657e-05,
"loss": 1.2877,
"step": 562
},
{
"epoch": 1.6607669616519174,
"grad_norm": 0.2578125,
"learning_rate": 8.74603609237677e-05,
"loss": 1.3691,
"step": 563
},
{
"epoch": 1.663716814159292,
"grad_norm": 0.2294921875,
"learning_rate": 8.714464025851427e-05,
"loss": 1.3196,
"step": 564
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.2734375,
"learning_rate": 8.682904983473641e-05,
"loss": 1.4248,
"step": 565
},
{
"epoch": 1.6696165191740413,
"grad_norm": 0.216796875,
"learning_rate": 8.651359284977454e-05,
"loss": 1.3885,
"step": 566
},
{
"epoch": 1.672566371681416,
"grad_norm": 0.220703125,
"learning_rate": 8.619827249961733e-05,
"loss": 1.3231,
"step": 567
},
{
"epoch": 1.6755162241887906,
"grad_norm": 0.2353515625,
"learning_rate": 8.588309197886905e-05,
"loss": 1.3966,
"step": 568
},
{
"epoch": 1.6784660766961652,
"grad_norm": 0.267578125,
"learning_rate": 8.556805448071735e-05,
"loss": 1.2556,
"step": 569
},
{
"epoch": 1.6814159292035398,
"grad_norm": 0.216796875,
"learning_rate": 8.525316319690092e-05,
"loss": 1.3491,
"step": 570
},
{
"epoch": 1.6843657817109143,
"grad_norm": 0.263671875,
"learning_rate": 8.493842131767701e-05,
"loss": 1.2854,
"step": 571
},
{
"epoch": 1.6873156342182891,
"grad_norm": 0.2255859375,
"learning_rate": 8.462383203178938e-05,
"loss": 1.2996,
"step": 572
},
{
"epoch": 1.6902654867256637,
"grad_norm": 0.25,
"learning_rate": 8.430939852643558e-05,
"loss": 1.3843,
"step": 573
},
{
"epoch": 1.6932153392330385,
"grad_norm": 0.2275390625,
"learning_rate": 8.399512398723515e-05,
"loss": 1.2947,
"step": 574
},
{
"epoch": 1.696165191740413,
"grad_norm": 0.25390625,
"learning_rate": 8.368101159819693e-05,
"loss": 1.3611,
"step": 575
},
{
"epoch": 1.6991150442477876,
"grad_norm": 0.2451171875,
"learning_rate": 8.336706454168701e-05,
"loss": 1.3039,
"step": 576
},
{
"epoch": 1.7020648967551621,
"grad_norm": 0.26171875,
"learning_rate": 8.305328599839647e-05,
"loss": 1.3671,
"step": 577
},
{
"epoch": 1.7050147492625367,
"grad_norm": 0.232421875,
"learning_rate": 8.273967914730909e-05,
"loss": 1.3845,
"step": 578
},
{
"epoch": 1.7079646017699115,
"grad_norm": 0.251953125,
"learning_rate": 8.242624716566927e-05,
"loss": 1.2837,
"step": 579
},
{
"epoch": 1.7109144542772863,
"grad_norm": 0.2431640625,
"learning_rate": 8.211299322894965e-05,
"loss": 1.1784,
"step": 580
},
{
"epoch": 1.7138643067846608,
"grad_norm": 0.451171875,
"learning_rate": 8.17999205108191e-05,
"loss": 1.4403,
"step": 581
},
{
"epoch": 1.7168141592920354,
"grad_norm": 0.263671875,
"learning_rate": 8.148703218311053e-05,
"loss": 1.2429,
"step": 582
},
{
"epoch": 1.71976401179941,
"grad_norm": 0.265625,
"learning_rate": 8.117433141578866e-05,
"loss": 1.4779,
"step": 583
},
{
"epoch": 1.7227138643067845,
"grad_norm": 0.2490234375,
"learning_rate": 8.086182137691808e-05,
"loss": 1.289,
"step": 584
},
{
"epoch": 1.7256637168141593,
"grad_norm": 0.271484375,
"learning_rate": 8.054950523263096e-05,
"loss": 1.2747,
"step": 585
},
{
"epoch": 1.7286135693215339,
"grad_norm": 0.2451171875,
"learning_rate": 8.023738614709516e-05,
"loss": 1.1702,
"step": 586
},
{
"epoch": 1.7315634218289087,
"grad_norm": 0.2578125,
"learning_rate": 7.9925467282482e-05,
"loss": 1.3313,
"step": 587
},
{
"epoch": 1.7345132743362832,
"grad_norm": 0.24609375,
"learning_rate": 7.96137517989343e-05,
"loss": 1.3699,
"step": 588
},
{
"epoch": 1.7374631268436578,
"grad_norm": 0.2578125,
"learning_rate": 7.930224285453445e-05,
"loss": 1.3837,
"step": 589
},
{
"epoch": 1.7404129793510323,
"grad_norm": 0.240234375,
"learning_rate": 7.89909436052722e-05,
"loss": 1.3932,
"step": 590
},
{
"epoch": 1.7433628318584071,
"grad_norm": 0.2890625,
"learning_rate": 7.867985720501301e-05,
"loss": 1.4789,
"step": 591
},
{
"epoch": 1.7463126843657817,
"grad_norm": 0.2451171875,
"learning_rate": 7.836898680546569e-05,
"loss": 1.4004,
"step": 592
},
{
"epoch": 1.7492625368731565,
"grad_norm": 0.240234375,
"learning_rate": 7.805833555615077e-05,
"loss": 1.2428,
"step": 593
},
{
"epoch": 1.752212389380531,
"grad_norm": 0.251953125,
"learning_rate": 7.774790660436858e-05,
"loss": 1.3666,
"step": 594
},
{
"epoch": 1.7551622418879056,
"grad_norm": 0.2275390625,
"learning_rate": 7.743770309516715e-05,
"loss": 1.1606,
"step": 595
},
{
"epoch": 1.7581120943952802,
"grad_norm": 0.2294921875,
"learning_rate": 7.712772817131064e-05,
"loss": 1.1711,
"step": 596
},
{
"epoch": 1.7610619469026547,
"grad_norm": 0.25,
"learning_rate": 7.681798497324716e-05,
"loss": 1.3226,
"step": 597
},
{
"epoch": 1.7640117994100295,
"grad_norm": 0.251953125,
"learning_rate": 7.650847663907733e-05,
"loss": 1.2195,
"step": 598
},
{
"epoch": 1.7669616519174043,
"grad_norm": 0.2333984375,
"learning_rate": 7.619920630452214e-05,
"loss": 1.3322,
"step": 599
},
{
"epoch": 1.7699115044247788,
"grad_norm": 0.251953125,
"learning_rate": 7.589017710289139e-05,
"loss": 1.407,
"step": 600
},
{
"epoch": 1.7699115044247788,
"eval_loss": 1.2553775310516357,
"eval_runtime": 31.6806,
"eval_samples_per_second": 31.565,
"eval_steps_per_second": 3.946,
"step": 600
},
{
"epoch": 1.7728613569321534,
"grad_norm": 0.2275390625,
"learning_rate": 7.558139216505192e-05,
"loss": 1.342,
"step": 601
},
{
"epoch": 1.775811209439528,
"grad_norm": 0.275390625,
"learning_rate": 7.527285461939577e-05,
"loss": 1.2776,
"step": 602
},
{
"epoch": 1.7787610619469025,
"grad_norm": 0.24609375,
"learning_rate": 7.496456759180875e-05,
"loss": 1.338,
"step": 603
},
{
"epoch": 1.7817109144542773,
"grad_norm": 0.259765625,
"learning_rate": 7.465653420563845e-05,
"loss": 1.303,
"step": 604
},
{
"epoch": 1.7846607669616519,
"grad_norm": 0.25390625,
"learning_rate": 7.434875758166271e-05,
"loss": 1.2726,
"step": 605
},
{
"epoch": 1.7876106194690267,
"grad_norm": 0.2412109375,
"learning_rate": 7.404124083805819e-05,
"loss": 1.3422,
"step": 606
},
{
"epoch": 1.7905604719764012,
"grad_norm": 0.25390625,
"learning_rate": 7.373398709036849e-05,
"loss": 1.3315,
"step": 607
},
{
"epoch": 1.7935103244837758,
"grad_norm": 0.234375,
"learning_rate": 7.342699945147282e-05,
"loss": 1.3869,
"step": 608
},
{
"epoch": 1.7964601769911503,
"grad_norm": 0.22265625,
"learning_rate": 7.312028103155426e-05,
"loss": 1.4195,
"step": 609
},
{
"epoch": 1.799410029498525,
"grad_norm": 0.21875,
"learning_rate": 7.281383493806848e-05,
"loss": 1.4052,
"step": 610
},
{
"epoch": 1.8023598820058997,
"grad_norm": 0.2373046875,
"learning_rate": 7.2507664275712e-05,
"loss": 1.2198,
"step": 611
},
{
"epoch": 1.8053097345132745,
"grad_norm": 0.2470703125,
"learning_rate": 7.220177214639088e-05,
"loss": 1.3929,
"step": 612
},
{
"epoch": 1.808259587020649,
"grad_norm": 0.2578125,
"learning_rate": 7.189616164918943e-05,
"loss": 1.5003,
"step": 613
},
{
"epoch": 1.8112094395280236,
"grad_norm": 0.244140625,
"learning_rate": 7.159083588033848e-05,
"loss": 1.3234,
"step": 614
},
{
"epoch": 1.8141592920353982,
"grad_norm": 0.2294921875,
"learning_rate": 7.128579793318428e-05,
"loss": 1.372,
"step": 615
},
{
"epoch": 1.8171091445427727,
"grad_norm": 0.244140625,
"learning_rate": 7.098105089815707e-05,
"loss": 1.5301,
"step": 616
},
{
"epoch": 1.8200589970501475,
"grad_norm": 0.337890625,
"learning_rate": 7.067659786273974e-05,
"loss": 1.5291,
"step": 617
},
{
"epoch": 1.823008849557522,
"grad_norm": 0.2353515625,
"learning_rate": 7.037244191143661e-05,
"loss": 1.3474,
"step": 618
},
{
"epoch": 1.8259587020648969,
"grad_norm": 0.251953125,
"learning_rate": 7.006858612574215e-05,
"loss": 1.4001,
"step": 619
},
{
"epoch": 1.8289085545722714,
"grad_norm": 0.244140625,
"learning_rate": 6.976503358410976e-05,
"loss": 1.313,
"step": 620
},
{
"epoch": 1.831858407079646,
"grad_norm": 0.2578125,
"learning_rate": 6.946178736192053e-05,
"loss": 1.3431,
"step": 621
},
{
"epoch": 1.8348082595870205,
"grad_norm": 0.2138671875,
"learning_rate": 6.915885053145228e-05,
"loss": 1.1921,
"step": 622
},
{
"epoch": 1.8377581120943953,
"grad_norm": 0.23828125,
"learning_rate": 6.885622616184817e-05,
"loss": 1.4085,
"step": 623
},
{
"epoch": 1.8407079646017699,
"grad_norm": 0.2578125,
"learning_rate": 6.855391731908567e-05,
"loss": 1.4938,
"step": 624
},
{
"epoch": 1.8436578171091447,
"grad_norm": 0.2412109375,
"learning_rate": 6.825192706594575e-05,
"loss": 1.1814,
"step": 625
},
{
"epoch": 1.8466076696165192,
"grad_norm": 0.267578125,
"learning_rate": 6.795025846198148e-05,
"loss": 1.2625,
"step": 626
},
{
"epoch": 1.8495575221238938,
"grad_norm": 0.21484375,
"learning_rate": 6.764891456348729e-05,
"loss": 1.3344,
"step": 627
},
{
"epoch": 1.8525073746312684,
"grad_norm": 0.267578125,
"learning_rate": 6.734789842346791e-05,
"loss": 1.2333,
"step": 628
},
{
"epoch": 1.855457227138643,
"grad_norm": 0.251953125,
"learning_rate": 6.704721309160743e-05,
"loss": 1.465,
"step": 629
},
{
"epoch": 1.8584070796460177,
"grad_norm": 0.251953125,
"learning_rate": 6.674686161423843e-05,
"loss": 1.178,
"step": 630
},
{
"epoch": 1.8613569321533925,
"grad_norm": 0.275390625,
"learning_rate": 6.644684703431108e-05,
"loss": 1.4338,
"step": 631
},
{
"epoch": 1.864306784660767,
"grad_norm": 0.2490234375,
"learning_rate": 6.614717239136246e-05,
"loss": 1.4185,
"step": 632
},
{
"epoch": 1.8672566371681416,
"grad_norm": 0.2373046875,
"learning_rate": 6.584784072148555e-05,
"loss": 1.549,
"step": 633
},
{
"epoch": 1.8702064896755162,
"grad_norm": 0.240234375,
"learning_rate": 6.554885505729849e-05,
"loss": 1.2655,
"step": 634
},
{
"epoch": 1.8731563421828907,
"grad_norm": 0.26171875,
"learning_rate": 6.525021842791414e-05,
"loss": 1.3102,
"step": 635
},
{
"epoch": 1.8761061946902655,
"grad_norm": 0.263671875,
"learning_rate": 6.495193385890901e-05,
"loss": 1.1572,
"step": 636
},
{
"epoch": 1.87905604719764,
"grad_norm": 0.2451171875,
"learning_rate": 6.46540043722929e-05,
"loss": 1.2666,
"step": 637
},
{
"epoch": 1.8820058997050149,
"grad_norm": 0.298828125,
"learning_rate": 6.435643298647802e-05,
"loss": 1.4374,
"step": 638
},
{
"epoch": 1.8849557522123894,
"grad_norm": 0.251953125,
"learning_rate": 6.405922271624874e-05,
"loss": 1.3966,
"step": 639
},
{
"epoch": 1.887905604719764,
"grad_norm": 0.23828125,
"learning_rate": 6.37623765727307e-05,
"loss": 1.2989,
"step": 640
},
{
"epoch": 1.8908554572271385,
"grad_norm": 0.21875,
"learning_rate": 6.34658975633605e-05,
"loss": 1.3905,
"step": 641
},
{
"epoch": 1.893805309734513,
"grad_norm": 0.240234375,
"learning_rate": 6.316978869185532e-05,
"loss": 1.2243,
"step": 642
},
{
"epoch": 1.896755162241888,
"grad_norm": 0.28125,
"learning_rate": 6.287405295818215e-05,
"loss": 1.3082,
"step": 643
},
{
"epoch": 1.8997050147492627,
"grad_norm": 0.259765625,
"learning_rate": 6.257869335852782e-05,
"loss": 1.5845,
"step": 644
},
{
"epoch": 1.9026548672566372,
"grad_norm": 0.29296875,
"learning_rate": 6.22837128852683e-05,
"loss": 1.2672,
"step": 645
},
{
"epoch": 1.9056047197640118,
"grad_norm": 0.216796875,
"learning_rate": 6.198911452693853e-05,
"loss": 1.3732,
"step": 646
},
{
"epoch": 1.9085545722713864,
"grad_norm": 0.2041015625,
"learning_rate": 6.169490126820221e-05,
"loss": 1.1159,
"step": 647
},
{
"epoch": 1.911504424778761,
"grad_norm": 0.251953125,
"learning_rate": 6.140107608982136e-05,
"loss": 1.3654,
"step": 648
},
{
"epoch": 1.9144542772861357,
"grad_norm": 0.23046875,
"learning_rate": 6.110764196862638e-05,
"loss": 1.2467,
"step": 649
},
{
"epoch": 1.9174041297935103,
"grad_norm": 0.2451171875,
"learning_rate": 6.08146018774856e-05,
"loss": 1.429,
"step": 650
},
{
"epoch": 1.920353982300885,
"grad_norm": 0.2392578125,
"learning_rate": 6.05219587852755e-05,
"loss": 1.3321,
"step": 651
},
{
"epoch": 1.9233038348082596,
"grad_norm": 0.2138671875,
"learning_rate": 6.0229715656850305e-05,
"loss": 1.423,
"step": 652
},
{
"epoch": 1.9262536873156342,
"grad_norm": 0.2255859375,
"learning_rate": 5.993787545301204e-05,
"loss": 1.4881,
"step": 653
},
{
"epoch": 1.9292035398230087,
"grad_norm": 0.208984375,
"learning_rate": 5.964644113048079e-05,
"loss": 1.1902,
"step": 654
},
{
"epoch": 1.9321533923303835,
"grad_norm": 0.2255859375,
"learning_rate": 5.9355415641864334e-05,
"loss": 1.293,
"step": 655
},
{
"epoch": 1.935103244837758,
"grad_norm": 0.267578125,
"learning_rate": 5.9064801935628555e-05,
"loss": 1.3434,
"step": 656
},
{
"epoch": 1.9380530973451329,
"grad_norm": 0.2314453125,
"learning_rate": 5.877460295606738e-05,
"loss": 1.2815,
"step": 657
},
{
"epoch": 1.9410029498525074,
"grad_norm": 0.220703125,
"learning_rate": 5.8484821643273036e-05,
"loss": 1.3688,
"step": 658
},
{
"epoch": 1.943952802359882,
"grad_norm": 0.2333984375,
"learning_rate": 5.819546093310627e-05,
"loss": 1.2988,
"step": 659
},
{
"epoch": 1.9469026548672566,
"grad_norm": 0.265625,
"learning_rate": 5.790652375716652e-05,
"loss": 1.2676,
"step": 660
},
{
"epoch": 1.9498525073746311,
"grad_norm": 0.2470703125,
"learning_rate": 5.761801304276232e-05,
"loss": 1.335,
"step": 661
},
{
"epoch": 1.952802359882006,
"grad_norm": 0.283203125,
"learning_rate": 5.732993171288159e-05,
"loss": 1.6914,
"step": 662
},
{
"epoch": 1.9557522123893807,
"grad_norm": 0.287109375,
"learning_rate": 5.704228268616208e-05,
"loss": 1.5256,
"step": 663
},
{
"epoch": 1.9587020648967552,
"grad_norm": 0.2255859375,
"learning_rate": 5.675506887686157e-05,
"loss": 1.2951,
"step": 664
},
{
"epoch": 1.9616519174041298,
"grad_norm": 0.259765625,
"learning_rate": 5.6468293194828715e-05,
"loss": 1.3159,
"step": 665
},
{
"epoch": 1.9646017699115044,
"grad_norm": 0.2578125,
"learning_rate": 5.6181958545473325e-05,
"loss": 1.3121,
"step": 666
},
{
"epoch": 1.967551622418879,
"grad_norm": 0.24609375,
"learning_rate": 5.589606782973683e-05,
"loss": 1.3022,
"step": 667
},
{
"epoch": 1.9705014749262537,
"grad_norm": 0.2392578125,
"learning_rate": 5.5610623944063325e-05,
"loss": 1.2682,
"step": 668
},
{
"epoch": 1.9734513274336283,
"grad_norm": 0.255859375,
"learning_rate": 5.5325629780369635e-05,
"loss": 1.2662,
"step": 669
},
{
"epoch": 1.976401179941003,
"grad_norm": 0.2373046875,
"learning_rate": 5.50410882260165e-05,
"loss": 1.2366,
"step": 670
},
{
"epoch": 1.9793510324483776,
"grad_norm": 0.234375,
"learning_rate": 5.4757002163779136e-05,
"loss": 1.3567,
"step": 671
},
{
"epoch": 1.9823008849557522,
"grad_norm": 0.2734375,
"learning_rate": 5.4473374471817906e-05,
"loss": 1.313,
"step": 672
},
{
"epoch": 1.9852507374631267,
"grad_norm": 0.240234375,
"learning_rate": 5.41902080236494e-05,
"loss": 1.5024,
"step": 673
},
{
"epoch": 1.9882005899705013,
"grad_norm": 0.2265625,
"learning_rate": 5.39075056881172e-05,
"loss": 1.4246,
"step": 674
},
{
"epoch": 1.991150442477876,
"grad_norm": 0.220703125,
"learning_rate": 5.362527032936277e-05,
"loss": 1.1787,
"step": 675
},
{
"epoch": 1.9941002949852509,
"grad_norm": 0.21484375,
"learning_rate": 5.334350480679662e-05,
"loss": 1.1573,
"step": 676
},
{
"epoch": 1.9970501474926254,
"grad_norm": 0.23828125,
"learning_rate": 5.306221197506899e-05,
"loss": 1.2169,
"step": 677
},
{
"epoch": 2.0,
"grad_norm": 0.291015625,
"learning_rate": 5.278139468404133e-05,
"loss": 1.3579,
"step": 678
},
{
"epoch": 2.0029498525073746,
"grad_norm": 0.373046875,
"learning_rate": 5.2501055778757194e-05,
"loss": 1.2721,
"step": 679
},
{
"epoch": 2.005899705014749,
"grad_norm": 0.28125,
"learning_rate": 5.22211980994135e-05,
"loss": 1.2051,
"step": 680
},
{
"epoch": 2.0088495575221237,
"grad_norm": 0.388671875,
"learning_rate": 5.1941824481331626e-05,
"loss": 1.2817,
"step": 681
},
{
"epoch": 2.0117994100294987,
"grad_norm": 0.306640625,
"learning_rate": 5.166293775492887e-05,
"loss": 1.1325,
"step": 682
},
{
"epoch": 2.0147492625368733,
"grad_norm": 0.341796875,
"learning_rate": 5.13845407456897e-05,
"loss": 1.2806,
"step": 683
},
{
"epoch": 2.017699115044248,
"grad_norm": 0.24609375,
"learning_rate": 5.110663627413694e-05,
"loss": 1.3581,
"step": 684
},
{
"epoch": 2.0206489675516224,
"grad_norm": 0.291015625,
"learning_rate": 5.082922715580367e-05,
"loss": 1.3405,
"step": 685
},
{
"epoch": 2.023598820058997,
"grad_norm": 0.263671875,
"learning_rate": 5.055231620120413e-05,
"loss": 1.2616,
"step": 686
},
{
"epoch": 2.0265486725663715,
"grad_norm": 0.251953125,
"learning_rate": 5.0275906215805625e-05,
"loss": 1.2979,
"step": 687
},
{
"epoch": 2.0294985250737465,
"grad_norm": 0.28515625,
"learning_rate": 5.000000000000002e-05,
"loss": 1.3254,
"step": 688
},
{
"epoch": 2.032448377581121,
"grad_norm": 0.271484375,
"learning_rate": 4.972460034907524e-05,
"loss": 1.1788,
"step": 689
},
{
"epoch": 2.0353982300884956,
"grad_norm": 0.294921875,
"learning_rate": 4.944971005318716e-05,
"loss": 1.1345,
"step": 690
},
{
"epoch": 2.03834808259587,
"grad_norm": 0.2265625,
"learning_rate": 4.9175331897331154e-05,
"loss": 1.0673,
"step": 691
},
{
"epoch": 2.0412979351032448,
"grad_norm": 0.232421875,
"learning_rate": 4.890146866131403e-05,
"loss": 1.219,
"step": 692
},
{
"epoch": 2.0442477876106193,
"grad_norm": 0.259765625,
"learning_rate": 4.862812311972567e-05,
"loss": 1.1894,
"step": 693
},
{
"epoch": 2.047197640117994,
"grad_norm": 0.2294921875,
"learning_rate": 4.8355298041911125e-05,
"loss": 1.2981,
"step": 694
},
{
"epoch": 2.050147492625369,
"grad_norm": 0.267578125,
"learning_rate": 4.808299619194251e-05,
"loss": 1.1437,
"step": 695
},
{
"epoch": 2.0530973451327434,
"grad_norm": 0.236328125,
"learning_rate": 4.781122032859079e-05,
"loss": 1.1269,
"step": 696
},
{
"epoch": 2.056047197640118,
"grad_norm": 0.240234375,
"learning_rate": 4.753997320529827e-05,
"loss": 1.214,
"step": 697
},
{
"epoch": 2.0589970501474926,
"grad_norm": 0.2255859375,
"learning_rate": 4.726925757015017e-05,
"loss": 1.2819,
"step": 698
},
{
"epoch": 2.061946902654867,
"grad_norm": 0.2294921875,
"learning_rate": 4.699907616584721e-05,
"loss": 1.4018,
"step": 699
},
{
"epoch": 2.0648967551622417,
"grad_norm": 0.2431640625,
"learning_rate": 4.672943172967764e-05,
"loss": 1.2867,
"step": 700
},
{
"epoch": 2.0678466076696167,
"grad_norm": 0.234375,
"learning_rate": 4.6460326993489414e-05,
"loss": 1.254,
"step": 701
},
{
"epoch": 2.0707964601769913,
"grad_norm": 0.255859375,
"learning_rate": 4.6191764683662744e-05,
"loss": 1.2451,
"step": 702
},
{
"epoch": 2.073746312684366,
"grad_norm": 0.2392578125,
"learning_rate": 4.592374752108231e-05,
"loss": 1.4226,
"step": 703
},
{
"epoch": 2.0766961651917404,
"grad_norm": 0.2421875,
"learning_rate": 4.5656278221109804e-05,
"loss": 1.2108,
"step": 704
},
{
"epoch": 2.079646017699115,
"grad_norm": 0.216796875,
"learning_rate": 4.538935949355623e-05,
"loss": 1.2962,
"step": 705
},
{
"epoch": 2.0825958702064895,
"grad_norm": 0.24609375,
"learning_rate": 4.512299404265469e-05,
"loss": 1.1781,
"step": 706
},
{
"epoch": 2.0855457227138645,
"grad_norm": 0.2451171875,
"learning_rate": 4.485718456703284e-05,
"loss": 1.406,
"step": 707
},
{
"epoch": 2.088495575221239,
"grad_norm": 0.2490234375,
"learning_rate": 4.45919337596856e-05,
"loss": 1.1973,
"step": 708
},
{
"epoch": 2.0914454277286136,
"grad_norm": 0.2236328125,
"learning_rate": 4.432724430794786e-05,
"loss": 1.2112,
"step": 709
},
{
"epoch": 2.094395280235988,
"grad_norm": 0.7421875,
"learning_rate": 4.406311889346717e-05,
"loss": 1.3468,
"step": 710
},
{
"epoch": 2.0973451327433628,
"grad_norm": 0.2197265625,
"learning_rate": 4.379956019217675e-05,
"loss": 1.2393,
"step": 711
},
{
"epoch": 2.1002949852507373,
"grad_norm": 0.255859375,
"learning_rate": 4.3536570874268266e-05,
"loss": 1.2253,
"step": 712
},
{
"epoch": 2.103244837758112,
"grad_norm": 0.26953125,
"learning_rate": 4.327415360416468e-05,
"loss": 1.1754,
"step": 713
},
{
"epoch": 2.106194690265487,
"grad_norm": 0.234375,
"learning_rate": 4.3012311040493594e-05,
"loss": 1.3074,
"step": 714
},
{
"epoch": 2.1091445427728615,
"grad_norm": 0.2490234375,
"learning_rate": 4.275104583605982e-05,
"loss": 1.2728,
"step": 715
},
{
"epoch": 2.112094395280236,
"grad_norm": 0.310546875,
"learning_rate": 4.249036063781896e-05,
"loss": 1.2145,
"step": 716
},
{
"epoch": 2.1150442477876106,
"grad_norm": 0.251953125,
"learning_rate": 4.2230258086850374e-05,
"loss": 1.3247,
"step": 717
},
{
"epoch": 2.117994100294985,
"grad_norm": 0.2255859375,
"learning_rate": 4.197074081833033e-05,
"loss": 1.2938,
"step": 718
},
{
"epoch": 2.1209439528023597,
"grad_norm": 0.2734375,
"learning_rate": 4.171181146150557e-05,
"loss": 1.2979,
"step": 719
},
{
"epoch": 2.1238938053097347,
"grad_norm": 0.2314453125,
"learning_rate": 4.1453472639666457e-05,
"loss": 1.1638,
"step": 720
},
{
"epoch": 2.1268436578171093,
"grad_norm": 0.21875,
"learning_rate": 4.1195726970120516e-05,
"loss": 1.1842,
"step": 721
},
{
"epoch": 2.129793510324484,
"grad_norm": 0.244140625,
"learning_rate": 4.093857706416577e-05,
"loss": 1.1431,
"step": 722
},
{
"epoch": 2.1327433628318584,
"grad_norm": 0.2216796875,
"learning_rate": 4.0682025527064486e-05,
"loss": 1.0936,
"step": 723
},
{
"epoch": 2.135693215339233,
"grad_norm": 0.2353515625,
"learning_rate": 4.042607495801667e-05,
"loss": 1.2056,
"step": 724
},
{
"epoch": 2.1386430678466075,
"grad_norm": 0.251953125,
"learning_rate": 4.017072795013359e-05,
"loss": 1.2524,
"step": 725
},
{
"epoch": 2.1415929203539825,
"grad_norm": 0.248046875,
"learning_rate": 3.991598709041196e-05,
"loss": 1.2026,
"step": 726
},
{
"epoch": 2.144542772861357,
"grad_norm": 0.25,
"learning_rate": 3.96618549597071e-05,
"loss": 1.1538,
"step": 727
},
{
"epoch": 2.1474926253687316,
"grad_norm": 0.2109375,
"learning_rate": 3.9408334132707315e-05,
"loss": 1.1865,
"step": 728
},
{
"epoch": 2.150442477876106,
"grad_norm": 0.244140625,
"learning_rate": 3.915542717790759e-05,
"loss": 1.3044,
"step": 729
},
{
"epoch": 2.1533923303834808,
"grad_norm": 0.2236328125,
"learning_rate": 3.890313665758348e-05,
"loss": 1.3334,
"step": 730
},
{
"epoch": 2.1563421828908553,
"grad_norm": 0.2490234375,
"learning_rate": 3.865146512776537e-05,
"loss": 1.3032,
"step": 731
},
{
"epoch": 2.15929203539823,
"grad_norm": 0.22265625,
"learning_rate": 3.840041513821243e-05,
"loss": 1.2531,
"step": 732
},
{
"epoch": 2.162241887905605,
"grad_norm": 0.2412109375,
"learning_rate": 3.814998923238685e-05,
"loss": 1.1855,
"step": 733
},
{
"epoch": 2.1651917404129795,
"grad_norm": 0.287109375,
"learning_rate": 3.7900189947427944e-05,
"loss": 1.322,
"step": 734
},
{
"epoch": 2.168141592920354,
"grad_norm": 0.2431640625,
"learning_rate": 3.7651019814126654e-05,
"loss": 1.1993,
"step": 735
},
{
"epoch": 2.1710914454277286,
"grad_norm": 0.2314453125,
"learning_rate": 3.740248135689975e-05,
"loss": 1.2312,
"step": 736
},
{
"epoch": 2.174041297935103,
"grad_norm": 0.2412109375,
"learning_rate": 3.7154577093764334e-05,
"loss": 1.1922,
"step": 737
},
{
"epoch": 2.1769911504424777,
"grad_norm": 0.259765625,
"learning_rate": 3.6907309536312276e-05,
"loss": 1.3864,
"step": 738
},
{
"epoch": 2.1799410029498527,
"grad_norm": 0.2255859375,
"learning_rate": 3.666068118968474e-05,
"loss": 1.3758,
"step": 739
},
{
"epoch": 2.1828908554572273,
"grad_norm": 0.2333984375,
"learning_rate": 3.6414694552546946e-05,
"loss": 1.1105,
"step": 740
},
{
"epoch": 2.185840707964602,
"grad_norm": 0.251953125,
"learning_rate": 3.616935211706275e-05,
"loss": 1.2401,
"step": 741
},
{
"epoch": 2.1887905604719764,
"grad_norm": 0.2275390625,
"learning_rate": 3.592465636886933e-05,
"loss": 1.1481,
"step": 742
},
{
"epoch": 2.191740412979351,
"grad_norm": 0.21875,
"learning_rate": 3.568060978705214e-05,
"loss": 1.0494,
"step": 743
},
{
"epoch": 2.1946902654867255,
"grad_norm": 0.25390625,
"learning_rate": 3.543721484411976e-05,
"loss": 1.3134,
"step": 744
},
{
"epoch": 2.1976401179941005,
"grad_norm": 0.291015625,
"learning_rate": 3.51944740059788e-05,
"loss": 1.2992,
"step": 745
},
{
"epoch": 2.200589970501475,
"grad_norm": 0.2333984375,
"learning_rate": 3.495238973190894e-05,
"loss": 1.2532,
"step": 746
},
{
"epoch": 2.2035398230088497,
"grad_norm": 0.25,
"learning_rate": 3.4710964474537966e-05,
"loss": 1.1991,
"step": 747
},
{
"epoch": 2.206489675516224,
"grad_norm": 0.244140625,
"learning_rate": 3.447020067981704e-05,
"loss": 1.1028,
"step": 748
},
{
"epoch": 2.2094395280235988,
"grad_norm": 0.208984375,
"learning_rate": 3.4230100786995824e-05,
"loss": 1.1823,
"step": 749
},
{
"epoch": 2.2123893805309733,
"grad_norm": 0.2236328125,
"learning_rate": 3.399066722859782e-05,
"loss": 1.3618,
"step": 750
},
{
"epoch": 2.215339233038348,
"grad_norm": 0.234375,
"learning_rate": 3.375190243039556e-05,
"loss": 1.1951,
"step": 751
},
{
"epoch": 2.218289085545723,
"grad_norm": 0.2265625,
"learning_rate": 3.351380881138642e-05,
"loss": 1.3429,
"step": 752
},
{
"epoch": 2.2212389380530975,
"grad_norm": 0.2451171875,
"learning_rate": 3.327638878376764e-05,
"loss": 1.1583,
"step": 753
},
{
"epoch": 2.224188790560472,
"grad_norm": 0.25390625,
"learning_rate": 3.3039644752912125e-05,
"loss": 1.1691,
"step": 754
},
{
"epoch": 2.2271386430678466,
"grad_norm": 0.2216796875,
"learning_rate": 3.280357911734423e-05,
"loss": 1.2961,
"step": 755
},
{
"epoch": 2.230088495575221,
"grad_norm": 0.2470703125,
"learning_rate": 3.256819426871507e-05,
"loss": 1.1934,
"step": 756
},
{
"epoch": 2.2330383480825957,
"grad_norm": 0.2041015625,
"learning_rate": 3.233349259177865e-05,
"loss": 1.1564,
"step": 757
},
{
"epoch": 2.2359882005899703,
"grad_norm": 0.25,
"learning_rate": 3.209947646436752e-05,
"loss": 1.3592,
"step": 758
},
{
"epoch": 2.2389380530973453,
"grad_norm": 0.236328125,
"learning_rate": 3.1866148257368665e-05,
"loss": 1.101,
"step": 759
},
{
"epoch": 2.24188790560472,
"grad_norm": 0.248046875,
"learning_rate": 3.163351033469961e-05,
"loss": 1.1795,
"step": 760
},
{
"epoch": 2.2448377581120944,
"grad_norm": 0.234375,
"learning_rate": 3.140156505328441e-05,
"loss": 1.1023,
"step": 761
},
{
"epoch": 2.247787610619469,
"grad_norm": 0.2216796875,
"learning_rate": 3.117031476302975e-05,
"loss": 1.2247,
"step": 762
},
{
"epoch": 2.2507374631268435,
"grad_norm": 0.25390625,
"learning_rate": 3.0939761806801096e-05,
"loss": 1.3458,
"step": 763
},
{
"epoch": 2.2536873156342185,
"grad_norm": 0.2421875,
"learning_rate": 3.0709908520399076e-05,
"loss": 1.0579,
"step": 764
},
{
"epoch": 2.256637168141593,
"grad_norm": 0.220703125,
"learning_rate": 3.0480757232535772e-05,
"loss": 1.2426,
"step": 765
},
{
"epoch": 2.2595870206489677,
"grad_norm": 0.234375,
"learning_rate": 3.0252310264811067e-05,
"loss": 1.357,
"step": 766
},
{
"epoch": 2.262536873156342,
"grad_norm": 0.21875,
"learning_rate": 3.0024569931689207e-05,
"loss": 1.3801,
"step": 767
},
{
"epoch": 2.265486725663717,
"grad_norm": 0.228515625,
"learning_rate": 2.979753854047522e-05,
"loss": 1.2527,
"step": 768
},
{
"epoch": 2.2684365781710913,
"grad_norm": 0.228515625,
"learning_rate": 2.9571218391291744e-05,
"loss": 1.2639,
"step": 769
},
{
"epoch": 2.271386430678466,
"grad_norm": 0.2373046875,
"learning_rate": 2.9345611777055594e-05,
"loss": 1.2068,
"step": 770
},
{
"epoch": 2.274336283185841,
"grad_norm": 0.23828125,
"learning_rate": 2.9120720983454463e-05,
"loss": 1.2786,
"step": 771
},
{
"epoch": 2.2772861356932155,
"grad_norm": 0.2080078125,
"learning_rate": 2.889654828892393e-05,
"loss": 1.0755,
"step": 772
},
{
"epoch": 2.28023598820059,
"grad_norm": 0.2255859375,
"learning_rate": 2.8673095964624296e-05,
"loss": 1.3451,
"step": 773
},
{
"epoch": 2.2831858407079646,
"grad_norm": 0.2392578125,
"learning_rate": 2.845036627441755e-05,
"loss": 1.3701,
"step": 774
},
{
"epoch": 2.286135693215339,
"grad_norm": 0.2294921875,
"learning_rate": 2.822836147484452e-05,
"loss": 1.1044,
"step": 775
},
{
"epoch": 2.2890855457227137,
"grad_norm": 0.2236328125,
"learning_rate": 2.800708381510182e-05,
"loss": 1.2032,
"step": 776
},
{
"epoch": 2.2920353982300883,
"grad_norm": 0.25390625,
"learning_rate": 2.778653553701932e-05,
"loss": 1.1809,
"step": 777
},
{
"epoch": 2.2949852507374633,
"grad_norm": 0.25,
"learning_rate": 2.7566718875037267e-05,
"loss": 1.2321,
"step": 778
},
{
"epoch": 2.297935103244838,
"grad_norm": 0.26171875,
"learning_rate": 2.73476360561837e-05,
"loss": 1.1413,
"step": 779
},
{
"epoch": 2.3008849557522124,
"grad_norm": 0.23828125,
"learning_rate": 2.7129289300051787e-05,
"loss": 1.1207,
"step": 780
},
{
"epoch": 2.303834808259587,
"grad_norm": 0.2275390625,
"learning_rate": 2.6911680818777606e-05,
"loss": 1.2459,
"step": 781
},
{
"epoch": 2.3067846607669615,
"grad_norm": 0.24609375,
"learning_rate": 2.669481281701739e-05,
"loss": 1.2833,
"step": 782
},
{
"epoch": 2.309734513274336,
"grad_norm": 0.265625,
"learning_rate": 2.6478687491925357e-05,
"loss": 1.1722,
"step": 783
},
{
"epoch": 2.312684365781711,
"grad_norm": 0.236328125,
"learning_rate": 2.62633070331316e-05,
"loss": 1.1811,
"step": 784
},
{
"epoch": 2.3156342182890857,
"grad_norm": 0.2294921875,
"learning_rate": 2.6048673622719568e-05,
"loss": 1.1855,
"step": 785
},
{
"epoch": 2.3185840707964602,
"grad_norm": 0.220703125,
"learning_rate": 2.5834789435204243e-05,
"loss": 1.2206,
"step": 786
},
{
"epoch": 2.321533923303835,
"grad_norm": 0.2373046875,
"learning_rate": 2.562165663751003e-05,
"loss": 1.1962,
"step": 787
},
{
"epoch": 2.3244837758112094,
"grad_norm": 0.20703125,
"learning_rate": 2.540927738894866e-05,
"loss": 1.2001,
"step": 788
},
{
"epoch": 2.327433628318584,
"grad_norm": 0.212890625,
"learning_rate": 2.5197653841197543e-05,
"loss": 1.1895,
"step": 789
},
{
"epoch": 2.330383480825959,
"grad_norm": 0.220703125,
"learning_rate": 2.4986788138277827e-05,
"loss": 1.2965,
"step": 790
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.240234375,
"learning_rate": 2.4776682416532724e-05,
"loss": 1.1817,
"step": 791
},
{
"epoch": 2.336283185840708,
"grad_norm": 0.2197265625,
"learning_rate": 2.4567338804605756e-05,
"loss": 1.2809,
"step": 792
},
{
"epoch": 2.3392330383480826,
"grad_norm": 0.2314453125,
"learning_rate": 2.4358759423419474e-05,
"loss": 1.1563,
"step": 793
},
{
"epoch": 2.342182890855457,
"grad_norm": 0.23046875,
"learning_rate": 2.4150946386153605e-05,
"loss": 1.3343,
"step": 794
},
{
"epoch": 2.3451327433628317,
"grad_norm": 0.248046875,
"learning_rate": 2.394390179822382e-05,
"loss": 1.2205,
"step": 795
},
{
"epoch": 2.3480825958702063,
"grad_norm": 0.2734375,
"learning_rate": 2.3737627757260582e-05,
"loss": 1.3186,
"step": 796
},
{
"epoch": 2.3510324483775813,
"grad_norm": 0.2578125,
"learning_rate": 2.3532126353087492e-05,
"loss": 1.1532,
"step": 797
},
{
"epoch": 2.353982300884956,
"grad_norm": 0.2216796875,
"learning_rate": 2.3327399667700477e-05,
"loss": 1.2163,
"step": 798
},
{
"epoch": 2.3569321533923304,
"grad_norm": 0.2236328125,
"learning_rate": 2.312344977524653e-05,
"loss": 1.1568,
"step": 799
},
{
"epoch": 2.359882005899705,
"grad_norm": 0.228515625,
"learning_rate": 2.2920278742002676e-05,
"loss": 1.2224,
"step": 800
},
{
"epoch": 2.359882005899705,
"eval_loss": 1.2389241456985474,
"eval_runtime": 31.6507,
"eval_samples_per_second": 31.595,
"eval_steps_per_second": 3.949,
"step": 800
},
{
"epoch": 2.3628318584070795,
"grad_norm": 0.251953125,
"learning_rate": 2.2717888626355134e-05,
"loss": 1.3552,
"step": 801
},
{
"epoch": 2.365781710914454,
"grad_norm": 0.2451171875,
"learning_rate": 2.251628147877839e-05,
"loss": 1.1006,
"step": 802
},
{
"epoch": 2.3687315634218287,
"grad_norm": 0.22265625,
"learning_rate": 2.2315459341814482e-05,
"loss": 1.1587,
"step": 803
},
{
"epoch": 2.3716814159292037,
"grad_norm": 0.24609375,
"learning_rate": 2.211542425005223e-05,
"loss": 1.2576,
"step": 804
},
{
"epoch": 2.3746312684365782,
"grad_norm": 0.2353515625,
"learning_rate": 2.191617823010671e-05,
"loss": 1.193,
"step": 805
},
{
"epoch": 2.377581120943953,
"grad_norm": 0.2119140625,
"learning_rate": 2.1717723300598613e-05,
"loss": 1.3609,
"step": 806
},
{
"epoch": 2.3805309734513274,
"grad_norm": 0.2353515625,
"learning_rate": 2.1520061472133902e-05,
"loss": 1.4278,
"step": 807
},
{
"epoch": 2.383480825958702,
"grad_norm": 0.232421875,
"learning_rate": 2.1323194747283416e-05,
"loss": 1.2223,
"step": 808
},
{
"epoch": 2.386430678466077,
"grad_norm": 0.228515625,
"learning_rate": 2.1127125120562497e-05,
"loss": 1.2312,
"step": 809
},
{
"epoch": 2.3893805309734515,
"grad_norm": 0.208984375,
"learning_rate": 2.0931854578410905e-05,
"loss": 1.2904,
"step": 810
},
{
"epoch": 2.392330383480826,
"grad_norm": 0.2333984375,
"learning_rate": 2.0737385099172635e-05,
"loss": 1.3688,
"step": 811
},
{
"epoch": 2.3952802359882006,
"grad_norm": 0.2265625,
"learning_rate": 2.0543718653075782e-05,
"loss": 1.2122,
"step": 812
},
{
"epoch": 2.398230088495575,
"grad_norm": 0.2333984375,
"learning_rate": 2.035085720221288e-05,
"loss": 1.1826,
"step": 813
},
{
"epoch": 2.4011799410029497,
"grad_norm": 0.25390625,
"learning_rate": 2.0158802700520574e-05,
"loss": 1.3577,
"step": 814
},
{
"epoch": 2.4041297935103243,
"grad_norm": 0.251953125,
"learning_rate": 1.9967557093760226e-05,
"loss": 1.1507,
"step": 815
},
{
"epoch": 2.4070796460176993,
"grad_norm": 0.2294921875,
"learning_rate": 1.9777122319497986e-05,
"loss": 1.6048,
"step": 816
},
{
"epoch": 2.410029498525074,
"grad_norm": 0.2138671875,
"learning_rate": 1.958750030708527e-05,
"loss": 1.1744,
"step": 817
},
{
"epoch": 2.4129793510324484,
"grad_norm": 0.228515625,
"learning_rate": 1.9398692977639054e-05,
"loss": 1.1077,
"step": 818
},
{
"epoch": 2.415929203539823,
"grad_norm": 0.2431640625,
"learning_rate": 1.9210702244022617e-05,
"loss": 1.2388,
"step": 819
},
{
"epoch": 2.4188790560471976,
"grad_norm": 0.2451171875,
"learning_rate": 1.902353001082605e-05,
"loss": 1.3209,
"step": 820
},
{
"epoch": 2.421828908554572,
"grad_norm": 0.2177734375,
"learning_rate": 1.883717817434688e-05,
"loss": 1.377,
"step": 821
},
{
"epoch": 2.4247787610619467,
"grad_norm": 0.220703125,
"learning_rate": 1.8651648622571128e-05,
"loss": 1.1778,
"step": 822
},
{
"epoch": 2.4277286135693217,
"grad_norm": 0.232421875,
"learning_rate": 1.8466943235153844e-05,
"loss": 1.2378,
"step": 823
},
{
"epoch": 2.4306784660766962,
"grad_norm": 0.236328125,
"learning_rate": 1.8283063883400232e-05,
"loss": 1.1211,
"step": 824
},
{
"epoch": 2.433628318584071,
"grad_norm": 0.26953125,
"learning_rate": 1.8100012430246837e-05,
"loss": 1.2883,
"step": 825
},
{
"epoch": 2.4365781710914454,
"grad_norm": 0.255859375,
"learning_rate": 1.7917790730242322e-05,
"loss": 1.3108,
"step": 826
},
{
"epoch": 2.43952802359882,
"grad_norm": 0.2177734375,
"learning_rate": 1.7736400629529003e-05,
"loss": 1.1481,
"step": 827
},
{
"epoch": 2.442477876106195,
"grad_norm": 0.259765625,
"learning_rate": 1.7555843965823992e-05,
"loss": 1.2908,
"step": 828
},
{
"epoch": 2.4454277286135695,
"grad_norm": 0.2490234375,
"learning_rate": 1.7376122568400532e-05,
"loss": 1.2928,
"step": 829
},
{
"epoch": 2.448377581120944,
"grad_norm": 0.25,
"learning_rate": 1.7197238258069613e-05,
"loss": 1.1898,
"step": 830
},
{
"epoch": 2.4513274336283186,
"grad_norm": 0.2353515625,
"learning_rate": 1.7019192847161425e-05,
"loss": 1.3403,
"step": 831
},
{
"epoch": 2.454277286135693,
"grad_norm": 0.2314453125,
"learning_rate": 1.6841988139507048e-05,
"loss": 1.3968,
"step": 832
},
{
"epoch": 2.4572271386430677,
"grad_norm": 0.2490234375,
"learning_rate": 1.6665625930420024e-05,
"loss": 1.2074,
"step": 833
},
{
"epoch": 2.4601769911504423,
"grad_norm": 0.25,
"learning_rate": 1.6490108006678494e-05,
"loss": 1.3393,
"step": 834
},
{
"epoch": 2.4631268436578173,
"grad_norm": 0.2353515625,
"learning_rate": 1.6315436146506703e-05,
"loss": 1.152,
"step": 835
},
{
"epoch": 2.466076696165192,
"grad_norm": 0.236328125,
"learning_rate": 1.614161211955727e-05,
"loss": 1.2362,
"step": 836
},
{
"epoch": 2.4690265486725664,
"grad_norm": 0.2109375,
"learning_rate": 1.5968637686893186e-05,
"loss": 1.1957,
"step": 837
},
{
"epoch": 2.471976401179941,
"grad_norm": 0.23828125,
"learning_rate": 1.5796514600969837e-05,
"loss": 1.2779,
"step": 838
},
{
"epoch": 2.4749262536873156,
"grad_norm": 0.244140625,
"learning_rate": 1.5625244605617472e-05,
"loss": 1.2661,
"step": 839
},
{
"epoch": 2.47787610619469,
"grad_norm": 0.2158203125,
"learning_rate": 1.545482943602341e-05,
"loss": 1.1762,
"step": 840
},
{
"epoch": 2.4808259587020647,
"grad_norm": 0.279296875,
"learning_rate": 1.528527081871438e-05,
"loss": 1.282,
"step": 841
},
{
"epoch": 2.4837758112094397,
"grad_norm": 0.21484375,
"learning_rate": 1.5116570471539293e-05,
"loss": 1.1874,
"step": 842
},
{
"epoch": 2.4867256637168142,
"grad_norm": 0.2177734375,
"learning_rate": 1.4948730103651498e-05,
"loss": 1.1993,
"step": 843
},
{
"epoch": 2.489675516224189,
"grad_norm": 0.2216796875,
"learning_rate": 1.478175141549174e-05,
"loss": 1.3926,
"step": 844
},
{
"epoch": 2.4926253687315634,
"grad_norm": 0.240234375,
"learning_rate": 1.4615636098770802e-05,
"loss": 1.4152,
"step": 845
},
{
"epoch": 2.495575221238938,
"grad_norm": 0.2177734375,
"learning_rate": 1.4450385836452429e-05,
"loss": 1.0925,
"step": 846
},
{
"epoch": 2.4985250737463125,
"grad_norm": 0.234375,
"learning_rate": 1.4286002302736168e-05,
"loss": 1.2749,
"step": 847
},
{
"epoch": 2.501474926253687,
"grad_norm": 0.25390625,
"learning_rate": 1.412248716304052e-05,
"loss": 1.1758,
"step": 848
},
{
"epoch": 2.504424778761062,
"grad_norm": 0.22265625,
"learning_rate": 1.3959842073986085e-05,
"loss": 1.1971,
"step": 849
},
{
"epoch": 2.5073746312684366,
"grad_norm": 0.220703125,
"learning_rate": 1.3798068683378574e-05,
"loss": 1.1297,
"step": 850
},
{
"epoch": 2.510324483775811,
"grad_norm": 0.244140625,
"learning_rate": 1.3637168630192443e-05,
"loss": 1.1396,
"step": 851
},
{
"epoch": 2.5132743362831858,
"grad_norm": 0.21875,
"learning_rate": 1.3477143544553995e-05,
"loss": 1.1688,
"step": 852
},
{
"epoch": 2.5162241887905603,
"grad_norm": 0.25,
"learning_rate": 1.331799504772493e-05,
"loss": 1.1659,
"step": 853
},
{
"epoch": 2.5191740412979353,
"grad_norm": 0.251953125,
"learning_rate": 1.3159724752086144e-05,
"loss": 1.195,
"step": 854
},
{
"epoch": 2.52212389380531,
"grad_norm": 0.2255859375,
"learning_rate": 1.300233426112103e-05,
"loss": 1.3925,
"step": 855
},
{
"epoch": 2.5250737463126844,
"grad_norm": 0.2431640625,
"learning_rate": 1.2845825169399507e-05,
"loss": 1.1774,
"step": 856
},
{
"epoch": 2.528023598820059,
"grad_norm": 0.220703125,
"learning_rate": 1.269019906256178e-05,
"loss": 1.1354,
"step": 857
},
{
"epoch": 2.5309734513274336,
"grad_norm": 0.265625,
"learning_rate": 1.2535457517302263e-05,
"loss": 1.3571,
"step": 858
},
{
"epoch": 2.533923303834808,
"grad_norm": 0.279296875,
"learning_rate": 1.2381602101353573e-05,
"loss": 1.2459,
"step": 859
},
{
"epoch": 2.5368731563421827,
"grad_norm": 0.251953125,
"learning_rate": 1.2228634373470726e-05,
"loss": 1.1074,
"step": 860
},
{
"epoch": 2.5398230088495577,
"grad_norm": 0.2578125,
"learning_rate": 1.207655588341534e-05,
"loss": 1.1571,
"step": 861
},
{
"epoch": 2.5427728613569323,
"grad_norm": 0.25390625,
"learning_rate": 1.1925368171939777e-05,
"loss": 1.4061,
"step": 862
},
{
"epoch": 2.545722713864307,
"grad_norm": 0.265625,
"learning_rate": 1.1775072770771834e-05,
"loss": 1.5388,
"step": 863
},
{
"epoch": 2.5486725663716814,
"grad_norm": 0.2177734375,
"learning_rate": 1.1625671202598875e-05,
"loss": 1.4447,
"step": 864
},
{
"epoch": 2.551622418879056,
"grad_norm": 0.2578125,
"learning_rate": 1.147716498105268e-05,
"loss": 1.1055,
"step": 865
},
{
"epoch": 2.554572271386431,
"grad_norm": 0.2109375,
"learning_rate": 1.1329555610694008e-05,
"loss": 1.2615,
"step": 866
},
{
"epoch": 2.557522123893805,
"grad_norm": 0.236328125,
"learning_rate": 1.1182844586997266e-05,
"loss": 1.3414,
"step": 867
},
{
"epoch": 2.56047197640118,
"grad_norm": 0.2470703125,
"learning_rate": 1.1037033396335528e-05,
"loss": 1.2957,
"step": 868
},
{
"epoch": 2.5634218289085546,
"grad_norm": 0.2314453125,
"learning_rate": 1.0892123515965348e-05,
"loss": 1.203,
"step": 869
},
{
"epoch": 2.566371681415929,
"grad_norm": 0.2138671875,
"learning_rate": 1.0748116414011888e-05,
"loss": 1.3511,
"step": 870
},
{
"epoch": 2.5693215339233038,
"grad_norm": 0.248046875,
"learning_rate": 1.0605013549453913e-05,
"loss": 1.1888,
"step": 871
},
{
"epoch": 2.5722713864306783,
"grad_norm": 0.283203125,
"learning_rate": 1.0462816372109153e-05,
"loss": 1.367,
"step": 872
},
{
"epoch": 2.5752212389380533,
"grad_norm": 0.208984375,
"learning_rate": 1.0321526322619534e-05,
"loss": 1.1133,
"step": 873
},
{
"epoch": 2.578171091445428,
"grad_norm": 0.251953125,
"learning_rate": 1.0181144832436584e-05,
"loss": 1.5173,
"step": 874
},
{
"epoch": 2.5811209439528024,
"grad_norm": 0.2177734375,
"learning_rate": 1.0041673323807e-05,
"loss": 1.2463,
"step": 875
},
{
"epoch": 2.584070796460177,
"grad_norm": 0.2314453125,
"learning_rate": 9.903113209758096e-06,
"loss": 1.2253,
"step": 876
},
{
"epoch": 2.5870206489675516,
"grad_norm": 0.23046875,
"learning_rate": 9.765465894083636e-06,
"loss": 1.2679,
"step": 877
},
{
"epoch": 2.589970501474926,
"grad_norm": 0.29296875,
"learning_rate": 9.628732771329573e-06,
"loss": 1.4327,
"step": 878
},
{
"epoch": 2.5929203539823007,
"grad_norm": 0.2177734375,
"learning_rate": 9.492915226779808e-06,
"loss": 1.2031,
"step": 879
},
{
"epoch": 2.5958702064896757,
"grad_norm": 0.228515625,
"learning_rate": 9.358014636442392e-06,
"loss": 1.2193,
"step": 880
},
{
"epoch": 2.5988200589970503,
"grad_norm": 0.37109375,
"learning_rate": 9.224032367035274e-06,
"loss": 1.2276,
"step": 881
},
{
"epoch": 2.601769911504425,
"grad_norm": 0.22265625,
"learning_rate": 9.090969775972736e-06,
"loss": 1.2331,
"step": 882
},
{
"epoch": 2.6047197640117994,
"grad_norm": 0.2314453125,
"learning_rate": 8.9588282113515e-06,
"loss": 1.1959,
"step": 883
},
{
"epoch": 2.607669616519174,
"grad_norm": 0.26953125,
"learning_rate": 8.827609011937066e-06,
"loss": 1.237,
"step": 884
},
{
"epoch": 2.6106194690265485,
"grad_norm": 0.2138671875,
"learning_rate": 8.697313507150184e-06,
"loss": 1.1985,
"step": 885
},
{
"epoch": 2.613569321533923,
"grad_norm": 0.2001953125,
"learning_rate": 8.567943017053425e-06,
"loss": 1.3213,
"step": 886
},
{
"epoch": 2.616519174041298,
"grad_norm": 0.2294921875,
"learning_rate": 8.439498852337724e-06,
"loss": 1.234,
"step": 887
},
{
"epoch": 2.6194690265486726,
"grad_norm": 0.248046875,
"learning_rate": 8.311982314309109e-06,
"loss": 1.3079,
"step": 888
},
{
"epoch": 2.622418879056047,
"grad_norm": 0.2197265625,
"learning_rate": 8.185394694875592e-06,
"loss": 1.282,
"step": 889
},
{
"epoch": 2.6253687315634218,
"grad_norm": 0.2275390625,
"learning_rate": 8.059737276534041e-06,
"loss": 1.1944,
"step": 890
},
{
"epoch": 2.6283185840707963,
"grad_norm": 0.2275390625,
"learning_rate": 7.935011332357112e-06,
"loss": 1.2447,
"step": 891
},
{
"epoch": 2.6312684365781713,
"grad_norm": 0.2216796875,
"learning_rate": 7.811218125980535e-06,
"loss": 1.2224,
"step": 892
},
{
"epoch": 2.6342182890855455,
"grad_norm": 0.234375,
"learning_rate": 7.688358911590078e-06,
"loss": 1.292,
"step": 893
},
{
"epoch": 2.6371681415929205,
"grad_norm": 0.22265625,
"learning_rate": 7.566434933909006e-06,
"loss": 1.239,
"step": 894
},
{
"epoch": 2.640117994100295,
"grad_norm": 0.248046875,
"learning_rate": 7.445447428185448e-06,
"loss": 1.2884,
"step": 895
},
{
"epoch": 2.6430678466076696,
"grad_norm": 0.23046875,
"learning_rate": 7.325397620179808e-06,
"loss": 1.2082,
"step": 896
},
{
"epoch": 2.646017699115044,
"grad_norm": 0.25390625,
"learning_rate": 7.206286726152434e-06,
"loss": 1.091,
"step": 897
},
{
"epoch": 2.6489675516224187,
"grad_norm": 0.2421875,
"learning_rate": 7.088115952851238e-06,
"loss": 1.1554,
"step": 898
},
{
"epoch": 2.6519174041297937,
"grad_norm": 0.220703125,
"learning_rate": 6.970886497499518e-06,
"loss": 1.1842,
"step": 899
},
{
"epoch": 2.6548672566371683,
"grad_norm": 0.216796875,
"learning_rate": 6.854599547783736e-06,
"loss": 1.1645,
"step": 900
},
{
"epoch": 2.657817109144543,
"grad_norm": 0.240234375,
"learning_rate": 6.739256281841599e-06,
"loss": 1.2633,
"step": 901
},
{
"epoch": 2.6607669616519174,
"grad_norm": 0.2333984375,
"learning_rate": 6.624857868250079e-06,
"loss": 1.2458,
"step": 902
},
{
"epoch": 2.663716814159292,
"grad_norm": 0.2421875,
"learning_rate": 6.5114054660135315e-06,
"loss": 1.3913,
"step": 903
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.234375,
"learning_rate": 6.39890022455204e-06,
"loss": 1.1862,
"step": 904
},
{
"epoch": 2.669616519174041,
"grad_norm": 0.263671875,
"learning_rate": 6.287343283689661e-06,
"loss": 1.3121,
"step": 905
},
{
"epoch": 2.672566371681416,
"grad_norm": 0.2392578125,
"learning_rate": 6.176735773642961e-06,
"loss": 1.2006,
"step": 906
},
{
"epoch": 2.6755162241887906,
"grad_norm": 0.263671875,
"learning_rate": 6.067078815009575e-06,
"loss": 1.2827,
"step": 907
},
{
"epoch": 2.678466076696165,
"grad_norm": 0.2294921875,
"learning_rate": 5.958373518756733e-06,
"loss": 1.2997,
"step": 908
},
{
"epoch": 2.6814159292035398,
"grad_norm": 0.28515625,
"learning_rate": 5.850620986210198e-06,
"loss": 1.2585,
"step": 909
},
{
"epoch": 2.6843657817109143,
"grad_norm": 0.2578125,
"learning_rate": 5.743822309042912e-06,
"loss": 1.2125,
"step": 910
},
{
"epoch": 2.6873156342182893,
"grad_norm": 0.244140625,
"learning_rate": 5.63797856926408e-06,
"loss": 1.5253,
"step": 911
},
{
"epoch": 2.6902654867256635,
"grad_norm": 0.2314453125,
"learning_rate": 5.533090839208133e-06,
"loss": 1.2142,
"step": 912
},
{
"epoch": 2.6932153392330385,
"grad_norm": 0.2158203125,
"learning_rate": 5.429160181523852e-06,
"loss": 1.237,
"step": 913
},
{
"epoch": 2.696165191740413,
"grad_norm": 0.2392578125,
"learning_rate": 5.326187649163672e-06,
"loss": 1.2394,
"step": 914
},
{
"epoch": 2.6991150442477876,
"grad_norm": 0.27734375,
"learning_rate": 5.224174285372974e-06,
"loss": 1.4169,
"step": 915
},
{
"epoch": 2.702064896755162,
"grad_norm": 0.2451171875,
"learning_rate": 5.123121123679519e-06,
"loss": 1.2016,
"step": 916
},
{
"epoch": 2.7050147492625367,
"grad_norm": 0.255859375,
"learning_rate": 5.023029187882944e-06,
"loss": 1.1954,
"step": 917
},
{
"epoch": 2.7079646017699117,
"grad_norm": 0.27734375,
"learning_rate": 4.923899492044437e-06,
"loss": 1.2806,
"step": 918
},
{
"epoch": 2.7109144542772863,
"grad_norm": 0.216796875,
"learning_rate": 4.825733040476465e-06,
"loss": 1.2226,
"step": 919
},
{
"epoch": 2.713864306784661,
"grad_norm": 0.244140625,
"learning_rate": 4.728530827732536e-06,
"loss": 1.2372,
"step": 920
},
{
"epoch": 2.7168141592920354,
"grad_norm": 0.2265625,
"learning_rate": 4.632293838597246e-06,
"loss": 1.2637,
"step": 921
},
{
"epoch": 2.71976401179941,
"grad_norm": 0.23828125,
"learning_rate": 4.537023048076128e-06,
"loss": 1.3621,
"step": 922
},
{
"epoch": 2.7227138643067845,
"grad_norm": 0.236328125,
"learning_rate": 4.442719421385922e-06,
"loss": 1.191,
"step": 923
},
{
"epoch": 2.725663716814159,
"grad_norm": 0.228515625,
"learning_rate": 4.349383913944771e-06,
"loss": 1.1426,
"step": 924
},
{
"epoch": 2.728613569321534,
"grad_norm": 0.265625,
"learning_rate": 4.257017471362435e-06,
"loss": 1.1484,
"step": 925
},
{
"epoch": 2.7315634218289087,
"grad_norm": 0.259765625,
"learning_rate": 4.165621029430855e-06,
"loss": 1.2768,
"step": 926
},
{
"epoch": 2.734513274336283,
"grad_norm": 0.2158203125,
"learning_rate": 4.075195514114593e-06,
"loss": 1.1224,
"step": 927
},
{
"epoch": 2.737463126843658,
"grad_norm": 0.224609375,
"learning_rate": 3.985741841541446e-06,
"loss": 1.2434,
"step": 928
},
{
"epoch": 2.7404129793510323,
"grad_norm": 0.2197265625,
"learning_rate": 3.897260917993184e-06,
"loss": 1.3012,
"step": 929
},
{
"epoch": 2.7433628318584073,
"grad_norm": 0.220703125,
"learning_rate": 3.8097536398963963e-06,
"loss": 1.0688,
"step": 930
},
{
"epoch": 2.7463126843657815,
"grad_norm": 0.2119140625,
"learning_rate": 3.7232208938133393e-06,
"loss": 1.1687,
"step": 931
},
{
"epoch": 2.7492625368731565,
"grad_norm": 0.203125,
"learning_rate": 3.6376635564330463e-06,
"loss": 1.3316,
"step": 932
},
{
"epoch": 2.752212389380531,
"grad_norm": 0.2421875,
"learning_rate": 3.5530824945623542e-06,
"loss": 1.2765,
"step": 933
},
{
"epoch": 2.7551622418879056,
"grad_norm": 0.240234375,
"learning_rate": 3.4694785651171456e-06,
"loss": 1.2629,
"step": 934
},
{
"epoch": 2.75811209439528,
"grad_norm": 0.2490234375,
"learning_rate": 3.3868526151137445e-06,
"loss": 1.2944,
"step": 935
},
{
"epoch": 2.7610619469026547,
"grad_norm": 0.2373046875,
"learning_rate": 3.3052054816602452e-06,
"loss": 1.3661,
"step": 936
},
{
"epoch": 2.7640117994100297,
"grad_norm": 0.203125,
"learning_rate": 3.224537991948029e-06,
"loss": 1.2167,
"step": 937
},
{
"epoch": 2.7669616519174043,
"grad_norm": 0.267578125,
"learning_rate": 3.144850963243462e-06,
"loss": 1.2244,
"step": 938
},
{
"epoch": 2.769911504424779,
"grad_norm": 0.2392578125,
"learning_rate": 3.0661452028795336e-06,
"loss": 1.2336,
"step": 939
},
{
"epoch": 2.7728613569321534,
"grad_norm": 0.267578125,
"learning_rate": 2.9884215082477408e-06,
"loss": 1.2458,
"step": 940
},
{
"epoch": 2.775811209439528,
"grad_norm": 0.2490234375,
"learning_rate": 2.9116806667899734e-06,
"loss": 1.1778,
"step": 941
},
{
"epoch": 2.7787610619469025,
"grad_norm": 0.232421875,
"learning_rate": 2.835923455990508e-06,
"loss": 1.3631,
"step": 942
},
{
"epoch": 2.781710914454277,
"grad_norm": 0.251953125,
"learning_rate": 2.7611506433682045e-06,
"loss": 1.3227,
"step": 943
},
{
"epoch": 2.784660766961652,
"grad_norm": 0.265625,
"learning_rate": 2.687362986468689e-06,
"loss": 1.3318,
"step": 944
},
{
"epoch": 2.7876106194690267,
"grad_norm": 0.21484375,
"learning_rate": 2.6145612328566717e-06,
"loss": 1.1911,
"step": 945
},
{
"epoch": 2.7905604719764012,
"grad_norm": 0.2294921875,
"learning_rate": 2.5427461201083747e-06,
"loss": 1.4122,
"step": 946
},
{
"epoch": 2.793510324483776,
"grad_norm": 0.24609375,
"learning_rate": 2.471918375804105e-06,
"loss": 1.3147,
"step": 947
},
{
"epoch": 2.7964601769911503,
"grad_norm": 0.216796875,
"learning_rate": 2.402078717520795e-06,
"loss": 1.3623,
"step": 948
},
{
"epoch": 2.799410029498525,
"grad_norm": 0.26171875,
"learning_rate": 2.333227852824804e-06,
"loss": 1.2975,
"step": 949
},
{
"epoch": 2.8023598820058995,
"grad_norm": 0.259765625,
"learning_rate": 2.2653664792647634e-06,
"loss": 1.1744,
"step": 950
},
{
"epoch": 2.8053097345132745,
"grad_norm": 0.25,
"learning_rate": 2.19849528436441e-06,
"loss": 1.2339,
"step": 951
},
{
"epoch": 2.808259587020649,
"grad_norm": 0.228515625,
"learning_rate": 2.132614945615741e-06,
"loss": 1.3005,
"step": 952
},
{
"epoch": 2.8112094395280236,
"grad_norm": 0.2333984375,
"learning_rate": 2.067726130472092e-06,
"loss": 1.3549,
"step": 953
},
{
"epoch": 2.814159292035398,
"grad_norm": 0.2578125,
"learning_rate": 2.003829496341325e-06,
"loss": 1.2107,
"step": 954
},
{
"epoch": 2.8171091445427727,
"grad_norm": 0.21484375,
"learning_rate": 1.9409256905792762e-06,
"loss": 1.2295,
"step": 955
},
{
"epoch": 2.8200589970501477,
"grad_norm": 0.224609375,
"learning_rate": 1.8790153504831153e-06,
"loss": 1.1294,
"step": 956
},
{
"epoch": 2.823008849557522,
"grad_norm": 0.236328125,
"learning_rate": 1.8180991032849426e-06,
"loss": 1.1334,
"step": 957
},
{
"epoch": 2.825958702064897,
"grad_norm": 0.2236328125,
"learning_rate": 1.7581775661453692e-06,
"loss": 1.1275,
"step": 958
},
{
"epoch": 2.8289085545722714,
"grad_norm": 0.234375,
"learning_rate": 1.6992513461473237e-06,
"loss": 1.1977,
"step": 959
},
{
"epoch": 2.831858407079646,
"grad_norm": 0.2177734375,
"learning_rate": 1.6413210402898893e-06,
"loss": 1.0893,
"step": 960
},
{
"epoch": 2.8348082595870205,
"grad_norm": 0.28125,
"learning_rate": 1.5843872354822097e-06,
"loss": 1.5192,
"step": 961
},
{
"epoch": 2.837758112094395,
"grad_norm": 0.2490234375,
"learning_rate": 1.5284505085376377e-06,
"loss": 1.2963,
"step": 962
},
{
"epoch": 2.84070796460177,
"grad_norm": 0.21875,
"learning_rate": 1.473511426167784e-06,
"loss": 1.1416,
"step": 963
},
{
"epoch": 2.8436578171091447,
"grad_norm": 0.2216796875,
"learning_rate": 1.4195705449768448e-06,
"loss": 1.0892,
"step": 964
},
{
"epoch": 2.8466076696165192,
"grad_norm": 0.236328125,
"learning_rate": 1.3666284114559612e-06,
"loss": 1.1808,
"step": 965
},
{
"epoch": 2.849557522123894,
"grad_norm": 0.224609375,
"learning_rate": 1.3146855619776134e-06,
"loss": 1.2118,
"step": 966
},
{
"epoch": 2.8525073746312684,
"grad_norm": 0.283203125,
"learning_rate": 1.2637425227902787e-06,
"loss": 1.1691,
"step": 967
},
{
"epoch": 2.855457227138643,
"grad_norm": 0.236328125,
"learning_rate": 1.21379981001305e-06,
"loss": 1.2603,
"step": 968
},
{
"epoch": 2.8584070796460175,
"grad_norm": 0.2265625,
"learning_rate": 1.1648579296304253e-06,
"loss": 1.3881,
"step": 969
},
{
"epoch": 2.8613569321533925,
"grad_norm": 0.2451171875,
"learning_rate": 1.1169173774871478e-06,
"loss": 1.2478,
"step": 970
},
{
"epoch": 2.864306784660767,
"grad_norm": 0.2421875,
"learning_rate": 1.0699786392832201e-06,
"loss": 1.3015,
"step": 971
},
{
"epoch": 2.8672566371681416,
"grad_norm": 0.2314453125,
"learning_rate": 1.0240421905689745e-06,
"loss": 1.3087,
"step": 972
},
{
"epoch": 2.870206489675516,
"grad_norm": 0.2431640625,
"learning_rate": 9.79108496740244e-07,
"loss": 1.1377,
"step": 973
},
{
"epoch": 2.8731563421828907,
"grad_norm": 0.25390625,
"learning_rate": 9.351780130336441e-07,
"loss": 1.1647,
"step": 974
},
{
"epoch": 2.8761061946902657,
"grad_norm": 0.248046875,
"learning_rate": 8.922511845219971e-07,
"loss": 1.2245,
"step": 975
},
{
"epoch": 2.87905604719764,
"grad_norm": 0.251953125,
"learning_rate": 8.503284461097604e-07,
"loss": 1.2084,
"step": 976
},
{
"epoch": 2.882005899705015,
"grad_norm": 0.2255859375,
"learning_rate": 8.094102225286837e-07,
"loss": 1.2193,
"step": 977
},
{
"epoch": 2.8849557522123894,
"grad_norm": 0.259765625,
"learning_rate": 7.694969283334575e-07,
"loss": 1.0793,
"step": 978
},
{
"epoch": 2.887905604719764,
"grad_norm": 0.2314453125,
"learning_rate": 7.305889678975608e-07,
"loss": 1.2978,
"step": 979
},
{
"epoch": 2.8908554572271385,
"grad_norm": 0.2236328125,
"learning_rate": 6.926867354091093e-07,
"loss": 1.3385,
"step": 980
},
{
"epoch": 2.893805309734513,
"grad_norm": 0.240234375,
"learning_rate": 6.557906148669024e-07,
"loss": 1.2968,
"step": 981
},
{
"epoch": 2.896755162241888,
"grad_norm": 0.2119140625,
"learning_rate": 6.199009800765265e-07,
"loss": 1.1596,
"step": 982
},
{
"epoch": 2.8997050147492627,
"grad_norm": 0.2314453125,
"learning_rate": 5.850181946465361e-07,
"loss": 1.1692,
"step": 983
},
{
"epoch": 2.9026548672566372,
"grad_norm": 0.2216796875,
"learning_rate": 5.51142611984834e-07,
"loss": 1.2102,
"step": 984
},
{
"epoch": 2.905604719764012,
"grad_norm": 0.2431640625,
"learning_rate": 5.18274575295008e-07,
"loss": 1.1674,
"step": 985
},
{
"epoch": 2.9085545722713864,
"grad_norm": 0.2236328125,
"learning_rate": 4.864144175729335e-07,
"loss": 1.1879,
"step": 986
},
{
"epoch": 2.911504424778761,
"grad_norm": 0.2421875,
"learning_rate": 4.555624616033427e-07,
"loss": 1.3146,
"step": 987
},
{
"epoch": 2.9144542772861355,
"grad_norm": 0.2294921875,
"learning_rate": 4.2571901995659414e-07,
"loss": 1.2423,
"step": 988
},
{
"epoch": 2.9174041297935105,
"grad_norm": 0.25390625,
"learning_rate": 3.96884394985475e-07,
"loss": 1.3639,
"step": 989
},
{
"epoch": 2.920353982300885,
"grad_norm": 0.26953125,
"learning_rate": 3.6905887882213717e-07,
"loss": 1.1884,
"step": 990
},
{
"epoch": 2.9233038348082596,
"grad_norm": 0.224609375,
"learning_rate": 3.422427533751771e-07,
"loss": 1.1482,
"step": 991
},
{
"epoch": 2.926253687315634,
"grad_norm": 0.2314453125,
"learning_rate": 3.1643629032674924e-07,
"loss": 1.3255,
"step": 992
},
{
"epoch": 2.9292035398230087,
"grad_norm": 0.205078125,
"learning_rate": 2.916397511298019e-07,
"loss": 1.0667,
"step": 993
},
{
"epoch": 2.9321533923303837,
"grad_norm": 0.2578125,
"learning_rate": 2.678533870054567e-07,
"loss": 1.2598,
"step": 994
},
{
"epoch": 2.935103244837758,
"grad_norm": 0.26953125,
"learning_rate": 2.4507743894045533e-07,
"loss": 1.2335,
"step": 995
},
{
"epoch": 2.938053097345133,
"grad_norm": 0.2353515625,
"learning_rate": 2.2331213768468363e-07,
"loss": 1.2033,
"step": 996
},
{
"epoch": 2.9410029498525074,
"grad_norm": 0.2294921875,
"learning_rate": 2.0255770374890682e-07,
"loss": 1.4316,
"step": 997
},
{
"epoch": 2.943952802359882,
"grad_norm": 0.2255859375,
"learning_rate": 1.8281434740247128e-07,
"loss": 1.377,
"step": 998
},
{
"epoch": 2.9469026548672566,
"grad_norm": 0.27734375,
"learning_rate": 1.6408226867118403e-07,
"loss": 1.2607,
"step": 999
},
{
"epoch": 2.949852507374631,
"grad_norm": 0.251953125,
"learning_rate": 1.4636165733532546e-07,
"loss": 1.1762,
"step": 1000
},
{
"epoch": 2.949852507374631,
"eval_loss": 1.2385461330413818,
"eval_runtime": 31.701,
"eval_samples_per_second": 31.545,
"eval_steps_per_second": 3.943,
"step": 1000
},
{
"epoch": 2.952802359882006,
"grad_norm": 0.236328125,
"learning_rate": 1.2965269292767313e-07,
"loss": 1.2174,
"step": 1001
},
{
"epoch": 2.9557522123893807,
"grad_norm": 0.24609375,
"learning_rate": 1.1395554473171422e-07,
"loss": 1.2745,
"step": 1002
},
{
"epoch": 2.9587020648967552,
"grad_norm": 0.283203125,
"learning_rate": 9.927037177993592e-08,
"loss": 1.4159,
"step": 1003
},
{
"epoch": 2.96165191740413,
"grad_norm": 0.2119140625,
"learning_rate": 8.559732285219335e-08,
"loss": 1.1995,
"step": 1004
},
{
"epoch": 2.9646017699115044,
"grad_norm": 0.2392578125,
"learning_rate": 7.293653647421073e-08,
"loss": 1.2969,
"step": 1005
},
{
"epoch": 2.967551622418879,
"grad_norm": 0.2236328125,
"learning_rate": 6.128814091619362e-08,
"loss": 1.2363,
"step": 1006
},
{
"epoch": 2.9705014749262535,
"grad_norm": 0.23046875,
"learning_rate": 5.0652254191496664e-08,
"loss": 1.0256,
"step": 1007
},
{
"epoch": 2.9734513274336285,
"grad_norm": 0.259765625,
"learning_rate": 4.102898405545785e-08,
"loss": 1.1566,
"step": 1008
},
{
"epoch": 2.976401179941003,
"grad_norm": 0.251953125,
"learning_rate": 3.241842800428829e-08,
"loss": 1.3226,
"step": 1009
},
{
"epoch": 2.9793510324483776,
"grad_norm": 0.26171875,
"learning_rate": 2.482067327409521e-08,
"loss": 1.2195,
"step": 1010
},
{
"epoch": 2.982300884955752,
"grad_norm": 0.21484375,
"learning_rate": 1.8235796839982665e-08,
"loss": 1.2664,
"step": 1011
},
{
"epoch": 2.9852507374631267,
"grad_norm": 0.240234375,
"learning_rate": 1.266386541530773e-08,
"loss": 1.2031,
"step": 1012
},
{
"epoch": 2.9882005899705013,
"grad_norm": 0.2373046875,
"learning_rate": 8.104935450969908e-09,
"loss": 1.2926,
"step": 1013
},
{
"epoch": 2.991150442477876,
"grad_norm": 0.2216796875,
"learning_rate": 4.559053134822744e-09,
"loss": 1.334,
"step": 1014
},
{
"epoch": 2.994100294985251,
"grad_norm": 0.2392578125,
"learning_rate": 2.0262543912741295e-09,
"loss": 1.2621,
"step": 1015
},
{
"epoch": 2.9970501474926254,
"grad_norm": 0.2451171875,
"learning_rate": 5.065648808533219e-10,
"loss": 1.167,
"step": 1016
},
{
"epoch": 3.0,
"grad_norm": 0.3359375,
"learning_rate": 0.0,
"loss": 1.442,
"step": 1017
}
],
"logging_steps": 1,
"max_steps": 1017,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.426452217371034e+16,
"train_batch_size": 24,
"trial_name": null,
"trial_params": null
}