gemma_preference / trainer_state.json
terry69's picture
Model save
fe4f030 verified
raw
history blame
77.2 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2187,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004572473708276177,
"grad_norm": 8.096893602247533,
"learning_rate": 4.5662100456621004e-08,
"loss": 4.0564,
"step": 1
},
{
"epoch": 0.002286236854138089,
"grad_norm": 9.362922497604572,
"learning_rate": 2.2831050228310502e-07,
"loss": 4.0286,
"step": 5
},
{
"epoch": 0.004572473708276178,
"grad_norm": 8.436200820951127,
"learning_rate": 4.5662100456621004e-07,
"loss": 4.0215,
"step": 10
},
{
"epoch": 0.006858710562414266,
"grad_norm": 8.311166463949595,
"learning_rate": 6.849315068493151e-07,
"loss": 3.995,
"step": 15
},
{
"epoch": 0.009144947416552356,
"grad_norm": 8.314737539002055,
"learning_rate": 9.132420091324201e-07,
"loss": 4.0089,
"step": 20
},
{
"epoch": 0.011431184270690443,
"grad_norm": 8.207038819761589,
"learning_rate": 1.1415525114155251e-06,
"loss": 4.0363,
"step": 25
},
{
"epoch": 0.013717421124828532,
"grad_norm": 7.559088257570073,
"learning_rate": 1.3698630136986302e-06,
"loss": 3.9854,
"step": 30
},
{
"epoch": 0.01600365797896662,
"grad_norm": 7.1469550333759315,
"learning_rate": 1.5981735159817353e-06,
"loss": 3.9172,
"step": 35
},
{
"epoch": 0.01828989483310471,
"grad_norm": 5.971779564925808,
"learning_rate": 1.8264840182648401e-06,
"loss": 3.7833,
"step": 40
},
{
"epoch": 0.0205761316872428,
"grad_norm": 5.297232176162252,
"learning_rate": 2.0547945205479454e-06,
"loss": 3.7258,
"step": 45
},
{
"epoch": 0.022862368541380886,
"grad_norm": 4.217560181118984,
"learning_rate": 2.2831050228310503e-06,
"loss": 3.6069,
"step": 50
},
{
"epoch": 0.025148605395518976,
"grad_norm": 3.786598130029432,
"learning_rate": 2.511415525114155e-06,
"loss": 3.553,
"step": 55
},
{
"epoch": 0.027434842249657063,
"grad_norm": 2.6863763381878782,
"learning_rate": 2.7397260273972604e-06,
"loss": 3.4564,
"step": 60
},
{
"epoch": 0.029721079103795154,
"grad_norm": 2.322206779034821,
"learning_rate": 2.9680365296803653e-06,
"loss": 3.331,
"step": 65
},
{
"epoch": 0.03200731595793324,
"grad_norm": 1.7328570105860337,
"learning_rate": 3.1963470319634706e-06,
"loss": 3.2806,
"step": 70
},
{
"epoch": 0.03429355281207133,
"grad_norm": 1.4846651112766411,
"learning_rate": 3.4246575342465754e-06,
"loss": 3.2356,
"step": 75
},
{
"epoch": 0.03657978966620942,
"grad_norm": 1.1848731903024705,
"learning_rate": 3.6529680365296803e-06,
"loss": 3.1934,
"step": 80
},
{
"epoch": 0.038866026520347506,
"grad_norm": 1.0381920877926754,
"learning_rate": 3.881278538812785e-06,
"loss": 3.1245,
"step": 85
},
{
"epoch": 0.0411522633744856,
"grad_norm": 0.9141218079482407,
"learning_rate": 4.109589041095891e-06,
"loss": 3.0468,
"step": 90
},
{
"epoch": 0.04343850022862369,
"grad_norm": 1.0161753499176187,
"learning_rate": 4.337899543378996e-06,
"loss": 3.0329,
"step": 95
},
{
"epoch": 0.04572473708276177,
"grad_norm": 0.8996453649527762,
"learning_rate": 4.566210045662101e-06,
"loss": 2.9958,
"step": 100
},
{
"epoch": 0.04801097393689986,
"grad_norm": 0.8082797335102323,
"learning_rate": 4.7945205479452054e-06,
"loss": 2.9651,
"step": 105
},
{
"epoch": 0.05029721079103795,
"grad_norm": 0.7090431405422901,
"learning_rate": 5.02283105022831e-06,
"loss": 2.927,
"step": 110
},
{
"epoch": 0.05258344764517604,
"grad_norm": 1.2265537925663061,
"learning_rate": 5.251141552511416e-06,
"loss": 2.862,
"step": 115
},
{
"epoch": 0.05486968449931413,
"grad_norm": 0.650571444620453,
"learning_rate": 5.479452054794521e-06,
"loss": 2.857,
"step": 120
},
{
"epoch": 0.05715592135345222,
"grad_norm": 0.7089485111846239,
"learning_rate": 5.7077625570776266e-06,
"loss": 2.8209,
"step": 125
},
{
"epoch": 0.05944215820759031,
"grad_norm": 0.5922494361050838,
"learning_rate": 5.936073059360731e-06,
"loss": 2.8037,
"step": 130
},
{
"epoch": 0.06172839506172839,
"grad_norm": 0.5597217919230902,
"learning_rate": 6.164383561643836e-06,
"loss": 2.7487,
"step": 135
},
{
"epoch": 0.06401463191586648,
"grad_norm": 0.6045746583730743,
"learning_rate": 6.392694063926941e-06,
"loss": 2.6981,
"step": 140
},
{
"epoch": 0.06630086877000457,
"grad_norm": 0.6479924774135967,
"learning_rate": 6.621004566210046e-06,
"loss": 2.7036,
"step": 145
},
{
"epoch": 0.06858710562414266,
"grad_norm": 0.760723993748018,
"learning_rate": 6.849315068493151e-06,
"loss": 2.6821,
"step": 150
},
{
"epoch": 0.07087334247828075,
"grad_norm": 0.5889973577684341,
"learning_rate": 7.077625570776257e-06,
"loss": 2.6882,
"step": 155
},
{
"epoch": 0.07315957933241884,
"grad_norm": 0.6201384588278992,
"learning_rate": 7.305936073059361e-06,
"loss": 2.6441,
"step": 160
},
{
"epoch": 0.07544581618655692,
"grad_norm": 0.645862573214957,
"learning_rate": 7.534246575342466e-06,
"loss": 2.5878,
"step": 165
},
{
"epoch": 0.07773205304069501,
"grad_norm": 0.880791499233313,
"learning_rate": 7.76255707762557e-06,
"loss": 2.5665,
"step": 170
},
{
"epoch": 0.0800182898948331,
"grad_norm": 0.7581098091472079,
"learning_rate": 7.990867579908676e-06,
"loss": 2.5423,
"step": 175
},
{
"epoch": 0.0823045267489712,
"grad_norm": 0.7502504535360037,
"learning_rate": 8.219178082191782e-06,
"loss": 2.5348,
"step": 180
},
{
"epoch": 0.08459076360310928,
"grad_norm": 0.9587325899501735,
"learning_rate": 8.447488584474887e-06,
"loss": 2.4652,
"step": 185
},
{
"epoch": 0.08687700045724737,
"grad_norm": 1.0327228370595574,
"learning_rate": 8.675799086757991e-06,
"loss": 2.4066,
"step": 190
},
{
"epoch": 0.08916323731138547,
"grad_norm": 0.8853835960264104,
"learning_rate": 8.904109589041097e-06,
"loss": 2.3642,
"step": 195
},
{
"epoch": 0.09144947416552354,
"grad_norm": 1.0446953486337078,
"learning_rate": 9.132420091324201e-06,
"loss": 2.3237,
"step": 200
},
{
"epoch": 0.09373571101966163,
"grad_norm": 1.1013758488210148,
"learning_rate": 9.360730593607307e-06,
"loss": 2.2331,
"step": 205
},
{
"epoch": 0.09602194787379972,
"grad_norm": 1.2192543249794923,
"learning_rate": 9.589041095890411e-06,
"loss": 2.1264,
"step": 210
},
{
"epoch": 0.09830818472793781,
"grad_norm": 1.3533953895273099,
"learning_rate": 9.817351598173517e-06,
"loss": 2.0554,
"step": 215
},
{
"epoch": 0.1005944215820759,
"grad_norm": 1.1876482609404326,
"learning_rate": 9.999993629265979e-06,
"loss": 1.9859,
"step": 220
},
{
"epoch": 0.102880658436214,
"grad_norm": 1.1847416528253172,
"learning_rate": 9.999770655279843e-06,
"loss": 1.8986,
"step": 225
},
{
"epoch": 0.10516689529035209,
"grad_norm": 1.3137466650624998,
"learning_rate": 9.999229160826947e-06,
"loss": 1.8,
"step": 230
},
{
"epoch": 0.10745313214449016,
"grad_norm": 1.830150495140023,
"learning_rate": 9.998369180404283e-06,
"loss": 1.7138,
"step": 235
},
{
"epoch": 0.10973936899862825,
"grad_norm": 1.1159850299398295,
"learning_rate": 9.997190768798639e-06,
"loss": 1.6867,
"step": 240
},
{
"epoch": 0.11202560585276634,
"grad_norm": 0.9727694366367986,
"learning_rate": 9.995694001083103e-06,
"loss": 1.6469,
"step": 245
},
{
"epoch": 0.11431184270690443,
"grad_norm": 1.135743426814773,
"learning_rate": 9.993878972612276e-06,
"loss": 1.5607,
"step": 250
},
{
"epoch": 0.11659807956104253,
"grad_norm": 1.0363437963731608,
"learning_rate": 9.991745799016206e-06,
"loss": 1.5332,
"step": 255
},
{
"epoch": 0.11888431641518062,
"grad_norm": 1.018006180331875,
"learning_rate": 9.989294616193018e-06,
"loss": 1.4962,
"step": 260
},
{
"epoch": 0.1211705532693187,
"grad_norm": 0.9493951106581935,
"learning_rate": 9.986525580300253e-06,
"loss": 1.4403,
"step": 265
},
{
"epoch": 0.12345679012345678,
"grad_norm": 4.150830186272059,
"learning_rate": 9.983438867744923e-06,
"loss": 1.4382,
"step": 270
},
{
"epoch": 0.12574302697759487,
"grad_norm": 0.8458476848705546,
"learning_rate": 9.980034675172274e-06,
"loss": 1.4248,
"step": 275
},
{
"epoch": 0.12802926383173296,
"grad_norm": 1.8171861028727991,
"learning_rate": 9.976313219453255e-06,
"loss": 1.4055,
"step": 280
},
{
"epoch": 0.13031550068587106,
"grad_norm": 0.7389926811741014,
"learning_rate": 9.972274737670702e-06,
"loss": 1.4033,
"step": 285
},
{
"epoch": 0.13260173754000915,
"grad_norm": 0.8834746515415843,
"learning_rate": 9.967919487104237e-06,
"loss": 1.3724,
"step": 290
},
{
"epoch": 0.13488797439414724,
"grad_norm": 0.8166186304734012,
"learning_rate": 9.963247745213876e-06,
"loss": 1.3721,
"step": 295
},
{
"epoch": 0.13717421124828533,
"grad_norm": 0.6771475216933378,
"learning_rate": 9.958259809622353e-06,
"loss": 1.3555,
"step": 300
},
{
"epoch": 0.13946044810242342,
"grad_norm": 0.60525762012324,
"learning_rate": 9.952955998096155e-06,
"loss": 1.36,
"step": 305
},
{
"epoch": 0.1417466849565615,
"grad_norm": 0.6126617626167846,
"learning_rate": 9.94733664852529e-06,
"loss": 1.353,
"step": 310
},
{
"epoch": 0.1440329218106996,
"grad_norm": 0.6630794657190928,
"learning_rate": 9.941402118901743e-06,
"loss": 1.3359,
"step": 315
},
{
"epoch": 0.1463191586648377,
"grad_norm": 0.6758533351396738,
"learning_rate": 9.935152787296689e-06,
"loss": 1.3402,
"step": 320
},
{
"epoch": 0.14860539551897575,
"grad_norm": 0.739719330356037,
"learning_rate": 9.928589051836392e-06,
"loss": 1.3346,
"step": 325
},
{
"epoch": 0.15089163237311384,
"grad_norm": 0.7258290118963521,
"learning_rate": 9.921711330676848e-06,
"loss": 1.3356,
"step": 330
},
{
"epoch": 0.15317786922725193,
"grad_norm": 0.6274092924270468,
"learning_rate": 9.91452006197715e-06,
"loss": 1.3362,
"step": 335
},
{
"epoch": 0.15546410608139002,
"grad_norm": 0.768028072114212,
"learning_rate": 9.907015703871558e-06,
"loss": 1.3214,
"step": 340
},
{
"epoch": 0.15775034293552812,
"grad_norm": 0.7738373400419118,
"learning_rate": 9.899198734440335e-06,
"loss": 1.331,
"step": 345
},
{
"epoch": 0.1600365797896662,
"grad_norm": 0.6855410863811031,
"learning_rate": 9.891069651679273e-06,
"loss": 1.3142,
"step": 350
},
{
"epoch": 0.1623228166438043,
"grad_norm": 0.6405023247699122,
"learning_rate": 9.882628973467972e-06,
"loss": 1.3171,
"step": 355
},
{
"epoch": 0.1646090534979424,
"grad_norm": 0.6764400756880153,
"learning_rate": 9.873877237536854e-06,
"loss": 1.3189,
"step": 360
},
{
"epoch": 0.16689529035208048,
"grad_norm": 0.6298462983903607,
"learning_rate": 9.86481500143289e-06,
"loss": 1.3059,
"step": 365
},
{
"epoch": 0.16918152720621857,
"grad_norm": 0.6606697771559132,
"learning_rate": 9.855442842484101e-06,
"loss": 1.3267,
"step": 370
},
{
"epoch": 0.17146776406035666,
"grad_norm": 0.5895037669135822,
"learning_rate": 9.84576135776276e-06,
"loss": 1.3057,
"step": 375
},
{
"epoch": 0.17375400091449475,
"grad_norm": 0.5762405642901876,
"learning_rate": 9.835771164047365e-06,
"loss": 1.3016,
"step": 380
},
{
"epoch": 0.17604023776863284,
"grad_norm": 0.6301891918568133,
"learning_rate": 9.825472897783344e-06,
"loss": 1.3046,
"step": 385
},
{
"epoch": 0.17832647462277093,
"grad_norm": 0.6189017845225122,
"learning_rate": 9.814867215042503e-06,
"loss": 1.3089,
"step": 390
},
{
"epoch": 0.18061271147690902,
"grad_norm": 0.6279515665165573,
"learning_rate": 9.803954791481239e-06,
"loss": 1.3011,
"step": 395
},
{
"epoch": 0.18289894833104708,
"grad_norm": 0.6380039476156935,
"learning_rate": 9.792736322297489e-06,
"loss": 1.2758,
"step": 400
},
{
"epoch": 0.18518518518518517,
"grad_norm": 0.7506004279154695,
"learning_rate": 9.781212522186442e-06,
"loss": 1.312,
"step": 405
},
{
"epoch": 0.18747142203932327,
"grad_norm": 0.7054181242720778,
"learning_rate": 9.769384125295012e-06,
"loss": 1.3112,
"step": 410
},
{
"epoch": 0.18975765889346136,
"grad_norm": 0.5797880483237029,
"learning_rate": 9.757251885175063e-06,
"loss": 1.2998,
"step": 415
},
{
"epoch": 0.19204389574759945,
"grad_norm": 0.6040659600524477,
"learning_rate": 9.744816574735405e-06,
"loss": 1.3018,
"step": 420
},
{
"epoch": 0.19433013260173754,
"grad_norm": 0.7044299546094256,
"learning_rate": 9.732078986192552e-06,
"loss": 1.2818,
"step": 425
},
{
"epoch": 0.19661636945587563,
"grad_norm": 0.567841572649114,
"learning_rate": 9.719039931020258e-06,
"loss": 1.2733,
"step": 430
},
{
"epoch": 0.19890260631001372,
"grad_norm": 0.5378351616772565,
"learning_rate": 9.705700239897809e-06,
"loss": 1.2861,
"step": 435
},
{
"epoch": 0.2011888431641518,
"grad_norm": 0.5372339490006793,
"learning_rate": 9.692060762657118e-06,
"loss": 1.2821,
"step": 440
},
{
"epoch": 0.2034750800182899,
"grad_norm": 0.6353680076674888,
"learning_rate": 9.678122368228571e-06,
"loss": 1.2643,
"step": 445
},
{
"epoch": 0.205761316872428,
"grad_norm": 0.6263499547366734,
"learning_rate": 9.66388594458568e-06,
"loss": 1.2826,
"step": 450
},
{
"epoch": 0.20804755372656608,
"grad_norm": 0.6119180746423146,
"learning_rate": 9.649352398688506e-06,
"loss": 1.2856,
"step": 455
},
{
"epoch": 0.21033379058070417,
"grad_norm": 0.6640618234127624,
"learning_rate": 9.634522656425885e-06,
"loss": 1.2765,
"step": 460
},
{
"epoch": 0.21262002743484226,
"grad_norm": 0.6253602428713037,
"learning_rate": 9.619397662556434e-06,
"loss": 1.2661,
"step": 465
},
{
"epoch": 0.21490626428898033,
"grad_norm": 0.6463257272674591,
"learning_rate": 9.603978380648375e-06,
"loss": 1.2838,
"step": 470
},
{
"epoch": 0.21719250114311842,
"grad_norm": 0.6916869993480118,
"learning_rate": 9.588265793018141e-06,
"loss": 1.2785,
"step": 475
},
{
"epoch": 0.2194787379972565,
"grad_norm": 0.578420093141111,
"learning_rate": 9.572260900667794e-06,
"loss": 1.2627,
"step": 480
},
{
"epoch": 0.2217649748513946,
"grad_norm": 0.6016744117162259,
"learning_rate": 9.555964723221258e-06,
"loss": 1.2672,
"step": 485
},
{
"epoch": 0.2240512117055327,
"grad_norm": 0.6325422647436533,
"learning_rate": 9.539378298859365e-06,
"loss": 1.2667,
"step": 490
},
{
"epoch": 0.22633744855967078,
"grad_norm": 0.674420764332063,
"learning_rate": 9.522502684253709e-06,
"loss": 1.2601,
"step": 495
},
{
"epoch": 0.22862368541380887,
"grad_norm": 0.6942742236531446,
"learning_rate": 9.505338954499332e-06,
"loss": 1.275,
"step": 500
},
{
"epoch": 0.23090992226794696,
"grad_norm": 0.5661617220667517,
"learning_rate": 9.487888203046232e-06,
"loss": 1.2683,
"step": 505
},
{
"epoch": 0.23319615912208505,
"grad_norm": 0.6389133947347537,
"learning_rate": 9.4701515416297e-06,
"loss": 1.2659,
"step": 510
},
{
"epoch": 0.23548239597622314,
"grad_norm": 0.561786602813537,
"learning_rate": 9.452130100199504e-06,
"loss": 1.2664,
"step": 515
},
{
"epoch": 0.23776863283036123,
"grad_norm": 0.5666699221383189,
"learning_rate": 9.433825026847891e-06,
"loss": 1.2573,
"step": 520
},
{
"epoch": 0.24005486968449932,
"grad_norm": 0.6718711112993888,
"learning_rate": 9.415237487736452e-06,
"loss": 1.2545,
"step": 525
},
{
"epoch": 0.2423411065386374,
"grad_norm": 0.5637527283960878,
"learning_rate": 9.396368667021835e-06,
"loss": 1.2723,
"step": 530
},
{
"epoch": 0.2446273433927755,
"grad_norm": 0.583426898925874,
"learning_rate": 9.377219766780288e-06,
"loss": 1.2473,
"step": 535
},
{
"epoch": 0.24691358024691357,
"grad_norm": 0.7422622561747031,
"learning_rate": 9.3577920069311e-06,
"loss": 1.2609,
"step": 540
},
{
"epoch": 0.24919981710105166,
"grad_norm": 0.7536416453907702,
"learning_rate": 9.338086625158867e-06,
"loss": 1.2655,
"step": 545
},
{
"epoch": 0.25148605395518975,
"grad_norm": 0.5911621999933799,
"learning_rate": 9.318104876834652e-06,
"loss": 1.2652,
"step": 550
},
{
"epoch": 0.25377229080932784,
"grad_norm": 0.6482915887304207,
"learning_rate": 9.297848034936007e-06,
"loss": 1.2488,
"step": 555
},
{
"epoch": 0.25605852766346593,
"grad_norm": 0.7813862221549358,
"learning_rate": 9.277317389965871e-06,
"loss": 1.2678,
"step": 560
},
{
"epoch": 0.258344764517604,
"grad_norm": 0.601959447185496,
"learning_rate": 9.256514249870366e-06,
"loss": 1.2549,
"step": 565
},
{
"epoch": 0.2606310013717421,
"grad_norm": 0.5439593292691556,
"learning_rate": 9.235439939955458e-06,
"loss": 1.2311,
"step": 570
},
{
"epoch": 0.2629172382258802,
"grad_norm": 0.6462948109732727,
"learning_rate": 9.214095802802533e-06,
"loss": 1.2605,
"step": 575
},
{
"epoch": 0.2652034750800183,
"grad_norm": 0.6523908850821281,
"learning_rate": 9.192483198182876e-06,
"loss": 1.2577,
"step": 580
},
{
"epoch": 0.2674897119341564,
"grad_norm": 0.6285230592028435,
"learning_rate": 9.170603502971017e-06,
"loss": 1.233,
"step": 585
},
{
"epoch": 0.2697759487882945,
"grad_norm": 0.5990676661488948,
"learning_rate": 9.148458111057043e-06,
"loss": 1.2444,
"step": 590
},
{
"epoch": 0.27206218564243256,
"grad_norm": 0.5443537881683997,
"learning_rate": 9.12604843325778e-06,
"loss": 1.2282,
"step": 595
},
{
"epoch": 0.27434842249657065,
"grad_norm": 0.5804764131758829,
"learning_rate": 9.103375897226919e-06,
"loss": 1.253,
"step": 600
},
{
"epoch": 0.27663465935070874,
"grad_norm": 0.5905170219986889,
"learning_rate": 9.080441947364065e-06,
"loss": 1.2472,
"step": 605
},
{
"epoch": 0.27892089620484684,
"grad_norm": 0.6003218456115103,
"learning_rate": 9.057248044722718e-06,
"loss": 1.2421,
"step": 610
},
{
"epoch": 0.2812071330589849,
"grad_norm": 0.5683857920528798,
"learning_rate": 9.033795666917191e-06,
"loss": 1.2551,
"step": 615
},
{
"epoch": 0.283493369913123,
"grad_norm": 0.5908776822300396,
"learning_rate": 9.010086308028487e-06,
"loss": 1.2375,
"step": 620
},
{
"epoch": 0.2857796067672611,
"grad_norm": 0.6118010788168986,
"learning_rate": 8.986121478509096e-06,
"loss": 1.2347,
"step": 625
},
{
"epoch": 0.2880658436213992,
"grad_norm": 0.5787813457678733,
"learning_rate": 8.961902705086785e-06,
"loss": 1.2395,
"step": 630
},
{
"epoch": 0.2903520804755373,
"grad_norm": 0.6290839595278495,
"learning_rate": 8.937431530667329e-06,
"loss": 1.2263,
"step": 635
},
{
"epoch": 0.2926383173296754,
"grad_norm": 0.5459763353494508,
"learning_rate": 8.912709514236218e-06,
"loss": 1.2285,
"step": 640
},
{
"epoch": 0.29492455418381347,
"grad_norm": 0.6301840515917086,
"learning_rate": 8.887738230759334e-06,
"loss": 1.2374,
"step": 645
},
{
"epoch": 0.2972107910379515,
"grad_norm": 0.5413584040020849,
"learning_rate": 8.862519271082624e-06,
"loss": 1.2505,
"step": 650
},
{
"epoch": 0.2994970278920896,
"grad_norm": 0.5979355091788396,
"learning_rate": 8.83705424183074e-06,
"loss": 1.2238,
"step": 655
},
{
"epoch": 0.3017832647462277,
"grad_norm": 0.6873493941298675,
"learning_rate": 8.811344765304698e-06,
"loss": 1.2262,
"step": 660
},
{
"epoch": 0.3040695016003658,
"grad_norm": 0.6699975954695512,
"learning_rate": 8.785392479378522e-06,
"loss": 1.23,
"step": 665
},
{
"epoch": 0.30635573845450387,
"grad_norm": 0.6860546025784545,
"learning_rate": 8.759199037394888e-06,
"loss": 1.2424,
"step": 670
},
{
"epoch": 0.30864197530864196,
"grad_norm": 0.7598573834174616,
"learning_rate": 8.732766108059814e-06,
"loss": 1.2138,
"step": 675
},
{
"epoch": 0.31092821216278005,
"grad_norm": 0.723323270057115,
"learning_rate": 8.70609537533634e-06,
"loss": 1.2373,
"step": 680
},
{
"epoch": 0.31321444901691814,
"grad_norm": 0.6170455054157933,
"learning_rate": 8.679188538337248e-06,
"loss": 1.2257,
"step": 685
},
{
"epoch": 0.31550068587105623,
"grad_norm": 0.7413957440287698,
"learning_rate": 8.652047311216823e-06,
"loss": 1.2075,
"step": 690
},
{
"epoch": 0.3177869227251943,
"grad_norm": 0.7424365012242525,
"learning_rate": 8.62467342306164e-06,
"loss": 1.2238,
"step": 695
},
{
"epoch": 0.3200731595793324,
"grad_norm": 0.8566227798899636,
"learning_rate": 8.597068617780419e-06,
"loss": 1.2278,
"step": 700
},
{
"epoch": 0.3223593964334705,
"grad_norm": 0.647075376724737,
"learning_rate": 8.569234653992916e-06,
"loss": 1.2407,
"step": 705
},
{
"epoch": 0.3246456332876086,
"grad_norm": 0.6249088936722902,
"learning_rate": 8.541173304917895e-06,
"loss": 1.2231,
"step": 710
},
{
"epoch": 0.3269318701417467,
"grad_norm": 0.70817264277616,
"learning_rate": 8.512886358260162e-06,
"loss": 1.2345,
"step": 715
},
{
"epoch": 0.3292181069958848,
"grad_norm": 0.5956107721750036,
"learning_rate": 8.484375616096658e-06,
"loss": 1.225,
"step": 720
},
{
"epoch": 0.33150434385002286,
"grad_norm": 0.6062042871270218,
"learning_rate": 8.455642894761684e-06,
"loss": 1.2185,
"step": 725
},
{
"epoch": 0.33379058070416096,
"grad_norm": 0.66611343630398,
"learning_rate": 8.426690024731161e-06,
"loss": 1.2171,
"step": 730
},
{
"epoch": 0.33607681755829905,
"grad_norm": 0.6006939272932527,
"learning_rate": 8.39751885050603e-06,
"loss": 1.2168,
"step": 735
},
{
"epoch": 0.33836305441243714,
"grad_norm": 0.5888998376074026,
"learning_rate": 8.36813123049474e-06,
"loss": 1.2447,
"step": 740
},
{
"epoch": 0.3406492912665752,
"grad_norm": 0.6170255283448466,
"learning_rate": 8.338529036894855e-06,
"loss": 1.2386,
"step": 745
},
{
"epoch": 0.3429355281207133,
"grad_norm": 0.6592250171561639,
"learning_rate": 8.308714155573785e-06,
"loss": 1.2095,
"step": 750
},
{
"epoch": 0.3452217649748514,
"grad_norm": 0.5948350472440084,
"learning_rate": 8.278688485948634e-06,
"loss": 1.2204,
"step": 755
},
{
"epoch": 0.3475080018289895,
"grad_norm": 0.6884759018973265,
"learning_rate": 8.248453940865204e-06,
"loss": 1.2205,
"step": 760
},
{
"epoch": 0.3497942386831276,
"grad_norm": 0.5629453296642776,
"learning_rate": 8.218012446476128e-06,
"loss": 1.2087,
"step": 765
},
{
"epoch": 0.3520804755372657,
"grad_norm": 0.5703699859674032,
"learning_rate": 8.187365942118162e-06,
"loss": 1.2038,
"step": 770
},
{
"epoch": 0.35436671239140377,
"grad_norm": 0.5758055939006159,
"learning_rate": 8.156516380188635e-06,
"loss": 1.2015,
"step": 775
},
{
"epoch": 0.35665294924554186,
"grad_norm": 0.6814380489670292,
"learning_rate": 8.125465726021068e-06,
"loss": 1.2267,
"step": 780
},
{
"epoch": 0.35893918609967995,
"grad_norm": 0.58819101648096,
"learning_rate": 8.09421595775997e-06,
"loss": 1.2065,
"step": 785
},
{
"epoch": 0.36122542295381804,
"grad_norm": 0.599220106737159,
"learning_rate": 8.062769066234807e-06,
"loss": 1.2084,
"step": 790
},
{
"epoch": 0.3635116598079561,
"grad_norm": 0.5687079813226833,
"learning_rate": 8.031127054833192e-06,
"loss": 1.2311,
"step": 795
},
{
"epoch": 0.36579789666209417,
"grad_norm": 0.6076443328436887,
"learning_rate": 7.999291939373232e-06,
"loss": 1.209,
"step": 800
},
{
"epoch": 0.36808413351623226,
"grad_norm": 0.5767468288489239,
"learning_rate": 7.967265747975124e-06,
"loss": 1.2153,
"step": 805
},
{
"epoch": 0.37037037037037035,
"grad_norm": 0.6275130557605428,
"learning_rate": 7.93505052093194e-06,
"loss": 1.2206,
"step": 810
},
{
"epoch": 0.37265660722450844,
"grad_norm": 0.5920904031157348,
"learning_rate": 7.90264831057965e-06,
"loss": 1.2149,
"step": 815
},
{
"epoch": 0.37494284407864653,
"grad_norm": 0.5841477404583847,
"learning_rate": 7.870061181166372e-06,
"loss": 1.2134,
"step": 820
},
{
"epoch": 0.3772290809327846,
"grad_norm": 0.545565275285448,
"learning_rate": 7.837291208720867e-06,
"loss": 1.2185,
"step": 825
},
{
"epoch": 0.3795153177869227,
"grad_norm": 0.6183231148929101,
"learning_rate": 7.804340480920274e-06,
"loss": 1.2064,
"step": 830
},
{
"epoch": 0.3818015546410608,
"grad_norm": 0.5801259298558049,
"learning_rate": 7.771211096957125e-06,
"loss": 1.2049,
"step": 835
},
{
"epoch": 0.3840877914951989,
"grad_norm": 0.579347207611424,
"learning_rate": 7.737905167405596e-06,
"loss": 1.2185,
"step": 840
},
{
"epoch": 0.386374028349337,
"grad_norm": 0.6262921976973932,
"learning_rate": 7.704424814087056e-06,
"loss": 1.2137,
"step": 845
},
{
"epoch": 0.3886602652034751,
"grad_norm": 0.6070706881138944,
"learning_rate": 7.670772169934902e-06,
"loss": 1.2177,
"step": 850
},
{
"epoch": 0.39094650205761317,
"grad_norm": 0.5688216055326876,
"learning_rate": 7.636949378858647e-06,
"loss": 1.2016,
"step": 855
},
{
"epoch": 0.39323273891175126,
"grad_norm": 0.6166249078020826,
"learning_rate": 7.602958595607375e-06,
"loss": 1.1957,
"step": 860
},
{
"epoch": 0.39551897576588935,
"grad_norm": 0.5778886288472463,
"learning_rate": 7.568801985632439e-06,
"loss": 1.2105,
"step": 865
},
{
"epoch": 0.39780521262002744,
"grad_norm": 0.6732218435967291,
"learning_rate": 7.5344817249495195e-06,
"loss": 1.2047,
"step": 870
},
{
"epoch": 0.40009144947416553,
"grad_norm": 0.672208759556888,
"learning_rate": 7.500000000000001e-06,
"loss": 1.1854,
"step": 875
},
{
"epoch": 0.4023776863283036,
"grad_norm": 0.6180565492464766,
"learning_rate": 7.465359007511667e-06,
"loss": 1.185,
"step": 880
},
{
"epoch": 0.4046639231824417,
"grad_norm": 0.6266745151721254,
"learning_rate": 7.430560954358764e-06,
"loss": 1.2082,
"step": 885
},
{
"epoch": 0.4069501600365798,
"grad_norm": 0.6163182978581346,
"learning_rate": 7.395608057421406e-06,
"loss": 1.2194,
"step": 890
},
{
"epoch": 0.4092363968907179,
"grad_norm": 0.6262674693601461,
"learning_rate": 7.360502543444339e-06,
"loss": 1.2188,
"step": 895
},
{
"epoch": 0.411522633744856,
"grad_norm": 0.5549642780561265,
"learning_rate": 7.325246648895089e-06,
"loss": 1.1986,
"step": 900
},
{
"epoch": 0.41380887059899407,
"grad_norm": 0.5540368046559051,
"learning_rate": 7.289842619821475e-06,
"loss": 1.2175,
"step": 905
},
{
"epoch": 0.41609510745313216,
"grad_norm": 0.587023330497459,
"learning_rate": 7.254292711708529e-06,
"loss": 1.2029,
"step": 910
},
{
"epoch": 0.41838134430727025,
"grad_norm": 0.5513581130094706,
"learning_rate": 7.218599189334799e-06,
"loss": 1.2009,
"step": 915
},
{
"epoch": 0.42066758116140834,
"grad_norm": 0.7237520794327035,
"learning_rate": 7.182764326628068e-06,
"loss": 1.2063,
"step": 920
},
{
"epoch": 0.42295381801554643,
"grad_norm": 0.5476819110298711,
"learning_rate": 7.146790406520491e-06,
"loss": 1.2107,
"step": 925
},
{
"epoch": 0.4252400548696845,
"grad_norm": 0.5753924094787153,
"learning_rate": 7.1106797208031554e-06,
"loss": 1.2133,
"step": 930
},
{
"epoch": 0.4275262917238226,
"grad_norm": 0.6489054914059448,
"learning_rate": 7.0744345699800755e-06,
"loss": 1.1991,
"step": 935
},
{
"epoch": 0.42981252857796065,
"grad_norm": 0.6239602498665449,
"learning_rate": 7.038057263121639e-06,
"loss": 1.1937,
"step": 940
},
{
"epoch": 0.43209876543209874,
"grad_norm": 0.5954140813357963,
"learning_rate": 7.001550117717499e-06,
"loss": 1.2092,
"step": 945
},
{
"epoch": 0.43438500228623683,
"grad_norm": 0.5953175778315464,
"learning_rate": 6.9649154595289326e-06,
"loss": 1.1957,
"step": 950
},
{
"epoch": 0.4366712391403749,
"grad_norm": 0.6030938627687562,
"learning_rate": 6.92815562244068e-06,
"loss": 1.1827,
"step": 955
},
{
"epoch": 0.438957475994513,
"grad_norm": 0.6882999466791362,
"learning_rate": 6.891272948312251e-06,
"loss": 1.2102,
"step": 960
},
{
"epoch": 0.4412437128486511,
"grad_norm": 0.6080281045836577,
"learning_rate": 6.854269786828741e-06,
"loss": 1.2093,
"step": 965
},
{
"epoch": 0.4435299497027892,
"grad_norm": 0.756192409869553,
"learning_rate": 6.817148495351131e-06,
"loss": 1.2159,
"step": 970
},
{
"epoch": 0.4458161865569273,
"grad_norm": 0.5892520162590819,
"learning_rate": 6.779911438766117e-06,
"loss": 1.193,
"step": 975
},
{
"epoch": 0.4481024234110654,
"grad_norm": 0.6265917897470434,
"learning_rate": 6.742560989335438e-06,
"loss": 1.1951,
"step": 980
},
{
"epoch": 0.45038866026520347,
"grad_norm": 0.5927415516536023,
"learning_rate": 6.705099526544757e-06,
"loss": 1.1973,
"step": 985
},
{
"epoch": 0.45267489711934156,
"grad_norm": 0.5602604942191215,
"learning_rate": 6.667529436952064e-06,
"loss": 1.1945,
"step": 990
},
{
"epoch": 0.45496113397347965,
"grad_norm": 0.751574883051813,
"learning_rate": 6.629853114035643e-06,
"loss": 1.2134,
"step": 995
},
{
"epoch": 0.45724737082761774,
"grad_norm": 0.6000318274839507,
"learning_rate": 6.5920729580415795e-06,
"loss": 1.2104,
"step": 1000
},
{
"epoch": 0.45953360768175583,
"grad_norm": 0.5783065549399249,
"learning_rate": 6.554191375830861e-06,
"loss": 1.2016,
"step": 1005
},
{
"epoch": 0.4618198445358939,
"grad_norm": 0.5751980188798808,
"learning_rate": 6.516210780726032e-06,
"loss": 1.1794,
"step": 1010
},
{
"epoch": 0.464106081390032,
"grad_norm": 0.6096335885035103,
"learning_rate": 6.478133592357455e-06,
"loss": 1.1816,
"step": 1015
},
{
"epoch": 0.4663923182441701,
"grad_norm": 0.5848690144740822,
"learning_rate": 6.43996223650916e-06,
"loss": 1.1735,
"step": 1020
},
{
"epoch": 0.4686785550983082,
"grad_norm": 0.6273777569367492,
"learning_rate": 6.401699144964306e-06,
"loss": 1.1864,
"step": 1025
},
{
"epoch": 0.4709647919524463,
"grad_norm": 0.5772389229176554,
"learning_rate": 6.3633467553502625e-06,
"loss": 1.1953,
"step": 1030
},
{
"epoch": 0.4732510288065844,
"grad_norm": 0.6320660706578101,
"learning_rate": 6.32490751098331e-06,
"loss": 1.1778,
"step": 1035
},
{
"epoch": 0.47553726566072246,
"grad_norm": 0.628014857385664,
"learning_rate": 6.286383860712982e-06,
"loss": 1.1978,
"step": 1040
},
{
"epoch": 0.47782350251486055,
"grad_norm": 0.6165011857453245,
"learning_rate": 6.247778258766069e-06,
"loss": 1.1783,
"step": 1045
},
{
"epoch": 0.48010973936899864,
"grad_norm": 0.6680859473813631,
"learning_rate": 6.209093164590253e-06,
"loss": 1.1883,
"step": 1050
},
{
"epoch": 0.48239597622313674,
"grad_norm": 0.6230269069079273,
"learning_rate": 6.170331042697425e-06,
"loss": 1.1923,
"step": 1055
},
{
"epoch": 0.4846822130772748,
"grad_norm": 0.6472681484163015,
"learning_rate": 6.131494362506693e-06,
"loss": 1.1826,
"step": 1060
},
{
"epoch": 0.4869684499314129,
"grad_norm": 0.6799978087591872,
"learning_rate": 6.09258559818704e-06,
"loss": 1.1829,
"step": 1065
},
{
"epoch": 0.489254686785551,
"grad_norm": 0.5617426984448537,
"learning_rate": 6.053607228499719e-06,
"loss": 1.1941,
"step": 1070
},
{
"epoch": 0.4915409236396891,
"grad_norm": 0.6444058153599652,
"learning_rate": 6.014561736640334e-06,
"loss": 1.2,
"step": 1075
},
{
"epoch": 0.49382716049382713,
"grad_norm": 0.6016265988080601,
"learning_rate": 5.975451610080643e-06,
"loss": 1.1655,
"step": 1080
},
{
"epoch": 0.4961133973479652,
"grad_norm": 0.7053148286233416,
"learning_rate": 5.936279340410082e-06,
"loss": 1.172,
"step": 1085
},
{
"epoch": 0.4983996342021033,
"grad_norm": 0.5586357561653685,
"learning_rate": 5.8970474231770445e-06,
"loss": 1.1922,
"step": 1090
},
{
"epoch": 0.5006858710562414,
"grad_norm": 0.7895760074140119,
"learning_rate": 5.857758357729892e-06,
"loss": 1.1839,
"step": 1095
},
{
"epoch": 0.5029721079103795,
"grad_norm": 0.7313666592611404,
"learning_rate": 5.8184146470577265e-06,
"loss": 1.1813,
"step": 1100
},
{
"epoch": 0.5052583447645176,
"grad_norm": 0.6067591576327228,
"learning_rate": 5.779018797630934e-06,
"loss": 1.1855,
"step": 1105
},
{
"epoch": 0.5075445816186557,
"grad_norm": 0.6144330199450508,
"learning_rate": 5.739573319241505e-06,
"loss": 1.1924,
"step": 1110
},
{
"epoch": 0.5098308184727938,
"grad_norm": 0.6075048668745815,
"learning_rate": 5.7000807248431466e-06,
"loss": 1.1783,
"step": 1115
},
{
"epoch": 0.5121170553269319,
"grad_norm": 0.6763365315316732,
"learning_rate": 5.66054353039118e-06,
"loss": 1.1873,
"step": 1120
},
{
"epoch": 0.51440329218107,
"grad_norm": 0.652936999197392,
"learning_rate": 5.620964254682267e-06,
"loss": 1.2019,
"step": 1125
},
{
"epoch": 0.516689529035208,
"grad_norm": 0.7510930690144121,
"learning_rate": 5.58134541919394e-06,
"loss": 1.1863,
"step": 1130
},
{
"epoch": 0.5189757658893461,
"grad_norm": 0.7485282723991191,
"learning_rate": 5.5416895479239665e-06,
"loss": 1.1878,
"step": 1135
},
{
"epoch": 0.5212620027434842,
"grad_norm": 0.6650793765929232,
"learning_rate": 5.501999167229554e-06,
"loss": 1.1844,
"step": 1140
},
{
"epoch": 0.5235482395976223,
"grad_norm": 0.6617004106280673,
"learning_rate": 5.4622768056664e-06,
"loss": 1.1819,
"step": 1145
},
{
"epoch": 0.5258344764517604,
"grad_norm": 0.639306148093516,
"learning_rate": 5.42252499382761e-06,
"loss": 1.1844,
"step": 1150
},
{
"epoch": 0.5281207133058985,
"grad_norm": 0.590573720499581,
"learning_rate": 5.38274626418248e-06,
"loss": 1.1848,
"step": 1155
},
{
"epoch": 0.5304069501600366,
"grad_norm": 0.625235396788826,
"learning_rate": 5.3429431509151515e-06,
"loss": 1.1904,
"step": 1160
},
{
"epoch": 0.5326931870141747,
"grad_norm": 0.5840052674712635,
"learning_rate": 5.303118189763187e-06,
"loss": 1.1829,
"step": 1165
},
{
"epoch": 0.5349794238683128,
"grad_norm": 0.5940842973816081,
"learning_rate": 5.263273917856e-06,
"loss": 1.1774,
"step": 1170
},
{
"epoch": 0.5372656607224509,
"grad_norm": 0.5991239115995499,
"learning_rate": 5.22341287355324e-06,
"loss": 1.1857,
"step": 1175
},
{
"epoch": 0.539551897576589,
"grad_norm": 0.6248756548437343,
"learning_rate": 5.183537596283075e-06,
"loss": 1.1799,
"step": 1180
},
{
"epoch": 0.541838134430727,
"grad_norm": 0.6023807247895316,
"learning_rate": 5.143650626380417e-06,
"loss": 1.1858,
"step": 1185
},
{
"epoch": 0.5441243712848651,
"grad_norm": 0.6101959497751839,
"learning_rate": 5.103754504925071e-06,
"loss": 1.1961,
"step": 1190
},
{
"epoch": 0.5464106081390032,
"grad_norm": 0.569676114190435,
"learning_rate": 5.06385177357987e-06,
"loss": 1.1766,
"step": 1195
},
{
"epoch": 0.5486968449931413,
"grad_norm": 0.5819652008689743,
"learning_rate": 5.023944974428739e-06,
"loss": 1.1734,
"step": 1200
},
{
"epoch": 0.5509830818472794,
"grad_norm": 0.5661449507234365,
"learning_rate": 4.9840366498147495e-06,
"loss": 1.1908,
"step": 1205
},
{
"epoch": 0.5532693187014175,
"grad_norm": 0.6109491726102372,
"learning_rate": 4.944129342178156e-06,
"loss": 1.1784,
"step": 1210
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.5811074689104263,
"learning_rate": 4.90422559389443e-06,
"loss": 1.1746,
"step": 1215
},
{
"epoch": 0.5578417924096937,
"grad_norm": 0.6060458081756667,
"learning_rate": 4.864327947112281e-06,
"loss": 1.195,
"step": 1220
},
{
"epoch": 0.5601280292638318,
"grad_norm": 0.6226718536570417,
"learning_rate": 4.82443894359171e-06,
"loss": 1.1786,
"step": 1225
},
{
"epoch": 0.5624142661179699,
"grad_norm": 0.5995864510713481,
"learning_rate": 4.784561124542088e-06,
"loss": 1.1791,
"step": 1230
},
{
"epoch": 0.5647005029721079,
"grad_norm": 0.5701958838449743,
"learning_rate": 4.744697030460248e-06,
"loss": 1.1647,
"step": 1235
},
{
"epoch": 0.566986739826246,
"grad_norm": 0.6293939505655973,
"learning_rate": 4.7048492009686525e-06,
"loss": 1.1692,
"step": 1240
},
{
"epoch": 0.5692729766803841,
"grad_norm": 0.6850447194966206,
"learning_rate": 4.6650201746535926e-06,
"loss": 1.1673,
"step": 1245
},
{
"epoch": 0.5715592135345222,
"grad_norm": 0.6040120516739561,
"learning_rate": 4.625212488903467e-06,
"loss": 1.1834,
"step": 1250
},
{
"epoch": 0.5738454503886603,
"grad_norm": 0.5686706476550618,
"learning_rate": 4.585428679747133e-06,
"loss": 1.1716,
"step": 1255
},
{
"epoch": 0.5761316872427984,
"grad_norm": 0.5946931657837966,
"learning_rate": 4.545671281692331e-06,
"loss": 1.1705,
"step": 1260
},
{
"epoch": 0.5784179240969365,
"grad_norm": 0.6120143356512502,
"learning_rate": 4.505942827564242e-06,
"loss": 1.1807,
"step": 1265
},
{
"epoch": 0.5807041609510746,
"grad_norm": 0.6341171747185648,
"learning_rate": 4.466245848344106e-06,
"loss": 1.1839,
"step": 1270
},
{
"epoch": 0.5829903978052127,
"grad_norm": 0.6494090868678567,
"learning_rate": 4.426582873007999e-06,
"loss": 1.1684,
"step": 1275
},
{
"epoch": 0.5852766346593508,
"grad_norm": 0.6252524175950205,
"learning_rate": 4.386956428365701e-06,
"loss": 1.1878,
"step": 1280
},
{
"epoch": 0.5875628715134888,
"grad_norm": 0.5911175497758677,
"learning_rate": 4.347369038899744e-06,
"loss": 1.1828,
"step": 1285
},
{
"epoch": 0.5898491083676269,
"grad_norm": 0.5988939599453593,
"learning_rate": 4.307823226604555e-06,
"loss": 1.1735,
"step": 1290
},
{
"epoch": 0.5921353452217649,
"grad_norm": 0.5813355536422021,
"learning_rate": 4.2683215108258145e-06,
"loss": 1.1706,
"step": 1295
},
{
"epoch": 0.594421582075903,
"grad_norm": 0.6208043705991068,
"learning_rate": 4.228866408099945e-06,
"loss": 1.1907,
"step": 1300
},
{
"epoch": 0.5967078189300411,
"grad_norm": 0.6512006631857741,
"learning_rate": 4.189460431993788e-06,
"loss": 1.1951,
"step": 1305
},
{
"epoch": 0.5989940557841792,
"grad_norm": 0.5845471180993255,
"learning_rate": 4.150106092944475e-06,
"loss": 1.1717,
"step": 1310
},
{
"epoch": 0.6012802926383173,
"grad_norm": 0.5949045334275538,
"learning_rate": 4.110805898099492e-06,
"loss": 1.1833,
"step": 1315
},
{
"epoch": 0.6035665294924554,
"grad_norm": 0.5971913414181261,
"learning_rate": 4.071562351156966e-06,
"loss": 1.1786,
"step": 1320
},
{
"epoch": 0.6058527663465935,
"grad_norm": 0.6178601149254982,
"learning_rate": 4.032377952206148e-06,
"loss": 1.1793,
"step": 1325
},
{
"epoch": 0.6081390032007316,
"grad_norm": 0.6046188006147395,
"learning_rate": 3.993255197568154e-06,
"loss": 1.169,
"step": 1330
},
{
"epoch": 0.6104252400548696,
"grad_norm": 0.5919458656130715,
"learning_rate": 3.954196579636918e-06,
"loss": 1.1692,
"step": 1335
},
{
"epoch": 0.6127114769090077,
"grad_norm": 0.5727049539306068,
"learning_rate": 3.91520458672042e-06,
"loss": 1.1747,
"step": 1340
},
{
"epoch": 0.6149977137631458,
"grad_norm": 0.6040809405921704,
"learning_rate": 3.876281702882156e-06,
"loss": 1.1935,
"step": 1345
},
{
"epoch": 0.6172839506172839,
"grad_norm": 0.5747789602798682,
"learning_rate": 3.837430407782896e-06,
"loss": 1.175,
"step": 1350
},
{
"epoch": 0.619570187471422,
"grad_norm": 0.6001909994942644,
"learning_rate": 3.7986531765226965e-06,
"loss": 1.1718,
"step": 1355
},
{
"epoch": 0.6218564243255601,
"grad_norm": 0.5499338552551708,
"learning_rate": 3.759952479483232e-06,
"loss": 1.1615,
"step": 1360
},
{
"epoch": 0.6241426611796982,
"grad_norm": 0.62697610396954,
"learning_rate": 3.7213307821704115e-06,
"loss": 1.1616,
"step": 1365
},
{
"epoch": 0.6264288980338363,
"grad_norm": 0.637904015143814,
"learning_rate": 3.6827905450573022e-06,
"loss": 1.1784,
"step": 1370
},
{
"epoch": 0.6287151348879744,
"grad_norm": 0.6235229612947039,
"learning_rate": 3.6443342234273905e-06,
"loss": 1.1674,
"step": 1375
},
{
"epoch": 0.6310013717421125,
"grad_norm": 0.744429415227132,
"learning_rate": 3.6059642672181537e-06,
"loss": 1.1678,
"step": 1380
},
{
"epoch": 0.6332876085962506,
"grad_norm": 0.5903117671660288,
"learning_rate": 3.5676831208649887e-06,
"loss": 1.1661,
"step": 1385
},
{
"epoch": 0.6355738454503886,
"grad_norm": 0.5977435348831742,
"learning_rate": 3.5294932231454838e-06,
"loss": 1.1655,
"step": 1390
},
{
"epoch": 0.6378600823045267,
"grad_norm": 0.6262251229258455,
"learning_rate": 3.4913970070240388e-06,
"loss": 1.1827,
"step": 1395
},
{
"epoch": 0.6401463191586648,
"grad_norm": 0.6039362156672261,
"learning_rate": 3.4533968994968913e-06,
"loss": 1.162,
"step": 1400
},
{
"epoch": 0.6424325560128029,
"grad_norm": 0.610471777862986,
"learning_rate": 3.41549532143748e-06,
"loss": 1.1719,
"step": 1405
},
{
"epoch": 0.644718792866941,
"grad_norm": 0.6124948412563855,
"learning_rate": 3.3776946874422268e-06,
"loss": 1.161,
"step": 1410
},
{
"epoch": 0.6470050297210791,
"grad_norm": 0.596054515528405,
"learning_rate": 3.3399974056767095e-06,
"loss": 1.1677,
"step": 1415
},
{
"epoch": 0.6492912665752172,
"grad_norm": 0.6199519548446956,
"learning_rate": 3.30240587772224e-06,
"loss": 1.1731,
"step": 1420
},
{
"epoch": 0.6515775034293553,
"grad_norm": 0.6123382818220521,
"learning_rate": 3.2649224984228756e-06,
"loss": 1.1751,
"step": 1425
},
{
"epoch": 0.6538637402834934,
"grad_norm": 0.6521756883889377,
"learning_rate": 3.227549655732843e-06,
"loss": 1.1746,
"step": 1430
},
{
"epoch": 0.6561499771376315,
"grad_norm": 0.6292502440238857,
"learning_rate": 3.19028973056441e-06,
"loss": 1.1796,
"step": 1435
},
{
"epoch": 0.6584362139917695,
"grad_norm": 0.7223300006546375,
"learning_rate": 3.153145096636211e-06,
"loss": 1.1769,
"step": 1440
},
{
"epoch": 0.6607224508459076,
"grad_norm": 0.6123252900962536,
"learning_rate": 3.1161181203220146e-06,
"loss": 1.1798,
"step": 1445
},
{
"epoch": 0.6630086877000457,
"grad_norm": 0.6176590524451245,
"learning_rate": 3.079211160499975e-06,
"loss": 1.1628,
"step": 1450
},
{
"epoch": 0.6652949245541838,
"grad_norm": 0.6851380779593121,
"learning_rate": 3.0424265684023556e-06,
"loss": 1.1621,
"step": 1455
},
{
"epoch": 0.6675811614083219,
"grad_norm": 0.6135186798564677,
"learning_rate": 3.0057666874657365e-06,
"loss": 1.1817,
"step": 1460
},
{
"epoch": 0.66986739826246,
"grad_norm": 0.6162664151552476,
"learning_rate": 2.9692338531817205e-06,
"loss": 1.1621,
"step": 1465
},
{
"epoch": 0.6721536351165981,
"grad_norm": 0.6209879083469707,
"learning_rate": 2.9328303929481507e-06,
"loss": 1.1788,
"step": 1470
},
{
"epoch": 0.6744398719707362,
"grad_norm": 0.6564960801220917,
"learning_rate": 2.8965586259208295e-06,
"loss": 1.1497,
"step": 1475
},
{
"epoch": 0.6767261088248743,
"grad_norm": 0.6100366044161921,
"learning_rate": 2.860420862865787e-06,
"loss": 1.1641,
"step": 1480
},
{
"epoch": 0.6790123456790124,
"grad_norm": 0.6401282278697755,
"learning_rate": 2.82441940601205e-06,
"loss": 1.1647,
"step": 1485
},
{
"epoch": 0.6812985825331505,
"grad_norm": 0.5948814066139619,
"learning_rate": 2.7885565489049948e-06,
"loss": 1.1862,
"step": 1490
},
{
"epoch": 0.6835848193872885,
"grad_norm": 0.575891260626997,
"learning_rate": 2.7528345762602125e-06,
"loss": 1.149,
"step": 1495
},
{
"epoch": 0.6858710562414266,
"grad_norm": 0.6321328549868929,
"learning_rate": 2.7172557638179674e-06,
"loss": 1.1722,
"step": 1500
},
{
"epoch": 0.6881572930955647,
"grad_norm": 0.620537429422375,
"learning_rate": 2.681822378198221e-06,
"loss": 1.1667,
"step": 1505
},
{
"epoch": 0.6904435299497028,
"grad_norm": 0.5916688359774108,
"learning_rate": 2.6465366767562162e-06,
"loss": 1.1742,
"step": 1510
},
{
"epoch": 0.6927297668038409,
"grad_norm": 0.649532932905328,
"learning_rate": 2.611400907438685e-06,
"loss": 1.1664,
"step": 1515
},
{
"epoch": 0.695016003657979,
"grad_norm": 0.5887639490410209,
"learning_rate": 2.5764173086406306e-06,
"loss": 1.1684,
"step": 1520
},
{
"epoch": 0.6973022405121171,
"grad_norm": 0.5909674256777088,
"learning_rate": 2.5415881090627227e-06,
"loss": 1.1681,
"step": 1525
},
{
"epoch": 0.6995884773662552,
"grad_norm": 0.6669572713903603,
"learning_rate": 2.506915527569318e-06,
"loss": 1.1692,
"step": 1530
},
{
"epoch": 0.7018747142203933,
"grad_norm": 0.6291006193664693,
"learning_rate": 2.472401773047107e-06,
"loss": 1.1707,
"step": 1535
},
{
"epoch": 0.7041609510745314,
"grad_norm": 0.6241336853751712,
"learning_rate": 2.438049044264382e-06,
"loss": 1.1763,
"step": 1540
},
{
"epoch": 0.7064471879286695,
"grad_norm": 0.6233093811845397,
"learning_rate": 2.4038595297309712e-06,
"loss": 1.1595,
"step": 1545
},
{
"epoch": 0.7087334247828075,
"grad_norm": 0.6099376654855213,
"learning_rate": 2.3698354075588105e-06,
"loss": 1.1815,
"step": 1550
},
{
"epoch": 0.7110196616369456,
"grad_norm": 0.608739940642273,
"learning_rate": 2.3359788453231723e-06,
"loss": 1.1558,
"step": 1555
},
{
"epoch": 0.7133058984910837,
"grad_norm": 0.6060804682823651,
"learning_rate": 2.3022919999245964e-06,
"loss": 1.1737,
"step": 1560
},
{
"epoch": 0.7155921353452218,
"grad_norm": 0.6554029837627439,
"learning_rate": 2.2687770174514674e-06,
"loss": 1.1763,
"step": 1565
},
{
"epoch": 0.7178783721993599,
"grad_norm": 0.6199763037940721,
"learning_rate": 2.23543603304329e-06,
"loss": 1.1668,
"step": 1570
},
{
"epoch": 0.720164609053498,
"grad_norm": 0.7002533112076955,
"learning_rate": 2.20227117075468e-06,
"loss": 1.1717,
"step": 1575
},
{
"epoch": 0.7224508459076361,
"grad_norm": 0.5685258465602809,
"learning_rate": 2.1692845434200323e-06,
"loss": 1.1793,
"step": 1580
},
{
"epoch": 0.7247370827617741,
"grad_norm": 0.5988803647429354,
"learning_rate": 2.136478252518924e-06,
"loss": 1.1762,
"step": 1585
},
{
"epoch": 0.7270233196159122,
"grad_norm": 0.6220944262982843,
"learning_rate": 2.103854388042243e-06,
"loss": 1.1732,
"step": 1590
},
{
"epoch": 0.7293095564700502,
"grad_norm": 0.5872374752551915,
"learning_rate": 2.071415028359026e-06,
"loss": 1.1653,
"step": 1595
},
{
"epoch": 0.7315957933241883,
"grad_norm": 0.6315378201627972,
"learning_rate": 2.0391622400840665e-06,
"loss": 1.1631,
"step": 1600
},
{
"epoch": 0.7338820301783264,
"grad_norm": 0.6166479295990325,
"learning_rate": 2.0070980779462513e-06,
"loss": 1.1632,
"step": 1605
},
{
"epoch": 0.7361682670324645,
"grad_norm": 0.6082820756952414,
"learning_rate": 1.975224584657648e-06,
"loss": 1.1609,
"step": 1610
},
{
"epoch": 0.7384545038866026,
"grad_norm": 0.5711567863660318,
"learning_rate": 1.943543790783392e-06,
"loss": 1.1629,
"step": 1615
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.5934876997772376,
"learning_rate": 1.9120577146123125e-06,
"loss": 1.1711,
"step": 1620
},
{
"epoch": 0.7430269775948788,
"grad_norm": 0.6044258229955937,
"learning_rate": 1.8807683620283496e-06,
"loss": 1.1792,
"step": 1625
},
{
"epoch": 0.7453132144490169,
"grad_norm": 0.6414108282805848,
"learning_rate": 1.8496777263827775e-06,
"loss": 1.1909,
"step": 1630
},
{
"epoch": 0.747599451303155,
"grad_norm": 0.5928077840962543,
"learning_rate": 1.8187877883672024e-06,
"loss": 1.177,
"step": 1635
},
{
"epoch": 0.7498856881572931,
"grad_norm": 0.5674967348667851,
"learning_rate": 1.7881005158873826e-06,
"loss": 1.1698,
"step": 1640
},
{
"epoch": 0.7521719250114312,
"grad_norm": 0.6190325214784786,
"learning_rate": 1.757617863937865e-06,
"loss": 1.1564,
"step": 1645
},
{
"epoch": 0.7544581618655692,
"grad_norm": 0.5994621485851359,
"learning_rate": 1.7273417744774323e-06,
"loss": 1.1682,
"step": 1650
},
{
"epoch": 0.7567443987197073,
"grad_norm": 0.6486512119864596,
"learning_rate": 1.6972741763053835e-06,
"loss": 1.1695,
"step": 1655
},
{
"epoch": 0.7590306355738454,
"grad_norm": 0.6124244446703457,
"learning_rate": 1.6674169849386606e-06,
"loss": 1.1735,
"step": 1660
},
{
"epoch": 0.7613168724279835,
"grad_norm": 0.6215393083401685,
"learning_rate": 1.6377721024898214e-06,
"loss": 1.1611,
"step": 1665
},
{
"epoch": 0.7636031092821216,
"grad_norm": 0.6379465283211975,
"learning_rate": 1.608341417545849e-06,
"loss": 1.1481,
"step": 1670
},
{
"epoch": 0.7658893461362597,
"grad_norm": 0.5646658898706897,
"learning_rate": 1.5791268050478487e-06,
"loss": 1.1732,
"step": 1675
},
{
"epoch": 0.7681755829903978,
"grad_norm": 0.6028441016085894,
"learning_rate": 1.5501301261715896e-06,
"loss": 1.1703,
"step": 1680
},
{
"epoch": 0.7704618198445359,
"grad_norm": 0.6313316478647917,
"learning_rate": 1.5213532282089466e-06,
"loss": 1.1631,
"step": 1685
},
{
"epoch": 0.772748056698674,
"grad_norm": 0.600237347487572,
"learning_rate": 1.4927979444502028e-06,
"loss": 1.1642,
"step": 1690
},
{
"epoch": 0.7750342935528121,
"grad_norm": 0.5957448361281138,
"learning_rate": 1.4644660940672628e-06,
"loss": 1.1668,
"step": 1695
},
{
"epoch": 0.7773205304069501,
"grad_norm": 0.5872437663700951,
"learning_rate": 1.4363594819977606e-06,
"loss": 1.1707,
"step": 1700
},
{
"epoch": 0.7796067672610882,
"grad_norm": 0.7075549655922131,
"learning_rate": 1.4084798988300684e-06,
"loss": 1.1723,
"step": 1705
},
{
"epoch": 0.7818930041152263,
"grad_norm": 0.6203199463017092,
"learning_rate": 1.3808291206892232e-06,
"loss": 1.1668,
"step": 1710
},
{
"epoch": 0.7841792409693644,
"grad_norm": 0.5759538308213393,
"learning_rate": 1.3534089091237757e-06,
"loss": 1.1598,
"step": 1715
},
{
"epoch": 0.7864654778235025,
"grad_norm": 0.5942123152988342,
"learning_rate": 1.3262210109935719e-06,
"loss": 1.1699,
"step": 1720
},
{
"epoch": 0.7887517146776406,
"grad_norm": 0.6597153339968819,
"learning_rate": 1.2992671583584587e-06,
"loss": 1.163,
"step": 1725
},
{
"epoch": 0.7910379515317787,
"grad_norm": 0.5994756887911626,
"learning_rate": 1.2725490683679458e-06,
"loss": 1.1797,
"step": 1730
},
{
"epoch": 0.7933241883859168,
"grad_norm": 0.5942174681280669,
"learning_rate": 1.2460684431518055e-06,
"loss": 1.1649,
"step": 1735
},
{
"epoch": 0.7956104252400549,
"grad_norm": 0.5884403788886147,
"learning_rate": 1.2198269697116416e-06,
"loss": 1.1627,
"step": 1740
},
{
"epoch": 0.797896662094193,
"grad_norm": 0.5917506875732326,
"learning_rate": 1.1938263198134087e-06,
"loss": 1.1729,
"step": 1745
},
{
"epoch": 0.8001828989483311,
"grad_norm": 0.5689945244963683,
"learning_rate": 1.168068149880912e-06,
"loss": 1.1639,
"step": 1750
},
{
"epoch": 0.8024691358024691,
"grad_norm": 0.5945700377730089,
"learning_rate": 1.1425541008902852e-06,
"loss": 1.1616,
"step": 1755
},
{
"epoch": 0.8047553726566072,
"grad_norm": 0.5960318855848052,
"learning_rate": 1.1172857982654445e-06,
"loss": 1.1796,
"step": 1760
},
{
"epoch": 0.8070416095107453,
"grad_norm": 0.606906781862042,
"learning_rate": 1.092264851774536e-06,
"loss": 1.1524,
"step": 1765
},
{
"epoch": 0.8093278463648834,
"grad_norm": 0.6686014083887466,
"learning_rate": 1.067492855427385e-06,
"loss": 1.1681,
"step": 1770
},
{
"epoch": 0.8116140832190215,
"grad_norm": 0.6637295349703526,
"learning_rate": 1.0429713873739505e-06,
"loss": 1.1603,
"step": 1775
},
{
"epoch": 0.8139003200731596,
"grad_norm": 0.5937746781646984,
"learning_rate": 1.0187020098037759e-06,
"loss": 1.1577,
"step": 1780
},
{
"epoch": 0.8161865569272977,
"grad_norm": 0.6154438358761861,
"learning_rate": 9.946862688464753e-07,
"loss": 1.1596,
"step": 1785
},
{
"epoch": 0.8184727937814358,
"grad_norm": 0.6511739287376433,
"learning_rate": 9.709256944732343e-07,
"loss": 1.1707,
"step": 1790
},
{
"epoch": 0.8207590306355739,
"grad_norm": 0.6174881374069865,
"learning_rate": 9.474218003993275e-07,
"loss": 1.1775,
"step": 1795
},
{
"epoch": 0.823045267489712,
"grad_norm": 0.5791204684491382,
"learning_rate": 9.241760839877023e-07,
"loss": 1.1571,
"step": 1800
},
{
"epoch": 0.82533150434385,
"grad_norm": 0.6464260391976697,
"learning_rate": 9.011900261535767e-07,
"loss": 1.1713,
"step": 1805
},
{
"epoch": 0.8276177411979881,
"grad_norm": 0.6102288143326278,
"learning_rate": 8.784650912700909e-07,
"loss": 1.1654,
"step": 1810
},
{
"epoch": 0.8299039780521262,
"grad_norm": 0.6226743471510658,
"learning_rate": 8.560027270750276e-07,
"loss": 1.1655,
"step": 1815
},
{
"epoch": 0.8321902149062643,
"grad_norm": 0.6079710775307922,
"learning_rate": 8.338043645785698e-07,
"loss": 1.1669,
"step": 1820
},
{
"epoch": 0.8344764517604024,
"grad_norm": 0.6077180347148399,
"learning_rate": 8.118714179721404e-07,
"loss": 1.1529,
"step": 1825
},
{
"epoch": 0.8367626886145405,
"grad_norm": 0.6420590181680129,
"learning_rate": 7.902052845383112e-07,
"loss": 1.1662,
"step": 1830
},
{
"epoch": 0.8390489254686786,
"grad_norm": 0.5675937752707487,
"learning_rate": 7.6880734456178e-07,
"loss": 1.1638,
"step": 1835
},
{
"epoch": 0.8413351623228167,
"grad_norm": 0.5963600943686237,
"learning_rate": 7.476789612414414e-07,
"loss": 1.1648,
"step": 1840
},
{
"epoch": 0.8436213991769548,
"grad_norm": 0.6248451529177521,
"learning_rate": 7.268214806035423e-07,
"loss": 1.1704,
"step": 1845
},
{
"epoch": 0.8459076360310929,
"grad_norm": 0.6582130785897107,
"learning_rate": 7.062362314159211e-07,
"loss": 1.1716,
"step": 1850
},
{
"epoch": 0.848193872885231,
"grad_norm": 0.6104979563533071,
"learning_rate": 6.859245251033697e-07,
"loss": 1.1551,
"step": 1855
},
{
"epoch": 0.850480109739369,
"grad_norm": 0.6291505363028616,
"learning_rate": 6.658876556640781e-07,
"loss": 1.1606,
"step": 1860
},
{
"epoch": 0.8527663465935071,
"grad_norm": 0.626351910055198,
"learning_rate": 6.461268995871967e-07,
"loss": 1.1648,
"step": 1865
},
{
"epoch": 0.8550525834476452,
"grad_norm": 0.5991977091276379,
"learning_rate": 6.266435157715222e-07,
"loss": 1.1403,
"step": 1870
},
{
"epoch": 0.8573388203017832,
"grad_norm": 0.6133109082285381,
"learning_rate": 6.074387454452891e-07,
"loss": 1.1578,
"step": 1875
},
{
"epoch": 0.8596250571559213,
"grad_norm": 0.6062420232877472,
"learning_rate": 5.885138120870965e-07,
"loss": 1.1422,
"step": 1880
},
{
"epoch": 0.8619112940100594,
"grad_norm": 0.5920619164293491,
"learning_rate": 5.698699213479697e-07,
"loss": 1.1503,
"step": 1885
},
{
"epoch": 0.8641975308641975,
"grad_norm": 0.6179934405963249,
"learning_rate": 5.515082609745465e-07,
"loss": 1.1728,
"step": 1890
},
{
"epoch": 0.8664837677183356,
"grad_norm": 0.6191884681224713,
"learning_rate": 5.334300007334065e-07,
"loss": 1.1514,
"step": 1895
},
{
"epoch": 0.8687700045724737,
"grad_norm": 0.6148818189812965,
"learning_rate": 5.156362923365587e-07,
"loss": 1.1772,
"step": 1900
},
{
"epoch": 0.8710562414266118,
"grad_norm": 0.5927964681781609,
"learning_rate": 4.981282693680584e-07,
"loss": 1.1747,
"step": 1905
},
{
"epoch": 0.8733424782807498,
"grad_norm": 0.630038523819453,
"learning_rate": 4.80907047211796e-07,
"loss": 1.1638,
"step": 1910
},
{
"epoch": 0.8756287151348879,
"grad_norm": 0.5822419290829026,
"learning_rate": 4.639737229804403e-07,
"loss": 1.1667,
"step": 1915
},
{
"epoch": 0.877914951989026,
"grad_norm": 0.6169634205827448,
"learning_rate": 4.473293754455399e-07,
"loss": 1.1695,
"step": 1920
},
{
"epoch": 0.8802011888431641,
"grad_norm": 0.5892947845386679,
"learning_rate": 4.3097506496880325e-07,
"loss": 1.1684,
"step": 1925
},
{
"epoch": 0.8824874256973022,
"grad_norm": 0.6796811793089527,
"learning_rate": 4.149118334345403e-07,
"loss": 1.1604,
"step": 1930
},
{
"epoch": 0.8847736625514403,
"grad_norm": 0.5951100132603444,
"learning_rate": 3.9914070418329123e-07,
"loss": 1.1632,
"step": 1935
},
{
"epoch": 0.8870598994055784,
"grad_norm": 0.6710610553022762,
"learning_rate": 3.836626819466338e-07,
"loss": 1.1455,
"step": 1940
},
{
"epoch": 0.8893461362597165,
"grad_norm": 0.6128779790737046,
"learning_rate": 3.684787527831707e-07,
"loss": 1.1609,
"step": 1945
},
{
"epoch": 0.8916323731138546,
"grad_norm": 0.5800567298586133,
"learning_rate": 3.53589884015712e-07,
"loss": 1.1636,
"step": 1950
},
{
"epoch": 0.8939186099679927,
"grad_norm": 0.5600191099569565,
"learning_rate": 3.3899702416965166e-07,
"loss": 1.1721,
"step": 1955
},
{
"epoch": 0.8962048468221308,
"grad_norm": 0.5964683215562515,
"learning_rate": 3.247011029125391e-07,
"loss": 1.1508,
"step": 1960
},
{
"epoch": 0.8984910836762688,
"grad_norm": 0.6125213377358303,
"learning_rate": 3.1070303099485055e-07,
"loss": 1.1716,
"step": 1965
},
{
"epoch": 0.9007773205304069,
"grad_norm": 0.5812964318078312,
"learning_rate": 2.9700370019197287e-07,
"loss": 1.1495,
"step": 1970
},
{
"epoch": 0.903063557384545,
"grad_norm": 0.5947330421470328,
"learning_rate": 2.8360398324738415e-07,
"loss": 1.1446,
"step": 1975
},
{
"epoch": 0.9053497942386831,
"grad_norm": 0.5936630268160432,
"learning_rate": 2.7050473381706186e-07,
"loss": 1.1519,
"step": 1980
},
{
"epoch": 0.9076360310928212,
"grad_norm": 0.6228979256825669,
"learning_rate": 2.577067864150906e-07,
"loss": 1.1688,
"step": 1985
},
{
"epoch": 0.9099222679469593,
"grad_norm": 0.6500515468078818,
"learning_rate": 2.452109563605065e-07,
"loss": 1.1718,
"step": 1990
},
{
"epoch": 0.9122085048010974,
"grad_norm": 0.568112374463465,
"learning_rate": 2.330180397253473e-07,
"loss": 1.169,
"step": 1995
},
{
"epoch": 0.9144947416552355,
"grad_norm": 0.6014335143268985,
"learning_rate": 2.2112881328394287e-07,
"loss": 1.1556,
"step": 2000
},
{
"epoch": 0.9167809785093736,
"grad_norm": 0.5814781144236604,
"learning_rate": 2.0954403446342753e-07,
"loss": 1.1688,
"step": 2005
},
{
"epoch": 0.9190672153635117,
"grad_norm": 0.6269697024329176,
"learning_rate": 1.9826444129548317e-07,
"loss": 1.1791,
"step": 2010
},
{
"epoch": 0.9213534522176497,
"grad_norm": 0.5793724546294099,
"learning_rate": 1.8729075236932903e-07,
"loss": 1.1736,
"step": 2015
},
{
"epoch": 0.9236396890717878,
"grad_norm": 0.5757028817840649,
"learning_rate": 1.7662366678593502e-07,
"loss": 1.1674,
"step": 2020
},
{
"epoch": 0.9259259259259259,
"grad_norm": 0.6383512892284545,
"learning_rate": 1.6626386411348783e-07,
"loss": 1.1725,
"step": 2025
},
{
"epoch": 0.928212162780064,
"grad_norm": 0.6064267969457637,
"learning_rate": 1.56212004344099e-07,
"loss": 1.1596,
"step": 2030
},
{
"epoch": 0.9304983996342021,
"grad_norm": 0.6046327277263103,
"learning_rate": 1.4646872785175182e-07,
"loss": 1.1616,
"step": 2035
},
{
"epoch": 0.9327846364883402,
"grad_norm": 0.611959733363112,
"learning_rate": 1.3703465535151505e-07,
"loss": 1.1614,
"step": 2040
},
{
"epoch": 0.9350708733424783,
"grad_norm": 0.6153837948383357,
"learning_rate": 1.2791038785999243e-07,
"loss": 1.1494,
"step": 2045
},
{
"epoch": 0.9373571101966164,
"grad_norm": 0.5507733416769363,
"learning_rate": 1.1909650665703265e-07,
"loss": 1.1331,
"step": 2050
},
{
"epoch": 0.9396433470507545,
"grad_norm": 0.5787602661155832,
"learning_rate": 1.1059357324870456e-07,
"loss": 1.1548,
"step": 2055
},
{
"epoch": 0.9419295839048926,
"grad_norm": 0.5848374134615248,
"learning_rate": 1.024021293315175e-07,
"loss": 1.1628,
"step": 2060
},
{
"epoch": 0.9442158207590307,
"grad_norm": 0.585861722501522,
"learning_rate": 9.452269675791603e-08,
"loss": 1.1424,
"step": 2065
},
{
"epoch": 0.9465020576131687,
"grad_norm": 0.5870866242087308,
"learning_rate": 8.69557775030344e-08,
"loss": 1.181,
"step": 2070
},
{
"epoch": 0.9487882944673068,
"grad_norm": 0.5917858310575264,
"learning_rate": 7.970185363271432e-08,
"loss": 1.1564,
"step": 2075
},
{
"epoch": 0.9510745313214449,
"grad_norm": 0.6272259568011471,
"learning_rate": 7.276138727279669e-08,
"loss": 1.1659,
"step": 2080
},
{
"epoch": 0.953360768175583,
"grad_norm": 0.607366888512829,
"learning_rate": 6.613482057968023e-08,
"loss": 1.1612,
"step": 2085
},
{
"epoch": 0.9556470050297211,
"grad_norm": 0.61579614820576,
"learning_rate": 5.982257571215178e-08,
"loss": 1.1644,
"step": 2090
},
{
"epoch": 0.9579332418838592,
"grad_norm": 0.6162342496797737,
"learning_rate": 5.382505480449274e-08,
"loss": 1.1439,
"step": 2095
},
{
"epoch": 0.9602194787379973,
"grad_norm": 0.5880335959078453,
"learning_rate": 4.814263994086077e-08,
"loss": 1.1405,
"step": 2100
},
{
"epoch": 0.9625057155921354,
"grad_norm": 0.5978901392727579,
"learning_rate": 4.2775693130948094e-08,
"loss": 1.1792,
"step": 2105
},
{
"epoch": 0.9647919524462735,
"grad_norm": 0.5725207858399001,
"learning_rate": 3.772455628691829e-08,
"loss": 1.1679,
"step": 2110
},
{
"epoch": 0.9670781893004116,
"grad_norm": 0.6126681514493614,
"learning_rate": 3.2989551201624836e-08,
"loss": 1.1621,
"step": 2115
},
{
"epoch": 0.9693644261545497,
"grad_norm": 0.6026354249744876,
"learning_rate": 2.857097952810972e-08,
"loss": 1.1728,
"step": 2120
},
{
"epoch": 0.9716506630086877,
"grad_norm": 0.5876159431495082,
"learning_rate": 2.4469122760388264e-08,
"loss": 1.1552,
"step": 2125
},
{
"epoch": 0.9739368998628258,
"grad_norm": 0.5795939734314318,
"learning_rate": 2.0684242215511797e-08,
"loss": 1.1586,
"step": 2130
},
{
"epoch": 0.9762231367169639,
"grad_norm": 0.6100064497073957,
"learning_rate": 1.7216579016925415e-08,
"loss": 1.1585,
"step": 2135
},
{
"epoch": 0.978509373571102,
"grad_norm": 0.6410024148442394,
"learning_rate": 1.4066354079101396e-08,
"loss": 1.1576,
"step": 2140
},
{
"epoch": 0.9807956104252401,
"grad_norm": 0.5946394925998356,
"learning_rate": 1.1233768093468766e-08,
"loss": 1.1565,
"step": 2145
},
{
"epoch": 0.9830818472793782,
"grad_norm": 0.5993080705042445,
"learning_rate": 8.719001515627434e-09,
"loss": 1.1649,
"step": 2150
},
{
"epoch": 0.9853680841335163,
"grad_norm": 0.5857680491868433,
"learning_rate": 6.5222145538501595e-09,
"loss": 1.176,
"step": 2155
},
{
"epoch": 0.9876543209876543,
"grad_norm": 0.6157142971328977,
"learning_rate": 4.643547158878492e-09,
"loss": 1.146,
"step": 2160
},
{
"epoch": 0.9899405578417924,
"grad_norm": 0.6005659801135901,
"learning_rate": 3.0831190150054646e-09,
"loss": 1.1607,
"step": 2165
},
{
"epoch": 0.9922267946959304,
"grad_norm": 0.5963682235084494,
"learning_rate": 1.8410295324505778e-09,
"loss": 1.1668,
"step": 2170
},
{
"epoch": 0.9945130315500685,
"grad_norm": 0.649218390898171,
"learning_rate": 9.173578410281992e-10,
"loss": 1.1602,
"step": 2175
},
{
"epoch": 0.9967992684042066,
"grad_norm": 0.612662110275474,
"learning_rate": 3.1216278510493027e-10,
"loss": 1.1596,
"step": 2180
},
{
"epoch": 0.9990855052583447,
"grad_norm": 0.6025732837303296,
"learning_rate": 2.548291985149387e-11,
"loss": 1.147,
"step": 2185
},
{
"epoch": 1.0,
"eval_runtime": 4.0833,
"eval_samples_per_second": 2.449,
"eval_steps_per_second": 0.735,
"step": 2187
},
{
"epoch": 1.0,
"step": 2187,
"total_flos": 9703359095242752.0,
"train_loss": 1.3940648635773556,
"train_runtime": 19118.6692,
"train_samples_per_second": 1.83,
"train_steps_per_second": 0.114
}
],
"logging_steps": 5,
"max_steps": 2187,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9703359095242752.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}