diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,193816 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9998916321996858, + "eval_steps": 500, + "global_step": 27682, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.224520020951108e-05, + "grad_norm": 439.81265542842215, + "learning_rate": 6.01684717208183e-09, + "loss": 1.5944, + "step": 1 + }, + { + "epoch": 0.00014449040041902215, + "grad_norm": 422.5603507192789, + "learning_rate": 1.203369434416366e-08, + "loss": 1.5676, + "step": 2 + }, + { + "epoch": 0.00021673560062853323, + "grad_norm": 389.93893752740314, + "learning_rate": 1.8050541516245488e-08, + "loss": 1.5645, + "step": 3 + }, + { + "epoch": 0.0002889808008380443, + "grad_norm": 276.58958200734895, + "learning_rate": 2.406738868832732e-08, + "loss": 1.5481, + "step": 4 + }, + { + "epoch": 0.0003612260010475554, + "grad_norm": 656.1307748840927, + "learning_rate": 3.008423586040915e-08, + "loss": 1.8228, + "step": 5 + }, + { + "epoch": 0.00043347120125706646, + "grad_norm": 375.9519999281823, + "learning_rate": 3.6101083032490976e-08, + "loss": 1.625, + "step": 6 + }, + { + "epoch": 0.0005057164014665776, + "grad_norm": 558.7571587908293, + "learning_rate": 4.21179302045728e-08, + "loss": 1.7485, + "step": 7 + }, + { + "epoch": 0.0005779616016760886, + "grad_norm": 391.6773424963461, + "learning_rate": 4.813477737665464e-08, + "loss": 1.6608, + "step": 8 + }, + { + "epoch": 0.0006502068018855998, + "grad_norm": 403.6424005230372, + "learning_rate": 5.4151624548736464e-08, + "loss": 1.6779, + "step": 9 + }, + { + "epoch": 0.0007224520020951108, + "grad_norm": 438.26738057377713, + "learning_rate": 6.01684717208183e-08, + "loss": 1.7037, + "step": 10 + }, + { + "epoch": 0.0007946972023046219, + "grad_norm": 227.02279965347094, + "learning_rate": 6.618531889290012e-08, + "loss": 1.4411, + "step": 11 + }, + { + "epoch": 0.0008669424025141329, + "grad_norm": 277.32529692132306, + "learning_rate": 7.220216606498195e-08, + "loss": 1.54, + "step": 12 + }, + { + "epoch": 0.0009391876027236441, + "grad_norm": 413.53476274673693, + "learning_rate": 7.82190132370638e-08, + "loss": 1.6152, + "step": 13 + }, + { + "epoch": 0.0010114328029331551, + "grad_norm": 248.7297180801884, + "learning_rate": 8.42358604091456e-08, + "loss": 1.4512, + "step": 14 + }, + { + "epoch": 0.0010836780031426663, + "grad_norm": 55.94775787579092, + "learning_rate": 9.025270758122745e-08, + "loss": 1.2864, + "step": 15 + }, + { + "epoch": 0.0011559232033521772, + "grad_norm": 38.385114200527006, + "learning_rate": 9.626955475330928e-08, + "loss": 1.3397, + "step": 16 + }, + { + "epoch": 0.0012281684035616884, + "grad_norm": 52.4331885146469, + "learning_rate": 1.0228640192539111e-07, + "loss": 1.418, + "step": 17 + }, + { + "epoch": 0.0013004136037711995, + "grad_norm": 53.048610059990025, + "learning_rate": 1.0830324909747293e-07, + "loss": 1.3503, + "step": 18 + }, + { + "epoch": 0.0013726588039807105, + "grad_norm": 39.173630981283985, + "learning_rate": 1.1432009626955476e-07, + "loss": 1.3592, + "step": 19 + }, + { + "epoch": 0.0014449040041902216, + "grad_norm": 49.42661710572174, + "learning_rate": 1.203369434416366e-07, + "loss": 1.3006, + "step": 20 + }, + { + "epoch": 0.0015171492043997328, + "grad_norm": 28.176378851713938, + "learning_rate": 1.263537906137184e-07, + "loss": 1.2994, + "step": 21 + }, + { + "epoch": 0.0015893944046092437, + "grad_norm": 53.309088218790144, + "learning_rate": 1.3237063778580024e-07, + "loss": 1.3753, + "step": 22 + }, + { + "epoch": 0.0016616396048187549, + "grad_norm": 95.72017322644167, + "learning_rate": 1.3838748495788207e-07, + "loss": 1.3621, + "step": 23 + }, + { + "epoch": 0.0017338848050282658, + "grad_norm": 150.485243774099, + "learning_rate": 1.444043321299639e-07, + "loss": 1.2712, + "step": 24 + }, + { + "epoch": 0.001806130005237777, + "grad_norm": 99.59996646742658, + "learning_rate": 1.5042117930204574e-07, + "loss": 1.4777, + "step": 25 + }, + { + "epoch": 0.0018783752054472881, + "grad_norm": 52.70700215504459, + "learning_rate": 1.564380264741276e-07, + "loss": 1.4399, + "step": 26 + }, + { + "epoch": 0.001950620405656799, + "grad_norm": 30.04132387456849, + "learning_rate": 1.6245487364620937e-07, + "loss": 1.2957, + "step": 27 + }, + { + "epoch": 0.0020228656058663102, + "grad_norm": 41.00552405383557, + "learning_rate": 1.684717208182912e-07, + "loss": 1.2942, + "step": 28 + }, + { + "epoch": 0.0020951108060758214, + "grad_norm": 41.8872489664404, + "learning_rate": 1.7448856799037306e-07, + "loss": 1.4321, + "step": 29 + }, + { + "epoch": 0.0021673560062853326, + "grad_norm": 45.29876138707465, + "learning_rate": 1.805054151624549e-07, + "loss": 1.4258, + "step": 30 + }, + { + "epoch": 0.0022396012064948437, + "grad_norm": 26.342687691733712, + "learning_rate": 1.8652226233453673e-07, + "loss": 1.3671, + "step": 31 + }, + { + "epoch": 0.0023118464067043544, + "grad_norm": 27.243494779717597, + "learning_rate": 1.9253910950661856e-07, + "loss": 1.2738, + "step": 32 + }, + { + "epoch": 0.0023840916069138656, + "grad_norm": 24.128446816999666, + "learning_rate": 1.985559566787004e-07, + "loss": 1.349, + "step": 33 + }, + { + "epoch": 0.0024563368071233768, + "grad_norm": 21.115521172232523, + "learning_rate": 2.0457280385078222e-07, + "loss": 1.2512, + "step": 34 + }, + { + "epoch": 0.002528582007332888, + "grad_norm": 23.99035705122607, + "learning_rate": 2.1058965102286403e-07, + "loss": 1.4412, + "step": 35 + }, + { + "epoch": 0.002600827207542399, + "grad_norm": 19.671560853081395, + "learning_rate": 2.1660649819494586e-07, + "loss": 1.2348, + "step": 36 + }, + { + "epoch": 0.00267307240775191, + "grad_norm": 21.67147930199318, + "learning_rate": 2.226233453670277e-07, + "loss": 1.3018, + "step": 37 + }, + { + "epoch": 0.002745317607961421, + "grad_norm": 18.773669955385653, + "learning_rate": 2.2864019253910952e-07, + "loss": 1.2628, + "step": 38 + }, + { + "epoch": 0.002817562808170932, + "grad_norm": 21.818626439737148, + "learning_rate": 2.3465703971119135e-07, + "loss": 1.3498, + "step": 39 + }, + { + "epoch": 0.0028898080083804433, + "grad_norm": 20.088418446061464, + "learning_rate": 2.406738868832732e-07, + "loss": 1.2909, + "step": 40 + }, + { + "epoch": 0.0029620532085899544, + "grad_norm": 21.95666536176643, + "learning_rate": 2.4669073405535504e-07, + "loss": 1.349, + "step": 41 + }, + { + "epoch": 0.0030342984087994656, + "grad_norm": 41.95591383887005, + "learning_rate": 2.527075812274368e-07, + "loss": 1.1927, + "step": 42 + }, + { + "epoch": 0.0031065436090089763, + "grad_norm": 16.640534220703092, + "learning_rate": 2.5872442839951865e-07, + "loss": 1.3773, + "step": 43 + }, + { + "epoch": 0.0031787888092184875, + "grad_norm": 17.762022433213193, + "learning_rate": 2.647412755716005e-07, + "loss": 1.2191, + "step": 44 + }, + { + "epoch": 0.0032510340094279986, + "grad_norm": 18.721809048091565, + "learning_rate": 2.707581227436823e-07, + "loss": 1.3404, + "step": 45 + }, + { + "epoch": 0.0033232792096375098, + "grad_norm": 16.502429205354417, + "learning_rate": 2.7677496991576415e-07, + "loss": 1.2293, + "step": 46 + }, + { + "epoch": 0.003395524409847021, + "grad_norm": 18.47724876500363, + "learning_rate": 2.82791817087846e-07, + "loss": 1.353, + "step": 47 + }, + { + "epoch": 0.0034677696100565317, + "grad_norm": 16.392851671917853, + "learning_rate": 2.888086642599278e-07, + "loss": 1.3076, + "step": 48 + }, + { + "epoch": 0.003540014810266043, + "grad_norm": 21.164947840470663, + "learning_rate": 2.9482551143200964e-07, + "loss": 1.2769, + "step": 49 + }, + { + "epoch": 0.003612260010475554, + "grad_norm": 15.670350689983627, + "learning_rate": 3.008423586040915e-07, + "loss": 1.2156, + "step": 50 + }, + { + "epoch": 0.003684505210685065, + "grad_norm": 14.258624227749467, + "learning_rate": 3.068592057761733e-07, + "loss": 1.2634, + "step": 51 + }, + { + "epoch": 0.0037567504108945763, + "grad_norm": 14.204949088725424, + "learning_rate": 3.128760529482552e-07, + "loss": 1.1854, + "step": 52 + }, + { + "epoch": 0.0038289956111040874, + "grad_norm": 18.905718936239737, + "learning_rate": 3.1889290012033697e-07, + "loss": 1.3068, + "step": 53 + }, + { + "epoch": 0.003901240811313598, + "grad_norm": 14.362272516080868, + "learning_rate": 3.2490974729241875e-07, + "loss": 1.2616, + "step": 54 + }, + { + "epoch": 0.00397348601152311, + "grad_norm": 13.230859138579092, + "learning_rate": 3.3092659446450063e-07, + "loss": 1.2445, + "step": 55 + }, + { + "epoch": 0.0040457312117326205, + "grad_norm": 12.393833202057776, + "learning_rate": 3.369434416365824e-07, + "loss": 1.1561, + "step": 56 + }, + { + "epoch": 0.004117976411942131, + "grad_norm": 15.986154287578048, + "learning_rate": 3.429602888086643e-07, + "loss": 1.3318, + "step": 57 + }, + { + "epoch": 0.004190221612151643, + "grad_norm": 15.518121770094613, + "learning_rate": 3.489771359807461e-07, + "loss": 1.1257, + "step": 58 + }, + { + "epoch": 0.0042624668123611535, + "grad_norm": 13.727282814054325, + "learning_rate": 3.5499398315282796e-07, + "loss": 1.2271, + "step": 59 + }, + { + "epoch": 0.004334712012570665, + "grad_norm": 14.4713762507517, + "learning_rate": 3.610108303249098e-07, + "loss": 1.2817, + "step": 60 + }, + { + "epoch": 0.004406957212780176, + "grad_norm": 12.615817214677003, + "learning_rate": 3.6702767749699157e-07, + "loss": 1.1911, + "step": 61 + }, + { + "epoch": 0.004479202412989687, + "grad_norm": 14.598803411072986, + "learning_rate": 3.7304452466907345e-07, + "loss": 1.2859, + "step": 62 + }, + { + "epoch": 0.004551447613199198, + "grad_norm": 14.833731199493542, + "learning_rate": 3.7906137184115523e-07, + "loss": 1.3429, + "step": 63 + }, + { + "epoch": 0.004623692813408709, + "grad_norm": 12.157282376029102, + "learning_rate": 3.850782190132371e-07, + "loss": 1.1658, + "step": 64 + }, + { + "epoch": 0.0046959380136182205, + "grad_norm": 14.035400864136093, + "learning_rate": 3.910950661853189e-07, + "loss": 1.2471, + "step": 65 + }, + { + "epoch": 0.004768183213827731, + "grad_norm": 12.133264615489608, + "learning_rate": 3.971119133574008e-07, + "loss": 1.2198, + "step": 66 + }, + { + "epoch": 0.004840428414037243, + "grad_norm": 11.55833698282732, + "learning_rate": 4.0312876052948256e-07, + "loss": 1.1482, + "step": 67 + }, + { + "epoch": 0.0049126736142467535, + "grad_norm": 15.894465969698826, + "learning_rate": 4.0914560770156444e-07, + "loss": 1.205, + "step": 68 + }, + { + "epoch": 0.004984918814456264, + "grad_norm": 14.196320736759137, + "learning_rate": 4.151624548736462e-07, + "loss": 1.3537, + "step": 69 + }, + { + "epoch": 0.005057164014665776, + "grad_norm": 14.127240838742019, + "learning_rate": 4.2117930204572805e-07, + "loss": 1.2591, + "step": 70 + }, + { + "epoch": 0.0051294092148752865, + "grad_norm": 11.741090034167598, + "learning_rate": 4.2719614921780994e-07, + "loss": 1.1881, + "step": 71 + }, + { + "epoch": 0.005201654415084798, + "grad_norm": 12.687507817308244, + "learning_rate": 4.332129963898917e-07, + "loss": 1.17, + "step": 72 + }, + { + "epoch": 0.005273899615294309, + "grad_norm": 12.379379576387867, + "learning_rate": 4.392298435619736e-07, + "loss": 1.155, + "step": 73 + }, + { + "epoch": 0.00534614481550382, + "grad_norm": 11.901085401770478, + "learning_rate": 4.452466907340554e-07, + "loss": 1.2085, + "step": 74 + }, + { + "epoch": 0.005418390015713331, + "grad_norm": 10.750920810740288, + "learning_rate": 4.5126353790613726e-07, + "loss": 1.2441, + "step": 75 + }, + { + "epoch": 0.005490635215922842, + "grad_norm": 13.472293254931582, + "learning_rate": 4.5728038507821904e-07, + "loss": 1.0728, + "step": 76 + }, + { + "epoch": 0.0055628804161323535, + "grad_norm": 11.629979533524352, + "learning_rate": 4.6329723225030087e-07, + "loss": 1.2, + "step": 77 + }, + { + "epoch": 0.005635125616341864, + "grad_norm": 13.67171205260039, + "learning_rate": 4.693140794223827e-07, + "loss": 1.2063, + "step": 78 + }, + { + "epoch": 0.005707370816551376, + "grad_norm": 11.579249771403788, + "learning_rate": 4.7533092659446454e-07, + "loss": 1.1468, + "step": 79 + }, + { + "epoch": 0.0057796160167608865, + "grad_norm": 13.504701149044358, + "learning_rate": 4.813477737665464e-07, + "loss": 1.1438, + "step": 80 + }, + { + "epoch": 0.005851861216970397, + "grad_norm": 13.233993362634939, + "learning_rate": 4.873646209386282e-07, + "loss": 1.1567, + "step": 81 + }, + { + "epoch": 0.005924106417179909, + "grad_norm": 11.612598654468586, + "learning_rate": 4.933814681107101e-07, + "loss": 1.3001, + "step": 82 + }, + { + "epoch": 0.0059963516173894196, + "grad_norm": 12.684947090564851, + "learning_rate": 4.993983152827919e-07, + "loss": 1.1483, + "step": 83 + }, + { + "epoch": 0.006068596817598931, + "grad_norm": 12.587611062886706, + "learning_rate": 5.054151624548736e-07, + "loss": 1.0058, + "step": 84 + }, + { + "epoch": 0.006140842017808442, + "grad_norm": 11.971761538755285, + "learning_rate": 5.114320096269555e-07, + "loss": 1.2153, + "step": 85 + }, + { + "epoch": 0.006213087218017953, + "grad_norm": 11.024803979908675, + "learning_rate": 5.174488567990373e-07, + "loss": 1.238, + "step": 86 + }, + { + "epoch": 0.006285332418227464, + "grad_norm": 11.349717433285715, + "learning_rate": 5.234657039711192e-07, + "loss": 1.0858, + "step": 87 + }, + { + "epoch": 0.006357577618436975, + "grad_norm": 12.009360794984119, + "learning_rate": 5.29482551143201e-07, + "loss": 1.1351, + "step": 88 + }, + { + "epoch": 0.0064298228186464865, + "grad_norm": 15.091476237575199, + "learning_rate": 5.354993983152829e-07, + "loss": 1.1764, + "step": 89 + }, + { + "epoch": 0.006502068018855997, + "grad_norm": 15.227783529961002, + "learning_rate": 5.415162454873646e-07, + "loss": 1.2352, + "step": 90 + }, + { + "epoch": 0.006574313219065508, + "grad_norm": 14.614404736617365, + "learning_rate": 5.475330926594465e-07, + "loss": 1.1896, + "step": 91 + }, + { + "epoch": 0.0066465584192750195, + "grad_norm": 13.523810969352006, + "learning_rate": 5.535499398315283e-07, + "loss": 1.1908, + "step": 92 + }, + { + "epoch": 0.00671880361948453, + "grad_norm": 12.550243262862725, + "learning_rate": 5.595667870036102e-07, + "loss": 1.1479, + "step": 93 + }, + { + "epoch": 0.006791048819694042, + "grad_norm": 12.483295723451425, + "learning_rate": 5.65583634175692e-07, + "loss": 1.1595, + "step": 94 + }, + { + "epoch": 0.006863294019903553, + "grad_norm": 15.051566433418463, + "learning_rate": 5.716004813477738e-07, + "loss": 1.3276, + "step": 95 + }, + { + "epoch": 0.006935539220113063, + "grad_norm": 14.577747663089484, + "learning_rate": 5.776173285198556e-07, + "loss": 1.1423, + "step": 96 + }, + { + "epoch": 0.007007784420322575, + "grad_norm": 13.349669811723107, + "learning_rate": 5.836341756919375e-07, + "loss": 1.1783, + "step": 97 + }, + { + "epoch": 0.007080029620532086, + "grad_norm": 14.144732979782118, + "learning_rate": 5.896510228640193e-07, + "loss": 1.2443, + "step": 98 + }, + { + "epoch": 0.007152274820741597, + "grad_norm": 11.519474954296149, + "learning_rate": 5.956678700361012e-07, + "loss": 1.1997, + "step": 99 + }, + { + "epoch": 0.007224520020951108, + "grad_norm": 12.861968621342779, + "learning_rate": 6.01684717208183e-07, + "loss": 1.1582, + "step": 100 + }, + { + "epoch": 0.0072967652211606195, + "grad_norm": 12.144631764218978, + "learning_rate": 6.077015643802648e-07, + "loss": 1.1442, + "step": 101 + }, + { + "epoch": 0.00736901042137013, + "grad_norm": 11.99403042130002, + "learning_rate": 6.137184115523466e-07, + "loss": 1.208, + "step": 102 + }, + { + "epoch": 0.007441255621579641, + "grad_norm": 11.332706508879784, + "learning_rate": 6.197352587244285e-07, + "loss": 1.1922, + "step": 103 + }, + { + "epoch": 0.0075135008217891526, + "grad_norm": 9.198543632339561, + "learning_rate": 6.257521058965104e-07, + "loss": 1.1416, + "step": 104 + }, + { + "epoch": 0.007585746021998663, + "grad_norm": 9.811663355057018, + "learning_rate": 6.31768953068592e-07, + "loss": 1.1644, + "step": 105 + }, + { + "epoch": 0.007657991222208175, + "grad_norm": 10.664680613954468, + "learning_rate": 6.377858002406739e-07, + "loss": 1.1329, + "step": 106 + }, + { + "epoch": 0.007730236422417686, + "grad_norm": 11.87829736806828, + "learning_rate": 6.438026474127558e-07, + "loss": 1.2457, + "step": 107 + }, + { + "epoch": 0.007802481622627196, + "grad_norm": 13.50912336847104, + "learning_rate": 6.498194945848375e-07, + "loss": 1.1398, + "step": 108 + }, + { + "epoch": 0.007874726822836707, + "grad_norm": 9.760445892279373, + "learning_rate": 6.558363417569194e-07, + "loss": 1.093, + "step": 109 + }, + { + "epoch": 0.00794697202304622, + "grad_norm": 14.395685588043547, + "learning_rate": 6.618531889290013e-07, + "loss": 1.1612, + "step": 110 + }, + { + "epoch": 0.00801921722325573, + "grad_norm": 11.775174706979206, + "learning_rate": 6.678700361010831e-07, + "loss": 1.1608, + "step": 111 + }, + { + "epoch": 0.008091462423465241, + "grad_norm": 10.692869388031994, + "learning_rate": 6.738868832731648e-07, + "loss": 1.217, + "step": 112 + }, + { + "epoch": 0.008163707623674752, + "grad_norm": 11.047801797667574, + "learning_rate": 6.799037304452467e-07, + "loss": 1.2239, + "step": 113 + }, + { + "epoch": 0.008235952823884262, + "grad_norm": 9.404574526989055, + "learning_rate": 6.859205776173286e-07, + "loss": 1.0854, + "step": 114 + }, + { + "epoch": 0.008308198024093775, + "grad_norm": 10.736669037076572, + "learning_rate": 6.919374247894104e-07, + "loss": 1.1454, + "step": 115 + }, + { + "epoch": 0.008380443224303286, + "grad_norm": 12.147185174599736, + "learning_rate": 6.979542719614923e-07, + "loss": 1.1893, + "step": 116 + }, + { + "epoch": 0.008452688424512796, + "grad_norm": 11.513351650816176, + "learning_rate": 7.03971119133574e-07, + "loss": 1.2205, + "step": 117 + }, + { + "epoch": 0.008524933624722307, + "grad_norm": 9.57022241666787, + "learning_rate": 7.099879663056559e-07, + "loss": 1.1085, + "step": 118 + }, + { + "epoch": 0.008597178824931818, + "grad_norm": 12.53828791902745, + "learning_rate": 7.160048134777377e-07, + "loss": 1.1605, + "step": 119 + }, + { + "epoch": 0.00866942402514133, + "grad_norm": 12.027640616261584, + "learning_rate": 7.220216606498196e-07, + "loss": 1.1451, + "step": 120 + }, + { + "epoch": 0.008741669225350841, + "grad_norm": 10.007801446007752, + "learning_rate": 7.280385078219015e-07, + "loss": 1.175, + "step": 121 + }, + { + "epoch": 0.008813914425560352, + "grad_norm": 13.065425855409503, + "learning_rate": 7.340553549939831e-07, + "loss": 1.1639, + "step": 122 + }, + { + "epoch": 0.008886159625769862, + "grad_norm": 10.05231827904405, + "learning_rate": 7.40072202166065e-07, + "loss": 1.0774, + "step": 123 + }, + { + "epoch": 0.008958404825979375, + "grad_norm": 12.37034812344877, + "learning_rate": 7.460890493381469e-07, + "loss": 1.1093, + "step": 124 + }, + { + "epoch": 0.009030650026188886, + "grad_norm": 12.996364965610036, + "learning_rate": 7.521058965102288e-07, + "loss": 1.2248, + "step": 125 + }, + { + "epoch": 0.009102895226398396, + "grad_norm": 13.880306294604281, + "learning_rate": 7.581227436823105e-07, + "loss": 1.153, + "step": 126 + }, + { + "epoch": 0.009175140426607907, + "grad_norm": 11.014619995156226, + "learning_rate": 7.641395908543923e-07, + "loss": 1.1961, + "step": 127 + }, + { + "epoch": 0.009247385626817418, + "grad_norm": 12.162729610459506, + "learning_rate": 7.701564380264742e-07, + "loss": 1.2476, + "step": 128 + }, + { + "epoch": 0.00931963082702693, + "grad_norm": 10.541900230102057, + "learning_rate": 7.761732851985561e-07, + "loss": 1.1275, + "step": 129 + }, + { + "epoch": 0.009391876027236441, + "grad_norm": 14.901209204846483, + "learning_rate": 7.821901323706378e-07, + "loss": 1.2554, + "step": 130 + }, + { + "epoch": 0.009464121227445952, + "grad_norm": 11.032671871890892, + "learning_rate": 7.882069795427197e-07, + "loss": 1.0733, + "step": 131 + }, + { + "epoch": 0.009536366427655462, + "grad_norm": 12.693517902004771, + "learning_rate": 7.942238267148016e-07, + "loss": 1.1384, + "step": 132 + }, + { + "epoch": 0.009608611627864973, + "grad_norm": 19.153922358076393, + "learning_rate": 8.002406738868833e-07, + "loss": 1.1624, + "step": 133 + }, + { + "epoch": 0.009680856828074486, + "grad_norm": 9.881047968092954, + "learning_rate": 8.062575210589651e-07, + "loss": 1.0, + "step": 134 + }, + { + "epoch": 0.009753102028283996, + "grad_norm": 13.282074882472006, + "learning_rate": 8.12274368231047e-07, + "loss": 1.0783, + "step": 135 + }, + { + "epoch": 0.009825347228493507, + "grad_norm": 13.795196050873335, + "learning_rate": 8.182912154031289e-07, + "loss": 1.1097, + "step": 136 + }, + { + "epoch": 0.009897592428703018, + "grad_norm": 13.90635094766731, + "learning_rate": 8.243080625752107e-07, + "loss": 1.166, + "step": 137 + }, + { + "epoch": 0.009969837628912528, + "grad_norm": 13.661942820866475, + "learning_rate": 8.303249097472924e-07, + "loss": 1.1634, + "step": 138 + }, + { + "epoch": 0.010042082829122041, + "grad_norm": 13.504986442550639, + "learning_rate": 8.363417569193743e-07, + "loss": 1.1762, + "step": 139 + }, + { + "epoch": 0.010114328029331552, + "grad_norm": 12.29584007319015, + "learning_rate": 8.423586040914561e-07, + "loss": 1.1161, + "step": 140 + }, + { + "epoch": 0.010186573229541062, + "grad_norm": 16.046031539724336, + "learning_rate": 8.48375451263538e-07, + "loss": 1.203, + "step": 141 + }, + { + "epoch": 0.010258818429750573, + "grad_norm": 10.544058507036306, + "learning_rate": 8.543922984356199e-07, + "loss": 1.0966, + "step": 142 + }, + { + "epoch": 0.010331063629960084, + "grad_norm": 8.90186981104602, + "learning_rate": 8.604091456077017e-07, + "loss": 0.9942, + "step": 143 + }, + { + "epoch": 0.010403308830169596, + "grad_norm": 14.73982689682081, + "learning_rate": 8.664259927797834e-07, + "loss": 1.2175, + "step": 144 + }, + { + "epoch": 0.010475554030379107, + "grad_norm": 9.33751171678168, + "learning_rate": 8.724428399518653e-07, + "loss": 1.0561, + "step": 145 + }, + { + "epoch": 0.010547799230588618, + "grad_norm": 9.760289948956194, + "learning_rate": 8.784596871239472e-07, + "loss": 1.0878, + "step": 146 + }, + { + "epoch": 0.010620044430798128, + "grad_norm": 11.446466985289064, + "learning_rate": 8.844765342960289e-07, + "loss": 1.1571, + "step": 147 + }, + { + "epoch": 0.01069228963100764, + "grad_norm": 10.79624007438017, + "learning_rate": 8.904933814681108e-07, + "loss": 1.1228, + "step": 148 + }, + { + "epoch": 0.010764534831217152, + "grad_norm": 14.034980668780662, + "learning_rate": 8.965102286401926e-07, + "loss": 1.2084, + "step": 149 + }, + { + "epoch": 0.010836780031426662, + "grad_norm": 11.926459354697764, + "learning_rate": 9.025270758122745e-07, + "loss": 1.1137, + "step": 150 + }, + { + "epoch": 0.010909025231636173, + "grad_norm": 10.113783663650176, + "learning_rate": 9.085439229843562e-07, + "loss": 1.1611, + "step": 151 + }, + { + "epoch": 0.010981270431845684, + "grad_norm": 12.167729231189963, + "learning_rate": 9.145607701564381e-07, + "loss": 1.171, + "step": 152 + }, + { + "epoch": 0.011053515632055195, + "grad_norm": 10.827798769661776, + "learning_rate": 9.2057761732852e-07, + "loss": 1.0988, + "step": 153 + }, + { + "epoch": 0.011125760832264707, + "grad_norm": 10.677373003247956, + "learning_rate": 9.265944645006017e-07, + "loss": 1.1548, + "step": 154 + }, + { + "epoch": 0.011198006032474218, + "grad_norm": 9.93242088411182, + "learning_rate": 9.326113116726835e-07, + "loss": 1.0931, + "step": 155 + }, + { + "epoch": 0.011270251232683728, + "grad_norm": 10.392494985631082, + "learning_rate": 9.386281588447654e-07, + "loss": 1.0731, + "step": 156 + }, + { + "epoch": 0.01134249643289324, + "grad_norm": 11.149917301162098, + "learning_rate": 9.446450060168473e-07, + "loss": 1.0914, + "step": 157 + }, + { + "epoch": 0.011414741633102752, + "grad_norm": 13.20041172657037, + "learning_rate": 9.506618531889291e-07, + "loss": 1.1831, + "step": 158 + }, + { + "epoch": 0.011486986833312262, + "grad_norm": 9.90419846720733, + "learning_rate": 9.566787003610109e-07, + "loss": 0.9957, + "step": 159 + }, + { + "epoch": 0.011559232033521773, + "grad_norm": 12.589103201148987, + "learning_rate": 9.626955475330928e-07, + "loss": 1.0357, + "step": 160 + }, + { + "epoch": 0.011631477233731284, + "grad_norm": 10.48329868846641, + "learning_rate": 9.687123947051744e-07, + "loss": 1.1274, + "step": 161 + }, + { + "epoch": 0.011703722433940794, + "grad_norm": 10.02032389043092, + "learning_rate": 9.747292418772564e-07, + "loss": 1.2132, + "step": 162 + }, + { + "epoch": 0.011775967634150307, + "grad_norm": 10.711235949297516, + "learning_rate": 9.807460890493382e-07, + "loss": 1.135, + "step": 163 + }, + { + "epoch": 0.011848212834359818, + "grad_norm": 11.269197825981816, + "learning_rate": 9.867629362214202e-07, + "loss": 1.1939, + "step": 164 + }, + { + "epoch": 0.011920458034569328, + "grad_norm": 10.387497326498227, + "learning_rate": 9.927797833935017e-07, + "loss": 1.0865, + "step": 165 + }, + { + "epoch": 0.011992703234778839, + "grad_norm": 11.241369370096395, + "learning_rate": 9.987966305655837e-07, + "loss": 1.2574, + "step": 166 + }, + { + "epoch": 0.01206494843498835, + "grad_norm": 10.675636176063321, + "learning_rate": 1.0048134777376655e-06, + "loss": 1.1302, + "step": 167 + }, + { + "epoch": 0.012137193635197862, + "grad_norm": 11.535807052839933, + "learning_rate": 1.0108303249097473e-06, + "loss": 1.1281, + "step": 168 + }, + { + "epoch": 0.012209438835407373, + "grad_norm": 12.772828851133626, + "learning_rate": 1.0168471720818293e-06, + "loss": 1.0783, + "step": 169 + }, + { + "epoch": 0.012281684035616884, + "grad_norm": 11.6023950143218, + "learning_rate": 1.022864019253911e-06, + "loss": 1.1214, + "step": 170 + }, + { + "epoch": 0.012353929235826394, + "grad_norm": 13.689752262791375, + "learning_rate": 1.0288808664259928e-06, + "loss": 1.1544, + "step": 171 + }, + { + "epoch": 0.012426174436035905, + "grad_norm": 12.957979219919562, + "learning_rate": 1.0348977135980746e-06, + "loss": 1.1406, + "step": 172 + }, + { + "epoch": 0.012498419636245418, + "grad_norm": 9.604726867446164, + "learning_rate": 1.0409145607701566e-06, + "loss": 1.0778, + "step": 173 + }, + { + "epoch": 0.012570664836454928, + "grad_norm": 9.908488986303574, + "learning_rate": 1.0469314079422384e-06, + "loss": 1.1605, + "step": 174 + }, + { + "epoch": 0.012642910036664439, + "grad_norm": 12.47768149138255, + "learning_rate": 1.0529482551143202e-06, + "loss": 1.1357, + "step": 175 + }, + { + "epoch": 0.01271515523687395, + "grad_norm": 11.991141546405725, + "learning_rate": 1.058965102286402e-06, + "loss": 1.141, + "step": 176 + }, + { + "epoch": 0.01278740043708346, + "grad_norm": 11.353230531697108, + "learning_rate": 1.064981949458484e-06, + "loss": 1.1, + "step": 177 + }, + { + "epoch": 0.012859645637292973, + "grad_norm": 11.594469238345726, + "learning_rate": 1.0709987966305657e-06, + "loss": 1.1866, + "step": 178 + }, + { + "epoch": 0.012931890837502484, + "grad_norm": 14.480259052298743, + "learning_rate": 1.0770156438026475e-06, + "loss": 1.1539, + "step": 179 + }, + { + "epoch": 0.013004136037711994, + "grad_norm": 20.17765638974984, + "learning_rate": 1.0830324909747293e-06, + "loss": 1.0533, + "step": 180 + }, + { + "epoch": 0.013076381237921505, + "grad_norm": 11.643746304473044, + "learning_rate": 1.0890493381468113e-06, + "loss": 1.1756, + "step": 181 + }, + { + "epoch": 0.013148626438131016, + "grad_norm": 11.022244589622254, + "learning_rate": 1.095066185318893e-06, + "loss": 1.1598, + "step": 182 + }, + { + "epoch": 0.013220871638340528, + "grad_norm": 13.4074621119436, + "learning_rate": 1.1010830324909748e-06, + "loss": 1.1307, + "step": 183 + }, + { + "epoch": 0.013293116838550039, + "grad_norm": 10.636186164021513, + "learning_rate": 1.1070998796630566e-06, + "loss": 1.0827, + "step": 184 + }, + { + "epoch": 0.01336536203875955, + "grad_norm": 13.803722509778046, + "learning_rate": 1.1131167268351386e-06, + "loss": 1.17, + "step": 185 + }, + { + "epoch": 0.01343760723896906, + "grad_norm": 10.707146327706097, + "learning_rate": 1.1191335740072204e-06, + "loss": 1.0632, + "step": 186 + }, + { + "epoch": 0.013509852439178571, + "grad_norm": 10.11691828189103, + "learning_rate": 1.1251504211793021e-06, + "loss": 1.1458, + "step": 187 + }, + { + "epoch": 0.013582097639388084, + "grad_norm": 11.39812960087915, + "learning_rate": 1.131167268351384e-06, + "loss": 1.1053, + "step": 188 + }, + { + "epoch": 0.013654342839597594, + "grad_norm": 10.996532934169098, + "learning_rate": 1.137184115523466e-06, + "loss": 1.0956, + "step": 189 + }, + { + "epoch": 0.013726588039807105, + "grad_norm": 10.490245466096818, + "learning_rate": 1.1432009626955477e-06, + "loss": 1.2017, + "step": 190 + }, + { + "epoch": 0.013798833240016616, + "grad_norm": 11.287797966590752, + "learning_rate": 1.1492178098676295e-06, + "loss": 1.1151, + "step": 191 + }, + { + "epoch": 0.013871078440226127, + "grad_norm": 11.017031922010823, + "learning_rate": 1.1552346570397112e-06, + "loss": 1.2638, + "step": 192 + }, + { + "epoch": 0.013943323640435639, + "grad_norm": 9.12169430571075, + "learning_rate": 1.161251504211793e-06, + "loss": 1.1557, + "step": 193 + }, + { + "epoch": 0.01401556884064515, + "grad_norm": 11.956386146228901, + "learning_rate": 1.167268351383875e-06, + "loss": 1.0114, + "step": 194 + }, + { + "epoch": 0.01408781404085466, + "grad_norm": 12.756233636227869, + "learning_rate": 1.1732851985559568e-06, + "loss": 1.1383, + "step": 195 + }, + { + "epoch": 0.014160059241064171, + "grad_norm": 10.935226465713699, + "learning_rate": 1.1793020457280386e-06, + "loss": 1.0492, + "step": 196 + }, + { + "epoch": 0.014232304441273684, + "grad_norm": 9.423847757427737, + "learning_rate": 1.1853188929001203e-06, + "loss": 1.1243, + "step": 197 + }, + { + "epoch": 0.014304549641483194, + "grad_norm": 10.118177964909979, + "learning_rate": 1.1913357400722023e-06, + "loss": 1.2084, + "step": 198 + }, + { + "epoch": 0.014376794841692705, + "grad_norm": 11.67271099645819, + "learning_rate": 1.1973525872442841e-06, + "loss": 1.1143, + "step": 199 + }, + { + "epoch": 0.014449040041902216, + "grad_norm": 12.3057526820697, + "learning_rate": 1.203369434416366e-06, + "loss": 1.1492, + "step": 200 + }, + { + "epoch": 0.014521285242111727, + "grad_norm": 10.097984535886937, + "learning_rate": 1.2093862815884477e-06, + "loss": 1.0993, + "step": 201 + }, + { + "epoch": 0.014593530442321239, + "grad_norm": 10.159854753912846, + "learning_rate": 1.2154031287605297e-06, + "loss": 1.2186, + "step": 202 + }, + { + "epoch": 0.01466577564253075, + "grad_norm": 11.935269292151387, + "learning_rate": 1.2214199759326114e-06, + "loss": 1.146, + "step": 203 + }, + { + "epoch": 0.01473802084274026, + "grad_norm": 11.739961922111041, + "learning_rate": 1.2274368231046932e-06, + "loss": 1.1623, + "step": 204 + }, + { + "epoch": 0.014810266042949771, + "grad_norm": 10.215743342916356, + "learning_rate": 1.233453670276775e-06, + "loss": 1.052, + "step": 205 + }, + { + "epoch": 0.014882511243159282, + "grad_norm": 11.659717197554713, + "learning_rate": 1.239470517448857e-06, + "loss": 1.2221, + "step": 206 + }, + { + "epoch": 0.014954756443368794, + "grad_norm": 12.596165730621411, + "learning_rate": 1.2454873646209388e-06, + "loss": 1.1632, + "step": 207 + }, + { + "epoch": 0.015027001643578305, + "grad_norm": 13.682347551552853, + "learning_rate": 1.2515042117930208e-06, + "loss": 1.1354, + "step": 208 + }, + { + "epoch": 0.015099246843787816, + "grad_norm": 10.808429282494636, + "learning_rate": 1.2575210589651023e-06, + "loss": 1.0641, + "step": 209 + }, + { + "epoch": 0.015171492043997327, + "grad_norm": 12.103723479587284, + "learning_rate": 1.263537906137184e-06, + "loss": 1.2332, + "step": 210 + }, + { + "epoch": 0.015243737244206837, + "grad_norm": 11.90557651166861, + "learning_rate": 1.269554753309266e-06, + "loss": 1.2182, + "step": 211 + }, + { + "epoch": 0.01531598244441635, + "grad_norm": 9.68608506236907, + "learning_rate": 1.2755716004813479e-06, + "loss": 1.0941, + "step": 212 + }, + { + "epoch": 0.01538822764462586, + "grad_norm": 11.66182855470178, + "learning_rate": 1.2815884476534297e-06, + "loss": 1.0942, + "step": 213 + }, + { + "epoch": 0.015460472844835371, + "grad_norm": 10.496131184223346, + "learning_rate": 1.2876052948255116e-06, + "loss": 1.0282, + "step": 214 + }, + { + "epoch": 0.015532718045044882, + "grad_norm": 14.432495835272437, + "learning_rate": 1.2936221419975934e-06, + "loss": 1.1463, + "step": 215 + }, + { + "epoch": 0.015604963245254393, + "grad_norm": 12.577956981602961, + "learning_rate": 1.299638989169675e-06, + "loss": 1.0951, + "step": 216 + }, + { + "epoch": 0.015677208445463903, + "grad_norm": 10.014215664786702, + "learning_rate": 1.305655836341757e-06, + "loss": 1.0738, + "step": 217 + }, + { + "epoch": 0.015749453645673414, + "grad_norm": 11.135665410222147, + "learning_rate": 1.3116726835138388e-06, + "loss": 1.1182, + "step": 218 + }, + { + "epoch": 0.01582169884588293, + "grad_norm": 14.361085210826415, + "learning_rate": 1.3176895306859207e-06, + "loss": 1.1189, + "step": 219 + }, + { + "epoch": 0.01589394404609244, + "grad_norm": 14.946329866868917, + "learning_rate": 1.3237063778580025e-06, + "loss": 1.16, + "step": 220 + }, + { + "epoch": 0.01596618924630195, + "grad_norm": 10.290108340094331, + "learning_rate": 1.3297232250300843e-06, + "loss": 1.0985, + "step": 221 + }, + { + "epoch": 0.01603843444651146, + "grad_norm": 10.29996797130753, + "learning_rate": 1.3357400722021663e-06, + "loss": 1.3023, + "step": 222 + }, + { + "epoch": 0.01611067964672097, + "grad_norm": 12.157260411494468, + "learning_rate": 1.341756919374248e-06, + "loss": 1.1905, + "step": 223 + }, + { + "epoch": 0.016182924846930482, + "grad_norm": 12.682608427234308, + "learning_rate": 1.3477737665463296e-06, + "loss": 1.1394, + "step": 224 + }, + { + "epoch": 0.016255170047139993, + "grad_norm": 12.043683966558078, + "learning_rate": 1.3537906137184118e-06, + "loss": 1.1029, + "step": 225 + }, + { + "epoch": 0.016327415247349503, + "grad_norm": 11.722002803763754, + "learning_rate": 1.3598074608904934e-06, + "loss": 1.054, + "step": 226 + }, + { + "epoch": 0.016399660447559014, + "grad_norm": 13.735112679279336, + "learning_rate": 1.3658243080625752e-06, + "loss": 1.1728, + "step": 227 + }, + { + "epoch": 0.016471905647768525, + "grad_norm": 10.985303598062348, + "learning_rate": 1.3718411552346572e-06, + "loss": 1.1576, + "step": 228 + }, + { + "epoch": 0.01654415084797804, + "grad_norm": 10.382125062480624, + "learning_rate": 1.377858002406739e-06, + "loss": 1.1832, + "step": 229 + }, + { + "epoch": 0.01661639604818755, + "grad_norm": 10.274778607807232, + "learning_rate": 1.3838748495788207e-06, + "loss": 1.1424, + "step": 230 + }, + { + "epoch": 0.01668864124839706, + "grad_norm": 9.6794116309559, + "learning_rate": 1.3898916967509027e-06, + "loss": 1.16, + "step": 231 + }, + { + "epoch": 0.01676088644860657, + "grad_norm": 13.630306225567141, + "learning_rate": 1.3959085439229845e-06, + "loss": 1.1613, + "step": 232 + }, + { + "epoch": 0.016833131648816082, + "grad_norm": 10.784633116726889, + "learning_rate": 1.4019253910950665e-06, + "loss": 1.095, + "step": 233 + }, + { + "epoch": 0.016905376849025593, + "grad_norm": 10.188486624842271, + "learning_rate": 1.407942238267148e-06, + "loss": 1.1912, + "step": 234 + }, + { + "epoch": 0.016977622049235103, + "grad_norm": 11.15979733956474, + "learning_rate": 1.4139590854392298e-06, + "loss": 1.068, + "step": 235 + }, + { + "epoch": 0.017049867249444614, + "grad_norm": 11.525446064357851, + "learning_rate": 1.4199759326113118e-06, + "loss": 1.1771, + "step": 236 + }, + { + "epoch": 0.017122112449654125, + "grad_norm": 11.185853293504639, + "learning_rate": 1.4259927797833936e-06, + "loss": 1.1719, + "step": 237 + }, + { + "epoch": 0.017194357649863636, + "grad_norm": 13.826341692919895, + "learning_rate": 1.4320096269554754e-06, + "loss": 1.2402, + "step": 238 + }, + { + "epoch": 0.01726660285007315, + "grad_norm": 11.298129738841872, + "learning_rate": 1.4380264741275574e-06, + "loss": 1.097, + "step": 239 + }, + { + "epoch": 0.01733884805028266, + "grad_norm": 8.83049392649357, + "learning_rate": 1.4440433212996392e-06, + "loss": 1.1333, + "step": 240 + }, + { + "epoch": 0.01741109325049217, + "grad_norm": 9.92760276969872, + "learning_rate": 1.4500601684717207e-06, + "loss": 1.0828, + "step": 241 + }, + { + "epoch": 0.017483338450701682, + "grad_norm": 13.054416911655622, + "learning_rate": 1.456077015643803e-06, + "loss": 1.2752, + "step": 242 + }, + { + "epoch": 0.017555583650911193, + "grad_norm": 13.562701386393655, + "learning_rate": 1.4620938628158845e-06, + "loss": 1.0531, + "step": 243 + }, + { + "epoch": 0.017627828851120703, + "grad_norm": 9.694739608376223, + "learning_rate": 1.4681107099879663e-06, + "loss": 1.1271, + "step": 244 + }, + { + "epoch": 0.017700074051330214, + "grad_norm": 10.627002482482787, + "learning_rate": 1.4741275571600483e-06, + "loss": 1.0768, + "step": 245 + }, + { + "epoch": 0.017772319251539725, + "grad_norm": 11.91704816585486, + "learning_rate": 1.48014440433213e-06, + "loss": 1.1204, + "step": 246 + }, + { + "epoch": 0.017844564451749236, + "grad_norm": 10.889418591268285, + "learning_rate": 1.486161251504212e-06, + "loss": 1.0212, + "step": 247 + }, + { + "epoch": 0.01791680965195875, + "grad_norm": 10.429936941071436, + "learning_rate": 1.4921780986762938e-06, + "loss": 1.1512, + "step": 248 + }, + { + "epoch": 0.01798905485216826, + "grad_norm": 10.33457627563293, + "learning_rate": 1.4981949458483756e-06, + "loss": 1.1099, + "step": 249 + }, + { + "epoch": 0.01806130005237777, + "grad_norm": 12.305452605181143, + "learning_rate": 1.5042117930204576e-06, + "loss": 1.2576, + "step": 250 + }, + { + "epoch": 0.018133545252587282, + "grad_norm": 12.585249462299915, + "learning_rate": 1.5102286401925391e-06, + "loss": 1.0889, + "step": 251 + }, + { + "epoch": 0.018205790452796793, + "grad_norm": 14.356691061908045, + "learning_rate": 1.516245487364621e-06, + "loss": 1.1968, + "step": 252 + }, + { + "epoch": 0.018278035653006303, + "grad_norm": 8.859725487721018, + "learning_rate": 1.522262334536703e-06, + "loss": 1.1676, + "step": 253 + }, + { + "epoch": 0.018350280853215814, + "grad_norm": 10.017819739908909, + "learning_rate": 1.5282791817087847e-06, + "loss": 1.1416, + "step": 254 + }, + { + "epoch": 0.018422526053425325, + "grad_norm": 10.902995938482702, + "learning_rate": 1.5342960288808665e-06, + "loss": 1.0688, + "step": 255 + }, + { + "epoch": 0.018494771253634835, + "grad_norm": 10.37440250009063, + "learning_rate": 1.5403128760529485e-06, + "loss": 1.1257, + "step": 256 + }, + { + "epoch": 0.018567016453844346, + "grad_norm": 10.678388671477576, + "learning_rate": 1.5463297232250302e-06, + "loss": 1.087, + "step": 257 + }, + { + "epoch": 0.01863926165405386, + "grad_norm": 13.712926093325802, + "learning_rate": 1.5523465703971122e-06, + "loss": 1.1182, + "step": 258 + }, + { + "epoch": 0.01871150685426337, + "grad_norm": 10.187886798279159, + "learning_rate": 1.5583634175691938e-06, + "loss": 1.1414, + "step": 259 + }, + { + "epoch": 0.018783752054472882, + "grad_norm": 11.245450943877366, + "learning_rate": 1.5643802647412756e-06, + "loss": 1.0821, + "step": 260 + }, + { + "epoch": 0.018855997254682393, + "grad_norm": 16.62574341911607, + "learning_rate": 1.5703971119133576e-06, + "loss": 1.1864, + "step": 261 + }, + { + "epoch": 0.018928242454891903, + "grad_norm": 9.484227388292208, + "learning_rate": 1.5764139590854393e-06, + "loss": 1.0936, + "step": 262 + }, + { + "epoch": 0.019000487655101414, + "grad_norm": 9.648556505465477, + "learning_rate": 1.5824308062575211e-06, + "loss": 1.0329, + "step": 263 + }, + { + "epoch": 0.019072732855310925, + "grad_norm": 10.341220511854715, + "learning_rate": 1.5884476534296031e-06, + "loss": 1.0442, + "step": 264 + }, + { + "epoch": 0.019144978055520435, + "grad_norm": 14.998587986926053, + "learning_rate": 1.594464500601685e-06, + "loss": 1.0755, + "step": 265 + }, + { + "epoch": 0.019217223255729946, + "grad_norm": 9.098306988856583, + "learning_rate": 1.6004813477737667e-06, + "loss": 1.1011, + "step": 266 + }, + { + "epoch": 0.019289468455939457, + "grad_norm": 8.35152732905948, + "learning_rate": 1.6064981949458487e-06, + "loss": 1.1264, + "step": 267 + }, + { + "epoch": 0.01936171365614897, + "grad_norm": 11.454058891410094, + "learning_rate": 1.6125150421179302e-06, + "loss": 1.1711, + "step": 268 + }, + { + "epoch": 0.019433958856358482, + "grad_norm": 7.0999631853895355, + "learning_rate": 1.618531889290012e-06, + "loss": 0.9992, + "step": 269 + }, + { + "epoch": 0.019506204056567993, + "grad_norm": 10.017404288131482, + "learning_rate": 1.624548736462094e-06, + "loss": 1.1489, + "step": 270 + }, + { + "epoch": 0.019578449256777503, + "grad_norm": 10.443169247673183, + "learning_rate": 1.6305655836341758e-06, + "loss": 1.0537, + "step": 271 + }, + { + "epoch": 0.019650694456987014, + "grad_norm": 12.33313737318369, + "learning_rate": 1.6365824308062578e-06, + "loss": 1.1564, + "step": 272 + }, + { + "epoch": 0.019722939657196525, + "grad_norm": 11.396936078505444, + "learning_rate": 1.6425992779783395e-06, + "loss": 1.1079, + "step": 273 + }, + { + "epoch": 0.019795184857406035, + "grad_norm": 10.81275824971579, + "learning_rate": 1.6486161251504213e-06, + "loss": 1.2034, + "step": 274 + }, + { + "epoch": 0.019867430057615546, + "grad_norm": 12.459984779409846, + "learning_rate": 1.6546329723225033e-06, + "loss": 1.1988, + "step": 275 + }, + { + "epoch": 0.019939675257825057, + "grad_norm": 9.819196809136809, + "learning_rate": 1.6606498194945849e-06, + "loss": 1.0506, + "step": 276 + }, + { + "epoch": 0.020011920458034568, + "grad_norm": 9.247241459450217, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.0462, + "step": 277 + }, + { + "epoch": 0.020084165658244082, + "grad_norm": 10.628192736065758, + "learning_rate": 1.6726835138387487e-06, + "loss": 1.0516, + "step": 278 + }, + { + "epoch": 0.020156410858453593, + "grad_norm": 11.321217586963304, + "learning_rate": 1.6787003610108304e-06, + "loss": 0.9755, + "step": 279 + }, + { + "epoch": 0.020228656058663103, + "grad_norm": 9.903170422840764, + "learning_rate": 1.6847172081829122e-06, + "loss": 1.1079, + "step": 280 + }, + { + "epoch": 0.020300901258872614, + "grad_norm": 11.07275604868417, + "learning_rate": 1.6907340553549942e-06, + "loss": 1.1733, + "step": 281 + }, + { + "epoch": 0.020373146459082125, + "grad_norm": 12.883596568772052, + "learning_rate": 1.696750902527076e-06, + "loss": 1.0936, + "step": 282 + }, + { + "epoch": 0.020445391659291635, + "grad_norm": 8.663004859645536, + "learning_rate": 1.7027677496991575e-06, + "loss": 1.047, + "step": 283 + }, + { + "epoch": 0.020517636859501146, + "grad_norm": 10.416227489114304, + "learning_rate": 1.7087845968712397e-06, + "loss": 1.0967, + "step": 284 + }, + { + "epoch": 0.020589882059710657, + "grad_norm": 11.612976419225028, + "learning_rate": 1.7148014440433213e-06, + "loss": 1.0525, + "step": 285 + }, + { + "epoch": 0.020662127259920168, + "grad_norm": 12.172128853458434, + "learning_rate": 1.7208182912154033e-06, + "loss": 1.068, + "step": 286 + }, + { + "epoch": 0.020734372460129682, + "grad_norm": 11.264666343248123, + "learning_rate": 1.726835138387485e-06, + "loss": 1.1458, + "step": 287 + }, + { + "epoch": 0.020806617660339193, + "grad_norm": 8.901229996285757, + "learning_rate": 1.7328519855595669e-06, + "loss": 1.0478, + "step": 288 + }, + { + "epoch": 0.020878862860548703, + "grad_norm": 9.099535383163973, + "learning_rate": 1.7388688327316489e-06, + "loss": 1.0209, + "step": 289 + }, + { + "epoch": 0.020951108060758214, + "grad_norm": 12.417428695200574, + "learning_rate": 1.7448856799037306e-06, + "loss": 1.1668, + "step": 290 + }, + { + "epoch": 0.021023353260967725, + "grad_norm": 15.537298302787544, + "learning_rate": 1.7509025270758124e-06, + "loss": 1.0354, + "step": 291 + }, + { + "epoch": 0.021095598461177235, + "grad_norm": 10.707671109118673, + "learning_rate": 1.7569193742478944e-06, + "loss": 1.0992, + "step": 292 + }, + { + "epoch": 0.021167843661386746, + "grad_norm": 13.50910924949694, + "learning_rate": 1.762936221419976e-06, + "loss": 1.1489, + "step": 293 + }, + { + "epoch": 0.021240088861596257, + "grad_norm": 12.202008966852485, + "learning_rate": 1.7689530685920577e-06, + "loss": 1.1295, + "step": 294 + }, + { + "epoch": 0.021312334061805768, + "grad_norm": 10.53160606257774, + "learning_rate": 1.7749699157641397e-06, + "loss": 1.0336, + "step": 295 + }, + { + "epoch": 0.02138457926201528, + "grad_norm": 10.627453330042272, + "learning_rate": 1.7809867629362215e-06, + "loss": 1.0403, + "step": 296 + }, + { + "epoch": 0.021456824462224793, + "grad_norm": 11.97797024709255, + "learning_rate": 1.7870036101083035e-06, + "loss": 1.1627, + "step": 297 + }, + { + "epoch": 0.021529069662434303, + "grad_norm": 9.3455371997418, + "learning_rate": 1.7930204572803853e-06, + "loss": 1.0893, + "step": 298 + }, + { + "epoch": 0.021601314862643814, + "grad_norm": 10.206880261034021, + "learning_rate": 1.799037304452467e-06, + "loss": 1.1021, + "step": 299 + }, + { + "epoch": 0.021673560062853325, + "grad_norm": 9.740916789496888, + "learning_rate": 1.805054151624549e-06, + "loss": 1.1006, + "step": 300 + }, + { + "epoch": 0.021745805263062835, + "grad_norm": 12.41213995496823, + "learning_rate": 1.8110709987966308e-06, + "loss": 1.1011, + "step": 301 + }, + { + "epoch": 0.021818050463272346, + "grad_norm": 8.81461390038671, + "learning_rate": 1.8170878459687124e-06, + "loss": 1.1155, + "step": 302 + }, + { + "epoch": 0.021890295663481857, + "grad_norm": 10.183773927371355, + "learning_rate": 1.8231046931407944e-06, + "loss": 1.1249, + "step": 303 + }, + { + "epoch": 0.021962540863691368, + "grad_norm": 12.040730335478772, + "learning_rate": 1.8291215403128762e-06, + "loss": 1.0297, + "step": 304 + }, + { + "epoch": 0.02203478606390088, + "grad_norm": 11.163451875467503, + "learning_rate": 1.835138387484958e-06, + "loss": 1.1126, + "step": 305 + }, + { + "epoch": 0.02210703126411039, + "grad_norm": 9.601199361267565, + "learning_rate": 1.84115523465704e-06, + "loss": 1.0662, + "step": 306 + }, + { + "epoch": 0.022179276464319903, + "grad_norm": 12.933495238034675, + "learning_rate": 1.8471720818291217e-06, + "loss": 1.1517, + "step": 307 + }, + { + "epoch": 0.022251521664529414, + "grad_norm": 10.032866825989812, + "learning_rate": 1.8531889290012035e-06, + "loss": 1.0646, + "step": 308 + }, + { + "epoch": 0.022323766864738925, + "grad_norm": 10.211609565081694, + "learning_rate": 1.8592057761732855e-06, + "loss": 1.0995, + "step": 309 + }, + { + "epoch": 0.022396012064948435, + "grad_norm": 9.586876211551157, + "learning_rate": 1.865222623345367e-06, + "loss": 0.9966, + "step": 310 + }, + { + "epoch": 0.022468257265157946, + "grad_norm": 11.034204881556445, + "learning_rate": 1.871239470517449e-06, + "loss": 1.0381, + "step": 311 + }, + { + "epoch": 0.022540502465367457, + "grad_norm": 9.479844244470163, + "learning_rate": 1.8772563176895308e-06, + "loss": 1.0201, + "step": 312 + }, + { + "epoch": 0.022612747665576968, + "grad_norm": 16.549538892332563, + "learning_rate": 1.8832731648616126e-06, + "loss": 1.2135, + "step": 313 + }, + { + "epoch": 0.02268499286578648, + "grad_norm": 11.510586054089242, + "learning_rate": 1.8892900120336946e-06, + "loss": 1.0322, + "step": 314 + }, + { + "epoch": 0.02275723806599599, + "grad_norm": 9.149192199069152, + "learning_rate": 1.8953068592057764e-06, + "loss": 1.1051, + "step": 315 + }, + { + "epoch": 0.022829483266205503, + "grad_norm": 12.852171575612596, + "learning_rate": 1.9013237063778581e-06, + "loss": 1.0755, + "step": 316 + }, + { + "epoch": 0.022901728466415014, + "grad_norm": 11.913961961665962, + "learning_rate": 1.9073405535499401e-06, + "loss": 1.1087, + "step": 317 + }, + { + "epoch": 0.022973973666624525, + "grad_norm": 11.986186660247574, + "learning_rate": 1.9133574007220217e-06, + "loss": 1.1368, + "step": 318 + }, + { + "epoch": 0.023046218866834035, + "grad_norm": 10.301501148052177, + "learning_rate": 1.9193742478941035e-06, + "loss": 1.1992, + "step": 319 + }, + { + "epoch": 0.023118464067043546, + "grad_norm": 8.44141212473766, + "learning_rate": 1.9253910950661857e-06, + "loss": 1.1152, + "step": 320 + }, + { + "epoch": 0.023190709267253057, + "grad_norm": 13.143937463371302, + "learning_rate": 1.9314079422382675e-06, + "loss": 1.0774, + "step": 321 + }, + { + "epoch": 0.023262954467462568, + "grad_norm": 10.804216861473398, + "learning_rate": 1.937424789410349e-06, + "loss": 1.1333, + "step": 322 + }, + { + "epoch": 0.023335199667672078, + "grad_norm": 10.786404380988822, + "learning_rate": 1.943441636582431e-06, + "loss": 1.1205, + "step": 323 + }, + { + "epoch": 0.02340744486788159, + "grad_norm": 9.999180950955003, + "learning_rate": 1.949458483754513e-06, + "loss": 1.1007, + "step": 324 + }, + { + "epoch": 0.0234796900680911, + "grad_norm": 8.705604624601488, + "learning_rate": 1.9554753309265946e-06, + "loss": 0.9751, + "step": 325 + }, + { + "epoch": 0.023551935268300614, + "grad_norm": 12.249093158398502, + "learning_rate": 1.9614921780986764e-06, + "loss": 1.1575, + "step": 326 + }, + { + "epoch": 0.023624180468510125, + "grad_norm": 10.70542965322077, + "learning_rate": 1.967509025270758e-06, + "loss": 1.0538, + "step": 327 + }, + { + "epoch": 0.023696425668719635, + "grad_norm": 9.46332497874401, + "learning_rate": 1.9735258724428403e-06, + "loss": 1.148, + "step": 328 + }, + { + "epoch": 0.023768670868929146, + "grad_norm": 10.474817140571856, + "learning_rate": 1.979542719614922e-06, + "loss": 0.9958, + "step": 329 + }, + { + "epoch": 0.023840916069138657, + "grad_norm": 9.848370923714958, + "learning_rate": 1.9855595667870035e-06, + "loss": 1.1308, + "step": 330 + }, + { + "epoch": 0.023913161269348168, + "grad_norm": 10.580111115706826, + "learning_rate": 1.9915764139590857e-06, + "loss": 1.0433, + "step": 331 + }, + { + "epoch": 0.023985406469557678, + "grad_norm": 9.759139252526081, + "learning_rate": 1.9975932611311675e-06, + "loss": 1.0922, + "step": 332 + }, + { + "epoch": 0.02405765166976719, + "grad_norm": 10.660828979780739, + "learning_rate": 2.0036101083032492e-06, + "loss": 1.0714, + "step": 333 + }, + { + "epoch": 0.0241298968699767, + "grad_norm": 8.59699201986083, + "learning_rate": 2.009626955475331e-06, + "loss": 1.1281, + "step": 334 + }, + { + "epoch": 0.02420214207018621, + "grad_norm": 8.528106445650161, + "learning_rate": 2.0156438026474128e-06, + "loss": 1.0635, + "step": 335 + }, + { + "epoch": 0.024274387270395725, + "grad_norm": 10.32076861335956, + "learning_rate": 2.0216606498194946e-06, + "loss": 1.1233, + "step": 336 + }, + { + "epoch": 0.024346632470605235, + "grad_norm": 13.160367851333863, + "learning_rate": 2.0276774969915768e-06, + "loss": 1.2112, + "step": 337 + }, + { + "epoch": 0.024418877670814746, + "grad_norm": 11.033995721608408, + "learning_rate": 2.0336943441636585e-06, + "loss": 1.145, + "step": 338 + }, + { + "epoch": 0.024491122871024257, + "grad_norm": 11.3396546929211, + "learning_rate": 2.0397111913357403e-06, + "loss": 1.0089, + "step": 339 + }, + { + "epoch": 0.024563368071233768, + "grad_norm": 9.205099881618146, + "learning_rate": 2.045728038507822e-06, + "loss": 1.0592, + "step": 340 + }, + { + "epoch": 0.024635613271443278, + "grad_norm": 9.698783357751912, + "learning_rate": 2.051744885679904e-06, + "loss": 1.0776, + "step": 341 + }, + { + "epoch": 0.02470785847165279, + "grad_norm": 11.728104339426913, + "learning_rate": 2.0577617328519857e-06, + "loss": 1.0467, + "step": 342 + }, + { + "epoch": 0.0247801036718623, + "grad_norm": 13.038134469296331, + "learning_rate": 2.0637785800240674e-06, + "loss": 1.0519, + "step": 343 + }, + { + "epoch": 0.02485234887207181, + "grad_norm": 8.364704237302062, + "learning_rate": 2.0697954271961492e-06, + "loss": 1.0623, + "step": 344 + }, + { + "epoch": 0.02492459407228132, + "grad_norm": 12.850877403851774, + "learning_rate": 2.0758122743682314e-06, + "loss": 1.115, + "step": 345 + }, + { + "epoch": 0.024996839272490835, + "grad_norm": 14.002649873906401, + "learning_rate": 2.081829121540313e-06, + "loss": 1.1828, + "step": 346 + }, + { + "epoch": 0.025069084472700346, + "grad_norm": 12.924697512364954, + "learning_rate": 2.0878459687123946e-06, + "loss": 1.1307, + "step": 347 + }, + { + "epoch": 0.025141329672909857, + "grad_norm": 10.840020057687818, + "learning_rate": 2.0938628158844768e-06, + "loss": 1.137, + "step": 348 + }, + { + "epoch": 0.025213574873119367, + "grad_norm": 9.880538739176481, + "learning_rate": 2.0998796630565585e-06, + "loss": 1.1478, + "step": 349 + }, + { + "epoch": 0.025285820073328878, + "grad_norm": 8.775597324310038, + "learning_rate": 2.1058965102286403e-06, + "loss": 1.0279, + "step": 350 + }, + { + "epoch": 0.02535806527353839, + "grad_norm": 10.060805657890418, + "learning_rate": 2.111913357400722e-06, + "loss": 1.1281, + "step": 351 + }, + { + "epoch": 0.0254303104737479, + "grad_norm": 9.857083448779914, + "learning_rate": 2.117930204572804e-06, + "loss": 1.0271, + "step": 352 + }, + { + "epoch": 0.02550255567395741, + "grad_norm": 11.331902656759175, + "learning_rate": 2.123947051744886e-06, + "loss": 1.0896, + "step": 353 + }, + { + "epoch": 0.02557480087416692, + "grad_norm": 9.37714412330213, + "learning_rate": 2.129963898916968e-06, + "loss": 1.0402, + "step": 354 + }, + { + "epoch": 0.025647046074376435, + "grad_norm": 11.160461827501809, + "learning_rate": 2.1359807460890496e-06, + "loss": 1.0929, + "step": 355 + }, + { + "epoch": 0.025719291274585946, + "grad_norm": 9.370985468580333, + "learning_rate": 2.1419975932611314e-06, + "loss": 1.1004, + "step": 356 + }, + { + "epoch": 0.025791536474795457, + "grad_norm": 9.34606945631299, + "learning_rate": 2.148014440433213e-06, + "loss": 1.0352, + "step": 357 + }, + { + "epoch": 0.025863781675004967, + "grad_norm": 9.277090045873129, + "learning_rate": 2.154031287605295e-06, + "loss": 1.1027, + "step": 358 + }, + { + "epoch": 0.025936026875214478, + "grad_norm": 9.65066317320303, + "learning_rate": 2.1600481347773767e-06, + "loss": 1.0032, + "step": 359 + }, + { + "epoch": 0.02600827207542399, + "grad_norm": 8.921992152525497, + "learning_rate": 2.1660649819494585e-06, + "loss": 1.1018, + "step": 360 + }, + { + "epoch": 0.0260805172756335, + "grad_norm": 10.125241500423313, + "learning_rate": 2.1720818291215403e-06, + "loss": 1.0425, + "step": 361 + }, + { + "epoch": 0.02615276247584301, + "grad_norm": 7.744306380240593, + "learning_rate": 2.1780986762936225e-06, + "loss": 1.0782, + "step": 362 + }, + { + "epoch": 0.02622500767605252, + "grad_norm": 10.056177652785127, + "learning_rate": 2.1841155234657043e-06, + "loss": 1.2477, + "step": 363 + }, + { + "epoch": 0.026297252876262032, + "grad_norm": 10.84038392516702, + "learning_rate": 2.190132370637786e-06, + "loss": 1.1179, + "step": 364 + }, + { + "epoch": 0.026369498076471546, + "grad_norm": 10.366640330911856, + "learning_rate": 2.196149217809868e-06, + "loss": 1.1271, + "step": 365 + }, + { + "epoch": 0.026441743276681057, + "grad_norm": 10.670515160892098, + "learning_rate": 2.2021660649819496e-06, + "loss": 1.0915, + "step": 366 + }, + { + "epoch": 0.026513988476890567, + "grad_norm": 9.395176043360879, + "learning_rate": 2.2081829121540314e-06, + "loss": 1.0953, + "step": 367 + }, + { + "epoch": 0.026586233677100078, + "grad_norm": 9.330218567409073, + "learning_rate": 2.214199759326113e-06, + "loss": 1.1226, + "step": 368 + }, + { + "epoch": 0.02665847887730959, + "grad_norm": 10.931472599823119, + "learning_rate": 2.220216606498195e-06, + "loss": 1.0355, + "step": 369 + }, + { + "epoch": 0.0267307240775191, + "grad_norm": 10.37166937256151, + "learning_rate": 2.226233453670277e-06, + "loss": 0.9653, + "step": 370 + }, + { + "epoch": 0.02680296927772861, + "grad_norm": 12.315403939552123, + "learning_rate": 2.232250300842359e-06, + "loss": 1.0271, + "step": 371 + }, + { + "epoch": 0.02687521447793812, + "grad_norm": 15.387028648662666, + "learning_rate": 2.2382671480144407e-06, + "loss": 1.0553, + "step": 372 + }, + { + "epoch": 0.026947459678147632, + "grad_norm": 10.342419311120162, + "learning_rate": 2.2442839951865225e-06, + "loss": 1.019, + "step": 373 + }, + { + "epoch": 0.027019704878357143, + "grad_norm": 10.222178965859412, + "learning_rate": 2.2503008423586043e-06, + "loss": 1.146, + "step": 374 + }, + { + "epoch": 0.027091950078566657, + "grad_norm": 10.443765370182437, + "learning_rate": 2.256317689530686e-06, + "loss": 1.1014, + "step": 375 + }, + { + "epoch": 0.027164195278776167, + "grad_norm": 9.164957014233831, + "learning_rate": 2.262334536702768e-06, + "loss": 1.0655, + "step": 376 + }, + { + "epoch": 0.027236440478985678, + "grad_norm": 11.90494015514804, + "learning_rate": 2.2683513838748496e-06, + "loss": 1.0783, + "step": 377 + }, + { + "epoch": 0.02730868567919519, + "grad_norm": 9.157866768122137, + "learning_rate": 2.274368231046932e-06, + "loss": 1.09, + "step": 378 + }, + { + "epoch": 0.0273809308794047, + "grad_norm": 9.497426939399865, + "learning_rate": 2.2803850782190136e-06, + "loss": 1.0842, + "step": 379 + }, + { + "epoch": 0.02745317607961421, + "grad_norm": 9.414518353200757, + "learning_rate": 2.2864019253910954e-06, + "loss": 1.0478, + "step": 380 + }, + { + "epoch": 0.02752542127982372, + "grad_norm": 8.512817591989869, + "learning_rate": 2.292418772563177e-06, + "loss": 1.0765, + "step": 381 + }, + { + "epoch": 0.027597666480033232, + "grad_norm": 9.590347332755504, + "learning_rate": 2.298435619735259e-06, + "loss": 1.1014, + "step": 382 + }, + { + "epoch": 0.027669911680242743, + "grad_norm": 8.366016181289961, + "learning_rate": 2.3044524669073407e-06, + "loss": 1.0468, + "step": 383 + }, + { + "epoch": 0.027742156880452253, + "grad_norm": 10.417895150541485, + "learning_rate": 2.3104693140794225e-06, + "loss": 1.0493, + "step": 384 + }, + { + "epoch": 0.027814402080661767, + "grad_norm": 8.741293799270005, + "learning_rate": 2.3164861612515043e-06, + "loss": 1.0352, + "step": 385 + }, + { + "epoch": 0.027886647280871278, + "grad_norm": 10.453010045261971, + "learning_rate": 2.322503008423586e-06, + "loss": 0.9831, + "step": 386 + }, + { + "epoch": 0.02795889248108079, + "grad_norm": 9.535075836280134, + "learning_rate": 2.3285198555956682e-06, + "loss": 1.1267, + "step": 387 + }, + { + "epoch": 0.0280311376812903, + "grad_norm": 9.095356068574784, + "learning_rate": 2.33453670276775e-06, + "loss": 1.0254, + "step": 388 + }, + { + "epoch": 0.02810338288149981, + "grad_norm": 10.460332145012623, + "learning_rate": 2.3405535499398314e-06, + "loss": 1.1144, + "step": 389 + }, + { + "epoch": 0.02817562808170932, + "grad_norm": 9.757665506298753, + "learning_rate": 2.3465703971119136e-06, + "loss": 1.0902, + "step": 390 + }, + { + "epoch": 0.028247873281918832, + "grad_norm": 8.245921195938566, + "learning_rate": 2.3525872442839954e-06, + "loss": 1.0891, + "step": 391 + }, + { + "epoch": 0.028320118482128342, + "grad_norm": 9.810306765613868, + "learning_rate": 2.358604091456077e-06, + "loss": 1.0331, + "step": 392 + }, + { + "epoch": 0.028392363682337853, + "grad_norm": 11.104256482265068, + "learning_rate": 2.364620938628159e-06, + "loss": 1.0768, + "step": 393 + }, + { + "epoch": 0.028464608882547367, + "grad_norm": 9.780329554335388, + "learning_rate": 2.3706377858002407e-06, + "loss": 1.0037, + "step": 394 + }, + { + "epoch": 0.028536854082756878, + "grad_norm": 8.909707987545936, + "learning_rate": 2.376654632972323e-06, + "loss": 1.1503, + "step": 395 + }, + { + "epoch": 0.02860909928296639, + "grad_norm": 10.491712296689416, + "learning_rate": 2.3826714801444047e-06, + "loss": 1.1613, + "step": 396 + }, + { + "epoch": 0.0286813444831759, + "grad_norm": 9.294043358160074, + "learning_rate": 2.3886883273164865e-06, + "loss": 1.1194, + "step": 397 + }, + { + "epoch": 0.02875358968338541, + "grad_norm": 8.821884461200868, + "learning_rate": 2.3947051744885682e-06, + "loss": 1.1121, + "step": 398 + }, + { + "epoch": 0.02882583488359492, + "grad_norm": 7.349237218188632, + "learning_rate": 2.40072202166065e-06, + "loss": 0.9874, + "step": 399 + }, + { + "epoch": 0.028898080083804432, + "grad_norm": 8.312203997167499, + "learning_rate": 2.406738868832732e-06, + "loss": 1.1367, + "step": 400 + }, + { + "epoch": 0.028970325284013942, + "grad_norm": 8.94462333589815, + "learning_rate": 2.4127557160048136e-06, + "loss": 1.0105, + "step": 401 + }, + { + "epoch": 0.029042570484223453, + "grad_norm": 8.92928892032051, + "learning_rate": 2.4187725631768953e-06, + "loss": 1.0623, + "step": 402 + }, + { + "epoch": 0.029114815684432964, + "grad_norm": 9.060557611076307, + "learning_rate": 2.4247894103489775e-06, + "loss": 1.0363, + "step": 403 + }, + { + "epoch": 0.029187060884642478, + "grad_norm": 10.222328609336799, + "learning_rate": 2.4308062575210593e-06, + "loss": 1.1599, + "step": 404 + }, + { + "epoch": 0.02925930608485199, + "grad_norm": 8.74786176440956, + "learning_rate": 2.436823104693141e-06, + "loss": 1.0862, + "step": 405 + }, + { + "epoch": 0.0293315512850615, + "grad_norm": 12.380408752299878, + "learning_rate": 2.442839951865223e-06, + "loss": 1.176, + "step": 406 + }, + { + "epoch": 0.02940379648527101, + "grad_norm": 10.31678784555519, + "learning_rate": 2.4488567990373047e-06, + "loss": 1.0881, + "step": 407 + }, + { + "epoch": 0.02947604168548052, + "grad_norm": 12.7537680649099, + "learning_rate": 2.4548736462093864e-06, + "loss": 1.017, + "step": 408 + }, + { + "epoch": 0.02954828688569003, + "grad_norm": 8.41908843699027, + "learning_rate": 2.4608904933814682e-06, + "loss": 1.1455, + "step": 409 + }, + { + "epoch": 0.029620532085899542, + "grad_norm": 9.908354622893137, + "learning_rate": 2.46690734055355e-06, + "loss": 0.9987, + "step": 410 + }, + { + "epoch": 0.029692777286109053, + "grad_norm": 10.084737716691155, + "learning_rate": 2.4729241877256318e-06, + "loss": 1.0418, + "step": 411 + }, + { + "epoch": 0.029765022486318564, + "grad_norm": 11.297239010171012, + "learning_rate": 2.478941034897714e-06, + "loss": 1.0339, + "step": 412 + }, + { + "epoch": 0.029837267686528075, + "grad_norm": 8.469949897698998, + "learning_rate": 2.4849578820697958e-06, + "loss": 1.017, + "step": 413 + }, + { + "epoch": 0.02990951288673759, + "grad_norm": 8.642065883478036, + "learning_rate": 2.4909747292418775e-06, + "loss": 1.0904, + "step": 414 + }, + { + "epoch": 0.0299817580869471, + "grad_norm": 9.870359078690358, + "learning_rate": 2.4969915764139593e-06, + "loss": 1.1015, + "step": 415 + }, + { + "epoch": 0.03005400328715661, + "grad_norm": 9.999998092651186, + "learning_rate": 2.5030084235860415e-06, + "loss": 1.0231, + "step": 416 + }, + { + "epoch": 0.03012624848736612, + "grad_norm": 9.760599488154611, + "learning_rate": 2.509025270758123e-06, + "loss": 1.0936, + "step": 417 + }, + { + "epoch": 0.03019849368757563, + "grad_norm": 10.392621621490916, + "learning_rate": 2.5150421179302047e-06, + "loss": 1.0915, + "step": 418 + }, + { + "epoch": 0.030270738887785142, + "grad_norm": 10.619366756891996, + "learning_rate": 2.5210589651022864e-06, + "loss": 1.1712, + "step": 419 + }, + { + "epoch": 0.030342984087994653, + "grad_norm": 8.807769154862743, + "learning_rate": 2.527075812274368e-06, + "loss": 1.1117, + "step": 420 + }, + { + "epoch": 0.030415229288204164, + "grad_norm": 10.388965078290965, + "learning_rate": 2.53309265944645e-06, + "loss": 1.1632, + "step": 421 + }, + { + "epoch": 0.030487474488413675, + "grad_norm": 12.376365509218505, + "learning_rate": 2.539109506618532e-06, + "loss": 1.1279, + "step": 422 + }, + { + "epoch": 0.03055971968862319, + "grad_norm": 9.406090212253691, + "learning_rate": 2.545126353790614e-06, + "loss": 1.0389, + "step": 423 + }, + { + "epoch": 0.0306319648888327, + "grad_norm": 9.277291940644808, + "learning_rate": 2.5511432009626957e-06, + "loss": 1.1604, + "step": 424 + }, + { + "epoch": 0.03070421008904221, + "grad_norm": 9.24068992780284, + "learning_rate": 2.5571600481347775e-06, + "loss": 0.9744, + "step": 425 + }, + { + "epoch": 0.03077645528925172, + "grad_norm": 10.441184484293847, + "learning_rate": 2.5631768953068593e-06, + "loss": 1.026, + "step": 426 + }, + { + "epoch": 0.03084870048946123, + "grad_norm": 9.098360236690018, + "learning_rate": 2.5691937424789415e-06, + "loss": 1.1384, + "step": 427 + }, + { + "epoch": 0.030920945689670742, + "grad_norm": 10.829586222203535, + "learning_rate": 2.5752105896510233e-06, + "loss": 1.068, + "step": 428 + }, + { + "epoch": 0.030993190889880253, + "grad_norm": 11.025156212908866, + "learning_rate": 2.581227436823105e-06, + "loss": 1.1689, + "step": 429 + }, + { + "epoch": 0.031065436090089764, + "grad_norm": 11.750407110425671, + "learning_rate": 2.587244283995187e-06, + "loss": 1.0467, + "step": 430 + }, + { + "epoch": 0.031137681290299275, + "grad_norm": 7.423379524725597, + "learning_rate": 2.593261131167268e-06, + "loss": 1.0174, + "step": 431 + }, + { + "epoch": 0.031209926490508785, + "grad_norm": 9.104103722489311, + "learning_rate": 2.59927797833935e-06, + "loss": 0.983, + "step": 432 + }, + { + "epoch": 0.0312821716907183, + "grad_norm": 14.239122488866007, + "learning_rate": 2.6052948255114326e-06, + "loss": 1.1598, + "step": 433 + }, + { + "epoch": 0.03135441689092781, + "grad_norm": 10.232854878758252, + "learning_rate": 2.611311672683514e-06, + "loss": 1.0722, + "step": 434 + }, + { + "epoch": 0.03142666209113732, + "grad_norm": 9.995888436979039, + "learning_rate": 2.6173285198555957e-06, + "loss": 1.1113, + "step": 435 + }, + { + "epoch": 0.03149890729134683, + "grad_norm": 8.871714668934759, + "learning_rate": 2.6233453670276775e-06, + "loss": 1.1306, + "step": 436 + }, + { + "epoch": 0.03157115249155634, + "grad_norm": 11.392358300050027, + "learning_rate": 2.6293622141997593e-06, + "loss": 1.0081, + "step": 437 + }, + { + "epoch": 0.03164339769176586, + "grad_norm": 10.469489319159932, + "learning_rate": 2.6353790613718415e-06, + "loss": 1.0125, + "step": 438 + }, + { + "epoch": 0.031715642891975364, + "grad_norm": 11.619148996939748, + "learning_rate": 2.6413959085439233e-06, + "loss": 1.1026, + "step": 439 + }, + { + "epoch": 0.03178788809218488, + "grad_norm": 9.133740563143048, + "learning_rate": 2.647412755716005e-06, + "loss": 1.1344, + "step": 440 + }, + { + "epoch": 0.031860133292394385, + "grad_norm": 8.463504306808568, + "learning_rate": 2.653429602888087e-06, + "loss": 1.0907, + "step": 441 + }, + { + "epoch": 0.0319323784926039, + "grad_norm": 7.73217544876021, + "learning_rate": 2.6594464500601686e-06, + "loss": 0.9804, + "step": 442 + }, + { + "epoch": 0.03200462369281341, + "grad_norm": 7.860276521278803, + "learning_rate": 2.6654632972322504e-06, + "loss": 1.0049, + "step": 443 + }, + { + "epoch": 0.03207686889302292, + "grad_norm": 7.986570052813416, + "learning_rate": 2.6714801444043326e-06, + "loss": 1.0567, + "step": 444 + }, + { + "epoch": 0.03214911409323243, + "grad_norm": 9.545335311475238, + "learning_rate": 2.6774969915764144e-06, + "loss": 1.0697, + "step": 445 + }, + { + "epoch": 0.03222135929344194, + "grad_norm": 10.01886114961933, + "learning_rate": 2.683513838748496e-06, + "loss": 1.0478, + "step": 446 + }, + { + "epoch": 0.03229360449365145, + "grad_norm": 10.421787152928115, + "learning_rate": 2.689530685920578e-06, + "loss": 1.1086, + "step": 447 + }, + { + "epoch": 0.032365849693860964, + "grad_norm": 8.10485737528616, + "learning_rate": 2.6955475330926593e-06, + "loss": 1.0466, + "step": 448 + }, + { + "epoch": 0.03243809489407048, + "grad_norm": 8.721289480217585, + "learning_rate": 2.701564380264742e-06, + "loss": 1.081, + "step": 449 + }, + { + "epoch": 0.032510340094279985, + "grad_norm": 8.00891331991538, + "learning_rate": 2.7075812274368237e-06, + "loss": 1.0242, + "step": 450 + }, + { + "epoch": 0.0325825852944895, + "grad_norm": 8.377394604317342, + "learning_rate": 2.713598074608905e-06, + "loss": 1.0905, + "step": 451 + }, + { + "epoch": 0.03265483049469901, + "grad_norm": 8.142159520527391, + "learning_rate": 2.719614921780987e-06, + "loss": 1.0712, + "step": 452 + }, + { + "epoch": 0.03272707569490852, + "grad_norm": 8.965125438112606, + "learning_rate": 2.7256317689530686e-06, + "loss": 1.0832, + "step": 453 + }, + { + "epoch": 0.03279932089511803, + "grad_norm": 9.526045181721491, + "learning_rate": 2.7316486161251504e-06, + "loss": 1.0289, + "step": 454 + }, + { + "epoch": 0.03287156609532754, + "grad_norm": 7.6873600644657065, + "learning_rate": 2.7376654632972326e-06, + "loss": 0.9663, + "step": 455 + }, + { + "epoch": 0.03294381129553705, + "grad_norm": 8.388511121495233, + "learning_rate": 2.7436823104693144e-06, + "loss": 0.9849, + "step": 456 + }, + { + "epoch": 0.033016056495746564, + "grad_norm": 8.68785215254116, + "learning_rate": 2.749699157641396e-06, + "loss": 1.0437, + "step": 457 + }, + { + "epoch": 0.03308830169595608, + "grad_norm": 9.280414376708508, + "learning_rate": 2.755716004813478e-06, + "loss": 0.9701, + "step": 458 + }, + { + "epoch": 0.033160546896165585, + "grad_norm": 10.845254843124005, + "learning_rate": 2.7617328519855597e-06, + "loss": 1.0812, + "step": 459 + }, + { + "epoch": 0.0332327920963751, + "grad_norm": 10.008213484843493, + "learning_rate": 2.7677496991576415e-06, + "loss": 1.0272, + "step": 460 + }, + { + "epoch": 0.03330503729658461, + "grad_norm": 10.046880315657614, + "learning_rate": 2.7737665463297237e-06, + "loss": 1.1313, + "step": 461 + }, + { + "epoch": 0.03337728249679412, + "grad_norm": 10.195912263848186, + "learning_rate": 2.7797833935018055e-06, + "loss": 1.1248, + "step": 462 + }, + { + "epoch": 0.03344952769700363, + "grad_norm": 9.327780661305757, + "learning_rate": 2.7858002406738872e-06, + "loss": 1.0535, + "step": 463 + }, + { + "epoch": 0.03352177289721314, + "grad_norm": 9.840942369941795, + "learning_rate": 2.791817087845969e-06, + "loss": 1.1426, + "step": 464 + }, + { + "epoch": 0.03359401809742265, + "grad_norm": 9.774066663901502, + "learning_rate": 2.7978339350180504e-06, + "loss": 1.1201, + "step": 465 + }, + { + "epoch": 0.033666263297632164, + "grad_norm": 9.012696106109106, + "learning_rate": 2.803850782190133e-06, + "loss": 0.9304, + "step": 466 + }, + { + "epoch": 0.03373850849784168, + "grad_norm": 9.190551666205854, + "learning_rate": 2.8098676293622148e-06, + "loss": 1.0771, + "step": 467 + }, + { + "epoch": 0.033810753698051185, + "grad_norm": 8.384643468493458, + "learning_rate": 2.815884476534296e-06, + "loss": 0.9346, + "step": 468 + }, + { + "epoch": 0.0338829988982607, + "grad_norm": 8.745417239495843, + "learning_rate": 2.821901323706378e-06, + "loss": 1.148, + "step": 469 + }, + { + "epoch": 0.03395524409847021, + "grad_norm": 10.349236914474133, + "learning_rate": 2.8279181708784597e-06, + "loss": 1.0953, + "step": 470 + }, + { + "epoch": 0.03402748929867972, + "grad_norm": 7.8680883685425975, + "learning_rate": 2.8339350180505415e-06, + "loss": 1.0755, + "step": 471 + }, + { + "epoch": 0.03409973449888923, + "grad_norm": 11.026558805197235, + "learning_rate": 2.8399518652226237e-06, + "loss": 1.1494, + "step": 472 + }, + { + "epoch": 0.03417197969909874, + "grad_norm": 10.258892017049428, + "learning_rate": 2.8459687123947054e-06, + "loss": 1.0863, + "step": 473 + }, + { + "epoch": 0.03424422489930825, + "grad_norm": 10.883603255550009, + "learning_rate": 2.8519855595667872e-06, + "loss": 1.0061, + "step": 474 + }, + { + "epoch": 0.034316470099517764, + "grad_norm": 9.997832254057549, + "learning_rate": 2.858002406738869e-06, + "loss": 0.9922, + "step": 475 + }, + { + "epoch": 0.03438871529972727, + "grad_norm": 11.943448007898843, + "learning_rate": 2.8640192539109508e-06, + "loss": 1.0969, + "step": 476 + }, + { + "epoch": 0.034460960499936785, + "grad_norm": 13.66734020775411, + "learning_rate": 2.870036101083033e-06, + "loss": 1.1059, + "step": 477 + }, + { + "epoch": 0.0345332057001463, + "grad_norm": 8.955125621806975, + "learning_rate": 2.8760529482551148e-06, + "loss": 1.0787, + "step": 478 + }, + { + "epoch": 0.03460545090035581, + "grad_norm": 9.84640806126703, + "learning_rate": 2.8820697954271965e-06, + "loss": 1.0476, + "step": 479 + }, + { + "epoch": 0.03467769610056532, + "grad_norm": 9.279861911535535, + "learning_rate": 2.8880866425992783e-06, + "loss": 1.094, + "step": 480 + }, + { + "epoch": 0.03474994130077483, + "grad_norm": 9.545532331934139, + "learning_rate": 2.89410348977136e-06, + "loss": 1.0989, + "step": 481 + }, + { + "epoch": 0.03482218650098434, + "grad_norm": 8.98934517755278, + "learning_rate": 2.9001203369434414e-06, + "loss": 1.087, + "step": 482 + }, + { + "epoch": 0.03489443170119385, + "grad_norm": 10.423128936230668, + "learning_rate": 2.906137184115524e-06, + "loss": 1.0825, + "step": 483 + }, + { + "epoch": 0.034966676901403364, + "grad_norm": 9.521680887395183, + "learning_rate": 2.912154031287606e-06, + "loss": 1.0214, + "step": 484 + }, + { + "epoch": 0.03503892210161287, + "grad_norm": 9.065612679347884, + "learning_rate": 2.918170878459687e-06, + "loss": 1.1667, + "step": 485 + }, + { + "epoch": 0.035111167301822385, + "grad_norm": 9.340734499164759, + "learning_rate": 2.924187725631769e-06, + "loss": 1.0433, + "step": 486 + }, + { + "epoch": 0.0351834125020319, + "grad_norm": 8.747763211590417, + "learning_rate": 2.9302045728038508e-06, + "loss": 1.0742, + "step": 487 + }, + { + "epoch": 0.03525565770224141, + "grad_norm": 10.418027702425066, + "learning_rate": 2.9362214199759325e-06, + "loss": 1.1099, + "step": 488 + }, + { + "epoch": 0.03532790290245092, + "grad_norm": 9.01613738456787, + "learning_rate": 2.9422382671480147e-06, + "loss": 1.006, + "step": 489 + }, + { + "epoch": 0.03540014810266043, + "grad_norm": 9.322399637956186, + "learning_rate": 2.9482551143200965e-06, + "loss": 1.1485, + "step": 490 + }, + { + "epoch": 0.03547239330286994, + "grad_norm": 13.51847084381507, + "learning_rate": 2.9542719614921783e-06, + "loss": 1.2012, + "step": 491 + }, + { + "epoch": 0.03554463850307945, + "grad_norm": 7.790163871011124, + "learning_rate": 2.96028880866426e-06, + "loss": 1.0569, + "step": 492 + }, + { + "epoch": 0.035616883703288964, + "grad_norm": 9.594763090340773, + "learning_rate": 2.966305655836342e-06, + "loss": 1.0769, + "step": 493 + }, + { + "epoch": 0.03568912890349847, + "grad_norm": 11.04312316117284, + "learning_rate": 2.972322503008424e-06, + "loss": 1.0505, + "step": 494 + }, + { + "epoch": 0.035761374103707985, + "grad_norm": 10.322735509941303, + "learning_rate": 2.978339350180506e-06, + "loss": 1.1057, + "step": 495 + }, + { + "epoch": 0.0358336193039175, + "grad_norm": 10.162039365569433, + "learning_rate": 2.9843561973525876e-06, + "loss": 1.0628, + "step": 496 + }, + { + "epoch": 0.03590586450412701, + "grad_norm": 10.845328708027075, + "learning_rate": 2.9903730445246694e-06, + "loss": 1.1351, + "step": 497 + }, + { + "epoch": 0.03597810970433652, + "grad_norm": 9.989272276192603, + "learning_rate": 2.996389891696751e-06, + "loss": 1.0864, + "step": 498 + }, + { + "epoch": 0.03605035490454603, + "grad_norm": 9.661646054372953, + "learning_rate": 3.0024067388688325e-06, + "loss": 1.0662, + "step": 499 + }, + { + "epoch": 0.03612260010475554, + "grad_norm": 11.237636639164247, + "learning_rate": 3.008423586040915e-06, + "loss": 1.0626, + "step": 500 + }, + { + "epoch": 0.03619484530496505, + "grad_norm": 12.558644172721483, + "learning_rate": 3.014440433212997e-06, + "loss": 1.0685, + "step": 501 + }, + { + "epoch": 0.036267090505174564, + "grad_norm": 8.579426539892859, + "learning_rate": 3.0204572803850783e-06, + "loss": 1.0309, + "step": 502 + }, + { + "epoch": 0.03633933570538407, + "grad_norm": 12.175236092222901, + "learning_rate": 3.02647412755716e-06, + "loss": 1.1219, + "step": 503 + }, + { + "epoch": 0.036411580905593585, + "grad_norm": 9.056489621948502, + "learning_rate": 3.032490974729242e-06, + "loss": 0.9755, + "step": 504 + }, + { + "epoch": 0.03648382610580309, + "grad_norm": 9.780459045995265, + "learning_rate": 3.038507821901324e-06, + "loss": 0.9961, + "step": 505 + }, + { + "epoch": 0.03655607130601261, + "grad_norm": 12.212619345949603, + "learning_rate": 3.044524669073406e-06, + "loss": 1.1087, + "step": 506 + }, + { + "epoch": 0.03662831650622212, + "grad_norm": 9.651508241105377, + "learning_rate": 3.0505415162454876e-06, + "loss": 1.1164, + "step": 507 + }, + { + "epoch": 0.03670056170643163, + "grad_norm": 12.172205948645875, + "learning_rate": 3.0565583634175694e-06, + "loss": 1.084, + "step": 508 + }, + { + "epoch": 0.03677280690664114, + "grad_norm": 7.802986150506115, + "learning_rate": 3.062575210589651e-06, + "loss": 1.0624, + "step": 509 + }, + { + "epoch": 0.03684505210685065, + "grad_norm": 7.410376063391041, + "learning_rate": 3.068592057761733e-06, + "loss": 0.9564, + "step": 510 + }, + { + "epoch": 0.036917297307060164, + "grad_norm": 8.526415450645818, + "learning_rate": 3.074608904933815e-06, + "loss": 1.0601, + "step": 511 + }, + { + "epoch": 0.03698954250726967, + "grad_norm": 10.47825077571861, + "learning_rate": 3.080625752105897e-06, + "loss": 1.0709, + "step": 512 + }, + { + "epoch": 0.037061787707479185, + "grad_norm": 7.593186596978452, + "learning_rate": 3.0866425992779787e-06, + "loss": 0.9573, + "step": 513 + }, + { + "epoch": 0.03713403290768869, + "grad_norm": 8.634608127734069, + "learning_rate": 3.0926594464500605e-06, + "loss": 1.1084, + "step": 514 + }, + { + "epoch": 0.03720627810789821, + "grad_norm": 8.336651815253504, + "learning_rate": 3.0986762936221423e-06, + "loss": 1.0965, + "step": 515 + }, + { + "epoch": 0.03727852330810772, + "grad_norm": 7.932318250111056, + "learning_rate": 3.1046931407942245e-06, + "loss": 1.1669, + "step": 516 + }, + { + "epoch": 0.03735076850831723, + "grad_norm": 8.089437270992843, + "learning_rate": 3.1107099879663062e-06, + "loss": 1.0383, + "step": 517 + }, + { + "epoch": 0.03742301370852674, + "grad_norm": 9.777907900474224, + "learning_rate": 3.1167268351383876e-06, + "loss": 1.083, + "step": 518 + }, + { + "epoch": 0.03749525890873625, + "grad_norm": 9.51587675416147, + "learning_rate": 3.1227436823104694e-06, + "loss": 1.1154, + "step": 519 + }, + { + "epoch": 0.037567504108945764, + "grad_norm": 8.490864949912714, + "learning_rate": 3.128760529482551e-06, + "loss": 1.0578, + "step": 520 + }, + { + "epoch": 0.03763974930915527, + "grad_norm": 8.439652684012406, + "learning_rate": 3.134777376654633e-06, + "loss": 1.0896, + "step": 521 + }, + { + "epoch": 0.037711994509364785, + "grad_norm": 10.309381556790694, + "learning_rate": 3.140794223826715e-06, + "loss": 1.0449, + "step": 522 + }, + { + "epoch": 0.03778423970957429, + "grad_norm": 10.36264147690666, + "learning_rate": 3.146811070998797e-06, + "loss": 1.0611, + "step": 523 + }, + { + "epoch": 0.03785648490978381, + "grad_norm": 7.713245235897261, + "learning_rate": 3.1528279181708787e-06, + "loss": 1.0356, + "step": 524 + }, + { + "epoch": 0.03792873010999332, + "grad_norm": 9.886589878407424, + "learning_rate": 3.1588447653429605e-06, + "loss": 1.1255, + "step": 525 + }, + { + "epoch": 0.03800097531020283, + "grad_norm": 12.601115077495685, + "learning_rate": 3.1648616125150423e-06, + "loss": 1.0573, + "step": 526 + }, + { + "epoch": 0.03807322051041234, + "grad_norm": 8.103976707553057, + "learning_rate": 3.170878459687124e-06, + "loss": 1.0055, + "step": 527 + }, + { + "epoch": 0.03814546571062185, + "grad_norm": 10.369962641355343, + "learning_rate": 3.1768953068592062e-06, + "loss": 0.9683, + "step": 528 + }, + { + "epoch": 0.038217710910831364, + "grad_norm": 8.627537450850058, + "learning_rate": 3.182912154031288e-06, + "loss": 0.9004, + "step": 529 + }, + { + "epoch": 0.03828995611104087, + "grad_norm": 14.343653726358555, + "learning_rate": 3.18892900120337e-06, + "loss": 1.1437, + "step": 530 + }, + { + "epoch": 0.038362201311250385, + "grad_norm": 9.479305414435, + "learning_rate": 3.1949458483754516e-06, + "loss": 0.982, + "step": 531 + }, + { + "epoch": 0.03843444651145989, + "grad_norm": 12.015610396000552, + "learning_rate": 3.2009626955475333e-06, + "loss": 1.0415, + "step": 532 + }, + { + "epoch": 0.03850669171166941, + "grad_norm": 9.538462627022732, + "learning_rate": 3.2069795427196155e-06, + "loss": 1.0587, + "step": 533 + }, + { + "epoch": 0.038578936911878914, + "grad_norm": 9.252553638878947, + "learning_rate": 3.2129963898916973e-06, + "loss": 1.1353, + "step": 534 + }, + { + "epoch": 0.03865118211208843, + "grad_norm": 9.644106837436764, + "learning_rate": 3.2190132370637787e-06, + "loss": 1.0296, + "step": 535 + }, + { + "epoch": 0.03872342731229794, + "grad_norm": 8.590170152819706, + "learning_rate": 3.2250300842358605e-06, + "loss": 1.0104, + "step": 536 + }, + { + "epoch": 0.03879567251250745, + "grad_norm": 9.532777557534098, + "learning_rate": 3.2310469314079422e-06, + "loss": 0.9937, + "step": 537 + }, + { + "epoch": 0.038867917712716964, + "grad_norm": 10.668652429117762, + "learning_rate": 3.237063778580024e-06, + "loss": 1.0501, + "step": 538 + }, + { + "epoch": 0.03894016291292647, + "grad_norm": 8.333879325782155, + "learning_rate": 3.2430806257521062e-06, + "loss": 1.0834, + "step": 539 + }, + { + "epoch": 0.039012408113135985, + "grad_norm": 8.019768608731654, + "learning_rate": 3.249097472924188e-06, + "loss": 1.0044, + "step": 540 + }, + { + "epoch": 0.03908465331334549, + "grad_norm": 9.239572369476889, + "learning_rate": 3.2551143200962698e-06, + "loss": 1.0995, + "step": 541 + }, + { + "epoch": 0.03915689851355501, + "grad_norm": 9.333154086253687, + "learning_rate": 3.2611311672683516e-06, + "loss": 1.0933, + "step": 542 + }, + { + "epoch": 0.039229143713764514, + "grad_norm": 11.329264490103439, + "learning_rate": 3.2671480144404333e-06, + "loss": 1.0653, + "step": 543 + }, + { + "epoch": 0.03930138891397403, + "grad_norm": 8.404628441272097, + "learning_rate": 3.2731648616125155e-06, + "loss": 1.0941, + "step": 544 + }, + { + "epoch": 0.03937363411418354, + "grad_norm": 8.7066071425424, + "learning_rate": 3.2791817087845973e-06, + "loss": 1.1225, + "step": 545 + }, + { + "epoch": 0.03944587931439305, + "grad_norm": 10.735012873947035, + "learning_rate": 3.285198555956679e-06, + "loss": 1.0526, + "step": 546 + }, + { + "epoch": 0.039518124514602564, + "grad_norm": 10.680055044082353, + "learning_rate": 3.291215403128761e-06, + "loss": 1.059, + "step": 547 + }, + { + "epoch": 0.03959036971481207, + "grad_norm": 11.485824472729826, + "learning_rate": 3.2972322503008427e-06, + "loss": 1.0177, + "step": 548 + }, + { + "epoch": 0.039662614915021585, + "grad_norm": 9.723938761961982, + "learning_rate": 3.303249097472924e-06, + "loss": 0.9915, + "step": 549 + }, + { + "epoch": 0.03973486011523109, + "grad_norm": 9.372364131403618, + "learning_rate": 3.3092659446450066e-06, + "loss": 1.1305, + "step": 550 + }, + { + "epoch": 0.03980710531544061, + "grad_norm": 10.337824020012626, + "learning_rate": 3.3152827918170884e-06, + "loss": 1.1532, + "step": 551 + }, + { + "epoch": 0.039879350515650114, + "grad_norm": 11.044871276685605, + "learning_rate": 3.3212996389891698e-06, + "loss": 1.0793, + "step": 552 + }, + { + "epoch": 0.03995159571585963, + "grad_norm": 9.361309583945232, + "learning_rate": 3.3273164861612515e-06, + "loss": 0.9762, + "step": 553 + }, + { + "epoch": 0.040023840916069135, + "grad_norm": 8.463945553263754, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.9919, + "step": 554 + }, + { + "epoch": 0.04009608611627865, + "grad_norm": 9.680252218822583, + "learning_rate": 3.3393501805054155e-06, + "loss": 1.0204, + "step": 555 + }, + { + "epoch": 0.040168331316488164, + "grad_norm": 9.045501973745363, + "learning_rate": 3.3453670276774973e-06, + "loss": 1.0564, + "step": 556 + }, + { + "epoch": 0.04024057651669767, + "grad_norm": 7.536111489315596, + "learning_rate": 3.351383874849579e-06, + "loss": 1.0104, + "step": 557 + }, + { + "epoch": 0.040312821716907185, + "grad_norm": 9.705546447054772, + "learning_rate": 3.357400722021661e-06, + "loss": 1.0274, + "step": 558 + }, + { + "epoch": 0.04038506691711669, + "grad_norm": 8.586961096855292, + "learning_rate": 3.3634175691937426e-06, + "loss": 0.9909, + "step": 559 + }, + { + "epoch": 0.040457312117326207, + "grad_norm": 9.384842245719998, + "learning_rate": 3.3694344163658244e-06, + "loss": 1.0409, + "step": 560 + }, + { + "epoch": 0.040529557317535714, + "grad_norm": 9.451379536040088, + "learning_rate": 3.3754512635379066e-06, + "loss": 1.017, + "step": 561 + }, + { + "epoch": 0.04060180251774523, + "grad_norm": 7.459659678696653, + "learning_rate": 3.3814681107099884e-06, + "loss": 0.9764, + "step": 562 + }, + { + "epoch": 0.040674047717954735, + "grad_norm": 7.835199444927798, + "learning_rate": 3.38748495788207e-06, + "loss": 1.0364, + "step": 563 + }, + { + "epoch": 0.04074629291816425, + "grad_norm": 9.837032290516687, + "learning_rate": 3.393501805054152e-06, + "loss": 1.0801, + "step": 564 + }, + { + "epoch": 0.040818538118373764, + "grad_norm": 9.424817993673257, + "learning_rate": 3.3995186522262337e-06, + "loss": 1.0211, + "step": 565 + }, + { + "epoch": 0.04089078331858327, + "grad_norm": 8.160910257918106, + "learning_rate": 3.405535499398315e-06, + "loss": 1.0756, + "step": 566 + }, + { + "epoch": 0.040963028518792785, + "grad_norm": 9.8175603368045, + "learning_rate": 3.4115523465703977e-06, + "loss": 1.0529, + "step": 567 + }, + { + "epoch": 0.04103527371900229, + "grad_norm": 11.361564330802723, + "learning_rate": 3.4175691937424795e-06, + "loss": 1.1245, + "step": 568 + }, + { + "epoch": 0.041107518919211807, + "grad_norm": 8.4553404776374, + "learning_rate": 3.423586040914561e-06, + "loss": 1.0693, + "step": 569 + }, + { + "epoch": 0.041179764119421314, + "grad_norm": 9.041298948155728, + "learning_rate": 3.4296028880866426e-06, + "loss": 1.0315, + "step": 570 + }, + { + "epoch": 0.04125200931963083, + "grad_norm": 8.874253792353121, + "learning_rate": 3.4356197352587244e-06, + "loss": 1.0034, + "step": 571 + }, + { + "epoch": 0.041324254519840335, + "grad_norm": 9.009223450316592, + "learning_rate": 3.4416365824308066e-06, + "loss": 0.9879, + "step": 572 + }, + { + "epoch": 0.04139649972004985, + "grad_norm": 7.835488638377854, + "learning_rate": 3.4476534296028884e-06, + "loss": 1.0548, + "step": 573 + }, + { + "epoch": 0.041468744920259364, + "grad_norm": 8.227234380949776, + "learning_rate": 3.45367027677497e-06, + "loss": 1.0987, + "step": 574 + }, + { + "epoch": 0.04154099012046887, + "grad_norm": 8.114263865328258, + "learning_rate": 3.459687123947052e-06, + "loss": 1.1154, + "step": 575 + }, + { + "epoch": 0.041613235320678385, + "grad_norm": 8.905921903221982, + "learning_rate": 3.4657039711191337e-06, + "loss": 1.0347, + "step": 576 + }, + { + "epoch": 0.04168548052088789, + "grad_norm": 7.503703156820226, + "learning_rate": 3.4717208182912155e-06, + "loss": 1.054, + "step": 577 + }, + { + "epoch": 0.041757725721097406, + "grad_norm": 8.587977020474566, + "learning_rate": 3.4777376654632977e-06, + "loss": 1.0095, + "step": 578 + }, + { + "epoch": 0.041829970921306914, + "grad_norm": 8.220405904953083, + "learning_rate": 3.4837545126353795e-06, + "loss": 0.969, + "step": 579 + }, + { + "epoch": 0.04190221612151643, + "grad_norm": 8.743029188285204, + "learning_rate": 3.4897713598074613e-06, + "loss": 1.0897, + "step": 580 + }, + { + "epoch": 0.041974461321725935, + "grad_norm": 8.300673252973924, + "learning_rate": 3.495788206979543e-06, + "loss": 1.0031, + "step": 581 + }, + { + "epoch": 0.04204670652193545, + "grad_norm": 11.241633716381319, + "learning_rate": 3.501805054151625e-06, + "loss": 1.0231, + "step": 582 + }, + { + "epoch": 0.04211895172214496, + "grad_norm": 7.758115149795744, + "learning_rate": 3.507821901323707e-06, + "loss": 1.0426, + "step": 583 + }, + { + "epoch": 0.04219119692235447, + "grad_norm": 7.237382103152441, + "learning_rate": 3.513838748495789e-06, + "loss": 1.0369, + "step": 584 + }, + { + "epoch": 0.042263442122563985, + "grad_norm": 8.788628027455735, + "learning_rate": 3.5198555956678706e-06, + "loss": 1.0763, + "step": 585 + }, + { + "epoch": 0.04233568732277349, + "grad_norm": 10.622272814635066, + "learning_rate": 3.525872442839952e-06, + "loss": 0.938, + "step": 586 + }, + { + "epoch": 0.042407932522983006, + "grad_norm": 8.101474448804298, + "learning_rate": 3.5318892900120337e-06, + "loss": 1.0448, + "step": 587 + }, + { + "epoch": 0.042480177723192514, + "grad_norm": 7.24538669478266, + "learning_rate": 3.5379061371841155e-06, + "loss": 1.1258, + "step": 588 + }, + { + "epoch": 0.04255242292340203, + "grad_norm": 12.49571215520048, + "learning_rate": 3.5439229843561977e-06, + "loss": 1.1251, + "step": 589 + }, + { + "epoch": 0.042624668123611535, + "grad_norm": 10.488261609618482, + "learning_rate": 3.5499398315282795e-06, + "loss": 1.0829, + "step": 590 + }, + { + "epoch": 0.04269691332382105, + "grad_norm": 8.272417072192903, + "learning_rate": 3.5559566787003613e-06, + "loss": 1.0034, + "step": 591 + }, + { + "epoch": 0.04276915852403056, + "grad_norm": 7.179006317998835, + "learning_rate": 3.561973525872443e-06, + "loss": 1.0391, + "step": 592 + }, + { + "epoch": 0.04284140372424007, + "grad_norm": 8.11638478044855, + "learning_rate": 3.567990373044525e-06, + "loss": 1.0203, + "step": 593 + }, + { + "epoch": 0.042913648924449585, + "grad_norm": 10.154309084371867, + "learning_rate": 3.574007220216607e-06, + "loss": 1.066, + "step": 594 + }, + { + "epoch": 0.04298589412465909, + "grad_norm": 10.090871678063522, + "learning_rate": 3.580024067388689e-06, + "loss": 1.0613, + "step": 595 + }, + { + "epoch": 0.043058139324868606, + "grad_norm": 13.232484000084792, + "learning_rate": 3.5860409145607706e-06, + "loss": 1.027, + "step": 596 + }, + { + "epoch": 0.043130384525078114, + "grad_norm": 9.74692115100087, + "learning_rate": 3.5920577617328523e-06, + "loss": 1.0504, + "step": 597 + }, + { + "epoch": 0.04320262972528763, + "grad_norm": 11.077087784219138, + "learning_rate": 3.598074608904934e-06, + "loss": 1.1067, + "step": 598 + }, + { + "epoch": 0.043274874925497135, + "grad_norm": 11.64737639784707, + "learning_rate": 3.604091456077016e-06, + "loss": 0.95, + "step": 599 + }, + { + "epoch": 0.04334712012570665, + "grad_norm": 8.835397826965245, + "learning_rate": 3.610108303249098e-06, + "loss": 1.0275, + "step": 600 + }, + { + "epoch": 0.04341936532591616, + "grad_norm": 8.43253434609352, + "learning_rate": 3.61612515042118e-06, + "loss": 0.9687, + "step": 601 + }, + { + "epoch": 0.04349161052612567, + "grad_norm": 8.676439008721857, + "learning_rate": 3.6221419975932617e-06, + "loss": 1.0203, + "step": 602 + }, + { + "epoch": 0.043563855726335185, + "grad_norm": 9.12145592830186, + "learning_rate": 3.628158844765343e-06, + "loss": 1.0453, + "step": 603 + }, + { + "epoch": 0.04363610092654469, + "grad_norm": 9.0454176287292, + "learning_rate": 3.634175691937425e-06, + "loss": 1.1023, + "step": 604 + }, + { + "epoch": 0.043708346126754206, + "grad_norm": 7.999159053472067, + "learning_rate": 3.6401925391095066e-06, + "loss": 1.023, + "step": 605 + }, + { + "epoch": 0.043780591326963714, + "grad_norm": 10.031564107475106, + "learning_rate": 3.6462093862815888e-06, + "loss": 1.0968, + "step": 606 + }, + { + "epoch": 0.04385283652717323, + "grad_norm": 8.612792290543123, + "learning_rate": 3.6522262334536706e-06, + "loss": 0.9911, + "step": 607 + }, + { + "epoch": 0.043925081727382735, + "grad_norm": 9.522545013040927, + "learning_rate": 3.6582430806257523e-06, + "loss": 1.0611, + "step": 608 + }, + { + "epoch": 0.04399732692759225, + "grad_norm": 7.1788121002131, + "learning_rate": 3.664259927797834e-06, + "loss": 1.0506, + "step": 609 + }, + { + "epoch": 0.04406957212780176, + "grad_norm": 9.009161207167862, + "learning_rate": 3.670276774969916e-06, + "loss": 0.9861, + "step": 610 + }, + { + "epoch": 0.04414181732801127, + "grad_norm": 8.659785350932133, + "learning_rate": 3.676293622141998e-06, + "loss": 1.0845, + "step": 611 + }, + { + "epoch": 0.04421406252822078, + "grad_norm": 7.344269949179822, + "learning_rate": 3.68231046931408e-06, + "loss": 0.9944, + "step": 612 + }, + { + "epoch": 0.04428630772843029, + "grad_norm": 8.724648507369706, + "learning_rate": 3.6883273164861617e-06, + "loss": 1.056, + "step": 613 + }, + { + "epoch": 0.044358552928639806, + "grad_norm": 8.504232811253923, + "learning_rate": 3.6943441636582434e-06, + "loss": 1.0937, + "step": 614 + }, + { + "epoch": 0.044430798128849314, + "grad_norm": 8.222775477906385, + "learning_rate": 3.700361010830325e-06, + "loss": 1.0394, + "step": 615 + }, + { + "epoch": 0.04450304332905883, + "grad_norm": 8.32840669595702, + "learning_rate": 3.706377858002407e-06, + "loss": 1.0236, + "step": 616 + }, + { + "epoch": 0.044575288529268335, + "grad_norm": 9.118325915028251, + "learning_rate": 3.712394705174489e-06, + "loss": 1.0857, + "step": 617 + }, + { + "epoch": 0.04464753372947785, + "grad_norm": 7.372740059841404, + "learning_rate": 3.718411552346571e-06, + "loss": 0.9527, + "step": 618 + }, + { + "epoch": 0.04471977892968736, + "grad_norm": 7.552328283787955, + "learning_rate": 3.7244283995186527e-06, + "loss": 0.9574, + "step": 619 + }, + { + "epoch": 0.04479202412989687, + "grad_norm": 8.481147503497048, + "learning_rate": 3.730445246690734e-06, + "loss": 1.0112, + "step": 620 + }, + { + "epoch": 0.04486426933010638, + "grad_norm": 7.733386043938958, + "learning_rate": 3.736462093862816e-06, + "loss": 0.9635, + "step": 621 + }, + { + "epoch": 0.04493651453031589, + "grad_norm": 7.728511923094262, + "learning_rate": 3.742478941034898e-06, + "loss": 1.0985, + "step": 622 + }, + { + "epoch": 0.045008759730525406, + "grad_norm": 8.538161834934453, + "learning_rate": 3.74849578820698e-06, + "loss": 1.0623, + "step": 623 + }, + { + "epoch": 0.045081004930734914, + "grad_norm": 9.040837356810744, + "learning_rate": 3.7545126353790616e-06, + "loss": 1.0252, + "step": 624 + }, + { + "epoch": 0.04515325013094443, + "grad_norm": 7.536187417293589, + "learning_rate": 3.7605294825511434e-06, + "loss": 0.9809, + "step": 625 + }, + { + "epoch": 0.045225495331153935, + "grad_norm": 9.474546364264294, + "learning_rate": 3.766546329723225e-06, + "loss": 1.0552, + "step": 626 + }, + { + "epoch": 0.04529774053136345, + "grad_norm": 8.120282490894857, + "learning_rate": 3.772563176895307e-06, + "loss": 0.9457, + "step": 627 + }, + { + "epoch": 0.04536998573157296, + "grad_norm": 7.870775573206195, + "learning_rate": 3.778580024067389e-06, + "loss": 0.9694, + "step": 628 + }, + { + "epoch": 0.04544223093178247, + "grad_norm": 12.397411051556288, + "learning_rate": 3.784596871239471e-06, + "loss": 1.0433, + "step": 629 + }, + { + "epoch": 0.04551447613199198, + "grad_norm": 9.316041004370751, + "learning_rate": 3.7906137184115527e-06, + "loss": 0.9675, + "step": 630 + }, + { + "epoch": 0.04558672133220149, + "grad_norm": 10.524785171454733, + "learning_rate": 3.7966305655836345e-06, + "loss": 1.0, + "step": 631 + }, + { + "epoch": 0.045658966532411006, + "grad_norm": 10.571250766746454, + "learning_rate": 3.8026474127557163e-06, + "loss": 1.0872, + "step": 632 + }, + { + "epoch": 0.045731211732620514, + "grad_norm": 8.915720191230786, + "learning_rate": 3.808664259927798e-06, + "loss": 1.073, + "step": 633 + }, + { + "epoch": 0.04580345693283003, + "grad_norm": 8.911145173364714, + "learning_rate": 3.8146811070998803e-06, + "loss": 1.0442, + "step": 634 + }, + { + "epoch": 0.045875702133039535, + "grad_norm": 11.350593289232863, + "learning_rate": 3.820697954271962e-06, + "loss": 1.1023, + "step": 635 + }, + { + "epoch": 0.04594794733324905, + "grad_norm": 11.686436569058046, + "learning_rate": 3.826714801444043e-06, + "loss": 1.155, + "step": 636 + }, + { + "epoch": 0.04602019253345856, + "grad_norm": 9.667877603737018, + "learning_rate": 3.832731648616125e-06, + "loss": 1.1304, + "step": 637 + }, + { + "epoch": 0.04609243773366807, + "grad_norm": 7.94553311172028, + "learning_rate": 3.838748495788207e-06, + "loss": 1.0988, + "step": 638 + }, + { + "epoch": 0.04616468293387758, + "grad_norm": 10.215769855245936, + "learning_rate": 3.84476534296029e-06, + "loss": 0.9747, + "step": 639 + }, + { + "epoch": 0.04623692813408709, + "grad_norm": 12.920653943970628, + "learning_rate": 3.850782190132371e-06, + "loss": 1.0739, + "step": 640 + }, + { + "epoch": 0.0463091733342966, + "grad_norm": 11.418520663191979, + "learning_rate": 3.856799037304453e-06, + "loss": 1.0497, + "step": 641 + }, + { + "epoch": 0.046381418534506114, + "grad_norm": 11.39704251659175, + "learning_rate": 3.862815884476535e-06, + "loss": 1.084, + "step": 642 + }, + { + "epoch": 0.04645366373471563, + "grad_norm": 7.935770161684499, + "learning_rate": 3.868832731648616e-06, + "loss": 1.017, + "step": 643 + }, + { + "epoch": 0.046525908934925135, + "grad_norm": 11.6081966248783, + "learning_rate": 3.874849578820698e-06, + "loss": 1.0752, + "step": 644 + }, + { + "epoch": 0.04659815413513465, + "grad_norm": 10.590210089684357, + "learning_rate": 3.88086642599278e-06, + "loss": 1.0977, + "step": 645 + }, + { + "epoch": 0.046670399335344157, + "grad_norm": 8.352572801964037, + "learning_rate": 3.886883273164862e-06, + "loss": 1.0354, + "step": 646 + }, + { + "epoch": 0.04674264453555367, + "grad_norm": 7.378044146717218, + "learning_rate": 3.892900120336944e-06, + "loss": 0.9732, + "step": 647 + }, + { + "epoch": 0.04681488973576318, + "grad_norm": 9.867861714891365, + "learning_rate": 3.898916967509026e-06, + "loss": 1.1215, + "step": 648 + }, + { + "epoch": 0.04688713493597269, + "grad_norm": 8.979486186869256, + "learning_rate": 3.904933814681107e-06, + "loss": 1.0034, + "step": 649 + }, + { + "epoch": 0.0469593801361822, + "grad_norm": 11.300395074200186, + "learning_rate": 3.910950661853189e-06, + "loss": 0.9374, + "step": 650 + }, + { + "epoch": 0.047031625336391714, + "grad_norm": 9.275614559244048, + "learning_rate": 3.916967509025271e-06, + "loss": 1.0823, + "step": 651 + }, + { + "epoch": 0.04710387053660123, + "grad_norm": 8.035515628357288, + "learning_rate": 3.922984356197353e-06, + "loss": 1.0493, + "step": 652 + }, + { + "epoch": 0.047176115736810735, + "grad_norm": 10.184799491695218, + "learning_rate": 3.9290012033694345e-06, + "loss": 1.0815, + "step": 653 + }, + { + "epoch": 0.04724836093702025, + "grad_norm": 11.137502096156684, + "learning_rate": 3.935018050541516e-06, + "loss": 1.1539, + "step": 654 + }, + { + "epoch": 0.047320606137229757, + "grad_norm": 10.596198685127938, + "learning_rate": 3.941034897713598e-06, + "loss": 1.027, + "step": 655 + }, + { + "epoch": 0.04739285133743927, + "grad_norm": 9.829333984537804, + "learning_rate": 3.947051744885681e-06, + "loss": 1.0118, + "step": 656 + }, + { + "epoch": 0.04746509653764878, + "grad_norm": 7.066739860491497, + "learning_rate": 3.9530685920577625e-06, + "loss": 0.9597, + "step": 657 + }, + { + "epoch": 0.04753734173785829, + "grad_norm": 9.281716074945324, + "learning_rate": 3.959085439229844e-06, + "loss": 0.928, + "step": 658 + }, + { + "epoch": 0.0476095869380678, + "grad_norm": 10.62385427244043, + "learning_rate": 3.965102286401926e-06, + "loss": 1.1307, + "step": 659 + }, + { + "epoch": 0.047681832138277314, + "grad_norm": 9.647105012999285, + "learning_rate": 3.971119133574007e-06, + "loss": 1.0724, + "step": 660 + }, + { + "epoch": 0.04775407733848682, + "grad_norm": 9.89964252173307, + "learning_rate": 3.9771359807460896e-06, + "loss": 0.9843, + "step": 661 + }, + { + "epoch": 0.047826322538696335, + "grad_norm": 10.13564842615186, + "learning_rate": 3.983152827918171e-06, + "loss": 1.0537, + "step": 662 + }, + { + "epoch": 0.04789856773890585, + "grad_norm": 8.59291011520817, + "learning_rate": 3.989169675090253e-06, + "loss": 1.0089, + "step": 663 + }, + { + "epoch": 0.047970812939115356, + "grad_norm": 8.620632669194732, + "learning_rate": 3.995186522262335e-06, + "loss": 1.0187, + "step": 664 + }, + { + "epoch": 0.04804305813932487, + "grad_norm": 10.517062900855299, + "learning_rate": 4.001203369434417e-06, + "loss": 1.0481, + "step": 665 + }, + { + "epoch": 0.04811530333953438, + "grad_norm": 8.61822863157469, + "learning_rate": 4.0072202166064985e-06, + "loss": 0.9345, + "step": 666 + }, + { + "epoch": 0.04818754853974389, + "grad_norm": 7.318502978057996, + "learning_rate": 4.01323706377858e-06, + "loss": 0.9345, + "step": 667 + }, + { + "epoch": 0.0482597937399534, + "grad_norm": 8.572488346887358, + "learning_rate": 4.019253910950662e-06, + "loss": 0.9736, + "step": 668 + }, + { + "epoch": 0.048332038940162914, + "grad_norm": 9.343124113522393, + "learning_rate": 4.025270758122744e-06, + "loss": 1.0342, + "step": 669 + }, + { + "epoch": 0.04840428414037242, + "grad_norm": 8.825257502357093, + "learning_rate": 4.0312876052948256e-06, + "loss": 1.0257, + "step": 670 + }, + { + "epoch": 0.048476529340581935, + "grad_norm": 8.460411781047233, + "learning_rate": 4.037304452466907e-06, + "loss": 0.9583, + "step": 671 + }, + { + "epoch": 0.04854877454079145, + "grad_norm": 8.341902281805945, + "learning_rate": 4.043321299638989e-06, + "loss": 0.9806, + "step": 672 + }, + { + "epoch": 0.048621019741000956, + "grad_norm": 10.69444290315251, + "learning_rate": 4.049338146811072e-06, + "loss": 1.0581, + "step": 673 + }, + { + "epoch": 0.04869326494121047, + "grad_norm": 6.555182746301002, + "learning_rate": 4.0553549939831535e-06, + "loss": 0.9151, + "step": 674 + }, + { + "epoch": 0.04876551014141998, + "grad_norm": 10.067904519611597, + "learning_rate": 4.061371841155235e-06, + "loss": 0.9953, + "step": 675 + }, + { + "epoch": 0.04883775534162949, + "grad_norm": 8.090316692326013, + "learning_rate": 4.067388688327317e-06, + "loss": 1.0519, + "step": 676 + }, + { + "epoch": 0.048910000541839, + "grad_norm": 8.752238831882531, + "learning_rate": 4.073405535499398e-06, + "loss": 1.0546, + "step": 677 + }, + { + "epoch": 0.048982245742048514, + "grad_norm": 10.187004217709966, + "learning_rate": 4.079422382671481e-06, + "loss": 1.0505, + "step": 678 + }, + { + "epoch": 0.04905449094225802, + "grad_norm": 8.751259849312857, + "learning_rate": 4.0854392298435624e-06, + "loss": 1.0362, + "step": 679 + }, + { + "epoch": 0.049126736142467535, + "grad_norm": 7.489728124188994, + "learning_rate": 4.091456077015644e-06, + "loss": 0.9456, + "step": 680 + }, + { + "epoch": 0.04919898134267705, + "grad_norm": 11.319482829543066, + "learning_rate": 4.097472924187726e-06, + "loss": 1.0564, + "step": 681 + }, + { + "epoch": 0.049271226542886556, + "grad_norm": 8.557561410811184, + "learning_rate": 4.103489771359808e-06, + "loss": 0.939, + "step": 682 + }, + { + "epoch": 0.04934347174309607, + "grad_norm": 9.10430652036699, + "learning_rate": 4.1095066185318895e-06, + "loss": 1.0206, + "step": 683 + }, + { + "epoch": 0.04941571694330558, + "grad_norm": 9.476339490110933, + "learning_rate": 4.115523465703971e-06, + "loss": 1.0651, + "step": 684 + }, + { + "epoch": 0.04948796214351509, + "grad_norm": 9.268610609965236, + "learning_rate": 4.121540312876053e-06, + "loss": 1.0398, + "step": 685 + }, + { + "epoch": 0.0495602073437246, + "grad_norm": 10.377117630271906, + "learning_rate": 4.127557160048135e-06, + "loss": 1.0125, + "step": 686 + }, + { + "epoch": 0.049632452543934114, + "grad_norm": 8.268358553777656, + "learning_rate": 4.133574007220217e-06, + "loss": 1.0555, + "step": 687 + }, + { + "epoch": 0.04970469774414362, + "grad_norm": 7.603793061018621, + "learning_rate": 4.1395908543922984e-06, + "loss": 1.0179, + "step": 688 + }, + { + "epoch": 0.049776942944353135, + "grad_norm": 8.988618223529096, + "learning_rate": 4.145607701564381e-06, + "loss": 0.9787, + "step": 689 + }, + { + "epoch": 0.04984918814456264, + "grad_norm": 8.073609733774651, + "learning_rate": 4.151624548736463e-06, + "loss": 1.0686, + "step": 690 + }, + { + "epoch": 0.049921433344772156, + "grad_norm": 10.265759889596087, + "learning_rate": 4.157641395908545e-06, + "loss": 1.0691, + "step": 691 + }, + { + "epoch": 0.04999367854498167, + "grad_norm": 8.664649826519877, + "learning_rate": 4.163658243080626e-06, + "loss": 1.0221, + "step": 692 + }, + { + "epoch": 0.05006592374519118, + "grad_norm": 9.14329429841572, + "learning_rate": 4.169675090252708e-06, + "loss": 0.9806, + "step": 693 + }, + { + "epoch": 0.05013816894540069, + "grad_norm": 12.533034352192695, + "learning_rate": 4.175691937424789e-06, + "loss": 1.1317, + "step": 694 + }, + { + "epoch": 0.0502104141456102, + "grad_norm": 9.170971177651175, + "learning_rate": 4.181708784596872e-06, + "loss": 1.0516, + "step": 695 + }, + { + "epoch": 0.050282659345819714, + "grad_norm": 10.558912401545703, + "learning_rate": 4.1877256317689535e-06, + "loss": 0.9761, + "step": 696 + }, + { + "epoch": 0.05035490454602922, + "grad_norm": 18.30068995107963, + "learning_rate": 4.193742478941035e-06, + "loss": 1.0548, + "step": 697 + }, + { + "epoch": 0.050427149746238735, + "grad_norm": 10.310398696901226, + "learning_rate": 4.199759326113117e-06, + "loss": 1.1167, + "step": 698 + }, + { + "epoch": 0.05049939494644824, + "grad_norm": 10.680904026131389, + "learning_rate": 4.205776173285199e-06, + "loss": 0.9672, + "step": 699 + }, + { + "epoch": 0.050571640146657756, + "grad_norm": 17.033068638565798, + "learning_rate": 4.211793020457281e-06, + "loss": 1.1109, + "step": 700 + }, + { + "epoch": 0.05064388534686727, + "grad_norm": 11.808606025671754, + "learning_rate": 4.217809867629362e-06, + "loss": 1.0477, + "step": 701 + }, + { + "epoch": 0.05071613054707678, + "grad_norm": 9.057366541764498, + "learning_rate": 4.223826714801444e-06, + "loss": 0.9823, + "step": 702 + }, + { + "epoch": 0.05078837574728629, + "grad_norm": 11.42381526889557, + "learning_rate": 4.229843561973526e-06, + "loss": 1.0306, + "step": 703 + }, + { + "epoch": 0.0508606209474958, + "grad_norm": 9.695814932715695, + "learning_rate": 4.235860409145608e-06, + "loss": 1.0017, + "step": 704 + }, + { + "epoch": 0.050932866147705314, + "grad_norm": 9.89725699908601, + "learning_rate": 4.2418772563176895e-06, + "loss": 1.071, + "step": 705 + }, + { + "epoch": 0.05100511134791482, + "grad_norm": 8.384669856269406, + "learning_rate": 4.247894103489772e-06, + "loss": 1.0553, + "step": 706 + }, + { + "epoch": 0.051077356548124335, + "grad_norm": 9.038569563049993, + "learning_rate": 4.253910950661854e-06, + "loss": 1.0472, + "step": 707 + }, + { + "epoch": 0.05114960174833384, + "grad_norm": 8.653222832035436, + "learning_rate": 4.259927797833936e-06, + "loss": 1.0746, + "step": 708 + }, + { + "epoch": 0.051221846948543356, + "grad_norm": 10.650640448713563, + "learning_rate": 4.2659446450060175e-06, + "loss": 1.0389, + "step": 709 + }, + { + "epoch": 0.05129409214875287, + "grad_norm": 8.656856364141374, + "learning_rate": 4.271961492178099e-06, + "loss": 1.0477, + "step": 710 + }, + { + "epoch": 0.05136633734896238, + "grad_norm": 14.016353320332312, + "learning_rate": 4.27797833935018e-06, + "loss": 1.0607, + "step": 711 + }, + { + "epoch": 0.05143858254917189, + "grad_norm": 9.354404052803442, + "learning_rate": 4.283995186522263e-06, + "loss": 0.9832, + "step": 712 + }, + { + "epoch": 0.0515108277493814, + "grad_norm": 9.322096827313718, + "learning_rate": 4.290012033694345e-06, + "loss": 1.0535, + "step": 713 + }, + { + "epoch": 0.051583072949590913, + "grad_norm": 10.596536365566692, + "learning_rate": 4.296028880866426e-06, + "loss": 1.0285, + "step": 714 + }, + { + "epoch": 0.05165531814980042, + "grad_norm": 8.123883918988454, + "learning_rate": 4.302045728038508e-06, + "loss": 0.983, + "step": 715 + }, + { + "epoch": 0.051727563350009935, + "grad_norm": 8.730024806945442, + "learning_rate": 4.30806257521059e-06, + "loss": 1.0249, + "step": 716 + }, + { + "epoch": 0.05179980855021944, + "grad_norm": 9.186812809786144, + "learning_rate": 4.314079422382672e-06, + "loss": 1.0687, + "step": 717 + }, + { + "epoch": 0.051872053750428956, + "grad_norm": 7.7636981622982635, + "learning_rate": 4.3200962695547535e-06, + "loss": 0.9607, + "step": 718 + }, + { + "epoch": 0.051944298950638464, + "grad_norm": 10.352491114008503, + "learning_rate": 4.326113116726835e-06, + "loss": 1.1309, + "step": 719 + }, + { + "epoch": 0.05201654415084798, + "grad_norm": 12.768190459970054, + "learning_rate": 4.332129963898917e-06, + "loss": 1.0337, + "step": 720 + }, + { + "epoch": 0.05208878935105749, + "grad_norm": 9.804038511887748, + "learning_rate": 4.338146811070999e-06, + "loss": 0.9905, + "step": 721 + }, + { + "epoch": 0.052161034551267, + "grad_norm": 9.605447701659534, + "learning_rate": 4.344163658243081e-06, + "loss": 0.9696, + "step": 722 + }, + { + "epoch": 0.052233279751476513, + "grad_norm": 8.46824774009344, + "learning_rate": 4.350180505415163e-06, + "loss": 1.0271, + "step": 723 + }, + { + "epoch": 0.05230552495168602, + "grad_norm": 13.514060151462393, + "learning_rate": 4.356197352587245e-06, + "loss": 1.0756, + "step": 724 + }, + { + "epoch": 0.052377770151895535, + "grad_norm": 8.83597290139902, + "learning_rate": 4.362214199759327e-06, + "loss": 1.1201, + "step": 725 + }, + { + "epoch": 0.05245001535210504, + "grad_norm": 9.391846842308226, + "learning_rate": 4.3682310469314086e-06, + "loss": 0.9494, + "step": 726 + }, + { + "epoch": 0.052522260552314556, + "grad_norm": 8.72173824168915, + "learning_rate": 4.37424789410349e-06, + "loss": 1.0673, + "step": 727 + }, + { + "epoch": 0.052594505752524064, + "grad_norm": 12.666775619305955, + "learning_rate": 4.380264741275572e-06, + "loss": 0.9918, + "step": 728 + }, + { + "epoch": 0.05266675095273358, + "grad_norm": 8.8445785727413, + "learning_rate": 4.386281588447654e-06, + "loss": 1.0334, + "step": 729 + }, + { + "epoch": 0.05273899615294309, + "grad_norm": 8.323701352849332, + "learning_rate": 4.392298435619736e-06, + "loss": 1.0555, + "step": 730 + }, + { + "epoch": 0.0528112413531526, + "grad_norm": 9.019573120707275, + "learning_rate": 4.3983152827918175e-06, + "loss": 0.9901, + "step": 731 + }, + { + "epoch": 0.05288348655336211, + "grad_norm": 9.345970680927156, + "learning_rate": 4.404332129963899e-06, + "loss": 1.0324, + "step": 732 + }, + { + "epoch": 0.05295573175357162, + "grad_norm": 7.162694702697633, + "learning_rate": 4.410348977135981e-06, + "loss": 0.9915, + "step": 733 + }, + { + "epoch": 0.053027976953781135, + "grad_norm": 7.914300611854764, + "learning_rate": 4.416365824308063e-06, + "loss": 1.0857, + "step": 734 + }, + { + "epoch": 0.05310022215399064, + "grad_norm": 7.523730057428349, + "learning_rate": 4.422382671480145e-06, + "loss": 0.9789, + "step": 735 + }, + { + "epoch": 0.053172467354200156, + "grad_norm": 12.149142111800915, + "learning_rate": 4.428399518652226e-06, + "loss": 1.0978, + "step": 736 + }, + { + "epoch": 0.053244712554409664, + "grad_norm": 8.104935976466265, + "learning_rate": 4.434416365824308e-06, + "loss": 1.0795, + "step": 737 + }, + { + "epoch": 0.05331695775461918, + "grad_norm": 7.664666730450006, + "learning_rate": 4.44043321299639e-06, + "loss": 0.9951, + "step": 738 + }, + { + "epoch": 0.05338920295482869, + "grad_norm": 7.917474672007784, + "learning_rate": 4.4464500601684725e-06, + "loss": 0.947, + "step": 739 + }, + { + "epoch": 0.0534614481550382, + "grad_norm": 9.76549882730992, + "learning_rate": 4.452466907340554e-06, + "loss": 1.086, + "step": 740 + }, + { + "epoch": 0.05353369335524771, + "grad_norm": 10.1628137594025, + "learning_rate": 4.458483754512636e-06, + "loss": 1.089, + "step": 741 + }, + { + "epoch": 0.05360593855545722, + "grad_norm": 8.146132936033112, + "learning_rate": 4.464500601684718e-06, + "loss": 1.0311, + "step": 742 + }, + { + "epoch": 0.053678183755666735, + "grad_norm": 8.603981391691535, + "learning_rate": 4.4705174488568e-06, + "loss": 1.1101, + "step": 743 + }, + { + "epoch": 0.05375042895587624, + "grad_norm": 9.636689061187546, + "learning_rate": 4.4765342960288814e-06, + "loss": 1.012, + "step": 744 + }, + { + "epoch": 0.053822674156085756, + "grad_norm": 7.843362623884979, + "learning_rate": 4.482551143200963e-06, + "loss": 1.0967, + "step": 745 + }, + { + "epoch": 0.053894919356295264, + "grad_norm": 7.373402826672686, + "learning_rate": 4.488567990373045e-06, + "loss": 0.9351, + "step": 746 + }, + { + "epoch": 0.05396716455650478, + "grad_norm": 9.904050564810946, + "learning_rate": 4.494584837545127e-06, + "loss": 1.0814, + "step": 747 + }, + { + "epoch": 0.054039409756714285, + "grad_norm": 8.581422710417153, + "learning_rate": 4.5006016847172085e-06, + "loss": 0.9902, + "step": 748 + }, + { + "epoch": 0.0541116549569238, + "grad_norm": 8.557377751931828, + "learning_rate": 4.50661853188929e-06, + "loss": 0.9985, + "step": 749 + }, + { + "epoch": 0.05418390015713331, + "grad_norm": 7.242884597231849, + "learning_rate": 4.512635379061372e-06, + "loss": 0.9048, + "step": 750 + }, + { + "epoch": 0.05425614535734282, + "grad_norm": 7.61640921504712, + "learning_rate": 4.518652226233454e-06, + "loss": 1.1279, + "step": 751 + }, + { + "epoch": 0.054328390557552335, + "grad_norm": 7.8026801077587224, + "learning_rate": 4.524669073405536e-06, + "loss": 0.9339, + "step": 752 + }, + { + "epoch": 0.05440063575776184, + "grad_norm": 8.192818581303527, + "learning_rate": 4.5306859205776174e-06, + "loss": 1.0654, + "step": 753 + }, + { + "epoch": 0.054472880957971356, + "grad_norm": 8.035804258781395, + "learning_rate": 4.536702767749699e-06, + "loss": 0.9956, + "step": 754 + }, + { + "epoch": 0.054545126158180864, + "grad_norm": 10.417067578547965, + "learning_rate": 4.542719614921781e-06, + "loss": 1.0459, + "step": 755 + }, + { + "epoch": 0.05461737135839038, + "grad_norm": 8.39069819640443, + "learning_rate": 4.548736462093864e-06, + "loss": 1.0279, + "step": 756 + }, + { + "epoch": 0.054689616558599885, + "grad_norm": 9.670424640724605, + "learning_rate": 4.554753309265945e-06, + "loss": 1.0886, + "step": 757 + }, + { + "epoch": 0.0547618617588094, + "grad_norm": 7.956883110274173, + "learning_rate": 4.560770156438027e-06, + "loss": 0.9665, + "step": 758 + }, + { + "epoch": 0.05483410695901891, + "grad_norm": 7.88150603372345, + "learning_rate": 4.566787003610109e-06, + "loss": 1.0568, + "step": 759 + }, + { + "epoch": 0.05490635215922842, + "grad_norm": 8.384917806314641, + "learning_rate": 4.572803850782191e-06, + "loss": 1.0313, + "step": 760 + }, + { + "epoch": 0.054978597359437935, + "grad_norm": 9.944393046534557, + "learning_rate": 4.578820697954272e-06, + "loss": 0.9444, + "step": 761 + }, + { + "epoch": 0.05505084255964744, + "grad_norm": 8.834773060291672, + "learning_rate": 4.584837545126354e-06, + "loss": 0.9721, + "step": 762 + }, + { + "epoch": 0.055123087759856956, + "grad_norm": 8.06190997189993, + "learning_rate": 4.590854392298436e-06, + "loss": 0.9341, + "step": 763 + }, + { + "epoch": 0.055195332960066464, + "grad_norm": 9.465605865287898, + "learning_rate": 4.596871239470518e-06, + "loss": 1.093, + "step": 764 + }, + { + "epoch": 0.05526757816027598, + "grad_norm": 8.302863249926055, + "learning_rate": 4.6028880866426e-06, + "loss": 1.0282, + "step": 765 + }, + { + "epoch": 0.055339823360485485, + "grad_norm": 9.383668274488636, + "learning_rate": 4.608904933814681e-06, + "loss": 0.9146, + "step": 766 + }, + { + "epoch": 0.055412068560695, + "grad_norm": 8.615425434644646, + "learning_rate": 4.614921780986763e-06, + "loss": 1.0583, + "step": 767 + }, + { + "epoch": 0.055484313760904506, + "grad_norm": 9.367758840869229, + "learning_rate": 4.620938628158845e-06, + "loss": 0.9924, + "step": 768 + }, + { + "epoch": 0.05555655896111402, + "grad_norm": 8.10584007056096, + "learning_rate": 4.626955475330927e-06, + "loss": 0.9683, + "step": 769 + }, + { + "epoch": 0.055628804161323535, + "grad_norm": 8.034202418927517, + "learning_rate": 4.6329723225030085e-06, + "loss": 1.0607, + "step": 770 + }, + { + "epoch": 0.05570104936153304, + "grad_norm": 8.792364397128598, + "learning_rate": 4.63898916967509e-06, + "loss": 1.0136, + "step": 771 + }, + { + "epoch": 0.055773294561742556, + "grad_norm": 10.012163013211731, + "learning_rate": 4.645006016847172e-06, + "loss": 0.9864, + "step": 772 + }, + { + "epoch": 0.055845539761952064, + "grad_norm": 6.289747876085858, + "learning_rate": 4.651022864019255e-06, + "loss": 1.0429, + "step": 773 + }, + { + "epoch": 0.05591778496216158, + "grad_norm": 6.846085937941309, + "learning_rate": 4.6570397111913365e-06, + "loss": 1.0789, + "step": 774 + }, + { + "epoch": 0.055990030162371085, + "grad_norm": 9.248106891634718, + "learning_rate": 4.663056558363418e-06, + "loss": 1.0348, + "step": 775 + }, + { + "epoch": 0.0560622753625806, + "grad_norm": 8.56508447109953, + "learning_rate": 4.6690734055355e-06, + "loss": 1.0918, + "step": 776 + }, + { + "epoch": 0.056134520562790106, + "grad_norm": 8.079466481488012, + "learning_rate": 4.675090252707582e-06, + "loss": 1.0892, + "step": 777 + }, + { + "epoch": 0.05620676576299962, + "grad_norm": 8.794290979550757, + "learning_rate": 4.681107099879663e-06, + "loss": 1.0612, + "step": 778 + }, + { + "epoch": 0.056279010963209135, + "grad_norm": 8.284525903502312, + "learning_rate": 4.687123947051745e-06, + "loss": 1.0156, + "step": 779 + }, + { + "epoch": 0.05635125616341864, + "grad_norm": 9.529624925499594, + "learning_rate": 4.693140794223827e-06, + "loss": 1.0012, + "step": 780 + }, + { + "epoch": 0.056423501363628156, + "grad_norm": 7.985110971232673, + "learning_rate": 4.699157641395909e-06, + "loss": 0.9738, + "step": 781 + }, + { + "epoch": 0.056495746563837664, + "grad_norm": 9.213232425886066, + "learning_rate": 4.705174488567991e-06, + "loss": 0.9615, + "step": 782 + }, + { + "epoch": 0.05656799176404718, + "grad_norm": 7.61476148265618, + "learning_rate": 4.7111913357400725e-06, + "loss": 1.1128, + "step": 783 + }, + { + "epoch": 0.056640236964256685, + "grad_norm": 9.783868896105052, + "learning_rate": 4.717208182912154e-06, + "loss": 1.1889, + "step": 784 + }, + { + "epoch": 0.0567124821644662, + "grad_norm": 6.837120154393605, + "learning_rate": 4.723225030084236e-06, + "loss": 0.9801, + "step": 785 + }, + { + "epoch": 0.056784727364675706, + "grad_norm": 7.592437253911938, + "learning_rate": 4.729241877256318e-06, + "loss": 1.0858, + "step": 786 + }, + { + "epoch": 0.05685697256488522, + "grad_norm": 9.652594704725326, + "learning_rate": 4.7352587244284e-06, + "loss": 1.0016, + "step": 787 + }, + { + "epoch": 0.056929217765094735, + "grad_norm": 8.70441310808215, + "learning_rate": 4.741275571600481e-06, + "loss": 1.0535, + "step": 788 + }, + { + "epoch": 0.05700146296530424, + "grad_norm": 7.119597679086969, + "learning_rate": 4.747292418772563e-06, + "loss": 0.946, + "step": 789 + }, + { + "epoch": 0.057073708165513756, + "grad_norm": 10.529328213353011, + "learning_rate": 4.753309265944646e-06, + "loss": 1.0428, + "step": 790 + }, + { + "epoch": 0.057145953365723264, + "grad_norm": 9.155871699202956, + "learning_rate": 4.7593261131167276e-06, + "loss": 1.1032, + "step": 791 + }, + { + "epoch": 0.05721819856593278, + "grad_norm": 7.266973161212394, + "learning_rate": 4.765342960288809e-06, + "loss": 1.0256, + "step": 792 + }, + { + "epoch": 0.057290443766142285, + "grad_norm": 9.224638115625881, + "learning_rate": 4.771359807460891e-06, + "loss": 1.0262, + "step": 793 + }, + { + "epoch": 0.0573626889663518, + "grad_norm": 8.7826213376779, + "learning_rate": 4.777376654632973e-06, + "loss": 1.1238, + "step": 794 + }, + { + "epoch": 0.057434934166561306, + "grad_norm": 6.738954619887011, + "learning_rate": 4.783393501805055e-06, + "loss": 0.9191, + "step": 795 + }, + { + "epoch": 0.05750717936677082, + "grad_norm": 9.934591383495913, + "learning_rate": 4.7894103489771365e-06, + "loss": 1.0798, + "step": 796 + }, + { + "epoch": 0.05757942456698033, + "grad_norm": 9.931799447169132, + "learning_rate": 4.795427196149218e-06, + "loss": 0.9937, + "step": 797 + }, + { + "epoch": 0.05765166976718984, + "grad_norm": 9.732978242989944, + "learning_rate": 4.8014440433213e-06, + "loss": 1.1232, + "step": 798 + }, + { + "epoch": 0.057723914967399356, + "grad_norm": 7.319465384633341, + "learning_rate": 4.807460890493382e-06, + "loss": 1.0508, + "step": 799 + }, + { + "epoch": 0.057796160167608863, + "grad_norm": 8.430251787618014, + "learning_rate": 4.813477737665464e-06, + "loss": 1.0848, + "step": 800 + }, + { + "epoch": 0.05786840536781838, + "grad_norm": 9.230553057168944, + "learning_rate": 4.819494584837545e-06, + "loss": 1.0206, + "step": 801 + }, + { + "epoch": 0.057940650568027885, + "grad_norm": 9.489118668659916, + "learning_rate": 4.825511432009627e-06, + "loss": 0.9759, + "step": 802 + }, + { + "epoch": 0.0580128957682374, + "grad_norm": 7.600672702129881, + "learning_rate": 4.831528279181709e-06, + "loss": 0.9895, + "step": 803 + }, + { + "epoch": 0.058085140968446906, + "grad_norm": 7.38613878027911, + "learning_rate": 4.837545126353791e-06, + "loss": 0.9876, + "step": 804 + }, + { + "epoch": 0.05815738616865642, + "grad_norm": 7.417195790593376, + "learning_rate": 4.8435619735258725e-06, + "loss": 0.9801, + "step": 805 + }, + { + "epoch": 0.05822963136886593, + "grad_norm": 9.911650428106531, + "learning_rate": 4.849578820697955e-06, + "loss": 1.0259, + "step": 806 + }, + { + "epoch": 0.05830187656907544, + "grad_norm": 7.952223211398574, + "learning_rate": 4.855595667870037e-06, + "loss": 0.9565, + "step": 807 + }, + { + "epoch": 0.058374121769284956, + "grad_norm": 8.41290768319938, + "learning_rate": 4.861612515042119e-06, + "loss": 1.0988, + "step": 808 + }, + { + "epoch": 0.058446366969494463, + "grad_norm": 7.655991904131875, + "learning_rate": 4.8676293622142004e-06, + "loss": 0.9828, + "step": 809 + }, + { + "epoch": 0.05851861216970398, + "grad_norm": 8.30525982123631, + "learning_rate": 4.873646209386282e-06, + "loss": 1.0781, + "step": 810 + }, + { + "epoch": 0.058590857369913485, + "grad_norm": 9.406545235192288, + "learning_rate": 4.879663056558364e-06, + "loss": 1.1418, + "step": 811 + }, + { + "epoch": 0.058663102570123, + "grad_norm": 9.137862663534465, + "learning_rate": 4.885679903730446e-06, + "loss": 1.1007, + "step": 812 + }, + { + "epoch": 0.058735347770332506, + "grad_norm": 7.419520339885498, + "learning_rate": 4.8916967509025275e-06, + "loss": 1.1, + "step": 813 + }, + { + "epoch": 0.05880759297054202, + "grad_norm": 11.07541536193804, + "learning_rate": 4.897713598074609e-06, + "loss": 1.0524, + "step": 814 + }, + { + "epoch": 0.05887983817075153, + "grad_norm": 7.934490684696599, + "learning_rate": 4.903730445246691e-06, + "loss": 0.973, + "step": 815 + }, + { + "epoch": 0.05895208337096104, + "grad_norm": 8.844085147829071, + "learning_rate": 4.909747292418773e-06, + "loss": 1.0354, + "step": 816 + }, + { + "epoch": 0.059024328571170556, + "grad_norm": 8.15805419610593, + "learning_rate": 4.915764139590855e-06, + "loss": 0.9903, + "step": 817 + }, + { + "epoch": 0.05909657377138006, + "grad_norm": 8.858260580812585, + "learning_rate": 4.9217809867629364e-06, + "loss": 0.9559, + "step": 818 + }, + { + "epoch": 0.05916881897158958, + "grad_norm": 9.136047782251898, + "learning_rate": 4.927797833935018e-06, + "loss": 1.0515, + "step": 819 + }, + { + "epoch": 0.059241064171799085, + "grad_norm": 8.056687263122587, + "learning_rate": 4.9338146811071e-06, + "loss": 1.0371, + "step": 820 + }, + { + "epoch": 0.0593133093720086, + "grad_norm": 7.852273755097946, + "learning_rate": 4.939831528279182e-06, + "loss": 0.9309, + "step": 821 + }, + { + "epoch": 0.059385554572218106, + "grad_norm": 7.340701505311865, + "learning_rate": 4.9458483754512636e-06, + "loss": 1.0132, + "step": 822 + }, + { + "epoch": 0.05945779977242762, + "grad_norm": 8.835611109687921, + "learning_rate": 4.951865222623346e-06, + "loss": 1.0406, + "step": 823 + }, + { + "epoch": 0.05953004497263713, + "grad_norm": 9.530680855189548, + "learning_rate": 4.957882069795428e-06, + "loss": 0.8972, + "step": 824 + }, + { + "epoch": 0.05960229017284664, + "grad_norm": 9.131000374914004, + "learning_rate": 4.96389891696751e-06, + "loss": 1.0486, + "step": 825 + }, + { + "epoch": 0.05967453537305615, + "grad_norm": 10.041179555472274, + "learning_rate": 4.9699157641395915e-06, + "loss": 1.0356, + "step": 826 + }, + { + "epoch": 0.05974678057326566, + "grad_norm": 10.84899669877898, + "learning_rate": 4.975932611311673e-06, + "loss": 1.0789, + "step": 827 + }, + { + "epoch": 0.05981902577347518, + "grad_norm": 9.552454639957077, + "learning_rate": 4.981949458483755e-06, + "loss": 1.0654, + "step": 828 + }, + { + "epoch": 0.059891270973684685, + "grad_norm": 9.346932267983203, + "learning_rate": 4.987966305655837e-06, + "loss": 0.9542, + "step": 829 + }, + { + "epoch": 0.0599635161738942, + "grad_norm": 8.001770777227318, + "learning_rate": 4.993983152827919e-06, + "loss": 0.9286, + "step": 830 + }, + { + "epoch": 0.060035761374103706, + "grad_norm": 8.383849976573263, + "learning_rate": 5e-06, + "loss": 1.0398, + "step": 831 + }, + { + "epoch": 0.06010800657431322, + "grad_norm": 9.356566350018829, + "learning_rate": 4.999999982888471e-06, + "loss": 1.0202, + "step": 832 + }, + { + "epoch": 0.06018025177452273, + "grad_norm": 8.695614325399543, + "learning_rate": 4.999999931553883e-06, + "loss": 1.1094, + "step": 833 + }, + { + "epoch": 0.06025249697473224, + "grad_norm": 7.5701159755838, + "learning_rate": 4.999999845996237e-06, + "loss": 1.0123, + "step": 834 + }, + { + "epoch": 0.06032474217494175, + "grad_norm": 7.429380456654953, + "learning_rate": 4.999999726215534e-06, + "loss": 1.0539, + "step": 835 + }, + { + "epoch": 0.06039698737515126, + "grad_norm": 10.391040214330799, + "learning_rate": 4.999999572211776e-06, + "loss": 1.0087, + "step": 836 + }, + { + "epoch": 0.06046923257536078, + "grad_norm": 8.873596510257578, + "learning_rate": 4.999999383984965e-06, + "loss": 1.0038, + "step": 837 + }, + { + "epoch": 0.060541477775570285, + "grad_norm": 6.673910719327221, + "learning_rate": 4.999999161535104e-06, + "loss": 0.9166, + "step": 838 + }, + { + "epoch": 0.0606137229757798, + "grad_norm": 8.057874668747715, + "learning_rate": 4.999998904862195e-06, + "loss": 0.9842, + "step": 839 + }, + { + "epoch": 0.060685968175989306, + "grad_norm": 9.993227763603405, + "learning_rate": 4.999998613966243e-06, + "loss": 1.0604, + "step": 840 + }, + { + "epoch": 0.06075821337619882, + "grad_norm": 10.262995599449335, + "learning_rate": 4.99999828884725e-06, + "loss": 1.1527, + "step": 841 + }, + { + "epoch": 0.06083045857640833, + "grad_norm": 8.512931859829354, + "learning_rate": 4.999997929505222e-06, + "loss": 1.0186, + "step": 842 + }, + { + "epoch": 0.06090270377661784, + "grad_norm": 7.9787178676937724, + "learning_rate": 4.999997535940163e-06, + "loss": 1.0261, + "step": 843 + }, + { + "epoch": 0.06097494897682735, + "grad_norm": 9.634398392492077, + "learning_rate": 4.999997108152079e-06, + "loss": 1.0695, + "step": 844 + }, + { + "epoch": 0.06104719417703686, + "grad_norm": 9.71772942290115, + "learning_rate": 4.999996646140976e-06, + "loss": 1.0477, + "step": 845 + }, + { + "epoch": 0.06111943937724638, + "grad_norm": 10.38466772952024, + "learning_rate": 4.9999961499068605e-06, + "loss": 1.0145, + "step": 846 + }, + { + "epoch": 0.061191684577455885, + "grad_norm": 8.186069043160737, + "learning_rate": 4.999995619449739e-06, + "loss": 1.0377, + "step": 847 + }, + { + "epoch": 0.0612639297776654, + "grad_norm": 11.46558172135127, + "learning_rate": 4.999995054769617e-06, + "loss": 1.1109, + "step": 848 + }, + { + "epoch": 0.061336174977874906, + "grad_norm": 7.605505364735952, + "learning_rate": 4.999994455866506e-06, + "loss": 0.9259, + "step": 849 + }, + { + "epoch": 0.06140842017808442, + "grad_norm": 10.440657269480212, + "learning_rate": 4.9999938227404095e-06, + "loss": 0.9584, + "step": 850 + }, + { + "epoch": 0.06148066537829393, + "grad_norm": 11.054715105972598, + "learning_rate": 4.9999931553913405e-06, + "loss": 1.0384, + "step": 851 + }, + { + "epoch": 0.06155291057850344, + "grad_norm": 7.986856153444878, + "learning_rate": 4.999992453819306e-06, + "loss": 1.0237, + "step": 852 + }, + { + "epoch": 0.06162515577871295, + "grad_norm": 9.974267658320796, + "learning_rate": 4.999991718024316e-06, + "loss": 1.0684, + "step": 853 + }, + { + "epoch": 0.06169740097892246, + "grad_norm": 8.93489047281669, + "learning_rate": 4.99999094800638e-06, + "loss": 1.0416, + "step": 854 + }, + { + "epoch": 0.06176964617913197, + "grad_norm": 9.728469551374035, + "learning_rate": 4.999990143765509e-06, + "loss": 0.9856, + "step": 855 + }, + { + "epoch": 0.061841891379341485, + "grad_norm": 7.297362742163957, + "learning_rate": 4.999989305301715e-06, + "loss": 1.0276, + "step": 856 + }, + { + "epoch": 0.061914136579551, + "grad_norm": 9.242987784323441, + "learning_rate": 4.999988432615008e-06, + "loss": 1.0953, + "step": 857 + }, + { + "epoch": 0.061986381779760506, + "grad_norm": 7.843438738844079, + "learning_rate": 4.9999875257054e-06, + "loss": 1.0238, + "step": 858 + }, + { + "epoch": 0.06205862697997002, + "grad_norm": 7.737683014561781, + "learning_rate": 4.999986584572904e-06, + "loss": 1.0412, + "step": 859 + }, + { + "epoch": 0.06213087218017953, + "grad_norm": 6.564655930741052, + "learning_rate": 4.9999856092175335e-06, + "loss": 0.9772, + "step": 860 + }, + { + "epoch": 0.06220311738038904, + "grad_norm": 9.745523452126983, + "learning_rate": 4.999984599639301e-06, + "loss": 1.1154, + "step": 861 + }, + { + "epoch": 0.06227536258059855, + "grad_norm": 8.975602884684386, + "learning_rate": 4.999983555838219e-06, + "loss": 1.0276, + "step": 862 + }, + { + "epoch": 0.06234760778080806, + "grad_norm": 7.781949544201382, + "learning_rate": 4.999982477814305e-06, + "loss": 1.0596, + "step": 863 + }, + { + "epoch": 0.06241985298101757, + "grad_norm": 9.069471321890475, + "learning_rate": 4.999981365567571e-06, + "loss": 0.9431, + "step": 864 + }, + { + "epoch": 0.062492098181227085, + "grad_norm": 7.395780092146245, + "learning_rate": 4.999980219098034e-06, + "loss": 1.0289, + "step": 865 + }, + { + "epoch": 0.0625643433814366, + "grad_norm": 8.524710695610008, + "learning_rate": 4.9999790384057076e-06, + "loss": 1.0276, + "step": 866 + }, + { + "epoch": 0.06263658858164611, + "grad_norm": 11.062546896969804, + "learning_rate": 4.99997782349061e-06, + "loss": 1.0354, + "step": 867 + }, + { + "epoch": 0.06270883378185561, + "grad_norm": 8.030277178435375, + "learning_rate": 4.999976574352757e-06, + "loss": 1.0569, + "step": 868 + }, + { + "epoch": 0.06278107898206513, + "grad_norm": 7.404003720126462, + "learning_rate": 4.9999752909921665e-06, + "loss": 1.0232, + "step": 869 + }, + { + "epoch": 0.06285332418227464, + "grad_norm": 7.328006832902546, + "learning_rate": 4.999973973408855e-06, + "loss": 0.8951, + "step": 870 + }, + { + "epoch": 0.06292556938248416, + "grad_norm": 10.549071313561837, + "learning_rate": 4.999972621602841e-06, + "loss": 1.054, + "step": 871 + }, + { + "epoch": 0.06299781458269366, + "grad_norm": 9.013728373671793, + "learning_rate": 4.999971235574142e-06, + "loss": 1.14, + "step": 872 + }, + { + "epoch": 0.06307005978290317, + "grad_norm": 8.549061720377747, + "learning_rate": 4.9999698153227796e-06, + "loss": 1.0556, + "step": 873 + }, + { + "epoch": 0.06314230498311268, + "grad_norm": 7.788930511121541, + "learning_rate": 4.99996836084877e-06, + "loss": 1.0282, + "step": 874 + }, + { + "epoch": 0.0632145501833222, + "grad_norm": 7.3000538863192, + "learning_rate": 4.999966872152135e-06, + "loss": 1.0105, + "step": 875 + }, + { + "epoch": 0.06328679538353171, + "grad_norm": 10.042705139237729, + "learning_rate": 4.999965349232895e-06, + "loss": 1.0716, + "step": 876 + }, + { + "epoch": 0.06335904058374121, + "grad_norm": 8.150983374309, + "learning_rate": 4.999963792091071e-06, + "loss": 1.0397, + "step": 877 + }, + { + "epoch": 0.06343128578395073, + "grad_norm": 7.242641792739681, + "learning_rate": 4.999962200726683e-06, + "loss": 1.051, + "step": 878 + }, + { + "epoch": 0.06350353098416024, + "grad_norm": 9.301658105190675, + "learning_rate": 4.999960575139753e-06, + "loss": 1.0827, + "step": 879 + }, + { + "epoch": 0.06357577618436976, + "grad_norm": 9.172482212119064, + "learning_rate": 4.9999589153303044e-06, + "loss": 1.0484, + "step": 880 + }, + { + "epoch": 0.06364802138457926, + "grad_norm": 11.360718197752728, + "learning_rate": 4.999957221298359e-06, + "loss": 1.0212, + "step": 881 + }, + { + "epoch": 0.06372026658478877, + "grad_norm": 9.504804901898014, + "learning_rate": 4.999955493043941e-06, + "loss": 1.111, + "step": 882 + }, + { + "epoch": 0.06379251178499828, + "grad_norm": 9.679261476071817, + "learning_rate": 4.999953730567073e-06, + "loss": 1.0514, + "step": 883 + }, + { + "epoch": 0.0638647569852078, + "grad_norm": 8.424025789980341, + "learning_rate": 4.999951933867779e-06, + "loss": 1.0785, + "step": 884 + }, + { + "epoch": 0.06393700218541731, + "grad_norm": 9.488774543964642, + "learning_rate": 4.999950102946085e-06, + "loss": 0.9919, + "step": 885 + }, + { + "epoch": 0.06400924738562681, + "grad_norm": 10.413038634144762, + "learning_rate": 4.9999482378020146e-06, + "loss": 1.1535, + "step": 886 + }, + { + "epoch": 0.06408149258583633, + "grad_norm": 7.418302491340411, + "learning_rate": 4.999946338435595e-06, + "loss": 0.968, + "step": 887 + }, + { + "epoch": 0.06415373778604584, + "grad_norm": 6.8370677079088, + "learning_rate": 4.99994440484685e-06, + "loss": 0.9832, + "step": 888 + }, + { + "epoch": 0.06422598298625536, + "grad_norm": 7.108255352682228, + "learning_rate": 4.999942437035807e-06, + "loss": 1.0225, + "step": 889 + }, + { + "epoch": 0.06429822818646486, + "grad_norm": 9.377986178058665, + "learning_rate": 4.999940435002494e-06, + "loss": 1.0764, + "step": 890 + }, + { + "epoch": 0.06437047338667437, + "grad_norm": 10.18950834546327, + "learning_rate": 4.999938398746937e-06, + "loss": 1.0144, + "step": 891 + }, + { + "epoch": 0.06444271858688388, + "grad_norm": 7.932930659912544, + "learning_rate": 4.999936328269165e-06, + "loss": 1.0333, + "step": 892 + }, + { + "epoch": 0.0645149637870934, + "grad_norm": 10.74468472394116, + "learning_rate": 4.999934223569205e-06, + "loss": 1.0492, + "step": 893 + }, + { + "epoch": 0.0645872089873029, + "grad_norm": 7.37181513924933, + "learning_rate": 4.999932084647087e-06, + "loss": 0.9474, + "step": 894 + }, + { + "epoch": 0.06465945418751241, + "grad_norm": 7.410759048195411, + "learning_rate": 4.999929911502839e-06, + "loss": 1.0032, + "step": 895 + }, + { + "epoch": 0.06473169938772193, + "grad_norm": 6.7098335184061435, + "learning_rate": 4.9999277041364925e-06, + "loss": 0.9842, + "step": 896 + }, + { + "epoch": 0.06480394458793144, + "grad_norm": 7.760532973951913, + "learning_rate": 4.9999254625480765e-06, + "loss": 1.0158, + "step": 897 + }, + { + "epoch": 0.06487618978814096, + "grad_norm": 8.460535323428676, + "learning_rate": 4.999923186737622e-06, + "loss": 1.066, + "step": 898 + }, + { + "epoch": 0.06494843498835046, + "grad_norm": 8.289887152664157, + "learning_rate": 4.99992087670516e-06, + "loss": 1.0261, + "step": 899 + }, + { + "epoch": 0.06502068018855997, + "grad_norm": 7.523724480183398, + "learning_rate": 4.999918532450722e-06, + "loss": 0.9096, + "step": 900 + }, + { + "epoch": 0.06509292538876948, + "grad_norm": 7.164294123675224, + "learning_rate": 4.999916153974341e-06, + "loss": 0.9929, + "step": 901 + }, + { + "epoch": 0.065165170588979, + "grad_norm": 7.574298576890454, + "learning_rate": 4.999913741276049e-06, + "loss": 1.0287, + "step": 902 + }, + { + "epoch": 0.0652374157891885, + "grad_norm": 8.570323182536642, + "learning_rate": 4.999911294355878e-06, + "loss": 1.0936, + "step": 903 + }, + { + "epoch": 0.06530966098939801, + "grad_norm": 8.307807687898269, + "learning_rate": 4.999908813213864e-06, + "loss": 1.0067, + "step": 904 + }, + { + "epoch": 0.06538190618960753, + "grad_norm": 8.896167281327577, + "learning_rate": 4.999906297850038e-06, + "loss": 0.9323, + "step": 905 + }, + { + "epoch": 0.06545415138981704, + "grad_norm": 8.070222689783431, + "learning_rate": 4.999903748264437e-06, + "loss": 1.0254, + "step": 906 + }, + { + "epoch": 0.06552639659002656, + "grad_norm": 6.540582587281861, + "learning_rate": 4.9999011644570935e-06, + "loss": 0.925, + "step": 907 + }, + { + "epoch": 0.06559864179023606, + "grad_norm": 9.915589175203168, + "learning_rate": 4.9998985464280445e-06, + "loss": 1.0844, + "step": 908 + }, + { + "epoch": 0.06567088699044557, + "grad_norm": 7.974578521018564, + "learning_rate": 4.999895894177325e-06, + "loss": 0.9083, + "step": 909 + }, + { + "epoch": 0.06574313219065508, + "grad_norm": 8.507272638404192, + "learning_rate": 4.999893207704973e-06, + "loss": 1.0302, + "step": 910 + }, + { + "epoch": 0.0658153773908646, + "grad_norm": 8.456221092834461, + "learning_rate": 4.999890487011023e-06, + "loss": 1.0668, + "step": 911 + }, + { + "epoch": 0.0658876225910741, + "grad_norm": 8.661530904796571, + "learning_rate": 4.999887732095514e-06, + "loss": 0.9864, + "step": 912 + }, + { + "epoch": 0.06595986779128361, + "grad_norm": 7.278353790447794, + "learning_rate": 4.999884942958483e-06, + "loss": 0.9872, + "step": 913 + }, + { + "epoch": 0.06603211299149313, + "grad_norm": 7.371029835854991, + "learning_rate": 4.999882119599967e-06, + "loss": 1.0154, + "step": 914 + }, + { + "epoch": 0.06610435819170264, + "grad_norm": 7.624355851777973, + "learning_rate": 4.999879262020007e-06, + "loss": 1.0608, + "step": 915 + }, + { + "epoch": 0.06617660339191216, + "grad_norm": 10.11638698872114, + "learning_rate": 4.99987637021864e-06, + "loss": 0.9501, + "step": 916 + }, + { + "epoch": 0.06624884859212166, + "grad_norm": 9.02544282371023, + "learning_rate": 4.999873444195908e-06, + "loss": 1.0702, + "step": 917 + }, + { + "epoch": 0.06632109379233117, + "grad_norm": 7.600722890906856, + "learning_rate": 4.999870483951848e-06, + "loss": 1.0862, + "step": 918 + }, + { + "epoch": 0.06639333899254068, + "grad_norm": 10.66570214043302, + "learning_rate": 4.999867489486503e-06, + "loss": 0.8919, + "step": 919 + }, + { + "epoch": 0.0664655841927502, + "grad_norm": 10.827562722143572, + "learning_rate": 4.999864460799912e-06, + "loss": 1.0554, + "step": 920 + }, + { + "epoch": 0.0665378293929597, + "grad_norm": 6.8497385796553685, + "learning_rate": 4.999861397892119e-06, + "loss": 0.9239, + "step": 921 + }, + { + "epoch": 0.06661007459316921, + "grad_norm": 9.821056741388785, + "learning_rate": 4.999858300763164e-06, + "loss": 1.043, + "step": 922 + }, + { + "epoch": 0.06668231979337873, + "grad_norm": 9.443290096372042, + "learning_rate": 4.999855169413089e-06, + "loss": 0.9471, + "step": 923 + }, + { + "epoch": 0.06675456499358824, + "grad_norm": 8.543147806908388, + "learning_rate": 4.999852003841939e-06, + "loss": 1.0847, + "step": 924 + }, + { + "epoch": 0.06682681019379776, + "grad_norm": 8.453149820174975, + "learning_rate": 4.999848804049755e-06, + "loss": 0.967, + "step": 925 + }, + { + "epoch": 0.06689905539400726, + "grad_norm": 7.731832066969407, + "learning_rate": 4.999845570036582e-06, + "loss": 1.026, + "step": 926 + }, + { + "epoch": 0.06697130059421677, + "grad_norm": 7.241076535439316, + "learning_rate": 4.9998423018024655e-06, + "loss": 0.9694, + "step": 927 + }, + { + "epoch": 0.06704354579442628, + "grad_norm": 7.5825891933106675, + "learning_rate": 4.9998389993474475e-06, + "loss": 0.8292, + "step": 928 + }, + { + "epoch": 0.0671157909946358, + "grad_norm": 7.887007263559446, + "learning_rate": 4.999835662671575e-06, + "loss": 0.9163, + "step": 929 + }, + { + "epoch": 0.0671880361948453, + "grad_norm": 8.403517722205498, + "learning_rate": 4.999832291774894e-06, + "loss": 0.9422, + "step": 930 + }, + { + "epoch": 0.06726028139505481, + "grad_norm": 8.520187191764911, + "learning_rate": 4.999828886657449e-06, + "loss": 1.0747, + "step": 931 + }, + { + "epoch": 0.06733252659526433, + "grad_norm": 9.599590499409215, + "learning_rate": 4.999825447319288e-06, + "loss": 1.1415, + "step": 932 + }, + { + "epoch": 0.06740477179547384, + "grad_norm": 9.617166477307444, + "learning_rate": 4.999821973760457e-06, + "loss": 0.9436, + "step": 933 + }, + { + "epoch": 0.06747701699568336, + "grad_norm": 7.967325958467295, + "learning_rate": 4.999818465981004e-06, + "loss": 1.0447, + "step": 934 + }, + { + "epoch": 0.06754926219589286, + "grad_norm": 7.700809773062095, + "learning_rate": 4.9998149239809785e-06, + "loss": 0.9577, + "step": 935 + }, + { + "epoch": 0.06762150739610237, + "grad_norm": 13.457300546108096, + "learning_rate": 4.999811347760427e-06, + "loss": 1.0158, + "step": 936 + }, + { + "epoch": 0.06769375259631188, + "grad_norm": 17.125825959495263, + "learning_rate": 4.999807737319399e-06, + "loss": 1.0227, + "step": 937 + }, + { + "epoch": 0.0677659977965214, + "grad_norm": 8.145747529835688, + "learning_rate": 4.999804092657944e-06, + "loss": 1.1189, + "step": 938 + }, + { + "epoch": 0.0678382429967309, + "grad_norm": 168.07717721332662, + "learning_rate": 4.999800413776112e-06, + "loss": 1.291, + "step": 939 + }, + { + "epoch": 0.06791048819694041, + "grad_norm": 1252.7281229380937, + "learning_rate": 4.999796700673953e-06, + "loss": 4.4331, + "step": 940 + }, + { + "epoch": 0.06798273339714993, + "grad_norm": 579.5830128204242, + "learning_rate": 4.999792953351519e-06, + "loss": 4.2564, + "step": 941 + }, + { + "epoch": 0.06805497859735944, + "grad_norm": 1366.9764994322325, + "learning_rate": 4.9997891718088595e-06, + "loss": 8.2551, + "step": 942 + }, + { + "epoch": 0.06812722379756894, + "grad_norm": 167.11694773047407, + "learning_rate": 4.999785356046028e-06, + "loss": 2.9589, + "step": 943 + }, + { + "epoch": 0.06819946899777846, + "grad_norm": 140.71637309229513, + "learning_rate": 4.999781506063076e-06, + "loss": 2.4998, + "step": 944 + }, + { + "epoch": 0.06827171419798797, + "grad_norm": 87.14427972208647, + "learning_rate": 4.999777621860055e-06, + "loss": 1.9832, + "step": 945 + }, + { + "epoch": 0.06834395939819748, + "grad_norm": 47.80717137381823, + "learning_rate": 4.99977370343702e-06, + "loss": 1.6568, + "step": 946 + }, + { + "epoch": 0.068416204598407, + "grad_norm": 42.183343255865665, + "learning_rate": 4.999769750794024e-06, + "loss": 1.4574, + "step": 947 + }, + { + "epoch": 0.0684884497986165, + "grad_norm": 32.07011552162406, + "learning_rate": 4.999765763931122e-06, + "loss": 1.6063, + "step": 948 + }, + { + "epoch": 0.06856069499882601, + "grad_norm": 44.321087132971336, + "learning_rate": 4.999761742848366e-06, + "loss": 1.4548, + "step": 949 + }, + { + "epoch": 0.06863294019903553, + "grad_norm": 31.97084051999083, + "learning_rate": 4.999757687545813e-06, + "loss": 1.3405, + "step": 950 + }, + { + "epoch": 0.06870518539924504, + "grad_norm": 28.5199087107777, + "learning_rate": 4.999753598023518e-06, + "loss": 1.3621, + "step": 951 + }, + { + "epoch": 0.06877743059945454, + "grad_norm": 24.756747472359052, + "learning_rate": 4.999749474281538e-06, + "loss": 1.3252, + "step": 952 + }, + { + "epoch": 0.06884967579966406, + "grad_norm": 23.519081971813335, + "learning_rate": 4.999745316319928e-06, + "loss": 1.3094, + "step": 953 + }, + { + "epoch": 0.06892192099987357, + "grad_norm": 18.22155132971197, + "learning_rate": 4.999741124138746e-06, + "loss": 1.3866, + "step": 954 + }, + { + "epoch": 0.06899416620008308, + "grad_norm": 15.798736734228498, + "learning_rate": 4.999736897738049e-06, + "loss": 1.2714, + "step": 955 + }, + { + "epoch": 0.0690664114002926, + "grad_norm": 14.441216060757368, + "learning_rate": 4.999732637117895e-06, + "loss": 1.4716, + "step": 956 + }, + { + "epoch": 0.0691386566005021, + "grad_norm": 15.004367446862554, + "learning_rate": 4.999728342278341e-06, + "loss": 1.2103, + "step": 957 + }, + { + "epoch": 0.06921090180071161, + "grad_norm": 14.339982612205615, + "learning_rate": 4.999724013219448e-06, + "loss": 1.1751, + "step": 958 + }, + { + "epoch": 0.06928314700092113, + "grad_norm": 16.341533444938122, + "learning_rate": 4.999719649941274e-06, + "loss": 1.3727, + "step": 959 + }, + { + "epoch": 0.06935539220113064, + "grad_norm": 12.866704249615374, + "learning_rate": 4.999715252443879e-06, + "loss": 1.1248, + "step": 960 + }, + { + "epoch": 0.06942763740134014, + "grad_norm": 12.487614713796802, + "learning_rate": 4.999710820727322e-06, + "loss": 1.3028, + "step": 961 + }, + { + "epoch": 0.06949988260154966, + "grad_norm": 19.961318183747597, + "learning_rate": 4.9997063547916655e-06, + "loss": 1.3402, + "step": 962 + }, + { + "epoch": 0.06957212780175917, + "grad_norm": 9.612443139512013, + "learning_rate": 4.99970185463697e-06, + "loss": 1.1635, + "step": 963 + }, + { + "epoch": 0.06964437300196868, + "grad_norm": 14.32953439684184, + "learning_rate": 4.999697320263297e-06, + "loss": 1.0558, + "step": 964 + }, + { + "epoch": 0.0697166182021782, + "grad_norm": 12.044072915040514, + "learning_rate": 4.999692751670708e-06, + "loss": 1.1984, + "step": 965 + }, + { + "epoch": 0.0697888634023877, + "grad_norm": 9.324575902381692, + "learning_rate": 4.999688148859268e-06, + "loss": 1.2023, + "step": 966 + }, + { + "epoch": 0.06986110860259721, + "grad_norm": 13.371500002957841, + "learning_rate": 4.999683511829036e-06, + "loss": 1.2055, + "step": 967 + }, + { + "epoch": 0.06993335380280673, + "grad_norm": 11.212923795075916, + "learning_rate": 4.999678840580079e-06, + "loss": 1.1462, + "step": 968 + }, + { + "epoch": 0.07000559900301624, + "grad_norm": 9.304082765954023, + "learning_rate": 4.9996741351124585e-06, + "loss": 1.2165, + "step": 969 + }, + { + "epoch": 0.07007784420322574, + "grad_norm": 15.671522537786293, + "learning_rate": 4.9996693954262395e-06, + "loss": 1.1787, + "step": 970 + }, + { + "epoch": 0.07015008940343526, + "grad_norm": 7.554205010203367, + "learning_rate": 4.999664621521489e-06, + "loss": 1.1306, + "step": 971 + }, + { + "epoch": 0.07022233460364477, + "grad_norm": 15.969241618074582, + "learning_rate": 4.99965981339827e-06, + "loss": 1.0527, + "step": 972 + }, + { + "epoch": 0.07029457980385428, + "grad_norm": 9.581639104159752, + "learning_rate": 4.999654971056649e-06, + "loss": 1.1107, + "step": 973 + }, + { + "epoch": 0.0703668250040638, + "grad_norm": 13.49053849578474, + "learning_rate": 4.999650094496692e-06, + "loss": 1.1928, + "step": 974 + }, + { + "epoch": 0.0704390702042733, + "grad_norm": 9.491932706199693, + "learning_rate": 4.9996451837184665e-06, + "loss": 1.1282, + "step": 975 + }, + { + "epoch": 0.07051131540448281, + "grad_norm": 9.09452811251664, + "learning_rate": 4.999640238722039e-06, + "loss": 1.2054, + "step": 976 + }, + { + "epoch": 0.07058356060469233, + "grad_norm": 12.203298804380083, + "learning_rate": 4.999635259507477e-06, + "loss": 1.0912, + "step": 977 + }, + { + "epoch": 0.07065580580490184, + "grad_norm": 10.605703625967278, + "learning_rate": 4.99963024607485e-06, + "loss": 1.0932, + "step": 978 + }, + { + "epoch": 0.07072805100511134, + "grad_norm": 15.64453222534329, + "learning_rate": 4.999625198424226e-06, + "loss": 1.1285, + "step": 979 + }, + { + "epoch": 0.07080029620532086, + "grad_norm": 7.5180000470225625, + "learning_rate": 4.999620116555672e-06, + "loss": 1.072, + "step": 980 + }, + { + "epoch": 0.07087254140553037, + "grad_norm": 12.301884601397209, + "learning_rate": 4.999615000469261e-06, + "loss": 1.2232, + "step": 981 + }, + { + "epoch": 0.07094478660573988, + "grad_norm": 14.741817985233897, + "learning_rate": 4.999609850165062e-06, + "loss": 1.1257, + "step": 982 + }, + { + "epoch": 0.0710170318059494, + "grad_norm": 7.728657036471624, + "learning_rate": 4.999604665643144e-06, + "loss": 1.0595, + "step": 983 + }, + { + "epoch": 0.0710892770061589, + "grad_norm": 11.948299935048945, + "learning_rate": 4.9995994469035794e-06, + "loss": 1.2485, + "step": 984 + }, + { + "epoch": 0.07116152220636841, + "grad_norm": 9.995372846098315, + "learning_rate": 4.999594193946439e-06, + "loss": 1.0904, + "step": 985 + }, + { + "epoch": 0.07123376740657793, + "grad_norm": 7.802999105725011, + "learning_rate": 4.999588906771794e-06, + "loss": 1.0469, + "step": 986 + }, + { + "epoch": 0.07130601260678744, + "grad_norm": 11.389051540629588, + "learning_rate": 4.999583585379719e-06, + "loss": 1.1461, + "step": 987 + }, + { + "epoch": 0.07137825780699694, + "grad_norm": 12.720010830856607, + "learning_rate": 4.999578229770285e-06, + "loss": 1.1334, + "step": 988 + }, + { + "epoch": 0.07145050300720646, + "grad_norm": 10.948911457737477, + "learning_rate": 4.9995728399435665e-06, + "loss": 1.1993, + "step": 989 + }, + { + "epoch": 0.07152274820741597, + "grad_norm": 10.926613234210294, + "learning_rate": 4.999567415899636e-06, + "loss": 1.1061, + "step": 990 + }, + { + "epoch": 0.07159499340762548, + "grad_norm": 10.067142150293634, + "learning_rate": 4.9995619576385675e-06, + "loss": 1.0645, + "step": 991 + }, + { + "epoch": 0.071667238607835, + "grad_norm": 9.571186981731044, + "learning_rate": 4.999556465160438e-06, + "loss": 1.1765, + "step": 992 + }, + { + "epoch": 0.0717394838080445, + "grad_norm": 11.328238819471302, + "learning_rate": 4.99955093846532e-06, + "loss": 1.061, + "step": 993 + }, + { + "epoch": 0.07181172900825401, + "grad_norm": 18.699339111399766, + "learning_rate": 4.999545377553291e-06, + "loss": 1.1961, + "step": 994 + }, + { + "epoch": 0.07188397420846353, + "grad_norm": 12.101520259973086, + "learning_rate": 4.999539782424427e-06, + "loss": 1.0998, + "step": 995 + }, + { + "epoch": 0.07195621940867304, + "grad_norm": 10.005480980852601, + "learning_rate": 4.9995341530788036e-06, + "loss": 1.157, + "step": 996 + }, + { + "epoch": 0.07202846460888254, + "grad_norm": 9.770064615838335, + "learning_rate": 4.999528489516498e-06, + "loss": 1.097, + "step": 997 + }, + { + "epoch": 0.07210070980909206, + "grad_norm": 10.67458665391469, + "learning_rate": 4.999522791737589e-06, + "loss": 1.2029, + "step": 998 + }, + { + "epoch": 0.07217295500930157, + "grad_norm": 10.797893558933547, + "learning_rate": 4.999517059742154e-06, + "loss": 1.2861, + "step": 999 + }, + { + "epoch": 0.07224520020951108, + "grad_norm": 9.363852014440416, + "learning_rate": 4.99951129353027e-06, + "loss": 1.1835, + "step": 1000 + }, + { + "epoch": 0.07231744540972058, + "grad_norm": 6.911859366063537, + "learning_rate": 4.999505493102018e-06, + "loss": 1.064, + "step": 1001 + }, + { + "epoch": 0.0723896906099301, + "grad_norm": 11.66220799578339, + "learning_rate": 4.999499658457477e-06, + "loss": 1.1255, + "step": 1002 + }, + { + "epoch": 0.07246193581013961, + "grad_norm": 8.815600559940421, + "learning_rate": 4.999493789596726e-06, + "loss": 1.0788, + "step": 1003 + }, + { + "epoch": 0.07253418101034913, + "grad_norm": 15.791114066058341, + "learning_rate": 4.999487886519845e-06, + "loss": 1.2195, + "step": 1004 + }, + { + "epoch": 0.07260642621055864, + "grad_norm": 9.319790233166966, + "learning_rate": 4.9994819492269165e-06, + "loss": 1.196, + "step": 1005 + }, + { + "epoch": 0.07267867141076814, + "grad_norm": 8.429510558412899, + "learning_rate": 4.99947597771802e-06, + "loss": 1.1572, + "step": 1006 + }, + { + "epoch": 0.07275091661097766, + "grad_norm": 7.784609885023894, + "learning_rate": 4.999469971993238e-06, + "loss": 1.0095, + "step": 1007 + }, + { + "epoch": 0.07282316181118717, + "grad_norm": 16.09034399638103, + "learning_rate": 4.999463932052654e-06, + "loss": 1.2012, + "step": 1008 + }, + { + "epoch": 0.07289540701139668, + "grad_norm": 10.30262311106982, + "learning_rate": 4.999457857896349e-06, + "loss": 1.1139, + "step": 1009 + }, + { + "epoch": 0.07296765221160618, + "grad_norm": 13.580707756554713, + "learning_rate": 4.999451749524406e-06, + "loss": 1.1026, + "step": 1010 + }, + { + "epoch": 0.0730398974118157, + "grad_norm": 9.150527572014163, + "learning_rate": 4.9994456069369095e-06, + "loss": 1.0825, + "step": 1011 + }, + { + "epoch": 0.07311214261202521, + "grad_norm": 12.158839874139348, + "learning_rate": 4.999439430133943e-06, + "loss": 1.188, + "step": 1012 + }, + { + "epoch": 0.07318438781223473, + "grad_norm": 11.05066077556723, + "learning_rate": 4.999433219115592e-06, + "loss": 1.0179, + "step": 1013 + }, + { + "epoch": 0.07325663301244424, + "grad_norm": 13.49997061267056, + "learning_rate": 4.999426973881941e-06, + "loss": 1.1698, + "step": 1014 + }, + { + "epoch": 0.07332887821265374, + "grad_norm": 8.056771542496925, + "learning_rate": 4.999420694433076e-06, + "loss": 1.0667, + "step": 1015 + }, + { + "epoch": 0.07340112341286326, + "grad_norm": 10.122579026124553, + "learning_rate": 4.9994143807690805e-06, + "loss": 1.0561, + "step": 1016 + }, + { + "epoch": 0.07347336861307277, + "grad_norm": 13.091996614963316, + "learning_rate": 4.999408032890045e-06, + "loss": 1.1001, + "step": 1017 + }, + { + "epoch": 0.07354561381328228, + "grad_norm": 11.430015908644878, + "learning_rate": 4.999401650796052e-06, + "loss": 1.145, + "step": 1018 + }, + { + "epoch": 0.07361785901349178, + "grad_norm": 10.033891466739542, + "learning_rate": 4.999395234487192e-06, + "loss": 1.1755, + "step": 1019 + }, + { + "epoch": 0.0736901042137013, + "grad_norm": 9.02972040125841, + "learning_rate": 4.999388783963552e-06, + "loss": 1.0965, + "step": 1020 + }, + { + "epoch": 0.07376234941391081, + "grad_norm": 10.051461746420074, + "learning_rate": 4.99938229922522e-06, + "loss": 1.0868, + "step": 1021 + }, + { + "epoch": 0.07383459461412033, + "grad_norm": 11.513462313879566, + "learning_rate": 4.999375780272286e-06, + "loss": 1.117, + "step": 1022 + }, + { + "epoch": 0.07390683981432984, + "grad_norm": 10.49931587533672, + "learning_rate": 4.9993692271048375e-06, + "loss": 1.1138, + "step": 1023 + }, + { + "epoch": 0.07397908501453934, + "grad_norm": 10.592615297476447, + "learning_rate": 4.999362639722964e-06, + "loss": 1.0211, + "step": 1024 + }, + { + "epoch": 0.07405133021474886, + "grad_norm": 8.014517486383742, + "learning_rate": 4.999356018126758e-06, + "loss": 1.0701, + "step": 1025 + }, + { + "epoch": 0.07412357541495837, + "grad_norm": 9.537454355513116, + "learning_rate": 4.999349362316308e-06, + "loss": 1.1702, + "step": 1026 + }, + { + "epoch": 0.07419582061516788, + "grad_norm": 11.499276179879011, + "learning_rate": 4.999342672291706e-06, + "loss": 1.1402, + "step": 1027 + }, + { + "epoch": 0.07426806581537738, + "grad_norm": 9.01688454224321, + "learning_rate": 4.999335948053044e-06, + "loss": 1.155, + "step": 1028 + }, + { + "epoch": 0.0743403110155869, + "grad_norm": 10.333059450847367, + "learning_rate": 4.999329189600413e-06, + "loss": 1.1249, + "step": 1029 + }, + { + "epoch": 0.07441255621579641, + "grad_norm": 12.433520116417215, + "learning_rate": 4.9993223969339056e-06, + "loss": 1.0675, + "step": 1030 + }, + { + "epoch": 0.07448480141600593, + "grad_norm": 8.939625307291443, + "learning_rate": 4.999315570053616e-06, + "loss": 1.0674, + "step": 1031 + }, + { + "epoch": 0.07455704661621544, + "grad_norm": 9.45522477970596, + "learning_rate": 4.999308708959636e-06, + "loss": 1.0398, + "step": 1032 + }, + { + "epoch": 0.07462929181642494, + "grad_norm": 13.8720518537034, + "learning_rate": 4.999301813652061e-06, + "loss": 1.0534, + "step": 1033 + }, + { + "epoch": 0.07470153701663446, + "grad_norm": 11.505012414984407, + "learning_rate": 4.999294884130984e-06, + "loss": 1.1643, + "step": 1034 + }, + { + "epoch": 0.07477378221684397, + "grad_norm": 9.697365345975179, + "learning_rate": 4.9992879203965e-06, + "loss": 1.063, + "step": 1035 + }, + { + "epoch": 0.07484602741705348, + "grad_norm": 12.800564896516164, + "learning_rate": 4.999280922448707e-06, + "loss": 0.988, + "step": 1036 + }, + { + "epoch": 0.07491827261726298, + "grad_norm": 11.25646007284626, + "learning_rate": 4.999273890287698e-06, + "loss": 1.1378, + "step": 1037 + }, + { + "epoch": 0.0749905178174725, + "grad_norm": 8.921728770966718, + "learning_rate": 4.999266823913569e-06, + "loss": 1.0187, + "step": 1038 + }, + { + "epoch": 0.07506276301768201, + "grad_norm": 11.769766556102812, + "learning_rate": 4.999259723326419e-06, + "loss": 1.0744, + "step": 1039 + }, + { + "epoch": 0.07513500821789153, + "grad_norm": 12.213888698872179, + "learning_rate": 4.999252588526343e-06, + "loss": 1.1411, + "step": 1040 + }, + { + "epoch": 0.07520725341810104, + "grad_norm": 7.38209037680377, + "learning_rate": 4.99924541951344e-06, + "loss": 1.0931, + "step": 1041 + }, + { + "epoch": 0.07527949861831054, + "grad_norm": 11.892896157114391, + "learning_rate": 4.999238216287808e-06, + "loss": 1.0689, + "step": 1042 + }, + { + "epoch": 0.07535174381852006, + "grad_norm": 12.732171252950264, + "learning_rate": 4.999230978849545e-06, + "loss": 1.219, + "step": 1043 + }, + { + "epoch": 0.07542398901872957, + "grad_norm": 7.93188638501308, + "learning_rate": 4.99922370719875e-06, + "loss": 1.0188, + "step": 1044 + }, + { + "epoch": 0.07549623421893908, + "grad_norm": 11.27336034926823, + "learning_rate": 4.999216401335524e-06, + "loss": 1.0437, + "step": 1045 + }, + { + "epoch": 0.07556847941914858, + "grad_norm": 8.143089463211084, + "learning_rate": 4.9992090612599655e-06, + "loss": 1.1571, + "step": 1046 + }, + { + "epoch": 0.0756407246193581, + "grad_norm": 8.372680798432567, + "learning_rate": 4.999201686972176e-06, + "loss": 1.0619, + "step": 1047 + }, + { + "epoch": 0.07571296981956761, + "grad_norm": 8.191027165465307, + "learning_rate": 4.999194278472255e-06, + "loss": 1.0507, + "step": 1048 + }, + { + "epoch": 0.07578521501977713, + "grad_norm": 7.784751257576184, + "learning_rate": 4.999186835760305e-06, + "loss": 1.0829, + "step": 1049 + }, + { + "epoch": 0.07585746021998664, + "grad_norm": 9.461203208361098, + "learning_rate": 4.999179358836428e-06, + "loss": 0.9316, + "step": 1050 + }, + { + "epoch": 0.07592970542019614, + "grad_norm": 8.90134442065451, + "learning_rate": 4.999171847700725e-06, + "loss": 1.1381, + "step": 1051 + }, + { + "epoch": 0.07600195062040566, + "grad_norm": 9.435129436338897, + "learning_rate": 4.999164302353302e-06, + "loss": 1.0992, + "step": 1052 + }, + { + "epoch": 0.07607419582061517, + "grad_norm": 7.950773419839909, + "learning_rate": 4.999156722794259e-06, + "loss": 1.0634, + "step": 1053 + }, + { + "epoch": 0.07614644102082468, + "grad_norm": 6.149359705398356, + "learning_rate": 4.999149109023701e-06, + "loss": 0.9633, + "step": 1054 + }, + { + "epoch": 0.07621868622103418, + "grad_norm": 8.409094347004105, + "learning_rate": 4.999141461041732e-06, + "loss": 1.1221, + "step": 1055 + }, + { + "epoch": 0.0762909314212437, + "grad_norm": 10.605138548036365, + "learning_rate": 4.999133778848457e-06, + "loss": 1.1766, + "step": 1056 + }, + { + "epoch": 0.07636317662145321, + "grad_norm": 8.686196647959983, + "learning_rate": 4.999126062443981e-06, + "loss": 1.03, + "step": 1057 + }, + { + "epoch": 0.07643542182166273, + "grad_norm": 8.343998708840084, + "learning_rate": 4.999118311828409e-06, + "loss": 1.1061, + "step": 1058 + }, + { + "epoch": 0.07650766702187223, + "grad_norm": 8.326950935212805, + "learning_rate": 4.999110527001849e-06, + "loss": 1.1387, + "step": 1059 + }, + { + "epoch": 0.07657991222208174, + "grad_norm": 8.29351336235745, + "learning_rate": 4.999102707964406e-06, + "loss": 1.0004, + "step": 1060 + }, + { + "epoch": 0.07665215742229126, + "grad_norm": 8.809209547488498, + "learning_rate": 4.999094854716187e-06, + "loss": 1.1205, + "step": 1061 + }, + { + "epoch": 0.07672440262250077, + "grad_norm": 9.26867810746836, + "learning_rate": 4.9990869672573e-06, + "loss": 1.0999, + "step": 1062 + }, + { + "epoch": 0.07679664782271028, + "grad_norm": 9.428294561705481, + "learning_rate": 4.999079045587852e-06, + "loss": 1.042, + "step": 1063 + }, + { + "epoch": 0.07686889302291978, + "grad_norm": 10.003404800615662, + "learning_rate": 4.999071089707953e-06, + "loss": 1.158, + "step": 1064 + }, + { + "epoch": 0.0769411382231293, + "grad_norm": 7.3664252654463445, + "learning_rate": 4.999063099617712e-06, + "loss": 1.0712, + "step": 1065 + }, + { + "epoch": 0.07701338342333881, + "grad_norm": 8.603267989129098, + "learning_rate": 4.999055075317237e-06, + "loss": 1.0203, + "step": 1066 + }, + { + "epoch": 0.07708562862354833, + "grad_norm": 9.014182466239962, + "learning_rate": 4.999047016806637e-06, + "loss": 1.101, + "step": 1067 + }, + { + "epoch": 0.07715787382375783, + "grad_norm": 10.773023568348824, + "learning_rate": 4.999038924086026e-06, + "loss": 1.0112, + "step": 1068 + }, + { + "epoch": 0.07723011902396734, + "grad_norm": 8.711257613933169, + "learning_rate": 4.999030797155511e-06, + "loss": 1.0507, + "step": 1069 + }, + { + "epoch": 0.07730236422417686, + "grad_norm": 11.018520715916898, + "learning_rate": 4.999022636015205e-06, + "loss": 0.9961, + "step": 1070 + }, + { + "epoch": 0.07737460942438637, + "grad_norm": 8.808058031797016, + "learning_rate": 4.99901444066522e-06, + "loss": 1.0866, + "step": 1071 + }, + { + "epoch": 0.07744685462459588, + "grad_norm": 7.085077706760869, + "learning_rate": 4.999006211105667e-06, + "loss": 0.997, + "step": 1072 + }, + { + "epoch": 0.07751909982480538, + "grad_norm": 7.515437450963564, + "learning_rate": 4.99899794733666e-06, + "loss": 0.987, + "step": 1073 + }, + { + "epoch": 0.0775913450250149, + "grad_norm": 8.220134429946187, + "learning_rate": 4.998989649358311e-06, + "loss": 1.0864, + "step": 1074 + }, + { + "epoch": 0.07766359022522441, + "grad_norm": 7.01109361332347, + "learning_rate": 4.9989813171707345e-06, + "loss": 1.0873, + "step": 1075 + }, + { + "epoch": 0.07773583542543393, + "grad_norm": 7.783209251487551, + "learning_rate": 4.9989729507740435e-06, + "loss": 1.0629, + "step": 1076 + }, + { + "epoch": 0.07780808062564343, + "grad_norm": 7.85286010343997, + "learning_rate": 4.998964550168354e-06, + "loss": 1.0904, + "step": 1077 + }, + { + "epoch": 0.07788032582585294, + "grad_norm": 10.559509576333353, + "learning_rate": 4.9989561153537795e-06, + "loss": 1.0421, + "step": 1078 + }, + { + "epoch": 0.07795257102606246, + "grad_norm": 7.750250535422478, + "learning_rate": 4.998947646330435e-06, + "loss": 1.0503, + "step": 1079 + }, + { + "epoch": 0.07802481622627197, + "grad_norm": 8.632834152263571, + "learning_rate": 4.998939143098439e-06, + "loss": 1.1292, + "step": 1080 + }, + { + "epoch": 0.07809706142648148, + "grad_norm": 8.116755131651287, + "learning_rate": 4.998930605657906e-06, + "loss": 0.9979, + "step": 1081 + }, + { + "epoch": 0.07816930662669098, + "grad_norm": 8.48251339210647, + "learning_rate": 4.998922034008954e-06, + "loss": 1.1144, + "step": 1082 + }, + { + "epoch": 0.0782415518269005, + "grad_norm": 7.646926372123236, + "learning_rate": 4.998913428151699e-06, + "loss": 0.9841, + "step": 1083 + }, + { + "epoch": 0.07831379702711001, + "grad_norm": 8.063860164913411, + "learning_rate": 4.99890478808626e-06, + "loss": 1.088, + "step": 1084 + }, + { + "epoch": 0.07838604222731953, + "grad_norm": 8.69952570181406, + "learning_rate": 4.998896113812754e-06, + "loss": 1.0103, + "step": 1085 + }, + { + "epoch": 0.07845828742752903, + "grad_norm": 7.262411083524104, + "learning_rate": 4.9988874053313e-06, + "loss": 1.0132, + "step": 1086 + }, + { + "epoch": 0.07853053262773854, + "grad_norm": 7.728081502765024, + "learning_rate": 4.998878662642018e-06, + "loss": 0.9863, + "step": 1087 + }, + { + "epoch": 0.07860277782794806, + "grad_norm": 7.079423730533815, + "learning_rate": 4.998869885745028e-06, + "loss": 0.9302, + "step": 1088 + }, + { + "epoch": 0.07867502302815757, + "grad_norm": 10.197776058319997, + "learning_rate": 4.998861074640449e-06, + "loss": 0.9764, + "step": 1089 + }, + { + "epoch": 0.07874726822836708, + "grad_norm": 7.017699073112334, + "learning_rate": 4.998852229328402e-06, + "loss": 1.0114, + "step": 1090 + }, + { + "epoch": 0.07881951342857658, + "grad_norm": 12.024054260496955, + "learning_rate": 4.9988433498090096e-06, + "loss": 1.0303, + "step": 1091 + }, + { + "epoch": 0.0788917586287861, + "grad_norm": 8.720532306907664, + "learning_rate": 4.99883443608239e-06, + "loss": 1.0227, + "step": 1092 + }, + { + "epoch": 0.07896400382899561, + "grad_norm": 10.23896754066679, + "learning_rate": 4.998825488148668e-06, + "loss": 1.1158, + "step": 1093 + }, + { + "epoch": 0.07903624902920513, + "grad_norm": 8.286403448825022, + "learning_rate": 4.998816506007966e-06, + "loss": 1.059, + "step": 1094 + }, + { + "epoch": 0.07910849422941463, + "grad_norm": 9.157439379208945, + "learning_rate": 4.998807489660405e-06, + "loss": 1.0544, + "step": 1095 + }, + { + "epoch": 0.07918073942962414, + "grad_norm": 8.844072207994168, + "learning_rate": 4.998798439106111e-06, + "loss": 1.0838, + "step": 1096 + }, + { + "epoch": 0.07925298462983366, + "grad_norm": 8.374370949877509, + "learning_rate": 4.998789354345206e-06, + "loss": 0.9951, + "step": 1097 + }, + { + "epoch": 0.07932522983004317, + "grad_norm": 7.5593075910498255, + "learning_rate": 4.998780235377815e-06, + "loss": 1.1335, + "step": 1098 + }, + { + "epoch": 0.07939747503025268, + "grad_norm": 8.159172616038976, + "learning_rate": 4.998771082204062e-06, + "loss": 1.0723, + "step": 1099 + }, + { + "epoch": 0.07946972023046218, + "grad_norm": 7.843636196572333, + "learning_rate": 4.998761894824074e-06, + "loss": 1.077, + "step": 1100 + }, + { + "epoch": 0.0795419654306717, + "grad_norm": 10.013461017564953, + "learning_rate": 4.998752673237976e-06, + "loss": 1.0934, + "step": 1101 + }, + { + "epoch": 0.07961421063088121, + "grad_norm": 9.251904678137354, + "learning_rate": 4.998743417445893e-06, + "loss": 1.1208, + "step": 1102 + }, + { + "epoch": 0.07968645583109073, + "grad_norm": 8.063492115103234, + "learning_rate": 4.9987341274479524e-06, + "loss": 1.0444, + "step": 1103 + }, + { + "epoch": 0.07975870103130023, + "grad_norm": 7.964504172965911, + "learning_rate": 4.998724803244283e-06, + "loss": 1.0694, + "step": 1104 + }, + { + "epoch": 0.07983094623150974, + "grad_norm": 8.099611023405306, + "learning_rate": 4.998715444835011e-06, + "loss": 1.094, + "step": 1105 + }, + { + "epoch": 0.07990319143171926, + "grad_norm": 9.626043089785451, + "learning_rate": 4.998706052220265e-06, + "loss": 0.9621, + "step": 1106 + }, + { + "epoch": 0.07997543663192877, + "grad_norm": 9.898266002731987, + "learning_rate": 4.998696625400172e-06, + "loss": 1.1121, + "step": 1107 + }, + { + "epoch": 0.08004768183213827, + "grad_norm": 8.261856344894401, + "learning_rate": 4.998687164374863e-06, + "loss": 1.032, + "step": 1108 + }, + { + "epoch": 0.08011992703234778, + "grad_norm": 6.460485809294053, + "learning_rate": 4.998677669144467e-06, + "loss": 0.9505, + "step": 1109 + }, + { + "epoch": 0.0801921722325573, + "grad_norm": 9.319259341038178, + "learning_rate": 4.998668139709113e-06, + "loss": 1.0253, + "step": 1110 + }, + { + "epoch": 0.08026441743276681, + "grad_norm": 7.739720079352888, + "learning_rate": 4.998658576068933e-06, + "loss": 1.0211, + "step": 1111 + }, + { + "epoch": 0.08033666263297633, + "grad_norm": 9.040593049843086, + "learning_rate": 4.9986489782240575e-06, + "loss": 1.0445, + "step": 1112 + }, + { + "epoch": 0.08040890783318583, + "grad_norm": 11.828095973998401, + "learning_rate": 4.998639346174618e-06, + "loss": 1.0956, + "step": 1113 + }, + { + "epoch": 0.08048115303339534, + "grad_norm": 9.531187964065532, + "learning_rate": 4.998629679920744e-06, + "loss": 1.2079, + "step": 1114 + }, + { + "epoch": 0.08055339823360486, + "grad_norm": 7.734908638471938, + "learning_rate": 4.998619979462571e-06, + "loss": 0.9799, + "step": 1115 + }, + { + "epoch": 0.08062564343381437, + "grad_norm": 11.347745669828408, + "learning_rate": 4.99861024480023e-06, + "loss": 1.1444, + "step": 1116 + }, + { + "epoch": 0.08069788863402387, + "grad_norm": 9.159509072287253, + "learning_rate": 4.9986004759338555e-06, + "loss": 1.0438, + "step": 1117 + }, + { + "epoch": 0.08077013383423338, + "grad_norm": 8.254305554113202, + "learning_rate": 4.9985906728635805e-06, + "loss": 1.0677, + "step": 1118 + }, + { + "epoch": 0.0808423790344429, + "grad_norm": 9.087270414241175, + "learning_rate": 4.998580835589538e-06, + "loss": 0.9993, + "step": 1119 + }, + { + "epoch": 0.08091462423465241, + "grad_norm": 9.578841862447739, + "learning_rate": 4.998570964111865e-06, + "loss": 1.0351, + "step": 1120 + }, + { + "epoch": 0.08098686943486193, + "grad_norm": 10.152247558701673, + "learning_rate": 4.998561058430696e-06, + "loss": 1.0478, + "step": 1121 + }, + { + "epoch": 0.08105911463507143, + "grad_norm": 8.402404132308606, + "learning_rate": 4.998551118546165e-06, + "loss": 0.9444, + "step": 1122 + }, + { + "epoch": 0.08113135983528094, + "grad_norm": 11.8092436969662, + "learning_rate": 4.99854114445841e-06, + "loss": 1.0678, + "step": 1123 + }, + { + "epoch": 0.08120360503549046, + "grad_norm": 8.492356847006285, + "learning_rate": 4.998531136167566e-06, + "loss": 1.0957, + "step": 1124 + }, + { + "epoch": 0.08127585023569997, + "grad_norm": 8.914724500916789, + "learning_rate": 4.9985210936737705e-06, + "loss": 1.0199, + "step": 1125 + }, + { + "epoch": 0.08134809543590947, + "grad_norm": 10.226529301297335, + "learning_rate": 4.9985110169771624e-06, + "loss": 1.0936, + "step": 1126 + }, + { + "epoch": 0.08142034063611898, + "grad_norm": 11.26752175541672, + "learning_rate": 4.998500906077878e-06, + "loss": 1.0493, + "step": 1127 + }, + { + "epoch": 0.0814925858363285, + "grad_norm": 7.6901009974401555, + "learning_rate": 4.9984907609760556e-06, + "loss": 0.9427, + "step": 1128 + }, + { + "epoch": 0.08156483103653801, + "grad_norm": 8.110694381018506, + "learning_rate": 4.9984805816718355e-06, + "loss": 1.0684, + "step": 1129 + }, + { + "epoch": 0.08163707623674753, + "grad_norm": 10.169367280405849, + "learning_rate": 4.998470368165355e-06, + "loss": 1.2313, + "step": 1130 + }, + { + "epoch": 0.08170932143695703, + "grad_norm": 8.312233368462254, + "learning_rate": 4.998460120456756e-06, + "loss": 1.0566, + "step": 1131 + }, + { + "epoch": 0.08178156663716654, + "grad_norm": 12.509159241987698, + "learning_rate": 4.998449838546178e-06, + "loss": 1.057, + "step": 1132 + }, + { + "epoch": 0.08185381183737606, + "grad_norm": 7.44460643130261, + "learning_rate": 4.998439522433761e-06, + "loss": 0.9894, + "step": 1133 + }, + { + "epoch": 0.08192605703758557, + "grad_norm": 7.34645089337428, + "learning_rate": 4.998429172119647e-06, + "loss": 1.0724, + "step": 1134 + }, + { + "epoch": 0.08199830223779507, + "grad_norm": 9.75773665778172, + "learning_rate": 4.998418787603978e-06, + "loss": 0.9903, + "step": 1135 + }, + { + "epoch": 0.08207054743800458, + "grad_norm": 8.746829957290688, + "learning_rate": 4.9984083688868945e-06, + "loss": 1.0508, + "step": 1136 + }, + { + "epoch": 0.0821427926382141, + "grad_norm": 9.26367496343244, + "learning_rate": 4.998397915968541e-06, + "loss": 1.0882, + "step": 1137 + }, + { + "epoch": 0.08221503783842361, + "grad_norm": 8.051920729192458, + "learning_rate": 4.998387428849061e-06, + "loss": 1.0418, + "step": 1138 + }, + { + "epoch": 0.08228728303863313, + "grad_norm": 9.871850006274746, + "learning_rate": 4.998376907528596e-06, + "loss": 1.1238, + "step": 1139 + }, + { + "epoch": 0.08235952823884263, + "grad_norm": 7.55332453207304, + "learning_rate": 4.998366352007291e-06, + "loss": 1.038, + "step": 1140 + }, + { + "epoch": 0.08243177343905214, + "grad_norm": 7.612874379841177, + "learning_rate": 4.9983557622852906e-06, + "loss": 1.0458, + "step": 1141 + }, + { + "epoch": 0.08250401863926166, + "grad_norm": 9.162435583952453, + "learning_rate": 4.9983451383627394e-06, + "loss": 1.1471, + "step": 1142 + }, + { + "epoch": 0.08257626383947117, + "grad_norm": 8.699883506258054, + "learning_rate": 4.998334480239783e-06, + "loss": 1.0271, + "step": 1143 + }, + { + "epoch": 0.08264850903968067, + "grad_norm": 8.003606936814391, + "learning_rate": 4.998323787916568e-06, + "loss": 0.9465, + "step": 1144 + }, + { + "epoch": 0.08272075423989018, + "grad_norm": 9.08944967519757, + "learning_rate": 4.998313061393239e-06, + "loss": 1.023, + "step": 1145 + }, + { + "epoch": 0.0827929994400997, + "grad_norm": 10.64499783954446, + "learning_rate": 4.998302300669946e-06, + "loss": 1.0594, + "step": 1146 + }, + { + "epoch": 0.08286524464030921, + "grad_norm": 7.027268023928428, + "learning_rate": 4.998291505746833e-06, + "loss": 1.0061, + "step": 1147 + }, + { + "epoch": 0.08293748984051873, + "grad_norm": 6.857355676481956, + "learning_rate": 4.998280676624049e-06, + "loss": 0.9767, + "step": 1148 + }, + { + "epoch": 0.08300973504072823, + "grad_norm": 9.301179495227549, + "learning_rate": 4.9982698133017425e-06, + "loss": 1.0287, + "step": 1149 + }, + { + "epoch": 0.08308198024093774, + "grad_norm": 7.371449537236845, + "learning_rate": 4.998258915780062e-06, + "loss": 1.0113, + "step": 1150 + }, + { + "epoch": 0.08315422544114726, + "grad_norm": 7.118158904361727, + "learning_rate": 4.998247984059157e-06, + "loss": 1.0289, + "step": 1151 + }, + { + "epoch": 0.08322647064135677, + "grad_norm": 7.493440429809104, + "learning_rate": 4.998237018139177e-06, + "loss": 0.9469, + "step": 1152 + }, + { + "epoch": 0.08329871584156627, + "grad_norm": 8.059626104045504, + "learning_rate": 4.998226018020271e-06, + "loss": 1.1325, + "step": 1153 + }, + { + "epoch": 0.08337096104177578, + "grad_norm": 7.627840107274696, + "learning_rate": 4.9982149837025915e-06, + "loss": 0.9679, + "step": 1154 + }, + { + "epoch": 0.0834432062419853, + "grad_norm": 10.390496137006382, + "learning_rate": 4.9982039151862886e-06, + "loss": 1.0214, + "step": 1155 + }, + { + "epoch": 0.08351545144219481, + "grad_norm": 8.294970852442527, + "learning_rate": 4.998192812471514e-06, + "loss": 1.0068, + "step": 1156 + }, + { + "epoch": 0.08358769664240433, + "grad_norm": 7.2687658596767815, + "learning_rate": 4.998181675558419e-06, + "loss": 1.0359, + "step": 1157 + }, + { + "epoch": 0.08365994184261383, + "grad_norm": 11.96708233323029, + "learning_rate": 4.998170504447156e-06, + "loss": 1.1325, + "step": 1158 + }, + { + "epoch": 0.08373218704282334, + "grad_norm": 7.53534583945329, + "learning_rate": 4.99815929913788e-06, + "loss": 0.9961, + "step": 1159 + }, + { + "epoch": 0.08380443224303286, + "grad_norm": 8.456074480376564, + "learning_rate": 4.998148059630742e-06, + "loss": 1.0054, + "step": 1160 + }, + { + "epoch": 0.08387667744324237, + "grad_norm": 10.585953715970053, + "learning_rate": 4.998136785925896e-06, + "loss": 1.1631, + "step": 1161 + }, + { + "epoch": 0.08394892264345187, + "grad_norm": 9.7325467127128, + "learning_rate": 4.998125478023498e-06, + "loss": 1.1169, + "step": 1162 + }, + { + "epoch": 0.08402116784366138, + "grad_norm": 9.187471766006787, + "learning_rate": 4.998114135923702e-06, + "loss": 1.0181, + "step": 1163 + }, + { + "epoch": 0.0840934130438709, + "grad_norm": 8.152130840685295, + "learning_rate": 4.998102759626663e-06, + "loss": 1.0458, + "step": 1164 + }, + { + "epoch": 0.08416565824408041, + "grad_norm": 10.813768610097702, + "learning_rate": 4.998091349132536e-06, + "loss": 1.042, + "step": 1165 + }, + { + "epoch": 0.08423790344428991, + "grad_norm": 12.487774783481994, + "learning_rate": 4.99807990444148e-06, + "loss": 1.0521, + "step": 1166 + }, + { + "epoch": 0.08431014864449943, + "grad_norm": 8.111434645565318, + "learning_rate": 4.9980684255536484e-06, + "loss": 1.0093, + "step": 1167 + }, + { + "epoch": 0.08438239384470894, + "grad_norm": 7.229229343226888, + "learning_rate": 4.9980569124692e-06, + "loss": 0.9784, + "step": 1168 + }, + { + "epoch": 0.08445463904491846, + "grad_norm": 11.523341498540786, + "learning_rate": 4.9980453651882924e-06, + "loss": 0.9999, + "step": 1169 + }, + { + "epoch": 0.08452688424512797, + "grad_norm": 9.631893315202843, + "learning_rate": 4.998033783711083e-06, + "loss": 1.0699, + "step": 1170 + }, + { + "epoch": 0.08459912944533747, + "grad_norm": 10.038918104506301, + "learning_rate": 4.99802216803773e-06, + "loss": 1.0899, + "step": 1171 + }, + { + "epoch": 0.08467137464554698, + "grad_norm": 9.196129722871618, + "learning_rate": 4.998010518168393e-06, + "loss": 1.213, + "step": 1172 + }, + { + "epoch": 0.0847436198457565, + "grad_norm": 11.634530047608754, + "learning_rate": 4.997998834103233e-06, + "loss": 1.1776, + "step": 1173 + }, + { + "epoch": 0.08481586504596601, + "grad_norm": 8.441376424345735, + "learning_rate": 4.9979871158424075e-06, + "loss": 0.9978, + "step": 1174 + }, + { + "epoch": 0.08488811024617551, + "grad_norm": 7.7491075555650175, + "learning_rate": 4.997975363386078e-06, + "loss": 1.0773, + "step": 1175 + }, + { + "epoch": 0.08496035544638503, + "grad_norm": 9.233312452088397, + "learning_rate": 4.997963576734406e-06, + "loss": 0.9961, + "step": 1176 + }, + { + "epoch": 0.08503260064659454, + "grad_norm": 9.806789802004687, + "learning_rate": 4.997951755887552e-06, + "loss": 0.9672, + "step": 1177 + }, + { + "epoch": 0.08510484584680406, + "grad_norm": 9.566787057932062, + "learning_rate": 4.997939900845678e-06, + "loss": 1.0264, + "step": 1178 + }, + { + "epoch": 0.08517709104701357, + "grad_norm": 7.884842791686425, + "learning_rate": 4.997928011608946e-06, + "loss": 1.0766, + "step": 1179 + }, + { + "epoch": 0.08524933624722307, + "grad_norm": 9.559610167020942, + "learning_rate": 4.9979160881775194e-06, + "loss": 1.0885, + "step": 1180 + }, + { + "epoch": 0.08532158144743258, + "grad_norm": 9.250748062815228, + "learning_rate": 4.997904130551561e-06, + "loss": 1.1334, + "step": 1181 + }, + { + "epoch": 0.0853938266476421, + "grad_norm": 7.939635462620237, + "learning_rate": 4.997892138731234e-06, + "loss": 0.9784, + "step": 1182 + }, + { + "epoch": 0.08546607184785161, + "grad_norm": 6.920102411295118, + "learning_rate": 4.997880112716703e-06, + "loss": 1.029, + "step": 1183 + }, + { + "epoch": 0.08553831704806111, + "grad_norm": 7.195743245538496, + "learning_rate": 4.997868052508133e-06, + "loss": 1.0498, + "step": 1184 + }, + { + "epoch": 0.08561056224827063, + "grad_norm": 8.418912631972443, + "learning_rate": 4.99785595810569e-06, + "loss": 1.0402, + "step": 1185 + }, + { + "epoch": 0.08568280744848014, + "grad_norm": 7.743907133144584, + "learning_rate": 4.997843829509536e-06, + "loss": 1.0653, + "step": 1186 + }, + { + "epoch": 0.08575505264868966, + "grad_norm": 8.158732186355419, + "learning_rate": 4.997831666719842e-06, + "loss": 1.0036, + "step": 1187 + }, + { + "epoch": 0.08582729784889917, + "grad_norm": 7.8889102696709585, + "learning_rate": 4.9978194697367705e-06, + "loss": 0.986, + "step": 1188 + }, + { + "epoch": 0.08589954304910867, + "grad_norm": 9.042432995738585, + "learning_rate": 4.99780723856049e-06, + "loss": 1.0258, + "step": 1189 + }, + { + "epoch": 0.08597178824931818, + "grad_norm": 6.668155821738742, + "learning_rate": 4.997794973191168e-06, + "loss": 0.9792, + "step": 1190 + }, + { + "epoch": 0.0860440334495277, + "grad_norm": 8.493749227516751, + "learning_rate": 4.997782673628973e-06, + "loss": 1.0146, + "step": 1191 + }, + { + "epoch": 0.08611627864973721, + "grad_norm": 8.381222363454155, + "learning_rate": 4.997770339874071e-06, + "loss": 1.0213, + "step": 1192 + }, + { + "epoch": 0.08618852384994671, + "grad_norm": 8.04084127930678, + "learning_rate": 4.997757971926634e-06, + "loss": 1.0865, + "step": 1193 + }, + { + "epoch": 0.08626076905015623, + "grad_norm": 8.849180246092404, + "learning_rate": 4.9977455697868284e-06, + "loss": 0.9883, + "step": 1194 + }, + { + "epoch": 0.08633301425036574, + "grad_norm": 9.541466812195212, + "learning_rate": 4.997733133454826e-06, + "loss": 1.1456, + "step": 1195 + }, + { + "epoch": 0.08640525945057526, + "grad_norm": 7.734661551667768, + "learning_rate": 4.997720662930796e-06, + "loss": 0.9471, + "step": 1196 + }, + { + "epoch": 0.08647750465078477, + "grad_norm": 12.614228440743945, + "learning_rate": 4.99770815821491e-06, + "loss": 1.063, + "step": 1197 + }, + { + "epoch": 0.08654974985099427, + "grad_norm": 9.355804730397077, + "learning_rate": 4.997695619307338e-06, + "loss": 0.9665, + "step": 1198 + }, + { + "epoch": 0.08662199505120378, + "grad_norm": 7.6980746351525395, + "learning_rate": 4.9976830462082525e-06, + "loss": 1.0062, + "step": 1199 + }, + { + "epoch": 0.0866942402514133, + "grad_norm": 9.033183065856107, + "learning_rate": 4.997670438917826e-06, + "loss": 0.9619, + "step": 1200 + }, + { + "epoch": 0.08676648545162281, + "grad_norm": 8.392159717650904, + "learning_rate": 4.997657797436231e-06, + "loss": 1.01, + "step": 1201 + }, + { + "epoch": 0.08683873065183231, + "grad_norm": 14.59124198453878, + "learning_rate": 4.997645121763638e-06, + "loss": 1.1509, + "step": 1202 + }, + { + "epoch": 0.08691097585204183, + "grad_norm": 10.620290273374012, + "learning_rate": 4.997632411900224e-06, + "loss": 1.105, + "step": 1203 + }, + { + "epoch": 0.08698322105225134, + "grad_norm": 6.610973841135195, + "learning_rate": 4.997619667846162e-06, + "loss": 1.0147, + "step": 1204 + }, + { + "epoch": 0.08705546625246086, + "grad_norm": 10.12523999341825, + "learning_rate": 4.997606889601625e-06, + "loss": 1.0926, + "step": 1205 + }, + { + "epoch": 0.08712771145267037, + "grad_norm": 9.81527379187025, + "learning_rate": 4.997594077166789e-06, + "loss": 1.1371, + "step": 1206 + }, + { + "epoch": 0.08719995665287987, + "grad_norm": 11.050979736530469, + "learning_rate": 4.99758123054183e-06, + "loss": 0.9712, + "step": 1207 + }, + { + "epoch": 0.08727220185308938, + "grad_norm": 7.89698604600767, + "learning_rate": 4.9975683497269225e-06, + "loss": 1.0224, + "step": 1208 + }, + { + "epoch": 0.0873444470532989, + "grad_norm": 9.178702573491375, + "learning_rate": 4.997555434722244e-06, + "loss": 1.1017, + "step": 1209 + }, + { + "epoch": 0.08741669225350841, + "grad_norm": 10.762319203983754, + "learning_rate": 4.997542485527971e-06, + "loss": 0.999, + "step": 1210 + }, + { + "epoch": 0.08748893745371791, + "grad_norm": 7.619951671937692, + "learning_rate": 4.997529502144281e-06, + "loss": 1.0271, + "step": 1211 + }, + { + "epoch": 0.08756118265392743, + "grad_norm": 7.020186065207725, + "learning_rate": 4.99751648457135e-06, + "loss": 0.9666, + "step": 1212 + }, + { + "epoch": 0.08763342785413694, + "grad_norm": 8.187727830352312, + "learning_rate": 4.997503432809358e-06, + "loss": 1.0493, + "step": 1213 + }, + { + "epoch": 0.08770567305434646, + "grad_norm": 12.288750219530375, + "learning_rate": 4.9974903468584835e-06, + "loss": 1.1465, + "step": 1214 + }, + { + "epoch": 0.08777791825455596, + "grad_norm": 12.846954317249368, + "learning_rate": 4.997477226718905e-06, + "loss": 1.0951, + "step": 1215 + }, + { + "epoch": 0.08785016345476547, + "grad_norm": 7.384135132847736, + "learning_rate": 4.997464072390803e-06, + "loss": 0.9689, + "step": 1216 + }, + { + "epoch": 0.08792240865497498, + "grad_norm": 8.630582467907457, + "learning_rate": 4.997450883874356e-06, + "loss": 0.991, + "step": 1217 + }, + { + "epoch": 0.0879946538551845, + "grad_norm": 10.715823827875232, + "learning_rate": 4.997437661169746e-06, + "loss": 1.0127, + "step": 1218 + }, + { + "epoch": 0.08806689905539401, + "grad_norm": 7.665405280241323, + "learning_rate": 4.997424404277154e-06, + "loss": 1.0252, + "step": 1219 + }, + { + "epoch": 0.08813914425560351, + "grad_norm": 8.277917500908998, + "learning_rate": 4.9974111131967604e-06, + "loss": 1.0161, + "step": 1220 + }, + { + "epoch": 0.08821138945581303, + "grad_norm": 7.94931832515951, + "learning_rate": 4.997397787928748e-06, + "loss": 1.0223, + "step": 1221 + }, + { + "epoch": 0.08828363465602254, + "grad_norm": 12.318375942641863, + "learning_rate": 4.9973844284733e-06, + "loss": 0.9923, + "step": 1222 + }, + { + "epoch": 0.08835587985623206, + "grad_norm": 11.634357582917396, + "learning_rate": 4.997371034830597e-06, + "loss": 1.0476, + "step": 1223 + }, + { + "epoch": 0.08842812505644156, + "grad_norm": 8.727590584023964, + "learning_rate": 4.997357607000824e-06, + "loss": 0.991, + "step": 1224 + }, + { + "epoch": 0.08850037025665107, + "grad_norm": 9.445898096249321, + "learning_rate": 4.997344144984164e-06, + "loss": 0.9266, + "step": 1225 + }, + { + "epoch": 0.08857261545686058, + "grad_norm": 15.154415058865427, + "learning_rate": 4.997330648780802e-06, + "loss": 1.0491, + "step": 1226 + }, + { + "epoch": 0.0886448606570701, + "grad_norm": 12.147092218520216, + "learning_rate": 4.997317118390923e-06, + "loss": 1.0141, + "step": 1227 + }, + { + "epoch": 0.08871710585727961, + "grad_norm": 8.807405771470952, + "learning_rate": 4.997303553814711e-06, + "loss": 1.0545, + "step": 1228 + }, + { + "epoch": 0.08878935105748911, + "grad_norm": 8.773896637585334, + "learning_rate": 4.997289955052353e-06, + "loss": 0.9648, + "step": 1229 + }, + { + "epoch": 0.08886159625769863, + "grad_norm": 12.582803138190359, + "learning_rate": 4.997276322104034e-06, + "loss": 0.982, + "step": 1230 + }, + { + "epoch": 0.08893384145790814, + "grad_norm": 8.20721624166806, + "learning_rate": 4.997262654969942e-06, + "loss": 1.1456, + "step": 1231 + }, + { + "epoch": 0.08900608665811766, + "grad_norm": 8.216809642030134, + "learning_rate": 4.997248953650262e-06, + "loss": 1.0748, + "step": 1232 + }, + { + "epoch": 0.08907833185832716, + "grad_norm": 8.933539947094216, + "learning_rate": 4.997235218145184e-06, + "loss": 1.0812, + "step": 1233 + }, + { + "epoch": 0.08915057705853667, + "grad_norm": 9.513865341070755, + "learning_rate": 4.997221448454894e-06, + "loss": 0.9748, + "step": 1234 + }, + { + "epoch": 0.08922282225874618, + "grad_norm": 12.238541206742763, + "learning_rate": 4.997207644579581e-06, + "loss": 1.0394, + "step": 1235 + }, + { + "epoch": 0.0892950674589557, + "grad_norm": 9.030921798075168, + "learning_rate": 4.997193806519436e-06, + "loss": 1.0685, + "step": 1236 + }, + { + "epoch": 0.08936731265916521, + "grad_norm": 10.022809527045283, + "learning_rate": 4.997179934274645e-06, + "loss": 0.9943, + "step": 1237 + }, + { + "epoch": 0.08943955785937471, + "grad_norm": 11.31666176467662, + "learning_rate": 4.997166027845401e-06, + "loss": 0.982, + "step": 1238 + }, + { + "epoch": 0.08951180305958423, + "grad_norm": 11.069126364428211, + "learning_rate": 4.997152087231892e-06, + "loss": 1.0336, + "step": 1239 + }, + { + "epoch": 0.08958404825979374, + "grad_norm": 9.509747774487765, + "learning_rate": 4.9971381124343095e-06, + "loss": 1.0249, + "step": 1240 + }, + { + "epoch": 0.08965629346000326, + "grad_norm": 9.540294520792166, + "learning_rate": 4.9971241034528465e-06, + "loss": 0.9429, + "step": 1241 + }, + { + "epoch": 0.08972853866021276, + "grad_norm": 10.411457030598552, + "learning_rate": 4.997110060287692e-06, + "loss": 1.1491, + "step": 1242 + }, + { + "epoch": 0.08980078386042227, + "grad_norm": 10.482388940755511, + "learning_rate": 4.997095982939041e-06, + "loss": 1.0594, + "step": 1243 + }, + { + "epoch": 0.08987302906063178, + "grad_norm": 10.118509355007921, + "learning_rate": 4.997081871407084e-06, + "loss": 1.0699, + "step": 1244 + }, + { + "epoch": 0.0899452742608413, + "grad_norm": 9.172298389131146, + "learning_rate": 4.9970677256920154e-06, + "loss": 1.0776, + "step": 1245 + }, + { + "epoch": 0.09001751946105081, + "grad_norm": 8.362438289488766, + "learning_rate": 4.9970535457940285e-06, + "loss": 1.0118, + "step": 1246 + }, + { + "epoch": 0.09008976466126031, + "grad_norm": 8.090159205130359, + "learning_rate": 4.997039331713317e-06, + "loss": 0.979, + "step": 1247 + }, + { + "epoch": 0.09016200986146983, + "grad_norm": 7.958686487906969, + "learning_rate": 4.997025083450076e-06, + "loss": 0.9991, + "step": 1248 + }, + { + "epoch": 0.09023425506167934, + "grad_norm": 8.477703696222079, + "learning_rate": 4.997010801004501e-06, + "loss": 1.0602, + "step": 1249 + }, + { + "epoch": 0.09030650026188886, + "grad_norm": 8.156121848550232, + "learning_rate": 4.996996484376786e-06, + "loss": 1.0094, + "step": 1250 + }, + { + "epoch": 0.09037874546209836, + "grad_norm": 8.01067069799641, + "learning_rate": 4.9969821335671284e-06, + "loss": 1.0767, + "step": 1251 + }, + { + "epoch": 0.09045099066230787, + "grad_norm": 9.088012143570214, + "learning_rate": 4.996967748575724e-06, + "loss": 1.0258, + "step": 1252 + }, + { + "epoch": 0.09052323586251738, + "grad_norm": 11.172493358651913, + "learning_rate": 4.99695332940277e-06, + "loss": 1.0625, + "step": 1253 + }, + { + "epoch": 0.0905954810627269, + "grad_norm": 8.543679597197587, + "learning_rate": 4.996938876048464e-06, + "loss": 0.9319, + "step": 1254 + }, + { + "epoch": 0.09066772626293641, + "grad_norm": 8.44583283549682, + "learning_rate": 4.996924388513003e-06, + "loss": 0.9843, + "step": 1255 + }, + { + "epoch": 0.09073997146314591, + "grad_norm": 9.912610632337973, + "learning_rate": 4.996909866796587e-06, + "loss": 0.9995, + "step": 1256 + }, + { + "epoch": 0.09081221666335543, + "grad_norm": 10.974965825820584, + "learning_rate": 4.996895310899412e-06, + "loss": 0.9536, + "step": 1257 + }, + { + "epoch": 0.09088446186356494, + "grad_norm": 7.7812061231977045, + "learning_rate": 4.9968807208216795e-06, + "loss": 1.0097, + "step": 1258 + }, + { + "epoch": 0.09095670706377446, + "grad_norm": 9.898632116895977, + "learning_rate": 4.996866096563589e-06, + "loss": 1.0198, + "step": 1259 + }, + { + "epoch": 0.09102895226398396, + "grad_norm": 9.135253997595575, + "learning_rate": 4.99685143812534e-06, + "loss": 1.0731, + "step": 1260 + }, + { + "epoch": 0.09110119746419347, + "grad_norm": 9.64758306882217, + "learning_rate": 4.996836745507134e-06, + "loss": 0.9839, + "step": 1261 + }, + { + "epoch": 0.09117344266440298, + "grad_norm": 7.748854275583489, + "learning_rate": 4.996822018709171e-06, + "loss": 0.9661, + "step": 1262 + }, + { + "epoch": 0.0912456878646125, + "grad_norm": 7.718057331094688, + "learning_rate": 4.996807257731653e-06, + "loss": 1.1335, + "step": 1263 + }, + { + "epoch": 0.09131793306482201, + "grad_norm": 8.27364406021653, + "learning_rate": 4.996792462574783e-06, + "loss": 0.9964, + "step": 1264 + }, + { + "epoch": 0.09139017826503151, + "grad_norm": 7.711454457246791, + "learning_rate": 4.996777633238763e-06, + "loss": 1.0548, + "step": 1265 + }, + { + "epoch": 0.09146242346524103, + "grad_norm": 8.45389755018728, + "learning_rate": 4.996762769723795e-06, + "loss": 1.1132, + "step": 1266 + }, + { + "epoch": 0.09153466866545054, + "grad_norm": 7.993358955034686, + "learning_rate": 4.996747872030084e-06, + "loss": 1.0494, + "step": 1267 + }, + { + "epoch": 0.09160691386566006, + "grad_norm": 9.106131916943085, + "learning_rate": 4.996732940157833e-06, + "loss": 1.1574, + "step": 1268 + }, + { + "epoch": 0.09167915906586956, + "grad_norm": 7.888912203878178, + "learning_rate": 4.996717974107246e-06, + "loss": 0.9449, + "step": 1269 + }, + { + "epoch": 0.09175140426607907, + "grad_norm": 7.5295696995804375, + "learning_rate": 4.9967029738785295e-06, + "loss": 0.9847, + "step": 1270 + }, + { + "epoch": 0.09182364946628858, + "grad_norm": 8.033991126938988, + "learning_rate": 4.9966879394718875e-06, + "loss": 1.0168, + "step": 1271 + }, + { + "epoch": 0.0918958946664981, + "grad_norm": 7.349434458455348, + "learning_rate": 4.996672870887526e-06, + "loss": 1.0579, + "step": 1272 + }, + { + "epoch": 0.0919681398667076, + "grad_norm": 9.189182672361843, + "learning_rate": 4.9966577681256515e-06, + "loss": 0.9804, + "step": 1273 + }, + { + "epoch": 0.09204038506691711, + "grad_norm": 8.132016013771873, + "learning_rate": 4.996642631186471e-06, + "loss": 1.0015, + "step": 1274 + }, + { + "epoch": 0.09211263026712663, + "grad_norm": 8.625386989591531, + "learning_rate": 4.996627460070191e-06, + "loss": 1.0529, + "step": 1275 + }, + { + "epoch": 0.09218487546733614, + "grad_norm": 8.25005872300952, + "learning_rate": 4.996612254777019e-06, + "loss": 1.0277, + "step": 1276 + }, + { + "epoch": 0.09225712066754566, + "grad_norm": 8.5577829552336, + "learning_rate": 4.996597015307165e-06, + "loss": 1.067, + "step": 1277 + }, + { + "epoch": 0.09232936586775516, + "grad_norm": 8.436781110867777, + "learning_rate": 4.996581741660836e-06, + "loss": 1.0608, + "step": 1278 + }, + { + "epoch": 0.09240161106796467, + "grad_norm": 7.437232261134157, + "learning_rate": 4.996566433838241e-06, + "loss": 0.9662, + "step": 1279 + }, + { + "epoch": 0.09247385626817418, + "grad_norm": 8.37979535274034, + "learning_rate": 4.9965510918395895e-06, + "loss": 1.0893, + "step": 1280 + }, + { + "epoch": 0.0925461014683837, + "grad_norm": 8.732865040966251, + "learning_rate": 4.996535715665093e-06, + "loss": 1.0636, + "step": 1281 + }, + { + "epoch": 0.0926183466685932, + "grad_norm": 7.034580916750934, + "learning_rate": 4.996520305314961e-06, + "loss": 0.9279, + "step": 1282 + }, + { + "epoch": 0.09269059186880271, + "grad_norm": 9.531770285492126, + "learning_rate": 4.996504860789404e-06, + "loss": 1.1575, + "step": 1283 + }, + { + "epoch": 0.09276283706901223, + "grad_norm": 6.979645153742987, + "learning_rate": 4.996489382088634e-06, + "loss": 0.9101, + "step": 1284 + }, + { + "epoch": 0.09283508226922174, + "grad_norm": 8.894210441074417, + "learning_rate": 4.996473869212863e-06, + "loss": 1.1211, + "step": 1285 + }, + { + "epoch": 0.09290732746943126, + "grad_norm": 10.109754976799287, + "learning_rate": 4.996458322162302e-06, + "loss": 1.0854, + "step": 1286 + }, + { + "epoch": 0.09297957266964076, + "grad_norm": 6.680004732507183, + "learning_rate": 4.996442740937166e-06, + "loss": 1.0216, + "step": 1287 + }, + { + "epoch": 0.09305181786985027, + "grad_norm": 8.298889030651416, + "learning_rate": 4.996427125537667e-06, + "loss": 0.9832, + "step": 1288 + }, + { + "epoch": 0.09312406307005978, + "grad_norm": 12.369646860446277, + "learning_rate": 4.9964114759640196e-06, + "loss": 1.1166, + "step": 1289 + }, + { + "epoch": 0.0931963082702693, + "grad_norm": 8.553765289903952, + "learning_rate": 4.9963957922164365e-06, + "loss": 1.0469, + "step": 1290 + }, + { + "epoch": 0.0932685534704788, + "grad_norm": 7.415385217724437, + "learning_rate": 4.996380074295134e-06, + "loss": 1.0494, + "step": 1291 + }, + { + "epoch": 0.09334079867068831, + "grad_norm": 7.536417980564985, + "learning_rate": 4.996364322200326e-06, + "loss": 1.0358, + "step": 1292 + }, + { + "epoch": 0.09341304387089783, + "grad_norm": 9.503062908919361, + "learning_rate": 4.9963485359322295e-06, + "loss": 1.034, + "step": 1293 + }, + { + "epoch": 0.09348528907110734, + "grad_norm": 6.798321168886627, + "learning_rate": 4.996332715491059e-06, + "loss": 1.0092, + "step": 1294 + }, + { + "epoch": 0.09355753427131686, + "grad_norm": 7.650129090260912, + "learning_rate": 4.996316860877032e-06, + "loss": 0.9791, + "step": 1295 + }, + { + "epoch": 0.09362977947152636, + "grad_norm": 9.530319018313097, + "learning_rate": 4.996300972090366e-06, + "loss": 1.0153, + "step": 1296 + }, + { + "epoch": 0.09370202467173587, + "grad_norm": 8.788728726301189, + "learning_rate": 4.996285049131278e-06, + "loss": 0.9792, + "step": 1297 + }, + { + "epoch": 0.09377426987194538, + "grad_norm": 7.4643534085586625, + "learning_rate": 4.996269091999985e-06, + "loss": 1.0143, + "step": 1298 + }, + { + "epoch": 0.0938465150721549, + "grad_norm": 8.782940576254381, + "learning_rate": 4.996253100696707e-06, + "loss": 1.1465, + "step": 1299 + }, + { + "epoch": 0.0939187602723644, + "grad_norm": 8.896562628291957, + "learning_rate": 4.996237075221662e-06, + "loss": 1.0202, + "step": 1300 + }, + { + "epoch": 0.09399100547257391, + "grad_norm": 8.679668162107172, + "learning_rate": 4.99622101557507e-06, + "loss": 1.0216, + "step": 1301 + }, + { + "epoch": 0.09406325067278343, + "grad_norm": 8.873943857025282, + "learning_rate": 4.996204921757151e-06, + "loss": 0.9941, + "step": 1302 + }, + { + "epoch": 0.09413549587299294, + "grad_norm": 8.022917822599432, + "learning_rate": 4.996188793768123e-06, + "loss": 0.9741, + "step": 1303 + }, + { + "epoch": 0.09420774107320246, + "grad_norm": 8.346704767077245, + "learning_rate": 4.99617263160821e-06, + "loss": 1.024, + "step": 1304 + }, + { + "epoch": 0.09427998627341196, + "grad_norm": 9.57734674524792, + "learning_rate": 4.996156435277631e-06, + "loss": 0.9899, + "step": 1305 + }, + { + "epoch": 0.09435223147362147, + "grad_norm": 11.054190235613675, + "learning_rate": 4.99614020477661e-06, + "loss": 1.1201, + "step": 1306 + }, + { + "epoch": 0.09442447667383098, + "grad_norm": 7.501479702257993, + "learning_rate": 4.996123940105366e-06, + "loss": 0.9674, + "step": 1307 + }, + { + "epoch": 0.0944967218740405, + "grad_norm": 6.358825809843484, + "learning_rate": 4.996107641264125e-06, + "loss": 0.9878, + "step": 1308 + }, + { + "epoch": 0.09456896707425, + "grad_norm": 8.310144535281298, + "learning_rate": 4.996091308253107e-06, + "loss": 1.0772, + "step": 1309 + }, + { + "epoch": 0.09464121227445951, + "grad_norm": 9.475756177696857, + "learning_rate": 4.996074941072538e-06, + "loss": 0.952, + "step": 1310 + }, + { + "epoch": 0.09471345747466903, + "grad_norm": 8.688062032179559, + "learning_rate": 4.996058539722641e-06, + "loss": 0.9904, + "step": 1311 + }, + { + "epoch": 0.09478570267487854, + "grad_norm": 8.999182346077294, + "learning_rate": 4.99604210420364e-06, + "loss": 1.0451, + "step": 1312 + }, + { + "epoch": 0.09485794787508806, + "grad_norm": 7.950583660845014, + "learning_rate": 4.9960256345157615e-06, + "loss": 1.0908, + "step": 1313 + }, + { + "epoch": 0.09493019307529756, + "grad_norm": 7.500309492719102, + "learning_rate": 4.99600913065923e-06, + "loss": 0.9923, + "step": 1314 + }, + { + "epoch": 0.09500243827550707, + "grad_norm": 11.469660003066242, + "learning_rate": 4.995992592634271e-06, + "loss": 0.9773, + "step": 1315 + }, + { + "epoch": 0.09507468347571658, + "grad_norm": 7.875078957782493, + "learning_rate": 4.995976020441112e-06, + "loss": 1.0759, + "step": 1316 + }, + { + "epoch": 0.0951469286759261, + "grad_norm": 9.680657314335512, + "learning_rate": 4.995959414079979e-06, + "loss": 1.0787, + "step": 1317 + }, + { + "epoch": 0.0952191738761356, + "grad_norm": 8.795564868997676, + "learning_rate": 4.995942773551099e-06, + "loss": 1.0619, + "step": 1318 + }, + { + "epoch": 0.09529141907634511, + "grad_norm": 8.111761488017049, + "learning_rate": 4.9959260988547015e-06, + "loss": 0.983, + "step": 1319 + }, + { + "epoch": 0.09536366427655463, + "grad_norm": 7.870649559039023, + "learning_rate": 4.995909389991012e-06, + "loss": 0.9242, + "step": 1320 + }, + { + "epoch": 0.09543590947676414, + "grad_norm": 8.528398980603761, + "learning_rate": 4.995892646960263e-06, + "loss": 1.1365, + "step": 1321 + }, + { + "epoch": 0.09550815467697364, + "grad_norm": 7.686129067847314, + "learning_rate": 4.995875869762681e-06, + "loss": 1.0881, + "step": 1322 + }, + { + "epoch": 0.09558039987718316, + "grad_norm": 7.619341642519364, + "learning_rate": 4.995859058398495e-06, + "loss": 0.9283, + "step": 1323 + }, + { + "epoch": 0.09565264507739267, + "grad_norm": 8.021740935527571, + "learning_rate": 4.995842212867938e-06, + "loss": 0.9556, + "step": 1324 + }, + { + "epoch": 0.09572489027760218, + "grad_norm": 7.488387718498461, + "learning_rate": 4.995825333171238e-06, + "loss": 1.0256, + "step": 1325 + }, + { + "epoch": 0.0957971354778117, + "grad_norm": 8.910053498421126, + "learning_rate": 4.995808419308627e-06, + "loss": 1.0858, + "step": 1326 + }, + { + "epoch": 0.0958693806780212, + "grad_norm": 7.901927625907373, + "learning_rate": 4.995791471280338e-06, + "loss": 1.0606, + "step": 1327 + }, + { + "epoch": 0.09594162587823071, + "grad_norm": 7.740923827482147, + "learning_rate": 4.9957744890866e-06, + "loss": 0.9872, + "step": 1328 + }, + { + "epoch": 0.09601387107844023, + "grad_norm": 7.019898062723928, + "learning_rate": 4.995757472727648e-06, + "loss": 1.0145, + "step": 1329 + }, + { + "epoch": 0.09608611627864974, + "grad_norm": 8.088834116590762, + "learning_rate": 4.9957404222037146e-06, + "loss": 0.9568, + "step": 1330 + }, + { + "epoch": 0.09615836147885924, + "grad_norm": 7.746081192075676, + "learning_rate": 4.995723337515031e-06, + "loss": 1.0068, + "step": 1331 + }, + { + "epoch": 0.09623060667906876, + "grad_norm": 7.648225027373079, + "learning_rate": 4.995706218661833e-06, + "loss": 1.0134, + "step": 1332 + }, + { + "epoch": 0.09630285187927827, + "grad_norm": 8.003933893009854, + "learning_rate": 4.995689065644356e-06, + "loss": 0.9283, + "step": 1333 + }, + { + "epoch": 0.09637509707948778, + "grad_norm": 8.215415365128049, + "learning_rate": 4.9956718784628325e-06, + "loss": 1.0737, + "step": 1334 + }, + { + "epoch": 0.0964473422796973, + "grad_norm": 7.450292020873347, + "learning_rate": 4.995654657117499e-06, + "loss": 1.0778, + "step": 1335 + }, + { + "epoch": 0.0965195874799068, + "grad_norm": 7.755127717865784, + "learning_rate": 4.99563740160859e-06, + "loss": 0.9483, + "step": 1336 + }, + { + "epoch": 0.09659183268011631, + "grad_norm": 7.956223399559078, + "learning_rate": 4.995620111936345e-06, + "loss": 0.8622, + "step": 1337 + }, + { + "epoch": 0.09666407788032583, + "grad_norm": 8.317171777383928, + "learning_rate": 4.9956027881009964e-06, + "loss": 1.0225, + "step": 1338 + }, + { + "epoch": 0.09673632308053534, + "grad_norm": 8.530764834962548, + "learning_rate": 4.995585430102784e-06, + "loss": 1.0961, + "step": 1339 + }, + { + "epoch": 0.09680856828074484, + "grad_norm": 8.065054082340318, + "learning_rate": 4.995568037941945e-06, + "loss": 0.981, + "step": 1340 + }, + { + "epoch": 0.09688081348095436, + "grad_norm": 8.544731024654398, + "learning_rate": 4.995550611618717e-06, + "loss": 1.0443, + "step": 1341 + }, + { + "epoch": 0.09695305868116387, + "grad_norm": 8.084188461953303, + "learning_rate": 4.995533151133339e-06, + "loss": 1.0245, + "step": 1342 + }, + { + "epoch": 0.09702530388137338, + "grad_norm": 9.618345660361115, + "learning_rate": 4.995515656486049e-06, + "loss": 1.0871, + "step": 1343 + }, + { + "epoch": 0.0970975490815829, + "grad_norm": 8.163963441384682, + "learning_rate": 4.995498127677087e-06, + "loss": 1.034, + "step": 1344 + }, + { + "epoch": 0.0971697942817924, + "grad_norm": 8.217036195483784, + "learning_rate": 4.995480564706695e-06, + "loss": 1.0156, + "step": 1345 + }, + { + "epoch": 0.09724203948200191, + "grad_norm": 6.639671059699107, + "learning_rate": 4.99546296757511e-06, + "loss": 1.0039, + "step": 1346 + }, + { + "epoch": 0.09731428468221143, + "grad_norm": 8.311192000885669, + "learning_rate": 4.995445336282576e-06, + "loss": 1.0209, + "step": 1347 + }, + { + "epoch": 0.09738652988242094, + "grad_norm": 8.106137961831763, + "learning_rate": 4.995427670829331e-06, + "loss": 0.9845, + "step": 1348 + }, + { + "epoch": 0.09745877508263044, + "grad_norm": 9.874286094645337, + "learning_rate": 4.995409971215621e-06, + "loss": 1.0154, + "step": 1349 + }, + { + "epoch": 0.09753102028283996, + "grad_norm": 8.316492484287528, + "learning_rate": 4.9953922374416855e-06, + "loss": 1.0171, + "step": 1350 + }, + { + "epoch": 0.09760326548304947, + "grad_norm": 10.393345068076089, + "learning_rate": 4.995374469507767e-06, + "loss": 1.0948, + "step": 1351 + }, + { + "epoch": 0.09767551068325898, + "grad_norm": 8.017429914510137, + "learning_rate": 4.9953566674141094e-06, + "loss": 1.0048, + "step": 1352 + }, + { + "epoch": 0.0977477558834685, + "grad_norm": 8.378591649833997, + "learning_rate": 4.995338831160958e-06, + "loss": 1.0452, + "step": 1353 + }, + { + "epoch": 0.097820001083678, + "grad_norm": 8.884620717314125, + "learning_rate": 4.995320960748554e-06, + "loss": 1.0543, + "step": 1354 + }, + { + "epoch": 0.09789224628388751, + "grad_norm": 8.77345924001091, + "learning_rate": 4.995303056177145e-06, + "loss": 0.8894, + "step": 1355 + }, + { + "epoch": 0.09796449148409703, + "grad_norm": 8.825762785454705, + "learning_rate": 4.995285117446973e-06, + "loss": 1.0676, + "step": 1356 + }, + { + "epoch": 0.09803673668430654, + "grad_norm": 7.907008681252373, + "learning_rate": 4.995267144558286e-06, + "loss": 1.0341, + "step": 1357 + }, + { + "epoch": 0.09810898188451604, + "grad_norm": 9.475258582620294, + "learning_rate": 4.995249137511329e-06, + "loss": 0.9321, + "step": 1358 + }, + { + "epoch": 0.09818122708472556, + "grad_norm": 6.341514456581121, + "learning_rate": 4.995231096306349e-06, + "loss": 0.9002, + "step": 1359 + }, + { + "epoch": 0.09825347228493507, + "grad_norm": 8.712643470841671, + "learning_rate": 4.995213020943593e-06, + "loss": 1.0457, + "step": 1360 + }, + { + "epoch": 0.09832571748514458, + "grad_norm": 10.168878867019185, + "learning_rate": 4.995194911423308e-06, + "loss": 0.9368, + "step": 1361 + }, + { + "epoch": 0.0983979626853541, + "grad_norm": 8.166928137920705, + "learning_rate": 4.9951767677457415e-06, + "loss": 0.9875, + "step": 1362 + }, + { + "epoch": 0.0984702078855636, + "grad_norm": 7.388200745371383, + "learning_rate": 4.995158589911143e-06, + "loss": 0.973, + "step": 1363 + }, + { + "epoch": 0.09854245308577311, + "grad_norm": 8.513391604342344, + "learning_rate": 4.99514037791976e-06, + "loss": 1.0028, + "step": 1364 + }, + { + "epoch": 0.09861469828598263, + "grad_norm": 9.929158866730921, + "learning_rate": 4.995122131771843e-06, + "loss": 1.0452, + "step": 1365 + }, + { + "epoch": 0.09868694348619214, + "grad_norm": 9.362396720449002, + "learning_rate": 4.995103851467642e-06, + "loss": 1.1052, + "step": 1366 + }, + { + "epoch": 0.09875918868640164, + "grad_norm": 7.166559913490736, + "learning_rate": 4.995085537007407e-06, + "loss": 1.0391, + "step": 1367 + }, + { + "epoch": 0.09883143388661116, + "grad_norm": 6.747836578669614, + "learning_rate": 4.995067188391387e-06, + "loss": 1.0132, + "step": 1368 + }, + { + "epoch": 0.09890367908682067, + "grad_norm": 9.055316472034061, + "learning_rate": 4.9950488056198345e-06, + "loss": 0.9714, + "step": 1369 + }, + { + "epoch": 0.09897592428703018, + "grad_norm": 11.124379001387174, + "learning_rate": 4.995030388693002e-06, + "loss": 1.1118, + "step": 1370 + }, + { + "epoch": 0.0990481694872397, + "grad_norm": 7.101550145238153, + "learning_rate": 4.99501193761114e-06, + "loss": 0.9405, + "step": 1371 + }, + { + "epoch": 0.0991204146874492, + "grad_norm": 7.16310424386294, + "learning_rate": 4.994993452374503e-06, + "loss": 0.9823, + "step": 1372 + }, + { + "epoch": 0.09919265988765871, + "grad_norm": 11.736649090978466, + "learning_rate": 4.9949749329833415e-06, + "loss": 1.0722, + "step": 1373 + }, + { + "epoch": 0.09926490508786823, + "grad_norm": 10.330164720933597, + "learning_rate": 4.994956379437911e-06, + "loss": 0.9035, + "step": 1374 + }, + { + "epoch": 0.09933715028807774, + "grad_norm": 8.050916764082679, + "learning_rate": 4.994937791738464e-06, + "loss": 1.0588, + "step": 1375 + }, + { + "epoch": 0.09940939548828724, + "grad_norm": 9.0004446131768, + "learning_rate": 4.994919169885258e-06, + "loss": 1.0447, + "step": 1376 + }, + { + "epoch": 0.09948164068849676, + "grad_norm": 8.242970101323223, + "learning_rate": 4.994900513878543e-06, + "loss": 1.0005, + "step": 1377 + }, + { + "epoch": 0.09955388588870627, + "grad_norm": 10.394462622495173, + "learning_rate": 4.99488182371858e-06, + "loss": 1.0059, + "step": 1378 + }, + { + "epoch": 0.09962613108891578, + "grad_norm": 9.147388740190292, + "learning_rate": 4.994863099405619e-06, + "loss": 1.0263, + "step": 1379 + }, + { + "epoch": 0.09969837628912528, + "grad_norm": 8.39786289822144, + "learning_rate": 4.9948443409399215e-06, + "loss": 0.9454, + "step": 1380 + }, + { + "epoch": 0.0997706214893348, + "grad_norm": 8.673594811163488, + "learning_rate": 4.994825548321741e-06, + "loss": 1.096, + "step": 1381 + }, + { + "epoch": 0.09984286668954431, + "grad_norm": 8.880866462431195, + "learning_rate": 4.9948067215513364e-06, + "loss": 0.9099, + "step": 1382 + }, + { + "epoch": 0.09991511188975383, + "grad_norm": 9.031859486219247, + "learning_rate": 4.994787860628965e-06, + "loss": 0.9634, + "step": 1383 + }, + { + "epoch": 0.09998735708996334, + "grad_norm": 7.032215916726597, + "learning_rate": 4.994768965554884e-06, + "loss": 0.9658, + "step": 1384 + }, + { + "epoch": 0.10005960229017284, + "grad_norm": 7.552850793888761, + "learning_rate": 4.994750036329353e-06, + "loss": 0.9624, + "step": 1385 + }, + { + "epoch": 0.10013184749038236, + "grad_norm": 7.2875425465095915, + "learning_rate": 4.994731072952632e-06, + "loss": 1.0368, + "step": 1386 + }, + { + "epoch": 0.10020409269059187, + "grad_norm": 8.926347082721517, + "learning_rate": 4.994712075424979e-06, + "loss": 1.0837, + "step": 1387 + }, + { + "epoch": 0.10027633789080138, + "grad_norm": 11.117663075079069, + "learning_rate": 4.9946930437466545e-06, + "loss": 0.9865, + "step": 1388 + }, + { + "epoch": 0.10034858309101088, + "grad_norm": 6.480662117085728, + "learning_rate": 4.99467397791792e-06, + "loss": 0.9841, + "step": 1389 + }, + { + "epoch": 0.1004208282912204, + "grad_norm": 8.701462318034134, + "learning_rate": 4.9946548779390355e-06, + "loss": 1.0506, + "step": 1390 + }, + { + "epoch": 0.10049307349142991, + "grad_norm": 12.063378494767036, + "learning_rate": 4.9946357438102626e-06, + "loss": 0.988, + "step": 1391 + }, + { + "epoch": 0.10056531869163943, + "grad_norm": 8.600180921204007, + "learning_rate": 4.994616575531863e-06, + "loss": 0.942, + "step": 1392 + }, + { + "epoch": 0.10063756389184894, + "grad_norm": 8.340293368295763, + "learning_rate": 4.9945973731041e-06, + "loss": 0.9563, + "step": 1393 + }, + { + "epoch": 0.10070980909205844, + "grad_norm": 8.133341102387934, + "learning_rate": 4.994578136527235e-06, + "loss": 1.0102, + "step": 1394 + }, + { + "epoch": 0.10078205429226796, + "grad_norm": 8.282956044785204, + "learning_rate": 4.9945588658015335e-06, + "loss": 1.0685, + "step": 1395 + }, + { + "epoch": 0.10085429949247747, + "grad_norm": 8.353652849512947, + "learning_rate": 4.994539560927257e-06, + "loss": 0.9452, + "step": 1396 + }, + { + "epoch": 0.10092654469268698, + "grad_norm": 8.443610345347688, + "learning_rate": 4.994520221904671e-06, + "loss": 0.9165, + "step": 1397 + }, + { + "epoch": 0.10099878989289648, + "grad_norm": 10.985953986684933, + "learning_rate": 4.99450084873404e-06, + "loss": 0.9819, + "step": 1398 + }, + { + "epoch": 0.101071035093106, + "grad_norm": 9.649906580971878, + "learning_rate": 4.99448144141563e-06, + "loss": 1.0011, + "step": 1399 + }, + { + "epoch": 0.10114328029331551, + "grad_norm": 7.9447865119804995, + "learning_rate": 4.9944619999497045e-06, + "loss": 0.9704, + "step": 1400 + }, + { + "epoch": 0.10121552549352503, + "grad_norm": 9.706215383218614, + "learning_rate": 4.994442524336533e-06, + "loss": 1.0234, + "step": 1401 + }, + { + "epoch": 0.10128777069373454, + "grad_norm": 8.968155791373155, + "learning_rate": 4.994423014576379e-06, + "loss": 0.9934, + "step": 1402 + }, + { + "epoch": 0.10136001589394404, + "grad_norm": 7.360926503039838, + "learning_rate": 4.99440347066951e-06, + "loss": 1.0011, + "step": 1403 + }, + { + "epoch": 0.10143226109415356, + "grad_norm": 7.540922040341344, + "learning_rate": 4.994383892616195e-06, + "loss": 0.9943, + "step": 1404 + }, + { + "epoch": 0.10150450629436307, + "grad_norm": 15.34381215380105, + "learning_rate": 4.994364280416701e-06, + "loss": 1.029, + "step": 1405 + }, + { + "epoch": 0.10157675149457258, + "grad_norm": 8.401907894901852, + "learning_rate": 4.994344634071297e-06, + "loss": 1.0124, + "step": 1406 + }, + { + "epoch": 0.10164899669478208, + "grad_norm": 8.051933047005361, + "learning_rate": 4.994324953580251e-06, + "loss": 1.0348, + "step": 1407 + }, + { + "epoch": 0.1017212418949916, + "grad_norm": 7.854882353563934, + "learning_rate": 4.994305238943835e-06, + "loss": 0.97, + "step": 1408 + }, + { + "epoch": 0.10179348709520111, + "grad_norm": 9.089579776329261, + "learning_rate": 4.994285490162315e-06, + "loss": 0.9641, + "step": 1409 + }, + { + "epoch": 0.10186573229541063, + "grad_norm": 6.553201536532962, + "learning_rate": 4.994265707235965e-06, + "loss": 0.9502, + "step": 1410 + }, + { + "epoch": 0.10193797749562014, + "grad_norm": 8.825065582291062, + "learning_rate": 4.994245890165053e-06, + "loss": 1.0312, + "step": 1411 + }, + { + "epoch": 0.10201022269582964, + "grad_norm": 9.162462646075161, + "learning_rate": 4.994226038949851e-06, + "loss": 0.9768, + "step": 1412 + }, + { + "epoch": 0.10208246789603916, + "grad_norm": 8.578237953206704, + "learning_rate": 4.994206153590632e-06, + "loss": 1.0096, + "step": 1413 + }, + { + "epoch": 0.10215471309624867, + "grad_norm": 9.680093013221976, + "learning_rate": 4.994186234087667e-06, + "loss": 1.0235, + "step": 1414 + }, + { + "epoch": 0.10222695829645818, + "grad_norm": 11.306352662758577, + "learning_rate": 4.99416628044123e-06, + "loss": 0.9626, + "step": 1415 + }, + { + "epoch": 0.10229920349666768, + "grad_norm": 9.732876731230702, + "learning_rate": 4.994146292651592e-06, + "loss": 0.9991, + "step": 1416 + }, + { + "epoch": 0.1023714486968772, + "grad_norm": 7.584511997162864, + "learning_rate": 4.9941262707190285e-06, + "loss": 1.0046, + "step": 1417 + }, + { + "epoch": 0.10244369389708671, + "grad_norm": 7.520035240821239, + "learning_rate": 4.994106214643812e-06, + "loss": 1.0152, + "step": 1418 + }, + { + "epoch": 0.10251593909729623, + "grad_norm": 10.015300489575157, + "learning_rate": 4.99408612442622e-06, + "loss": 1.0038, + "step": 1419 + }, + { + "epoch": 0.10258818429750574, + "grad_norm": 7.898913934150145, + "learning_rate": 4.994066000066524e-06, + "loss": 0.9398, + "step": 1420 + }, + { + "epoch": 0.10266042949771524, + "grad_norm": 10.329855631053052, + "learning_rate": 4.994045841565e-06, + "loss": 1.022, + "step": 1421 + }, + { + "epoch": 0.10273267469792476, + "grad_norm": 6.554213169108347, + "learning_rate": 4.994025648921927e-06, + "loss": 0.9099, + "step": 1422 + }, + { + "epoch": 0.10280491989813427, + "grad_norm": 7.41804974407797, + "learning_rate": 4.994005422137579e-06, + "loss": 1.0025, + "step": 1423 + }, + { + "epoch": 0.10287716509834378, + "grad_norm": 9.413528819805492, + "learning_rate": 4.993985161212232e-06, + "loss": 0.9881, + "step": 1424 + }, + { + "epoch": 0.10294941029855328, + "grad_norm": 8.292607188597263, + "learning_rate": 4.993964866146165e-06, + "loss": 0.9254, + "step": 1425 + }, + { + "epoch": 0.1030216554987628, + "grad_norm": 7.244104816742181, + "learning_rate": 4.993944536939656e-06, + "loss": 0.8753, + "step": 1426 + }, + { + "epoch": 0.10309390069897231, + "grad_norm": 7.265388731037448, + "learning_rate": 4.9939241735929824e-06, + "loss": 0.9981, + "step": 1427 + }, + { + "epoch": 0.10316614589918183, + "grad_norm": 7.392203487039493, + "learning_rate": 4.993903776106424e-06, + "loss": 1.0287, + "step": 1428 + }, + { + "epoch": 0.10323839109939133, + "grad_norm": 7.400501708472837, + "learning_rate": 4.993883344480258e-06, + "loss": 0.9884, + "step": 1429 + }, + { + "epoch": 0.10331063629960084, + "grad_norm": 6.145909954390844, + "learning_rate": 4.993862878714766e-06, + "loss": 0.9843, + "step": 1430 + }, + { + "epoch": 0.10338288149981036, + "grad_norm": 6.527546247700521, + "learning_rate": 4.993842378810227e-06, + "loss": 0.9275, + "step": 1431 + }, + { + "epoch": 0.10345512670001987, + "grad_norm": 7.449947167535638, + "learning_rate": 4.9938218447669235e-06, + "loss": 1.0013, + "step": 1432 + }, + { + "epoch": 0.10352737190022938, + "grad_norm": 9.287331383716047, + "learning_rate": 4.993801276585135e-06, + "loss": 1.0307, + "step": 1433 + }, + { + "epoch": 0.10359961710043888, + "grad_norm": 7.1047511899739, + "learning_rate": 4.993780674265142e-06, + "loss": 1.0046, + "step": 1434 + }, + { + "epoch": 0.1036718623006484, + "grad_norm": 6.718493363557925, + "learning_rate": 4.993760037807229e-06, + "loss": 0.9829, + "step": 1435 + }, + { + "epoch": 0.10374410750085791, + "grad_norm": 7.66339301277585, + "learning_rate": 4.993739367211677e-06, + "loss": 1.0881, + "step": 1436 + }, + { + "epoch": 0.10381635270106743, + "grad_norm": 9.527419824675777, + "learning_rate": 4.9937186624787696e-06, + "loss": 0.9727, + "step": 1437 + }, + { + "epoch": 0.10388859790127693, + "grad_norm": 7.945239041419268, + "learning_rate": 4.993697923608789e-06, + "loss": 0.9445, + "step": 1438 + }, + { + "epoch": 0.10396084310148644, + "grad_norm": 9.503988532961726, + "learning_rate": 4.9936771506020215e-06, + "loss": 0.9497, + "step": 1439 + }, + { + "epoch": 0.10403308830169596, + "grad_norm": 14.7170367691546, + "learning_rate": 4.9936563434587495e-06, + "loss": 1.0142, + "step": 1440 + }, + { + "epoch": 0.10410533350190547, + "grad_norm": 8.376181903397788, + "learning_rate": 4.993635502179259e-06, + "loss": 0.9569, + "step": 1441 + }, + { + "epoch": 0.10417757870211498, + "grad_norm": 8.344628677106552, + "learning_rate": 4.993614626763833e-06, + "loss": 1.0993, + "step": 1442 + }, + { + "epoch": 0.10424982390232448, + "grad_norm": 7.8335821504507965, + "learning_rate": 4.993593717212759e-06, + "loss": 1.039, + "step": 1443 + }, + { + "epoch": 0.104322069102534, + "grad_norm": 11.455831210592532, + "learning_rate": 4.993572773526324e-06, + "loss": 0.9481, + "step": 1444 + }, + { + "epoch": 0.10439431430274351, + "grad_norm": 10.58412151455487, + "learning_rate": 4.993551795704814e-06, + "loss": 1.0068, + "step": 1445 + }, + { + "epoch": 0.10446655950295303, + "grad_norm": 7.081812556951566, + "learning_rate": 4.9935307837485155e-06, + "loss": 0.924, + "step": 1446 + }, + { + "epoch": 0.10453880470316253, + "grad_norm": 6.487616553757165, + "learning_rate": 4.993509737657718e-06, + "loss": 0.9241, + "step": 1447 + }, + { + "epoch": 0.10461104990337204, + "grad_norm": 9.430704461783744, + "learning_rate": 4.993488657432707e-06, + "loss": 1.065, + "step": 1448 + }, + { + "epoch": 0.10468329510358156, + "grad_norm": 7.699224200118568, + "learning_rate": 4.9934675430737726e-06, + "loss": 1.0432, + "step": 1449 + }, + { + "epoch": 0.10475554030379107, + "grad_norm": 6.956665783406695, + "learning_rate": 4.993446394581203e-06, + "loss": 0.9319, + "step": 1450 + }, + { + "epoch": 0.10482778550400058, + "grad_norm": 9.905782116932743, + "learning_rate": 4.993425211955289e-06, + "loss": 1.0024, + "step": 1451 + }, + { + "epoch": 0.10490003070421008, + "grad_norm": 8.032682892147603, + "learning_rate": 4.99340399519632e-06, + "loss": 1.0414, + "step": 1452 + }, + { + "epoch": 0.1049722759044196, + "grad_norm": 8.574632944864764, + "learning_rate": 4.993382744304586e-06, + "loss": 1.003, + "step": 1453 + }, + { + "epoch": 0.10504452110462911, + "grad_norm": 7.791184544734351, + "learning_rate": 4.9933614592803785e-06, + "loss": 0.9943, + "step": 1454 + }, + { + "epoch": 0.10511676630483863, + "grad_norm": 9.42273815749783, + "learning_rate": 4.993340140123988e-06, + "loss": 0.9563, + "step": 1455 + }, + { + "epoch": 0.10518901150504813, + "grad_norm": 8.074581586549954, + "learning_rate": 4.993318786835708e-06, + "loss": 1.1059, + "step": 1456 + }, + { + "epoch": 0.10526125670525764, + "grad_norm": 7.6041906347724435, + "learning_rate": 4.9932973994158285e-06, + "loss": 0.8899, + "step": 1457 + }, + { + "epoch": 0.10533350190546716, + "grad_norm": 8.143466094916986, + "learning_rate": 4.993275977864644e-06, + "loss": 0.9596, + "step": 1458 + }, + { + "epoch": 0.10540574710567667, + "grad_norm": 6.933076290109782, + "learning_rate": 4.993254522182448e-06, + "loss": 1.0132, + "step": 1459 + }, + { + "epoch": 0.10547799230588618, + "grad_norm": 8.830212695626306, + "learning_rate": 4.993233032369533e-06, + "loss": 1.0578, + "step": 1460 + }, + { + "epoch": 0.10555023750609568, + "grad_norm": 8.167438652244876, + "learning_rate": 4.993211508426194e-06, + "loss": 0.9878, + "step": 1461 + }, + { + "epoch": 0.1056224827063052, + "grad_norm": 8.374200583494382, + "learning_rate": 4.993189950352724e-06, + "loss": 1.036, + "step": 1462 + }, + { + "epoch": 0.10569472790651471, + "grad_norm": 10.479860155177182, + "learning_rate": 4.9931683581494205e-06, + "loss": 1.0579, + "step": 1463 + }, + { + "epoch": 0.10576697310672423, + "grad_norm": 7.9204208574316235, + "learning_rate": 4.993146731816577e-06, + "loss": 0.9765, + "step": 1464 + }, + { + "epoch": 0.10583921830693373, + "grad_norm": 8.021256340729387, + "learning_rate": 4.9931250713544914e-06, + "loss": 1.0277, + "step": 1465 + }, + { + "epoch": 0.10591146350714324, + "grad_norm": 9.867223454135802, + "learning_rate": 4.99310337676346e-06, + "loss": 1.0412, + "step": 1466 + }, + { + "epoch": 0.10598370870735276, + "grad_norm": 7.379761791158186, + "learning_rate": 4.993081648043778e-06, + "loss": 0.9779, + "step": 1467 + }, + { + "epoch": 0.10605595390756227, + "grad_norm": 8.5696319059466, + "learning_rate": 4.993059885195745e-06, + "loss": 1.0107, + "step": 1468 + }, + { + "epoch": 0.10612819910777178, + "grad_norm": 7.747172178144261, + "learning_rate": 4.993038088219656e-06, + "loss": 0.9793, + "step": 1469 + }, + { + "epoch": 0.10620044430798128, + "grad_norm": 8.144520477360981, + "learning_rate": 4.9930162571158134e-06, + "loss": 0.9422, + "step": 1470 + }, + { + "epoch": 0.1062726895081908, + "grad_norm": 10.748615264420765, + "learning_rate": 4.9929943918845124e-06, + "loss": 1.0039, + "step": 1471 + }, + { + "epoch": 0.10634493470840031, + "grad_norm": 9.983402592880994, + "learning_rate": 4.992972492526055e-06, + "loss": 1.1095, + "step": 1472 + }, + { + "epoch": 0.10641717990860983, + "grad_norm": 9.782962344499069, + "learning_rate": 4.992950559040739e-06, + "loss": 1.0342, + "step": 1473 + }, + { + "epoch": 0.10648942510881933, + "grad_norm": 7.0731089083165015, + "learning_rate": 4.9929285914288665e-06, + "loss": 1.0367, + "step": 1474 + }, + { + "epoch": 0.10656167030902884, + "grad_norm": 8.764652300225473, + "learning_rate": 4.992906589690736e-06, + "loss": 0.9446, + "step": 1475 + }, + { + "epoch": 0.10663391550923836, + "grad_norm": 8.445559121341246, + "learning_rate": 4.992884553826651e-06, + "loss": 0.9871, + "step": 1476 + }, + { + "epoch": 0.10670616070944787, + "grad_norm": 8.442009519020829, + "learning_rate": 4.992862483836911e-06, + "loss": 0.9809, + "step": 1477 + }, + { + "epoch": 0.10677840590965738, + "grad_norm": 10.74935023983372, + "learning_rate": 4.99284037972182e-06, + "loss": 1.011, + "step": 1478 + }, + { + "epoch": 0.10685065110986688, + "grad_norm": 9.651627999185836, + "learning_rate": 4.992818241481679e-06, + "loss": 0.9567, + "step": 1479 + }, + { + "epoch": 0.1069228963100764, + "grad_norm": 7.363094483291182, + "learning_rate": 4.992796069116793e-06, + "loss": 1.0151, + "step": 1480 + }, + { + "epoch": 0.10699514151028591, + "grad_norm": 8.821183924374138, + "learning_rate": 4.9927738626274635e-06, + "loss": 0.9477, + "step": 1481 + }, + { + "epoch": 0.10706738671049543, + "grad_norm": 7.48439946499133, + "learning_rate": 4.992751622013996e-06, + "loss": 0.8696, + "step": 1482 + }, + { + "epoch": 0.10713963191070493, + "grad_norm": 13.164987448959076, + "learning_rate": 4.992729347276694e-06, + "loss": 1.0181, + "step": 1483 + }, + { + "epoch": 0.10721187711091444, + "grad_norm": 8.296832700574848, + "learning_rate": 4.992707038415862e-06, + "loss": 0.956, + "step": 1484 + }, + { + "epoch": 0.10728412231112396, + "grad_norm": 7.988146344750231, + "learning_rate": 4.992684695431806e-06, + "loss": 1.0046, + "step": 1485 + }, + { + "epoch": 0.10735636751133347, + "grad_norm": 8.364744825375867, + "learning_rate": 4.992662318324833e-06, + "loss": 0.8697, + "step": 1486 + }, + { + "epoch": 0.10742861271154297, + "grad_norm": 7.600957769978471, + "learning_rate": 4.992639907095248e-06, + "loss": 1.0109, + "step": 1487 + }, + { + "epoch": 0.10750085791175248, + "grad_norm": 8.250783709668463, + "learning_rate": 4.992617461743358e-06, + "loss": 0.9849, + "step": 1488 + }, + { + "epoch": 0.107573103111962, + "grad_norm": 7.3883476377773665, + "learning_rate": 4.992594982269471e-06, + "loss": 0.9599, + "step": 1489 + }, + { + "epoch": 0.10764534831217151, + "grad_norm": 7.198147524836278, + "learning_rate": 4.992572468673893e-06, + "loss": 0.9363, + "step": 1490 + }, + { + "epoch": 0.10771759351238103, + "grad_norm": 9.380115791696017, + "learning_rate": 4.992549920956934e-06, + "loss": 0.9126, + "step": 1491 + }, + { + "epoch": 0.10778983871259053, + "grad_norm": 7.710786858568422, + "learning_rate": 4.992527339118901e-06, + "loss": 0.9649, + "step": 1492 + }, + { + "epoch": 0.10786208391280004, + "grad_norm": 6.914757093464751, + "learning_rate": 4.992504723160105e-06, + "loss": 0.9131, + "step": 1493 + }, + { + "epoch": 0.10793432911300956, + "grad_norm": 7.364395275889326, + "learning_rate": 4.992482073080854e-06, + "loss": 0.9457, + "step": 1494 + }, + { + "epoch": 0.10800657431321907, + "grad_norm": 8.331228626220941, + "learning_rate": 4.992459388881459e-06, + "loss": 0.9429, + "step": 1495 + }, + { + "epoch": 0.10807881951342857, + "grad_norm": 7.948311478860792, + "learning_rate": 4.99243667056223e-06, + "loss": 1.0276, + "step": 1496 + }, + { + "epoch": 0.10815106471363808, + "grad_norm": 9.320333373693098, + "learning_rate": 4.9924139181234785e-06, + "loss": 1.0976, + "step": 1497 + }, + { + "epoch": 0.1082233099138476, + "grad_norm": 8.28881674500206, + "learning_rate": 4.9923911315655164e-06, + "loss": 0.9506, + "step": 1498 + }, + { + "epoch": 0.10829555511405711, + "grad_norm": 8.472186190280135, + "learning_rate": 4.992368310888653e-06, + "loss": 0.9617, + "step": 1499 + }, + { + "epoch": 0.10836780031426663, + "grad_norm": 7.055075194392697, + "learning_rate": 4.9923454560932035e-06, + "loss": 0.9138, + "step": 1500 + }, + { + "epoch": 0.10844004551447613, + "grad_norm": 7.691898731217709, + "learning_rate": 4.99232256717948e-06, + "loss": 0.9997, + "step": 1501 + }, + { + "epoch": 0.10851229071468564, + "grad_norm": 9.093454257835988, + "learning_rate": 4.992299644147797e-06, + "loss": 1.0152, + "step": 1502 + }, + { + "epoch": 0.10858453591489516, + "grad_norm": 8.698946869348163, + "learning_rate": 4.9922766869984655e-06, + "loss": 1.012, + "step": 1503 + }, + { + "epoch": 0.10865678111510467, + "grad_norm": 12.381943113210829, + "learning_rate": 4.992253695731802e-06, + "loss": 1.0386, + "step": 1504 + }, + { + "epoch": 0.10872902631531417, + "grad_norm": 7.669770925351244, + "learning_rate": 4.99223067034812e-06, + "loss": 0.9479, + "step": 1505 + }, + { + "epoch": 0.10880127151552368, + "grad_norm": 7.290797706778859, + "learning_rate": 4.992207610847736e-06, + "loss": 1.0017, + "step": 1506 + }, + { + "epoch": 0.1088735167157332, + "grad_norm": 9.725887511092136, + "learning_rate": 4.992184517230964e-06, + "loss": 1.0379, + "step": 1507 + }, + { + "epoch": 0.10894576191594271, + "grad_norm": 7.677201482794487, + "learning_rate": 4.992161389498121e-06, + "loss": 1.1857, + "step": 1508 + }, + { + "epoch": 0.10901800711615223, + "grad_norm": 8.586014362855455, + "learning_rate": 4.992138227649524e-06, + "loss": 1.0316, + "step": 1509 + }, + { + "epoch": 0.10909025231636173, + "grad_norm": 9.32899642273332, + "learning_rate": 4.992115031685489e-06, + "loss": 0.9922, + "step": 1510 + }, + { + "epoch": 0.10916249751657124, + "grad_norm": 8.378824755576881, + "learning_rate": 4.992091801606336e-06, + "loss": 1.0567, + "step": 1511 + }, + { + "epoch": 0.10923474271678076, + "grad_norm": 6.992545927440397, + "learning_rate": 4.992068537412379e-06, + "loss": 1.0289, + "step": 1512 + }, + { + "epoch": 0.10930698791699027, + "grad_norm": 8.441987829196437, + "learning_rate": 4.99204523910394e-06, + "loss": 1.0336, + "step": 1513 + }, + { + "epoch": 0.10937923311719977, + "grad_norm": 7.632226989841359, + "learning_rate": 4.992021906681337e-06, + "loss": 0.9982, + "step": 1514 + }, + { + "epoch": 0.10945147831740928, + "grad_norm": 7.751524498417105, + "learning_rate": 4.9919985401448875e-06, + "loss": 0.9477, + "step": 1515 + }, + { + "epoch": 0.1095237235176188, + "grad_norm": 7.380832499454907, + "learning_rate": 4.991975139494915e-06, + "loss": 0.9473, + "step": 1516 + }, + { + "epoch": 0.10959596871782831, + "grad_norm": 10.987194931082877, + "learning_rate": 4.991951704731736e-06, + "loss": 1.0503, + "step": 1517 + }, + { + "epoch": 0.10966821391803783, + "grad_norm": 7.687961967667933, + "learning_rate": 4.991928235855673e-06, + "loss": 0.8783, + "step": 1518 + }, + { + "epoch": 0.10974045911824733, + "grad_norm": 7.138661187577499, + "learning_rate": 4.9919047328670486e-06, + "loss": 1.0082, + "step": 1519 + }, + { + "epoch": 0.10981270431845684, + "grad_norm": 10.544927482229015, + "learning_rate": 4.991881195766182e-06, + "loss": 0.9466, + "step": 1520 + }, + { + "epoch": 0.10988494951866636, + "grad_norm": 7.105324331098311, + "learning_rate": 4.991857624553397e-06, + "loss": 0.9767, + "step": 1521 + }, + { + "epoch": 0.10995719471887587, + "grad_norm": 7.481165570106941, + "learning_rate": 4.991834019229017e-06, + "loss": 1.0615, + "step": 1522 + }, + { + "epoch": 0.11002943991908537, + "grad_norm": 7.557588858427491, + "learning_rate": 4.991810379793362e-06, + "loss": 0.9785, + "step": 1523 + }, + { + "epoch": 0.11010168511929488, + "grad_norm": 6.603136080485704, + "learning_rate": 4.991786706246759e-06, + "loss": 1.0123, + "step": 1524 + }, + { + "epoch": 0.1101739303195044, + "grad_norm": 7.622462616525714, + "learning_rate": 4.991762998589531e-06, + "loss": 1.1021, + "step": 1525 + }, + { + "epoch": 0.11024617551971391, + "grad_norm": 7.02023958889567, + "learning_rate": 4.9917392568220015e-06, + "loss": 0.9431, + "step": 1526 + }, + { + "epoch": 0.11031842071992343, + "grad_norm": 8.790957261039875, + "learning_rate": 4.991715480944497e-06, + "loss": 1.016, + "step": 1527 + }, + { + "epoch": 0.11039066592013293, + "grad_norm": 6.902865171572185, + "learning_rate": 4.991691670957342e-06, + "loss": 1.0098, + "step": 1528 + }, + { + "epoch": 0.11046291112034244, + "grad_norm": 6.812149178807388, + "learning_rate": 4.991667826860862e-06, + "loss": 1.0107, + "step": 1529 + }, + { + "epoch": 0.11053515632055196, + "grad_norm": 7.115902898871099, + "learning_rate": 4.9916439486553845e-06, + "loss": 0.986, + "step": 1530 + }, + { + "epoch": 0.11060740152076147, + "grad_norm": 7.999283520085833, + "learning_rate": 4.9916200363412374e-06, + "loss": 1.0289, + "step": 1531 + }, + { + "epoch": 0.11067964672097097, + "grad_norm": 6.163159825283003, + "learning_rate": 4.991596089918745e-06, + "loss": 0.9766, + "step": 1532 + }, + { + "epoch": 0.11075189192118048, + "grad_norm": 9.967665849525302, + "learning_rate": 4.991572109388237e-06, + "loss": 1.0656, + "step": 1533 + }, + { + "epoch": 0.11082413712139, + "grad_norm": 6.507305148384349, + "learning_rate": 4.9915480947500415e-06, + "loss": 0.8628, + "step": 1534 + }, + { + "epoch": 0.11089638232159951, + "grad_norm": 7.55850341112035, + "learning_rate": 4.9915240460044865e-06, + "loss": 1.1237, + "step": 1535 + }, + { + "epoch": 0.11096862752180901, + "grad_norm": 7.738289630390464, + "learning_rate": 4.991499963151903e-06, + "loss": 1.0563, + "step": 1536 + }, + { + "epoch": 0.11104087272201853, + "grad_norm": 7.868684446320839, + "learning_rate": 4.99147584619262e-06, + "loss": 1.049, + "step": 1537 + }, + { + "epoch": 0.11111311792222804, + "grad_norm": 7.669400874764079, + "learning_rate": 4.991451695126965e-06, + "loss": 0.95, + "step": 1538 + }, + { + "epoch": 0.11118536312243756, + "grad_norm": 8.309880582499977, + "learning_rate": 4.991427509955273e-06, + "loss": 0.9949, + "step": 1539 + }, + { + "epoch": 0.11125760832264707, + "grad_norm": 6.9668237050927155, + "learning_rate": 4.991403290677871e-06, + "loss": 0.9875, + "step": 1540 + }, + { + "epoch": 0.11132985352285657, + "grad_norm": 8.018212092398866, + "learning_rate": 4.991379037295093e-06, + "loss": 1.0368, + "step": 1541 + }, + { + "epoch": 0.11140209872306608, + "grad_norm": 7.487544589156673, + "learning_rate": 4.991354749807271e-06, + "loss": 1.0696, + "step": 1542 + }, + { + "epoch": 0.1114743439232756, + "grad_norm": 7.535242818806764, + "learning_rate": 4.991330428214737e-06, + "loss": 1.0016, + "step": 1543 + }, + { + "epoch": 0.11154658912348511, + "grad_norm": 7.372461431641074, + "learning_rate": 4.991306072517823e-06, + "loss": 1.078, + "step": 1544 + }, + { + "epoch": 0.11161883432369461, + "grad_norm": 7.33421505337952, + "learning_rate": 4.991281682716864e-06, + "loss": 0.972, + "step": 1545 + }, + { + "epoch": 0.11169107952390413, + "grad_norm": 7.981723412294559, + "learning_rate": 4.9912572588121925e-06, + "loss": 0.9719, + "step": 1546 + }, + { + "epoch": 0.11176332472411364, + "grad_norm": 7.232556635677404, + "learning_rate": 4.991232800804144e-06, + "loss": 1.107, + "step": 1547 + }, + { + "epoch": 0.11183556992432316, + "grad_norm": 6.786858414874638, + "learning_rate": 4.9912083086930515e-06, + "loss": 0.945, + "step": 1548 + }, + { + "epoch": 0.11190781512453267, + "grad_norm": 6.249781185134448, + "learning_rate": 4.991183782479253e-06, + "loss": 0.9651, + "step": 1549 + }, + { + "epoch": 0.11198006032474217, + "grad_norm": 7.006038241287219, + "learning_rate": 4.991159222163082e-06, + "loss": 1.0488, + "step": 1550 + }, + { + "epoch": 0.11205230552495168, + "grad_norm": 6.4216929314785185, + "learning_rate": 4.9911346277448756e-06, + "loss": 1.0705, + "step": 1551 + }, + { + "epoch": 0.1121245507251612, + "grad_norm": 8.965245429454319, + "learning_rate": 4.991109999224971e-06, + "loss": 0.8845, + "step": 1552 + }, + { + "epoch": 0.11219679592537071, + "grad_norm": 6.985579487076478, + "learning_rate": 4.991085336603705e-06, + "loss": 0.9131, + "step": 1553 + }, + { + "epoch": 0.11226904112558021, + "grad_norm": 7.939863438813161, + "learning_rate": 4.991060639881414e-06, + "loss": 0.9675, + "step": 1554 + }, + { + "epoch": 0.11234128632578973, + "grad_norm": 6.888730296835481, + "learning_rate": 4.991035909058437e-06, + "loss": 0.9453, + "step": 1555 + }, + { + "epoch": 0.11241353152599924, + "grad_norm": 8.306587123990063, + "learning_rate": 4.991011144135113e-06, + "loss": 1.0662, + "step": 1556 + }, + { + "epoch": 0.11248577672620876, + "grad_norm": 8.661142006651028, + "learning_rate": 4.9909863451117805e-06, + "loss": 0.9124, + "step": 1557 + }, + { + "epoch": 0.11255802192641827, + "grad_norm": 7.562253837677544, + "learning_rate": 4.990961511988779e-06, + "loss": 0.9314, + "step": 1558 + }, + { + "epoch": 0.11263026712662777, + "grad_norm": 7.889197010715156, + "learning_rate": 4.990936644766449e-06, + "loss": 0.961, + "step": 1559 + }, + { + "epoch": 0.11270251232683728, + "grad_norm": 8.163395700191812, + "learning_rate": 4.990911743445129e-06, + "loss": 1.0203, + "step": 1560 + }, + { + "epoch": 0.1127747575270468, + "grad_norm": 7.270829106924437, + "learning_rate": 4.990886808025162e-06, + "loss": 1.0488, + "step": 1561 + }, + { + "epoch": 0.11284700272725631, + "grad_norm": 9.568291798789236, + "learning_rate": 4.99086183850689e-06, + "loss": 0.9464, + "step": 1562 + }, + { + "epoch": 0.11291924792746581, + "grad_norm": 8.661528262285987, + "learning_rate": 4.990836834890652e-06, + "loss": 1.042, + "step": 1563 + }, + { + "epoch": 0.11299149312767533, + "grad_norm": 6.980675316963207, + "learning_rate": 4.990811797176792e-06, + "loss": 0.9589, + "step": 1564 + }, + { + "epoch": 0.11306373832788484, + "grad_norm": 7.534942861237844, + "learning_rate": 4.990786725365653e-06, + "loss": 0.9888, + "step": 1565 + }, + { + "epoch": 0.11313598352809436, + "grad_norm": 10.410553096915235, + "learning_rate": 4.990761619457577e-06, + "loss": 1.0067, + "step": 1566 + }, + { + "epoch": 0.11320822872830387, + "grad_norm": 8.822546460620352, + "learning_rate": 4.990736479452909e-06, + "loss": 0.9455, + "step": 1567 + }, + { + "epoch": 0.11328047392851337, + "grad_norm": 6.852225572782391, + "learning_rate": 4.9907113053519915e-06, + "loss": 0.9683, + "step": 1568 + }, + { + "epoch": 0.11335271912872288, + "grad_norm": 9.915993889391679, + "learning_rate": 4.9906860971551716e-06, + "loss": 0.9444, + "step": 1569 + }, + { + "epoch": 0.1134249643289324, + "grad_norm": 8.34055956017673, + "learning_rate": 4.990660854862792e-06, + "loss": 0.966, + "step": 1570 + }, + { + "epoch": 0.11349720952914191, + "grad_norm": 7.776573399740578, + "learning_rate": 4.990635578475199e-06, + "loss": 1.022, + "step": 1571 + }, + { + "epoch": 0.11356945472935141, + "grad_norm": 8.17827243460674, + "learning_rate": 4.990610267992739e-06, + "loss": 1.1344, + "step": 1572 + }, + { + "epoch": 0.11364169992956093, + "grad_norm": 7.174354540139967, + "learning_rate": 4.990584923415759e-06, + "loss": 0.9661, + "step": 1573 + }, + { + "epoch": 0.11371394512977044, + "grad_norm": 8.046837075153348, + "learning_rate": 4.9905595447446046e-06, + "loss": 1.0138, + "step": 1574 + }, + { + "epoch": 0.11378619032997996, + "grad_norm": 10.375483122458157, + "learning_rate": 4.990534131979623e-06, + "loss": 1.0806, + "step": 1575 + }, + { + "epoch": 0.11385843553018947, + "grad_norm": 7.814352075390147, + "learning_rate": 4.990508685121165e-06, + "loss": 0.9817, + "step": 1576 + }, + { + "epoch": 0.11393068073039897, + "grad_norm": 8.6388527016599, + "learning_rate": 4.990483204169575e-06, + "loss": 1.0189, + "step": 1577 + }, + { + "epoch": 0.11400292593060848, + "grad_norm": 8.629659969486912, + "learning_rate": 4.990457689125204e-06, + "loss": 0.9739, + "step": 1578 + }, + { + "epoch": 0.114075171130818, + "grad_norm": 9.972769856506257, + "learning_rate": 4.990432139988401e-06, + "loss": 1.0113, + "step": 1579 + }, + { + "epoch": 0.11414741633102751, + "grad_norm": 7.317336871039719, + "learning_rate": 4.990406556759516e-06, + "loss": 0.9876, + "step": 1580 + }, + { + "epoch": 0.11421966153123701, + "grad_norm": 9.279975777956166, + "learning_rate": 4.990380939438899e-06, + "loss": 1.0097, + "step": 1581 + }, + { + "epoch": 0.11429190673144653, + "grad_norm": 7.981948035774273, + "learning_rate": 4.990355288026901e-06, + "loss": 0.9058, + "step": 1582 + }, + { + "epoch": 0.11436415193165604, + "grad_norm": 6.4845420723958505, + "learning_rate": 4.990329602523872e-06, + "loss": 0.9778, + "step": 1583 + }, + { + "epoch": 0.11443639713186556, + "grad_norm": 7.604408350896702, + "learning_rate": 4.990303882930164e-06, + "loss": 0.9941, + "step": 1584 + }, + { + "epoch": 0.11450864233207507, + "grad_norm": 6.6330704386131405, + "learning_rate": 4.99027812924613e-06, + "loss": 1.0252, + "step": 1585 + }, + { + "epoch": 0.11458088753228457, + "grad_norm": 8.590738108567855, + "learning_rate": 4.990252341472122e-06, + "loss": 0.9633, + "step": 1586 + }, + { + "epoch": 0.11465313273249408, + "grad_norm": 7.617997002258272, + "learning_rate": 4.9902265196084935e-06, + "loss": 0.9699, + "step": 1587 + }, + { + "epoch": 0.1147253779327036, + "grad_norm": 7.502177367450436, + "learning_rate": 4.990200663655596e-06, + "loss": 1.0292, + "step": 1588 + }, + { + "epoch": 0.11479762313291311, + "grad_norm": 9.1029073744852, + "learning_rate": 4.990174773613786e-06, + "loss": 1.0369, + "step": 1589 + }, + { + "epoch": 0.11486986833312261, + "grad_norm": 7.9446263800049834, + "learning_rate": 4.990148849483417e-06, + "loss": 1.0145, + "step": 1590 + }, + { + "epoch": 0.11494211353333213, + "grad_norm": 7.893871644110474, + "learning_rate": 4.9901228912648435e-06, + "loss": 0.9509, + "step": 1591 + }, + { + "epoch": 0.11501435873354164, + "grad_norm": 7.117475317485796, + "learning_rate": 4.990096898958421e-06, + "loss": 0.9542, + "step": 1592 + }, + { + "epoch": 0.11508660393375116, + "grad_norm": 7.053958014436547, + "learning_rate": 4.990070872564505e-06, + "loss": 1.0486, + "step": 1593 + }, + { + "epoch": 0.11515884913396066, + "grad_norm": 10.951939060150313, + "learning_rate": 4.990044812083453e-06, + "loss": 1.0359, + "step": 1594 + }, + { + "epoch": 0.11523109433417017, + "grad_norm": 7.551146757433266, + "learning_rate": 4.990018717515621e-06, + "loss": 0.934, + "step": 1595 + }, + { + "epoch": 0.11530333953437968, + "grad_norm": 7.103703846403111, + "learning_rate": 4.989992588861365e-06, + "loss": 0.9687, + "step": 1596 + }, + { + "epoch": 0.1153755847345892, + "grad_norm": 6.911220781220028, + "learning_rate": 4.989966426121045e-06, + "loss": 1.0145, + "step": 1597 + }, + { + "epoch": 0.11544782993479871, + "grad_norm": 6.594909855224342, + "learning_rate": 4.989940229295017e-06, + "loss": 0.9493, + "step": 1598 + }, + { + "epoch": 0.11552007513500821, + "grad_norm": 7.738553608044599, + "learning_rate": 4.989913998383641e-06, + "loss": 0.9985, + "step": 1599 + }, + { + "epoch": 0.11559232033521773, + "grad_norm": 10.040221388606257, + "learning_rate": 4.989887733387275e-06, + "loss": 0.9336, + "step": 1600 + }, + { + "epoch": 0.11566456553542724, + "grad_norm": 9.632941994643588, + "learning_rate": 4.98986143430628e-06, + "loss": 0.9504, + "step": 1601 + }, + { + "epoch": 0.11573681073563676, + "grad_norm": 9.225491195875469, + "learning_rate": 4.989835101141015e-06, + "loss": 0.9549, + "step": 1602 + }, + { + "epoch": 0.11580905593584626, + "grad_norm": 7.5299691658351255, + "learning_rate": 4.989808733891841e-06, + "loss": 0.9961, + "step": 1603 + }, + { + "epoch": 0.11588130113605577, + "grad_norm": 7.303578991341414, + "learning_rate": 4.989782332559119e-06, + "loss": 0.9127, + "step": 1604 + }, + { + "epoch": 0.11595354633626528, + "grad_norm": 6.611806724275876, + "learning_rate": 4.989755897143209e-06, + "loss": 0.8495, + "step": 1605 + }, + { + "epoch": 0.1160257915364748, + "grad_norm": 7.727032252767082, + "learning_rate": 4.989729427644474e-06, + "loss": 0.9042, + "step": 1606 + }, + { + "epoch": 0.11609803673668431, + "grad_norm": 8.50529898232911, + "learning_rate": 4.9897029240632775e-06, + "loss": 1.0256, + "step": 1607 + }, + { + "epoch": 0.11617028193689381, + "grad_norm": 7.501611917845843, + "learning_rate": 4.9896763863999795e-06, + "loss": 0.9821, + "step": 1608 + }, + { + "epoch": 0.11624252713710333, + "grad_norm": 7.9996390261273564, + "learning_rate": 4.989649814654946e-06, + "loss": 1.0302, + "step": 1609 + }, + { + "epoch": 0.11631477233731284, + "grad_norm": 7.845514228261865, + "learning_rate": 4.989623208828539e-06, + "loss": 0.961, + "step": 1610 + }, + { + "epoch": 0.11638701753752236, + "grad_norm": 7.6467345604824395, + "learning_rate": 4.989596568921124e-06, + "loss": 0.9386, + "step": 1611 + }, + { + "epoch": 0.11645926273773186, + "grad_norm": 6.572059571435582, + "learning_rate": 4.989569894933064e-06, + "loss": 0.9094, + "step": 1612 + }, + { + "epoch": 0.11653150793794137, + "grad_norm": 8.112114651172623, + "learning_rate": 4.989543186864726e-06, + "loss": 0.9852, + "step": 1613 + }, + { + "epoch": 0.11660375313815088, + "grad_norm": 9.413694559718405, + "learning_rate": 4.989516444716475e-06, + "loss": 1.0511, + "step": 1614 + }, + { + "epoch": 0.1166759983383604, + "grad_norm": 7.92966729523288, + "learning_rate": 4.989489668488676e-06, + "loss": 0.9844, + "step": 1615 + }, + { + "epoch": 0.11674824353856991, + "grad_norm": 8.96166414542361, + "learning_rate": 4.989462858181697e-06, + "loss": 0.9128, + "step": 1616 + }, + { + "epoch": 0.11682048873877941, + "grad_norm": 11.443571886627621, + "learning_rate": 4.9894360137959045e-06, + "loss": 1.0484, + "step": 1617 + }, + { + "epoch": 0.11689273393898893, + "grad_norm": 8.485037705759714, + "learning_rate": 4.989409135331666e-06, + "loss": 1.0057, + "step": 1618 + }, + { + "epoch": 0.11696497913919844, + "grad_norm": 7.435986276664198, + "learning_rate": 4.9893822227893484e-06, + "loss": 1.0225, + "step": 1619 + }, + { + "epoch": 0.11703722433940796, + "grad_norm": 6.994304247073812, + "learning_rate": 4.989355276169322e-06, + "loss": 0.9845, + "step": 1620 + }, + { + "epoch": 0.11710946953961746, + "grad_norm": 9.686886085008215, + "learning_rate": 4.989328295471954e-06, + "loss": 1.0764, + "step": 1621 + }, + { + "epoch": 0.11718171473982697, + "grad_norm": 8.167615666687, + "learning_rate": 4.989301280697615e-06, + "loss": 0.9564, + "step": 1622 + }, + { + "epoch": 0.11725395994003648, + "grad_norm": 6.653374422414087, + "learning_rate": 4.989274231846674e-06, + "loss": 0.873, + "step": 1623 + }, + { + "epoch": 0.117326205140246, + "grad_norm": 7.984620092403866, + "learning_rate": 4.989247148919502e-06, + "loss": 1.0323, + "step": 1624 + }, + { + "epoch": 0.11739845034045551, + "grad_norm": 8.29503247623168, + "learning_rate": 4.989220031916468e-06, + "loss": 1.037, + "step": 1625 + }, + { + "epoch": 0.11747069554066501, + "grad_norm": 8.602521712950066, + "learning_rate": 4.989192880837946e-06, + "loss": 1.0488, + "step": 1626 + }, + { + "epoch": 0.11754294074087453, + "grad_norm": 9.618825542111173, + "learning_rate": 4.989165695684306e-06, + "loss": 1.0005, + "step": 1627 + }, + { + "epoch": 0.11761518594108404, + "grad_norm": 7.737790241868051, + "learning_rate": 4.98913847645592e-06, + "loss": 1.0002, + "step": 1628 + }, + { + "epoch": 0.11768743114129356, + "grad_norm": 11.069443414853723, + "learning_rate": 4.9891112231531605e-06, + "loss": 1.0201, + "step": 1629 + }, + { + "epoch": 0.11775967634150306, + "grad_norm": 7.501881172770673, + "learning_rate": 4.989083935776402e-06, + "loss": 0.9785, + "step": 1630 + }, + { + "epoch": 0.11783192154171257, + "grad_norm": 8.940261520895891, + "learning_rate": 4.989056614326017e-06, + "loss": 0.8392, + "step": 1631 + }, + { + "epoch": 0.11790416674192208, + "grad_norm": 7.616345606519522, + "learning_rate": 4.989029258802378e-06, + "loss": 1.0527, + "step": 1632 + }, + { + "epoch": 0.1179764119421316, + "grad_norm": 8.497254769809299, + "learning_rate": 4.989001869205863e-06, + "loss": 0.9463, + "step": 1633 + }, + { + "epoch": 0.11804865714234111, + "grad_norm": 7.939855991854431, + "learning_rate": 4.988974445536844e-06, + "loss": 0.9139, + "step": 1634 + }, + { + "epoch": 0.11812090234255061, + "grad_norm": 7.927083547210357, + "learning_rate": 4.988946987795698e-06, + "loss": 0.9172, + "step": 1635 + }, + { + "epoch": 0.11819314754276013, + "grad_norm": 6.74518017527507, + "learning_rate": 4.9889194959827995e-06, + "loss": 1.0636, + "step": 1636 + }, + { + "epoch": 0.11826539274296964, + "grad_norm": 6.224799659293449, + "learning_rate": 4.988891970098526e-06, + "loss": 0.9249, + "step": 1637 + }, + { + "epoch": 0.11833763794317916, + "grad_norm": 6.423459342561227, + "learning_rate": 4.988864410143254e-06, + "loss": 0.8689, + "step": 1638 + }, + { + "epoch": 0.11840988314338866, + "grad_norm": 7.4835627678559264, + "learning_rate": 4.988836816117361e-06, + "loss": 1.0034, + "step": 1639 + }, + { + "epoch": 0.11848212834359817, + "grad_norm": 8.090062542309333, + "learning_rate": 4.9888091880212235e-06, + "loss": 1.1369, + "step": 1640 + }, + { + "epoch": 0.11855437354380768, + "grad_norm": 7.637416048727946, + "learning_rate": 4.988781525855221e-06, + "loss": 1.0647, + "step": 1641 + }, + { + "epoch": 0.1186266187440172, + "grad_norm": 7.733513308217612, + "learning_rate": 4.988753829619732e-06, + "loss": 1.0274, + "step": 1642 + }, + { + "epoch": 0.1186988639442267, + "grad_norm": 8.059860862117858, + "learning_rate": 4.988726099315136e-06, + "loss": 0.9893, + "step": 1643 + }, + { + "epoch": 0.11877110914443621, + "grad_norm": 7.915551545426923, + "learning_rate": 4.988698334941812e-06, + "loss": 1.0508, + "step": 1644 + }, + { + "epoch": 0.11884335434464573, + "grad_norm": 7.245464353140312, + "learning_rate": 4.988670536500139e-06, + "loss": 0.8989, + "step": 1645 + }, + { + "epoch": 0.11891559954485524, + "grad_norm": 8.19299178816371, + "learning_rate": 4.9886427039904996e-06, + "loss": 0.8919, + "step": 1646 + }, + { + "epoch": 0.11898784474506476, + "grad_norm": 7.876966442850755, + "learning_rate": 4.988614837413274e-06, + "loss": 1.0411, + "step": 1647 + }, + { + "epoch": 0.11906008994527426, + "grad_norm": 8.337760461402015, + "learning_rate": 4.988586936768843e-06, + "loss": 1.0056, + "step": 1648 + }, + { + "epoch": 0.11913233514548377, + "grad_norm": 7.942222091716549, + "learning_rate": 4.988559002057589e-06, + "loss": 1.0142, + "step": 1649 + }, + { + "epoch": 0.11920458034569328, + "grad_norm": 7.7341996606545615, + "learning_rate": 4.988531033279895e-06, + "loss": 0.8887, + "step": 1650 + }, + { + "epoch": 0.1192768255459028, + "grad_norm": 8.793760463163418, + "learning_rate": 4.988503030436144e-06, + "loss": 0.9804, + "step": 1651 + }, + { + "epoch": 0.1193490707461123, + "grad_norm": 10.403149758719412, + "learning_rate": 4.988474993526717e-06, + "loss": 0.9642, + "step": 1652 + }, + { + "epoch": 0.11942131594632181, + "grad_norm": 8.786135797435444, + "learning_rate": 4.988446922552001e-06, + "loss": 1.0178, + "step": 1653 + }, + { + "epoch": 0.11949356114653133, + "grad_norm": 9.073764719054479, + "learning_rate": 4.988418817512378e-06, + "loss": 0.8691, + "step": 1654 + }, + { + "epoch": 0.11956580634674084, + "grad_norm": 8.538882017191456, + "learning_rate": 4.988390678408234e-06, + "loss": 1.0076, + "step": 1655 + }, + { + "epoch": 0.11963805154695036, + "grad_norm": 8.802431672255489, + "learning_rate": 4.988362505239954e-06, + "loss": 1.0731, + "step": 1656 + }, + { + "epoch": 0.11971029674715986, + "grad_norm": 8.062885127369015, + "learning_rate": 4.988334298007922e-06, + "loss": 0.9113, + "step": 1657 + }, + { + "epoch": 0.11978254194736937, + "grad_norm": 8.344066370991762, + "learning_rate": 4.988306056712527e-06, + "loss": 0.9978, + "step": 1658 + }, + { + "epoch": 0.11985478714757888, + "grad_norm": 6.685810196483178, + "learning_rate": 4.988277781354154e-06, + "loss": 0.8468, + "step": 1659 + }, + { + "epoch": 0.1199270323477884, + "grad_norm": 8.427066936563415, + "learning_rate": 4.988249471933189e-06, + "loss": 0.953, + "step": 1660 + }, + { + "epoch": 0.1199992775479979, + "grad_norm": 7.3961989048257815, + "learning_rate": 4.988221128450021e-06, + "loss": 0.9413, + "step": 1661 + }, + { + "epoch": 0.12007152274820741, + "grad_norm": 8.367662090444576, + "learning_rate": 4.988192750905039e-06, + "loss": 0.9897, + "step": 1662 + }, + { + "epoch": 0.12014376794841693, + "grad_norm": 8.432451560421065, + "learning_rate": 4.988164339298629e-06, + "loss": 1.0171, + "step": 1663 + }, + { + "epoch": 0.12021601314862644, + "grad_norm": 7.044850399034417, + "learning_rate": 4.988135893631182e-06, + "loss": 0.9311, + "step": 1664 + }, + { + "epoch": 0.12028825834883596, + "grad_norm": 8.340578769571437, + "learning_rate": 4.9881074139030865e-06, + "loss": 1.0334, + "step": 1665 + }, + { + "epoch": 0.12036050354904546, + "grad_norm": 7.77888863517389, + "learning_rate": 4.988078900114731e-06, + "loss": 0.9718, + "step": 1666 + }, + { + "epoch": 0.12043274874925497, + "grad_norm": 7.490967461273569, + "learning_rate": 4.988050352266509e-06, + "loss": 0.8726, + "step": 1667 + }, + { + "epoch": 0.12050499394946448, + "grad_norm": 11.056800537893665, + "learning_rate": 4.988021770358809e-06, + "loss": 0.8691, + "step": 1668 + }, + { + "epoch": 0.120577239149674, + "grad_norm": 8.295708008500823, + "learning_rate": 4.987993154392022e-06, + "loss": 1.0534, + "step": 1669 + }, + { + "epoch": 0.1206494843498835, + "grad_norm": 7.377123510823532, + "learning_rate": 4.98796450436654e-06, + "loss": 0.9813, + "step": 1670 + }, + { + "epoch": 0.12072172955009301, + "grad_norm": 7.346598620552501, + "learning_rate": 4.9879358202827574e-06, + "loss": 1.0795, + "step": 1671 + }, + { + "epoch": 0.12079397475030253, + "grad_norm": 7.792948190762354, + "learning_rate": 4.987907102141065e-06, + "loss": 1.1076, + "step": 1672 + }, + { + "epoch": 0.12086621995051204, + "grad_norm": 8.669490451996912, + "learning_rate": 4.987878349941855e-06, + "loss": 0.9987, + "step": 1673 + }, + { + "epoch": 0.12093846515072156, + "grad_norm": 7.491000816393039, + "learning_rate": 4.987849563685523e-06, + "loss": 0.9119, + "step": 1674 + }, + { + "epoch": 0.12101071035093106, + "grad_norm": 9.475102374245958, + "learning_rate": 4.987820743372462e-06, + "loss": 1.0878, + "step": 1675 + }, + { + "epoch": 0.12108295555114057, + "grad_norm": 8.263373430633616, + "learning_rate": 4.987791889003067e-06, + "loss": 0.9575, + "step": 1676 + }, + { + "epoch": 0.12115520075135008, + "grad_norm": 7.297081235793362, + "learning_rate": 4.987763000577732e-06, + "loss": 0.9767, + "step": 1677 + }, + { + "epoch": 0.1212274459515596, + "grad_norm": 7.319839055110748, + "learning_rate": 4.987734078096853e-06, + "loss": 1.1022, + "step": 1678 + }, + { + "epoch": 0.1212996911517691, + "grad_norm": 6.532236709164828, + "learning_rate": 4.987705121560826e-06, + "loss": 0.9859, + "step": 1679 + }, + { + "epoch": 0.12137193635197861, + "grad_norm": 7.613790057635425, + "learning_rate": 4.9876761309700485e-06, + "loss": 1.0064, + "step": 1680 + }, + { + "epoch": 0.12144418155218813, + "grad_norm": 9.950325229256014, + "learning_rate": 4.987647106324916e-06, + "loss": 1.0547, + "step": 1681 + }, + { + "epoch": 0.12151642675239764, + "grad_norm": 9.53460214169885, + "learning_rate": 4.9876180476258255e-06, + "loss": 0.9226, + "step": 1682 + }, + { + "epoch": 0.12158867195260716, + "grad_norm": 7.052186169759645, + "learning_rate": 4.987588954873176e-06, + "loss": 0.9975, + "step": 1683 + }, + { + "epoch": 0.12166091715281666, + "grad_norm": 6.772624493901931, + "learning_rate": 4.987559828067365e-06, + "loss": 0.9943, + "step": 1684 + }, + { + "epoch": 0.12173316235302617, + "grad_norm": 7.1837679254305895, + "learning_rate": 4.9875306672087916e-06, + "loss": 0.9592, + "step": 1685 + }, + { + "epoch": 0.12180540755323568, + "grad_norm": 7.354505608576421, + "learning_rate": 4.987501472297854e-06, + "loss": 1.0128, + "step": 1686 + }, + { + "epoch": 0.1218776527534452, + "grad_norm": 8.47090149748985, + "learning_rate": 4.9874722433349536e-06, + "loss": 0.9542, + "step": 1687 + }, + { + "epoch": 0.1219498979536547, + "grad_norm": 7.979620486535112, + "learning_rate": 4.98744298032049e-06, + "loss": 1.0254, + "step": 1688 + }, + { + "epoch": 0.12202214315386421, + "grad_norm": 7.446040018888605, + "learning_rate": 4.9874136832548625e-06, + "loss": 0.8951, + "step": 1689 + }, + { + "epoch": 0.12209438835407373, + "grad_norm": 7.644633296856574, + "learning_rate": 4.987384352138473e-06, + "loss": 0.9713, + "step": 1690 + }, + { + "epoch": 0.12216663355428324, + "grad_norm": 6.858308261657102, + "learning_rate": 4.987354986971723e-06, + "loss": 0.9863, + "step": 1691 + }, + { + "epoch": 0.12223887875449276, + "grad_norm": 6.512133497589687, + "learning_rate": 4.987325587755015e-06, + "loss": 0.9073, + "step": 1692 + }, + { + "epoch": 0.12231112395470226, + "grad_norm": 8.578054737018705, + "learning_rate": 4.987296154488751e-06, + "loss": 1.0397, + "step": 1693 + }, + { + "epoch": 0.12238336915491177, + "grad_norm": 8.231772429528343, + "learning_rate": 4.987266687173333e-06, + "loss": 0.9585, + "step": 1694 + }, + { + "epoch": 0.12245561435512128, + "grad_norm": 7.803254294708755, + "learning_rate": 4.987237185809166e-06, + "loss": 0.9922, + "step": 1695 + }, + { + "epoch": 0.1225278595553308, + "grad_norm": 8.159059471829075, + "learning_rate": 4.9872076503966536e-06, + "loss": 0.9718, + "step": 1696 + }, + { + "epoch": 0.1226001047555403, + "grad_norm": 10.242174091810446, + "learning_rate": 4.987178080936199e-06, + "loss": 1.0809, + "step": 1697 + }, + { + "epoch": 0.12267234995574981, + "grad_norm": 7.215093305781486, + "learning_rate": 4.987148477428208e-06, + "loss": 1.0334, + "step": 1698 + }, + { + "epoch": 0.12274459515595933, + "grad_norm": 6.553899160634352, + "learning_rate": 4.9871188398730855e-06, + "loss": 0.949, + "step": 1699 + }, + { + "epoch": 0.12281684035616884, + "grad_norm": 6.350150651721623, + "learning_rate": 4.987089168271237e-06, + "loss": 0.9762, + "step": 1700 + }, + { + "epoch": 0.12288908555637834, + "grad_norm": 7.242466662971795, + "learning_rate": 4.987059462623069e-06, + "loss": 0.9393, + "step": 1701 + }, + { + "epoch": 0.12296133075658786, + "grad_norm": 7.03283592140554, + "learning_rate": 4.9870297229289875e-06, + "loss": 1.0918, + "step": 1702 + }, + { + "epoch": 0.12303357595679737, + "grad_norm": 7.152129873747891, + "learning_rate": 4.9869999491894e-06, + "loss": 0.9487, + "step": 1703 + }, + { + "epoch": 0.12310582115700688, + "grad_norm": 7.0927090994474575, + "learning_rate": 4.986970141404716e-06, + "loss": 0.9748, + "step": 1704 + }, + { + "epoch": 0.1231780663572164, + "grad_norm": 8.22504511765741, + "learning_rate": 4.986940299575341e-06, + "loss": 1.0185, + "step": 1705 + }, + { + "epoch": 0.1232503115574259, + "grad_norm": 7.051958437047936, + "learning_rate": 4.986910423701683e-06, + "loss": 0.977, + "step": 1706 + }, + { + "epoch": 0.12332255675763541, + "grad_norm": 6.157766087707292, + "learning_rate": 4.986880513784153e-06, + "loss": 0.8982, + "step": 1707 + }, + { + "epoch": 0.12339480195784493, + "grad_norm": 6.753830846980041, + "learning_rate": 4.98685056982316e-06, + "loss": 0.9816, + "step": 1708 + }, + { + "epoch": 0.12346704715805444, + "grad_norm": 6.854885848171002, + "learning_rate": 4.986820591819114e-06, + "loss": 0.9235, + "step": 1709 + }, + { + "epoch": 0.12353929235826394, + "grad_norm": 8.607921996972152, + "learning_rate": 4.986790579772424e-06, + "loss": 1.0071, + "step": 1710 + }, + { + "epoch": 0.12361153755847346, + "grad_norm": 7.51876619484696, + "learning_rate": 4.986760533683502e-06, + "loss": 0.9897, + "step": 1711 + }, + { + "epoch": 0.12368378275868297, + "grad_norm": 7.681620512702503, + "learning_rate": 4.98673045355276e-06, + "loss": 0.9724, + "step": 1712 + }, + { + "epoch": 0.12375602795889248, + "grad_norm": 8.236304272478504, + "learning_rate": 4.986700339380608e-06, + "loss": 1.0207, + "step": 1713 + }, + { + "epoch": 0.123828273159102, + "grad_norm": 8.509116276938112, + "learning_rate": 4.98667019116746e-06, + "loss": 0.9748, + "step": 1714 + }, + { + "epoch": 0.1239005183593115, + "grad_norm": 7.501169495000403, + "learning_rate": 4.986640008913727e-06, + "loss": 0.9561, + "step": 1715 + }, + { + "epoch": 0.12397276355952101, + "grad_norm": 6.526085961250649, + "learning_rate": 4.986609792619823e-06, + "loss": 0.8294, + "step": 1716 + }, + { + "epoch": 0.12404500875973053, + "grad_norm": 8.677775916721972, + "learning_rate": 4.986579542286162e-06, + "loss": 0.9352, + "step": 1717 + }, + { + "epoch": 0.12411725395994004, + "grad_norm": 7.887798989216644, + "learning_rate": 4.986549257913158e-06, + "loss": 0.9992, + "step": 1718 + }, + { + "epoch": 0.12418949916014954, + "grad_norm": 7.128550932926012, + "learning_rate": 4.986518939501225e-06, + "loss": 0.9502, + "step": 1719 + }, + { + "epoch": 0.12426174436035906, + "grad_norm": 8.28137022686902, + "learning_rate": 4.986488587050779e-06, + "loss": 1.1193, + "step": 1720 + }, + { + "epoch": 0.12433398956056857, + "grad_norm": 9.236753722154255, + "learning_rate": 4.986458200562234e-06, + "loss": 0.9539, + "step": 1721 + }, + { + "epoch": 0.12440623476077808, + "grad_norm": 7.459364097021514, + "learning_rate": 4.986427780036007e-06, + "loss": 1.0201, + "step": 1722 + }, + { + "epoch": 0.1244784799609876, + "grad_norm": 6.554861218738641, + "learning_rate": 4.986397325472515e-06, + "loss": 1.0141, + "step": 1723 + }, + { + "epoch": 0.1245507251611971, + "grad_norm": 7.74137127343181, + "learning_rate": 4.9863668368721735e-06, + "loss": 0.9062, + "step": 1724 + }, + { + "epoch": 0.12462297036140661, + "grad_norm": 8.284711927471673, + "learning_rate": 4.9863363142354e-06, + "loss": 0.9387, + "step": 1725 + }, + { + "epoch": 0.12469521556161613, + "grad_norm": 9.722323530365648, + "learning_rate": 4.986305757562614e-06, + "loss": 1.0212, + "step": 1726 + }, + { + "epoch": 0.12476746076182564, + "grad_norm": 8.592441306603256, + "learning_rate": 4.986275166854233e-06, + "loss": 0.968, + "step": 1727 + }, + { + "epoch": 0.12483970596203514, + "grad_norm": 7.261241222027585, + "learning_rate": 4.986244542110674e-06, + "loss": 0.8959, + "step": 1728 + }, + { + "epoch": 0.12491195116224466, + "grad_norm": 8.539770992165588, + "learning_rate": 4.986213883332359e-06, + "loss": 0.9725, + "step": 1729 + }, + { + "epoch": 0.12498419636245417, + "grad_norm": 10.453344322834596, + "learning_rate": 4.9861831905197056e-06, + "loss": 1.0724, + "step": 1730 + }, + { + "epoch": 0.12505644156266368, + "grad_norm": 8.042593103782375, + "learning_rate": 4.986152463673134e-06, + "loss": 0.9462, + "step": 1731 + }, + { + "epoch": 0.1251286867628732, + "grad_norm": 7.295839027854256, + "learning_rate": 4.986121702793067e-06, + "loss": 0.9846, + "step": 1732 + }, + { + "epoch": 0.1252009319630827, + "grad_norm": 8.416883812045212, + "learning_rate": 4.986090907879924e-06, + "loss": 0.9701, + "step": 1733 + }, + { + "epoch": 0.12527317716329223, + "grad_norm": 8.953649908260179, + "learning_rate": 4.986060078934126e-06, + "loss": 1.0822, + "step": 1734 + }, + { + "epoch": 0.1253454223635017, + "grad_norm": 7.917682569137575, + "learning_rate": 4.986029215956096e-06, + "loss": 0.9708, + "step": 1735 + }, + { + "epoch": 0.12541766756371123, + "grad_norm": 7.227419448767847, + "learning_rate": 4.985998318946257e-06, + "loss": 0.9886, + "step": 1736 + }, + { + "epoch": 0.12548991276392074, + "grad_norm": 7.019715202184159, + "learning_rate": 4.985967387905031e-06, + "loss": 1.0245, + "step": 1737 + }, + { + "epoch": 0.12556215796413026, + "grad_norm": 7.27550309267097, + "learning_rate": 4.985936422832841e-06, + "loss": 0.9738, + "step": 1738 + }, + { + "epoch": 0.12563440316433977, + "grad_norm": 6.970053512389713, + "learning_rate": 4.985905423730112e-06, + "loss": 0.9329, + "step": 1739 + }, + { + "epoch": 0.12570664836454928, + "grad_norm": 8.693098500986375, + "learning_rate": 4.985874390597269e-06, + "loss": 1.0577, + "step": 1740 + }, + { + "epoch": 0.1257788935647588, + "grad_norm": 8.30654762946122, + "learning_rate": 4.985843323434734e-06, + "loss": 0.9587, + "step": 1741 + }, + { + "epoch": 0.1258511387649683, + "grad_norm": 7.48051923530614, + "learning_rate": 4.985812222242935e-06, + "loss": 0.9435, + "step": 1742 + }, + { + "epoch": 0.12592338396517783, + "grad_norm": 7.888797843061477, + "learning_rate": 4.985781087022297e-06, + "loss": 0.868, + "step": 1743 + }, + { + "epoch": 0.1259956291653873, + "grad_norm": 7.055641016939007, + "learning_rate": 4.985749917773245e-06, + "loss": 1.0181, + "step": 1744 + }, + { + "epoch": 0.12606787436559683, + "grad_norm": 7.136850239962742, + "learning_rate": 4.9857187144962075e-06, + "loss": 0.9571, + "step": 1745 + }, + { + "epoch": 0.12614011956580634, + "grad_norm": 7.782090719987855, + "learning_rate": 4.9856874771916105e-06, + "loss": 1.0095, + "step": 1746 + }, + { + "epoch": 0.12621236476601586, + "grad_norm": 7.171179245255143, + "learning_rate": 4.985656205859882e-06, + "loss": 0.9804, + "step": 1747 + }, + { + "epoch": 0.12628460996622537, + "grad_norm": 8.711306221176107, + "learning_rate": 4.98562490050145e-06, + "loss": 1.0101, + "step": 1748 + }, + { + "epoch": 0.12635685516643488, + "grad_norm": 7.371286782879175, + "learning_rate": 4.985593561116743e-06, + "loss": 0.906, + "step": 1749 + }, + { + "epoch": 0.1264291003666444, + "grad_norm": 10.612547994085013, + "learning_rate": 4.98556218770619e-06, + "loss": 1.0397, + "step": 1750 + }, + { + "epoch": 0.1265013455668539, + "grad_norm": 7.583688608867044, + "learning_rate": 4.9855307802702215e-06, + "loss": 0.9265, + "step": 1751 + }, + { + "epoch": 0.12657359076706343, + "grad_norm": 8.236361703675326, + "learning_rate": 4.985499338809265e-06, + "loss": 1.0036, + "step": 1752 + }, + { + "epoch": 0.1266458359672729, + "grad_norm": 7.4197892320007615, + "learning_rate": 4.985467863323754e-06, + "loss": 0.9202, + "step": 1753 + }, + { + "epoch": 0.12671808116748243, + "grad_norm": 7.368841153552508, + "learning_rate": 4.985436353814117e-06, + "loss": 1.045, + "step": 1754 + }, + { + "epoch": 0.12679032636769194, + "grad_norm": 8.029849631681895, + "learning_rate": 4.985404810280786e-06, + "loss": 0.8961, + "step": 1755 + }, + { + "epoch": 0.12686257156790146, + "grad_norm": 7.824239958392998, + "learning_rate": 4.985373232724193e-06, + "loss": 0.9815, + "step": 1756 + }, + { + "epoch": 0.12693481676811097, + "grad_norm": 6.808274104570041, + "learning_rate": 4.98534162114477e-06, + "loss": 0.9654, + "step": 1757 + }, + { + "epoch": 0.12700706196832048, + "grad_norm": 6.670771130048041, + "learning_rate": 4.9853099755429505e-06, + "loss": 0.9449, + "step": 1758 + }, + { + "epoch": 0.12707930716853, + "grad_norm": 7.702342769376269, + "learning_rate": 4.985278295919167e-06, + "loss": 1.0055, + "step": 1759 + }, + { + "epoch": 0.1271515523687395, + "grad_norm": 6.983559511781213, + "learning_rate": 4.985246582273853e-06, + "loss": 0.8943, + "step": 1760 + }, + { + "epoch": 0.12722379756894903, + "grad_norm": 7.873463314231348, + "learning_rate": 4.985214834607443e-06, + "loss": 1.0164, + "step": 1761 + }, + { + "epoch": 0.1272960427691585, + "grad_norm": 7.2111758175170415, + "learning_rate": 4.9851830529203725e-06, + "loss": 0.9237, + "step": 1762 + }, + { + "epoch": 0.12736828796936803, + "grad_norm": 7.425162460656243, + "learning_rate": 4.985151237213075e-06, + "loss": 0.9574, + "step": 1763 + }, + { + "epoch": 0.12744053316957754, + "grad_norm": 6.423930857722157, + "learning_rate": 4.985119387485986e-06, + "loss": 0.9757, + "step": 1764 + }, + { + "epoch": 0.12751277836978706, + "grad_norm": 9.065235225026353, + "learning_rate": 4.985087503739543e-06, + "loss": 1.0078, + "step": 1765 + }, + { + "epoch": 0.12758502356999657, + "grad_norm": 9.898222838906605, + "learning_rate": 4.985055585974181e-06, + "loss": 0.9955, + "step": 1766 + }, + { + "epoch": 0.12765726877020608, + "grad_norm": 8.311223211682746, + "learning_rate": 4.985023634190338e-06, + "loss": 0.8921, + "step": 1767 + }, + { + "epoch": 0.1277295139704156, + "grad_norm": 8.222343094541571, + "learning_rate": 4.984991648388451e-06, + "loss": 1.0571, + "step": 1768 + }, + { + "epoch": 0.1278017591706251, + "grad_norm": 8.46288318752558, + "learning_rate": 4.984959628568957e-06, + "loss": 1.0144, + "step": 1769 + }, + { + "epoch": 0.12787400437083463, + "grad_norm": 7.40295877845724, + "learning_rate": 4.984927574732297e-06, + "loss": 1.0224, + "step": 1770 + }, + { + "epoch": 0.1279462495710441, + "grad_norm": 10.851376888261404, + "learning_rate": 4.984895486878905e-06, + "loss": 1.0222, + "step": 1771 + }, + { + "epoch": 0.12801849477125363, + "grad_norm": 7.628636477965079, + "learning_rate": 4.984863365009226e-06, + "loss": 1.0165, + "step": 1772 + }, + { + "epoch": 0.12809073997146314, + "grad_norm": 7.711362693681698, + "learning_rate": 4.984831209123696e-06, + "loss": 0.964, + "step": 1773 + }, + { + "epoch": 0.12816298517167266, + "grad_norm": 7.454550041061987, + "learning_rate": 4.984799019222756e-06, + "loss": 0.9324, + "step": 1774 + }, + { + "epoch": 0.12823523037188217, + "grad_norm": 6.822264506691863, + "learning_rate": 4.984766795306845e-06, + "loss": 0.9423, + "step": 1775 + }, + { + "epoch": 0.12830747557209168, + "grad_norm": 7.726013719531228, + "learning_rate": 4.984734537376409e-06, + "loss": 1.0539, + "step": 1776 + }, + { + "epoch": 0.1283797207723012, + "grad_norm": 6.7912383461284715, + "learning_rate": 4.984702245431885e-06, + "loss": 1.0321, + "step": 1777 + }, + { + "epoch": 0.1284519659725107, + "grad_norm": 7.284115207119251, + "learning_rate": 4.984669919473716e-06, + "loss": 0.9908, + "step": 1778 + }, + { + "epoch": 0.1285242111727202, + "grad_norm": 6.174930328099643, + "learning_rate": 4.984637559502346e-06, + "loss": 0.8643, + "step": 1779 + }, + { + "epoch": 0.1285964563729297, + "grad_norm": 6.956521565499511, + "learning_rate": 4.984605165518216e-06, + "loss": 0.8869, + "step": 1780 + }, + { + "epoch": 0.12866870157313923, + "grad_norm": 7.3006305056501315, + "learning_rate": 4.984572737521771e-06, + "loss": 0.9913, + "step": 1781 + }, + { + "epoch": 0.12874094677334874, + "grad_norm": 6.1879386361056445, + "learning_rate": 4.984540275513454e-06, + "loss": 0.9938, + "step": 1782 + }, + { + "epoch": 0.12881319197355826, + "grad_norm": 7.00707840662789, + "learning_rate": 4.98450777949371e-06, + "loss": 0.944, + "step": 1783 + }, + { + "epoch": 0.12888543717376777, + "grad_norm": 6.857213542210609, + "learning_rate": 4.984475249462984e-06, + "loss": 0.9533, + "step": 1784 + }, + { + "epoch": 0.12895768237397728, + "grad_norm": 6.12321975682971, + "learning_rate": 4.9844426854217206e-06, + "loss": 0.9672, + "step": 1785 + }, + { + "epoch": 0.1290299275741868, + "grad_norm": 7.768847527969757, + "learning_rate": 4.984410087370365e-06, + "loss": 0.9893, + "step": 1786 + }, + { + "epoch": 0.1291021727743963, + "grad_norm": 7.049703702719894, + "learning_rate": 4.984377455309366e-06, + "loss": 1.007, + "step": 1787 + }, + { + "epoch": 0.1291744179746058, + "grad_norm": 7.634958986423256, + "learning_rate": 4.984344789239167e-06, + "loss": 0.9898, + "step": 1788 + }, + { + "epoch": 0.1292466631748153, + "grad_norm": 6.167106234718414, + "learning_rate": 4.984312089160218e-06, + "loss": 0.9627, + "step": 1789 + }, + { + "epoch": 0.12931890837502483, + "grad_norm": 7.9744283154347695, + "learning_rate": 4.984279355072965e-06, + "loss": 1.0664, + "step": 1790 + }, + { + "epoch": 0.12939115357523434, + "grad_norm": 7.477889976522559, + "learning_rate": 4.984246586977857e-06, + "loss": 0.8809, + "step": 1791 + }, + { + "epoch": 0.12946339877544386, + "grad_norm": 8.209029218218584, + "learning_rate": 4.984213784875341e-06, + "loss": 1.0023, + "step": 1792 + }, + { + "epoch": 0.12953564397565337, + "grad_norm": 7.532272079569621, + "learning_rate": 4.984180948765868e-06, + "loss": 0.9323, + "step": 1793 + }, + { + "epoch": 0.12960788917586288, + "grad_norm": 7.031283094540171, + "learning_rate": 4.9841480786498864e-06, + "loss": 0.9449, + "step": 1794 + }, + { + "epoch": 0.1296801343760724, + "grad_norm": 9.401245408038658, + "learning_rate": 4.984115174527847e-06, + "loss": 1.0713, + "step": 1795 + }, + { + "epoch": 0.1297523795762819, + "grad_norm": 9.216965945617668, + "learning_rate": 4.984082236400199e-06, + "loss": 1.0561, + "step": 1796 + }, + { + "epoch": 0.1298246247764914, + "grad_norm": 9.013166333669785, + "learning_rate": 4.984049264267394e-06, + "loss": 1.0835, + "step": 1797 + }, + { + "epoch": 0.1298968699767009, + "grad_norm": 7.813334916324043, + "learning_rate": 4.9840162581298836e-06, + "loss": 0.9989, + "step": 1798 + }, + { + "epoch": 0.12996911517691043, + "grad_norm": 8.198863755565414, + "learning_rate": 4.983983217988119e-06, + "loss": 1.0055, + "step": 1799 + }, + { + "epoch": 0.13004136037711994, + "grad_norm": 9.362259001743258, + "learning_rate": 4.983950143842553e-06, + "loss": 1.0262, + "step": 1800 + }, + { + "epoch": 0.13011360557732946, + "grad_norm": 8.042575079911634, + "learning_rate": 4.9839170356936386e-06, + "loss": 1.0391, + "step": 1801 + }, + { + "epoch": 0.13018585077753897, + "grad_norm": 7.686013675071032, + "learning_rate": 4.983883893541828e-06, + "loss": 0.9637, + "step": 1802 + }, + { + "epoch": 0.13025809597774848, + "grad_norm": 8.318147275486272, + "learning_rate": 4.983850717387576e-06, + "loss": 0.9503, + "step": 1803 + }, + { + "epoch": 0.130330341177958, + "grad_norm": 7.055036263700486, + "learning_rate": 4.983817507231335e-06, + "loss": 0.9917, + "step": 1804 + }, + { + "epoch": 0.1304025863781675, + "grad_norm": 8.684256627448702, + "learning_rate": 4.983784263073562e-06, + "loss": 0.9431, + "step": 1805 + }, + { + "epoch": 0.130474831578377, + "grad_norm": 8.026100972076344, + "learning_rate": 4.983750984914711e-06, + "loss": 0.9222, + "step": 1806 + }, + { + "epoch": 0.1305470767785865, + "grad_norm": 6.691161107228747, + "learning_rate": 4.983717672755237e-06, + "loss": 1.0179, + "step": 1807 + }, + { + "epoch": 0.13061932197879603, + "grad_norm": 7.266265250268246, + "learning_rate": 4.9836843265955975e-06, + "loss": 0.9732, + "step": 1808 + }, + { + "epoch": 0.13069156717900554, + "grad_norm": 7.410222915405842, + "learning_rate": 4.983650946436247e-06, + "loss": 0.9806, + "step": 1809 + }, + { + "epoch": 0.13076381237921506, + "grad_norm": 7.5432790082974845, + "learning_rate": 4.983617532277644e-06, + "loss": 0.8889, + "step": 1810 + }, + { + "epoch": 0.13083605757942457, + "grad_norm": 8.23439584689605, + "learning_rate": 4.983584084120245e-06, + "loss": 0.9939, + "step": 1811 + }, + { + "epoch": 0.13090830277963408, + "grad_norm": 8.078876275963397, + "learning_rate": 4.9835506019645095e-06, + "loss": 0.9797, + "step": 1812 + }, + { + "epoch": 0.1309805479798436, + "grad_norm": 9.282932116218259, + "learning_rate": 4.983517085810893e-06, + "loss": 1.012, + "step": 1813 + }, + { + "epoch": 0.1310527931800531, + "grad_norm": 7.168434398192635, + "learning_rate": 4.983483535659856e-06, + "loss": 0.9675, + "step": 1814 + }, + { + "epoch": 0.1311250383802626, + "grad_norm": 8.128275988451277, + "learning_rate": 4.9834499515118595e-06, + "loss": 0.9439, + "step": 1815 + }, + { + "epoch": 0.1311972835804721, + "grad_norm": 6.667297079161606, + "learning_rate": 4.98341633336736e-06, + "loss": 0.8304, + "step": 1816 + }, + { + "epoch": 0.13126952878068163, + "grad_norm": 8.582739306390755, + "learning_rate": 4.983382681226819e-06, + "loss": 0.9611, + "step": 1817 + }, + { + "epoch": 0.13134177398089114, + "grad_norm": 7.95913798739797, + "learning_rate": 4.983348995090698e-06, + "loss": 0.958, + "step": 1818 + }, + { + "epoch": 0.13141401918110066, + "grad_norm": 7.863198535254848, + "learning_rate": 4.9833152749594574e-06, + "loss": 0.9472, + "step": 1819 + }, + { + "epoch": 0.13148626438131017, + "grad_norm": 9.290243088218068, + "learning_rate": 4.9832815208335584e-06, + "loss": 1.0334, + "step": 1820 + }, + { + "epoch": 0.13155850958151968, + "grad_norm": 6.869797003577422, + "learning_rate": 4.983247732713463e-06, + "loss": 0.9853, + "step": 1821 + }, + { + "epoch": 0.1316307547817292, + "grad_norm": 7.558406005420285, + "learning_rate": 4.983213910599636e-06, + "loss": 0.9988, + "step": 1822 + }, + { + "epoch": 0.1317029999819387, + "grad_norm": 6.4563597914453155, + "learning_rate": 4.983180054492538e-06, + "loss": 0.9789, + "step": 1823 + }, + { + "epoch": 0.1317752451821482, + "grad_norm": 8.795317218865318, + "learning_rate": 4.983146164392632e-06, + "loss": 1.0715, + "step": 1824 + }, + { + "epoch": 0.1318474903823577, + "grad_norm": 9.269276508957521, + "learning_rate": 4.983112240300384e-06, + "loss": 1.1283, + "step": 1825 + }, + { + "epoch": 0.13191973558256723, + "grad_norm": 9.207610042928442, + "learning_rate": 4.983078282216257e-06, + "loss": 0.9678, + "step": 1826 + }, + { + "epoch": 0.13199198078277674, + "grad_norm": 7.652344996251174, + "learning_rate": 4.983044290140717e-06, + "loss": 0.9794, + "step": 1827 + }, + { + "epoch": 0.13206422598298626, + "grad_norm": 11.058470601023064, + "learning_rate": 4.9830102640742276e-06, + "loss": 1.0473, + "step": 1828 + }, + { + "epoch": 0.13213647118319577, + "grad_norm": 9.805058664538375, + "learning_rate": 4.982976204017257e-06, + "loss": 1.0345, + "step": 1829 + }, + { + "epoch": 0.13220871638340528, + "grad_norm": 7.621170332937871, + "learning_rate": 4.982942109970269e-06, + "loss": 0.9301, + "step": 1830 + }, + { + "epoch": 0.1322809615836148, + "grad_norm": 7.181901692035509, + "learning_rate": 4.982907981933731e-06, + "loss": 0.9869, + "step": 1831 + }, + { + "epoch": 0.1323532067838243, + "grad_norm": 9.529974780214449, + "learning_rate": 4.982873819908112e-06, + "loss": 1.0587, + "step": 1832 + }, + { + "epoch": 0.1324254519840338, + "grad_norm": 10.123056484234779, + "learning_rate": 4.982839623893877e-06, + "loss": 0.9403, + "step": 1833 + }, + { + "epoch": 0.1324976971842433, + "grad_norm": 9.762961355485892, + "learning_rate": 4.982805393891496e-06, + "loss": 0.9923, + "step": 1834 + }, + { + "epoch": 0.13256994238445283, + "grad_norm": 9.369295953540963, + "learning_rate": 4.982771129901437e-06, + "loss": 1.0665, + "step": 1835 + }, + { + "epoch": 0.13264218758466234, + "grad_norm": 8.132616903717814, + "learning_rate": 4.982736831924169e-06, + "loss": 1.0233, + "step": 1836 + }, + { + "epoch": 0.13271443278487186, + "grad_norm": 8.260609596055094, + "learning_rate": 4.9827024999601606e-06, + "loss": 0.9744, + "step": 1837 + }, + { + "epoch": 0.13278667798508137, + "grad_norm": 8.722209285178447, + "learning_rate": 4.982668134009883e-06, + "loss": 0.9912, + "step": 1838 + }, + { + "epoch": 0.13285892318529088, + "grad_norm": 10.135511052213932, + "learning_rate": 4.982633734073807e-06, + "loss": 0.9275, + "step": 1839 + }, + { + "epoch": 0.1329311683855004, + "grad_norm": 7.679764988800317, + "learning_rate": 4.982599300152402e-06, + "loss": 0.9357, + "step": 1840 + }, + { + "epoch": 0.1330034135857099, + "grad_norm": 8.127833415209162, + "learning_rate": 4.982564832246141e-06, + "loss": 0.9804, + "step": 1841 + }, + { + "epoch": 0.1330756587859194, + "grad_norm": 10.616334859872007, + "learning_rate": 4.9825303303554945e-06, + "loss": 0.9213, + "step": 1842 + }, + { + "epoch": 0.1331479039861289, + "grad_norm": 9.03791199057692, + "learning_rate": 4.982495794480935e-06, + "loss": 0.9746, + "step": 1843 + }, + { + "epoch": 0.13322014918633843, + "grad_norm": 5.95104591729602, + "learning_rate": 4.982461224622936e-06, + "loss": 0.8141, + "step": 1844 + }, + { + "epoch": 0.13329239438654794, + "grad_norm": 6.5403901168706575, + "learning_rate": 4.982426620781971e-06, + "loss": 0.9242, + "step": 1845 + }, + { + "epoch": 0.13336463958675746, + "grad_norm": 7.865941969039864, + "learning_rate": 4.9823919829585125e-06, + "loss": 1.0484, + "step": 1846 + }, + { + "epoch": 0.13343688478696697, + "grad_norm": 8.439588500201557, + "learning_rate": 4.982357311153036e-06, + "loss": 0.984, + "step": 1847 + }, + { + "epoch": 0.13350912998717648, + "grad_norm": 8.056435366703509, + "learning_rate": 4.982322605366013e-06, + "loss": 1.0598, + "step": 1848 + }, + { + "epoch": 0.133581375187386, + "grad_norm": 6.053167967085016, + "learning_rate": 4.982287865597923e-06, + "loss": 0.8954, + "step": 1849 + }, + { + "epoch": 0.1336536203875955, + "grad_norm": 6.013574186239678, + "learning_rate": 4.982253091849239e-06, + "loss": 0.986, + "step": 1850 + }, + { + "epoch": 0.133725865587805, + "grad_norm": 7.816570228789707, + "learning_rate": 4.982218284120438e-06, + "loss": 0.9665, + "step": 1851 + }, + { + "epoch": 0.1337981107880145, + "grad_norm": 7.241217456693795, + "learning_rate": 4.982183442411995e-06, + "loss": 1.0057, + "step": 1852 + }, + { + "epoch": 0.13387035598822403, + "grad_norm": 8.066454486841245, + "learning_rate": 4.982148566724389e-06, + "loss": 0.9711, + "step": 1853 + }, + { + "epoch": 0.13394260118843354, + "grad_norm": 7.023433969600753, + "learning_rate": 4.9821136570580955e-06, + "loss": 1.0455, + "step": 1854 + }, + { + "epoch": 0.13401484638864306, + "grad_norm": 7.00264063483167, + "learning_rate": 4.982078713413594e-06, + "loss": 0.9653, + "step": 1855 + }, + { + "epoch": 0.13408709158885257, + "grad_norm": 7.266671841587476, + "learning_rate": 4.982043735791361e-06, + "loss": 1.039, + "step": 1856 + }, + { + "epoch": 0.13415933678906208, + "grad_norm": 8.308544164167973, + "learning_rate": 4.982008724191877e-06, + "loss": 0.9182, + "step": 1857 + }, + { + "epoch": 0.1342315819892716, + "grad_norm": 6.271385227727647, + "learning_rate": 4.981973678615621e-06, + "loss": 1.0054, + "step": 1858 + }, + { + "epoch": 0.1343038271894811, + "grad_norm": 7.878157936157476, + "learning_rate": 4.981938599063072e-06, + "loss": 0.9589, + "step": 1859 + }, + { + "epoch": 0.1343760723896906, + "grad_norm": 6.272217590372368, + "learning_rate": 4.981903485534711e-06, + "loss": 0.8277, + "step": 1860 + }, + { + "epoch": 0.1344483175899001, + "grad_norm": 7.65804671400462, + "learning_rate": 4.981868338031019e-06, + "loss": 0.9954, + "step": 1861 + }, + { + "epoch": 0.13452056279010963, + "grad_norm": 8.364577455308748, + "learning_rate": 4.9818331565524754e-06, + "loss": 1.0007, + "step": 1862 + }, + { + "epoch": 0.13459280799031914, + "grad_norm": 6.24127075949757, + "learning_rate": 4.981797941099564e-06, + "loss": 0.9143, + "step": 1863 + }, + { + "epoch": 0.13466505319052866, + "grad_norm": 7.3136945221602305, + "learning_rate": 4.981762691672765e-06, + "loss": 0.9479, + "step": 1864 + }, + { + "epoch": 0.13473729839073817, + "grad_norm": 6.63301724139244, + "learning_rate": 4.981727408272562e-06, + "loss": 0.9467, + "step": 1865 + }, + { + "epoch": 0.13480954359094768, + "grad_norm": 7.871754779935011, + "learning_rate": 4.981692090899438e-06, + "loss": 1.0075, + "step": 1866 + }, + { + "epoch": 0.1348817887911572, + "grad_norm": 8.099139094814241, + "learning_rate": 4.981656739553875e-06, + "loss": 0.9334, + "step": 1867 + }, + { + "epoch": 0.1349540339913667, + "grad_norm": 6.5070061705041065, + "learning_rate": 4.981621354236359e-06, + "loss": 0.93, + "step": 1868 + }, + { + "epoch": 0.1350262791915762, + "grad_norm": 6.548436381531759, + "learning_rate": 4.981585934947374e-06, + "loss": 1.0082, + "step": 1869 + }, + { + "epoch": 0.1350985243917857, + "grad_norm": 6.818821205445699, + "learning_rate": 4.981550481687403e-06, + "loss": 0.9383, + "step": 1870 + }, + { + "epoch": 0.13517076959199523, + "grad_norm": 7.515236953583424, + "learning_rate": 4.981514994456934e-06, + "loss": 0.9463, + "step": 1871 + }, + { + "epoch": 0.13524301479220474, + "grad_norm": 9.004295171914832, + "learning_rate": 4.981479473256451e-06, + "loss": 0.9027, + "step": 1872 + }, + { + "epoch": 0.13531525999241426, + "grad_norm": 8.295243097368129, + "learning_rate": 4.98144391808644e-06, + "loss": 0.9564, + "step": 1873 + }, + { + "epoch": 0.13538750519262377, + "grad_norm": 8.632041820803103, + "learning_rate": 4.98140832894739e-06, + "loss": 0.9663, + "step": 1874 + }, + { + "epoch": 0.13545975039283328, + "grad_norm": 7.098680841102655, + "learning_rate": 4.981372705839786e-06, + "loss": 0.9634, + "step": 1875 + }, + { + "epoch": 0.1355319955930428, + "grad_norm": 7.1180005409988665, + "learning_rate": 4.981337048764116e-06, + "loss": 0.9566, + "step": 1876 + }, + { + "epoch": 0.1356042407932523, + "grad_norm": 8.203065941007639, + "learning_rate": 4.981301357720869e-06, + "loss": 1.019, + "step": 1877 + }, + { + "epoch": 0.1356764859934618, + "grad_norm": 8.293973771089908, + "learning_rate": 4.981265632710533e-06, + "loss": 1.0432, + "step": 1878 + }, + { + "epoch": 0.1357487311936713, + "grad_norm": 7.9671545488477005, + "learning_rate": 4.981229873733596e-06, + "loss": 0.9333, + "step": 1879 + }, + { + "epoch": 0.13582097639388083, + "grad_norm": 6.504052586097733, + "learning_rate": 4.981194080790549e-06, + "loss": 1.008, + "step": 1880 + }, + { + "epoch": 0.13589322159409034, + "grad_norm": 8.211718434110688, + "learning_rate": 4.981158253881882e-06, + "loss": 0.9716, + "step": 1881 + }, + { + "epoch": 0.13596546679429986, + "grad_norm": 7.121392122966054, + "learning_rate": 4.981122393008086e-06, + "loss": 0.9849, + "step": 1882 + }, + { + "epoch": 0.13603771199450937, + "grad_norm": 7.934047398634527, + "learning_rate": 4.981086498169649e-06, + "loss": 0.9891, + "step": 1883 + }, + { + "epoch": 0.13610995719471888, + "grad_norm": 6.807781860478733, + "learning_rate": 4.981050569367065e-06, + "loss": 0.965, + "step": 1884 + }, + { + "epoch": 0.1361822023949284, + "grad_norm": 8.184343734279484, + "learning_rate": 4.981014606600825e-06, + "loss": 0.9641, + "step": 1885 + }, + { + "epoch": 0.13625444759513788, + "grad_norm": 7.132308691476261, + "learning_rate": 4.980978609871422e-06, + "loss": 1.001, + "step": 1886 + }, + { + "epoch": 0.1363266927953474, + "grad_norm": 6.252379002794063, + "learning_rate": 4.980942579179348e-06, + "loss": 0.973, + "step": 1887 + }, + { + "epoch": 0.1363989379955569, + "grad_norm": 8.814760175310624, + "learning_rate": 4.980906514525096e-06, + "loss": 1.0263, + "step": 1888 + }, + { + "epoch": 0.13647118319576643, + "grad_norm": 6.3333950374341335, + "learning_rate": 4.98087041590916e-06, + "loss": 1.0157, + "step": 1889 + }, + { + "epoch": 0.13654342839597594, + "grad_norm": 6.649876081416357, + "learning_rate": 4.980834283332034e-06, + "loss": 0.9543, + "step": 1890 + }, + { + "epoch": 0.13661567359618546, + "grad_norm": 7.647382809606821, + "learning_rate": 4.980798116794215e-06, + "loss": 0.8973, + "step": 1891 + }, + { + "epoch": 0.13668791879639497, + "grad_norm": 7.5949721118246725, + "learning_rate": 4.980761916296194e-06, + "loss": 0.9476, + "step": 1892 + }, + { + "epoch": 0.13676016399660448, + "grad_norm": 6.5417789846444325, + "learning_rate": 4.9807256818384685e-06, + "loss": 0.9794, + "step": 1893 + }, + { + "epoch": 0.136832409196814, + "grad_norm": 8.110599373942092, + "learning_rate": 4.980689413421535e-06, + "loss": 0.8384, + "step": 1894 + }, + { + "epoch": 0.13690465439702348, + "grad_norm": 8.197088552784306, + "learning_rate": 4.98065311104589e-06, + "loss": 1.0335, + "step": 1895 + }, + { + "epoch": 0.136976899597233, + "grad_norm": 7.430720982438222, + "learning_rate": 4.980616774712029e-06, + "loss": 0.9609, + "step": 1896 + }, + { + "epoch": 0.1370491447974425, + "grad_norm": 10.08370084066574, + "learning_rate": 4.980580404420452e-06, + "loss": 1.009, + "step": 1897 + }, + { + "epoch": 0.13712138999765203, + "grad_norm": 7.196620561323357, + "learning_rate": 4.980544000171654e-06, + "loss": 0.9277, + "step": 1898 + }, + { + "epoch": 0.13719363519786154, + "grad_norm": 9.059148339669504, + "learning_rate": 4.980507561966135e-06, + "loss": 1.0231, + "step": 1899 + }, + { + "epoch": 0.13726588039807106, + "grad_norm": 6.932703507989325, + "learning_rate": 4.980471089804394e-06, + "loss": 0.8983, + "step": 1900 + }, + { + "epoch": 0.13733812559828057, + "grad_norm": 7.6965442581762895, + "learning_rate": 4.98043458368693e-06, + "loss": 0.8802, + "step": 1901 + }, + { + "epoch": 0.13741037079849008, + "grad_norm": 7.402568690925938, + "learning_rate": 4.980398043614241e-06, + "loss": 1.0523, + "step": 1902 + }, + { + "epoch": 0.1374826159986996, + "grad_norm": 8.136136917357542, + "learning_rate": 4.98036146958683e-06, + "loss": 1.0597, + "step": 1903 + }, + { + "epoch": 0.13755486119890908, + "grad_norm": 8.350034939812753, + "learning_rate": 4.980324861605196e-06, + "loss": 0.9397, + "step": 1904 + }, + { + "epoch": 0.1376271063991186, + "grad_norm": 9.578397414050242, + "learning_rate": 4.98028821966984e-06, + "loss": 0.8744, + "step": 1905 + }, + { + "epoch": 0.1376993515993281, + "grad_norm": 12.843264679661319, + "learning_rate": 4.980251543781264e-06, + "loss": 0.9894, + "step": 1906 + }, + { + "epoch": 0.13777159679953763, + "grad_norm": 6.486260785652408, + "learning_rate": 4.98021483393997e-06, + "loss": 0.9647, + "step": 1907 + }, + { + "epoch": 0.13784384199974714, + "grad_norm": 6.305173342301133, + "learning_rate": 4.9801780901464614e-06, + "loss": 1.0046, + "step": 1908 + }, + { + "epoch": 0.13791608719995666, + "grad_norm": 7.0399961879026325, + "learning_rate": 4.980141312401241e-06, + "loss": 0.9935, + "step": 1909 + }, + { + "epoch": 0.13798833240016617, + "grad_norm": 8.628856736503485, + "learning_rate": 4.980104500704811e-06, + "loss": 0.9499, + "step": 1910 + }, + { + "epoch": 0.13806057760037568, + "grad_norm": 7.526063518206455, + "learning_rate": 4.980067655057676e-06, + "loss": 1.1008, + "step": 1911 + }, + { + "epoch": 0.1381328228005852, + "grad_norm": 7.954349925622949, + "learning_rate": 4.9800307754603405e-06, + "loss": 0.9098, + "step": 1912 + }, + { + "epoch": 0.13820506800079468, + "grad_norm": 8.00498616281108, + "learning_rate": 4.979993861913309e-06, + "loss": 1.0663, + "step": 1913 + }, + { + "epoch": 0.1382773132010042, + "grad_norm": 7.484966979377601, + "learning_rate": 4.9799569144170874e-06, + "loss": 1.0479, + "step": 1914 + }, + { + "epoch": 0.1383495584012137, + "grad_norm": 8.174648303011555, + "learning_rate": 4.9799199329721815e-06, + "loss": 0.972, + "step": 1915 + }, + { + "epoch": 0.13842180360142323, + "grad_norm": 6.30121260583668, + "learning_rate": 4.979882917579097e-06, + "loss": 0.9772, + "step": 1916 + }, + { + "epoch": 0.13849404880163274, + "grad_norm": 7.099793670235221, + "learning_rate": 4.979845868238341e-06, + "loss": 0.9574, + "step": 1917 + }, + { + "epoch": 0.13856629400184226, + "grad_norm": 8.459932924550326, + "learning_rate": 4.979808784950421e-06, + "loss": 1.0147, + "step": 1918 + }, + { + "epoch": 0.13863853920205177, + "grad_norm": 7.232188741010269, + "learning_rate": 4.979771667715844e-06, + "loss": 0.8707, + "step": 1919 + }, + { + "epoch": 0.13871078440226128, + "grad_norm": 7.763408014384066, + "learning_rate": 4.9797345165351175e-06, + "loss": 0.9813, + "step": 1920 + }, + { + "epoch": 0.1387830296024708, + "grad_norm": 7.55725066798048, + "learning_rate": 4.979697331408751e-06, + "loss": 1.0155, + "step": 1921 + }, + { + "epoch": 0.13885527480268028, + "grad_norm": 6.074841708409081, + "learning_rate": 4.979660112337253e-06, + "loss": 0.8682, + "step": 1922 + }, + { + "epoch": 0.1389275200028898, + "grad_norm": 6.936101274131583, + "learning_rate": 4.9796228593211345e-06, + "loss": 0.9624, + "step": 1923 + }, + { + "epoch": 0.1389997652030993, + "grad_norm": 9.665355549610714, + "learning_rate": 4.979585572360904e-06, + "loss": 1.0503, + "step": 1924 + }, + { + "epoch": 0.13907201040330883, + "grad_norm": 8.26321324027156, + "learning_rate": 4.979548251457073e-06, + "loss": 1.0759, + "step": 1925 + }, + { + "epoch": 0.13914425560351834, + "grad_norm": 8.637767243287975, + "learning_rate": 4.97951089661015e-06, + "loss": 1.0327, + "step": 1926 + }, + { + "epoch": 0.13921650080372786, + "grad_norm": 9.96669334314491, + "learning_rate": 4.979473507820649e-06, + "loss": 0.9892, + "step": 1927 + }, + { + "epoch": 0.13928874600393737, + "grad_norm": 9.241479609539544, + "learning_rate": 4.979436085089081e-06, + "loss": 1.0298, + "step": 1928 + }, + { + "epoch": 0.13936099120414688, + "grad_norm": 9.429006632549378, + "learning_rate": 4.979398628415958e-06, + "loss": 0.9719, + "step": 1929 + }, + { + "epoch": 0.1394332364043564, + "grad_norm": 6.8277956536168265, + "learning_rate": 4.979361137801793e-06, + "loss": 0.9816, + "step": 1930 + }, + { + "epoch": 0.13950548160456588, + "grad_norm": 9.537003578475757, + "learning_rate": 4.979323613247099e-06, + "loss": 0.9865, + "step": 1931 + }, + { + "epoch": 0.1395777268047754, + "grad_norm": 7.946498065969133, + "learning_rate": 4.97928605475239e-06, + "loss": 0.9273, + "step": 1932 + }, + { + "epoch": 0.1396499720049849, + "grad_norm": 8.450308565002228, + "learning_rate": 4.97924846231818e-06, + "loss": 0.898, + "step": 1933 + }, + { + "epoch": 0.13972221720519443, + "grad_norm": 6.726500968341535, + "learning_rate": 4.979210835944983e-06, + "loss": 0.9576, + "step": 1934 + }, + { + "epoch": 0.13979446240540394, + "grad_norm": 6.0839323332445145, + "learning_rate": 4.979173175633315e-06, + "loss": 0.9343, + "step": 1935 + }, + { + "epoch": 0.13986670760561346, + "grad_norm": 10.073858259835594, + "learning_rate": 4.979135481383691e-06, + "loss": 0.9344, + "step": 1936 + }, + { + "epoch": 0.13993895280582297, + "grad_norm": 6.747834882704826, + "learning_rate": 4.979097753196627e-06, + "loss": 0.9727, + "step": 1937 + }, + { + "epoch": 0.14001119800603248, + "grad_norm": 7.141442850968727, + "learning_rate": 4.97905999107264e-06, + "loss": 0.9768, + "step": 1938 + }, + { + "epoch": 0.140083443206242, + "grad_norm": 7.085238151989682, + "learning_rate": 4.979022195012247e-06, + "loss": 0.9354, + "step": 1939 + }, + { + "epoch": 0.14015568840645148, + "grad_norm": 8.100602360686336, + "learning_rate": 4.978984365015964e-06, + "loss": 1.0052, + "step": 1940 + }, + { + "epoch": 0.140227933606661, + "grad_norm": 6.562669733668084, + "learning_rate": 4.978946501084311e-06, + "loss": 0.8593, + "step": 1941 + }, + { + "epoch": 0.1403001788068705, + "grad_norm": 6.750206273070765, + "learning_rate": 4.978908603217805e-06, + "loss": 0.961, + "step": 1942 + }, + { + "epoch": 0.14037242400708003, + "grad_norm": 7.581780941981723, + "learning_rate": 4.978870671416964e-06, + "loss": 0.9959, + "step": 1943 + }, + { + "epoch": 0.14044466920728954, + "grad_norm": 7.127938718590148, + "learning_rate": 4.978832705682308e-06, + "loss": 0.9913, + "step": 1944 + }, + { + "epoch": 0.14051691440749906, + "grad_norm": 7.276536455490754, + "learning_rate": 4.978794706014359e-06, + "loss": 1.0336, + "step": 1945 + }, + { + "epoch": 0.14058915960770857, + "grad_norm": 8.443852498683954, + "learning_rate": 4.978756672413633e-06, + "loss": 1.0141, + "step": 1946 + }, + { + "epoch": 0.14066140480791808, + "grad_norm": 6.334006959661401, + "learning_rate": 4.978718604880654e-06, + "loss": 0.8607, + "step": 1947 + }, + { + "epoch": 0.1407336500081276, + "grad_norm": 6.509845026987249, + "learning_rate": 4.978680503415941e-06, + "loss": 0.9247, + "step": 1948 + }, + { + "epoch": 0.14080589520833708, + "grad_norm": 5.794641439751316, + "learning_rate": 4.9786423680200175e-06, + "loss": 0.9746, + "step": 1949 + }, + { + "epoch": 0.1408781404085466, + "grad_norm": 8.06221421792445, + "learning_rate": 4.978604198693404e-06, + "loss": 1.02, + "step": 1950 + }, + { + "epoch": 0.1409503856087561, + "grad_norm": 7.917374937264042, + "learning_rate": 4.978565995436624e-06, + "loss": 0.9701, + "step": 1951 + }, + { + "epoch": 0.14102263080896563, + "grad_norm": 10.047569808546601, + "learning_rate": 4.978527758250199e-06, + "loss": 1.0093, + "step": 1952 + }, + { + "epoch": 0.14109487600917514, + "grad_norm": 13.258095049839774, + "learning_rate": 4.978489487134655e-06, + "loss": 1.0906, + "step": 1953 + }, + { + "epoch": 0.14116712120938466, + "grad_norm": 6.351356493622605, + "learning_rate": 4.978451182090512e-06, + "loss": 0.9803, + "step": 1954 + }, + { + "epoch": 0.14123936640959417, + "grad_norm": 6.370174601271428, + "learning_rate": 4.978412843118299e-06, + "loss": 0.9931, + "step": 1955 + }, + { + "epoch": 0.14131161160980368, + "grad_norm": 6.8688773895752115, + "learning_rate": 4.9783744702185375e-06, + "loss": 0.952, + "step": 1956 + }, + { + "epoch": 0.1413838568100132, + "grad_norm": 7.353129399302764, + "learning_rate": 4.978336063391753e-06, + "loss": 0.9759, + "step": 1957 + }, + { + "epoch": 0.14145610201022268, + "grad_norm": 6.899449931899859, + "learning_rate": 4.9782976226384734e-06, + "loss": 0.9398, + "step": 1958 + }, + { + "epoch": 0.1415283472104322, + "grad_norm": 8.533605676517118, + "learning_rate": 4.978259147959224e-06, + "loss": 0.9212, + "step": 1959 + }, + { + "epoch": 0.1416005924106417, + "grad_norm": 6.89240461624555, + "learning_rate": 4.97822063935453e-06, + "loss": 0.9166, + "step": 1960 + }, + { + "epoch": 0.14167283761085123, + "grad_norm": 6.90733059675736, + "learning_rate": 4.97818209682492e-06, + "loss": 0.9576, + "step": 1961 + }, + { + "epoch": 0.14174508281106074, + "grad_norm": 6.894560297831868, + "learning_rate": 4.978143520370922e-06, + "loss": 0.9504, + "step": 1962 + }, + { + "epoch": 0.14181732801127026, + "grad_norm": 6.447623875327663, + "learning_rate": 4.978104909993062e-06, + "loss": 0.9383, + "step": 1963 + }, + { + "epoch": 0.14188957321147977, + "grad_norm": 8.429040355430537, + "learning_rate": 4.978066265691871e-06, + "loss": 0.9663, + "step": 1964 + }, + { + "epoch": 0.14196181841168928, + "grad_norm": 8.596448883733594, + "learning_rate": 4.978027587467876e-06, + "loss": 0.9876, + "step": 1965 + }, + { + "epoch": 0.1420340636118988, + "grad_norm": 7.776128715203353, + "learning_rate": 4.977988875321607e-06, + "loss": 1.0647, + "step": 1966 + }, + { + "epoch": 0.14210630881210828, + "grad_norm": 7.9739939470614996, + "learning_rate": 4.977950129253596e-06, + "loss": 0.9269, + "step": 1967 + }, + { + "epoch": 0.1421785540123178, + "grad_norm": 8.012556712028724, + "learning_rate": 4.977911349264371e-06, + "loss": 0.9091, + "step": 1968 + }, + { + "epoch": 0.1422507992125273, + "grad_norm": 9.001657121475867, + "learning_rate": 4.977872535354463e-06, + "loss": 0.9502, + "step": 1969 + }, + { + "epoch": 0.14232304441273683, + "grad_norm": 6.213089863887234, + "learning_rate": 4.977833687524405e-06, + "loss": 0.8442, + "step": 1970 + }, + { + "epoch": 0.14239528961294634, + "grad_norm": 7.286243478271791, + "learning_rate": 4.977794805774727e-06, + "loss": 0.9752, + "step": 1971 + }, + { + "epoch": 0.14246753481315585, + "grad_norm": 7.632026311705077, + "learning_rate": 4.977755890105963e-06, + "loss": 0.9571, + "step": 1972 + }, + { + "epoch": 0.14253978001336537, + "grad_norm": 7.279337750589555, + "learning_rate": 4.977716940518643e-06, + "loss": 0.9802, + "step": 1973 + }, + { + "epoch": 0.14261202521357488, + "grad_norm": 7.901812005172477, + "learning_rate": 4.977677957013303e-06, + "loss": 1.0341, + "step": 1974 + }, + { + "epoch": 0.1426842704137844, + "grad_norm": 9.222811770713301, + "learning_rate": 4.9776389395904755e-06, + "loss": 0.8298, + "step": 1975 + }, + { + "epoch": 0.14275651561399388, + "grad_norm": 9.082035870294524, + "learning_rate": 4.977599888250695e-06, + "loss": 0.9942, + "step": 1976 + }, + { + "epoch": 0.1428287608142034, + "grad_norm": 10.227809051939213, + "learning_rate": 4.977560802994496e-06, + "loss": 1.0392, + "step": 1977 + }, + { + "epoch": 0.1429010060144129, + "grad_norm": 6.746187334234351, + "learning_rate": 4.977521683822412e-06, + "loss": 0.9386, + "step": 1978 + }, + { + "epoch": 0.14297325121462243, + "grad_norm": 8.296379807344577, + "learning_rate": 4.977482530734981e-06, + "loss": 0.9537, + "step": 1979 + }, + { + "epoch": 0.14304549641483194, + "grad_norm": 9.001338011570011, + "learning_rate": 4.977443343732736e-06, + "loss": 0.9752, + "step": 1980 + }, + { + "epoch": 0.14311774161504145, + "grad_norm": 7.112692392224357, + "learning_rate": 4.977404122816217e-06, + "loss": 0.9272, + "step": 1981 + }, + { + "epoch": 0.14318998681525097, + "grad_norm": 7.142866254528228, + "learning_rate": 4.977364867985959e-06, + "loss": 0.9064, + "step": 1982 + }, + { + "epoch": 0.14326223201546048, + "grad_norm": 8.660591001012367, + "learning_rate": 4.977325579242499e-06, + "loss": 0.9828, + "step": 1983 + }, + { + "epoch": 0.14333447721567, + "grad_norm": 7.687207472283015, + "learning_rate": 4.977286256586375e-06, + "loss": 0.9757, + "step": 1984 + }, + { + "epoch": 0.14340672241587948, + "grad_norm": 8.071933166317962, + "learning_rate": 4.977246900018126e-06, + "loss": 1.0186, + "step": 1985 + }, + { + "epoch": 0.143478967616089, + "grad_norm": 7.491606529710173, + "learning_rate": 4.977207509538291e-06, + "loss": 1.0251, + "step": 1986 + }, + { + "epoch": 0.1435512128162985, + "grad_norm": 9.830501687489523, + "learning_rate": 4.977168085147408e-06, + "loss": 1.0394, + "step": 1987 + }, + { + "epoch": 0.14362345801650803, + "grad_norm": 9.644012301246901, + "learning_rate": 4.977128626846017e-06, + "loss": 1.0137, + "step": 1988 + }, + { + "epoch": 0.14369570321671754, + "grad_norm": 8.35691315238608, + "learning_rate": 4.97708913463466e-06, + "loss": 1.0104, + "step": 1989 + }, + { + "epoch": 0.14376794841692705, + "grad_norm": 8.168018255170399, + "learning_rate": 4.977049608513874e-06, + "loss": 0.9895, + "step": 1990 + }, + { + "epoch": 0.14384019361713657, + "grad_norm": 7.4670094014025965, + "learning_rate": 4.977010048484204e-06, + "loss": 0.9591, + "step": 1991 + }, + { + "epoch": 0.14391243881734608, + "grad_norm": 8.5831095348428, + "learning_rate": 4.976970454546189e-06, + "loss": 0.9746, + "step": 1992 + }, + { + "epoch": 0.14398468401755557, + "grad_norm": 6.30352144673075, + "learning_rate": 4.976930826700371e-06, + "loss": 0.9543, + "step": 1993 + }, + { + "epoch": 0.14405692921776508, + "grad_norm": 6.869699827963254, + "learning_rate": 4.976891164947294e-06, + "loss": 0.9689, + "step": 1994 + }, + { + "epoch": 0.1441291744179746, + "grad_norm": 8.552889365817043, + "learning_rate": 4.9768514692875e-06, + "loss": 0.9649, + "step": 1995 + }, + { + "epoch": 0.1442014196181841, + "grad_norm": 10.71305566902264, + "learning_rate": 4.976811739721532e-06, + "loss": 0.9741, + "step": 1996 + }, + { + "epoch": 0.14427366481839363, + "grad_norm": 7.86785661566156, + "learning_rate": 4.976771976249935e-06, + "loss": 0.9867, + "step": 1997 + }, + { + "epoch": 0.14434591001860314, + "grad_norm": 7.7834060317026825, + "learning_rate": 4.976732178873253e-06, + "loss": 1.1076, + "step": 1998 + }, + { + "epoch": 0.14441815521881265, + "grad_norm": 6.907045344474365, + "learning_rate": 4.97669234759203e-06, + "loss": 0.9609, + "step": 1999 + }, + { + "epoch": 0.14449040041902217, + "grad_norm": 7.623173479284311, + "learning_rate": 4.976652482406812e-06, + "loss": 1.0356, + "step": 2000 + }, + { + "epoch": 0.14456264561923168, + "grad_norm": 7.683748151568887, + "learning_rate": 4.976612583318144e-06, + "loss": 1.0453, + "step": 2001 + }, + { + "epoch": 0.14463489081944117, + "grad_norm": 6.875376188216335, + "learning_rate": 4.976572650326573e-06, + "loss": 0.8921, + "step": 2002 + }, + { + "epoch": 0.14470713601965068, + "grad_norm": 8.382349689431834, + "learning_rate": 4.9765326834326456e-06, + "loss": 1.0062, + "step": 2003 + }, + { + "epoch": 0.1447793812198602, + "grad_norm": 7.7386687103275795, + "learning_rate": 4.976492682636909e-06, + "loss": 1.0785, + "step": 2004 + }, + { + "epoch": 0.1448516264200697, + "grad_norm": 8.445705012993521, + "learning_rate": 4.97645264793991e-06, + "loss": 0.8895, + "step": 2005 + }, + { + "epoch": 0.14492387162027923, + "grad_norm": 6.852667863996332, + "learning_rate": 4.9764125793421966e-06, + "loss": 0.9158, + "step": 2006 + }, + { + "epoch": 0.14499611682048874, + "grad_norm": 6.87644305256546, + "learning_rate": 4.976372476844319e-06, + "loss": 0.9157, + "step": 2007 + }, + { + "epoch": 0.14506836202069825, + "grad_norm": 6.281506585341566, + "learning_rate": 4.9763323404468235e-06, + "loss": 0.925, + "step": 2008 + }, + { + "epoch": 0.14514060722090777, + "grad_norm": 8.041032940760443, + "learning_rate": 4.976292170150262e-06, + "loss": 1.0327, + "step": 2009 + }, + { + "epoch": 0.14521285242111728, + "grad_norm": 8.988007078336908, + "learning_rate": 4.976251965955183e-06, + "loss": 1.0029, + "step": 2010 + }, + { + "epoch": 0.14528509762132677, + "grad_norm": 8.914656463011664, + "learning_rate": 4.976211727862138e-06, + "loss": 0.9504, + "step": 2011 + }, + { + "epoch": 0.14535734282153628, + "grad_norm": 7.441999853292544, + "learning_rate": 4.976171455871676e-06, + "loss": 0.9529, + "step": 2012 + }, + { + "epoch": 0.1454295880217458, + "grad_norm": 8.02244851499723, + "learning_rate": 4.9761311499843504e-06, + "loss": 1.0321, + "step": 2013 + }, + { + "epoch": 0.1455018332219553, + "grad_norm": 9.230414611166877, + "learning_rate": 4.976090810200711e-06, + "loss": 0.9951, + "step": 2014 + }, + { + "epoch": 0.14557407842216483, + "grad_norm": 9.186139271974984, + "learning_rate": 4.976050436521311e-06, + "loss": 0.9925, + "step": 2015 + }, + { + "epoch": 0.14564632362237434, + "grad_norm": 10.493491562551515, + "learning_rate": 4.976010028946704e-06, + "loss": 0.9992, + "step": 2016 + }, + { + "epoch": 0.14571856882258385, + "grad_norm": 8.371319773546855, + "learning_rate": 4.975969587477441e-06, + "loss": 0.9789, + "step": 2017 + }, + { + "epoch": 0.14579081402279337, + "grad_norm": 8.007525242561144, + "learning_rate": 4.975929112114078e-06, + "loss": 0.9291, + "step": 2018 + }, + { + "epoch": 0.14586305922300288, + "grad_norm": 8.028621020646327, + "learning_rate": 4.975888602857168e-06, + "loss": 1.066, + "step": 2019 + }, + { + "epoch": 0.14593530442321237, + "grad_norm": 7.592880638813155, + "learning_rate": 4.975848059707265e-06, + "loss": 1.0512, + "step": 2020 + }, + { + "epoch": 0.14600754962342188, + "grad_norm": 8.410142188939709, + "learning_rate": 4.975807482664924e-06, + "loss": 0.9222, + "step": 2021 + }, + { + "epoch": 0.1460797948236314, + "grad_norm": 7.7821833652059045, + "learning_rate": 4.975766871730701e-06, + "loss": 0.9584, + "step": 2022 + }, + { + "epoch": 0.1461520400238409, + "grad_norm": 8.393222854206309, + "learning_rate": 4.975726226905152e-06, + "loss": 0.9873, + "step": 2023 + }, + { + "epoch": 0.14622428522405043, + "grad_norm": 6.294213728881769, + "learning_rate": 4.9756855481888334e-06, + "loss": 0.9207, + "step": 2024 + }, + { + "epoch": 0.14629653042425994, + "grad_norm": 6.872639476239193, + "learning_rate": 4.975644835582302e-06, + "loss": 0.871, + "step": 2025 + }, + { + "epoch": 0.14636877562446945, + "grad_norm": 7.514381099553454, + "learning_rate": 4.975604089086115e-06, + "loss": 1.0528, + "step": 2026 + }, + { + "epoch": 0.14644102082467897, + "grad_norm": 6.415594663701566, + "learning_rate": 4.975563308700829e-06, + "loss": 1.0123, + "step": 2027 + }, + { + "epoch": 0.14651326602488848, + "grad_norm": 8.107233429889554, + "learning_rate": 4.975522494427005e-06, + "loss": 0.9507, + "step": 2028 + }, + { + "epoch": 0.14658551122509797, + "grad_norm": 7.264042638930823, + "learning_rate": 4.9754816462652e-06, + "loss": 0.9847, + "step": 2029 + }, + { + "epoch": 0.14665775642530748, + "grad_norm": 6.637913623860439, + "learning_rate": 4.975440764215972e-06, + "loss": 0.9024, + "step": 2030 + }, + { + "epoch": 0.146730001625517, + "grad_norm": 7.36991474242049, + "learning_rate": 4.9753998482798835e-06, + "loss": 0.9248, + "step": 2031 + }, + { + "epoch": 0.1468022468257265, + "grad_norm": 8.80998637827276, + "learning_rate": 4.975358898457492e-06, + "loss": 0.972, + "step": 2032 + }, + { + "epoch": 0.14687449202593603, + "grad_norm": 8.070150368198354, + "learning_rate": 4.97531791474936e-06, + "loss": 0.9685, + "step": 2033 + }, + { + "epoch": 0.14694673722614554, + "grad_norm": 8.678428689095668, + "learning_rate": 4.975276897156047e-06, + "loss": 1.0179, + "step": 2034 + }, + { + "epoch": 0.14701898242635505, + "grad_norm": 7.693402017367224, + "learning_rate": 4.975235845678116e-06, + "loss": 0.9897, + "step": 2035 + }, + { + "epoch": 0.14709122762656457, + "grad_norm": 9.210095911510301, + "learning_rate": 4.975194760316128e-06, + "loss": 0.9616, + "step": 2036 + }, + { + "epoch": 0.14716347282677408, + "grad_norm": 8.013166559973097, + "learning_rate": 4.975153641070644e-06, + "loss": 0.9543, + "step": 2037 + }, + { + "epoch": 0.14723571802698357, + "grad_norm": 8.110364673472997, + "learning_rate": 4.975112487942231e-06, + "loss": 0.9713, + "step": 2038 + }, + { + "epoch": 0.14730796322719308, + "grad_norm": 7.829119529670197, + "learning_rate": 4.975071300931449e-06, + "loss": 0.9907, + "step": 2039 + }, + { + "epoch": 0.1473802084274026, + "grad_norm": 12.434680408256126, + "learning_rate": 4.9750300800388615e-06, + "loss": 1.0158, + "step": 2040 + }, + { + "epoch": 0.1474524536276121, + "grad_norm": 8.174305774947726, + "learning_rate": 4.974988825265035e-06, + "loss": 1.1059, + "step": 2041 + }, + { + "epoch": 0.14752469882782163, + "grad_norm": 7.367935935521642, + "learning_rate": 4.974947536610533e-06, + "loss": 0.8223, + "step": 2042 + }, + { + "epoch": 0.14759694402803114, + "grad_norm": 6.95705427950989, + "learning_rate": 4.974906214075921e-06, + "loss": 0.9371, + "step": 2043 + }, + { + "epoch": 0.14766918922824065, + "grad_norm": 8.27139605730595, + "learning_rate": 4.974864857661764e-06, + "loss": 0.9142, + "step": 2044 + }, + { + "epoch": 0.14774143442845017, + "grad_norm": 7.143179205037029, + "learning_rate": 4.9748234673686295e-06, + "loss": 0.963, + "step": 2045 + }, + { + "epoch": 0.14781367962865968, + "grad_norm": 9.18732561218426, + "learning_rate": 4.974782043197083e-06, + "loss": 1.0088, + "step": 2046 + }, + { + "epoch": 0.14788592482886917, + "grad_norm": 7.2984617415401045, + "learning_rate": 4.974740585147692e-06, + "loss": 0.8995, + "step": 2047 + }, + { + "epoch": 0.14795817002907868, + "grad_norm": 6.516220102188539, + "learning_rate": 4.974699093221024e-06, + "loss": 0.916, + "step": 2048 + }, + { + "epoch": 0.1480304152292882, + "grad_norm": 8.505722363635325, + "learning_rate": 4.9746575674176464e-06, + "loss": 0.9726, + "step": 2049 + }, + { + "epoch": 0.1481026604294977, + "grad_norm": 8.349263744673838, + "learning_rate": 4.974616007738128e-06, + "loss": 0.9698, + "step": 2050 + }, + { + "epoch": 0.14817490562970723, + "grad_norm": 8.633764706393793, + "learning_rate": 4.974574414183039e-06, + "loss": 0.9617, + "step": 2051 + }, + { + "epoch": 0.14824715082991674, + "grad_norm": 9.417675088095862, + "learning_rate": 4.974532786752947e-06, + "loss": 1.0048, + "step": 2052 + }, + { + "epoch": 0.14831939603012625, + "grad_norm": 7.594087318495076, + "learning_rate": 4.974491125448422e-06, + "loss": 1.0039, + "step": 2053 + }, + { + "epoch": 0.14839164123033577, + "grad_norm": 8.265322241619652, + "learning_rate": 4.974449430270035e-06, + "loss": 0.8842, + "step": 2054 + }, + { + "epoch": 0.14846388643054528, + "grad_norm": 7.978141008264639, + "learning_rate": 4.974407701218357e-06, + "loss": 0.9564, + "step": 2055 + }, + { + "epoch": 0.14853613163075477, + "grad_norm": 8.002973957416451, + "learning_rate": 4.974365938293959e-06, + "loss": 0.9387, + "step": 2056 + }, + { + "epoch": 0.14860837683096428, + "grad_norm": 7.460242625527556, + "learning_rate": 4.974324141497412e-06, + "loss": 1.0456, + "step": 2057 + }, + { + "epoch": 0.1486806220311738, + "grad_norm": 6.542006692330726, + "learning_rate": 4.974282310829288e-06, + "loss": 1.0484, + "step": 2058 + }, + { + "epoch": 0.1487528672313833, + "grad_norm": 8.135499244756021, + "learning_rate": 4.9742404462901614e-06, + "loss": 0.9664, + "step": 2059 + }, + { + "epoch": 0.14882511243159283, + "grad_norm": 8.383268914502928, + "learning_rate": 4.974198547880604e-06, + "loss": 0.901, + "step": 2060 + }, + { + "epoch": 0.14889735763180234, + "grad_norm": 9.288708910296137, + "learning_rate": 4.97415661560119e-06, + "loss": 1.0158, + "step": 2061 + }, + { + "epoch": 0.14896960283201185, + "grad_norm": 8.029628248528715, + "learning_rate": 4.974114649452492e-06, + "loss": 1.028, + "step": 2062 + }, + { + "epoch": 0.14904184803222137, + "grad_norm": 8.16060548443304, + "learning_rate": 4.974072649435087e-06, + "loss": 0.9343, + "step": 2063 + }, + { + "epoch": 0.14911409323243088, + "grad_norm": 8.224412019346115, + "learning_rate": 4.9740306155495464e-06, + "loss": 0.9674, + "step": 2064 + }, + { + "epoch": 0.14918633843264037, + "grad_norm": 10.153898090060775, + "learning_rate": 4.973988547796449e-06, + "loss": 1.0692, + "step": 2065 + }, + { + "epoch": 0.14925858363284988, + "grad_norm": 6.980648813342826, + "learning_rate": 4.973946446176368e-06, + "loss": 0.9394, + "step": 2066 + }, + { + "epoch": 0.1493308288330594, + "grad_norm": 8.81336094323401, + "learning_rate": 4.9739043106898835e-06, + "loss": 0.9696, + "step": 2067 + }, + { + "epoch": 0.1494030740332689, + "grad_norm": 8.094024771367563, + "learning_rate": 4.973862141337568e-06, + "loss": 0.8782, + "step": 2068 + }, + { + "epoch": 0.14947531923347843, + "grad_norm": 7.567386782265219, + "learning_rate": 4.973819938120001e-06, + "loss": 1.0072, + "step": 2069 + }, + { + "epoch": 0.14954756443368794, + "grad_norm": 7.7860421127956, + "learning_rate": 4.97377770103776e-06, + "loss": 0.9285, + "step": 2070 + }, + { + "epoch": 0.14961980963389745, + "grad_norm": 7.021156492257492, + "learning_rate": 4.973735430091422e-06, + "loss": 0.9885, + "step": 2071 + }, + { + "epoch": 0.14969205483410697, + "grad_norm": 8.31638377389518, + "learning_rate": 4.973693125281568e-06, + "loss": 1.1409, + "step": 2072 + }, + { + "epoch": 0.14976430003431648, + "grad_norm": 6.261790636284181, + "learning_rate": 4.973650786608776e-06, + "loss": 1.0005, + "step": 2073 + }, + { + "epoch": 0.14983654523452597, + "grad_norm": 8.342286399459411, + "learning_rate": 4.9736084140736245e-06, + "loss": 1.1011, + "step": 2074 + }, + { + "epoch": 0.14990879043473548, + "grad_norm": 8.470363336581904, + "learning_rate": 4.973566007676695e-06, + "loss": 1.0795, + "step": 2075 + }, + { + "epoch": 0.149981035634945, + "grad_norm": 6.297734582884932, + "learning_rate": 4.973523567418567e-06, + "loss": 0.9205, + "step": 2076 + }, + { + "epoch": 0.1500532808351545, + "grad_norm": 7.910789664068364, + "learning_rate": 4.9734810932998215e-06, + "loss": 0.9813, + "step": 2077 + }, + { + "epoch": 0.15012552603536403, + "grad_norm": 7.404217533701786, + "learning_rate": 4.973438585321041e-06, + "loss": 0.957, + "step": 2078 + }, + { + "epoch": 0.15019777123557354, + "grad_norm": 6.41520846092845, + "learning_rate": 4.973396043482807e-06, + "loss": 0.9213, + "step": 2079 + }, + { + "epoch": 0.15027001643578305, + "grad_norm": 6.742102913088306, + "learning_rate": 4.973353467785701e-06, + "loss": 0.9102, + "step": 2080 + }, + { + "epoch": 0.15034226163599257, + "grad_norm": 6.305727810217686, + "learning_rate": 4.973310858230307e-06, + "loss": 1.0018, + "step": 2081 + }, + { + "epoch": 0.15041450683620208, + "grad_norm": 8.122286299374327, + "learning_rate": 4.973268214817208e-06, + "loss": 0.9655, + "step": 2082 + }, + { + "epoch": 0.15048675203641157, + "grad_norm": 8.196560338300117, + "learning_rate": 4.973225537546987e-06, + "loss": 0.8959, + "step": 2083 + }, + { + "epoch": 0.15055899723662108, + "grad_norm": 7.378161512427523, + "learning_rate": 4.9731828264202286e-06, + "loss": 0.972, + "step": 2084 + }, + { + "epoch": 0.1506312424368306, + "grad_norm": 7.278688430434015, + "learning_rate": 4.973140081437518e-06, + "loss": 0.9137, + "step": 2085 + }, + { + "epoch": 0.1507034876370401, + "grad_norm": 7.283653026463583, + "learning_rate": 4.97309730259944e-06, + "loss": 0.9094, + "step": 2086 + }, + { + "epoch": 0.15077573283724963, + "grad_norm": 7.314627313513362, + "learning_rate": 4.97305448990658e-06, + "loss": 0.9486, + "step": 2087 + }, + { + "epoch": 0.15084797803745914, + "grad_norm": 9.95697684738963, + "learning_rate": 4.973011643359524e-06, + "loss": 0.9675, + "step": 2088 + }, + { + "epoch": 0.15092022323766865, + "grad_norm": 7.724409615384855, + "learning_rate": 4.972968762958859e-06, + "loss": 0.9852, + "step": 2089 + }, + { + "epoch": 0.15099246843787817, + "grad_norm": 8.068027228169981, + "learning_rate": 4.972925848705171e-06, + "loss": 0.9104, + "step": 2090 + }, + { + "epoch": 0.15106471363808768, + "grad_norm": 8.402848133251505, + "learning_rate": 4.97288290059905e-06, + "loss": 0.9969, + "step": 2091 + }, + { + "epoch": 0.15113695883829717, + "grad_norm": 8.171592576232223, + "learning_rate": 4.9728399186410805e-06, + "loss": 0.9265, + "step": 2092 + }, + { + "epoch": 0.15120920403850668, + "grad_norm": 6.436271772527476, + "learning_rate": 4.972796902831853e-06, + "loss": 0.9895, + "step": 2093 + }, + { + "epoch": 0.1512814492387162, + "grad_norm": 6.582082541064992, + "learning_rate": 4.9727538531719564e-06, + "loss": 0.9101, + "step": 2094 + }, + { + "epoch": 0.1513536944389257, + "grad_norm": 8.137036607149666, + "learning_rate": 4.972710769661979e-06, + "loss": 1.0104, + "step": 2095 + }, + { + "epoch": 0.15142593963913523, + "grad_norm": 7.097626420978184, + "learning_rate": 4.972667652302512e-06, + "loss": 0.9291, + "step": 2096 + }, + { + "epoch": 0.15149818483934474, + "grad_norm": 6.504598018297416, + "learning_rate": 4.972624501094143e-06, + "loss": 0.9062, + "step": 2097 + }, + { + "epoch": 0.15157043003955425, + "grad_norm": 6.15099005444671, + "learning_rate": 4.972581316037466e-06, + "loss": 0.8526, + "step": 2098 + }, + { + "epoch": 0.15164267523976377, + "grad_norm": 6.086498151426192, + "learning_rate": 4.97253809713307e-06, + "loss": 0.9256, + "step": 2099 + }, + { + "epoch": 0.15171492043997328, + "grad_norm": 8.501724068316324, + "learning_rate": 4.9724948443815474e-06, + "loss": 0.9512, + "step": 2100 + }, + { + "epoch": 0.15178716564018277, + "grad_norm": 8.063826577516757, + "learning_rate": 4.97245155778349e-06, + "loss": 0.926, + "step": 2101 + }, + { + "epoch": 0.15185941084039228, + "grad_norm": 7.329732299745844, + "learning_rate": 4.972408237339491e-06, + "loss": 0.9068, + "step": 2102 + }, + { + "epoch": 0.1519316560406018, + "grad_norm": 8.068300511540906, + "learning_rate": 4.9723648830501425e-06, + "loss": 1.0268, + "step": 2103 + }, + { + "epoch": 0.1520039012408113, + "grad_norm": 6.421945687529147, + "learning_rate": 4.972321494916038e-06, + "loss": 0.8591, + "step": 2104 + }, + { + "epoch": 0.15207614644102083, + "grad_norm": 8.027117070248362, + "learning_rate": 4.972278072937773e-06, + "loss": 1.0937, + "step": 2105 + }, + { + "epoch": 0.15214839164123034, + "grad_norm": 7.986875019486927, + "learning_rate": 4.9722346171159394e-06, + "loss": 1.0572, + "step": 2106 + }, + { + "epoch": 0.15222063684143985, + "grad_norm": 7.911139502932765, + "learning_rate": 4.972191127451135e-06, + "loss": 0.9851, + "step": 2107 + }, + { + "epoch": 0.15229288204164937, + "grad_norm": 9.332510321524815, + "learning_rate": 4.972147603943952e-06, + "loss": 1.0022, + "step": 2108 + }, + { + "epoch": 0.15236512724185886, + "grad_norm": 6.367906700658468, + "learning_rate": 4.972104046594989e-06, + "loss": 0.9249, + "step": 2109 + }, + { + "epoch": 0.15243737244206837, + "grad_norm": 6.543657846175371, + "learning_rate": 4.97206045540484e-06, + "loss": 0.9562, + "step": 2110 + }, + { + "epoch": 0.15250961764227788, + "grad_norm": 8.048988079656057, + "learning_rate": 4.972016830374103e-06, + "loss": 0.959, + "step": 2111 + }, + { + "epoch": 0.1525818628424874, + "grad_norm": 7.139517466591262, + "learning_rate": 4.971973171503376e-06, + "loss": 0.9301, + "step": 2112 + }, + { + "epoch": 0.1526541080426969, + "grad_norm": 7.992723732239974, + "learning_rate": 4.971929478793255e-06, + "loss": 1.0069, + "step": 2113 + }, + { + "epoch": 0.15272635324290643, + "grad_norm": 7.442218726150898, + "learning_rate": 4.971885752244339e-06, + "loss": 1.0689, + "step": 2114 + }, + { + "epoch": 0.15279859844311594, + "grad_norm": 7.258607161824333, + "learning_rate": 4.971841991857226e-06, + "loss": 0.9992, + "step": 2115 + }, + { + "epoch": 0.15287084364332545, + "grad_norm": 7.247498968781592, + "learning_rate": 4.971798197632516e-06, + "loss": 0.9095, + "step": 2116 + }, + { + "epoch": 0.15294308884353497, + "grad_norm": 7.328614826546618, + "learning_rate": 4.971754369570807e-06, + "loss": 0.8476, + "step": 2117 + }, + { + "epoch": 0.15301533404374446, + "grad_norm": 7.126498951426815, + "learning_rate": 4.9717105076727e-06, + "loss": 1.0288, + "step": 2118 + }, + { + "epoch": 0.15308757924395397, + "grad_norm": 8.45223729237307, + "learning_rate": 4.9716666119387966e-06, + "loss": 0.9852, + "step": 2119 + }, + { + "epoch": 0.15315982444416348, + "grad_norm": 9.7440151528427, + "learning_rate": 4.971622682369695e-06, + "loss": 0.9996, + "step": 2120 + }, + { + "epoch": 0.153232069644373, + "grad_norm": 7.479448516833764, + "learning_rate": 4.971578718965999e-06, + "loss": 0.9965, + "step": 2121 + }, + { + "epoch": 0.1533043148445825, + "grad_norm": 6.867740752031149, + "learning_rate": 4.971534721728308e-06, + "loss": 0.9139, + "step": 2122 + }, + { + "epoch": 0.15337656004479203, + "grad_norm": 7.767527786563693, + "learning_rate": 4.971490690657227e-06, + "loss": 1.0193, + "step": 2123 + }, + { + "epoch": 0.15344880524500154, + "grad_norm": 10.342617007435788, + "learning_rate": 4.971446625753357e-06, + "loss": 1.0003, + "step": 2124 + }, + { + "epoch": 0.15352105044521105, + "grad_norm": 7.36980060985728, + "learning_rate": 4.971402527017301e-06, + "loss": 0.9265, + "step": 2125 + }, + { + "epoch": 0.15359329564542057, + "grad_norm": 7.134124802268036, + "learning_rate": 4.971358394449664e-06, + "loss": 0.8922, + "step": 2126 + }, + { + "epoch": 0.15366554084563006, + "grad_norm": 6.446490185967356, + "learning_rate": 4.9713142280510495e-06, + "loss": 0.9051, + "step": 2127 + }, + { + "epoch": 0.15373778604583957, + "grad_norm": 9.853252172208961, + "learning_rate": 4.971270027822062e-06, + "loss": 0.9707, + "step": 2128 + }, + { + "epoch": 0.15381003124604908, + "grad_norm": 8.468645045907534, + "learning_rate": 4.971225793763307e-06, + "loss": 1.0475, + "step": 2129 + }, + { + "epoch": 0.1538822764462586, + "grad_norm": 6.134494507102114, + "learning_rate": 4.97118152587539e-06, + "loss": 0.9562, + "step": 2130 + }, + { + "epoch": 0.1539545216464681, + "grad_norm": 8.435647492670936, + "learning_rate": 4.971137224158915e-06, + "loss": 1.0174, + "step": 2131 + }, + { + "epoch": 0.15402676684667763, + "grad_norm": 9.573401924815572, + "learning_rate": 4.971092888614491e-06, + "loss": 0.8861, + "step": 2132 + }, + { + "epoch": 0.15409901204688714, + "grad_norm": 7.426331412911393, + "learning_rate": 4.971048519242724e-06, + "loss": 1.0222, + "step": 2133 + }, + { + "epoch": 0.15417125724709665, + "grad_norm": 7.526053887767615, + "learning_rate": 4.9710041160442215e-06, + "loss": 0.9749, + "step": 2134 + }, + { + "epoch": 0.15424350244730617, + "grad_norm": 7.526937809338173, + "learning_rate": 4.970959679019591e-06, + "loss": 0.9366, + "step": 2135 + }, + { + "epoch": 0.15431574764751566, + "grad_norm": 7.1043023089961075, + "learning_rate": 4.9709152081694416e-06, + "loss": 0.955, + "step": 2136 + }, + { + "epoch": 0.15438799284772517, + "grad_norm": 6.927091471349386, + "learning_rate": 4.970870703494381e-06, + "loss": 0.9887, + "step": 2137 + }, + { + "epoch": 0.15446023804793468, + "grad_norm": 7.023102104147654, + "learning_rate": 4.970826164995019e-06, + "loss": 1.0018, + "step": 2138 + }, + { + "epoch": 0.1545324832481442, + "grad_norm": 8.425184968904475, + "learning_rate": 4.970781592671966e-06, + "loss": 0.9707, + "step": 2139 + }, + { + "epoch": 0.1546047284483537, + "grad_norm": 7.091815432621685, + "learning_rate": 4.970736986525831e-06, + "loss": 0.9616, + "step": 2140 + }, + { + "epoch": 0.15467697364856323, + "grad_norm": 7.264027672198182, + "learning_rate": 4.9706923465572245e-06, + "loss": 0.9433, + "step": 2141 + }, + { + "epoch": 0.15474921884877274, + "grad_norm": 8.892065270119557, + "learning_rate": 4.970647672766759e-06, + "loss": 0.9391, + "step": 2142 + }, + { + "epoch": 0.15482146404898225, + "grad_norm": 7.1869734861501335, + "learning_rate": 4.970602965155045e-06, + "loss": 0.9582, + "step": 2143 + }, + { + "epoch": 0.15489370924919177, + "grad_norm": 7.858688658215047, + "learning_rate": 4.9705582237226945e-06, + "loss": 0.9691, + "step": 2144 + }, + { + "epoch": 0.15496595444940126, + "grad_norm": 6.981699591138837, + "learning_rate": 4.97051344847032e-06, + "loss": 1.0345, + "step": 2145 + }, + { + "epoch": 0.15503819964961077, + "grad_norm": 6.254855291825678, + "learning_rate": 4.9704686393985345e-06, + "loss": 0.962, + "step": 2146 + }, + { + "epoch": 0.15511044484982028, + "grad_norm": 9.001888500866428, + "learning_rate": 4.970423796507952e-06, + "loss": 1.0684, + "step": 2147 + }, + { + "epoch": 0.1551826900500298, + "grad_norm": 5.888803053576218, + "learning_rate": 4.970378919799186e-06, + "loss": 0.9136, + "step": 2148 + }, + { + "epoch": 0.1552549352502393, + "grad_norm": 8.220023517064519, + "learning_rate": 4.97033400927285e-06, + "loss": 0.9324, + "step": 2149 + }, + { + "epoch": 0.15532718045044883, + "grad_norm": 6.538117371213645, + "learning_rate": 4.97028906492956e-06, + "loss": 0.923, + "step": 2150 + }, + { + "epoch": 0.15539942565065834, + "grad_norm": 6.267629554848979, + "learning_rate": 4.970244086769931e-06, + "loss": 0.9571, + "step": 2151 + }, + { + "epoch": 0.15547167085086785, + "grad_norm": 8.14140096469697, + "learning_rate": 4.970199074794578e-06, + "loss": 1.0054, + "step": 2152 + }, + { + "epoch": 0.15554391605107737, + "grad_norm": 10.331564187812731, + "learning_rate": 4.9701540290041186e-06, + "loss": 0.9419, + "step": 2153 + }, + { + "epoch": 0.15561616125128686, + "grad_norm": 8.424106847130739, + "learning_rate": 4.970108949399167e-06, + "loss": 0.9581, + "step": 2154 + }, + { + "epoch": 0.15568840645149637, + "grad_norm": 6.1022233006852336, + "learning_rate": 4.9700638359803435e-06, + "loss": 0.924, + "step": 2155 + }, + { + "epoch": 0.15576065165170588, + "grad_norm": 8.548642716675161, + "learning_rate": 4.970018688748263e-06, + "loss": 0.9022, + "step": 2156 + }, + { + "epoch": 0.1558328968519154, + "grad_norm": 8.547037015003495, + "learning_rate": 4.9699735077035434e-06, + "loss": 0.8812, + "step": 2157 + }, + { + "epoch": 0.1559051420521249, + "grad_norm": 6.476885264122669, + "learning_rate": 4.969928292846806e-06, + "loss": 0.9184, + "step": 2158 + }, + { + "epoch": 0.15597738725233443, + "grad_norm": 7.393678965674729, + "learning_rate": 4.969883044178668e-06, + "loss": 0.9714, + "step": 2159 + }, + { + "epoch": 0.15604963245254394, + "grad_norm": 7.433237858089275, + "learning_rate": 4.969837761699747e-06, + "loss": 0.9857, + "step": 2160 + }, + { + "epoch": 0.15612187765275345, + "grad_norm": 7.3297872061241005, + "learning_rate": 4.969792445410667e-06, + "loss": 1.0262, + "step": 2161 + }, + { + "epoch": 0.15619412285296297, + "grad_norm": 6.920157811564914, + "learning_rate": 4.969747095312045e-06, + "loss": 0.9538, + "step": 2162 + }, + { + "epoch": 0.15626636805317246, + "grad_norm": 6.853750229758454, + "learning_rate": 4.969701711404503e-06, + "loss": 1.0342, + "step": 2163 + }, + { + "epoch": 0.15633861325338197, + "grad_norm": 6.453943676315341, + "learning_rate": 4.969656293688661e-06, + "loss": 0.9154, + "step": 2164 + }, + { + "epoch": 0.15641085845359148, + "grad_norm": 7.070430657679813, + "learning_rate": 4.969610842165144e-06, + "loss": 0.95, + "step": 2165 + }, + { + "epoch": 0.156483103653801, + "grad_norm": 7.23665732845979, + "learning_rate": 4.969565356834571e-06, + "loss": 0.9691, + "step": 2166 + }, + { + "epoch": 0.1565553488540105, + "grad_norm": 9.006487309693384, + "learning_rate": 4.969519837697566e-06, + "loss": 1.0342, + "step": 2167 + }, + { + "epoch": 0.15662759405422003, + "grad_norm": 6.481107398646698, + "learning_rate": 4.969474284754752e-06, + "loss": 0.9055, + "step": 2168 + }, + { + "epoch": 0.15669983925442954, + "grad_norm": 7.111441041657088, + "learning_rate": 4.9694286980067525e-06, + "loss": 0.9624, + "step": 2169 + }, + { + "epoch": 0.15677208445463905, + "grad_norm": 6.218311083462526, + "learning_rate": 4.9693830774541915e-06, + "loss": 0.8953, + "step": 2170 + }, + { + "epoch": 0.15684432965484857, + "grad_norm": 8.078065499357756, + "learning_rate": 4.969337423097693e-06, + "loss": 0.9723, + "step": 2171 + }, + { + "epoch": 0.15691657485505806, + "grad_norm": 7.381245441742215, + "learning_rate": 4.969291734937883e-06, + "loss": 0.9243, + "step": 2172 + }, + { + "epoch": 0.15698882005526757, + "grad_norm": 6.575147956372796, + "learning_rate": 4.969246012975386e-06, + "loss": 0.9082, + "step": 2173 + }, + { + "epoch": 0.15706106525547708, + "grad_norm": 6.667511346075748, + "learning_rate": 4.969200257210829e-06, + "loss": 0.9697, + "step": 2174 + }, + { + "epoch": 0.1571333104556866, + "grad_norm": 6.909054010009794, + "learning_rate": 4.969154467644838e-06, + "loss": 1.0061, + "step": 2175 + }, + { + "epoch": 0.1572055556558961, + "grad_norm": 11.41077139089867, + "learning_rate": 4.969108644278038e-06, + "loss": 1.0069, + "step": 2176 + }, + { + "epoch": 0.15727780085610563, + "grad_norm": 9.133131192979963, + "learning_rate": 4.969062787111059e-06, + "loss": 0.9582, + "step": 2177 + }, + { + "epoch": 0.15735004605631514, + "grad_norm": 7.36346542168546, + "learning_rate": 4.969016896144526e-06, + "loss": 0.9695, + "step": 2178 + }, + { + "epoch": 0.15742229125652465, + "grad_norm": 7.493635656201894, + "learning_rate": 4.968970971379071e-06, + "loss": 0.966, + "step": 2179 + }, + { + "epoch": 0.15749453645673417, + "grad_norm": 7.491132707700803, + "learning_rate": 4.968925012815319e-06, + "loss": 0.9827, + "step": 2180 + }, + { + "epoch": 0.15756678165694366, + "grad_norm": 10.303868606108733, + "learning_rate": 4.968879020453901e-06, + "loss": 0.8345, + "step": 2181 + }, + { + "epoch": 0.15763902685715317, + "grad_norm": 9.3128907806536, + "learning_rate": 4.968832994295446e-06, + "loss": 0.929, + "step": 2182 + }, + { + "epoch": 0.15771127205736268, + "grad_norm": 6.518036393837985, + "learning_rate": 4.968786934340584e-06, + "loss": 0.9485, + "step": 2183 + }, + { + "epoch": 0.1577835172575722, + "grad_norm": 8.71339432492419, + "learning_rate": 4.968740840589946e-06, + "loss": 0.9205, + "step": 2184 + }, + { + "epoch": 0.1578557624577817, + "grad_norm": 7.417847385789707, + "learning_rate": 4.968694713044163e-06, + "loss": 0.9645, + "step": 2185 + }, + { + "epoch": 0.15792800765799123, + "grad_norm": 7.963317215930995, + "learning_rate": 4.968648551703866e-06, + "loss": 1.0453, + "step": 2186 + }, + { + "epoch": 0.15800025285820074, + "grad_norm": 8.22419077130227, + "learning_rate": 4.968602356569687e-06, + "loss": 0.8779, + "step": 2187 + }, + { + "epoch": 0.15807249805841025, + "grad_norm": 7.252552503930297, + "learning_rate": 4.968556127642259e-06, + "loss": 1.0164, + "step": 2188 + }, + { + "epoch": 0.15814474325861977, + "grad_norm": 7.560931334677301, + "learning_rate": 4.968509864922214e-06, + "loss": 0.9608, + "step": 2189 + }, + { + "epoch": 0.15821698845882926, + "grad_norm": 7.348167097472866, + "learning_rate": 4.968463568410186e-06, + "loss": 1.0464, + "step": 2190 + }, + { + "epoch": 0.15828923365903877, + "grad_norm": 7.549573206680451, + "learning_rate": 4.968417238106807e-06, + "loss": 0.9951, + "step": 2191 + }, + { + "epoch": 0.15836147885924828, + "grad_norm": 7.674447741194426, + "learning_rate": 4.968370874012714e-06, + "loss": 0.9513, + "step": 2192 + }, + { + "epoch": 0.1584337240594578, + "grad_norm": 8.869903390523854, + "learning_rate": 4.9683244761285396e-06, + "loss": 0.9688, + "step": 2193 + }, + { + "epoch": 0.1585059692596673, + "grad_norm": 7.439147253901779, + "learning_rate": 4.968278044454921e-06, + "loss": 0.9641, + "step": 2194 + }, + { + "epoch": 0.15857821445987683, + "grad_norm": 9.252029196370291, + "learning_rate": 4.968231578992491e-06, + "loss": 1.1167, + "step": 2195 + }, + { + "epoch": 0.15865045966008634, + "grad_norm": 9.255994349419428, + "learning_rate": 4.968185079741887e-06, + "loss": 0.8801, + "step": 2196 + }, + { + "epoch": 0.15872270486029585, + "grad_norm": 7.4794956938928, + "learning_rate": 4.968138546703746e-06, + "loss": 1.0019, + "step": 2197 + }, + { + "epoch": 0.15879495006050537, + "grad_norm": 8.210898939163735, + "learning_rate": 4.968091979878705e-06, + "loss": 0.98, + "step": 2198 + }, + { + "epoch": 0.15886719526071486, + "grad_norm": 7.635838794011747, + "learning_rate": 4.968045379267401e-06, + "loss": 0.9799, + "step": 2199 + }, + { + "epoch": 0.15893944046092437, + "grad_norm": 6.803057925481097, + "learning_rate": 4.967998744870472e-06, + "loss": 0.9147, + "step": 2200 + }, + { + "epoch": 0.15901168566113388, + "grad_norm": 8.426114913566103, + "learning_rate": 4.967952076688557e-06, + "loss": 1.0388, + "step": 2201 + }, + { + "epoch": 0.1590839308613434, + "grad_norm": 7.635201056030916, + "learning_rate": 4.9679053747222935e-06, + "loss": 0.9328, + "step": 2202 + }, + { + "epoch": 0.1591561760615529, + "grad_norm": 8.09474818139003, + "learning_rate": 4.967858638972322e-06, + "loss": 1.0085, + "step": 2203 + }, + { + "epoch": 0.15922842126176243, + "grad_norm": 6.924223146347893, + "learning_rate": 4.967811869439282e-06, + "loss": 0.9207, + "step": 2204 + }, + { + "epoch": 0.15930066646197194, + "grad_norm": 6.670199538967964, + "learning_rate": 4.9677650661238136e-06, + "loss": 0.9592, + "step": 2205 + }, + { + "epoch": 0.15937291166218145, + "grad_norm": 7.700775840612638, + "learning_rate": 4.967718229026558e-06, + "loss": 0.9396, + "step": 2206 + }, + { + "epoch": 0.15944515686239097, + "grad_norm": 6.755253760528958, + "learning_rate": 4.967671358148155e-06, + "loss": 1.0001, + "step": 2207 + }, + { + "epoch": 0.15951740206260046, + "grad_norm": 7.765407628292601, + "learning_rate": 4.967624453489247e-06, + "loss": 0.9726, + "step": 2208 + }, + { + "epoch": 0.15958964726280997, + "grad_norm": 7.919375414640211, + "learning_rate": 4.967577515050477e-06, + "loss": 0.9096, + "step": 2209 + }, + { + "epoch": 0.15966189246301948, + "grad_norm": 7.938494282122092, + "learning_rate": 4.967530542832486e-06, + "loss": 1.0089, + "step": 2210 + }, + { + "epoch": 0.159734137663229, + "grad_norm": 7.653967941547974, + "learning_rate": 4.967483536835919e-06, + "loss": 0.9839, + "step": 2211 + }, + { + "epoch": 0.1598063828634385, + "grad_norm": 7.0199985932351465, + "learning_rate": 4.967436497061417e-06, + "loss": 1.0273, + "step": 2212 + }, + { + "epoch": 0.15987862806364803, + "grad_norm": 8.236165324378105, + "learning_rate": 4.967389423509626e-06, + "loss": 0.9903, + "step": 2213 + }, + { + "epoch": 0.15995087326385754, + "grad_norm": 7.561985723490583, + "learning_rate": 4.967342316181189e-06, + "loss": 0.9332, + "step": 2214 + }, + { + "epoch": 0.16002311846406705, + "grad_norm": 9.231979549973712, + "learning_rate": 4.967295175076752e-06, + "loss": 0.9594, + "step": 2215 + }, + { + "epoch": 0.16009536366427654, + "grad_norm": 7.092099707800283, + "learning_rate": 4.96724800019696e-06, + "loss": 0.8508, + "step": 2216 + }, + { + "epoch": 0.16016760886448606, + "grad_norm": 6.600999074514273, + "learning_rate": 4.967200791542458e-06, + "loss": 0.967, + "step": 2217 + }, + { + "epoch": 0.16023985406469557, + "grad_norm": 8.11132083546205, + "learning_rate": 4.967153549113893e-06, + "loss": 0.9161, + "step": 2218 + }, + { + "epoch": 0.16031209926490508, + "grad_norm": 9.423308154372625, + "learning_rate": 4.967106272911912e-06, + "loss": 0.919, + "step": 2219 + }, + { + "epoch": 0.1603843444651146, + "grad_norm": 6.612205963231584, + "learning_rate": 4.967058962937161e-06, + "loss": 0.9395, + "step": 2220 + }, + { + "epoch": 0.1604565896653241, + "grad_norm": 7.353361552170773, + "learning_rate": 4.9670116191902884e-06, + "loss": 0.929, + "step": 2221 + }, + { + "epoch": 0.16052883486553363, + "grad_norm": 7.465003953589014, + "learning_rate": 4.966964241671942e-06, + "loss": 0.9567, + "step": 2222 + }, + { + "epoch": 0.16060108006574314, + "grad_norm": 6.596220448601075, + "learning_rate": 4.966916830382771e-06, + "loss": 0.9251, + "step": 2223 + }, + { + "epoch": 0.16067332526595265, + "grad_norm": 7.402017792542993, + "learning_rate": 4.966869385323424e-06, + "loss": 0.9682, + "step": 2224 + }, + { + "epoch": 0.16074557046616214, + "grad_norm": 6.759498412354358, + "learning_rate": 4.966821906494551e-06, + "loss": 0.9441, + "step": 2225 + }, + { + "epoch": 0.16081781566637166, + "grad_norm": 6.43802973965723, + "learning_rate": 4.966774393896801e-06, + "loss": 0.8931, + "step": 2226 + }, + { + "epoch": 0.16089006086658117, + "grad_norm": 7.307968430958813, + "learning_rate": 4.966726847530825e-06, + "loss": 0.9392, + "step": 2227 + }, + { + "epoch": 0.16096230606679068, + "grad_norm": 7.941841440055598, + "learning_rate": 4.966679267397273e-06, + "loss": 0.9424, + "step": 2228 + }, + { + "epoch": 0.1610345512670002, + "grad_norm": 7.799603124449717, + "learning_rate": 4.966631653496799e-06, + "loss": 0.923, + "step": 2229 + }, + { + "epoch": 0.1611067964672097, + "grad_norm": 7.01283993908534, + "learning_rate": 4.966584005830051e-06, + "loss": 0.9971, + "step": 2230 + }, + { + "epoch": 0.16117904166741923, + "grad_norm": 7.981207949199741, + "learning_rate": 4.9665363243976835e-06, + "loss": 0.9284, + "step": 2231 + }, + { + "epoch": 0.16125128686762874, + "grad_norm": 9.532658707372775, + "learning_rate": 4.966488609200349e-06, + "loss": 0.9461, + "step": 2232 + }, + { + "epoch": 0.16132353206783825, + "grad_norm": 6.818032913893963, + "learning_rate": 4.966440860238701e-06, + "loss": 0.9396, + "step": 2233 + }, + { + "epoch": 0.16139577726804774, + "grad_norm": 7.490737281438407, + "learning_rate": 4.9663930775133915e-06, + "loss": 0.8997, + "step": 2234 + }, + { + "epoch": 0.16146802246825726, + "grad_norm": 7.622049980454144, + "learning_rate": 4.966345261025077e-06, + "loss": 0.9127, + "step": 2235 + }, + { + "epoch": 0.16154026766846677, + "grad_norm": 8.190883257773882, + "learning_rate": 4.966297410774411e-06, + "loss": 0.9711, + "step": 2236 + }, + { + "epoch": 0.16161251286867628, + "grad_norm": 8.629663505845468, + "learning_rate": 4.966249526762048e-06, + "loss": 0.9964, + "step": 2237 + }, + { + "epoch": 0.1616847580688858, + "grad_norm": 9.107128880594251, + "learning_rate": 4.966201608988643e-06, + "loss": 0.9493, + "step": 2238 + }, + { + "epoch": 0.1617570032690953, + "grad_norm": 7.515204467428405, + "learning_rate": 4.966153657454854e-06, + "loss": 0.9287, + "step": 2239 + }, + { + "epoch": 0.16182924846930483, + "grad_norm": 9.595314122038213, + "learning_rate": 4.966105672161335e-06, + "loss": 1.0362, + "step": 2240 + }, + { + "epoch": 0.16190149366951434, + "grad_norm": 6.806375532857933, + "learning_rate": 4.966057653108746e-06, + "loss": 0.9385, + "step": 2241 + }, + { + "epoch": 0.16197373886972385, + "grad_norm": 11.418848724857154, + "learning_rate": 4.966009600297742e-06, + "loss": 1.0434, + "step": 2242 + }, + { + "epoch": 0.16204598406993334, + "grad_norm": 13.391954745454335, + "learning_rate": 4.965961513728981e-06, + "loss": 0.9409, + "step": 2243 + }, + { + "epoch": 0.16211822927014286, + "grad_norm": 9.03022058025314, + "learning_rate": 4.9659133934031216e-06, + "loss": 1.0527, + "step": 2244 + }, + { + "epoch": 0.16219047447035237, + "grad_norm": 8.648146378907773, + "learning_rate": 4.965865239320822e-06, + "loss": 0.9877, + "step": 2245 + }, + { + "epoch": 0.16226271967056188, + "grad_norm": 10.639635485378188, + "learning_rate": 4.965817051482742e-06, + "loss": 1.0514, + "step": 2246 + }, + { + "epoch": 0.1623349648707714, + "grad_norm": 10.373800185353703, + "learning_rate": 4.965768829889542e-06, + "loss": 0.9354, + "step": 2247 + }, + { + "epoch": 0.1624072100709809, + "grad_norm": 6.849693469686224, + "learning_rate": 4.96572057454188e-06, + "loss": 0.9102, + "step": 2248 + }, + { + "epoch": 0.16247945527119043, + "grad_norm": 6.58367787035006, + "learning_rate": 4.965672285440418e-06, + "loss": 0.9794, + "step": 2249 + }, + { + "epoch": 0.16255170047139994, + "grad_norm": 7.271392042960663, + "learning_rate": 4.965623962585818e-06, + "loss": 0.9645, + "step": 2250 + }, + { + "epoch": 0.16262394567160945, + "grad_norm": 9.326244068765076, + "learning_rate": 4.96557560597874e-06, + "loss": 0.9534, + "step": 2251 + }, + { + "epoch": 0.16269619087181894, + "grad_norm": 7.65972323363378, + "learning_rate": 4.9655272156198455e-06, + "loss": 0.8919, + "step": 2252 + }, + { + "epoch": 0.16276843607202846, + "grad_norm": 6.777969058869204, + "learning_rate": 4.965478791509799e-06, + "loss": 0.934, + "step": 2253 + }, + { + "epoch": 0.16284068127223797, + "grad_norm": 8.376236553855934, + "learning_rate": 4.9654303336492615e-06, + "loss": 0.9281, + "step": 2254 + }, + { + "epoch": 0.16291292647244748, + "grad_norm": 6.45959650627853, + "learning_rate": 4.9653818420388965e-06, + "loss": 0.9116, + "step": 2255 + }, + { + "epoch": 0.162985171672657, + "grad_norm": 9.283356604098929, + "learning_rate": 4.965333316679369e-06, + "loss": 0.9369, + "step": 2256 + }, + { + "epoch": 0.1630574168728665, + "grad_norm": 9.91703022095427, + "learning_rate": 4.9652847575713426e-06, + "loss": 1.0058, + "step": 2257 + }, + { + "epoch": 0.16312966207307603, + "grad_norm": 9.85011807288647, + "learning_rate": 4.965236164715483e-06, + "loss": 0.91, + "step": 2258 + }, + { + "epoch": 0.16320190727328554, + "grad_norm": 7.908379188889698, + "learning_rate": 4.965187538112453e-06, + "loss": 0.9307, + "step": 2259 + }, + { + "epoch": 0.16327415247349505, + "grad_norm": 7.339581284390532, + "learning_rate": 4.96513887776292e-06, + "loss": 0.9227, + "step": 2260 + }, + { + "epoch": 0.16334639767370454, + "grad_norm": 9.29959420580157, + "learning_rate": 4.965090183667549e-06, + "loss": 0.936, + "step": 2261 + }, + { + "epoch": 0.16341864287391406, + "grad_norm": 9.993330829697241, + "learning_rate": 4.965041455827009e-06, + "loss": 0.9082, + "step": 2262 + }, + { + "epoch": 0.16349088807412357, + "grad_norm": 7.106571122482099, + "learning_rate": 4.964992694241965e-06, + "loss": 0.9569, + "step": 2263 + }, + { + "epoch": 0.16356313327433308, + "grad_norm": 8.852935442122167, + "learning_rate": 4.964943898913084e-06, + "loss": 1.0164, + "step": 2264 + }, + { + "epoch": 0.1636353784745426, + "grad_norm": 6.651218572268825, + "learning_rate": 4.964895069841036e-06, + "loss": 0.9463, + "step": 2265 + }, + { + "epoch": 0.1637076236747521, + "grad_norm": 8.893493725856867, + "learning_rate": 4.964846207026488e-06, + "loss": 0.9311, + "step": 2266 + }, + { + "epoch": 0.16377986887496163, + "grad_norm": 6.808067069416964, + "learning_rate": 4.964797310470109e-06, + "loss": 0.9036, + "step": 2267 + }, + { + "epoch": 0.16385211407517114, + "grad_norm": 8.14908726735796, + "learning_rate": 4.964748380172569e-06, + "loss": 0.9331, + "step": 2268 + }, + { + "epoch": 0.16392435927538065, + "grad_norm": 8.134552490080027, + "learning_rate": 4.964699416134537e-06, + "loss": 0.9101, + "step": 2269 + }, + { + "epoch": 0.16399660447559014, + "grad_norm": 7.89845754317128, + "learning_rate": 4.964650418356684e-06, + "loss": 0.9805, + "step": 2270 + }, + { + "epoch": 0.16406884967579966, + "grad_norm": 6.024073941935819, + "learning_rate": 4.964601386839681e-06, + "loss": 0.8884, + "step": 2271 + }, + { + "epoch": 0.16414109487600917, + "grad_norm": 8.534652087512578, + "learning_rate": 4.964552321584198e-06, + "loss": 0.9446, + "step": 2272 + }, + { + "epoch": 0.16421334007621868, + "grad_norm": 11.526483972299266, + "learning_rate": 4.964503222590908e-06, + "loss": 1.0255, + "step": 2273 + }, + { + "epoch": 0.1642855852764282, + "grad_norm": 10.22600183778713, + "learning_rate": 4.964454089860481e-06, + "loss": 0.9966, + "step": 2274 + }, + { + "epoch": 0.1643578304766377, + "grad_norm": 6.111236054655609, + "learning_rate": 4.964404923393592e-06, + "loss": 0.9485, + "step": 2275 + }, + { + "epoch": 0.16443007567684723, + "grad_norm": 7.49657972548591, + "learning_rate": 4.9643557231909135e-06, + "loss": 0.9312, + "step": 2276 + }, + { + "epoch": 0.16450232087705674, + "grad_norm": 9.31599022918097, + "learning_rate": 4.964306489253118e-06, + "loss": 1.0629, + "step": 2277 + }, + { + "epoch": 0.16457456607726625, + "grad_norm": 9.197328046812487, + "learning_rate": 4.9642572215808806e-06, + "loss": 1.0107, + "step": 2278 + }, + { + "epoch": 0.16464681127747574, + "grad_norm": 6.77711353625574, + "learning_rate": 4.964207920174874e-06, + "loss": 0.9206, + "step": 2279 + }, + { + "epoch": 0.16471905647768526, + "grad_norm": 6.615515775864561, + "learning_rate": 4.964158585035775e-06, + "loss": 0.9454, + "step": 2280 + }, + { + "epoch": 0.16479130167789477, + "grad_norm": 6.254801012467655, + "learning_rate": 4.964109216164259e-06, + "loss": 0.8958, + "step": 2281 + }, + { + "epoch": 0.16486354687810428, + "grad_norm": 9.474596289737923, + "learning_rate": 4.964059813561e-06, + "loss": 1.0003, + "step": 2282 + }, + { + "epoch": 0.1649357920783138, + "grad_norm": 7.869198312024841, + "learning_rate": 4.964010377226675e-06, + "loss": 1.0007, + "step": 2283 + }, + { + "epoch": 0.1650080372785233, + "grad_norm": 7.309971559732744, + "learning_rate": 4.963960907161963e-06, + "loss": 0.777, + "step": 2284 + }, + { + "epoch": 0.16508028247873283, + "grad_norm": 8.00125159963393, + "learning_rate": 4.963911403367539e-06, + "loss": 0.989, + "step": 2285 + }, + { + "epoch": 0.16515252767894234, + "grad_norm": 7.472553641317223, + "learning_rate": 4.963861865844079e-06, + "loss": 1.0216, + "step": 2286 + }, + { + "epoch": 0.16522477287915185, + "grad_norm": 7.186534783610591, + "learning_rate": 4.9638122945922655e-06, + "loss": 0.984, + "step": 2287 + }, + { + "epoch": 0.16529701807936134, + "grad_norm": 9.922434008612678, + "learning_rate": 4.963762689612773e-06, + "loss": 1.0522, + "step": 2288 + }, + { + "epoch": 0.16536926327957086, + "grad_norm": 6.667067070698727, + "learning_rate": 4.963713050906282e-06, + "loss": 0.9034, + "step": 2289 + }, + { + "epoch": 0.16544150847978037, + "grad_norm": 7.921477479008537, + "learning_rate": 4.963663378473474e-06, + "loss": 0.9879, + "step": 2290 + }, + { + "epoch": 0.16551375367998988, + "grad_norm": 7.971540134623918, + "learning_rate": 4.963613672315027e-06, + "loss": 0.9887, + "step": 2291 + }, + { + "epoch": 0.1655859988801994, + "grad_norm": 8.541591985112419, + "learning_rate": 4.96356393243162e-06, + "loss": 1.053, + "step": 2292 + }, + { + "epoch": 0.1656582440804089, + "grad_norm": 8.295953099529303, + "learning_rate": 4.963514158823937e-06, + "loss": 1.0347, + "step": 2293 + }, + { + "epoch": 0.16573048928061843, + "grad_norm": 7.977736727586198, + "learning_rate": 4.963464351492657e-06, + "loss": 0.8912, + "step": 2294 + }, + { + "epoch": 0.16580273448082794, + "grad_norm": 7.323779153474468, + "learning_rate": 4.963414510438464e-06, + "loss": 0.9312, + "step": 2295 + }, + { + "epoch": 0.16587497968103745, + "grad_norm": 8.658517038001559, + "learning_rate": 4.963364635662039e-06, + "loss": 0.9956, + "step": 2296 + }, + { + "epoch": 0.16594722488124694, + "grad_norm": 8.215776609093467, + "learning_rate": 4.963314727164064e-06, + "loss": 0.8889, + "step": 2297 + }, + { + "epoch": 0.16601947008145646, + "grad_norm": 9.850169967501671, + "learning_rate": 4.963264784945223e-06, + "loss": 1.1332, + "step": 2298 + }, + { + "epoch": 0.16609171528166597, + "grad_norm": 7.4193684089611445, + "learning_rate": 4.9632148090062e-06, + "loss": 0.8875, + "step": 2299 + }, + { + "epoch": 0.16616396048187548, + "grad_norm": 7.684914751243941, + "learning_rate": 4.963164799347679e-06, + "loss": 1.0172, + "step": 2300 + }, + { + "epoch": 0.166236205682085, + "grad_norm": 8.460538930478402, + "learning_rate": 4.963114755970344e-06, + "loss": 1.0498, + "step": 2301 + }, + { + "epoch": 0.1663084508822945, + "grad_norm": 5.723664720228626, + "learning_rate": 4.963064678874882e-06, + "loss": 0.8993, + "step": 2302 + }, + { + "epoch": 0.16638069608250403, + "grad_norm": 7.385329948601283, + "learning_rate": 4.963014568061975e-06, + "loss": 0.97, + "step": 2303 + }, + { + "epoch": 0.16645294128271354, + "grad_norm": 8.749839127287883, + "learning_rate": 4.962964423532312e-06, + "loss": 0.9917, + "step": 2304 + }, + { + "epoch": 0.16652518648292305, + "grad_norm": 7.962687261705961, + "learning_rate": 4.962914245286578e-06, + "loss": 1.0192, + "step": 2305 + }, + { + "epoch": 0.16659743168313254, + "grad_norm": 8.593359810033968, + "learning_rate": 4.96286403332546e-06, + "loss": 1.0406, + "step": 2306 + }, + { + "epoch": 0.16666967688334205, + "grad_norm": 10.18671961878783, + "learning_rate": 4.962813787649647e-06, + "loss": 1.0022, + "step": 2307 + }, + { + "epoch": 0.16674192208355157, + "grad_norm": 7.162362365998053, + "learning_rate": 4.962763508259824e-06, + "loss": 1.0422, + "step": 2308 + }, + { + "epoch": 0.16681416728376108, + "grad_norm": 7.075484770756767, + "learning_rate": 4.962713195156681e-06, + "loss": 0.9428, + "step": 2309 + }, + { + "epoch": 0.1668864124839706, + "grad_norm": 7.705494235833093, + "learning_rate": 4.962662848340908e-06, + "loss": 1.0138, + "step": 2310 + }, + { + "epoch": 0.1669586576841801, + "grad_norm": 8.292951730287529, + "learning_rate": 4.962612467813192e-06, + "loss": 0.9569, + "step": 2311 + }, + { + "epoch": 0.16703090288438963, + "grad_norm": 7.761841372254209, + "learning_rate": 4.962562053574222e-06, + "loss": 0.9799, + "step": 2312 + }, + { + "epoch": 0.16710314808459914, + "grad_norm": 6.719427030296446, + "learning_rate": 4.962511605624691e-06, + "loss": 0.9365, + "step": 2313 + }, + { + "epoch": 0.16717539328480865, + "grad_norm": 6.758786734327189, + "learning_rate": 4.962461123965287e-06, + "loss": 1.0343, + "step": 2314 + }, + { + "epoch": 0.16724763848501814, + "grad_norm": 7.480843302062259, + "learning_rate": 4.962410608596704e-06, + "loss": 1.0329, + "step": 2315 + }, + { + "epoch": 0.16731988368522765, + "grad_norm": 7.163557960440321, + "learning_rate": 4.96236005951963e-06, + "loss": 0.9821, + "step": 2316 + }, + { + "epoch": 0.16739212888543717, + "grad_norm": 8.512209930183102, + "learning_rate": 4.9623094767347596e-06, + "loss": 0.9278, + "step": 2317 + }, + { + "epoch": 0.16746437408564668, + "grad_norm": 8.755735860647338, + "learning_rate": 4.962258860242784e-06, + "loss": 0.9823, + "step": 2318 + }, + { + "epoch": 0.1675366192858562, + "grad_norm": 6.315210950020729, + "learning_rate": 4.962208210044397e-06, + "loss": 0.9716, + "step": 2319 + }, + { + "epoch": 0.1676088644860657, + "grad_norm": 7.324433851008075, + "learning_rate": 4.962157526140291e-06, + "loss": 0.9614, + "step": 2320 + }, + { + "epoch": 0.16768110968627523, + "grad_norm": 7.1333529047237025, + "learning_rate": 4.9621068085311596e-06, + "loss": 0.9757, + "step": 2321 + }, + { + "epoch": 0.16775335488648474, + "grad_norm": 8.06567083890379, + "learning_rate": 4.962056057217698e-06, + "loss": 0.9281, + "step": 2322 + }, + { + "epoch": 0.16782560008669423, + "grad_norm": 7.664963353337563, + "learning_rate": 4.962005272200601e-06, + "loss": 1.0151, + "step": 2323 + }, + { + "epoch": 0.16789784528690374, + "grad_norm": 6.771286073855848, + "learning_rate": 4.961954453480563e-06, + "loss": 0.91, + "step": 2324 + }, + { + "epoch": 0.16797009048711325, + "grad_norm": 6.737767190536283, + "learning_rate": 4.961903601058281e-06, + "loss": 0.9174, + "step": 2325 + }, + { + "epoch": 0.16804233568732277, + "grad_norm": 7.849418871319815, + "learning_rate": 4.961852714934449e-06, + "loss": 1.0418, + "step": 2326 + }, + { + "epoch": 0.16811458088753228, + "grad_norm": 9.688339098614689, + "learning_rate": 4.961801795109766e-06, + "loss": 1.0213, + "step": 2327 + }, + { + "epoch": 0.1681868260877418, + "grad_norm": 9.16709949743106, + "learning_rate": 4.961750841584927e-06, + "loss": 1.0579, + "step": 2328 + }, + { + "epoch": 0.1682590712879513, + "grad_norm": 7.252657173186758, + "learning_rate": 4.961699854360631e-06, + "loss": 0.8674, + "step": 2329 + }, + { + "epoch": 0.16833131648816083, + "grad_norm": 9.154763787897918, + "learning_rate": 4.961648833437575e-06, + "loss": 1.0379, + "step": 2330 + }, + { + "epoch": 0.16840356168837034, + "grad_norm": 6.512203791173418, + "learning_rate": 4.961597778816458e-06, + "loss": 0.9443, + "step": 2331 + }, + { + "epoch": 0.16847580688857983, + "grad_norm": 6.922582310345313, + "learning_rate": 4.961546690497979e-06, + "loss": 0.994, + "step": 2332 + }, + { + "epoch": 0.16854805208878934, + "grad_norm": 8.877335362026312, + "learning_rate": 4.961495568482837e-06, + "loss": 0.972, + "step": 2333 + }, + { + "epoch": 0.16862029728899885, + "grad_norm": 9.28285198322061, + "learning_rate": 4.961444412771731e-06, + "loss": 1.0168, + "step": 2334 + }, + { + "epoch": 0.16869254248920837, + "grad_norm": 6.424545735134277, + "learning_rate": 4.961393223365363e-06, + "loss": 0.9555, + "step": 2335 + }, + { + "epoch": 0.16876478768941788, + "grad_norm": 6.810472939267444, + "learning_rate": 4.961342000264433e-06, + "loss": 0.9976, + "step": 2336 + }, + { + "epoch": 0.1688370328896274, + "grad_norm": 9.25086847300743, + "learning_rate": 4.961290743469642e-06, + "loss": 1.0618, + "step": 2337 + }, + { + "epoch": 0.1689092780898369, + "grad_norm": 10.129513711658026, + "learning_rate": 4.961239452981691e-06, + "loss": 0.9458, + "step": 2338 + }, + { + "epoch": 0.16898152329004643, + "grad_norm": 8.826768510966593, + "learning_rate": 4.961188128801284e-06, + "loss": 0.9626, + "step": 2339 + }, + { + "epoch": 0.16905376849025594, + "grad_norm": 6.625496611785656, + "learning_rate": 4.961136770929122e-06, + "loss": 0.8931, + "step": 2340 + }, + { + "epoch": 0.16912601369046543, + "grad_norm": 6.81427958631854, + "learning_rate": 4.961085379365908e-06, + "loss": 0.9353, + "step": 2341 + }, + { + "epoch": 0.16919825889067494, + "grad_norm": 12.066487814348802, + "learning_rate": 4.961033954112348e-06, + "loss": 0.9445, + "step": 2342 + }, + { + "epoch": 0.16927050409088445, + "grad_norm": 7.8443766187858115, + "learning_rate": 4.9609824951691425e-06, + "loss": 0.9454, + "step": 2343 + }, + { + "epoch": 0.16934274929109397, + "grad_norm": 8.189499472576339, + "learning_rate": 4.9609310025369975e-06, + "loss": 0.9581, + "step": 2344 + }, + { + "epoch": 0.16941499449130348, + "grad_norm": 6.6372305777659735, + "learning_rate": 4.9608794762166176e-06, + "loss": 0.9983, + "step": 2345 + }, + { + "epoch": 0.169487239691513, + "grad_norm": 10.623420508172357, + "learning_rate": 4.960827916208709e-06, + "loss": 0.9396, + "step": 2346 + }, + { + "epoch": 0.1695594848917225, + "grad_norm": 7.694253823512373, + "learning_rate": 4.960776322513977e-06, + "loss": 0.9735, + "step": 2347 + }, + { + "epoch": 0.16963173009193203, + "grad_norm": 9.18216888005208, + "learning_rate": 4.9607246951331274e-06, + "loss": 0.964, + "step": 2348 + }, + { + "epoch": 0.16970397529214154, + "grad_norm": 8.07201114293096, + "learning_rate": 4.960673034066868e-06, + "loss": 1.0112, + "step": 2349 + }, + { + "epoch": 0.16977622049235103, + "grad_norm": 28.70753168574897, + "learning_rate": 4.960621339315904e-06, + "loss": 1.0382, + "step": 2350 + }, + { + "epoch": 0.16984846569256054, + "grad_norm": 9.043578711879979, + "learning_rate": 4.9605696108809465e-06, + "loss": 0.8868, + "step": 2351 + }, + { + "epoch": 0.16992071089277005, + "grad_norm": 8.774663117726506, + "learning_rate": 4.9605178487627e-06, + "loss": 0.8521, + "step": 2352 + }, + { + "epoch": 0.16999295609297957, + "grad_norm": 8.230389957204716, + "learning_rate": 4.960466052961876e-06, + "loss": 0.9756, + "step": 2353 + }, + { + "epoch": 0.17006520129318908, + "grad_norm": 6.880761801217064, + "learning_rate": 4.9604142234791805e-06, + "loss": 1.0196, + "step": 2354 + }, + { + "epoch": 0.1701374464933986, + "grad_norm": 6.582485611172201, + "learning_rate": 4.960362360315325e-06, + "loss": 1.0548, + "step": 2355 + }, + { + "epoch": 0.1702096916936081, + "grad_norm": 8.617683292627506, + "learning_rate": 4.96031046347102e-06, + "loss": 1.0204, + "step": 2356 + }, + { + "epoch": 0.17028193689381763, + "grad_norm": 7.65770523877263, + "learning_rate": 4.960258532946974e-06, + "loss": 1.0347, + "step": 2357 + }, + { + "epoch": 0.17035418209402714, + "grad_norm": 6.382500518308709, + "learning_rate": 4.9602065687438996e-06, + "loss": 0.9056, + "step": 2358 + }, + { + "epoch": 0.17042642729423663, + "grad_norm": 8.15144340979473, + "learning_rate": 4.960154570862508e-06, + "loss": 0.8964, + "step": 2359 + }, + { + "epoch": 0.17049867249444614, + "grad_norm": 14.097800203615268, + "learning_rate": 4.96010253930351e-06, + "loss": 0.9179, + "step": 2360 + }, + { + "epoch": 0.17057091769465565, + "grad_norm": 6.211945918528795, + "learning_rate": 4.960050474067618e-06, + "loss": 0.9805, + "step": 2361 + }, + { + "epoch": 0.17064316289486517, + "grad_norm": 8.615556937894246, + "learning_rate": 4.959998375155545e-06, + "loss": 1.0204, + "step": 2362 + }, + { + "epoch": 0.17071540809507468, + "grad_norm": 7.719567383024432, + "learning_rate": 4.9599462425680054e-06, + "loss": 0.8515, + "step": 2363 + }, + { + "epoch": 0.1707876532952842, + "grad_norm": 6.500419309736301, + "learning_rate": 4.959894076305711e-06, + "loss": 0.9256, + "step": 2364 + }, + { + "epoch": 0.1708598984954937, + "grad_norm": 7.083165544504757, + "learning_rate": 4.959841876369377e-06, + "loss": 0.983, + "step": 2365 + }, + { + "epoch": 0.17093214369570323, + "grad_norm": 7.494123827189931, + "learning_rate": 4.959789642759717e-06, + "loss": 1.0266, + "step": 2366 + }, + { + "epoch": 0.17100438889591274, + "grad_norm": 8.150122200453453, + "learning_rate": 4.959737375477447e-06, + "loss": 1.0181, + "step": 2367 + }, + { + "epoch": 0.17107663409612223, + "grad_norm": 7.64232231275489, + "learning_rate": 4.9596850745232825e-06, + "loss": 0.9891, + "step": 2368 + }, + { + "epoch": 0.17114887929633174, + "grad_norm": 6.345640004302874, + "learning_rate": 4.959632739897939e-06, + "loss": 0.9358, + "step": 2369 + }, + { + "epoch": 0.17122112449654125, + "grad_norm": 7.149639691430972, + "learning_rate": 4.959580371602133e-06, + "loss": 0.9115, + "step": 2370 + }, + { + "epoch": 0.17129336969675077, + "grad_norm": 7.2928808745375315, + "learning_rate": 4.9595279696365815e-06, + "loss": 0.8856, + "step": 2371 + }, + { + "epoch": 0.17136561489696028, + "grad_norm": 7.9221391201193505, + "learning_rate": 4.959475534002002e-06, + "loss": 0.9047, + "step": 2372 + }, + { + "epoch": 0.1714378600971698, + "grad_norm": 9.061712401794267, + "learning_rate": 4.9594230646991115e-06, + "loss": 0.9803, + "step": 2373 + }, + { + "epoch": 0.1715101052973793, + "grad_norm": 6.03627966637509, + "learning_rate": 4.959370561728629e-06, + "loss": 0.9526, + "step": 2374 + }, + { + "epoch": 0.17158235049758883, + "grad_norm": 7.083130538131769, + "learning_rate": 4.9593180250912735e-06, + "loss": 0.906, + "step": 2375 + }, + { + "epoch": 0.17165459569779834, + "grad_norm": 8.348846593447384, + "learning_rate": 4.959265454787764e-06, + "loss": 0.9846, + "step": 2376 + }, + { + "epoch": 0.17172684089800783, + "grad_norm": 7.40198249036011, + "learning_rate": 4.959212850818819e-06, + "loss": 0.9856, + "step": 2377 + }, + { + "epoch": 0.17179908609821734, + "grad_norm": 6.8733385332583055, + "learning_rate": 4.95916021318516e-06, + "loss": 0.8904, + "step": 2378 + }, + { + "epoch": 0.17187133129842685, + "grad_norm": 8.101473977939717, + "learning_rate": 4.959107541887507e-06, + "loss": 0.9841, + "step": 2379 + }, + { + "epoch": 0.17194357649863637, + "grad_norm": 5.961670836689918, + "learning_rate": 4.959054836926582e-06, + "loss": 0.9709, + "step": 2380 + }, + { + "epoch": 0.17201582169884588, + "grad_norm": 7.880098418187571, + "learning_rate": 4.959002098303105e-06, + "loss": 1.0484, + "step": 2381 + }, + { + "epoch": 0.1720880668990554, + "grad_norm": 8.62110525458, + "learning_rate": 4.9589493260177975e-06, + "loss": 0.9846, + "step": 2382 + }, + { + "epoch": 0.1721603120992649, + "grad_norm": 7.597475495921841, + "learning_rate": 4.958896520071385e-06, + "loss": 0.973, + "step": 2383 + }, + { + "epoch": 0.17223255729947443, + "grad_norm": 7.7271783812776045, + "learning_rate": 4.9588436804645865e-06, + "loss": 0.9374, + "step": 2384 + }, + { + "epoch": 0.17230480249968394, + "grad_norm": 6.876819439200453, + "learning_rate": 4.958790807198129e-06, + "loss": 0.8867, + "step": 2385 + }, + { + "epoch": 0.17237704769989343, + "grad_norm": 6.6077977095298825, + "learning_rate": 4.958737900272733e-06, + "loss": 0.8981, + "step": 2386 + }, + { + "epoch": 0.17244929290010294, + "grad_norm": 6.941454610833223, + "learning_rate": 4.958684959689125e-06, + "loss": 0.9535, + "step": 2387 + }, + { + "epoch": 0.17252153810031245, + "grad_norm": 6.871223105838934, + "learning_rate": 4.958631985448028e-06, + "loss": 0.8725, + "step": 2388 + }, + { + "epoch": 0.17259378330052197, + "grad_norm": 7.278037217865054, + "learning_rate": 4.958578977550169e-06, + "loss": 0.9783, + "step": 2389 + }, + { + "epoch": 0.17266602850073148, + "grad_norm": 8.304184045181682, + "learning_rate": 4.958525935996272e-06, + "loss": 0.9845, + "step": 2390 + }, + { + "epoch": 0.172738273700941, + "grad_norm": 7.753102819679327, + "learning_rate": 4.958472860787065e-06, + "loss": 0.9234, + "step": 2391 + }, + { + "epoch": 0.1728105189011505, + "grad_norm": 8.327547837059576, + "learning_rate": 4.958419751923273e-06, + "loss": 0.9737, + "step": 2392 + }, + { + "epoch": 0.17288276410136003, + "grad_norm": 6.802349683172587, + "learning_rate": 4.958366609405623e-06, + "loss": 0.8978, + "step": 2393 + }, + { + "epoch": 0.17295500930156954, + "grad_norm": 7.7084467149248646, + "learning_rate": 4.958313433234843e-06, + "loss": 0.9191, + "step": 2394 + }, + { + "epoch": 0.17302725450177903, + "grad_norm": 6.649381577267725, + "learning_rate": 4.95826022341166e-06, + "loss": 0.8745, + "step": 2395 + }, + { + "epoch": 0.17309949970198854, + "grad_norm": 6.656857458343473, + "learning_rate": 4.958206979936805e-06, + "loss": 0.8519, + "step": 2396 + }, + { + "epoch": 0.17317174490219805, + "grad_norm": 7.075425734364665, + "learning_rate": 4.958153702811005e-06, + "loss": 0.9794, + "step": 2397 + }, + { + "epoch": 0.17324399010240757, + "grad_norm": 9.070000011439886, + "learning_rate": 4.958100392034988e-06, + "loss": 0.8491, + "step": 2398 + }, + { + "epoch": 0.17331623530261708, + "grad_norm": 8.274987497925318, + "learning_rate": 4.958047047609487e-06, + "loss": 0.8826, + "step": 2399 + }, + { + "epoch": 0.1733884805028266, + "grad_norm": 7.408763346369606, + "learning_rate": 4.9579936695352295e-06, + "loss": 0.9219, + "step": 2400 + }, + { + "epoch": 0.1734607257030361, + "grad_norm": 7.839656415144705, + "learning_rate": 4.957940257812947e-06, + "loss": 0.9638, + "step": 2401 + }, + { + "epoch": 0.17353297090324563, + "grad_norm": 7.0176792323289705, + "learning_rate": 4.957886812443371e-06, + "loss": 1.0606, + "step": 2402 + }, + { + "epoch": 0.17360521610345514, + "grad_norm": 8.006387544711615, + "learning_rate": 4.957833333427233e-06, + "loss": 0.9639, + "step": 2403 + }, + { + "epoch": 0.17367746130366463, + "grad_norm": 9.69010722853911, + "learning_rate": 4.957779820765265e-06, + "loss": 0.9298, + "step": 2404 + }, + { + "epoch": 0.17374970650387414, + "grad_norm": 8.891601508872185, + "learning_rate": 4.9577262744581985e-06, + "loss": 0.9436, + "step": 2405 + }, + { + "epoch": 0.17382195170408365, + "grad_norm": 6.35482948056443, + "learning_rate": 4.95767269450677e-06, + "loss": 1.0245, + "step": 2406 + }, + { + "epoch": 0.17389419690429317, + "grad_norm": 7.136824850829395, + "learning_rate": 4.9576190809117085e-06, + "loss": 0.9641, + "step": 2407 + }, + { + "epoch": 0.17396644210450268, + "grad_norm": 9.458359055897246, + "learning_rate": 4.957565433673751e-06, + "loss": 0.9368, + "step": 2408 + }, + { + "epoch": 0.1740386873047122, + "grad_norm": 7.814033052708961, + "learning_rate": 4.957511752793632e-06, + "loss": 0.9528, + "step": 2409 + }, + { + "epoch": 0.1741109325049217, + "grad_norm": 6.381812550765297, + "learning_rate": 4.957458038272083e-06, + "loss": 0.9632, + "step": 2410 + }, + { + "epoch": 0.17418317770513123, + "grad_norm": 6.698570429179478, + "learning_rate": 4.957404290109843e-06, + "loss": 0.9148, + "step": 2411 + }, + { + "epoch": 0.17425542290534074, + "grad_norm": 6.360001816659344, + "learning_rate": 4.957350508307645e-06, + "loss": 0.9078, + "step": 2412 + }, + { + "epoch": 0.17432766810555023, + "grad_norm": 7.103911931308215, + "learning_rate": 4.957296692866228e-06, + "loss": 0.9575, + "step": 2413 + }, + { + "epoch": 0.17439991330575974, + "grad_norm": 6.721593068983704, + "learning_rate": 4.957242843786326e-06, + "loss": 1.0229, + "step": 2414 + }, + { + "epoch": 0.17447215850596925, + "grad_norm": 7.329403893768461, + "learning_rate": 4.957188961068678e-06, + "loss": 0.9649, + "step": 2415 + }, + { + "epoch": 0.17454440370617877, + "grad_norm": 6.909861729255632, + "learning_rate": 4.957135044714021e-06, + "loss": 1.016, + "step": 2416 + }, + { + "epoch": 0.17461664890638828, + "grad_norm": 6.9455478486734865, + "learning_rate": 4.957081094723093e-06, + "loss": 0.9814, + "step": 2417 + }, + { + "epoch": 0.1746888941065978, + "grad_norm": 7.364000814476155, + "learning_rate": 4.957027111096633e-06, + "loss": 0.9556, + "step": 2418 + }, + { + "epoch": 0.1747611393068073, + "grad_norm": 7.111088338788975, + "learning_rate": 4.9569730938353785e-06, + "loss": 0.9283, + "step": 2419 + }, + { + "epoch": 0.17483338450701683, + "grad_norm": 7.10359563984355, + "learning_rate": 4.956919042940071e-06, + "loss": 0.891, + "step": 2420 + }, + { + "epoch": 0.17490562970722634, + "grad_norm": 7.408622265199394, + "learning_rate": 4.95686495841145e-06, + "loss": 0.9327, + "step": 2421 + }, + { + "epoch": 0.17497787490743583, + "grad_norm": 8.498671932812917, + "learning_rate": 4.956810840250254e-06, + "loss": 1.0071, + "step": 2422 + }, + { + "epoch": 0.17505012010764534, + "grad_norm": 6.874715070888989, + "learning_rate": 4.956756688457226e-06, + "loss": 0.9183, + "step": 2423 + }, + { + "epoch": 0.17512236530785485, + "grad_norm": 7.960706293793593, + "learning_rate": 4.956702503033107e-06, + "loss": 0.9693, + "step": 2424 + }, + { + "epoch": 0.17519461050806437, + "grad_norm": 7.6364594944804, + "learning_rate": 4.956648283978637e-06, + "loss": 0.9593, + "step": 2425 + }, + { + "epoch": 0.17526685570827388, + "grad_norm": 6.8141541878756, + "learning_rate": 4.95659403129456e-06, + "loss": 0.9935, + "step": 2426 + }, + { + "epoch": 0.1753391009084834, + "grad_norm": 6.971724213354977, + "learning_rate": 4.956539744981619e-06, + "loss": 0.9922, + "step": 2427 + }, + { + "epoch": 0.1754113461086929, + "grad_norm": 7.098069812438862, + "learning_rate": 4.956485425040555e-06, + "loss": 0.9488, + "step": 2428 + }, + { + "epoch": 0.17548359130890243, + "grad_norm": 8.596444446205606, + "learning_rate": 4.956431071472113e-06, + "loss": 0.9629, + "step": 2429 + }, + { + "epoch": 0.1755558365091119, + "grad_norm": 6.490752170512573, + "learning_rate": 4.956376684277038e-06, + "loss": 0.9645, + "step": 2430 + }, + { + "epoch": 0.17562808170932143, + "grad_norm": 6.8532409352012404, + "learning_rate": 4.9563222634560725e-06, + "loss": 0.9356, + "step": 2431 + }, + { + "epoch": 0.17570032690953094, + "grad_norm": 8.800781683449834, + "learning_rate": 4.956267809009963e-06, + "loss": 0.9044, + "step": 2432 + }, + { + "epoch": 0.17577257210974045, + "grad_norm": 7.160961482527891, + "learning_rate": 4.956213320939454e-06, + "loss": 0.9076, + "step": 2433 + }, + { + "epoch": 0.17584481730994997, + "grad_norm": 5.957739455853044, + "learning_rate": 4.956158799245291e-06, + "loss": 0.8615, + "step": 2434 + }, + { + "epoch": 0.17591706251015948, + "grad_norm": 6.94448638225184, + "learning_rate": 4.956104243928222e-06, + "loss": 0.9268, + "step": 2435 + }, + { + "epoch": 0.175989307710369, + "grad_norm": 8.096929330764986, + "learning_rate": 4.956049654988993e-06, + "loss": 0.946, + "step": 2436 + }, + { + "epoch": 0.1760615529105785, + "grad_norm": 9.077688814015469, + "learning_rate": 4.955995032428352e-06, + "loss": 1.0942, + "step": 2437 + }, + { + "epoch": 0.17613379811078803, + "grad_norm": 6.989371268574912, + "learning_rate": 4.955940376247044e-06, + "loss": 0.9561, + "step": 2438 + }, + { + "epoch": 0.1762060433109975, + "grad_norm": 6.5704286181772416, + "learning_rate": 4.955885686445821e-06, + "loss": 0.9771, + "step": 2439 + }, + { + "epoch": 0.17627828851120703, + "grad_norm": 7.636854868112352, + "learning_rate": 4.955830963025428e-06, + "loss": 0.937, + "step": 2440 + }, + { + "epoch": 0.17635053371141654, + "grad_norm": 6.862808793143665, + "learning_rate": 4.955776205986616e-06, + "loss": 0.9246, + "step": 2441 + }, + { + "epoch": 0.17642277891162605, + "grad_norm": 7.73839142676172, + "learning_rate": 4.955721415330136e-06, + "loss": 0.9288, + "step": 2442 + }, + { + "epoch": 0.17649502411183557, + "grad_norm": 6.878221346688616, + "learning_rate": 4.955666591056736e-06, + "loss": 0.9379, + "step": 2443 + }, + { + "epoch": 0.17656726931204508, + "grad_norm": 7.176012495423392, + "learning_rate": 4.955611733167168e-06, + "loss": 0.9768, + "step": 2444 + }, + { + "epoch": 0.1766395145122546, + "grad_norm": 9.908004268468215, + "learning_rate": 4.955556841662181e-06, + "loss": 0.9838, + "step": 2445 + }, + { + "epoch": 0.1767117597124641, + "grad_norm": 7.65538673983716, + "learning_rate": 4.955501916542527e-06, + "loss": 1.0007, + "step": 2446 + }, + { + "epoch": 0.17678400491267363, + "grad_norm": 8.315875522491087, + "learning_rate": 4.955446957808959e-06, + "loss": 0.9428, + "step": 2447 + }, + { + "epoch": 0.1768562501128831, + "grad_norm": 8.612196555141699, + "learning_rate": 4.955391965462229e-06, + "loss": 0.9905, + "step": 2448 + }, + { + "epoch": 0.17692849531309263, + "grad_norm": 9.565719162366294, + "learning_rate": 4.955336939503089e-06, + "loss": 0.8933, + "step": 2449 + }, + { + "epoch": 0.17700074051330214, + "grad_norm": 7.6617762959722, + "learning_rate": 4.9552818799322926e-06, + "loss": 0.9668, + "step": 2450 + }, + { + "epoch": 0.17707298571351165, + "grad_norm": 6.935480597138258, + "learning_rate": 4.955226786750594e-06, + "loss": 0.9744, + "step": 2451 + }, + { + "epoch": 0.17714523091372117, + "grad_norm": 7.64077902172533, + "learning_rate": 4.9551716599587475e-06, + "loss": 0.9521, + "step": 2452 + }, + { + "epoch": 0.17721747611393068, + "grad_norm": 6.891063719469659, + "learning_rate": 4.955116499557506e-06, + "loss": 0.8485, + "step": 2453 + }, + { + "epoch": 0.1772897213141402, + "grad_norm": 7.270068050942304, + "learning_rate": 4.955061305547627e-06, + "loss": 0.9884, + "step": 2454 + }, + { + "epoch": 0.1773619665143497, + "grad_norm": 7.462044103552936, + "learning_rate": 4.955006077929865e-06, + "loss": 1.0163, + "step": 2455 + }, + { + "epoch": 0.17743421171455923, + "grad_norm": 7.851634405765266, + "learning_rate": 4.954950816704976e-06, + "loss": 0.9161, + "step": 2456 + }, + { + "epoch": 0.1775064569147687, + "grad_norm": 6.732933226428357, + "learning_rate": 4.954895521873715e-06, + "loss": 0.9474, + "step": 2457 + }, + { + "epoch": 0.17757870211497823, + "grad_norm": 8.492567964539596, + "learning_rate": 4.954840193436842e-06, + "loss": 1.0034, + "step": 2458 + }, + { + "epoch": 0.17765094731518774, + "grad_norm": 6.932988254811732, + "learning_rate": 4.954784831395112e-06, + "loss": 0.9725, + "step": 2459 + }, + { + "epoch": 0.17772319251539725, + "grad_norm": 7.107454614860015, + "learning_rate": 4.954729435749284e-06, + "loss": 1.0011, + "step": 2460 + }, + { + "epoch": 0.17779543771560677, + "grad_norm": 7.517137403049656, + "learning_rate": 4.9546740065001155e-06, + "loss": 0.9148, + "step": 2461 + }, + { + "epoch": 0.17786768291581628, + "grad_norm": 6.705755397905815, + "learning_rate": 4.9546185436483664e-06, + "loss": 0.8974, + "step": 2462 + }, + { + "epoch": 0.1779399281160258, + "grad_norm": 7.281221970925799, + "learning_rate": 4.954563047194795e-06, + "loss": 0.9671, + "step": 2463 + }, + { + "epoch": 0.1780121733162353, + "grad_norm": 6.930804896875592, + "learning_rate": 4.954507517140162e-06, + "loss": 0.9478, + "step": 2464 + }, + { + "epoch": 0.17808441851644483, + "grad_norm": 8.675410579232647, + "learning_rate": 4.9544519534852265e-06, + "loss": 0.9808, + "step": 2465 + }, + { + "epoch": 0.1781566637166543, + "grad_norm": 8.353787560236524, + "learning_rate": 4.954396356230749e-06, + "loss": 1.0261, + "step": 2466 + }, + { + "epoch": 0.17822890891686383, + "grad_norm": 7.487591205735592, + "learning_rate": 4.954340725377491e-06, + "loss": 0.9135, + "step": 2467 + }, + { + "epoch": 0.17830115411707334, + "grad_norm": 7.457334341420957, + "learning_rate": 4.954285060926215e-06, + "loss": 0.9921, + "step": 2468 + }, + { + "epoch": 0.17837339931728285, + "grad_norm": 6.777535119776561, + "learning_rate": 4.9542293628776815e-06, + "loss": 0.9732, + "step": 2469 + }, + { + "epoch": 0.17844564451749237, + "grad_norm": 6.720322819630678, + "learning_rate": 4.954173631232654e-06, + "loss": 0.9718, + "step": 2470 + }, + { + "epoch": 0.17851788971770188, + "grad_norm": 10.009053137752614, + "learning_rate": 4.9541178659918955e-06, + "loss": 0.9129, + "step": 2471 + }, + { + "epoch": 0.1785901349179114, + "grad_norm": 8.285920060707888, + "learning_rate": 4.954062067156168e-06, + "loss": 1.0349, + "step": 2472 + }, + { + "epoch": 0.1786623801181209, + "grad_norm": 7.483760290844457, + "learning_rate": 4.954006234726237e-06, + "loss": 0.9906, + "step": 2473 + }, + { + "epoch": 0.17873462531833043, + "grad_norm": 7.47652780490905, + "learning_rate": 4.953950368702865e-06, + "loss": 0.9459, + "step": 2474 + }, + { + "epoch": 0.1788068705185399, + "grad_norm": 10.904685708912304, + "learning_rate": 4.953894469086819e-06, + "loss": 0.9273, + "step": 2475 + }, + { + "epoch": 0.17887911571874943, + "grad_norm": 7.696999983522015, + "learning_rate": 4.953838535878862e-06, + "loss": 1.0126, + "step": 2476 + }, + { + "epoch": 0.17895136091895894, + "grad_norm": 7.244611908368927, + "learning_rate": 4.953782569079761e-06, + "loss": 0.9816, + "step": 2477 + }, + { + "epoch": 0.17902360611916845, + "grad_norm": 9.430511514457548, + "learning_rate": 4.953726568690282e-06, + "loss": 0.9323, + "step": 2478 + }, + { + "epoch": 0.17909585131937797, + "grad_norm": 9.633174051026135, + "learning_rate": 4.953670534711192e-06, + "loss": 1.0652, + "step": 2479 + }, + { + "epoch": 0.17916809651958748, + "grad_norm": 14.33113690484295, + "learning_rate": 4.953614467143257e-06, + "loss": 0.9599, + "step": 2480 + }, + { + "epoch": 0.179240341719797, + "grad_norm": 7.5656988771369775, + "learning_rate": 4.953558365987246e-06, + "loss": 0.8991, + "step": 2481 + }, + { + "epoch": 0.1793125869200065, + "grad_norm": 6.562184934773178, + "learning_rate": 4.9535022312439246e-06, + "loss": 0.9583, + "step": 2482 + }, + { + "epoch": 0.17938483212021603, + "grad_norm": 9.463722430224173, + "learning_rate": 4.953446062914063e-06, + "loss": 0.9884, + "step": 2483 + }, + { + "epoch": 0.1794570773204255, + "grad_norm": 8.65890913790508, + "learning_rate": 4.95338986099843e-06, + "loss": 0.9143, + "step": 2484 + }, + { + "epoch": 0.17952932252063503, + "grad_norm": 8.918749366979435, + "learning_rate": 4.9533336254977945e-06, + "loss": 0.9586, + "step": 2485 + }, + { + "epoch": 0.17960156772084454, + "grad_norm": 11.051400171007568, + "learning_rate": 4.953277356412927e-06, + "loss": 0.9624, + "step": 2486 + }, + { + "epoch": 0.17967381292105405, + "grad_norm": 7.273177620642577, + "learning_rate": 4.953221053744597e-06, + "loss": 1.0278, + "step": 2487 + }, + { + "epoch": 0.17974605812126357, + "grad_norm": 6.402048212367395, + "learning_rate": 4.953164717493576e-06, + "loss": 0.9511, + "step": 2488 + }, + { + "epoch": 0.17981830332147308, + "grad_norm": 6.921909995238145, + "learning_rate": 4.953108347660635e-06, + "loss": 0.9818, + "step": 2489 + }, + { + "epoch": 0.1798905485216826, + "grad_norm": 9.146297318514847, + "learning_rate": 4.953051944246544e-06, + "loss": 1.027, + "step": 2490 + }, + { + "epoch": 0.1799627937218921, + "grad_norm": 7.071985676936494, + "learning_rate": 4.9529955072520784e-06, + "loss": 0.9687, + "step": 2491 + }, + { + "epoch": 0.18003503892210163, + "grad_norm": 8.344755762181943, + "learning_rate": 4.952939036678008e-06, + "loss": 0.9232, + "step": 2492 + }, + { + "epoch": 0.1801072841223111, + "grad_norm": 8.319415664693832, + "learning_rate": 4.9528825325251064e-06, + "loss": 1.0159, + "step": 2493 + }, + { + "epoch": 0.18017952932252063, + "grad_norm": 7.493611221364696, + "learning_rate": 4.952825994794148e-06, + "loss": 0.9227, + "step": 2494 + }, + { + "epoch": 0.18025177452273014, + "grad_norm": 6.8680820685289445, + "learning_rate": 4.952769423485906e-06, + "loss": 0.9268, + "step": 2495 + }, + { + "epoch": 0.18032401972293965, + "grad_norm": 7.29454692811518, + "learning_rate": 4.952712818601155e-06, + "loss": 0.8863, + "step": 2496 + }, + { + "epoch": 0.18039626492314917, + "grad_norm": 6.459468060964094, + "learning_rate": 4.95265618014067e-06, + "loss": 1.0324, + "step": 2497 + }, + { + "epoch": 0.18046851012335868, + "grad_norm": 7.867183863346354, + "learning_rate": 4.952599508105227e-06, + "loss": 1.0258, + "step": 2498 + }, + { + "epoch": 0.1805407553235682, + "grad_norm": 7.533467453069342, + "learning_rate": 4.9525428024956e-06, + "loss": 0.9291, + "step": 2499 + }, + { + "epoch": 0.1806130005237777, + "grad_norm": 7.026760448189495, + "learning_rate": 4.952486063312567e-06, + "loss": 0.9759, + "step": 2500 + }, + { + "epoch": 0.18068524572398723, + "grad_norm": 8.471660268799791, + "learning_rate": 4.952429290556904e-06, + "loss": 1.0299, + "step": 2501 + }, + { + "epoch": 0.1807574909241967, + "grad_norm": 8.008621815048132, + "learning_rate": 4.9523724842293875e-06, + "loss": 0.9523, + "step": 2502 + }, + { + "epoch": 0.18082973612440623, + "grad_norm": 7.8313266808240805, + "learning_rate": 4.9523156443307964e-06, + "loss": 0.9093, + "step": 2503 + }, + { + "epoch": 0.18090198132461574, + "grad_norm": 7.808440840017345, + "learning_rate": 4.952258770861909e-06, + "loss": 0.9142, + "step": 2504 + }, + { + "epoch": 0.18097422652482525, + "grad_norm": 7.825903784870398, + "learning_rate": 4.952201863823502e-06, + "loss": 0.993, + "step": 2505 + }, + { + "epoch": 0.18104647172503477, + "grad_norm": 7.373509401883082, + "learning_rate": 4.952144923216355e-06, + "loss": 0.9433, + "step": 2506 + }, + { + "epoch": 0.18111871692524428, + "grad_norm": 9.26720292806105, + "learning_rate": 4.952087949041249e-06, + "loss": 0.9893, + "step": 2507 + }, + { + "epoch": 0.1811909621254538, + "grad_norm": 8.234932288868324, + "learning_rate": 4.952030941298962e-06, + "loss": 1.0124, + "step": 2508 + }, + { + "epoch": 0.1812632073256633, + "grad_norm": 7.418259296080632, + "learning_rate": 4.951973899990276e-06, + "loss": 0.9573, + "step": 2509 + }, + { + "epoch": 0.18133545252587283, + "grad_norm": 8.909092285637405, + "learning_rate": 4.951916825115971e-06, + "loss": 0.9359, + "step": 2510 + }, + { + "epoch": 0.1814076977260823, + "grad_norm": 7.57250063110631, + "learning_rate": 4.951859716676829e-06, + "loss": 0.9165, + "step": 2511 + }, + { + "epoch": 0.18147994292629183, + "grad_norm": 7.76082445818432, + "learning_rate": 4.95180257467363e-06, + "loss": 0.9609, + "step": 2512 + }, + { + "epoch": 0.18155218812650134, + "grad_norm": 7.29263554993393, + "learning_rate": 4.9517453991071586e-06, + "loss": 0.9384, + "step": 2513 + }, + { + "epoch": 0.18162443332671085, + "grad_norm": 7.56184371748814, + "learning_rate": 4.951688189978196e-06, + "loss": 0.9004, + "step": 2514 + }, + { + "epoch": 0.18169667852692037, + "grad_norm": 8.23582489162487, + "learning_rate": 4.951630947287525e-06, + "loss": 1.0106, + "step": 2515 + }, + { + "epoch": 0.18176892372712988, + "grad_norm": 7.321621414462697, + "learning_rate": 4.951573671035931e-06, + "loss": 0.8926, + "step": 2516 + }, + { + "epoch": 0.1818411689273394, + "grad_norm": 7.610812386299071, + "learning_rate": 4.951516361224196e-06, + "loss": 1.0075, + "step": 2517 + }, + { + "epoch": 0.1819134141275489, + "grad_norm": 6.657548518073887, + "learning_rate": 4.951459017853106e-06, + "loss": 0.8819, + "step": 2518 + }, + { + "epoch": 0.18198565932775843, + "grad_norm": 9.264950603882506, + "learning_rate": 4.951401640923445e-06, + "loss": 0.9283, + "step": 2519 + }, + { + "epoch": 0.1820579045279679, + "grad_norm": 8.277809666423549, + "learning_rate": 4.951344230436e-06, + "loss": 0.9438, + "step": 2520 + }, + { + "epoch": 0.18213014972817743, + "grad_norm": 8.13202914845299, + "learning_rate": 4.951286786391555e-06, + "loss": 1.0357, + "step": 2521 + }, + { + "epoch": 0.18220239492838694, + "grad_norm": 6.800808454587317, + "learning_rate": 4.951229308790897e-06, + "loss": 0.9443, + "step": 2522 + }, + { + "epoch": 0.18227464012859645, + "grad_norm": 7.882614813784207, + "learning_rate": 4.951171797634814e-06, + "loss": 1.0425, + "step": 2523 + }, + { + "epoch": 0.18234688532880597, + "grad_norm": 8.71094713424355, + "learning_rate": 4.951114252924091e-06, + "loss": 0.9443, + "step": 2524 + }, + { + "epoch": 0.18241913052901548, + "grad_norm": 6.671621420550659, + "learning_rate": 4.951056674659518e-06, + "loss": 0.9846, + "step": 2525 + }, + { + "epoch": 0.182491375729225, + "grad_norm": 7.832481770864071, + "learning_rate": 4.950999062841882e-06, + "loss": 0.9232, + "step": 2526 + }, + { + "epoch": 0.1825636209294345, + "grad_norm": 7.560988598323924, + "learning_rate": 4.950941417471972e-06, + "loss": 0.9038, + "step": 2527 + }, + { + "epoch": 0.18263586612964403, + "grad_norm": 6.983158014079975, + "learning_rate": 4.950883738550577e-06, + "loss": 0.9421, + "step": 2528 + }, + { + "epoch": 0.1827081113298535, + "grad_norm": 5.781117700661048, + "learning_rate": 4.950826026078486e-06, + "loss": 0.8135, + "step": 2529 + }, + { + "epoch": 0.18278035653006303, + "grad_norm": 7.493950502132901, + "learning_rate": 4.9507682800564906e-06, + "loss": 0.8649, + "step": 2530 + }, + { + "epoch": 0.18285260173027254, + "grad_norm": 7.045707792054367, + "learning_rate": 4.95071050048538e-06, + "loss": 0.9203, + "step": 2531 + }, + { + "epoch": 0.18292484693048205, + "grad_norm": 7.903282362463718, + "learning_rate": 4.950652687365945e-06, + "loss": 0.9654, + "step": 2532 + }, + { + "epoch": 0.18299709213069157, + "grad_norm": 8.21931345353497, + "learning_rate": 4.950594840698978e-06, + "loss": 0.9475, + "step": 2533 + }, + { + "epoch": 0.18306933733090108, + "grad_norm": 8.389988482607187, + "learning_rate": 4.950536960485271e-06, + "loss": 0.9081, + "step": 2534 + }, + { + "epoch": 0.1831415825311106, + "grad_norm": 8.27229625406038, + "learning_rate": 4.950479046725615e-06, + "loss": 0.9847, + "step": 2535 + }, + { + "epoch": 0.1832138277313201, + "grad_norm": 6.97243439991956, + "learning_rate": 4.950421099420803e-06, + "loss": 0.9599, + "step": 2536 + }, + { + "epoch": 0.1832860729315296, + "grad_norm": 13.303940291150882, + "learning_rate": 4.95036311857163e-06, + "loss": 1.1352, + "step": 2537 + }, + { + "epoch": 0.1833583181317391, + "grad_norm": 8.04908807932249, + "learning_rate": 4.950305104178887e-06, + "loss": 0.9773, + "step": 2538 + }, + { + "epoch": 0.18343056333194863, + "grad_norm": 7.688049296781733, + "learning_rate": 4.9502470562433704e-06, + "loss": 0.9687, + "step": 2539 + }, + { + "epoch": 0.18350280853215814, + "grad_norm": 7.822376081851215, + "learning_rate": 4.950188974765875e-06, + "loss": 0.8817, + "step": 2540 + }, + { + "epoch": 0.18357505373236765, + "grad_norm": 6.593889714067054, + "learning_rate": 4.950130859747194e-06, + "loss": 0.9091, + "step": 2541 + }, + { + "epoch": 0.18364729893257717, + "grad_norm": 6.356537056801619, + "learning_rate": 4.950072711188124e-06, + "loss": 0.9443, + "step": 2542 + }, + { + "epoch": 0.18371954413278668, + "grad_norm": 7.4326345733972525, + "learning_rate": 4.950014529089461e-06, + "loss": 0.9234, + "step": 2543 + }, + { + "epoch": 0.1837917893329962, + "grad_norm": 7.794478971381136, + "learning_rate": 4.949956313452002e-06, + "loss": 0.9146, + "step": 2544 + }, + { + "epoch": 0.1838640345332057, + "grad_norm": 6.5061144313558215, + "learning_rate": 4.949898064276542e-06, + "loss": 0.9634, + "step": 2545 + }, + { + "epoch": 0.1839362797334152, + "grad_norm": 6.707718846706744, + "learning_rate": 4.949839781563881e-06, + "loss": 0.894, + "step": 2546 + }, + { + "epoch": 0.1840085249336247, + "grad_norm": 7.62785586046801, + "learning_rate": 4.949781465314815e-06, + "loss": 0.9724, + "step": 2547 + }, + { + "epoch": 0.18408077013383423, + "grad_norm": 7.3130187804258595, + "learning_rate": 4.949723115530143e-06, + "loss": 1.0402, + "step": 2548 + }, + { + "epoch": 0.18415301533404374, + "grad_norm": 8.40376466222412, + "learning_rate": 4.949664732210664e-06, + "loss": 1.0384, + "step": 2549 + }, + { + "epoch": 0.18422526053425325, + "grad_norm": 7.019200286959835, + "learning_rate": 4.949606315357176e-06, + "loss": 0.8904, + "step": 2550 + }, + { + "epoch": 0.18429750573446277, + "grad_norm": 9.149036260828412, + "learning_rate": 4.94954786497048e-06, + "loss": 0.969, + "step": 2551 + }, + { + "epoch": 0.18436975093467228, + "grad_norm": 7.139035505017084, + "learning_rate": 4.949489381051375e-06, + "loss": 0.9706, + "step": 2552 + }, + { + "epoch": 0.1844419961348818, + "grad_norm": 8.69936038768283, + "learning_rate": 4.9494308636006635e-06, + "loss": 1.0166, + "step": 2553 + }, + { + "epoch": 0.1845142413350913, + "grad_norm": 6.5222561617474915, + "learning_rate": 4.9493723126191454e-06, + "loss": 0.9731, + "step": 2554 + }, + { + "epoch": 0.1845864865353008, + "grad_norm": 6.8130028215383165, + "learning_rate": 4.949313728107621e-06, + "loss": 0.9914, + "step": 2555 + }, + { + "epoch": 0.1846587317355103, + "grad_norm": 9.033138724432597, + "learning_rate": 4.949255110066894e-06, + "loss": 1.0037, + "step": 2556 + }, + { + "epoch": 0.18473097693571983, + "grad_norm": 6.730199523960952, + "learning_rate": 4.949196458497766e-06, + "loss": 0.9683, + "step": 2557 + }, + { + "epoch": 0.18480322213592934, + "grad_norm": 7.006256304966916, + "learning_rate": 4.949137773401041e-06, + "loss": 0.9681, + "step": 2558 + }, + { + "epoch": 0.18487546733613885, + "grad_norm": 7.658776687667718, + "learning_rate": 4.9490790547775206e-06, + "loss": 0.9663, + "step": 2559 + }, + { + "epoch": 0.18494771253634837, + "grad_norm": 7.4330641396992405, + "learning_rate": 4.949020302628009e-06, + "loss": 0.9721, + "step": 2560 + }, + { + "epoch": 0.18501995773655788, + "grad_norm": 8.841262952939887, + "learning_rate": 4.948961516953312e-06, + "loss": 0.9046, + "step": 2561 + }, + { + "epoch": 0.1850922029367674, + "grad_norm": 7.112488854907376, + "learning_rate": 4.948902697754234e-06, + "loss": 0.8815, + "step": 2562 + }, + { + "epoch": 0.1851644481369769, + "grad_norm": 7.623034614789654, + "learning_rate": 4.948843845031579e-06, + "loss": 0.8934, + "step": 2563 + }, + { + "epoch": 0.1852366933371864, + "grad_norm": 9.246061156601826, + "learning_rate": 4.948784958786152e-06, + "loss": 1.0074, + "step": 2564 + }, + { + "epoch": 0.1853089385373959, + "grad_norm": 6.9234672420054855, + "learning_rate": 4.948726039018762e-06, + "loss": 0.9234, + "step": 2565 + }, + { + "epoch": 0.18538118373760543, + "grad_norm": 8.00126971659103, + "learning_rate": 4.948667085730212e-06, + "loss": 0.9824, + "step": 2566 + }, + { + "epoch": 0.18545342893781494, + "grad_norm": 7.821614556238208, + "learning_rate": 4.9486080989213125e-06, + "loss": 0.9045, + "step": 2567 + }, + { + "epoch": 0.18552567413802445, + "grad_norm": 7.200225974881307, + "learning_rate": 4.948549078592868e-06, + "loss": 0.959, + "step": 2568 + }, + { + "epoch": 0.18559791933823397, + "grad_norm": 7.553633102479561, + "learning_rate": 4.948490024745689e-06, + "loss": 0.9405, + "step": 2569 + }, + { + "epoch": 0.18567016453844348, + "grad_norm": 8.175412639504618, + "learning_rate": 4.948430937380582e-06, + "loss": 1.0104, + "step": 2570 + }, + { + "epoch": 0.185742409738653, + "grad_norm": 7.211825662224803, + "learning_rate": 4.948371816498357e-06, + "loss": 0.9141, + "step": 2571 + }, + { + "epoch": 0.1858146549388625, + "grad_norm": 8.225947140595014, + "learning_rate": 4.948312662099822e-06, + "loss": 1.0263, + "step": 2572 + }, + { + "epoch": 0.185886900139072, + "grad_norm": 7.78051435677654, + "learning_rate": 4.948253474185789e-06, + "loss": 0.9158, + "step": 2573 + }, + { + "epoch": 0.1859591453392815, + "grad_norm": 8.80575108302696, + "learning_rate": 4.9481942527570656e-06, + "loss": 0.9628, + "step": 2574 + }, + { + "epoch": 0.18603139053949103, + "grad_norm": 7.07548045761238, + "learning_rate": 4.9481349978144644e-06, + "loss": 0.9676, + "step": 2575 + }, + { + "epoch": 0.18610363573970054, + "grad_norm": 6.890376979663377, + "learning_rate": 4.948075709358796e-06, + "loss": 0.9815, + "step": 2576 + }, + { + "epoch": 0.18617588093991005, + "grad_norm": 7.7442785493707635, + "learning_rate": 4.948016387390871e-06, + "loss": 1.0177, + "step": 2577 + }, + { + "epoch": 0.18624812614011957, + "grad_norm": 5.978410661954292, + "learning_rate": 4.9479570319115036e-06, + "loss": 1.015, + "step": 2578 + }, + { + "epoch": 0.18632037134032908, + "grad_norm": 8.91143326762316, + "learning_rate": 4.947897642921505e-06, + "loss": 0.9204, + "step": 2579 + }, + { + "epoch": 0.1863926165405386, + "grad_norm": 7.394936979178754, + "learning_rate": 4.947838220421688e-06, + "loss": 0.9561, + "step": 2580 + }, + { + "epoch": 0.1864648617407481, + "grad_norm": 7.65423283376182, + "learning_rate": 4.947778764412867e-06, + "loss": 0.9484, + "step": 2581 + }, + { + "epoch": 0.1865371069409576, + "grad_norm": 7.159719633868054, + "learning_rate": 4.9477192748958546e-06, + "loss": 0.9534, + "step": 2582 + }, + { + "epoch": 0.1866093521411671, + "grad_norm": 6.882403462235572, + "learning_rate": 4.947659751871466e-06, + "loss": 0.9036, + "step": 2583 + }, + { + "epoch": 0.18668159734137663, + "grad_norm": 7.258530432400621, + "learning_rate": 4.9476001953405155e-06, + "loss": 1.0291, + "step": 2584 + }, + { + "epoch": 0.18675384254158614, + "grad_norm": 6.81600833813511, + "learning_rate": 4.947540605303819e-06, + "loss": 0.9565, + "step": 2585 + }, + { + "epoch": 0.18682608774179565, + "grad_norm": 8.723565853267175, + "learning_rate": 4.9474809817621924e-06, + "loss": 1.0396, + "step": 2586 + }, + { + "epoch": 0.18689833294200517, + "grad_norm": 7.00997568210623, + "learning_rate": 4.947421324716452e-06, + "loss": 0.9756, + "step": 2587 + }, + { + "epoch": 0.18697057814221468, + "grad_norm": 6.420466136231398, + "learning_rate": 4.9473616341674125e-06, + "loss": 0.9271, + "step": 2588 + }, + { + "epoch": 0.1870428233424242, + "grad_norm": 8.41578105635919, + "learning_rate": 4.947301910115894e-06, + "loss": 1.037, + "step": 2589 + }, + { + "epoch": 0.1871150685426337, + "grad_norm": 6.317072052977531, + "learning_rate": 4.947242152562711e-06, + "loss": 0.9516, + "step": 2590 + }, + { + "epoch": 0.1871873137428432, + "grad_norm": 6.786195981704301, + "learning_rate": 4.9471823615086845e-06, + "loss": 0.9438, + "step": 2591 + }, + { + "epoch": 0.1872595589430527, + "grad_norm": 7.444809599440662, + "learning_rate": 4.947122536954631e-06, + "loss": 1.0955, + "step": 2592 + }, + { + "epoch": 0.18733180414326223, + "grad_norm": 7.576027617294423, + "learning_rate": 4.94706267890137e-06, + "loss": 0.9581, + "step": 2593 + }, + { + "epoch": 0.18740404934347174, + "grad_norm": 7.599832492789366, + "learning_rate": 4.947002787349721e-06, + "loss": 0.9602, + "step": 2594 + }, + { + "epoch": 0.18747629454368125, + "grad_norm": 8.243359291703193, + "learning_rate": 4.9469428623005034e-06, + "loss": 0.9876, + "step": 2595 + }, + { + "epoch": 0.18754853974389077, + "grad_norm": 6.539676760463598, + "learning_rate": 4.946882903754538e-06, + "loss": 0.8996, + "step": 2596 + }, + { + "epoch": 0.18762078494410028, + "grad_norm": 6.536789000963335, + "learning_rate": 4.946822911712646e-06, + "loss": 0.9958, + "step": 2597 + }, + { + "epoch": 0.1876930301443098, + "grad_norm": 7.313764153132536, + "learning_rate": 4.946762886175648e-06, + "loss": 0.9424, + "step": 2598 + }, + { + "epoch": 0.1877652753445193, + "grad_norm": 6.273894144923009, + "learning_rate": 4.946702827144366e-06, + "loss": 0.8166, + "step": 2599 + }, + { + "epoch": 0.1878375205447288, + "grad_norm": 7.578037410898455, + "learning_rate": 4.946642734619622e-06, + "loss": 1.0095, + "step": 2600 + }, + { + "epoch": 0.1879097657449383, + "grad_norm": 7.2985568671162175, + "learning_rate": 4.9465826086022385e-06, + "loss": 1.0039, + "step": 2601 + }, + { + "epoch": 0.18798201094514783, + "grad_norm": 6.95249569220274, + "learning_rate": 4.9465224490930385e-06, + "loss": 0.9981, + "step": 2602 + }, + { + "epoch": 0.18805425614535734, + "grad_norm": 6.323190531280316, + "learning_rate": 4.946462256092847e-06, + "loss": 0.9509, + "step": 2603 + }, + { + "epoch": 0.18812650134556685, + "grad_norm": 6.7285734531259545, + "learning_rate": 4.9464020296024855e-06, + "loss": 0.8719, + "step": 2604 + }, + { + "epoch": 0.18819874654577637, + "grad_norm": 8.876846766070752, + "learning_rate": 4.94634176962278e-06, + "loss": 0.9916, + "step": 2605 + }, + { + "epoch": 0.18827099174598588, + "grad_norm": 7.808017024334343, + "learning_rate": 4.946281476154555e-06, + "loss": 0.9543, + "step": 2606 + }, + { + "epoch": 0.1883432369461954, + "grad_norm": 8.015099580732763, + "learning_rate": 4.9462211491986366e-06, + "loss": 0.8705, + "step": 2607 + }, + { + "epoch": 0.1884154821464049, + "grad_norm": 6.389071026108581, + "learning_rate": 4.94616078875585e-06, + "loss": 0.8902, + "step": 2608 + }, + { + "epoch": 0.1884877273466144, + "grad_norm": 8.110012374849674, + "learning_rate": 4.9461003948270204e-06, + "loss": 0.933, + "step": 2609 + }, + { + "epoch": 0.1885599725468239, + "grad_norm": 8.14944349389907, + "learning_rate": 4.946039967412976e-06, + "loss": 0.899, + "step": 2610 + }, + { + "epoch": 0.18863221774703343, + "grad_norm": 7.798797064723398, + "learning_rate": 4.945979506514544e-06, + "loss": 1.0134, + "step": 2611 + }, + { + "epoch": 0.18870446294724294, + "grad_norm": 7.746581769643274, + "learning_rate": 4.9459190121325525e-06, + "loss": 0.9645, + "step": 2612 + }, + { + "epoch": 0.18877670814745245, + "grad_norm": 6.720560654912462, + "learning_rate": 4.945858484267828e-06, + "loss": 0.9995, + "step": 2613 + }, + { + "epoch": 0.18884895334766197, + "grad_norm": 7.295920593448657, + "learning_rate": 4.945797922921201e-06, + "loss": 0.9451, + "step": 2614 + }, + { + "epoch": 0.18892119854787148, + "grad_norm": 10.798037343593178, + "learning_rate": 4.945737328093499e-06, + "loss": 0.8805, + "step": 2615 + }, + { + "epoch": 0.188993443748081, + "grad_norm": 6.32915030756283, + "learning_rate": 4.945676699785551e-06, + "loss": 0.9414, + "step": 2616 + }, + { + "epoch": 0.1890656889482905, + "grad_norm": 6.789932809758515, + "learning_rate": 4.945616037998188e-06, + "loss": 0.9933, + "step": 2617 + }, + { + "epoch": 0.1891379341485, + "grad_norm": 7.088875461168371, + "learning_rate": 4.945555342732241e-06, + "loss": 0.894, + "step": 2618 + }, + { + "epoch": 0.1892101793487095, + "grad_norm": 8.369357813328303, + "learning_rate": 4.94549461398854e-06, + "loss": 0.9622, + "step": 2619 + }, + { + "epoch": 0.18928242454891903, + "grad_norm": 6.7177239632618155, + "learning_rate": 4.945433851767917e-06, + "loss": 0.8959, + "step": 2620 + }, + { + "epoch": 0.18935466974912854, + "grad_norm": 6.700456563851638, + "learning_rate": 4.945373056071203e-06, + "loss": 0.9463, + "step": 2621 + }, + { + "epoch": 0.18942691494933805, + "grad_norm": 6.7585121454117205, + "learning_rate": 4.9453122268992305e-06, + "loss": 0.8948, + "step": 2622 + }, + { + "epoch": 0.18949916014954757, + "grad_norm": 6.569116944028049, + "learning_rate": 4.945251364252832e-06, + "loss": 0.9008, + "step": 2623 + }, + { + "epoch": 0.18957140534975708, + "grad_norm": 7.379870859065514, + "learning_rate": 4.945190468132841e-06, + "loss": 0.9088, + "step": 2624 + }, + { + "epoch": 0.1896436505499666, + "grad_norm": 6.973200313595286, + "learning_rate": 4.945129538540092e-06, + "loss": 0.8675, + "step": 2625 + }, + { + "epoch": 0.1897158957501761, + "grad_norm": 12.386804364454992, + "learning_rate": 4.945068575475417e-06, + "loss": 0.9208, + "step": 2626 + }, + { + "epoch": 0.1897881409503856, + "grad_norm": 9.982355573236577, + "learning_rate": 4.945007578939652e-06, + "loss": 1.1001, + "step": 2627 + }, + { + "epoch": 0.1898603861505951, + "grad_norm": 6.8634707794334435, + "learning_rate": 4.944946548933632e-06, + "loss": 0.9752, + "step": 2628 + }, + { + "epoch": 0.18993263135080463, + "grad_norm": 11.181538447950187, + "learning_rate": 4.944885485458192e-06, + "loss": 0.9303, + "step": 2629 + }, + { + "epoch": 0.19000487655101414, + "grad_norm": 11.546521502750366, + "learning_rate": 4.944824388514168e-06, + "loss": 1.058, + "step": 2630 + }, + { + "epoch": 0.19007712175122365, + "grad_norm": 10.585823627155063, + "learning_rate": 4.944763258102396e-06, + "loss": 0.9448, + "step": 2631 + }, + { + "epoch": 0.19014936695143317, + "grad_norm": 7.695408916353476, + "learning_rate": 4.944702094223714e-06, + "loss": 0.9548, + "step": 2632 + }, + { + "epoch": 0.19022161215164268, + "grad_norm": 7.820998551404063, + "learning_rate": 4.944640896878958e-06, + "loss": 0.9294, + "step": 2633 + }, + { + "epoch": 0.1902938573518522, + "grad_norm": 7.119764043777188, + "learning_rate": 4.944579666068966e-06, + "loss": 0.9667, + "step": 2634 + }, + { + "epoch": 0.1903661025520617, + "grad_norm": 9.63581894043428, + "learning_rate": 4.944518401794577e-06, + "loss": 0.9265, + "step": 2635 + }, + { + "epoch": 0.1904383477522712, + "grad_norm": 9.581222655573953, + "learning_rate": 4.944457104056629e-06, + "loss": 1.0564, + "step": 2636 + }, + { + "epoch": 0.1905105929524807, + "grad_norm": 7.236362126691373, + "learning_rate": 4.9443957728559615e-06, + "loss": 0.9067, + "step": 2637 + }, + { + "epoch": 0.19058283815269023, + "grad_norm": 6.802613249890654, + "learning_rate": 4.944334408193413e-06, + "loss": 0.9651, + "step": 2638 + }, + { + "epoch": 0.19065508335289974, + "grad_norm": 7.581207089094346, + "learning_rate": 4.944273010069825e-06, + "loss": 1.0696, + "step": 2639 + }, + { + "epoch": 0.19072732855310925, + "grad_norm": 10.364976200089316, + "learning_rate": 4.944211578486037e-06, + "loss": 0.9594, + "step": 2640 + }, + { + "epoch": 0.19079957375331877, + "grad_norm": 7.856452292278596, + "learning_rate": 4.9441501134428905e-06, + "loss": 0.9041, + "step": 2641 + }, + { + "epoch": 0.19087181895352828, + "grad_norm": 7.129681571708855, + "learning_rate": 4.9440886149412275e-06, + "loss": 0.939, + "step": 2642 + }, + { + "epoch": 0.1909440641537378, + "grad_norm": 6.358683630550252, + "learning_rate": 4.944027082981888e-06, + "loss": 0.905, + "step": 2643 + }, + { + "epoch": 0.19101630935394728, + "grad_norm": 8.553235910663112, + "learning_rate": 4.943965517565715e-06, + "loss": 0.9699, + "step": 2644 + }, + { + "epoch": 0.1910885545541568, + "grad_norm": 9.705476878191275, + "learning_rate": 4.9439039186935525e-06, + "loss": 0.9611, + "step": 2645 + }, + { + "epoch": 0.1911607997543663, + "grad_norm": 8.106209021165457, + "learning_rate": 4.943842286366243e-06, + "loss": 0.993, + "step": 2646 + }, + { + "epoch": 0.19123304495457583, + "grad_norm": 6.115599600340075, + "learning_rate": 4.943780620584629e-06, + "loss": 0.9432, + "step": 2647 + }, + { + "epoch": 0.19130529015478534, + "grad_norm": 7.3616696159846855, + "learning_rate": 4.943718921349557e-06, + "loss": 1.052, + "step": 2648 + }, + { + "epoch": 0.19137753535499485, + "grad_norm": 7.89236159146795, + "learning_rate": 4.943657188661871e-06, + "loss": 0.9938, + "step": 2649 + }, + { + "epoch": 0.19144978055520437, + "grad_norm": 6.631862325198754, + "learning_rate": 4.9435954225224145e-06, + "loss": 0.9293, + "step": 2650 + }, + { + "epoch": 0.19152202575541388, + "grad_norm": 7.780260873456185, + "learning_rate": 4.943533622932034e-06, + "loss": 0.9856, + "step": 2651 + }, + { + "epoch": 0.1915942709556234, + "grad_norm": 7.614723159055916, + "learning_rate": 4.943471789891575e-06, + "loss": 1.0355, + "step": 2652 + }, + { + "epoch": 0.19166651615583288, + "grad_norm": 6.715656934744772, + "learning_rate": 4.9434099234018854e-06, + "loss": 0.8268, + "step": 2653 + }, + { + "epoch": 0.1917387613560424, + "grad_norm": 7.243490519578704, + "learning_rate": 4.943348023463811e-06, + "loss": 0.9753, + "step": 2654 + }, + { + "epoch": 0.1918110065562519, + "grad_norm": 6.39489454240616, + "learning_rate": 4.943286090078199e-06, + "loss": 0.9692, + "step": 2655 + }, + { + "epoch": 0.19188325175646143, + "grad_norm": 6.723302531943298, + "learning_rate": 4.943224123245897e-06, + "loss": 0.9644, + "step": 2656 + }, + { + "epoch": 0.19195549695667094, + "grad_norm": 6.028142415086301, + "learning_rate": 4.943162122967754e-06, + "loss": 0.8227, + "step": 2657 + }, + { + "epoch": 0.19202774215688045, + "grad_norm": 8.164787180008075, + "learning_rate": 4.943100089244619e-06, + "loss": 0.8431, + "step": 2658 + }, + { + "epoch": 0.19209998735708997, + "grad_norm": 6.953012255493933, + "learning_rate": 4.943038022077341e-06, + "loss": 0.9751, + "step": 2659 + }, + { + "epoch": 0.19217223255729948, + "grad_norm": 7.442575726263194, + "learning_rate": 4.942975921466769e-06, + "loss": 0.9887, + "step": 2660 + }, + { + "epoch": 0.192244477757509, + "grad_norm": 6.960989835223435, + "learning_rate": 4.942913787413753e-06, + "loss": 0.8914, + "step": 2661 + }, + { + "epoch": 0.19231672295771848, + "grad_norm": 6.773312471813205, + "learning_rate": 4.942851619919145e-06, + "loss": 0.9029, + "step": 2662 + }, + { + "epoch": 0.192388968157928, + "grad_norm": 6.818883582326815, + "learning_rate": 4.942789418983794e-06, + "loss": 0.9835, + "step": 2663 + }, + { + "epoch": 0.1924612133581375, + "grad_norm": 6.855537192500228, + "learning_rate": 4.942727184608553e-06, + "loss": 0.8992, + "step": 2664 + }, + { + "epoch": 0.19253345855834703, + "grad_norm": 8.200295270860487, + "learning_rate": 4.942664916794273e-06, + "loss": 0.921, + "step": 2665 + }, + { + "epoch": 0.19260570375855654, + "grad_norm": 7.275585934878426, + "learning_rate": 4.942602615541809e-06, + "loss": 0.9873, + "step": 2666 + }, + { + "epoch": 0.19267794895876605, + "grad_norm": 6.022896629818294, + "learning_rate": 4.942540280852009e-06, + "loss": 1.002, + "step": 2667 + }, + { + "epoch": 0.19275019415897557, + "grad_norm": 6.9471873787588265, + "learning_rate": 4.94247791272573e-06, + "loss": 1.0428, + "step": 2668 + }, + { + "epoch": 0.19282243935918508, + "grad_norm": 6.354722028966309, + "learning_rate": 4.942415511163826e-06, + "loss": 0.9861, + "step": 2669 + }, + { + "epoch": 0.1928946845593946, + "grad_norm": 7.9755681807113765, + "learning_rate": 4.942353076167149e-06, + "loss": 0.9305, + "step": 2670 + }, + { + "epoch": 0.19296692975960408, + "grad_norm": 6.02223789327224, + "learning_rate": 4.942290607736555e-06, + "loss": 0.9148, + "step": 2671 + }, + { + "epoch": 0.1930391749598136, + "grad_norm": 6.247913775348493, + "learning_rate": 4.9422281058728984e-06, + "loss": 0.8374, + "step": 2672 + }, + { + "epoch": 0.1931114201600231, + "grad_norm": 8.674298471893401, + "learning_rate": 4.942165570577035e-06, + "loss": 0.878, + "step": 2673 + }, + { + "epoch": 0.19318366536023263, + "grad_norm": 7.932434628442991, + "learning_rate": 4.942103001849823e-06, + "loss": 0.9997, + "step": 2674 + }, + { + "epoch": 0.19325591056044214, + "grad_norm": 8.167607726807399, + "learning_rate": 4.942040399692116e-06, + "loss": 0.9166, + "step": 2675 + }, + { + "epoch": 0.19332815576065165, + "grad_norm": 6.692219717592598, + "learning_rate": 4.941977764104772e-06, + "loss": 0.9031, + "step": 2676 + }, + { + "epoch": 0.19340040096086117, + "grad_norm": 9.620126654728999, + "learning_rate": 4.941915095088648e-06, + "loss": 0.9601, + "step": 2677 + }, + { + "epoch": 0.19347264616107068, + "grad_norm": 7.690624563503137, + "learning_rate": 4.941852392644603e-06, + "loss": 0.9384, + "step": 2678 + }, + { + "epoch": 0.1935448913612802, + "grad_norm": 7.389274877092374, + "learning_rate": 4.941789656773495e-06, + "loss": 0.9606, + "step": 2679 + }, + { + "epoch": 0.19361713656148968, + "grad_norm": 7.162072623985338, + "learning_rate": 4.941726887476182e-06, + "loss": 0.9663, + "step": 2680 + }, + { + "epoch": 0.1936893817616992, + "grad_norm": 5.846121403762438, + "learning_rate": 4.941664084753523e-06, + "loss": 0.9079, + "step": 2681 + }, + { + "epoch": 0.1937616269619087, + "grad_norm": 7.335206601718358, + "learning_rate": 4.9416012486063804e-06, + "loss": 1.0286, + "step": 2682 + }, + { + "epoch": 0.19383387216211823, + "grad_norm": 7.004362109572272, + "learning_rate": 4.941538379035611e-06, + "loss": 1.0122, + "step": 2683 + }, + { + "epoch": 0.19390611736232774, + "grad_norm": 7.227440561098305, + "learning_rate": 4.941475476042078e-06, + "loss": 0.8576, + "step": 2684 + }, + { + "epoch": 0.19397836256253725, + "grad_norm": 7.900231524587735, + "learning_rate": 4.941412539626641e-06, + "loss": 1.0118, + "step": 2685 + }, + { + "epoch": 0.19405060776274677, + "grad_norm": 6.854349367746508, + "learning_rate": 4.941349569790162e-06, + "loss": 1.0348, + "step": 2686 + }, + { + "epoch": 0.19412285296295628, + "grad_norm": 6.563278297411847, + "learning_rate": 4.941286566533502e-06, + "loss": 0.9249, + "step": 2687 + }, + { + "epoch": 0.1941950981631658, + "grad_norm": 8.077953580295578, + "learning_rate": 4.9412235298575255e-06, + "loss": 0.9923, + "step": 2688 + }, + { + "epoch": 0.19426734336337528, + "grad_norm": 7.570784389720445, + "learning_rate": 4.941160459763094e-06, + "loss": 1.0453, + "step": 2689 + }, + { + "epoch": 0.1943395885635848, + "grad_norm": 6.260797557785295, + "learning_rate": 4.941097356251071e-06, + "loss": 0.9441, + "step": 2690 + }, + { + "epoch": 0.1944118337637943, + "grad_norm": 6.645716109084036, + "learning_rate": 4.941034219322322e-06, + "loss": 0.9157, + "step": 2691 + }, + { + "epoch": 0.19448407896400383, + "grad_norm": 7.003358852528204, + "learning_rate": 4.940971048977709e-06, + "loss": 0.983, + "step": 2692 + }, + { + "epoch": 0.19455632416421334, + "grad_norm": 5.874412344903039, + "learning_rate": 4.9409078452180966e-06, + "loss": 0.9091, + "step": 2693 + }, + { + "epoch": 0.19462856936442285, + "grad_norm": 6.908348351597615, + "learning_rate": 4.940844608044352e-06, + "loss": 0.8616, + "step": 2694 + }, + { + "epoch": 0.19470081456463237, + "grad_norm": 6.225383958120607, + "learning_rate": 4.94078133745734e-06, + "loss": 0.9456, + "step": 2695 + }, + { + "epoch": 0.19477305976484188, + "grad_norm": 7.135693740105996, + "learning_rate": 4.940718033457926e-06, + "loss": 0.915, + "step": 2696 + }, + { + "epoch": 0.1948453049650514, + "grad_norm": 6.974311285985044, + "learning_rate": 4.940654696046978e-06, + "loss": 1.0251, + "step": 2697 + }, + { + "epoch": 0.19491755016526088, + "grad_norm": 6.839030550245225, + "learning_rate": 4.940591325225361e-06, + "loss": 0.9135, + "step": 2698 + }, + { + "epoch": 0.1949897953654704, + "grad_norm": 6.698499528630169, + "learning_rate": 4.940527920993945e-06, + "loss": 0.912, + "step": 2699 + }, + { + "epoch": 0.1950620405656799, + "grad_norm": 6.334733239438244, + "learning_rate": 4.940464483353596e-06, + "loss": 0.9126, + "step": 2700 + }, + { + "epoch": 0.19513428576588943, + "grad_norm": 7.660115165541915, + "learning_rate": 4.9404010123051835e-06, + "loss": 0.9748, + "step": 2701 + }, + { + "epoch": 0.19520653096609894, + "grad_norm": 6.75188193754055, + "learning_rate": 4.940337507849576e-06, + "loss": 1.0217, + "step": 2702 + }, + { + "epoch": 0.19527877616630845, + "grad_norm": 8.2545902313619, + "learning_rate": 4.940273969987644e-06, + "loss": 0.9782, + "step": 2703 + }, + { + "epoch": 0.19535102136651797, + "grad_norm": 7.826801850658585, + "learning_rate": 4.9402103987202545e-06, + "loss": 0.9647, + "step": 2704 + }, + { + "epoch": 0.19542326656672748, + "grad_norm": 6.5773061210914365, + "learning_rate": 4.94014679404828e-06, + "loss": 0.9679, + "step": 2705 + }, + { + "epoch": 0.195495511766937, + "grad_norm": 8.487761719029548, + "learning_rate": 4.94008315597259e-06, + "loss": 0.9019, + "step": 2706 + }, + { + "epoch": 0.19556775696714648, + "grad_norm": 5.718907088326589, + "learning_rate": 4.940019484494057e-06, + "loss": 0.937, + "step": 2707 + }, + { + "epoch": 0.195640002167356, + "grad_norm": 8.463318156103183, + "learning_rate": 4.939955779613551e-06, + "loss": 1.0458, + "step": 2708 + }, + { + "epoch": 0.1957122473675655, + "grad_norm": 9.944300214217604, + "learning_rate": 4.939892041331945e-06, + "loss": 1.1102, + "step": 2709 + }, + { + "epoch": 0.19578449256777503, + "grad_norm": 6.593851820896304, + "learning_rate": 4.939828269650111e-06, + "loss": 0.9111, + "step": 2710 + }, + { + "epoch": 0.19585673776798454, + "grad_norm": 8.786176175371626, + "learning_rate": 4.939764464568924e-06, + "loss": 1.0119, + "step": 2711 + }, + { + "epoch": 0.19592898296819405, + "grad_norm": 7.221092735718851, + "learning_rate": 4.939700626089254e-06, + "loss": 1.029, + "step": 2712 + }, + { + "epoch": 0.19600122816840357, + "grad_norm": 7.683887904654702, + "learning_rate": 4.939636754211977e-06, + "loss": 1.005, + "step": 2713 + }, + { + "epoch": 0.19607347336861308, + "grad_norm": 6.570634142320402, + "learning_rate": 4.939572848937968e-06, + "loss": 0.9161, + "step": 2714 + }, + { + "epoch": 0.1961457185688226, + "grad_norm": 6.443294546790424, + "learning_rate": 4.9395089102681006e-06, + "loss": 0.8869, + "step": 2715 + }, + { + "epoch": 0.19621796376903208, + "grad_norm": 6.207012812669282, + "learning_rate": 4.9394449382032505e-06, + "loss": 0.981, + "step": 2716 + }, + { + "epoch": 0.1962902089692416, + "grad_norm": 7.159153778108382, + "learning_rate": 4.939380932744293e-06, + "loss": 0.8832, + "step": 2717 + }, + { + "epoch": 0.1963624541694511, + "grad_norm": 9.38975970301016, + "learning_rate": 4.939316893892104e-06, + "loss": 0.9795, + "step": 2718 + }, + { + "epoch": 0.19643469936966063, + "grad_norm": 7.182642082829902, + "learning_rate": 4.939252821647562e-06, + "loss": 0.9779, + "step": 2719 + }, + { + "epoch": 0.19650694456987014, + "grad_norm": 8.657414657520228, + "learning_rate": 4.939188716011543e-06, + "loss": 0.9775, + "step": 2720 + }, + { + "epoch": 0.19657918977007965, + "grad_norm": 7.593835398503862, + "learning_rate": 4.939124576984922e-06, + "loss": 0.9857, + "step": 2721 + }, + { + "epoch": 0.19665143497028917, + "grad_norm": 7.578719220017927, + "learning_rate": 4.939060404568582e-06, + "loss": 1.0388, + "step": 2722 + }, + { + "epoch": 0.19672368017049868, + "grad_norm": 7.973418898787826, + "learning_rate": 4.938996198763397e-06, + "loss": 0.9444, + "step": 2723 + }, + { + "epoch": 0.1967959253707082, + "grad_norm": 7.827687386408321, + "learning_rate": 4.9389319595702495e-06, + "loss": 0.9154, + "step": 2724 + }, + { + "epoch": 0.19686817057091768, + "grad_norm": 7.870793505827538, + "learning_rate": 4.9388676869900165e-06, + "loss": 0.9788, + "step": 2725 + }, + { + "epoch": 0.1969404157711272, + "grad_norm": 8.518025303197799, + "learning_rate": 4.938803381023578e-06, + "loss": 0.9655, + "step": 2726 + }, + { + "epoch": 0.1970126609713367, + "grad_norm": 6.209747699520426, + "learning_rate": 4.938739041671816e-06, + "loss": 0.8296, + "step": 2727 + }, + { + "epoch": 0.19708490617154623, + "grad_norm": 7.370420424168754, + "learning_rate": 4.938674668935609e-06, + "loss": 1.0031, + "step": 2728 + }, + { + "epoch": 0.19715715137175574, + "grad_norm": 8.471131162447357, + "learning_rate": 4.938610262815839e-06, + "loss": 0.9375, + "step": 2729 + }, + { + "epoch": 0.19722939657196525, + "grad_norm": 8.154627846950811, + "learning_rate": 4.938545823313389e-06, + "loss": 0.9623, + "step": 2730 + }, + { + "epoch": 0.19730164177217477, + "grad_norm": 9.989922977490437, + "learning_rate": 4.93848135042914e-06, + "loss": 0.9769, + "step": 2731 + }, + { + "epoch": 0.19737388697238428, + "grad_norm": 7.018802186766834, + "learning_rate": 4.938416844163974e-06, + "loss": 0.9195, + "step": 2732 + }, + { + "epoch": 0.1974461321725938, + "grad_norm": 7.010866722258071, + "learning_rate": 4.938352304518775e-06, + "loss": 0.9738, + "step": 2733 + }, + { + "epoch": 0.19751837737280328, + "grad_norm": 6.57377397253232, + "learning_rate": 4.938287731494426e-06, + "loss": 0.9727, + "step": 2734 + }, + { + "epoch": 0.1975906225730128, + "grad_norm": 7.6128641075914585, + "learning_rate": 4.938223125091812e-06, + "loss": 0.9102, + "step": 2735 + }, + { + "epoch": 0.1976628677732223, + "grad_norm": 6.042099123856689, + "learning_rate": 4.938158485311816e-06, + "loss": 0.8974, + "step": 2736 + }, + { + "epoch": 0.19773511297343183, + "grad_norm": 7.791076583413442, + "learning_rate": 4.938093812155323e-06, + "loss": 0.9635, + "step": 2737 + }, + { + "epoch": 0.19780735817364134, + "grad_norm": 8.946662954148698, + "learning_rate": 4.938029105623219e-06, + "loss": 1.0139, + "step": 2738 + }, + { + "epoch": 0.19787960337385085, + "grad_norm": 8.233718527407541, + "learning_rate": 4.93796436571639e-06, + "loss": 0.9063, + "step": 2739 + }, + { + "epoch": 0.19795184857406037, + "grad_norm": 7.371829887145373, + "learning_rate": 4.937899592435721e-06, + "loss": 0.8817, + "step": 2740 + }, + { + "epoch": 0.19802409377426988, + "grad_norm": 7.019243492380234, + "learning_rate": 4.937834785782101e-06, + "loss": 0.9882, + "step": 2741 + }, + { + "epoch": 0.1980963389744794, + "grad_norm": 7.945608728038528, + "learning_rate": 4.937769945756414e-06, + "loss": 0.929, + "step": 2742 + }, + { + "epoch": 0.19816858417468888, + "grad_norm": 6.525609551382142, + "learning_rate": 4.93770507235955e-06, + "loss": 0.9671, + "step": 2743 + }, + { + "epoch": 0.1982408293748984, + "grad_norm": 8.163930733087367, + "learning_rate": 4.937640165592396e-06, + "loss": 0.8956, + "step": 2744 + }, + { + "epoch": 0.1983130745751079, + "grad_norm": 7.468418768397129, + "learning_rate": 4.937575225455841e-06, + "loss": 0.9031, + "step": 2745 + }, + { + "epoch": 0.19838531977531743, + "grad_norm": 9.54455038635329, + "learning_rate": 4.937510251950775e-06, + "loss": 0.9547, + "step": 2746 + }, + { + "epoch": 0.19845756497552694, + "grad_norm": 7.1222234301765255, + "learning_rate": 4.937445245078084e-06, + "loss": 0.9245, + "step": 2747 + }, + { + "epoch": 0.19852981017573645, + "grad_norm": 6.456235122294239, + "learning_rate": 4.937380204838662e-06, + "loss": 0.8174, + "step": 2748 + }, + { + "epoch": 0.19860205537594597, + "grad_norm": 7.509505288481684, + "learning_rate": 4.937315131233397e-06, + "loss": 0.9209, + "step": 2749 + }, + { + "epoch": 0.19867430057615548, + "grad_norm": 9.334646223461686, + "learning_rate": 4.937250024263179e-06, + "loss": 0.9826, + "step": 2750 + }, + { + "epoch": 0.19874654577636497, + "grad_norm": 7.3359629800209785, + "learning_rate": 4.937184883928902e-06, + "loss": 0.9312, + "step": 2751 + }, + { + "epoch": 0.19881879097657448, + "grad_norm": 6.167864537098561, + "learning_rate": 4.937119710231456e-06, + "loss": 0.8835, + "step": 2752 + }, + { + "epoch": 0.198891036176784, + "grad_norm": 7.476486221609977, + "learning_rate": 4.937054503171733e-06, + "loss": 0.922, + "step": 2753 + }, + { + "epoch": 0.1989632813769935, + "grad_norm": 6.505470028315144, + "learning_rate": 4.936989262750627e-06, + "loss": 0.87, + "step": 2754 + }, + { + "epoch": 0.19903552657720303, + "grad_norm": 6.528790900788652, + "learning_rate": 4.93692398896903e-06, + "loss": 1.0097, + "step": 2755 + }, + { + "epoch": 0.19910777177741254, + "grad_norm": 8.009638701826011, + "learning_rate": 4.936858681827835e-06, + "loss": 0.9679, + "step": 2756 + }, + { + "epoch": 0.19918001697762205, + "grad_norm": 9.113578023055721, + "learning_rate": 4.9367933413279375e-06, + "loss": 0.9694, + "step": 2757 + }, + { + "epoch": 0.19925226217783157, + "grad_norm": 7.310031832979247, + "learning_rate": 4.93672796747023e-06, + "loss": 0.8989, + "step": 2758 + }, + { + "epoch": 0.19932450737804108, + "grad_norm": 7.743291350814815, + "learning_rate": 4.93666256025561e-06, + "loss": 0.8908, + "step": 2759 + }, + { + "epoch": 0.19939675257825057, + "grad_norm": 6.404894876205552, + "learning_rate": 4.9365971196849715e-06, + "loss": 0.9718, + "step": 2760 + }, + { + "epoch": 0.19946899777846008, + "grad_norm": 8.295769626814131, + "learning_rate": 4.93653164575921e-06, + "loss": 0.998, + "step": 2761 + }, + { + "epoch": 0.1995412429786696, + "grad_norm": 6.1381566956904585, + "learning_rate": 4.9364661384792215e-06, + "loss": 0.9477, + "step": 2762 + }, + { + "epoch": 0.1996134881788791, + "grad_norm": 6.487766197197013, + "learning_rate": 4.936400597845904e-06, + "loss": 0.9394, + "step": 2763 + }, + { + "epoch": 0.19968573337908863, + "grad_norm": 8.365753535358035, + "learning_rate": 4.936335023860154e-06, + "loss": 0.9709, + "step": 2764 + }, + { + "epoch": 0.19975797857929814, + "grad_norm": 6.770972836353338, + "learning_rate": 4.936269416522869e-06, + "loss": 0.9255, + "step": 2765 + }, + { + "epoch": 0.19983022377950765, + "grad_norm": 6.770655922154879, + "learning_rate": 4.936203775834947e-06, + "loss": 0.8991, + "step": 2766 + }, + { + "epoch": 0.19990246897971717, + "grad_norm": 7.892036296670143, + "learning_rate": 4.936138101797288e-06, + "loss": 0.8975, + "step": 2767 + }, + { + "epoch": 0.19997471417992668, + "grad_norm": 7.039429378792283, + "learning_rate": 4.9360723944107895e-06, + "loss": 0.9045, + "step": 2768 + }, + { + "epoch": 0.20004695938013617, + "grad_norm": 7.201216902151065, + "learning_rate": 4.936006653676351e-06, + "loss": 0.9732, + "step": 2769 + }, + { + "epoch": 0.20011920458034568, + "grad_norm": 6.35531299027402, + "learning_rate": 4.935940879594872e-06, + "loss": 0.9526, + "step": 2770 + }, + { + "epoch": 0.2001914497805552, + "grad_norm": 6.938981181619329, + "learning_rate": 4.935875072167256e-06, + "loss": 0.9298, + "step": 2771 + }, + { + "epoch": 0.2002636949807647, + "grad_norm": 7.009321545734233, + "learning_rate": 4.935809231394399e-06, + "loss": 1.0036, + "step": 2772 + }, + { + "epoch": 0.20033594018097423, + "grad_norm": 5.774347587162817, + "learning_rate": 4.9357433572772064e-06, + "loss": 0.868, + "step": 2773 + }, + { + "epoch": 0.20040818538118374, + "grad_norm": 8.771145365790721, + "learning_rate": 4.935677449816579e-06, + "loss": 1.018, + "step": 2774 + }, + { + "epoch": 0.20048043058139325, + "grad_norm": 6.726888012957194, + "learning_rate": 4.935611509013417e-06, + "loss": 0.843, + "step": 2775 + }, + { + "epoch": 0.20055267578160277, + "grad_norm": 8.279125704101636, + "learning_rate": 4.935545534868626e-06, + "loss": 1.0404, + "step": 2776 + }, + { + "epoch": 0.20062492098181228, + "grad_norm": 6.454669462763771, + "learning_rate": 4.935479527383107e-06, + "loss": 0.901, + "step": 2777 + }, + { + "epoch": 0.20069716618202177, + "grad_norm": 6.262052283206155, + "learning_rate": 4.935413486557764e-06, + "loss": 0.9887, + "step": 2778 + }, + { + "epoch": 0.20076941138223128, + "grad_norm": 6.983589554898617, + "learning_rate": 4.935347412393502e-06, + "loss": 0.9435, + "step": 2779 + }, + { + "epoch": 0.2008416565824408, + "grad_norm": 7.530165724836729, + "learning_rate": 4.935281304891224e-06, + "loss": 1.0706, + "step": 2780 + }, + { + "epoch": 0.2009139017826503, + "grad_norm": 7.247090775409899, + "learning_rate": 4.935215164051837e-06, + "loss": 0.9616, + "step": 2781 + }, + { + "epoch": 0.20098614698285983, + "grad_norm": 7.154683970383629, + "learning_rate": 4.935148989876245e-06, + "loss": 0.9347, + "step": 2782 + }, + { + "epoch": 0.20105839218306934, + "grad_norm": 7.985488604918133, + "learning_rate": 4.935082782365353e-06, + "loss": 0.9175, + "step": 2783 + }, + { + "epoch": 0.20113063738327885, + "grad_norm": 6.22867761773899, + "learning_rate": 4.93501654152007e-06, + "loss": 0.9695, + "step": 2784 + }, + { + "epoch": 0.20120288258348837, + "grad_norm": 6.598354088131622, + "learning_rate": 4.9349502673412995e-06, + "loss": 0.8628, + "step": 2785 + }, + { + "epoch": 0.20127512778369788, + "grad_norm": 6.053864296356599, + "learning_rate": 4.934883959829952e-06, + "loss": 0.9718, + "step": 2786 + }, + { + "epoch": 0.20134737298390737, + "grad_norm": 8.005556084536305, + "learning_rate": 4.934817618986932e-06, + "loss": 0.9485, + "step": 2787 + }, + { + "epoch": 0.20141961818411688, + "grad_norm": 6.749388773053043, + "learning_rate": 4.934751244813151e-06, + "loss": 0.9596, + "step": 2788 + }, + { + "epoch": 0.2014918633843264, + "grad_norm": 8.123929936297044, + "learning_rate": 4.9346848373095155e-06, + "loss": 0.9113, + "step": 2789 + }, + { + "epoch": 0.2015641085845359, + "grad_norm": 7.626102305296567, + "learning_rate": 4.934618396476934e-06, + "loss": 0.9152, + "step": 2790 + }, + { + "epoch": 0.20163635378474543, + "grad_norm": 7.893674234855743, + "learning_rate": 4.934551922316318e-06, + "loss": 0.9739, + "step": 2791 + }, + { + "epoch": 0.20170859898495494, + "grad_norm": 6.175136660403412, + "learning_rate": 4.934485414828576e-06, + "loss": 0.9185, + "step": 2792 + }, + { + "epoch": 0.20178084418516445, + "grad_norm": 6.445456023496714, + "learning_rate": 4.934418874014619e-06, + "loss": 0.8877, + "step": 2793 + }, + { + "epoch": 0.20185308938537397, + "grad_norm": 6.683568065957371, + "learning_rate": 4.934352299875358e-06, + "loss": 0.9997, + "step": 2794 + }, + { + "epoch": 0.20192533458558348, + "grad_norm": 6.7032419474610325, + "learning_rate": 4.934285692411704e-06, + "loss": 0.974, + "step": 2795 + }, + { + "epoch": 0.20199757978579297, + "grad_norm": 8.848267173306581, + "learning_rate": 4.934219051624569e-06, + "loss": 0.9793, + "step": 2796 + }, + { + "epoch": 0.20206982498600248, + "grad_norm": 7.471355159454085, + "learning_rate": 4.934152377514866e-06, + "loss": 0.9234, + "step": 2797 + }, + { + "epoch": 0.202142070186212, + "grad_norm": 6.865513585627341, + "learning_rate": 4.934085670083506e-06, + "loss": 0.9763, + "step": 2798 + }, + { + "epoch": 0.2022143153864215, + "grad_norm": 7.248838923646103, + "learning_rate": 4.934018929331403e-06, + "loss": 0.9523, + "step": 2799 + }, + { + "epoch": 0.20228656058663103, + "grad_norm": 6.852424036751036, + "learning_rate": 4.9339521552594715e-06, + "loss": 0.7972, + "step": 2800 + }, + { + "epoch": 0.20235880578684054, + "grad_norm": 7.652208655169179, + "learning_rate": 4.9338853478686244e-06, + "loss": 0.9638, + "step": 2801 + }, + { + "epoch": 0.20243105098705005, + "grad_norm": 7.656879757263235, + "learning_rate": 4.933818507159776e-06, + "loss": 0.9381, + "step": 2802 + }, + { + "epoch": 0.20250329618725957, + "grad_norm": 6.523340720184048, + "learning_rate": 4.933751633133843e-06, + "loss": 0.9404, + "step": 2803 + }, + { + "epoch": 0.20257554138746908, + "grad_norm": 7.585026506592636, + "learning_rate": 4.933684725791741e-06, + "loss": 0.9433, + "step": 2804 + }, + { + "epoch": 0.20264778658767857, + "grad_norm": 6.777729579705525, + "learning_rate": 4.933617785134383e-06, + "loss": 0.95, + "step": 2805 + }, + { + "epoch": 0.20272003178788808, + "grad_norm": 7.599388007570058, + "learning_rate": 4.933550811162687e-06, + "loss": 0.944, + "step": 2806 + }, + { + "epoch": 0.2027922769880976, + "grad_norm": 6.912390279330247, + "learning_rate": 4.93348380387757e-06, + "loss": 0.8614, + "step": 2807 + }, + { + "epoch": 0.2028645221883071, + "grad_norm": 6.918593203483147, + "learning_rate": 4.93341676327995e-06, + "loss": 0.9119, + "step": 2808 + }, + { + "epoch": 0.20293676738851663, + "grad_norm": 7.802969773123143, + "learning_rate": 4.933349689370743e-06, + "loss": 0.9916, + "step": 2809 + }, + { + "epoch": 0.20300901258872614, + "grad_norm": 6.574120686842901, + "learning_rate": 4.933282582150869e-06, + "loss": 1.0079, + "step": 2810 + }, + { + "epoch": 0.20308125778893565, + "grad_norm": 7.895092958817466, + "learning_rate": 4.933215441621245e-06, + "loss": 0.8953, + "step": 2811 + }, + { + "epoch": 0.20315350298914517, + "grad_norm": 7.179399519863782, + "learning_rate": 4.9331482677827915e-06, + "loss": 0.8938, + "step": 2812 + }, + { + "epoch": 0.20322574818935468, + "grad_norm": 7.028341140917623, + "learning_rate": 4.933081060636427e-06, + "loss": 0.962, + "step": 2813 + }, + { + "epoch": 0.20329799338956417, + "grad_norm": 7.424444199641074, + "learning_rate": 4.933013820183072e-06, + "loss": 0.9892, + "step": 2814 + }, + { + "epoch": 0.20337023858977368, + "grad_norm": 7.032315999921641, + "learning_rate": 4.932946546423647e-06, + "loss": 0.9417, + "step": 2815 + }, + { + "epoch": 0.2034424837899832, + "grad_norm": 6.0594282725344835, + "learning_rate": 4.932879239359073e-06, + "loss": 0.9225, + "step": 2816 + }, + { + "epoch": 0.2035147289901927, + "grad_norm": 7.597470725964317, + "learning_rate": 4.932811898990271e-06, + "loss": 1.0034, + "step": 2817 + }, + { + "epoch": 0.20358697419040223, + "grad_norm": 7.153354642089126, + "learning_rate": 4.932744525318163e-06, + "loss": 0.8603, + "step": 2818 + }, + { + "epoch": 0.20365921939061174, + "grad_norm": 8.889714896712055, + "learning_rate": 4.932677118343671e-06, + "loss": 0.9087, + "step": 2819 + }, + { + "epoch": 0.20373146459082125, + "grad_norm": 7.122904686937711, + "learning_rate": 4.932609678067719e-06, + "loss": 0.9345, + "step": 2820 + }, + { + "epoch": 0.20380370979103077, + "grad_norm": 7.474198273496828, + "learning_rate": 4.932542204491229e-06, + "loss": 0.9439, + "step": 2821 + }, + { + "epoch": 0.20387595499124028, + "grad_norm": 6.377491482826566, + "learning_rate": 4.932474697615125e-06, + "loss": 1.0222, + "step": 2822 + }, + { + "epoch": 0.20394820019144977, + "grad_norm": 8.241381217968431, + "learning_rate": 4.932407157440331e-06, + "loss": 0.8777, + "step": 2823 + }, + { + "epoch": 0.20402044539165928, + "grad_norm": 8.99159908275171, + "learning_rate": 4.932339583967772e-06, + "loss": 0.9734, + "step": 2824 + }, + { + "epoch": 0.2040926905918688, + "grad_norm": 8.654809274142172, + "learning_rate": 4.932271977198372e-06, + "loss": 0.9561, + "step": 2825 + }, + { + "epoch": 0.2041649357920783, + "grad_norm": 7.874065283146435, + "learning_rate": 4.932204337133058e-06, + "loss": 1.0246, + "step": 2826 + }, + { + "epoch": 0.20423718099228783, + "grad_norm": 8.339540484093824, + "learning_rate": 4.932136663772754e-06, + "loss": 0.8874, + "step": 2827 + }, + { + "epoch": 0.20430942619249734, + "grad_norm": 6.823202982323498, + "learning_rate": 4.932068957118388e-06, + "loss": 0.9313, + "step": 2828 + }, + { + "epoch": 0.20438167139270685, + "grad_norm": 6.607196710433353, + "learning_rate": 4.932001217170886e-06, + "loss": 0.9881, + "step": 2829 + }, + { + "epoch": 0.20445391659291637, + "grad_norm": 6.558807787603366, + "learning_rate": 4.931933443931177e-06, + "loss": 0.9315, + "step": 2830 + }, + { + "epoch": 0.20452616179312588, + "grad_norm": 7.862662444991325, + "learning_rate": 4.931865637400186e-06, + "loss": 0.9293, + "step": 2831 + }, + { + "epoch": 0.20459840699333537, + "grad_norm": 8.877665374685105, + "learning_rate": 4.931797797578843e-06, + "loss": 0.9277, + "step": 2832 + }, + { + "epoch": 0.20467065219354488, + "grad_norm": 8.777727934404762, + "learning_rate": 4.9317299244680775e-06, + "loss": 0.9376, + "step": 2833 + }, + { + "epoch": 0.2047428973937544, + "grad_norm": 7.972045456377235, + "learning_rate": 4.931662018068816e-06, + "loss": 0.9795, + "step": 2834 + }, + { + "epoch": 0.2048151425939639, + "grad_norm": 8.579703541860344, + "learning_rate": 4.93159407838199e-06, + "loss": 0.9784, + "step": 2835 + }, + { + "epoch": 0.20488738779417343, + "grad_norm": 7.4684933414811345, + "learning_rate": 4.931526105408529e-06, + "loss": 0.9679, + "step": 2836 + }, + { + "epoch": 0.20495963299438294, + "grad_norm": 7.933578363475971, + "learning_rate": 4.931458099149363e-06, + "loss": 0.9042, + "step": 2837 + }, + { + "epoch": 0.20503187819459245, + "grad_norm": 7.5903355335560025, + "learning_rate": 4.931390059605424e-06, + "loss": 0.8873, + "step": 2838 + }, + { + "epoch": 0.20510412339480197, + "grad_norm": 8.375464583079667, + "learning_rate": 4.9313219867776445e-06, + "loss": 0.9563, + "step": 2839 + }, + { + "epoch": 0.20517636859501148, + "grad_norm": 7.29917332397162, + "learning_rate": 4.931253880666953e-06, + "loss": 0.9426, + "step": 2840 + }, + { + "epoch": 0.20524861379522097, + "grad_norm": 6.992793597159898, + "learning_rate": 4.931185741274284e-06, + "loss": 0.908, + "step": 2841 + }, + { + "epoch": 0.20532085899543048, + "grad_norm": 7.473337464457631, + "learning_rate": 4.9311175686005695e-06, + "loss": 1.0491, + "step": 2842 + }, + { + "epoch": 0.20539310419564, + "grad_norm": 9.369975053068028, + "learning_rate": 4.9310493626467435e-06, + "loss": 1.0768, + "step": 2843 + }, + { + "epoch": 0.2054653493958495, + "grad_norm": 9.294773922243886, + "learning_rate": 4.930981123413739e-06, + "loss": 0.9896, + "step": 2844 + }, + { + "epoch": 0.20553759459605903, + "grad_norm": 7.314924311008368, + "learning_rate": 4.930912850902491e-06, + "loss": 1.0189, + "step": 2845 + }, + { + "epoch": 0.20560983979626854, + "grad_norm": 5.959434389759619, + "learning_rate": 4.930844545113934e-06, + "loss": 0.9787, + "step": 2846 + }, + { + "epoch": 0.20568208499647805, + "grad_norm": 7.353407981864865, + "learning_rate": 4.9307762060490014e-06, + "loss": 0.98, + "step": 2847 + }, + { + "epoch": 0.20575433019668757, + "grad_norm": 6.016420782388955, + "learning_rate": 4.93070783370863e-06, + "loss": 0.9653, + "step": 2848 + }, + { + "epoch": 0.20582657539689708, + "grad_norm": 8.8850148606914, + "learning_rate": 4.9306394280937565e-06, + "loss": 0.9935, + "step": 2849 + }, + { + "epoch": 0.20589882059710657, + "grad_norm": 9.91675403032902, + "learning_rate": 4.930570989205317e-06, + "loss": 0.9709, + "step": 2850 + }, + { + "epoch": 0.20597106579731608, + "grad_norm": 6.5021813473764265, + "learning_rate": 4.930502517044247e-06, + "loss": 0.8667, + "step": 2851 + }, + { + "epoch": 0.2060433109975256, + "grad_norm": 5.679591119045544, + "learning_rate": 4.930434011611485e-06, + "loss": 0.9504, + "step": 2852 + }, + { + "epoch": 0.2061155561977351, + "grad_norm": 7.668364737610193, + "learning_rate": 4.930365472907967e-06, + "loss": 0.9302, + "step": 2853 + }, + { + "epoch": 0.20618780139794463, + "grad_norm": 10.007511369640135, + "learning_rate": 4.930296900934635e-06, + "loss": 0.9632, + "step": 2854 + }, + { + "epoch": 0.20626004659815414, + "grad_norm": 7.88871467051712, + "learning_rate": 4.9302282956924245e-06, + "loss": 1.0631, + "step": 2855 + }, + { + "epoch": 0.20633229179836365, + "grad_norm": 10.07956967903644, + "learning_rate": 4.930159657182275e-06, + "loss": 0.9043, + "step": 2856 + }, + { + "epoch": 0.20640453699857317, + "grad_norm": 6.594555846232711, + "learning_rate": 4.930090985405127e-06, + "loss": 0.9279, + "step": 2857 + }, + { + "epoch": 0.20647678219878265, + "grad_norm": 8.166067711281507, + "learning_rate": 4.93002228036192e-06, + "loss": 0.9265, + "step": 2858 + }, + { + "epoch": 0.20654902739899217, + "grad_norm": 10.094411349996081, + "learning_rate": 4.929953542053596e-06, + "loss": 1.0322, + "step": 2859 + }, + { + "epoch": 0.20662127259920168, + "grad_norm": 9.124301413354246, + "learning_rate": 4.929884770481094e-06, + "loss": 1.0096, + "step": 2860 + }, + { + "epoch": 0.2066935177994112, + "grad_norm": 7.850320564090639, + "learning_rate": 4.929815965645356e-06, + "loss": 0.8971, + "step": 2861 + }, + { + "epoch": 0.2067657629996207, + "grad_norm": 8.161754869739163, + "learning_rate": 4.929747127547324e-06, + "loss": 0.9226, + "step": 2862 + }, + { + "epoch": 0.20683800819983023, + "grad_norm": 8.017153469646155, + "learning_rate": 4.929678256187941e-06, + "loss": 0.8883, + "step": 2863 + }, + { + "epoch": 0.20691025340003974, + "grad_norm": 8.226133098064382, + "learning_rate": 4.929609351568148e-06, + "loss": 0.908, + "step": 2864 + }, + { + "epoch": 0.20698249860024925, + "grad_norm": 6.7495645453254, + "learning_rate": 4.92954041368889e-06, + "loss": 0.8926, + "step": 2865 + }, + { + "epoch": 0.20705474380045877, + "grad_norm": 8.037792583734705, + "learning_rate": 4.9294714425511105e-06, + "loss": 1.0686, + "step": 2866 + }, + { + "epoch": 0.20712698900066825, + "grad_norm": 6.473047570397796, + "learning_rate": 4.929402438155754e-06, + "loss": 0.9855, + "step": 2867 + }, + { + "epoch": 0.20719923420087777, + "grad_norm": 6.730306365477976, + "learning_rate": 4.929333400503763e-06, + "loss": 0.9125, + "step": 2868 + }, + { + "epoch": 0.20727147940108728, + "grad_norm": 8.37586128019433, + "learning_rate": 4.929264329596085e-06, + "loss": 0.9693, + "step": 2869 + }, + { + "epoch": 0.2073437246012968, + "grad_norm": 8.300064307273939, + "learning_rate": 4.929195225433664e-06, + "loss": 1.0175, + "step": 2870 + }, + { + "epoch": 0.2074159698015063, + "grad_norm": 9.690628432611856, + "learning_rate": 4.929126088017446e-06, + "loss": 1.0135, + "step": 2871 + }, + { + "epoch": 0.20748821500171583, + "grad_norm": 7.365430927916319, + "learning_rate": 4.929056917348379e-06, + "loss": 0.9753, + "step": 2872 + }, + { + "epoch": 0.20756046020192534, + "grad_norm": 8.200111983860438, + "learning_rate": 4.928987713427409e-06, + "loss": 0.9929, + "step": 2873 + }, + { + "epoch": 0.20763270540213485, + "grad_norm": 6.759291858488036, + "learning_rate": 4.928918476255482e-06, + "loss": 0.9596, + "step": 2874 + }, + { + "epoch": 0.20770495060234437, + "grad_norm": 6.036811754634119, + "learning_rate": 4.928849205833548e-06, + "loss": 0.8504, + "step": 2875 + }, + { + "epoch": 0.20777719580255385, + "grad_norm": 6.0577477775148685, + "learning_rate": 4.9287799021625535e-06, + "loss": 0.8913, + "step": 2876 + }, + { + "epoch": 0.20784944100276337, + "grad_norm": 6.880984943769803, + "learning_rate": 4.928710565243448e-06, + "loss": 0.9029, + "step": 2877 + }, + { + "epoch": 0.20792168620297288, + "grad_norm": 8.62145436703298, + "learning_rate": 4.928641195077182e-06, + "loss": 0.8602, + "step": 2878 + }, + { + "epoch": 0.2079939314031824, + "grad_norm": 7.68141516561455, + "learning_rate": 4.928571791664703e-06, + "loss": 0.9045, + "step": 2879 + }, + { + "epoch": 0.2080661766033919, + "grad_norm": 6.834864871203906, + "learning_rate": 4.9285023550069614e-06, + "loss": 0.9546, + "step": 2880 + }, + { + "epoch": 0.20813842180360143, + "grad_norm": 7.164459716700664, + "learning_rate": 4.928432885104908e-06, + "loss": 0.8776, + "step": 2881 + }, + { + "epoch": 0.20821066700381094, + "grad_norm": 8.162341886569209, + "learning_rate": 4.9283633819594945e-06, + "loss": 0.9895, + "step": 2882 + }, + { + "epoch": 0.20828291220402045, + "grad_norm": 6.2815269295296545, + "learning_rate": 4.928293845571671e-06, + "loss": 0.9078, + "step": 2883 + }, + { + "epoch": 0.20835515740422997, + "grad_norm": 7.355546542671473, + "learning_rate": 4.928224275942392e-06, + "loss": 0.96, + "step": 2884 + }, + { + "epoch": 0.20842740260443945, + "grad_norm": 7.125304901559339, + "learning_rate": 4.928154673072606e-06, + "loss": 0.9607, + "step": 2885 + }, + { + "epoch": 0.20849964780464897, + "grad_norm": 8.733129750371077, + "learning_rate": 4.9280850369632685e-06, + "loss": 0.9913, + "step": 2886 + }, + { + "epoch": 0.20857189300485848, + "grad_norm": 5.978770208561883, + "learning_rate": 4.9280153676153335e-06, + "loss": 0.9392, + "step": 2887 + }, + { + "epoch": 0.208644138205068, + "grad_norm": 6.929932737257983, + "learning_rate": 4.927945665029751e-06, + "loss": 0.8647, + "step": 2888 + }, + { + "epoch": 0.2087163834052775, + "grad_norm": 7.157497513824255, + "learning_rate": 4.92787592920748e-06, + "loss": 0.9238, + "step": 2889 + }, + { + "epoch": 0.20878862860548703, + "grad_norm": 7.076706093552387, + "learning_rate": 4.927806160149471e-06, + "loss": 0.9241, + "step": 2890 + }, + { + "epoch": 0.20886087380569654, + "grad_norm": 7.525935026863071, + "learning_rate": 4.9277363578566806e-06, + "loss": 0.982, + "step": 2891 + }, + { + "epoch": 0.20893311900590605, + "grad_norm": 7.162559692773182, + "learning_rate": 4.927666522330065e-06, + "loss": 0.9001, + "step": 2892 + }, + { + "epoch": 0.20900536420611557, + "grad_norm": 6.0838138268631825, + "learning_rate": 4.92759665357058e-06, + "loss": 0.9372, + "step": 2893 + }, + { + "epoch": 0.20907760940632505, + "grad_norm": 7.255108841987282, + "learning_rate": 4.927526751579182e-06, + "loss": 0.9098, + "step": 2894 + }, + { + "epoch": 0.20914985460653457, + "grad_norm": 6.704176600203693, + "learning_rate": 4.927456816356826e-06, + "loss": 0.9183, + "step": 2895 + }, + { + "epoch": 0.20922209980674408, + "grad_norm": 7.381487562990141, + "learning_rate": 4.927386847904473e-06, + "loss": 0.8559, + "step": 2896 + }, + { + "epoch": 0.2092943450069536, + "grad_norm": 6.420089139230394, + "learning_rate": 4.927316846223077e-06, + "loss": 0.8774, + "step": 2897 + }, + { + "epoch": 0.2093665902071631, + "grad_norm": 7.780785481632407, + "learning_rate": 4.927246811313599e-06, + "loss": 0.8442, + "step": 2898 + }, + { + "epoch": 0.20943883540737263, + "grad_norm": 8.046690114684287, + "learning_rate": 4.927176743176997e-06, + "loss": 0.9251, + "step": 2899 + }, + { + "epoch": 0.20951108060758214, + "grad_norm": 6.1912355802581045, + "learning_rate": 4.927106641814229e-06, + "loss": 0.8805, + "step": 2900 + }, + { + "epoch": 0.20958332580779165, + "grad_norm": 6.754008198062981, + "learning_rate": 4.927036507226256e-06, + "loss": 0.9848, + "step": 2901 + }, + { + "epoch": 0.20965557100800117, + "grad_norm": 6.234562538787211, + "learning_rate": 4.926966339414039e-06, + "loss": 0.9127, + "step": 2902 + }, + { + "epoch": 0.20972781620821065, + "grad_norm": 5.426648591751127, + "learning_rate": 4.926896138378535e-06, + "loss": 0.833, + "step": 2903 + }, + { + "epoch": 0.20980006140842017, + "grad_norm": 7.202039260295374, + "learning_rate": 4.926825904120709e-06, + "loss": 0.9493, + "step": 2904 + }, + { + "epoch": 0.20987230660862968, + "grad_norm": 8.078559908052194, + "learning_rate": 4.92675563664152e-06, + "loss": 0.9693, + "step": 2905 + }, + { + "epoch": 0.2099445518088392, + "grad_norm": 7.949690221353189, + "learning_rate": 4.926685335941931e-06, + "loss": 1.0673, + "step": 2906 + }, + { + "epoch": 0.2100167970090487, + "grad_norm": 7.322970727003809, + "learning_rate": 4.926615002022903e-06, + "loss": 0.8954, + "step": 2907 + }, + { + "epoch": 0.21008904220925823, + "grad_norm": 8.367423202941806, + "learning_rate": 4.926544634885401e-06, + "loss": 0.9462, + "step": 2908 + }, + { + "epoch": 0.21016128740946774, + "grad_norm": 8.081791471793533, + "learning_rate": 4.926474234530386e-06, + "loss": 0.9097, + "step": 2909 + }, + { + "epoch": 0.21023353260967725, + "grad_norm": 7.247801611823112, + "learning_rate": 4.926403800958824e-06, + "loss": 1.0008, + "step": 2910 + }, + { + "epoch": 0.21030577780988677, + "grad_norm": 7.48600047385, + "learning_rate": 4.926333334171676e-06, + "loss": 0.9692, + "step": 2911 + }, + { + "epoch": 0.21037802301009625, + "grad_norm": 9.590764976118873, + "learning_rate": 4.92626283416991e-06, + "loss": 0.9456, + "step": 2912 + }, + { + "epoch": 0.21045026821030577, + "grad_norm": 7.170798094103367, + "learning_rate": 4.926192300954489e-06, + "loss": 1.0406, + "step": 2913 + }, + { + "epoch": 0.21052251341051528, + "grad_norm": 8.703063635892285, + "learning_rate": 4.92612173452638e-06, + "loss": 0.8623, + "step": 2914 + }, + { + "epoch": 0.2105947586107248, + "grad_norm": 13.660194787442066, + "learning_rate": 4.926051134886548e-06, + "loss": 1.0566, + "step": 2915 + }, + { + "epoch": 0.2106670038109343, + "grad_norm": 8.569675974824051, + "learning_rate": 4.925980502035959e-06, + "loss": 1.002, + "step": 2916 + }, + { + "epoch": 0.21073924901114383, + "grad_norm": 6.025702103983176, + "learning_rate": 4.925909835975581e-06, + "loss": 0.9284, + "step": 2917 + }, + { + "epoch": 0.21081149421135334, + "grad_norm": 7.4330500265054855, + "learning_rate": 4.9258391367063814e-06, + "loss": 0.9188, + "step": 2918 + }, + { + "epoch": 0.21088373941156285, + "grad_norm": 6.679131521543806, + "learning_rate": 4.925768404229327e-06, + "loss": 0.8616, + "step": 2919 + }, + { + "epoch": 0.21095598461177237, + "grad_norm": 8.77856795326314, + "learning_rate": 4.925697638545387e-06, + "loss": 0.9136, + "step": 2920 + }, + { + "epoch": 0.21102822981198185, + "grad_norm": 9.208797759575232, + "learning_rate": 4.925626839655529e-06, + "loss": 0.9763, + "step": 2921 + }, + { + "epoch": 0.21110047501219137, + "grad_norm": 7.76864080375352, + "learning_rate": 4.925556007560723e-06, + "loss": 0.957, + "step": 2922 + }, + { + "epoch": 0.21117272021240088, + "grad_norm": 6.337986324393053, + "learning_rate": 4.92548514226194e-06, + "loss": 0.9282, + "step": 2923 + }, + { + "epoch": 0.2112449654126104, + "grad_norm": 6.31245830257732, + "learning_rate": 4.925414243760147e-06, + "loss": 0.8955, + "step": 2924 + }, + { + "epoch": 0.2113172106128199, + "grad_norm": 7.290096295509596, + "learning_rate": 4.925343312056318e-06, + "loss": 0.9739, + "step": 2925 + }, + { + "epoch": 0.21138945581302943, + "grad_norm": 7.35874544273457, + "learning_rate": 4.9252723471514205e-06, + "loss": 0.9718, + "step": 2926 + }, + { + "epoch": 0.21146170101323894, + "grad_norm": 9.994016383034174, + "learning_rate": 4.925201349046429e-06, + "loss": 0.9518, + "step": 2927 + }, + { + "epoch": 0.21153394621344845, + "grad_norm": 7.492637326417489, + "learning_rate": 4.925130317742313e-06, + "loss": 0.8837, + "step": 2928 + }, + { + "epoch": 0.21160619141365797, + "grad_norm": 6.394457276603284, + "learning_rate": 4.9250592532400466e-06, + "loss": 0.9747, + "step": 2929 + }, + { + "epoch": 0.21167843661386745, + "grad_norm": 6.737067089433598, + "learning_rate": 4.924988155540601e-06, + "loss": 0.9798, + "step": 2930 + }, + { + "epoch": 0.21175068181407697, + "grad_norm": 5.6920154797899105, + "learning_rate": 4.924917024644952e-06, + "loss": 0.9058, + "step": 2931 + }, + { + "epoch": 0.21182292701428648, + "grad_norm": 7.390827587488326, + "learning_rate": 4.92484586055407e-06, + "loss": 0.906, + "step": 2932 + }, + { + "epoch": 0.211895172214496, + "grad_norm": 6.975731606893718, + "learning_rate": 4.924774663268932e-06, + "loss": 0.7564, + "step": 2933 + }, + { + "epoch": 0.2119674174147055, + "grad_norm": 7.215922805237896, + "learning_rate": 4.9247034327905115e-06, + "loss": 0.9734, + "step": 2934 + }, + { + "epoch": 0.21203966261491503, + "grad_norm": 6.453058496460437, + "learning_rate": 4.924632169119784e-06, + "loss": 0.9691, + "step": 2935 + }, + { + "epoch": 0.21211190781512454, + "grad_norm": 7.726126539931582, + "learning_rate": 4.924560872257724e-06, + "loss": 0.9491, + "step": 2936 + }, + { + "epoch": 0.21218415301533405, + "grad_norm": 6.93909497870833, + "learning_rate": 4.9244895422053085e-06, + "loss": 0.9156, + "step": 2937 + }, + { + "epoch": 0.21225639821554357, + "grad_norm": 6.791324567857608, + "learning_rate": 4.924418178963514e-06, + "loss": 0.901, + "step": 2938 + }, + { + "epoch": 0.21232864341575305, + "grad_norm": 7.379048674760346, + "learning_rate": 4.924346782533317e-06, + "loss": 1.0545, + "step": 2939 + }, + { + "epoch": 0.21240088861596257, + "grad_norm": 6.923148216793387, + "learning_rate": 4.924275352915694e-06, + "loss": 0.9205, + "step": 2940 + }, + { + "epoch": 0.21247313381617208, + "grad_norm": 6.965572436730372, + "learning_rate": 4.924203890111624e-06, + "loss": 0.9382, + "step": 2941 + }, + { + "epoch": 0.2125453790163816, + "grad_norm": 6.742937136953759, + "learning_rate": 4.924132394122086e-06, + "loss": 0.9804, + "step": 2942 + }, + { + "epoch": 0.2126176242165911, + "grad_norm": 6.834937426846441, + "learning_rate": 4.924060864948057e-06, + "loss": 0.9324, + "step": 2943 + }, + { + "epoch": 0.21268986941680063, + "grad_norm": 21.430455777105003, + "learning_rate": 4.923989302590518e-06, + "loss": 0.9517, + "step": 2944 + }, + { + "epoch": 0.21276211461701014, + "grad_norm": 7.828975303900379, + "learning_rate": 4.923917707050446e-06, + "loss": 0.9294, + "step": 2945 + }, + { + "epoch": 0.21283435981721965, + "grad_norm": 7.0081995216080974, + "learning_rate": 4.923846078328823e-06, + "loss": 0.9718, + "step": 2946 + }, + { + "epoch": 0.21290660501742917, + "grad_norm": 8.462625801807958, + "learning_rate": 4.92377441642663e-06, + "loss": 0.9833, + "step": 2947 + }, + { + "epoch": 0.21297885021763865, + "grad_norm": 5.966473369912662, + "learning_rate": 4.923702721344847e-06, + "loss": 0.9127, + "step": 2948 + }, + { + "epoch": 0.21305109541784817, + "grad_norm": 6.3287133214175135, + "learning_rate": 4.923630993084455e-06, + "loss": 0.8757, + "step": 2949 + }, + { + "epoch": 0.21312334061805768, + "grad_norm": 6.291832658544329, + "learning_rate": 4.923559231646437e-06, + "loss": 0.9055, + "step": 2950 + }, + { + "epoch": 0.2131955858182672, + "grad_norm": 6.364318276071729, + "learning_rate": 4.923487437031774e-06, + "loss": 0.9161, + "step": 2951 + }, + { + "epoch": 0.2132678310184767, + "grad_norm": 6.301755920816584, + "learning_rate": 4.9234156092414505e-06, + "loss": 0.9306, + "step": 2952 + }, + { + "epoch": 0.21334007621868623, + "grad_norm": 7.059976042363742, + "learning_rate": 4.923343748276449e-06, + "loss": 1.0121, + "step": 2953 + }, + { + "epoch": 0.21341232141889574, + "grad_norm": 6.654759773028531, + "learning_rate": 4.9232718541377535e-06, + "loss": 0.9871, + "step": 2954 + }, + { + "epoch": 0.21348456661910525, + "grad_norm": 7.499391912604508, + "learning_rate": 4.923199926826347e-06, + "loss": 0.9726, + "step": 2955 + }, + { + "epoch": 0.21355681181931477, + "grad_norm": 7.269982522263959, + "learning_rate": 4.9231279663432156e-06, + "loss": 0.9237, + "step": 2956 + }, + { + "epoch": 0.21362905701952425, + "grad_norm": 6.51844869120282, + "learning_rate": 4.923055972689344e-06, + "loss": 0.9785, + "step": 2957 + }, + { + "epoch": 0.21370130221973377, + "grad_norm": 6.779455272656153, + "learning_rate": 4.922983945865717e-06, + "loss": 0.9127, + "step": 2958 + }, + { + "epoch": 0.21377354741994328, + "grad_norm": 6.812669104480889, + "learning_rate": 4.92291188587332e-06, + "loss": 0.8746, + "step": 2959 + }, + { + "epoch": 0.2138457926201528, + "grad_norm": 7.431122169004748, + "learning_rate": 4.9228397927131425e-06, + "loss": 0.9413, + "step": 2960 + }, + { + "epoch": 0.2139180378203623, + "grad_norm": 9.219778229865952, + "learning_rate": 4.922767666386169e-06, + "loss": 1.0039, + "step": 2961 + }, + { + "epoch": 0.21399028302057183, + "grad_norm": 7.194107866044354, + "learning_rate": 4.922695506893387e-06, + "loss": 0.9248, + "step": 2962 + }, + { + "epoch": 0.21406252822078134, + "grad_norm": 7.512821935778906, + "learning_rate": 4.922623314235784e-06, + "loss": 0.9067, + "step": 2963 + }, + { + "epoch": 0.21413477342099085, + "grad_norm": 6.547944703033493, + "learning_rate": 4.9225510884143504e-06, + "loss": 0.8541, + "step": 2964 + }, + { + "epoch": 0.21420701862120034, + "grad_norm": 7.210574321755552, + "learning_rate": 4.922478829430072e-06, + "loss": 0.9912, + "step": 2965 + }, + { + "epoch": 0.21427926382140985, + "grad_norm": 8.99875632276179, + "learning_rate": 4.922406537283941e-06, + "loss": 0.9952, + "step": 2966 + }, + { + "epoch": 0.21435150902161937, + "grad_norm": 10.513174601896466, + "learning_rate": 4.922334211976943e-06, + "loss": 0.9419, + "step": 2967 + }, + { + "epoch": 0.21442375422182888, + "grad_norm": 9.044640352385711, + "learning_rate": 4.922261853510072e-06, + "loss": 0.9351, + "step": 2968 + }, + { + "epoch": 0.2144959994220384, + "grad_norm": 7.5994954293051125, + "learning_rate": 4.922189461884317e-06, + "loss": 0.8884, + "step": 2969 + }, + { + "epoch": 0.2145682446222479, + "grad_norm": 7.936348658885923, + "learning_rate": 4.922117037100668e-06, + "loss": 1.0357, + "step": 2970 + }, + { + "epoch": 0.21464048982245743, + "grad_norm": 8.786544344122465, + "learning_rate": 4.9220445791601185e-06, + "loss": 0.9456, + "step": 2971 + }, + { + "epoch": 0.21471273502266694, + "grad_norm": 9.044404161859774, + "learning_rate": 4.921972088063659e-06, + "loss": 0.9743, + "step": 2972 + }, + { + "epoch": 0.21478498022287645, + "grad_norm": 6.984863537252049, + "learning_rate": 4.921899563812282e-06, + "loss": 0.9081, + "step": 2973 + }, + { + "epoch": 0.21485722542308594, + "grad_norm": 7.001743099622084, + "learning_rate": 4.921827006406981e-06, + "loss": 0.9117, + "step": 2974 + }, + { + "epoch": 0.21492947062329545, + "grad_norm": 7.9182219199011294, + "learning_rate": 4.921754415848748e-06, + "loss": 0.8887, + "step": 2975 + }, + { + "epoch": 0.21500171582350497, + "grad_norm": 7.049612253767352, + "learning_rate": 4.921681792138577e-06, + "loss": 1.0012, + "step": 2976 + }, + { + "epoch": 0.21507396102371448, + "grad_norm": 6.13072559785554, + "learning_rate": 4.921609135277463e-06, + "loss": 0.9014, + "step": 2977 + }, + { + "epoch": 0.215146206223924, + "grad_norm": 7.25354127885024, + "learning_rate": 4.9215364452664005e-06, + "loss": 0.9699, + "step": 2978 + }, + { + "epoch": 0.2152184514241335, + "grad_norm": 7.3983596427067555, + "learning_rate": 4.9214637221063845e-06, + "loss": 0.9056, + "step": 2979 + }, + { + "epoch": 0.21529069662434303, + "grad_norm": 6.800425897612297, + "learning_rate": 4.92139096579841e-06, + "loss": 0.9354, + "step": 2980 + }, + { + "epoch": 0.21536294182455254, + "grad_norm": 7.340983157719668, + "learning_rate": 4.921318176343474e-06, + "loss": 0.8737, + "step": 2981 + }, + { + "epoch": 0.21543518702476205, + "grad_norm": 7.045727553909249, + "learning_rate": 4.921245353742571e-06, + "loss": 0.9161, + "step": 2982 + }, + { + "epoch": 0.21550743222497154, + "grad_norm": 7.345037339377227, + "learning_rate": 4.921172497996699e-06, + "loss": 0.9747, + "step": 2983 + }, + { + "epoch": 0.21557967742518105, + "grad_norm": 9.034612432293807, + "learning_rate": 4.9210996091068565e-06, + "loss": 1.011, + "step": 2984 + }, + { + "epoch": 0.21565192262539057, + "grad_norm": 6.6888332285770735, + "learning_rate": 4.921026687074039e-06, + "loss": 1.0055, + "step": 2985 + }, + { + "epoch": 0.21572416782560008, + "grad_norm": 7.285127188698738, + "learning_rate": 4.9209537318992466e-06, + "loss": 0.9196, + "step": 2986 + }, + { + "epoch": 0.2157964130258096, + "grad_norm": 6.747628819810808, + "learning_rate": 4.920880743583478e-06, + "loss": 0.903, + "step": 2987 + }, + { + "epoch": 0.2158686582260191, + "grad_norm": 8.357889489065155, + "learning_rate": 4.92080772212773e-06, + "loss": 1.006, + "step": 2988 + }, + { + "epoch": 0.21594090342622863, + "grad_norm": 7.731141310277071, + "learning_rate": 4.9207346675330055e-06, + "loss": 0.8892, + "step": 2989 + }, + { + "epoch": 0.21601314862643814, + "grad_norm": 6.723852021322203, + "learning_rate": 4.920661579800303e-06, + "loss": 0.8607, + "step": 2990 + }, + { + "epoch": 0.21608539382664765, + "grad_norm": 7.715864746522006, + "learning_rate": 4.920588458930622e-06, + "loss": 0.9562, + "step": 2991 + }, + { + "epoch": 0.21615763902685714, + "grad_norm": 8.203065475974515, + "learning_rate": 4.920515304924965e-06, + "loss": 0.9777, + "step": 2992 + }, + { + "epoch": 0.21622988422706665, + "grad_norm": 6.796612333286663, + "learning_rate": 4.920442117784333e-06, + "loss": 0.8827, + "step": 2993 + }, + { + "epoch": 0.21630212942727617, + "grad_norm": 6.652284112328097, + "learning_rate": 4.920368897509727e-06, + "loss": 0.8814, + "step": 2994 + }, + { + "epoch": 0.21637437462748568, + "grad_norm": 7.060376683633454, + "learning_rate": 4.920295644102151e-06, + "loss": 0.8752, + "step": 2995 + }, + { + "epoch": 0.2164466198276952, + "grad_norm": 7.727145551971159, + "learning_rate": 4.9202223575626065e-06, + "loss": 0.9349, + "step": 2996 + }, + { + "epoch": 0.2165188650279047, + "grad_norm": 6.943558530459578, + "learning_rate": 4.920149037892097e-06, + "loss": 0.8562, + "step": 2997 + }, + { + "epoch": 0.21659111022811423, + "grad_norm": 10.124625870185548, + "learning_rate": 4.9200756850916264e-06, + "loss": 0.9096, + "step": 2998 + }, + { + "epoch": 0.21666335542832374, + "grad_norm": 6.2385295132972844, + "learning_rate": 4.9200022991621995e-06, + "loss": 0.8847, + "step": 2999 + }, + { + "epoch": 0.21673560062853325, + "grad_norm": 7.3880755360061405, + "learning_rate": 4.919928880104819e-06, + "loss": 0.8805, + "step": 3000 + }, + { + "epoch": 0.21680784582874274, + "grad_norm": 6.772775162253469, + "learning_rate": 4.919855427920491e-06, + "loss": 0.9124, + "step": 3001 + }, + { + "epoch": 0.21688009102895225, + "grad_norm": 6.313255944642083, + "learning_rate": 4.919781942610222e-06, + "loss": 1.0062, + "step": 3002 + }, + { + "epoch": 0.21695233622916177, + "grad_norm": 6.877727175711146, + "learning_rate": 4.919708424175017e-06, + "loss": 0.9834, + "step": 3003 + }, + { + "epoch": 0.21702458142937128, + "grad_norm": 8.276096107747177, + "learning_rate": 4.919634872615882e-06, + "loss": 0.9302, + "step": 3004 + }, + { + "epoch": 0.2170968266295808, + "grad_norm": 7.332966650840828, + "learning_rate": 4.919561287933824e-06, + "loss": 0.9447, + "step": 3005 + }, + { + "epoch": 0.2171690718297903, + "grad_norm": 7.818946315090767, + "learning_rate": 4.9194876701298515e-06, + "loss": 1.0125, + "step": 3006 + }, + { + "epoch": 0.21724131702999983, + "grad_norm": 8.507256944240725, + "learning_rate": 4.919414019204971e-06, + "loss": 0.9868, + "step": 3007 + }, + { + "epoch": 0.21731356223020934, + "grad_norm": 6.394062936233857, + "learning_rate": 4.919340335160191e-06, + "loss": 0.9752, + "step": 3008 + }, + { + "epoch": 0.21738580743041885, + "grad_norm": 9.572978343633489, + "learning_rate": 4.919266617996521e-06, + "loss": 0.9594, + "step": 3009 + }, + { + "epoch": 0.21745805263062834, + "grad_norm": 8.833193891852181, + "learning_rate": 4.919192867714968e-06, + "loss": 0.976, + "step": 3010 + }, + { + "epoch": 0.21753029783083785, + "grad_norm": 6.553599689047031, + "learning_rate": 4.919119084316544e-06, + "loss": 0.9785, + "step": 3011 + }, + { + "epoch": 0.21760254303104737, + "grad_norm": 8.177094114907966, + "learning_rate": 4.919045267802259e-06, + "loss": 0.9695, + "step": 3012 + }, + { + "epoch": 0.21767478823125688, + "grad_norm": 7.401061739660572, + "learning_rate": 4.918971418173121e-06, + "loss": 1.0776, + "step": 3013 + }, + { + "epoch": 0.2177470334314664, + "grad_norm": 8.292275053417091, + "learning_rate": 4.9188975354301425e-06, + "loss": 0.9349, + "step": 3014 + }, + { + "epoch": 0.2178192786316759, + "grad_norm": 7.91445822446593, + "learning_rate": 4.918823619574335e-06, + "loss": 0.9416, + "step": 3015 + }, + { + "epoch": 0.21789152383188543, + "grad_norm": 9.830337542213272, + "learning_rate": 4.91874967060671e-06, + "loss": 0.9388, + "step": 3016 + }, + { + "epoch": 0.21796376903209494, + "grad_norm": 7.8376657666163245, + "learning_rate": 4.918675688528282e-06, + "loss": 0.8668, + "step": 3017 + }, + { + "epoch": 0.21803601423230445, + "grad_norm": 7.063917169761407, + "learning_rate": 4.91860167334006e-06, + "loss": 0.9965, + "step": 3018 + }, + { + "epoch": 0.21810825943251394, + "grad_norm": 7.523614962535775, + "learning_rate": 4.918527625043059e-06, + "loss": 0.9257, + "step": 3019 + }, + { + "epoch": 0.21818050463272345, + "grad_norm": 7.304642588431453, + "learning_rate": 4.918453543638294e-06, + "loss": 0.9295, + "step": 3020 + }, + { + "epoch": 0.21825274983293297, + "grad_norm": 6.592973563957242, + "learning_rate": 4.918379429126776e-06, + "loss": 0.7957, + "step": 3021 + }, + { + "epoch": 0.21832499503314248, + "grad_norm": 6.256336509571404, + "learning_rate": 4.9183052815095225e-06, + "loss": 0.9426, + "step": 3022 + }, + { + "epoch": 0.218397240233352, + "grad_norm": 7.804298184969108, + "learning_rate": 4.918231100787547e-06, + "loss": 1.039, + "step": 3023 + }, + { + "epoch": 0.2184694854335615, + "grad_norm": 6.299374067910314, + "learning_rate": 4.9181568869618655e-06, + "loss": 0.9872, + "step": 3024 + }, + { + "epoch": 0.21854173063377103, + "grad_norm": 5.681351576632508, + "learning_rate": 4.918082640033494e-06, + "loss": 0.8517, + "step": 3025 + }, + { + "epoch": 0.21861397583398054, + "grad_norm": 6.245983511671814, + "learning_rate": 4.918008360003449e-06, + "loss": 0.8723, + "step": 3026 + }, + { + "epoch": 0.21868622103419005, + "grad_norm": 8.588179124745299, + "learning_rate": 4.917934046872746e-06, + "loss": 0.9254, + "step": 3027 + }, + { + "epoch": 0.21875846623439954, + "grad_norm": 6.239723607324349, + "learning_rate": 4.917859700642404e-06, + "loss": 0.9076, + "step": 3028 + }, + { + "epoch": 0.21883071143460905, + "grad_norm": 7.786448262866258, + "learning_rate": 4.9177853213134405e-06, + "loss": 0.8577, + "step": 3029 + }, + { + "epoch": 0.21890295663481857, + "grad_norm": 7.179724692164278, + "learning_rate": 4.917710908886872e-06, + "loss": 0.9948, + "step": 3030 + }, + { + "epoch": 0.21897520183502808, + "grad_norm": 9.565763427769614, + "learning_rate": 4.917636463363719e-06, + "loss": 1.052, + "step": 3031 + }, + { + "epoch": 0.2190474470352376, + "grad_norm": 7.437741572199468, + "learning_rate": 4.9175619847450005e-06, + "loss": 0.9214, + "step": 3032 + }, + { + "epoch": 0.2191196922354471, + "grad_norm": 8.683447901384259, + "learning_rate": 4.917487473031735e-06, + "loss": 0.9935, + "step": 3033 + }, + { + "epoch": 0.21919193743565663, + "grad_norm": 7.9547175103938175, + "learning_rate": 4.917412928224943e-06, + "loss": 0.9692, + "step": 3034 + }, + { + "epoch": 0.21926418263586614, + "grad_norm": 7.430511781874528, + "learning_rate": 4.917338350325645e-06, + "loss": 0.9948, + "step": 3035 + }, + { + "epoch": 0.21933642783607565, + "grad_norm": 8.060068636239919, + "learning_rate": 4.917263739334862e-06, + "loss": 0.9299, + "step": 3036 + }, + { + "epoch": 0.21940867303628514, + "grad_norm": 6.31828632106144, + "learning_rate": 4.917189095253615e-06, + "loss": 0.9417, + "step": 3037 + }, + { + "epoch": 0.21948091823649465, + "grad_norm": 7.682145153308036, + "learning_rate": 4.9171144180829265e-06, + "loss": 0.9582, + "step": 3038 + }, + { + "epoch": 0.21955316343670417, + "grad_norm": 6.608673419194829, + "learning_rate": 4.917039707823818e-06, + "loss": 0.9829, + "step": 3039 + }, + { + "epoch": 0.21962540863691368, + "grad_norm": 7.0174941395788455, + "learning_rate": 4.916964964477314e-06, + "loss": 0.8656, + "step": 3040 + }, + { + "epoch": 0.2196976538371232, + "grad_norm": 6.64880613925482, + "learning_rate": 4.916890188044435e-06, + "loss": 0.9262, + "step": 3041 + }, + { + "epoch": 0.2197698990373327, + "grad_norm": 6.434651753947534, + "learning_rate": 4.916815378526206e-06, + "loss": 0.9382, + "step": 3042 + }, + { + "epoch": 0.21984214423754223, + "grad_norm": 6.651060561887885, + "learning_rate": 4.9167405359236505e-06, + "loss": 0.9153, + "step": 3043 + }, + { + "epoch": 0.21991438943775174, + "grad_norm": 7.73292999640794, + "learning_rate": 4.9166656602377946e-06, + "loss": 0.8531, + "step": 3044 + }, + { + "epoch": 0.21998663463796125, + "grad_norm": 6.033659932831696, + "learning_rate": 4.916590751469662e-06, + "loss": 0.9111, + "step": 3045 + }, + { + "epoch": 0.22005887983817074, + "grad_norm": 7.020009189610566, + "learning_rate": 4.916515809620278e-06, + "loss": 0.9405, + "step": 3046 + }, + { + "epoch": 0.22013112503838025, + "grad_norm": 6.437365187927755, + "learning_rate": 4.9164408346906696e-06, + "loss": 1.0196, + "step": 3047 + }, + { + "epoch": 0.22020337023858977, + "grad_norm": 7.015454534485948, + "learning_rate": 4.916365826681861e-06, + "loss": 0.9849, + "step": 3048 + }, + { + "epoch": 0.22027561543879928, + "grad_norm": 7.18110279168092, + "learning_rate": 4.916290785594882e-06, + "loss": 1.0241, + "step": 3049 + }, + { + "epoch": 0.2203478606390088, + "grad_norm": 6.257179104375891, + "learning_rate": 4.916215711430757e-06, + "loss": 0.8653, + "step": 3050 + }, + { + "epoch": 0.2204201058392183, + "grad_norm": 8.67482530049687, + "learning_rate": 4.916140604190516e-06, + "loss": 0.9129, + "step": 3051 + }, + { + "epoch": 0.22049235103942783, + "grad_norm": 5.985682252576442, + "learning_rate": 4.916065463875186e-06, + "loss": 0.9338, + "step": 3052 + }, + { + "epoch": 0.22056459623963734, + "grad_norm": 7.228567869981421, + "learning_rate": 4.915990290485796e-06, + "loss": 0.8946, + "step": 3053 + }, + { + "epoch": 0.22063684143984685, + "grad_norm": 6.063167889172699, + "learning_rate": 4.915915084023374e-06, + "loss": 0.8654, + "step": 3054 + }, + { + "epoch": 0.22070908664005634, + "grad_norm": 8.04398414016139, + "learning_rate": 4.91583984448895e-06, + "loss": 0.9389, + "step": 3055 + }, + { + "epoch": 0.22078133184026585, + "grad_norm": 7.514227279154071, + "learning_rate": 4.915764571883555e-06, + "loss": 0.9598, + "step": 3056 + }, + { + "epoch": 0.22085357704047537, + "grad_norm": 5.995783277792636, + "learning_rate": 4.915689266208219e-06, + "loss": 0.8767, + "step": 3057 + }, + { + "epoch": 0.22092582224068488, + "grad_norm": 7.950901042811319, + "learning_rate": 4.915613927463973e-06, + "loss": 0.9243, + "step": 3058 + }, + { + "epoch": 0.2209980674408944, + "grad_norm": 6.6547227996943255, + "learning_rate": 4.915538555651846e-06, + "loss": 0.9502, + "step": 3059 + }, + { + "epoch": 0.2210703126411039, + "grad_norm": 6.580066820170609, + "learning_rate": 4.915463150772874e-06, + "loss": 0.8988, + "step": 3060 + }, + { + "epoch": 0.22114255784131343, + "grad_norm": 9.446438428032513, + "learning_rate": 4.915387712828085e-06, + "loss": 0.88, + "step": 3061 + }, + { + "epoch": 0.22121480304152294, + "grad_norm": 7.152902944842051, + "learning_rate": 4.915312241818514e-06, + "loss": 0.9792, + "step": 3062 + }, + { + "epoch": 0.22128704824173245, + "grad_norm": 6.074345921598813, + "learning_rate": 4.915236737745195e-06, + "loss": 0.8991, + "step": 3063 + }, + { + "epoch": 0.22135929344194194, + "grad_norm": 10.307272215997207, + "learning_rate": 4.91516120060916e-06, + "loss": 0.9635, + "step": 3064 + }, + { + "epoch": 0.22143153864215145, + "grad_norm": 6.667518211657408, + "learning_rate": 4.915085630411442e-06, + "loss": 0.8517, + "step": 3065 + }, + { + "epoch": 0.22150378384236097, + "grad_norm": 9.266247475928767, + "learning_rate": 4.915010027153079e-06, + "loss": 0.9712, + "step": 3066 + }, + { + "epoch": 0.22157602904257048, + "grad_norm": 7.3135304825300675, + "learning_rate": 4.914934390835102e-06, + "loss": 0.9846, + "step": 3067 + }, + { + "epoch": 0.22164827424278, + "grad_norm": 7.2406837852834585, + "learning_rate": 4.9148587214585496e-06, + "loss": 0.8977, + "step": 3068 + }, + { + "epoch": 0.2217205194429895, + "grad_norm": 6.867658544709307, + "learning_rate": 4.914783019024456e-06, + "loss": 0.9922, + "step": 3069 + }, + { + "epoch": 0.22179276464319903, + "grad_norm": 7.953039382443744, + "learning_rate": 4.914707283533857e-06, + "loss": 0.9987, + "step": 3070 + }, + { + "epoch": 0.22186500984340854, + "grad_norm": 9.624108607642539, + "learning_rate": 4.914631514987791e-06, + "loss": 0.9623, + "step": 3071 + }, + { + "epoch": 0.22193725504361803, + "grad_norm": 6.945726345850054, + "learning_rate": 4.914555713387295e-06, + "loss": 0.9149, + "step": 3072 + }, + { + "epoch": 0.22200950024382754, + "grad_norm": 7.0390221259073815, + "learning_rate": 4.9144798787334045e-06, + "loss": 0.9698, + "step": 3073 + }, + { + "epoch": 0.22208174544403705, + "grad_norm": 5.458073772442062, + "learning_rate": 4.91440401102716e-06, + "loss": 0.9429, + "step": 3074 + }, + { + "epoch": 0.22215399064424657, + "grad_norm": 9.732253919695163, + "learning_rate": 4.9143281102696e-06, + "loss": 0.9189, + "step": 3075 + }, + { + "epoch": 0.22222623584445608, + "grad_norm": 10.551355385156317, + "learning_rate": 4.9142521764617616e-06, + "loss": 1.0507, + "step": 3076 + }, + { + "epoch": 0.2222984810446656, + "grad_norm": 6.052238037793398, + "learning_rate": 4.914176209604686e-06, + "loss": 0.9008, + "step": 3077 + }, + { + "epoch": 0.2223707262448751, + "grad_norm": 8.704335099473468, + "learning_rate": 4.914100209699412e-06, + "loss": 0.919, + "step": 3078 + }, + { + "epoch": 0.22244297144508463, + "grad_norm": 8.46387930015647, + "learning_rate": 4.914024176746981e-06, + "loss": 1.0411, + "step": 3079 + }, + { + "epoch": 0.22251521664529414, + "grad_norm": 6.457711204179235, + "learning_rate": 4.913948110748433e-06, + "loss": 0.908, + "step": 3080 + }, + { + "epoch": 0.22258746184550363, + "grad_norm": 8.367856751362808, + "learning_rate": 4.913872011704811e-06, + "loss": 0.865, + "step": 3081 + }, + { + "epoch": 0.22265970704571314, + "grad_norm": 9.12059353553428, + "learning_rate": 4.913795879617154e-06, + "loss": 0.8475, + "step": 3082 + }, + { + "epoch": 0.22273195224592265, + "grad_norm": 6.710141082508472, + "learning_rate": 4.913719714486506e-06, + "loss": 0.9367, + "step": 3083 + }, + { + "epoch": 0.22280419744613217, + "grad_norm": 7.842015534389202, + "learning_rate": 4.913643516313909e-06, + "loss": 0.9756, + "step": 3084 + }, + { + "epoch": 0.22287644264634168, + "grad_norm": 7.795251711732443, + "learning_rate": 4.913567285100407e-06, + "loss": 0.9476, + "step": 3085 + }, + { + "epoch": 0.2229486878465512, + "grad_norm": 8.389037253756946, + "learning_rate": 4.913491020847043e-06, + "loss": 0.8799, + "step": 3086 + }, + { + "epoch": 0.2230209330467607, + "grad_norm": 7.4743036666644525, + "learning_rate": 4.91341472355486e-06, + "loss": 0.8948, + "step": 3087 + }, + { + "epoch": 0.22309317824697023, + "grad_norm": 6.312074552494607, + "learning_rate": 4.913338393224903e-06, + "loss": 1.0076, + "step": 3088 + }, + { + "epoch": 0.22316542344717974, + "grad_norm": 7.203839909953731, + "learning_rate": 4.913262029858219e-06, + "loss": 0.9741, + "step": 3089 + }, + { + "epoch": 0.22323766864738923, + "grad_norm": 8.078711482880967, + "learning_rate": 4.91318563345585e-06, + "loss": 0.9793, + "step": 3090 + }, + { + "epoch": 0.22330991384759874, + "grad_norm": 7.648548472206309, + "learning_rate": 4.913109204018844e-06, + "loss": 0.9847, + "step": 3091 + }, + { + "epoch": 0.22338215904780825, + "grad_norm": 6.615126251607525, + "learning_rate": 4.9130327415482475e-06, + "loss": 0.875, + "step": 3092 + }, + { + "epoch": 0.22345440424801777, + "grad_norm": 7.094899357518086, + "learning_rate": 4.9129562460451055e-06, + "loss": 0.8789, + "step": 3093 + }, + { + "epoch": 0.22352664944822728, + "grad_norm": 6.510341047737924, + "learning_rate": 4.912879717510465e-06, + "loss": 0.8718, + "step": 3094 + }, + { + "epoch": 0.2235988946484368, + "grad_norm": 6.805257324465936, + "learning_rate": 4.912803155945376e-06, + "loss": 0.8419, + "step": 3095 + }, + { + "epoch": 0.2236711398486463, + "grad_norm": 5.791595604915704, + "learning_rate": 4.912726561350885e-06, + "loss": 0.8943, + "step": 3096 + }, + { + "epoch": 0.22374338504885583, + "grad_norm": 8.544121167841764, + "learning_rate": 4.912649933728041e-06, + "loss": 0.9574, + "step": 3097 + }, + { + "epoch": 0.22381563024906534, + "grad_norm": 6.52938976900715, + "learning_rate": 4.912573273077892e-06, + "loss": 0.9323, + "step": 3098 + }, + { + "epoch": 0.22388787544927483, + "grad_norm": 6.690131124400322, + "learning_rate": 4.912496579401488e-06, + "loss": 0.7989, + "step": 3099 + }, + { + "epoch": 0.22396012064948434, + "grad_norm": 6.406505449365817, + "learning_rate": 4.91241985269988e-06, + "loss": 0.8848, + "step": 3100 + }, + { + "epoch": 0.22403236584969385, + "grad_norm": 8.611021351773733, + "learning_rate": 4.912343092974117e-06, + "loss": 1.0132, + "step": 3101 + }, + { + "epoch": 0.22410461104990337, + "grad_norm": 6.8359958144834145, + "learning_rate": 4.9122663002252495e-06, + "loss": 1.0136, + "step": 3102 + }, + { + "epoch": 0.22417685625011288, + "grad_norm": 7.578926091275827, + "learning_rate": 4.91218947445433e-06, + "loss": 1.0134, + "step": 3103 + }, + { + "epoch": 0.2242491014503224, + "grad_norm": 7.125072010412558, + "learning_rate": 4.912112615662409e-06, + "loss": 0.9589, + "step": 3104 + }, + { + "epoch": 0.2243213466505319, + "grad_norm": 6.829310962331035, + "learning_rate": 4.9120357238505395e-06, + "loss": 0.9574, + "step": 3105 + }, + { + "epoch": 0.22439359185074143, + "grad_norm": 6.427863454435477, + "learning_rate": 4.911958799019774e-06, + "loss": 0.8734, + "step": 3106 + }, + { + "epoch": 0.22446583705095094, + "grad_norm": 7.47700765434507, + "learning_rate": 4.911881841171165e-06, + "loss": 0.9814, + "step": 3107 + }, + { + "epoch": 0.22453808225116043, + "grad_norm": 6.92500348142681, + "learning_rate": 4.911804850305767e-06, + "loss": 0.9486, + "step": 3108 + }, + { + "epoch": 0.22461032745136994, + "grad_norm": 6.269894180013284, + "learning_rate": 4.911727826424632e-06, + "loss": 0.9448, + "step": 3109 + }, + { + "epoch": 0.22468257265157945, + "grad_norm": 7.354670549660465, + "learning_rate": 4.911650769528817e-06, + "loss": 0.938, + "step": 3110 + }, + { + "epoch": 0.22475481785178897, + "grad_norm": 8.01496964837394, + "learning_rate": 4.911573679619374e-06, + "loss": 0.8855, + "step": 3111 + }, + { + "epoch": 0.22482706305199848, + "grad_norm": 6.752986388613183, + "learning_rate": 4.911496556697361e-06, + "loss": 0.9454, + "step": 3112 + }, + { + "epoch": 0.224899308252208, + "grad_norm": 6.401946916087812, + "learning_rate": 4.911419400763832e-06, + "loss": 0.9105, + "step": 3113 + }, + { + "epoch": 0.2249715534524175, + "grad_norm": 7.940266044637242, + "learning_rate": 4.911342211819843e-06, + "loss": 0.9633, + "step": 3114 + }, + { + "epoch": 0.22504379865262703, + "grad_norm": 6.158550007079366, + "learning_rate": 4.911264989866452e-06, + "loss": 0.96, + "step": 3115 + }, + { + "epoch": 0.22511604385283654, + "grad_norm": 7.158171949539889, + "learning_rate": 4.911187734904716e-06, + "loss": 0.91, + "step": 3116 + }, + { + "epoch": 0.22518828905304603, + "grad_norm": 6.257446735565353, + "learning_rate": 4.911110446935692e-06, + "loss": 0.9539, + "step": 3117 + }, + { + "epoch": 0.22526053425325554, + "grad_norm": 6.11605025398161, + "learning_rate": 4.911033125960439e-06, + "loss": 0.8424, + "step": 3118 + }, + { + "epoch": 0.22533277945346505, + "grad_norm": 6.970891101853926, + "learning_rate": 4.910955771980013e-06, + "loss": 0.9864, + "step": 3119 + }, + { + "epoch": 0.22540502465367457, + "grad_norm": 7.321102462045292, + "learning_rate": 4.910878384995475e-06, + "loss": 0.8312, + "step": 3120 + }, + { + "epoch": 0.22547726985388408, + "grad_norm": 6.442234113340942, + "learning_rate": 4.910800965007884e-06, + "loss": 0.9834, + "step": 3121 + }, + { + "epoch": 0.2255495150540936, + "grad_norm": 8.29479103712742, + "learning_rate": 4.9107235120182985e-06, + "loss": 0.9536, + "step": 3122 + }, + { + "epoch": 0.2256217602543031, + "grad_norm": 7.645509496443326, + "learning_rate": 4.910646026027781e-06, + "loss": 0.9177, + "step": 3123 + }, + { + "epoch": 0.22569400545451263, + "grad_norm": 9.619875249549473, + "learning_rate": 4.910568507037391e-06, + "loss": 0.9854, + "step": 3124 + }, + { + "epoch": 0.22576625065472214, + "grad_norm": 6.8279680107453435, + "learning_rate": 4.9104909550481896e-06, + "loss": 0.9662, + "step": 3125 + }, + { + "epoch": 0.22583849585493163, + "grad_norm": 6.160131789444937, + "learning_rate": 4.910413370061239e-06, + "loss": 0.9641, + "step": 3126 + }, + { + "epoch": 0.22591074105514114, + "grad_norm": 8.666627248038612, + "learning_rate": 4.9103357520776e-06, + "loss": 0.9675, + "step": 3127 + }, + { + "epoch": 0.22598298625535065, + "grad_norm": 8.751325670363864, + "learning_rate": 4.910258101098338e-06, + "loss": 0.9064, + "step": 3128 + }, + { + "epoch": 0.22605523145556017, + "grad_norm": 8.218529063689141, + "learning_rate": 4.910180417124513e-06, + "loss": 0.971, + "step": 3129 + }, + { + "epoch": 0.22612747665576968, + "grad_norm": 8.386667806668752, + "learning_rate": 4.910102700157189e-06, + "loss": 0.9337, + "step": 3130 + }, + { + "epoch": 0.2261997218559792, + "grad_norm": 6.72445167092888, + "learning_rate": 4.91002495019743e-06, + "loss": 0.8877, + "step": 3131 + }, + { + "epoch": 0.2262719670561887, + "grad_norm": 6.741975323464707, + "learning_rate": 4.909947167246303e-06, + "loss": 0.8788, + "step": 3132 + }, + { + "epoch": 0.22634421225639823, + "grad_norm": 6.737132488128585, + "learning_rate": 4.909869351304868e-06, + "loss": 0.9511, + "step": 3133 + }, + { + "epoch": 0.22641645745660774, + "grad_norm": 7.136648460672608, + "learning_rate": 4.909791502374194e-06, + "loss": 0.8852, + "step": 3134 + }, + { + "epoch": 0.22648870265681723, + "grad_norm": 8.743444984683972, + "learning_rate": 4.909713620455345e-06, + "loss": 0.9785, + "step": 3135 + }, + { + "epoch": 0.22656094785702674, + "grad_norm": 6.790110060596202, + "learning_rate": 4.909635705549387e-06, + "loss": 0.8839, + "step": 3136 + }, + { + "epoch": 0.22663319305723625, + "grad_norm": 7.738578501831193, + "learning_rate": 4.9095577576573886e-06, + "loss": 0.8585, + "step": 3137 + }, + { + "epoch": 0.22670543825744577, + "grad_norm": 7.0259379347903, + "learning_rate": 4.909479776780414e-06, + "loss": 0.9751, + "step": 3138 + }, + { + "epoch": 0.22677768345765528, + "grad_norm": 6.654353055053424, + "learning_rate": 4.909401762919533e-06, + "loss": 0.9292, + "step": 3139 + }, + { + "epoch": 0.2268499286578648, + "grad_norm": 8.430450885701813, + "learning_rate": 4.909323716075813e-06, + "loss": 0.7966, + "step": 3140 + }, + { + "epoch": 0.2269221738580743, + "grad_norm": 7.352844839923541, + "learning_rate": 4.9092456362503206e-06, + "loss": 0.9965, + "step": 3141 + }, + { + "epoch": 0.22699441905828383, + "grad_norm": 7.500362133184387, + "learning_rate": 4.909167523444127e-06, + "loss": 1.0083, + "step": 3142 + }, + { + "epoch": 0.22706666425849334, + "grad_norm": 7.735502650198027, + "learning_rate": 4.9090893776583005e-06, + "loss": 0.9853, + "step": 3143 + }, + { + "epoch": 0.22713890945870283, + "grad_norm": 6.763169052111695, + "learning_rate": 4.90901119889391e-06, + "loss": 0.9042, + "step": 3144 + }, + { + "epoch": 0.22721115465891234, + "grad_norm": 7.600208942151227, + "learning_rate": 4.908932987152028e-06, + "loss": 0.9178, + "step": 3145 + }, + { + "epoch": 0.22728339985912185, + "grad_norm": 6.751637578051088, + "learning_rate": 4.908854742433723e-06, + "loss": 0.9124, + "step": 3146 + }, + { + "epoch": 0.22735564505933137, + "grad_norm": 7.69956854069698, + "learning_rate": 4.9087764647400684e-06, + "loss": 0.959, + "step": 3147 + }, + { + "epoch": 0.22742789025954088, + "grad_norm": 7.752516061224318, + "learning_rate": 4.9086981540721325e-06, + "loss": 1.0004, + "step": 3148 + }, + { + "epoch": 0.2275001354597504, + "grad_norm": 6.925527878896105, + "learning_rate": 4.90861981043099e-06, + "loss": 0.8726, + "step": 3149 + }, + { + "epoch": 0.2275723806599599, + "grad_norm": 6.257903328187597, + "learning_rate": 4.908541433817712e-06, + "loss": 0.8477, + "step": 3150 + }, + { + "epoch": 0.22764462586016943, + "grad_norm": 7.490079804060262, + "learning_rate": 4.908463024233372e-06, + "loss": 0.9226, + "step": 3151 + }, + { + "epoch": 0.22771687106037894, + "grad_norm": 6.844569048359705, + "learning_rate": 4.908384581679044e-06, + "loss": 0.9094, + "step": 3152 + }, + { + "epoch": 0.22778911626058843, + "grad_norm": 7.30688260801182, + "learning_rate": 4.9083061061558e-06, + "loss": 0.929, + "step": 3153 + }, + { + "epoch": 0.22786136146079794, + "grad_norm": 6.954651397151996, + "learning_rate": 4.908227597664717e-06, + "loss": 0.9446, + "step": 3154 + }, + { + "epoch": 0.22793360666100745, + "grad_norm": 7.82599883605808, + "learning_rate": 4.9081490562068655e-06, + "loss": 0.9522, + "step": 3155 + }, + { + "epoch": 0.22800585186121697, + "grad_norm": 6.930310897568946, + "learning_rate": 4.908070481783325e-06, + "loss": 0.8731, + "step": 3156 + }, + { + "epoch": 0.22807809706142648, + "grad_norm": 6.533479592898436, + "learning_rate": 4.907991874395169e-06, + "loss": 0.9554, + "step": 3157 + }, + { + "epoch": 0.228150342261636, + "grad_norm": 8.713540110081551, + "learning_rate": 4.907913234043474e-06, + "loss": 0.9764, + "step": 3158 + }, + { + "epoch": 0.2282225874618455, + "grad_norm": 6.410865274906192, + "learning_rate": 4.907834560729316e-06, + "loss": 0.9414, + "step": 3159 + }, + { + "epoch": 0.22829483266205503, + "grad_norm": 7.758964276360969, + "learning_rate": 4.9077558544537725e-06, + "loss": 0.9148, + "step": 3160 + }, + { + "epoch": 0.22836707786226454, + "grad_norm": 10.049074682653078, + "learning_rate": 4.907677115217922e-06, + "loss": 1.0859, + "step": 3161 + }, + { + "epoch": 0.22843932306247403, + "grad_norm": 7.544732778985537, + "learning_rate": 4.90759834302284e-06, + "loss": 0.9676, + "step": 3162 + }, + { + "epoch": 0.22851156826268354, + "grad_norm": 10.51878057918535, + "learning_rate": 4.9075195378696064e-06, + "loss": 0.984, + "step": 3163 + }, + { + "epoch": 0.22858381346289305, + "grad_norm": 11.11230945059053, + "learning_rate": 4.907440699759299e-06, + "loss": 0.9992, + "step": 3164 + }, + { + "epoch": 0.22865605866310257, + "grad_norm": 9.046854760340054, + "learning_rate": 4.907361828692999e-06, + "loss": 0.9487, + "step": 3165 + }, + { + "epoch": 0.22872830386331208, + "grad_norm": 6.2656587899455145, + "learning_rate": 4.907282924671784e-06, + "loss": 0.8778, + "step": 3166 + }, + { + "epoch": 0.2288005490635216, + "grad_norm": 8.51278398350988, + "learning_rate": 4.907203987696735e-06, + "loss": 0.8884, + "step": 3167 + }, + { + "epoch": 0.2288727942637311, + "grad_norm": 10.950667645728593, + "learning_rate": 4.907125017768932e-06, + "loss": 0.9767, + "step": 3168 + }, + { + "epoch": 0.22894503946394062, + "grad_norm": 8.649853267417406, + "learning_rate": 4.9070460148894575e-06, + "loss": 0.8746, + "step": 3169 + }, + { + "epoch": 0.22901728466415014, + "grad_norm": 8.098089167003428, + "learning_rate": 4.906966979059391e-06, + "loss": 0.9153, + "step": 3170 + }, + { + "epoch": 0.22908952986435963, + "grad_norm": 6.798932765910477, + "learning_rate": 4.9068879102798164e-06, + "loss": 0.9472, + "step": 3171 + }, + { + "epoch": 0.22916177506456914, + "grad_norm": 7.607708946364711, + "learning_rate": 4.906808808551815e-06, + "loss": 0.9214, + "step": 3172 + }, + { + "epoch": 0.22923402026477865, + "grad_norm": 9.749698340811186, + "learning_rate": 4.906729673876469e-06, + "loss": 0.9306, + "step": 3173 + }, + { + "epoch": 0.22930626546498817, + "grad_norm": 9.073806339503637, + "learning_rate": 4.906650506254863e-06, + "loss": 0.9931, + "step": 3174 + }, + { + "epoch": 0.22937851066519768, + "grad_norm": 7.43925545115104, + "learning_rate": 4.906571305688081e-06, + "loss": 0.9037, + "step": 3175 + }, + { + "epoch": 0.2294507558654072, + "grad_norm": 7.250732384882751, + "learning_rate": 4.906492072177205e-06, + "loss": 0.9495, + "step": 3176 + }, + { + "epoch": 0.2295230010656167, + "grad_norm": 7.055834570279712, + "learning_rate": 4.906412805723321e-06, + "loss": 0.9555, + "step": 3177 + }, + { + "epoch": 0.22959524626582622, + "grad_norm": 7.276937230745163, + "learning_rate": 4.906333506327516e-06, + "loss": 0.9014, + "step": 3178 + }, + { + "epoch": 0.2296674914660357, + "grad_norm": 9.200602420488691, + "learning_rate": 4.9062541739908715e-06, + "loss": 0.8693, + "step": 3179 + }, + { + "epoch": 0.22973973666624523, + "grad_norm": 6.8538178545500905, + "learning_rate": 4.906174808714476e-06, + "loss": 0.8847, + "step": 3180 + }, + { + "epoch": 0.22981198186645474, + "grad_norm": 7.12151237927136, + "learning_rate": 4.906095410499417e-06, + "loss": 0.8977, + "step": 3181 + }, + { + "epoch": 0.22988422706666425, + "grad_norm": 6.71699224145216, + "learning_rate": 4.9060159793467784e-06, + "loss": 0.9904, + "step": 3182 + }, + { + "epoch": 0.22995647226687377, + "grad_norm": 7.822738164565999, + "learning_rate": 4.905936515257651e-06, + "loss": 0.9392, + "step": 3183 + }, + { + "epoch": 0.23002871746708328, + "grad_norm": 6.782794547963458, + "learning_rate": 4.90585701823312e-06, + "loss": 0.9693, + "step": 3184 + }, + { + "epoch": 0.2301009626672928, + "grad_norm": 7.488043855457178, + "learning_rate": 4.905777488274274e-06, + "loss": 1.0002, + "step": 3185 + }, + { + "epoch": 0.2301732078675023, + "grad_norm": 8.80911904267989, + "learning_rate": 4.905697925382203e-06, + "loss": 0.9949, + "step": 3186 + }, + { + "epoch": 0.23024545306771182, + "grad_norm": 6.668534811850744, + "learning_rate": 4.905618329557994e-06, + "loss": 0.9066, + "step": 3187 + }, + { + "epoch": 0.2303176982679213, + "grad_norm": 7.582430719448373, + "learning_rate": 4.905538700802739e-06, + "loss": 1.0236, + "step": 3188 + }, + { + "epoch": 0.23038994346813083, + "grad_norm": 6.997542631149585, + "learning_rate": 4.905459039117527e-06, + "loss": 1.0503, + "step": 3189 + }, + { + "epoch": 0.23046218866834034, + "grad_norm": 8.234753478760432, + "learning_rate": 4.905379344503448e-06, + "loss": 0.9151, + "step": 3190 + }, + { + "epoch": 0.23053443386854985, + "grad_norm": 6.841933754072542, + "learning_rate": 4.905299616961594e-06, + "loss": 0.9958, + "step": 3191 + }, + { + "epoch": 0.23060667906875937, + "grad_norm": 6.997388625108711, + "learning_rate": 4.905219856493055e-06, + "loss": 0.896, + "step": 3192 + }, + { + "epoch": 0.23067892426896888, + "grad_norm": 7.055883498425932, + "learning_rate": 4.905140063098924e-06, + "loss": 0.9188, + "step": 3193 + }, + { + "epoch": 0.2307511694691784, + "grad_norm": 5.825375969668248, + "learning_rate": 4.9050602367802935e-06, + "loss": 0.8506, + "step": 3194 + }, + { + "epoch": 0.2308234146693879, + "grad_norm": 6.348937971558646, + "learning_rate": 4.9049803775382555e-06, + "loss": 0.9113, + "step": 3195 + }, + { + "epoch": 0.23089565986959742, + "grad_norm": 8.70202607865175, + "learning_rate": 4.904900485373903e-06, + "loss": 0.8969, + "step": 3196 + }, + { + "epoch": 0.2309679050698069, + "grad_norm": 5.530385267692156, + "learning_rate": 4.90482056028833e-06, + "loss": 0.8055, + "step": 3197 + }, + { + "epoch": 0.23104015027001643, + "grad_norm": 6.677645258435773, + "learning_rate": 4.9047406022826315e-06, + "loss": 0.9076, + "step": 3198 + }, + { + "epoch": 0.23111239547022594, + "grad_norm": 5.872841945938325, + "learning_rate": 4.904660611357901e-06, + "loss": 0.7943, + "step": 3199 + }, + { + "epoch": 0.23118464067043545, + "grad_norm": 6.600787560839982, + "learning_rate": 4.9045805875152345e-06, + "loss": 0.9632, + "step": 3200 + }, + { + "epoch": 0.23125688587064497, + "grad_norm": 7.872829804282401, + "learning_rate": 4.9045005307557256e-06, + "loss": 0.9201, + "step": 3201 + }, + { + "epoch": 0.23132913107085448, + "grad_norm": 7.979501928020765, + "learning_rate": 4.904420441080472e-06, + "loss": 0.8606, + "step": 3202 + }, + { + "epoch": 0.231401376271064, + "grad_norm": 7.989371391844067, + "learning_rate": 4.90434031849057e-06, + "loss": 1.0711, + "step": 3203 + }, + { + "epoch": 0.2314736214712735, + "grad_norm": 6.298923509452205, + "learning_rate": 4.904260162987115e-06, + "loss": 0.9381, + "step": 3204 + }, + { + "epoch": 0.23154586667148302, + "grad_norm": 7.386017926001984, + "learning_rate": 4.904179974571206e-06, + "loss": 0.9326, + "step": 3205 + }, + { + "epoch": 0.2316181118716925, + "grad_norm": 9.097464625370263, + "learning_rate": 4.904099753243939e-06, + "loss": 0.952, + "step": 3206 + }, + { + "epoch": 0.23169035707190203, + "grad_norm": 7.334430641382598, + "learning_rate": 4.904019499006414e-06, + "loss": 0.9913, + "step": 3207 + }, + { + "epoch": 0.23176260227211154, + "grad_norm": 6.696392101370183, + "learning_rate": 4.903939211859727e-06, + "loss": 0.8837, + "step": 3208 + }, + { + "epoch": 0.23183484747232105, + "grad_norm": 6.176322937383979, + "learning_rate": 4.90385889180498e-06, + "loss": 0.8789, + "step": 3209 + }, + { + "epoch": 0.23190709267253057, + "grad_norm": 7.097853494610679, + "learning_rate": 4.9037785388432715e-06, + "loss": 0.9989, + "step": 3210 + }, + { + "epoch": 0.23197933787274008, + "grad_norm": 7.855875923761845, + "learning_rate": 4.903698152975701e-06, + "loss": 0.8965, + "step": 3211 + }, + { + "epoch": 0.2320515830729496, + "grad_norm": 8.309256243422011, + "learning_rate": 4.9036177342033685e-06, + "loss": 0.9804, + "step": 3212 + }, + { + "epoch": 0.2321238282731591, + "grad_norm": 7.6888136322856395, + "learning_rate": 4.903537282527376e-06, + "loss": 0.9961, + "step": 3213 + }, + { + "epoch": 0.23219607347336862, + "grad_norm": 6.623830152186103, + "learning_rate": 4.903456797948825e-06, + "loss": 0.9167, + "step": 3214 + }, + { + "epoch": 0.2322683186735781, + "grad_norm": 6.757506823379284, + "learning_rate": 4.903376280468816e-06, + "loss": 0.9515, + "step": 3215 + }, + { + "epoch": 0.23234056387378763, + "grad_norm": 5.079277776306572, + "learning_rate": 4.903295730088451e-06, + "loss": 0.8399, + "step": 3216 + }, + { + "epoch": 0.23241280907399714, + "grad_norm": 7.258239799215107, + "learning_rate": 4.903215146808834e-06, + "loss": 0.8552, + "step": 3217 + }, + { + "epoch": 0.23248505427420665, + "grad_norm": 7.4234491546096155, + "learning_rate": 4.903134530631068e-06, + "loss": 0.9798, + "step": 3218 + }, + { + "epoch": 0.23255729947441617, + "grad_norm": 6.921046360265713, + "learning_rate": 4.9030538815562554e-06, + "loss": 0.9304, + "step": 3219 + }, + { + "epoch": 0.23262954467462568, + "grad_norm": 6.731378370954141, + "learning_rate": 4.902973199585502e-06, + "loss": 0.9751, + "step": 3220 + }, + { + "epoch": 0.2327017898748352, + "grad_norm": 6.393426630786651, + "learning_rate": 4.9028924847199115e-06, + "loss": 0.8985, + "step": 3221 + }, + { + "epoch": 0.2327740350750447, + "grad_norm": 9.387634715245103, + "learning_rate": 4.902811736960588e-06, + "loss": 0.9589, + "step": 3222 + }, + { + "epoch": 0.23284628027525422, + "grad_norm": 6.484646784590386, + "learning_rate": 4.9027309563086365e-06, + "loss": 0.9341, + "step": 3223 + }, + { + "epoch": 0.2329185254754637, + "grad_norm": 7.094398233649379, + "learning_rate": 4.902650142765165e-06, + "loss": 0.8992, + "step": 3224 + }, + { + "epoch": 0.23299077067567323, + "grad_norm": 5.836141764016469, + "learning_rate": 4.902569296331279e-06, + "loss": 0.9574, + "step": 3225 + }, + { + "epoch": 0.23306301587588274, + "grad_norm": 7.825118959150303, + "learning_rate": 4.902488417008084e-06, + "loss": 0.9011, + "step": 3226 + }, + { + "epoch": 0.23313526107609225, + "grad_norm": 8.480086842937961, + "learning_rate": 4.902407504796688e-06, + "loss": 0.9699, + "step": 3227 + }, + { + "epoch": 0.23320750627630177, + "grad_norm": 8.641231577657836, + "learning_rate": 4.902326559698198e-06, + "loss": 0.9182, + "step": 3228 + }, + { + "epoch": 0.23327975147651128, + "grad_norm": 5.952964480180856, + "learning_rate": 4.902245581713725e-06, + "loss": 0.8571, + "step": 3229 + }, + { + "epoch": 0.2333519966767208, + "grad_norm": 7.564519187034022, + "learning_rate": 4.9021645708443735e-06, + "loss": 0.9573, + "step": 3230 + }, + { + "epoch": 0.2334242418769303, + "grad_norm": 9.475743295305923, + "learning_rate": 4.9020835270912535e-06, + "loss": 0.934, + "step": 3231 + }, + { + "epoch": 0.23349648707713982, + "grad_norm": 8.05150949293026, + "learning_rate": 4.902002450455477e-06, + "loss": 0.9404, + "step": 3232 + }, + { + "epoch": 0.2335687322773493, + "grad_norm": 6.163090192841385, + "learning_rate": 4.90192134093815e-06, + "loss": 0.928, + "step": 3233 + }, + { + "epoch": 0.23364097747755883, + "grad_norm": 6.983439884268175, + "learning_rate": 4.901840198540386e-06, + "loss": 0.8767, + "step": 3234 + }, + { + "epoch": 0.23371322267776834, + "grad_norm": 7.813725978025334, + "learning_rate": 4.901759023263294e-06, + "loss": 0.8652, + "step": 3235 + }, + { + "epoch": 0.23378546787797785, + "grad_norm": 7.869720384606099, + "learning_rate": 4.901677815107986e-06, + "loss": 0.9858, + "step": 3236 + }, + { + "epoch": 0.23385771307818737, + "grad_norm": 7.669159884640666, + "learning_rate": 4.901596574075574e-06, + "loss": 1.0478, + "step": 3237 + }, + { + "epoch": 0.23392995827839688, + "grad_norm": 6.2890679590426615, + "learning_rate": 4.901515300167169e-06, + "loss": 0.9115, + "step": 3238 + }, + { + "epoch": 0.2340022034786064, + "grad_norm": 8.762859812707719, + "learning_rate": 4.901433993383885e-06, + "loss": 1.0607, + "step": 3239 + }, + { + "epoch": 0.2340744486788159, + "grad_norm": 7.870512637512635, + "learning_rate": 4.901352653726833e-06, + "loss": 0.8388, + "step": 3240 + }, + { + "epoch": 0.23414669387902542, + "grad_norm": 7.21586808976012, + "learning_rate": 4.901271281197129e-06, + "loss": 0.9347, + "step": 3241 + }, + { + "epoch": 0.2342189390792349, + "grad_norm": 7.139299198667591, + "learning_rate": 4.901189875795885e-06, + "loss": 0.9397, + "step": 3242 + }, + { + "epoch": 0.23429118427944443, + "grad_norm": 7.153536219544565, + "learning_rate": 4.9011084375242155e-06, + "loss": 1.0497, + "step": 3243 + }, + { + "epoch": 0.23436342947965394, + "grad_norm": 6.846638387877329, + "learning_rate": 4.901026966383237e-06, + "loss": 0.9874, + "step": 3244 + }, + { + "epoch": 0.23443567467986345, + "grad_norm": 8.834560189129483, + "learning_rate": 4.900945462374062e-06, + "loss": 0.8827, + "step": 3245 + }, + { + "epoch": 0.23450791988007297, + "grad_norm": 6.205979927387041, + "learning_rate": 4.90086392549781e-06, + "loss": 0.8783, + "step": 3246 + }, + { + "epoch": 0.23458016508028248, + "grad_norm": 8.04646160952712, + "learning_rate": 4.900782355755593e-06, + "loss": 0.8824, + "step": 3247 + }, + { + "epoch": 0.234652410280492, + "grad_norm": 6.37094196487265, + "learning_rate": 4.900700753148531e-06, + "loss": 0.9077, + "step": 3248 + }, + { + "epoch": 0.2347246554807015, + "grad_norm": 7.420398701857687, + "learning_rate": 4.900619117677739e-06, + "loss": 0.8322, + "step": 3249 + }, + { + "epoch": 0.23479690068091102, + "grad_norm": 9.756169641275, + "learning_rate": 4.9005374493443355e-06, + "loss": 1.0108, + "step": 3250 + }, + { + "epoch": 0.2348691458811205, + "grad_norm": 9.83324751304634, + "learning_rate": 4.900455748149438e-06, + "loss": 0.9865, + "step": 3251 + }, + { + "epoch": 0.23494139108133003, + "grad_norm": 7.607598882728903, + "learning_rate": 4.900374014094165e-06, + "loss": 0.9457, + "step": 3252 + }, + { + "epoch": 0.23501363628153954, + "grad_norm": 6.347799277234788, + "learning_rate": 4.900292247179636e-06, + "loss": 0.8952, + "step": 3253 + }, + { + "epoch": 0.23508588148174905, + "grad_norm": 7.915119247453952, + "learning_rate": 4.90021044740697e-06, + "loss": 0.9364, + "step": 3254 + }, + { + "epoch": 0.23515812668195857, + "grad_norm": 7.690271885781809, + "learning_rate": 4.900128614777286e-06, + "loss": 0.9131, + "step": 3255 + }, + { + "epoch": 0.23523037188216808, + "grad_norm": 6.9171195130920236, + "learning_rate": 4.900046749291705e-06, + "loss": 0.8665, + "step": 3256 + }, + { + "epoch": 0.2353026170823776, + "grad_norm": 8.251839634944432, + "learning_rate": 4.8999648509513475e-06, + "loss": 0.9955, + "step": 3257 + }, + { + "epoch": 0.2353748622825871, + "grad_norm": 6.365195420255148, + "learning_rate": 4.899882919757335e-06, + "loss": 0.8959, + "step": 3258 + }, + { + "epoch": 0.23544710748279662, + "grad_norm": 7.338818261750587, + "learning_rate": 4.899800955710789e-06, + "loss": 0.9442, + "step": 3259 + }, + { + "epoch": 0.2355193526830061, + "grad_norm": 6.741060058815137, + "learning_rate": 4.89971895881283e-06, + "loss": 0.9172, + "step": 3260 + }, + { + "epoch": 0.23559159788321563, + "grad_norm": 5.708405023203552, + "learning_rate": 4.899636929064583e-06, + "loss": 0.9105, + "step": 3261 + }, + { + "epoch": 0.23566384308342514, + "grad_norm": 6.239861466752894, + "learning_rate": 4.899554866467169e-06, + "loss": 0.8401, + "step": 3262 + }, + { + "epoch": 0.23573608828363465, + "grad_norm": 8.867245147114375, + "learning_rate": 4.899472771021712e-06, + "loss": 1.0305, + "step": 3263 + }, + { + "epoch": 0.23580833348384417, + "grad_norm": 8.429320941694291, + "learning_rate": 4.899390642729336e-06, + "loss": 0.9367, + "step": 3264 + }, + { + "epoch": 0.23588057868405368, + "grad_norm": 7.391390674116212, + "learning_rate": 4.899308481591164e-06, + "loss": 0.9384, + "step": 3265 + }, + { + "epoch": 0.2359528238842632, + "grad_norm": 7.080202854504771, + "learning_rate": 4.899226287608323e-06, + "loss": 0.8663, + "step": 3266 + }, + { + "epoch": 0.2360250690844727, + "grad_norm": 7.719489305434573, + "learning_rate": 4.899144060781937e-06, + "loss": 0.9159, + "step": 3267 + }, + { + "epoch": 0.23609731428468222, + "grad_norm": 8.169803976931416, + "learning_rate": 4.899061801113132e-06, + "loss": 1.026, + "step": 3268 + }, + { + "epoch": 0.2361695594848917, + "grad_norm": 7.967057351251763, + "learning_rate": 4.898979508603033e-06, + "loss": 1.0581, + "step": 3269 + }, + { + "epoch": 0.23624180468510123, + "grad_norm": 8.811073133598711, + "learning_rate": 4.898897183252767e-06, + "loss": 0.9661, + "step": 3270 + }, + { + "epoch": 0.23631404988531074, + "grad_norm": 7.005539745635988, + "learning_rate": 4.898814825063462e-06, + "loss": 0.8429, + "step": 3271 + }, + { + "epoch": 0.23638629508552025, + "grad_norm": 6.869349706709246, + "learning_rate": 4.8987324340362445e-06, + "loss": 0.9664, + "step": 3272 + }, + { + "epoch": 0.23645854028572977, + "grad_norm": 7.728217739315419, + "learning_rate": 4.8986500101722415e-06, + "loss": 0.8704, + "step": 3273 + }, + { + "epoch": 0.23653078548593928, + "grad_norm": 10.09122664618179, + "learning_rate": 4.898567553472583e-06, + "loss": 0.9946, + "step": 3274 + }, + { + "epoch": 0.2366030306861488, + "grad_norm": 7.112658335661394, + "learning_rate": 4.898485063938397e-06, + "loss": 0.9507, + "step": 3275 + }, + { + "epoch": 0.2366752758863583, + "grad_norm": 6.349056936806947, + "learning_rate": 4.898402541570812e-06, + "loss": 0.8483, + "step": 3276 + }, + { + "epoch": 0.23674752108656782, + "grad_norm": 7.948321797522585, + "learning_rate": 4.898319986370959e-06, + "loss": 0.9712, + "step": 3277 + }, + { + "epoch": 0.2368197662867773, + "grad_norm": 6.328759733791954, + "learning_rate": 4.898237398339969e-06, + "loss": 0.9402, + "step": 3278 + }, + { + "epoch": 0.23689201148698683, + "grad_norm": 7.016864080966776, + "learning_rate": 4.89815477747897e-06, + "loss": 0.8967, + "step": 3279 + }, + { + "epoch": 0.23696425668719634, + "grad_norm": 5.9942290845541795, + "learning_rate": 4.898072123789094e-06, + "loss": 0.9136, + "step": 3280 + }, + { + "epoch": 0.23703650188740585, + "grad_norm": 7.614567357858441, + "learning_rate": 4.8979894372714724e-06, + "loss": 1.0067, + "step": 3281 + }, + { + "epoch": 0.23710874708761537, + "grad_norm": 7.28720046093569, + "learning_rate": 4.8979067179272375e-06, + "loss": 0.9258, + "step": 3282 + }, + { + "epoch": 0.23718099228782488, + "grad_norm": 8.097514922789873, + "learning_rate": 4.897823965757521e-06, + "loss": 0.915, + "step": 3283 + }, + { + "epoch": 0.2372532374880344, + "grad_norm": 6.5560794484489735, + "learning_rate": 4.8977411807634575e-06, + "loss": 0.8833, + "step": 3284 + }, + { + "epoch": 0.2373254826882439, + "grad_norm": 7.829091513047842, + "learning_rate": 4.897658362946178e-06, + "loss": 0.9383, + "step": 3285 + }, + { + "epoch": 0.2373977278884534, + "grad_norm": 7.233626193023152, + "learning_rate": 4.897575512306818e-06, + "loss": 1.0069, + "step": 3286 + }, + { + "epoch": 0.2374699730886629, + "grad_norm": 7.356968447751488, + "learning_rate": 4.8974926288465095e-06, + "loss": 0.9071, + "step": 3287 + }, + { + "epoch": 0.23754221828887243, + "grad_norm": 6.643176784530677, + "learning_rate": 4.8974097125663885e-06, + "loss": 0.9254, + "step": 3288 + }, + { + "epoch": 0.23761446348908194, + "grad_norm": 7.0222859525940695, + "learning_rate": 4.897326763467591e-06, + "loss": 0.8641, + "step": 3289 + }, + { + "epoch": 0.23768670868929145, + "grad_norm": 7.193115429104139, + "learning_rate": 4.897243781551252e-06, + "loss": 0.8162, + "step": 3290 + }, + { + "epoch": 0.23775895388950097, + "grad_norm": 7.522389878709072, + "learning_rate": 4.897160766818506e-06, + "loss": 0.945, + "step": 3291 + }, + { + "epoch": 0.23783119908971048, + "grad_norm": 6.808231241245948, + "learning_rate": 4.89707771927049e-06, + "loss": 0.9415, + "step": 3292 + }, + { + "epoch": 0.23790344428992, + "grad_norm": 6.100673713376259, + "learning_rate": 4.896994638908342e-06, + "loss": 0.8862, + "step": 3293 + }, + { + "epoch": 0.2379756894901295, + "grad_norm": 7.768941804170745, + "learning_rate": 4.896911525733198e-06, + "loss": 1.0145, + "step": 3294 + }, + { + "epoch": 0.238047934690339, + "grad_norm": 6.174053338881002, + "learning_rate": 4.896828379746197e-06, + "loss": 0.8778, + "step": 3295 + }, + { + "epoch": 0.2381201798905485, + "grad_norm": 6.302139391268843, + "learning_rate": 4.896745200948476e-06, + "loss": 0.8998, + "step": 3296 + }, + { + "epoch": 0.23819242509075803, + "grad_norm": 6.835261685343796, + "learning_rate": 4.896661989341174e-06, + "loss": 0.8913, + "step": 3297 + }, + { + "epoch": 0.23826467029096754, + "grad_norm": 7.232306100123176, + "learning_rate": 4.896578744925431e-06, + "loss": 0.921, + "step": 3298 + }, + { + "epoch": 0.23833691549117705, + "grad_norm": 6.586306451891125, + "learning_rate": 4.896495467702385e-06, + "loss": 0.9908, + "step": 3299 + }, + { + "epoch": 0.23840916069138657, + "grad_norm": 6.635293700926779, + "learning_rate": 4.896412157673177e-06, + "loss": 0.8363, + "step": 3300 + }, + { + "epoch": 0.23848140589159608, + "grad_norm": 7.289136552996687, + "learning_rate": 4.896328814838948e-06, + "loss": 1.0144, + "step": 3301 + }, + { + "epoch": 0.2385536510918056, + "grad_norm": 7.419511599442889, + "learning_rate": 4.8962454392008374e-06, + "loss": 0.9995, + "step": 3302 + }, + { + "epoch": 0.2386258962920151, + "grad_norm": 6.918320546179362, + "learning_rate": 4.896162030759987e-06, + "loss": 0.927, + "step": 3303 + }, + { + "epoch": 0.2386981414922246, + "grad_norm": 6.574693668322151, + "learning_rate": 4.896078589517539e-06, + "loss": 0.9777, + "step": 3304 + }, + { + "epoch": 0.2387703866924341, + "grad_norm": 5.705613688059598, + "learning_rate": 4.895995115474636e-06, + "loss": 0.916, + "step": 3305 + }, + { + "epoch": 0.23884263189264363, + "grad_norm": 6.561304473970591, + "learning_rate": 4.895911608632421e-06, + "loss": 0.8587, + "step": 3306 + }, + { + "epoch": 0.23891487709285314, + "grad_norm": 5.805844341844562, + "learning_rate": 4.895828068992035e-06, + "loss": 0.8484, + "step": 3307 + }, + { + "epoch": 0.23898712229306265, + "grad_norm": 6.810688862962831, + "learning_rate": 4.895744496554623e-06, + "loss": 0.9431, + "step": 3308 + }, + { + "epoch": 0.23905936749327217, + "grad_norm": 7.177634725073577, + "learning_rate": 4.89566089132133e-06, + "loss": 0.832, + "step": 3309 + }, + { + "epoch": 0.23913161269348168, + "grad_norm": 6.548767253389182, + "learning_rate": 4.8955772532932984e-06, + "loss": 0.8954, + "step": 3310 + }, + { + "epoch": 0.2392038578936912, + "grad_norm": 7.835633474520199, + "learning_rate": 4.895493582471675e-06, + "loss": 0.9235, + "step": 3311 + }, + { + "epoch": 0.2392761030939007, + "grad_norm": 6.774972849436056, + "learning_rate": 4.895409878857604e-06, + "loss": 0.9304, + "step": 3312 + }, + { + "epoch": 0.2393483482941102, + "grad_norm": 6.253596987395472, + "learning_rate": 4.895326142452232e-06, + "loss": 1.0255, + "step": 3313 + }, + { + "epoch": 0.2394205934943197, + "grad_norm": 6.216256566706766, + "learning_rate": 4.895242373256706e-06, + "loss": 0.9647, + "step": 3314 + }, + { + "epoch": 0.23949283869452923, + "grad_norm": 7.367086788067582, + "learning_rate": 4.89515857127217e-06, + "loss": 0.8936, + "step": 3315 + }, + { + "epoch": 0.23956508389473874, + "grad_norm": 8.502089299750827, + "learning_rate": 4.895074736499774e-06, + "loss": 0.9425, + "step": 3316 + }, + { + "epoch": 0.23963732909494825, + "grad_norm": 7.617588130489942, + "learning_rate": 4.894990868940663e-06, + "loss": 0.9586, + "step": 3317 + }, + { + "epoch": 0.23970957429515777, + "grad_norm": 9.699623446676897, + "learning_rate": 4.894906968595988e-06, + "loss": 0.9702, + "step": 3318 + }, + { + "epoch": 0.23978181949536728, + "grad_norm": 6.65363156391164, + "learning_rate": 4.8948230354668955e-06, + "loss": 0.9367, + "step": 3319 + }, + { + "epoch": 0.2398540646955768, + "grad_norm": 7.107785493669842, + "learning_rate": 4.8947390695545364e-06, + "loss": 0.9421, + "step": 3320 + }, + { + "epoch": 0.2399263098957863, + "grad_norm": 7.949397264580327, + "learning_rate": 4.894655070860057e-06, + "loss": 0.9426, + "step": 3321 + }, + { + "epoch": 0.2399985550959958, + "grad_norm": 7.040017591367764, + "learning_rate": 4.89457103938461e-06, + "loss": 0.9475, + "step": 3322 + }, + { + "epoch": 0.2400708002962053, + "grad_norm": 10.197883416271976, + "learning_rate": 4.894486975129345e-06, + "loss": 0.9769, + "step": 3323 + }, + { + "epoch": 0.24014304549641483, + "grad_norm": 7.1721957265908, + "learning_rate": 4.894402878095411e-06, + "loss": 0.857, + "step": 3324 + }, + { + "epoch": 0.24021529069662434, + "grad_norm": 7.725018014238705, + "learning_rate": 4.894318748283962e-06, + "loss": 0.9446, + "step": 3325 + }, + { + "epoch": 0.24028753589683385, + "grad_norm": 7.378812418755585, + "learning_rate": 4.8942345856961485e-06, + "loss": 0.8946, + "step": 3326 + }, + { + "epoch": 0.24035978109704337, + "grad_norm": 9.175006286452632, + "learning_rate": 4.894150390333122e-06, + "loss": 0.9276, + "step": 3327 + }, + { + "epoch": 0.24043202629725288, + "grad_norm": 7.959537940871803, + "learning_rate": 4.894066162196036e-06, + "loss": 0.9183, + "step": 3328 + }, + { + "epoch": 0.2405042714974624, + "grad_norm": 6.380975353820484, + "learning_rate": 4.8939819012860426e-06, + "loss": 0.898, + "step": 3329 + }, + { + "epoch": 0.2405765166976719, + "grad_norm": 7.363498059225173, + "learning_rate": 4.893897607604296e-06, + "loss": 0.9293, + "step": 3330 + }, + { + "epoch": 0.2406487618978814, + "grad_norm": 7.182519663363241, + "learning_rate": 4.89381328115195e-06, + "loss": 0.9761, + "step": 3331 + }, + { + "epoch": 0.2407210070980909, + "grad_norm": 8.87652961551623, + "learning_rate": 4.893728921930159e-06, + "loss": 0.9433, + "step": 3332 + }, + { + "epoch": 0.24079325229830043, + "grad_norm": 6.49613867289413, + "learning_rate": 4.893644529940077e-06, + "loss": 0.9059, + "step": 3333 + }, + { + "epoch": 0.24086549749850994, + "grad_norm": 8.446415013170204, + "learning_rate": 4.8935601051828605e-06, + "loss": 0.9886, + "step": 3334 + }, + { + "epoch": 0.24093774269871945, + "grad_norm": 7.883139143226184, + "learning_rate": 4.893475647659664e-06, + "loss": 0.8917, + "step": 3335 + }, + { + "epoch": 0.24100998789892897, + "grad_norm": 7.094960651355799, + "learning_rate": 4.8933911573716455e-06, + "loss": 0.9332, + "step": 3336 + }, + { + "epoch": 0.24108223309913848, + "grad_norm": 8.224022396161054, + "learning_rate": 4.8933066343199594e-06, + "loss": 0.9312, + "step": 3337 + }, + { + "epoch": 0.241154478299348, + "grad_norm": 5.907091191853818, + "learning_rate": 4.893222078505764e-06, + "loss": 0.8527, + "step": 3338 + }, + { + "epoch": 0.2412267234995575, + "grad_norm": 7.198716674451996, + "learning_rate": 4.893137489930217e-06, + "loss": 0.9313, + "step": 3339 + }, + { + "epoch": 0.241298968699767, + "grad_norm": 6.912331505596481, + "learning_rate": 4.893052868594475e-06, + "loss": 0.9198, + "step": 3340 + }, + { + "epoch": 0.2413712138999765, + "grad_norm": 6.805556372125624, + "learning_rate": 4.892968214499699e-06, + "loss": 0.843, + "step": 3341 + }, + { + "epoch": 0.24144345910018603, + "grad_norm": 8.758002409292422, + "learning_rate": 4.8928835276470445e-06, + "loss": 0.9058, + "step": 3342 + }, + { + "epoch": 0.24151570430039554, + "grad_norm": 7.756619733409696, + "learning_rate": 4.892798808037673e-06, + "loss": 0.9451, + "step": 3343 + }, + { + "epoch": 0.24158794950060505, + "grad_norm": 7.683297349868994, + "learning_rate": 4.892714055672744e-06, + "loss": 0.9388, + "step": 3344 + }, + { + "epoch": 0.24166019470081457, + "grad_norm": 6.738729038563265, + "learning_rate": 4.8926292705534175e-06, + "loss": 0.9185, + "step": 3345 + }, + { + "epoch": 0.24173243990102408, + "grad_norm": 7.419513398946621, + "learning_rate": 4.892544452680853e-06, + "loss": 0.9173, + "step": 3346 + }, + { + "epoch": 0.2418046851012336, + "grad_norm": 8.64741015168952, + "learning_rate": 4.892459602056213e-06, + "loss": 0.8537, + "step": 3347 + }, + { + "epoch": 0.2418769303014431, + "grad_norm": 8.531579091638019, + "learning_rate": 4.8923747186806595e-06, + "loss": 0.951, + "step": 3348 + }, + { + "epoch": 0.2419491755016526, + "grad_norm": 6.6483921717388395, + "learning_rate": 4.8922898025553536e-06, + "loss": 0.8829, + "step": 3349 + }, + { + "epoch": 0.2420214207018621, + "grad_norm": 6.66245635914779, + "learning_rate": 4.892204853681457e-06, + "loss": 0.9797, + "step": 3350 + }, + { + "epoch": 0.24209366590207163, + "grad_norm": 7.51575316004618, + "learning_rate": 4.892119872060134e-06, + "loss": 0.9532, + "step": 3351 + }, + { + "epoch": 0.24216591110228114, + "grad_norm": 9.640878633844506, + "learning_rate": 4.892034857692547e-06, + "loss": 0.9324, + "step": 3352 + }, + { + "epoch": 0.24223815630249065, + "grad_norm": 7.019941263849748, + "learning_rate": 4.89194981057986e-06, + "loss": 0.907, + "step": 3353 + }, + { + "epoch": 0.24231040150270017, + "grad_norm": 9.510981186111062, + "learning_rate": 4.891864730723237e-06, + "loss": 0.9837, + "step": 3354 + }, + { + "epoch": 0.24238264670290968, + "grad_norm": 6.298200974107917, + "learning_rate": 4.891779618123844e-06, + "loss": 0.8663, + "step": 3355 + }, + { + "epoch": 0.2424548919031192, + "grad_norm": 6.24054429981544, + "learning_rate": 4.891694472782844e-06, + "loss": 0.9306, + "step": 3356 + }, + { + "epoch": 0.2425271371033287, + "grad_norm": 7.544743649606741, + "learning_rate": 4.891609294701404e-06, + "loss": 0.9236, + "step": 3357 + }, + { + "epoch": 0.2425993823035382, + "grad_norm": 7.670251118951922, + "learning_rate": 4.8915240838806905e-06, + "loss": 0.922, + "step": 3358 + }, + { + "epoch": 0.2426716275037477, + "grad_norm": 9.59797143166343, + "learning_rate": 4.891438840321868e-06, + "loss": 1.01, + "step": 3359 + }, + { + "epoch": 0.24274387270395723, + "grad_norm": 7.052121799501197, + "learning_rate": 4.8913535640261055e-06, + "loss": 0.8369, + "step": 3360 + }, + { + "epoch": 0.24281611790416674, + "grad_norm": 7.996089456367924, + "learning_rate": 4.89126825499457e-06, + "loss": 0.9486, + "step": 3361 + }, + { + "epoch": 0.24288836310437625, + "grad_norm": 8.26217215806662, + "learning_rate": 4.891182913228428e-06, + "loss": 0.9329, + "step": 3362 + }, + { + "epoch": 0.24296060830458577, + "grad_norm": 8.477720345029974, + "learning_rate": 4.891097538728849e-06, + "loss": 0.9376, + "step": 3363 + }, + { + "epoch": 0.24303285350479528, + "grad_norm": 7.120631702560234, + "learning_rate": 4.891012131497e-06, + "loss": 0.9309, + "step": 3364 + }, + { + "epoch": 0.2431050987050048, + "grad_norm": 7.71056225142851, + "learning_rate": 4.890926691534052e-06, + "loss": 0.9334, + "step": 3365 + }, + { + "epoch": 0.2431773439052143, + "grad_norm": 9.59449591206545, + "learning_rate": 4.890841218841175e-06, + "loss": 0.9725, + "step": 3366 + }, + { + "epoch": 0.2432495891054238, + "grad_norm": 9.841338137689002, + "learning_rate": 4.8907557134195375e-06, + "loss": 0.9415, + "step": 3367 + }, + { + "epoch": 0.2433218343056333, + "grad_norm": 9.592345889844161, + "learning_rate": 4.89067017527031e-06, + "loss": 0.9707, + "step": 3368 + }, + { + "epoch": 0.24339407950584283, + "grad_norm": 6.529947105298636, + "learning_rate": 4.890584604394665e-06, + "loss": 0.9298, + "step": 3369 + }, + { + "epoch": 0.24346632470605234, + "grad_norm": 9.732861837524451, + "learning_rate": 4.890499000793772e-06, + "loss": 1.0914, + "step": 3370 + }, + { + "epoch": 0.24353856990626185, + "grad_norm": 8.921020680692964, + "learning_rate": 4.890413364468804e-06, + "loss": 0.9249, + "step": 3371 + }, + { + "epoch": 0.24361081510647137, + "grad_norm": 8.930692200697198, + "learning_rate": 4.890327695420934e-06, + "loss": 1.0604, + "step": 3372 + }, + { + "epoch": 0.24368306030668088, + "grad_norm": 7.230236080636529, + "learning_rate": 4.890241993651332e-06, + "loss": 0.9436, + "step": 3373 + }, + { + "epoch": 0.2437553055068904, + "grad_norm": 7.7047573008302646, + "learning_rate": 4.890156259161175e-06, + "loss": 0.8822, + "step": 3374 + }, + { + "epoch": 0.2438275507070999, + "grad_norm": 6.819634018192607, + "learning_rate": 4.890070491951634e-06, + "loss": 0.9545, + "step": 3375 + }, + { + "epoch": 0.2438997959073094, + "grad_norm": 6.980923407829122, + "learning_rate": 4.889984692023883e-06, + "loss": 0.9752, + "step": 3376 + }, + { + "epoch": 0.2439720411075189, + "grad_norm": 11.742200494836455, + "learning_rate": 4.889898859379098e-06, + "loss": 0.9701, + "step": 3377 + }, + { + "epoch": 0.24404428630772843, + "grad_norm": 10.024275877715986, + "learning_rate": 4.889812994018453e-06, + "loss": 0.9259, + "step": 3378 + }, + { + "epoch": 0.24411653150793794, + "grad_norm": 7.8813823691860305, + "learning_rate": 4.8897270959431234e-06, + "loss": 1.0822, + "step": 3379 + }, + { + "epoch": 0.24418877670814745, + "grad_norm": 7.527874835312607, + "learning_rate": 4.889641165154286e-06, + "loss": 0.9331, + "step": 3380 + }, + { + "epoch": 0.24426102190835697, + "grad_norm": 10.441113605957916, + "learning_rate": 4.889555201653116e-06, + "loss": 0.944, + "step": 3381 + }, + { + "epoch": 0.24433326710856648, + "grad_norm": 10.254057662572572, + "learning_rate": 4.889469205440791e-06, + "loss": 0.9233, + "step": 3382 + }, + { + "epoch": 0.244405512308776, + "grad_norm": 8.968373573192624, + "learning_rate": 4.889383176518488e-06, + "loss": 0.9174, + "step": 3383 + }, + { + "epoch": 0.2444777575089855, + "grad_norm": 9.16120604658728, + "learning_rate": 4.889297114887383e-06, + "loss": 0.955, + "step": 3384 + }, + { + "epoch": 0.244550002709195, + "grad_norm": 6.210532121204722, + "learning_rate": 4.889211020548657e-06, + "loss": 1.0021, + "step": 3385 + }, + { + "epoch": 0.2446222479094045, + "grad_norm": 10.120206852182273, + "learning_rate": 4.889124893503488e-06, + "loss": 1.0188, + "step": 3386 + }, + { + "epoch": 0.24469449310961403, + "grad_norm": 6.656123630237487, + "learning_rate": 4.889038733753053e-06, + "loss": 0.9342, + "step": 3387 + }, + { + "epoch": 0.24476673830982354, + "grad_norm": 8.506173080107764, + "learning_rate": 4.888952541298533e-06, + "loss": 0.9642, + "step": 3388 + }, + { + "epoch": 0.24483898351003305, + "grad_norm": 7.902283167170763, + "learning_rate": 4.888866316141108e-06, + "loss": 0.914, + "step": 3389 + }, + { + "epoch": 0.24491122871024257, + "grad_norm": 7.081705362572292, + "learning_rate": 4.888780058281958e-06, + "loss": 0.9316, + "step": 3390 + }, + { + "epoch": 0.24498347391045208, + "grad_norm": 7.558471867961127, + "learning_rate": 4.8886937677222635e-06, + "loss": 0.8887, + "step": 3391 + }, + { + "epoch": 0.2450557191106616, + "grad_norm": 8.35705693984436, + "learning_rate": 4.888607444463206e-06, + "loss": 0.8785, + "step": 3392 + }, + { + "epoch": 0.2451279643108711, + "grad_norm": 7.975481608327213, + "learning_rate": 4.888521088505967e-06, + "loss": 0.938, + "step": 3393 + }, + { + "epoch": 0.2452002095110806, + "grad_norm": 7.368014373168843, + "learning_rate": 4.888434699851729e-06, + "loss": 0.9072, + "step": 3394 + }, + { + "epoch": 0.2452724547112901, + "grad_norm": 7.515946284225512, + "learning_rate": 4.888348278501674e-06, + "loss": 0.9056, + "step": 3395 + }, + { + "epoch": 0.24534469991149963, + "grad_norm": 8.326856563083572, + "learning_rate": 4.888261824456987e-06, + "loss": 0.9789, + "step": 3396 + }, + { + "epoch": 0.24541694511170914, + "grad_norm": 6.577963494064713, + "learning_rate": 4.888175337718849e-06, + "loss": 0.992, + "step": 3397 + }, + { + "epoch": 0.24548919031191865, + "grad_norm": 7.583498718917556, + "learning_rate": 4.888088818288444e-06, + "loss": 0.9563, + "step": 3398 + }, + { + "epoch": 0.24556143551212817, + "grad_norm": 9.335896639554935, + "learning_rate": 4.888002266166959e-06, + "loss": 0.9746, + "step": 3399 + }, + { + "epoch": 0.24563368071233768, + "grad_norm": 6.83143797229303, + "learning_rate": 4.887915681355576e-06, + "loss": 0.9189, + "step": 3400 + }, + { + "epoch": 0.2457059259125472, + "grad_norm": 7.680059290100753, + "learning_rate": 4.887829063855481e-06, + "loss": 1.0007, + "step": 3401 + }, + { + "epoch": 0.24577817111275668, + "grad_norm": 8.766836045324759, + "learning_rate": 4.887742413667862e-06, + "loss": 0.9515, + "step": 3402 + }, + { + "epoch": 0.2458504163129662, + "grad_norm": 7.0554371854743225, + "learning_rate": 4.887655730793902e-06, + "loss": 0.9596, + "step": 3403 + }, + { + "epoch": 0.2459226615131757, + "grad_norm": 6.535148953503847, + "learning_rate": 4.8875690152347894e-06, + "loss": 0.8811, + "step": 3404 + }, + { + "epoch": 0.24599490671338523, + "grad_norm": 7.171580589085781, + "learning_rate": 4.8874822669917105e-06, + "loss": 1.0251, + "step": 3405 + }, + { + "epoch": 0.24606715191359474, + "grad_norm": 6.456874690895416, + "learning_rate": 4.887395486065853e-06, + "loss": 0.8446, + "step": 3406 + }, + { + "epoch": 0.24613939711380425, + "grad_norm": 7.050719301580353, + "learning_rate": 4.887308672458405e-06, + "loss": 0.9721, + "step": 3407 + }, + { + "epoch": 0.24621164231401377, + "grad_norm": 6.623595466718833, + "learning_rate": 4.887221826170556e-06, + "loss": 0.8627, + "step": 3408 + }, + { + "epoch": 0.24628388751422328, + "grad_norm": 8.159965049070417, + "learning_rate": 4.887134947203492e-06, + "loss": 0.9124, + "step": 3409 + }, + { + "epoch": 0.2463561327144328, + "grad_norm": 9.534966216485076, + "learning_rate": 4.8870480355584055e-06, + "loss": 0.9994, + "step": 3410 + }, + { + "epoch": 0.24642837791464228, + "grad_norm": 7.129853586456502, + "learning_rate": 4.886961091236484e-06, + "loss": 0.9212, + "step": 3411 + }, + { + "epoch": 0.2465006231148518, + "grad_norm": 7.296707184064636, + "learning_rate": 4.886874114238919e-06, + "loss": 0.9492, + "step": 3412 + }, + { + "epoch": 0.2465728683150613, + "grad_norm": 6.614559944491949, + "learning_rate": 4.8867871045669e-06, + "loss": 0.8448, + "step": 3413 + }, + { + "epoch": 0.24664511351527083, + "grad_norm": 6.39018803900387, + "learning_rate": 4.88670006222162e-06, + "loss": 0.8646, + "step": 3414 + }, + { + "epoch": 0.24671735871548034, + "grad_norm": 7.7855335379602035, + "learning_rate": 4.886612987204268e-06, + "loss": 0.9718, + "step": 3415 + }, + { + "epoch": 0.24678960391568985, + "grad_norm": 7.583956710289486, + "learning_rate": 4.886525879516037e-06, + "loss": 0.9523, + "step": 3416 + }, + { + "epoch": 0.24686184911589937, + "grad_norm": 8.289725633965565, + "learning_rate": 4.886438739158121e-06, + "loss": 0.9414, + "step": 3417 + }, + { + "epoch": 0.24693409431610888, + "grad_norm": 6.913863598392561, + "learning_rate": 4.886351566131712e-06, + "loss": 0.8294, + "step": 3418 + }, + { + "epoch": 0.2470063395163184, + "grad_norm": 7.46916446573257, + "learning_rate": 4.886264360438001e-06, + "loss": 0.9936, + "step": 3419 + }, + { + "epoch": 0.24707858471652788, + "grad_norm": 6.04408818968401, + "learning_rate": 4.886177122078185e-06, + "loss": 0.8916, + "step": 3420 + }, + { + "epoch": 0.2471508299167374, + "grad_norm": 7.5065510431377485, + "learning_rate": 4.886089851053457e-06, + "loss": 0.9663, + "step": 3421 + }, + { + "epoch": 0.2472230751169469, + "grad_norm": 7.4482367457900995, + "learning_rate": 4.886002547365011e-06, + "loss": 0.9062, + "step": 3422 + }, + { + "epoch": 0.24729532031715643, + "grad_norm": 6.743851298365991, + "learning_rate": 4.885915211014043e-06, + "loss": 0.8788, + "step": 3423 + }, + { + "epoch": 0.24736756551736594, + "grad_norm": 7.905183139636387, + "learning_rate": 4.885827842001749e-06, + "loss": 0.9556, + "step": 3424 + }, + { + "epoch": 0.24743981071757545, + "grad_norm": 6.579476913020868, + "learning_rate": 4.885740440329324e-06, + "loss": 0.9348, + "step": 3425 + }, + { + "epoch": 0.24751205591778497, + "grad_norm": 6.621644807912775, + "learning_rate": 4.885653005997964e-06, + "loss": 0.9346, + "step": 3426 + }, + { + "epoch": 0.24758430111799448, + "grad_norm": 8.405409987217913, + "learning_rate": 4.8855655390088675e-06, + "loss": 1.0217, + "step": 3427 + }, + { + "epoch": 0.247656546318204, + "grad_norm": 6.432354991359174, + "learning_rate": 4.88547803936323e-06, + "loss": 0.8877, + "step": 3428 + }, + { + "epoch": 0.24772879151841348, + "grad_norm": 6.2756106744410305, + "learning_rate": 4.885390507062251e-06, + "loss": 0.8532, + "step": 3429 + }, + { + "epoch": 0.247801036718623, + "grad_norm": 7.518746661584228, + "learning_rate": 4.885302942107129e-06, + "loss": 1.0225, + "step": 3430 + }, + { + "epoch": 0.2478732819188325, + "grad_norm": 7.233602725636374, + "learning_rate": 4.885215344499061e-06, + "loss": 0.8716, + "step": 3431 + }, + { + "epoch": 0.24794552711904203, + "grad_norm": 6.848843843109922, + "learning_rate": 4.8851277142392466e-06, + "loss": 0.9746, + "step": 3432 + }, + { + "epoch": 0.24801777231925154, + "grad_norm": 8.408009171325734, + "learning_rate": 4.885040051328886e-06, + "loss": 0.955, + "step": 3433 + }, + { + "epoch": 0.24809001751946105, + "grad_norm": 7.4558399941462365, + "learning_rate": 4.8849523557691795e-06, + "loss": 0.9654, + "step": 3434 + }, + { + "epoch": 0.24816226271967057, + "grad_norm": 6.705526708614425, + "learning_rate": 4.884864627561326e-06, + "loss": 0.8632, + "step": 3435 + }, + { + "epoch": 0.24823450791988008, + "grad_norm": 6.222319072393497, + "learning_rate": 4.884776866706528e-06, + "loss": 0.8731, + "step": 3436 + }, + { + "epoch": 0.2483067531200896, + "grad_norm": 8.223273711228707, + "learning_rate": 4.884689073205986e-06, + "loss": 0.8586, + "step": 3437 + }, + { + "epoch": 0.24837899832029908, + "grad_norm": 7.077856601906289, + "learning_rate": 4.884601247060903e-06, + "loss": 0.9208, + "step": 3438 + }, + { + "epoch": 0.2484512435205086, + "grad_norm": 8.726581733976118, + "learning_rate": 4.88451338827248e-06, + "loss": 0.9522, + "step": 3439 + }, + { + "epoch": 0.2485234887207181, + "grad_norm": 7.077888939565265, + "learning_rate": 4.884425496841921e-06, + "loss": 0.988, + "step": 3440 + }, + { + "epoch": 0.24859573392092763, + "grad_norm": 9.151367552238948, + "learning_rate": 4.884337572770427e-06, + "loss": 1.0242, + "step": 3441 + }, + { + "epoch": 0.24866797912113714, + "grad_norm": 6.227319688119992, + "learning_rate": 4.884249616059203e-06, + "loss": 0.8627, + "step": 3442 + }, + { + "epoch": 0.24874022432134665, + "grad_norm": 6.396893176345085, + "learning_rate": 4.884161626709453e-06, + "loss": 0.8649, + "step": 3443 + }, + { + "epoch": 0.24881246952155617, + "grad_norm": 8.745652781006, + "learning_rate": 4.884073604722383e-06, + "loss": 0.9297, + "step": 3444 + }, + { + "epoch": 0.24888471472176568, + "grad_norm": 6.663077850761801, + "learning_rate": 4.883985550099195e-06, + "loss": 0.9596, + "step": 3445 + }, + { + "epoch": 0.2489569599219752, + "grad_norm": 7.201797992003591, + "learning_rate": 4.883897462841096e-06, + "loss": 0.9238, + "step": 3446 + }, + { + "epoch": 0.24902920512218468, + "grad_norm": 6.659312228869605, + "learning_rate": 4.883809342949292e-06, + "loss": 0.9028, + "step": 3447 + }, + { + "epoch": 0.2491014503223942, + "grad_norm": 7.406669508243614, + "learning_rate": 4.883721190424989e-06, + "loss": 0.892, + "step": 3448 + }, + { + "epoch": 0.2491736955226037, + "grad_norm": 7.8415760203955545, + "learning_rate": 4.883633005269394e-06, + "loss": 0.8985, + "step": 3449 + }, + { + "epoch": 0.24924594072281323, + "grad_norm": 8.213105442716756, + "learning_rate": 4.883544787483713e-06, + "loss": 0.8643, + "step": 3450 + }, + { + "epoch": 0.24931818592302274, + "grad_norm": 5.8504596888241736, + "learning_rate": 4.883456537069155e-06, + "loss": 0.8769, + "step": 3451 + }, + { + "epoch": 0.24939043112323225, + "grad_norm": 6.939346755007309, + "learning_rate": 4.883368254026928e-06, + "loss": 1.0508, + "step": 3452 + }, + { + "epoch": 0.24946267632344177, + "grad_norm": 7.653200873625398, + "learning_rate": 4.883279938358241e-06, + "loss": 0.9325, + "step": 3453 + }, + { + "epoch": 0.24953492152365128, + "grad_norm": 6.789116722039658, + "learning_rate": 4.8831915900643e-06, + "loss": 0.9184, + "step": 3454 + }, + { + "epoch": 0.2496071667238608, + "grad_norm": 6.273734535795627, + "learning_rate": 4.883103209146318e-06, + "loss": 0.8881, + "step": 3455 + }, + { + "epoch": 0.24967941192407028, + "grad_norm": 6.243132213008272, + "learning_rate": 4.883014795605503e-06, + "loss": 0.8914, + "step": 3456 + }, + { + "epoch": 0.2497516571242798, + "grad_norm": 6.374114629561585, + "learning_rate": 4.882926349443067e-06, + "loss": 0.8343, + "step": 3457 + }, + { + "epoch": 0.2498239023244893, + "grad_norm": 6.206470423353904, + "learning_rate": 4.882837870660218e-06, + "loss": 0.9313, + "step": 3458 + }, + { + "epoch": 0.24989614752469883, + "grad_norm": 6.303956245832075, + "learning_rate": 4.882749359258169e-06, + "loss": 0.8798, + "step": 3459 + }, + { + "epoch": 0.24996839272490834, + "grad_norm": 7.58755378515546, + "learning_rate": 4.882660815238132e-06, + "loss": 0.8613, + "step": 3460 + }, + { + "epoch": 0.2500406379251178, + "grad_norm": 5.7157241237047804, + "learning_rate": 4.882572238601317e-06, + "loss": 0.9001, + "step": 3461 + }, + { + "epoch": 0.25011288312532737, + "grad_norm": 8.720141228048396, + "learning_rate": 4.8824836293489395e-06, + "loss": 0.9527, + "step": 3462 + }, + { + "epoch": 0.25018512832553685, + "grad_norm": 7.26981198661979, + "learning_rate": 4.8823949874822105e-06, + "loss": 0.9461, + "step": 3463 + }, + { + "epoch": 0.2502573735257464, + "grad_norm": 7.297674817313924, + "learning_rate": 4.882306313002344e-06, + "loss": 0.9445, + "step": 3464 + }, + { + "epoch": 0.2503296187259559, + "grad_norm": 7.154221692889162, + "learning_rate": 4.882217605910553e-06, + "loss": 1.014, + "step": 3465 + }, + { + "epoch": 0.2504018639261654, + "grad_norm": 8.044597295968076, + "learning_rate": 4.882128866208054e-06, + "loss": 0.8515, + "step": 3466 + }, + { + "epoch": 0.2504741091263749, + "grad_norm": 6.461473583412736, + "learning_rate": 4.88204009389606e-06, + "loss": 0.8886, + "step": 3467 + }, + { + "epoch": 0.25054635432658445, + "grad_norm": 7.424916369034617, + "learning_rate": 4.881951288975786e-06, + "loss": 0.9515, + "step": 3468 + }, + { + "epoch": 0.25061859952679394, + "grad_norm": 8.290018758165882, + "learning_rate": 4.881862451448448e-06, + "loss": 1.0045, + "step": 3469 + }, + { + "epoch": 0.2506908447270034, + "grad_norm": 7.2560536334191115, + "learning_rate": 4.881773581315263e-06, + "loss": 0.9569, + "step": 3470 + }, + { + "epoch": 0.25076308992721297, + "grad_norm": 8.376426916834314, + "learning_rate": 4.881684678577448e-06, + "loss": 0.9666, + "step": 3471 + }, + { + "epoch": 0.25083533512742245, + "grad_norm": 6.5240886066344, + "learning_rate": 4.881595743236218e-06, + "loss": 0.9622, + "step": 3472 + }, + { + "epoch": 0.250907580327632, + "grad_norm": 7.575302258998353, + "learning_rate": 4.881506775292792e-06, + "loss": 1.0698, + "step": 3473 + }, + { + "epoch": 0.2509798255278415, + "grad_norm": 6.566626132639891, + "learning_rate": 4.8814177747483865e-06, + "loss": 0.9421, + "step": 3474 + }, + { + "epoch": 0.251052070728051, + "grad_norm": 7.201517517386364, + "learning_rate": 4.881328741604221e-06, + "loss": 0.8925, + "step": 3475 + }, + { + "epoch": 0.2511243159282605, + "grad_norm": 6.537192821397489, + "learning_rate": 4.881239675861515e-06, + "loss": 0.8786, + "step": 3476 + }, + { + "epoch": 0.25119656112847005, + "grad_norm": 8.946010992106954, + "learning_rate": 4.881150577521485e-06, + "loss": 0.9888, + "step": 3477 + }, + { + "epoch": 0.25126880632867954, + "grad_norm": 6.2953015200248625, + "learning_rate": 4.881061446585354e-06, + "loss": 0.9857, + "step": 3478 + }, + { + "epoch": 0.251341051528889, + "grad_norm": 6.741417125835039, + "learning_rate": 4.88097228305434e-06, + "loss": 0.9579, + "step": 3479 + }, + { + "epoch": 0.25141329672909857, + "grad_norm": 5.577879828186114, + "learning_rate": 4.880883086929664e-06, + "loss": 0.9148, + "step": 3480 + }, + { + "epoch": 0.25148554192930805, + "grad_norm": 6.222309263316906, + "learning_rate": 4.880793858212547e-06, + "loss": 0.9961, + "step": 3481 + }, + { + "epoch": 0.2515577871295176, + "grad_norm": 6.64448302451635, + "learning_rate": 4.880704596904211e-06, + "loss": 0.9446, + "step": 3482 + }, + { + "epoch": 0.2516300323297271, + "grad_norm": 6.480029017065566, + "learning_rate": 4.880615303005878e-06, + "loss": 0.998, + "step": 3483 + }, + { + "epoch": 0.2517022775299366, + "grad_norm": 6.793654106186298, + "learning_rate": 4.880525976518769e-06, + "loss": 0.9537, + "step": 3484 + }, + { + "epoch": 0.2517745227301461, + "grad_norm": 6.882881501942002, + "learning_rate": 4.880436617444108e-06, + "loss": 0.9486, + "step": 3485 + }, + { + "epoch": 0.25184676793035565, + "grad_norm": 6.623077401507356, + "learning_rate": 4.880347225783119e-06, + "loss": 0.8481, + "step": 3486 + }, + { + "epoch": 0.25191901313056514, + "grad_norm": 6.896385362854808, + "learning_rate": 4.880257801537023e-06, + "loss": 0.9711, + "step": 3487 + }, + { + "epoch": 0.2519912583307746, + "grad_norm": 6.435631341915334, + "learning_rate": 4.880168344707047e-06, + "loss": 0.905, + "step": 3488 + }, + { + "epoch": 0.25206350353098417, + "grad_norm": 7.493689107130491, + "learning_rate": 4.880078855294414e-06, + "loss": 0.887, + "step": 3489 + }, + { + "epoch": 0.25213574873119365, + "grad_norm": 8.323233880470013, + "learning_rate": 4.87998933330035e-06, + "loss": 0.974, + "step": 3490 + }, + { + "epoch": 0.2522079939314032, + "grad_norm": 5.910491550613456, + "learning_rate": 4.8798997787260795e-06, + "loss": 0.9941, + "step": 3491 + }, + { + "epoch": 0.2522802391316127, + "grad_norm": 7.235553950237457, + "learning_rate": 4.879810191572829e-06, + "loss": 0.9337, + "step": 3492 + }, + { + "epoch": 0.2523524843318222, + "grad_norm": 6.816845549060511, + "learning_rate": 4.879720571841825e-06, + "loss": 0.9043, + "step": 3493 + }, + { + "epoch": 0.2524247295320317, + "grad_norm": 7.38383446134453, + "learning_rate": 4.879630919534294e-06, + "loss": 0.8146, + "step": 3494 + }, + { + "epoch": 0.25249697473224125, + "grad_norm": 7.607549742185981, + "learning_rate": 4.879541234651462e-06, + "loss": 0.8639, + "step": 3495 + }, + { + "epoch": 0.25256921993245074, + "grad_norm": 8.412019812835855, + "learning_rate": 4.87945151719456e-06, + "loss": 0.9975, + "step": 3496 + }, + { + "epoch": 0.2526414651326602, + "grad_norm": 7.117572593948513, + "learning_rate": 4.879361767164814e-06, + "loss": 0.867, + "step": 3497 + }, + { + "epoch": 0.25271371033286977, + "grad_norm": 5.585396608427866, + "learning_rate": 4.879271984563452e-06, + "loss": 0.8911, + "step": 3498 + }, + { + "epoch": 0.25278595553307925, + "grad_norm": 6.908547411992625, + "learning_rate": 4.879182169391705e-06, + "loss": 0.9546, + "step": 3499 + }, + { + "epoch": 0.2528582007332888, + "grad_norm": 6.461022520601422, + "learning_rate": 4.879092321650801e-06, + "loss": 0.8859, + "step": 3500 + }, + { + "epoch": 0.2529304459334983, + "grad_norm": 6.715059340180001, + "learning_rate": 4.87900244134197e-06, + "loss": 0.9174, + "step": 3501 + }, + { + "epoch": 0.2530026911337078, + "grad_norm": 6.599348683442503, + "learning_rate": 4.878912528466443e-06, + "loss": 0.9168, + "step": 3502 + }, + { + "epoch": 0.2530749363339173, + "grad_norm": 7.62729997810855, + "learning_rate": 4.87882258302545e-06, + "loss": 0.91, + "step": 3503 + }, + { + "epoch": 0.25314718153412685, + "grad_norm": 6.207885760190201, + "learning_rate": 4.878732605020224e-06, + "loss": 0.8386, + "step": 3504 + }, + { + "epoch": 0.25321942673433634, + "grad_norm": 7.2130445251656035, + "learning_rate": 4.878642594451994e-06, + "loss": 0.8404, + "step": 3505 + }, + { + "epoch": 0.2532916719345458, + "grad_norm": 9.65150745061806, + "learning_rate": 4.878552551321996e-06, + "loss": 0.8866, + "step": 3506 + }, + { + "epoch": 0.25336391713475537, + "grad_norm": 6.54313199423222, + "learning_rate": 4.878462475631459e-06, + "loss": 0.9719, + "step": 3507 + }, + { + "epoch": 0.25343616233496485, + "grad_norm": 6.301877592737553, + "learning_rate": 4.878372367381617e-06, + "loss": 1.0084, + "step": 3508 + }, + { + "epoch": 0.2535084075351744, + "grad_norm": 7.542754570771383, + "learning_rate": 4.878282226573705e-06, + "loss": 0.9535, + "step": 3509 + }, + { + "epoch": 0.2535806527353839, + "grad_norm": 6.420536542201601, + "learning_rate": 4.878192053208955e-06, + "loss": 0.9328, + "step": 3510 + }, + { + "epoch": 0.2536528979355934, + "grad_norm": 7.349138855603785, + "learning_rate": 4.878101847288603e-06, + "loss": 0.9451, + "step": 3511 + }, + { + "epoch": 0.2537251431358029, + "grad_norm": 7.230274331783117, + "learning_rate": 4.878011608813884e-06, + "loss": 0.9435, + "step": 3512 + }, + { + "epoch": 0.25379738833601245, + "grad_norm": 5.764082208580598, + "learning_rate": 4.8779213377860315e-06, + "loss": 0.9273, + "step": 3513 + }, + { + "epoch": 0.25386963353622194, + "grad_norm": 6.474663873733681, + "learning_rate": 4.877831034206282e-06, + "loss": 0.885, + "step": 3514 + }, + { + "epoch": 0.2539418787364314, + "grad_norm": 8.55338219565005, + "learning_rate": 4.877740698075872e-06, + "loss": 0.8968, + "step": 3515 + }, + { + "epoch": 0.25401412393664097, + "grad_norm": 8.233926547323504, + "learning_rate": 4.877650329396038e-06, + "loss": 0.8953, + "step": 3516 + }, + { + "epoch": 0.25408636913685045, + "grad_norm": 6.3287518979604105, + "learning_rate": 4.8775599281680175e-06, + "loss": 0.9403, + "step": 3517 + }, + { + "epoch": 0.25415861433706, + "grad_norm": 7.51379626066785, + "learning_rate": 4.877469494393048e-06, + "loss": 0.9436, + "step": 3518 + }, + { + "epoch": 0.2542308595372695, + "grad_norm": 7.656242277180416, + "learning_rate": 4.877379028072367e-06, + "loss": 0.9568, + "step": 3519 + }, + { + "epoch": 0.254303104737479, + "grad_norm": 7.4672773497484615, + "learning_rate": 4.877288529207212e-06, + "loss": 1.0035, + "step": 3520 + }, + { + "epoch": 0.2543753499376885, + "grad_norm": 7.18450653101786, + "learning_rate": 4.877197997798824e-06, + "loss": 1.0059, + "step": 3521 + }, + { + "epoch": 0.25444759513789805, + "grad_norm": 6.218931569271465, + "learning_rate": 4.87710743384844e-06, + "loss": 0.921, + "step": 3522 + }, + { + "epoch": 0.25451984033810754, + "grad_norm": 6.307055249706599, + "learning_rate": 4.877016837357301e-06, + "loss": 0.9575, + "step": 3523 + }, + { + "epoch": 0.254592085538317, + "grad_norm": 6.488352389924567, + "learning_rate": 4.876926208326649e-06, + "loss": 0.8822, + "step": 3524 + }, + { + "epoch": 0.25466433073852657, + "grad_norm": 7.183881562002188, + "learning_rate": 4.876835546757721e-06, + "loss": 0.9734, + "step": 3525 + }, + { + "epoch": 0.25473657593873605, + "grad_norm": 6.5888723257770465, + "learning_rate": 4.87674485265176e-06, + "loss": 0.9232, + "step": 3526 + }, + { + "epoch": 0.2548088211389456, + "grad_norm": 6.359421188599156, + "learning_rate": 4.876654126010009e-06, + "loss": 0.9054, + "step": 3527 + }, + { + "epoch": 0.2548810663391551, + "grad_norm": 7.016412023905736, + "learning_rate": 4.876563366833706e-06, + "loss": 0.9017, + "step": 3528 + }, + { + "epoch": 0.2549533115393646, + "grad_norm": 6.798399164595432, + "learning_rate": 4.876472575124097e-06, + "loss": 0.8595, + "step": 3529 + }, + { + "epoch": 0.2550255567395741, + "grad_norm": 7.304072814210779, + "learning_rate": 4.876381750882424e-06, + "loss": 0.8878, + "step": 3530 + }, + { + "epoch": 0.25509780193978365, + "grad_norm": 6.212495812274589, + "learning_rate": 4.876290894109929e-06, + "loss": 0.8993, + "step": 3531 + }, + { + "epoch": 0.25517004713999314, + "grad_norm": 7.823363784494558, + "learning_rate": 4.876200004807858e-06, + "loss": 0.9318, + "step": 3532 + }, + { + "epoch": 0.2552422923402026, + "grad_norm": 6.19392508404625, + "learning_rate": 4.8761090829774535e-06, + "loss": 0.9016, + "step": 3533 + }, + { + "epoch": 0.25531453754041217, + "grad_norm": 6.953131857879343, + "learning_rate": 4.8760181286199605e-06, + "loss": 0.9837, + "step": 3534 + }, + { + "epoch": 0.25538678274062165, + "grad_norm": 5.840229789707083, + "learning_rate": 4.875927141736624e-06, + "loss": 0.8503, + "step": 3535 + }, + { + "epoch": 0.2554590279408312, + "grad_norm": 8.340354657213652, + "learning_rate": 4.875836122328689e-06, + "loss": 0.9188, + "step": 3536 + }, + { + "epoch": 0.2555312731410407, + "grad_norm": 6.429161523045155, + "learning_rate": 4.875745070397403e-06, + "loss": 0.8831, + "step": 3537 + }, + { + "epoch": 0.2556035183412502, + "grad_norm": 7.719427781714109, + "learning_rate": 4.8756539859440115e-06, + "loss": 1.0099, + "step": 3538 + }, + { + "epoch": 0.2556757635414597, + "grad_norm": 6.437044590516888, + "learning_rate": 4.875562868969762e-06, + "loss": 0.8635, + "step": 3539 + }, + { + "epoch": 0.25574800874166925, + "grad_norm": 6.840694215580332, + "learning_rate": 4.8754717194759e-06, + "loss": 0.9019, + "step": 3540 + }, + { + "epoch": 0.25582025394187874, + "grad_norm": 7.236279626187005, + "learning_rate": 4.875380537463677e-06, + "loss": 0.9502, + "step": 3541 + }, + { + "epoch": 0.2558924991420882, + "grad_norm": 7.220551237160293, + "learning_rate": 4.875289322934336e-06, + "loss": 0.878, + "step": 3542 + }, + { + "epoch": 0.25596474434229777, + "grad_norm": 6.451903590144806, + "learning_rate": 4.875198075889131e-06, + "loss": 0.8947, + "step": 3543 + }, + { + "epoch": 0.25603698954250725, + "grad_norm": 6.6460866212162015, + "learning_rate": 4.875106796329307e-06, + "loss": 0.9514, + "step": 3544 + }, + { + "epoch": 0.2561092347427168, + "grad_norm": 5.836169216555289, + "learning_rate": 4.8750154842561146e-06, + "loss": 0.9059, + "step": 3545 + }, + { + "epoch": 0.2561814799429263, + "grad_norm": 7.454507311704369, + "learning_rate": 4.874924139670805e-06, + "loss": 0.9591, + "step": 3546 + }, + { + "epoch": 0.2562537251431358, + "grad_norm": 8.347926776734841, + "learning_rate": 4.874832762574628e-06, + "loss": 1.0304, + "step": 3547 + }, + { + "epoch": 0.2563259703433453, + "grad_norm": 6.169933931315382, + "learning_rate": 4.874741352968835e-06, + "loss": 0.9694, + "step": 3548 + }, + { + "epoch": 0.25639821554355485, + "grad_norm": 5.51153222409843, + "learning_rate": 4.8746499108546764e-06, + "loss": 0.8441, + "step": 3549 + }, + { + "epoch": 0.25647046074376434, + "grad_norm": 9.169258537309515, + "learning_rate": 4.8745584362334045e-06, + "loss": 0.8878, + "step": 3550 + }, + { + "epoch": 0.2565427059439738, + "grad_norm": 6.886814854600981, + "learning_rate": 4.874466929106271e-06, + "loss": 0.9546, + "step": 3551 + }, + { + "epoch": 0.25661495114418337, + "grad_norm": 7.625986254335786, + "learning_rate": 4.874375389474528e-06, + "loss": 0.9144, + "step": 3552 + }, + { + "epoch": 0.25668719634439285, + "grad_norm": 6.764379937400028, + "learning_rate": 4.87428381733943e-06, + "loss": 0.8869, + "step": 3553 + }, + { + "epoch": 0.2567594415446024, + "grad_norm": 6.649156399330212, + "learning_rate": 4.874192212702231e-06, + "loss": 0.8783, + "step": 3554 + }, + { + "epoch": 0.2568316867448119, + "grad_norm": 6.297106717297493, + "learning_rate": 4.874100575564184e-06, + "loss": 0.9766, + "step": 3555 + }, + { + "epoch": 0.2569039319450214, + "grad_norm": 7.470065076919877, + "learning_rate": 4.874008905926543e-06, + "loss": 0.9411, + "step": 3556 + }, + { + "epoch": 0.2569761771452309, + "grad_norm": 7.127868610271444, + "learning_rate": 4.873917203790563e-06, + "loss": 0.9437, + "step": 3557 + }, + { + "epoch": 0.2570484223454404, + "grad_norm": 7.020860651191367, + "learning_rate": 4.8738254691575e-06, + "loss": 1.0402, + "step": 3558 + }, + { + "epoch": 0.25712066754564994, + "grad_norm": 7.97651969756207, + "learning_rate": 4.8737337020286095e-06, + "loss": 0.8523, + "step": 3559 + }, + { + "epoch": 0.2571929127458594, + "grad_norm": 7.783901023447194, + "learning_rate": 4.873641902405148e-06, + "loss": 0.8732, + "step": 3560 + }, + { + "epoch": 0.25726515794606897, + "grad_norm": 7.00135000017505, + "learning_rate": 4.873550070288371e-06, + "loss": 1.0305, + "step": 3561 + }, + { + "epoch": 0.25733740314627845, + "grad_norm": 7.594185522329474, + "learning_rate": 4.873458205679538e-06, + "loss": 0.9116, + "step": 3562 + }, + { + "epoch": 0.257409648346488, + "grad_norm": 8.194282343351283, + "learning_rate": 4.873366308579903e-06, + "loss": 0.9613, + "step": 3563 + }, + { + "epoch": 0.2574818935466975, + "grad_norm": 7.042176026952186, + "learning_rate": 4.8732743789907274e-06, + "loss": 0.933, + "step": 3564 + }, + { + "epoch": 0.257554138746907, + "grad_norm": 7.4058405123151045, + "learning_rate": 4.873182416913268e-06, + "loss": 0.9094, + "step": 3565 + }, + { + "epoch": 0.2576263839471165, + "grad_norm": 5.918841790491544, + "learning_rate": 4.873090422348784e-06, + "loss": 0.9634, + "step": 3566 + }, + { + "epoch": 0.257698629147326, + "grad_norm": 7.037813505571811, + "learning_rate": 4.872998395298535e-06, + "loss": 0.9854, + "step": 3567 + }, + { + "epoch": 0.25777087434753554, + "grad_norm": 8.121927003007814, + "learning_rate": 4.87290633576378e-06, + "loss": 0.9923, + "step": 3568 + }, + { + "epoch": 0.257843119547745, + "grad_norm": 9.398714716272172, + "learning_rate": 4.872814243745781e-06, + "loss": 0.9057, + "step": 3569 + }, + { + "epoch": 0.25791536474795457, + "grad_norm": 6.2307231408685055, + "learning_rate": 4.872722119245795e-06, + "loss": 0.8694, + "step": 3570 + }, + { + "epoch": 0.25798760994816405, + "grad_norm": 6.570788281209083, + "learning_rate": 4.872629962265087e-06, + "loss": 0.9389, + "step": 3571 + }, + { + "epoch": 0.2580598551483736, + "grad_norm": 7.466860735732666, + "learning_rate": 4.872537772804917e-06, + "loss": 0.8858, + "step": 3572 + }, + { + "epoch": 0.2581321003485831, + "grad_norm": 6.4114016776495895, + "learning_rate": 4.872445550866547e-06, + "loss": 0.9396, + "step": 3573 + }, + { + "epoch": 0.2582043455487926, + "grad_norm": 6.752568921694641, + "learning_rate": 4.872353296451239e-06, + "loss": 0.9846, + "step": 3574 + }, + { + "epoch": 0.2582765907490021, + "grad_norm": 7.740037482907716, + "learning_rate": 4.872261009560257e-06, + "loss": 0.9119, + "step": 3575 + }, + { + "epoch": 0.2583488359492116, + "grad_norm": 6.612789489524347, + "learning_rate": 4.872168690194864e-06, + "loss": 0.8894, + "step": 3576 + }, + { + "epoch": 0.25842108114942114, + "grad_norm": 8.542578026758768, + "learning_rate": 4.872076338356322e-06, + "loss": 0.9837, + "step": 3577 + }, + { + "epoch": 0.2584933263496306, + "grad_norm": 7.818200069027138, + "learning_rate": 4.871983954045898e-06, + "loss": 0.9012, + "step": 3578 + }, + { + "epoch": 0.25856557154984017, + "grad_norm": 7.6017973531295135, + "learning_rate": 4.871891537264855e-06, + "loss": 0.9584, + "step": 3579 + }, + { + "epoch": 0.25863781675004965, + "grad_norm": 5.6955131003552575, + "learning_rate": 4.871799088014459e-06, + "loss": 0.9561, + "step": 3580 + }, + { + "epoch": 0.2587100619502592, + "grad_norm": 8.37459187438409, + "learning_rate": 4.871706606295974e-06, + "loss": 0.8873, + "step": 3581 + }, + { + "epoch": 0.2587823071504687, + "grad_norm": 8.118031491854067, + "learning_rate": 4.871614092110668e-06, + "loss": 0.9088, + "step": 3582 + }, + { + "epoch": 0.2588545523506782, + "grad_norm": 10.397013646294328, + "learning_rate": 4.871521545459806e-06, + "loss": 0.9381, + "step": 3583 + }, + { + "epoch": 0.2589267975508877, + "grad_norm": 7.4611169603744125, + "learning_rate": 4.871428966344656e-06, + "loss": 0.9384, + "step": 3584 + }, + { + "epoch": 0.2589990427510972, + "grad_norm": 7.521483867826522, + "learning_rate": 4.871336354766484e-06, + "loss": 0.9472, + "step": 3585 + }, + { + "epoch": 0.25907128795130674, + "grad_norm": 6.090024464625436, + "learning_rate": 4.871243710726559e-06, + "loss": 0.8725, + "step": 3586 + }, + { + "epoch": 0.2591435331515162, + "grad_norm": 6.54139527488853, + "learning_rate": 4.871151034226149e-06, + "loss": 0.8667, + "step": 3587 + }, + { + "epoch": 0.25921577835172577, + "grad_norm": 6.996006507691661, + "learning_rate": 4.8710583252665225e-06, + "loss": 0.9323, + "step": 3588 + }, + { + "epoch": 0.25928802355193525, + "grad_norm": 8.784249115988132, + "learning_rate": 4.870965583848948e-06, + "loss": 0.9002, + "step": 3589 + }, + { + "epoch": 0.2593602687521448, + "grad_norm": 9.060488142122098, + "learning_rate": 4.870872809974695e-06, + "loss": 0.8659, + "step": 3590 + }, + { + "epoch": 0.2594325139523543, + "grad_norm": 6.890019604304844, + "learning_rate": 4.8707800036450355e-06, + "loss": 0.9378, + "step": 3591 + }, + { + "epoch": 0.2595047591525638, + "grad_norm": 7.030924200351188, + "learning_rate": 4.870687164861237e-06, + "loss": 0.9232, + "step": 3592 + }, + { + "epoch": 0.2595770043527733, + "grad_norm": 10.593975773658872, + "learning_rate": 4.870594293624573e-06, + "loss": 0.9376, + "step": 3593 + }, + { + "epoch": 0.2596492495529828, + "grad_norm": 7.90953339561677, + "learning_rate": 4.870501389936314e-06, + "loss": 1.0555, + "step": 3594 + }, + { + "epoch": 0.25972149475319234, + "grad_norm": 6.99641953587844, + "learning_rate": 4.8704084537977314e-06, + "loss": 0.9444, + "step": 3595 + }, + { + "epoch": 0.2597937399534018, + "grad_norm": 8.216409444232525, + "learning_rate": 4.870315485210097e-06, + "loss": 0.9127, + "step": 3596 + }, + { + "epoch": 0.25986598515361137, + "grad_norm": 10.6619378177602, + "learning_rate": 4.870222484174684e-06, + "loss": 0.8827, + "step": 3597 + }, + { + "epoch": 0.25993823035382085, + "grad_norm": 6.930212093482027, + "learning_rate": 4.870129450692766e-06, + "loss": 0.8839, + "step": 3598 + }, + { + "epoch": 0.2600104755540304, + "grad_norm": 8.189423080451089, + "learning_rate": 4.870036384765616e-06, + "loss": 0.9393, + "step": 3599 + }, + { + "epoch": 0.2600827207542399, + "grad_norm": 6.459669438458162, + "learning_rate": 4.869943286394508e-06, + "loss": 0.9727, + "step": 3600 + }, + { + "epoch": 0.2601549659544494, + "grad_norm": 6.941092995534367, + "learning_rate": 4.869850155580717e-06, + "loss": 0.9973, + "step": 3601 + }, + { + "epoch": 0.2602272111546589, + "grad_norm": 6.057870572067376, + "learning_rate": 4.8697569923255175e-06, + "loss": 0.9686, + "step": 3602 + }, + { + "epoch": 0.2602994563548684, + "grad_norm": 7.103525828285221, + "learning_rate": 4.869663796630184e-06, + "loss": 0.8995, + "step": 3603 + }, + { + "epoch": 0.26037170155507794, + "grad_norm": 9.549568957442489, + "learning_rate": 4.869570568495994e-06, + "loss": 1.0213, + "step": 3604 + }, + { + "epoch": 0.2604439467552874, + "grad_norm": 7.725003446806599, + "learning_rate": 4.869477307924222e-06, + "loss": 0.8976, + "step": 3605 + }, + { + "epoch": 0.26051619195549697, + "grad_norm": 6.410506458002808, + "learning_rate": 4.869384014916145e-06, + "loss": 0.9041, + "step": 3606 + }, + { + "epoch": 0.26058843715570645, + "grad_norm": 6.417109222343373, + "learning_rate": 4.869290689473041e-06, + "loss": 0.8728, + "step": 3607 + }, + { + "epoch": 0.260660682355916, + "grad_norm": 6.942231360551716, + "learning_rate": 4.869197331596187e-06, + "loss": 0.8941, + "step": 3608 + }, + { + "epoch": 0.2607329275561255, + "grad_norm": 6.652866416912285, + "learning_rate": 4.869103941286862e-06, + "loss": 0.9211, + "step": 3609 + }, + { + "epoch": 0.260805172756335, + "grad_norm": 6.727375402299681, + "learning_rate": 4.869010518546343e-06, + "loss": 0.9494, + "step": 3610 + }, + { + "epoch": 0.2608774179565445, + "grad_norm": 7.453723299655819, + "learning_rate": 4.868917063375909e-06, + "loss": 0.9939, + "step": 3611 + }, + { + "epoch": 0.260949663156754, + "grad_norm": 8.21291222314342, + "learning_rate": 4.86882357577684e-06, + "loss": 0.9769, + "step": 3612 + }, + { + "epoch": 0.26102190835696354, + "grad_norm": 5.820057867227856, + "learning_rate": 4.868730055750416e-06, + "loss": 0.9605, + "step": 3613 + }, + { + "epoch": 0.261094153557173, + "grad_norm": 7.1409532734477725, + "learning_rate": 4.868636503297916e-06, + "loss": 0.9108, + "step": 3614 + }, + { + "epoch": 0.26116639875738257, + "grad_norm": 6.789009682260021, + "learning_rate": 4.868542918420621e-06, + "loss": 0.9324, + "step": 3615 + }, + { + "epoch": 0.26123864395759205, + "grad_norm": 5.755989272584125, + "learning_rate": 4.868449301119814e-06, + "loss": 0.8625, + "step": 3616 + }, + { + "epoch": 0.2613108891578016, + "grad_norm": 6.061973627579933, + "learning_rate": 4.868355651396775e-06, + "loss": 0.9142, + "step": 3617 + }, + { + "epoch": 0.2613831343580111, + "grad_norm": 6.630917767139803, + "learning_rate": 4.868261969252784e-06, + "loss": 0.8956, + "step": 3618 + }, + { + "epoch": 0.2614553795582206, + "grad_norm": 6.895399312385476, + "learning_rate": 4.868168254689127e-06, + "loss": 0.8935, + "step": 3619 + }, + { + "epoch": 0.2615276247584301, + "grad_norm": 8.536695370380723, + "learning_rate": 4.868074507707085e-06, + "loss": 0.8901, + "step": 3620 + }, + { + "epoch": 0.2615998699586396, + "grad_norm": 6.7177577505858785, + "learning_rate": 4.8679807283079416e-06, + "loss": 0.956, + "step": 3621 + }, + { + "epoch": 0.26167211515884914, + "grad_norm": 7.437673871227892, + "learning_rate": 4.867886916492981e-06, + "loss": 0.9553, + "step": 3622 + }, + { + "epoch": 0.2617443603590586, + "grad_norm": 7.175012238920719, + "learning_rate": 4.867793072263487e-06, + "loss": 0.8572, + "step": 3623 + }, + { + "epoch": 0.26181660555926817, + "grad_norm": 6.537305734931834, + "learning_rate": 4.8676991956207444e-06, + "loss": 0.9141, + "step": 3624 + }, + { + "epoch": 0.26188885075947765, + "grad_norm": 8.0036808129245, + "learning_rate": 4.867605286566039e-06, + "loss": 0.945, + "step": 3625 + }, + { + "epoch": 0.2619610959596872, + "grad_norm": 6.457301231689392, + "learning_rate": 4.867511345100654e-06, + "loss": 1.0005, + "step": 3626 + }, + { + "epoch": 0.2620333411598967, + "grad_norm": 9.14314576953623, + "learning_rate": 4.867417371225877e-06, + "loss": 0.9399, + "step": 3627 + }, + { + "epoch": 0.2621055863601062, + "grad_norm": 7.320146003059196, + "learning_rate": 4.8673233649429954e-06, + "loss": 0.9458, + "step": 3628 + }, + { + "epoch": 0.2621778315603157, + "grad_norm": 6.8506766326186375, + "learning_rate": 4.867229326253295e-06, + "loss": 0.8538, + "step": 3629 + }, + { + "epoch": 0.2622500767605252, + "grad_norm": 7.39724918477869, + "learning_rate": 4.867135255158062e-06, + "loss": 0.882, + "step": 3630 + }, + { + "epoch": 0.26232232196073474, + "grad_norm": 6.344171218793606, + "learning_rate": 4.867041151658586e-06, + "loss": 0.9947, + "step": 3631 + }, + { + "epoch": 0.2623945671609442, + "grad_norm": 6.191980762481377, + "learning_rate": 4.866947015756155e-06, + "loss": 0.9206, + "step": 3632 + }, + { + "epoch": 0.26246681236115377, + "grad_norm": 8.242715567354997, + "learning_rate": 4.866852847452056e-06, + "loss": 0.9611, + "step": 3633 + }, + { + "epoch": 0.26253905756136325, + "grad_norm": 7.194810416962712, + "learning_rate": 4.86675864674758e-06, + "loss": 0.9555, + "step": 3634 + }, + { + "epoch": 0.2626113027615728, + "grad_norm": 6.249729608408884, + "learning_rate": 4.866664413644015e-06, + "loss": 0.9303, + "step": 3635 + }, + { + "epoch": 0.2626835479617823, + "grad_norm": 6.9006747703668045, + "learning_rate": 4.866570148142654e-06, + "loss": 0.9775, + "step": 3636 + }, + { + "epoch": 0.2627557931619918, + "grad_norm": 6.800283134400201, + "learning_rate": 4.8664758502447825e-06, + "loss": 0.9244, + "step": 3637 + }, + { + "epoch": 0.2628280383622013, + "grad_norm": 6.543029967064368, + "learning_rate": 4.866381519951696e-06, + "loss": 0.8834, + "step": 3638 + }, + { + "epoch": 0.2629002835624108, + "grad_norm": 6.069927267176156, + "learning_rate": 4.866287157264683e-06, + "loss": 0.9611, + "step": 3639 + }, + { + "epoch": 0.26297252876262034, + "grad_norm": 6.151972645057609, + "learning_rate": 4.866192762185036e-06, + "loss": 0.9002, + "step": 3640 + }, + { + "epoch": 0.2630447739628298, + "grad_norm": 8.69666887725104, + "learning_rate": 4.866098334714048e-06, + "loss": 0.9205, + "step": 3641 + }, + { + "epoch": 0.26311701916303937, + "grad_norm": 6.455445396605465, + "learning_rate": 4.86600387485301e-06, + "loss": 0.9696, + "step": 3642 + }, + { + "epoch": 0.26318926436324885, + "grad_norm": 6.330729836187062, + "learning_rate": 4.865909382603217e-06, + "loss": 0.9144, + "step": 3643 + }, + { + "epoch": 0.2632615095634584, + "grad_norm": 6.882164845646062, + "learning_rate": 4.8658148579659615e-06, + "loss": 0.9891, + "step": 3644 + }, + { + "epoch": 0.2633337547636679, + "grad_norm": 6.238971593181631, + "learning_rate": 4.865720300942537e-06, + "loss": 1.0056, + "step": 3645 + }, + { + "epoch": 0.2634059999638774, + "grad_norm": 6.906088986828518, + "learning_rate": 4.865625711534238e-06, + "loss": 1.0139, + "step": 3646 + }, + { + "epoch": 0.2634782451640869, + "grad_norm": 6.566405959529291, + "learning_rate": 4.8655310897423615e-06, + "loss": 0.9065, + "step": 3647 + }, + { + "epoch": 0.2635504903642964, + "grad_norm": 5.484020080384957, + "learning_rate": 4.865436435568199e-06, + "loss": 0.9129, + "step": 3648 + }, + { + "epoch": 0.26362273556450594, + "grad_norm": 5.483192251104573, + "learning_rate": 4.86534174901305e-06, + "loss": 0.8202, + "step": 3649 + }, + { + "epoch": 0.2636949807647154, + "grad_norm": 8.641609011696557, + "learning_rate": 4.865247030078208e-06, + "loss": 0.8627, + "step": 3650 + }, + { + "epoch": 0.26376722596492497, + "grad_norm": 7.660845689030851, + "learning_rate": 4.865152278764971e-06, + "loss": 0.8962, + "step": 3651 + }, + { + "epoch": 0.26383947116513445, + "grad_norm": 6.786438534742641, + "learning_rate": 4.865057495074636e-06, + "loss": 0.962, + "step": 3652 + }, + { + "epoch": 0.263911716365344, + "grad_norm": 6.554126738248612, + "learning_rate": 4.864962679008501e-06, + "loss": 0.918, + "step": 3653 + }, + { + "epoch": 0.2639839615655535, + "grad_norm": 5.789667713071554, + "learning_rate": 4.864867830567861e-06, + "loss": 0.8401, + "step": 3654 + }, + { + "epoch": 0.264056206765763, + "grad_norm": 5.984709010750596, + "learning_rate": 4.864772949754019e-06, + "loss": 0.8858, + "step": 3655 + }, + { + "epoch": 0.2641284519659725, + "grad_norm": 8.768434125413656, + "learning_rate": 4.864678036568269e-06, + "loss": 0.9294, + "step": 3656 + }, + { + "epoch": 0.264200697166182, + "grad_norm": 8.4247783688335, + "learning_rate": 4.864583091011914e-06, + "loss": 0.9896, + "step": 3657 + }, + { + "epoch": 0.26427294236639154, + "grad_norm": 5.5731132496419775, + "learning_rate": 4.8644881130862535e-06, + "loss": 0.839, + "step": 3658 + }, + { + "epoch": 0.264345187566601, + "grad_norm": 5.751706740796935, + "learning_rate": 4.864393102792585e-06, + "loss": 0.8733, + "step": 3659 + }, + { + "epoch": 0.26441743276681057, + "grad_norm": 7.063864247038298, + "learning_rate": 4.864298060132211e-06, + "loss": 0.9528, + "step": 3660 + }, + { + "epoch": 0.26448967796702005, + "grad_norm": 6.235132768895299, + "learning_rate": 4.864202985106433e-06, + "loss": 0.9816, + "step": 3661 + }, + { + "epoch": 0.2645619231672296, + "grad_norm": 7.802549082593412, + "learning_rate": 4.864107877716552e-06, + "loss": 0.955, + "step": 3662 + }, + { + "epoch": 0.2646341683674391, + "grad_norm": 8.411226182377531, + "learning_rate": 4.864012737963869e-06, + "loss": 0.9243, + "step": 3663 + }, + { + "epoch": 0.2647064135676486, + "grad_norm": 7.759345296003228, + "learning_rate": 4.863917565849687e-06, + "loss": 0.9572, + "step": 3664 + }, + { + "epoch": 0.2647786587678581, + "grad_norm": 7.80660519795877, + "learning_rate": 4.863822361375309e-06, + "loss": 0.9651, + "step": 3665 + }, + { + "epoch": 0.2648509039680676, + "grad_norm": 7.710229533894048, + "learning_rate": 4.8637271245420395e-06, + "loss": 1.0021, + "step": 3666 + }, + { + "epoch": 0.26492314916827714, + "grad_norm": 7.656895948914923, + "learning_rate": 4.863631855351179e-06, + "loss": 0.8723, + "step": 3667 + }, + { + "epoch": 0.2649953943684866, + "grad_norm": 7.216546585223379, + "learning_rate": 4.863536553804036e-06, + "loss": 0.9939, + "step": 3668 + }, + { + "epoch": 0.26506763956869617, + "grad_norm": 6.922609036276758, + "learning_rate": 4.863441219901911e-06, + "loss": 0.9479, + "step": 3669 + }, + { + "epoch": 0.26513988476890565, + "grad_norm": 6.151780728206239, + "learning_rate": 4.8633458536461115e-06, + "loss": 0.8946, + "step": 3670 + }, + { + "epoch": 0.2652121299691152, + "grad_norm": 8.774430093978816, + "learning_rate": 4.863250455037942e-06, + "loss": 0.926, + "step": 3671 + }, + { + "epoch": 0.2652843751693247, + "grad_norm": 8.838599980165128, + "learning_rate": 4.863155024078709e-06, + "loss": 1.0207, + "step": 3672 + }, + { + "epoch": 0.2653566203695342, + "grad_norm": 6.7439409541829365, + "learning_rate": 4.863059560769718e-06, + "loss": 0.8994, + "step": 3673 + }, + { + "epoch": 0.2654288655697437, + "grad_norm": 7.417405109576415, + "learning_rate": 4.862964065112277e-06, + "loss": 0.9985, + "step": 3674 + }, + { + "epoch": 0.2655011107699532, + "grad_norm": 7.136880706803535, + "learning_rate": 4.862868537107692e-06, + "loss": 1.0391, + "step": 3675 + }, + { + "epoch": 0.26557335597016274, + "grad_norm": 9.964382446569577, + "learning_rate": 4.8627729767572725e-06, + "loss": 0.9071, + "step": 3676 + }, + { + "epoch": 0.2656456011703722, + "grad_norm": 7.461455673678138, + "learning_rate": 4.862677384062325e-06, + "loss": 0.9111, + "step": 3677 + }, + { + "epoch": 0.26571784637058177, + "grad_norm": 6.0421483691881654, + "learning_rate": 4.8625817590241585e-06, + "loss": 0.9513, + "step": 3678 + }, + { + "epoch": 0.26579009157079125, + "grad_norm": 7.310411464732434, + "learning_rate": 4.862486101644081e-06, + "loss": 0.8926, + "step": 3679 + }, + { + "epoch": 0.2658623367710008, + "grad_norm": 8.941373399213353, + "learning_rate": 4.862390411923405e-06, + "loss": 0.9703, + "step": 3680 + }, + { + "epoch": 0.2659345819712103, + "grad_norm": 6.923364758825021, + "learning_rate": 4.862294689863438e-06, + "loss": 0.9966, + "step": 3681 + }, + { + "epoch": 0.2660068271714198, + "grad_norm": 7.047224691642248, + "learning_rate": 4.862198935465491e-06, + "loss": 0.9573, + "step": 3682 + }, + { + "epoch": 0.2660790723716293, + "grad_norm": 6.501682650577338, + "learning_rate": 4.862103148730874e-06, + "loss": 0.9911, + "step": 3683 + }, + { + "epoch": 0.2661513175718388, + "grad_norm": 6.15663571771107, + "learning_rate": 4.862007329660899e-06, + "loss": 0.9764, + "step": 3684 + }, + { + "epoch": 0.26622356277204834, + "grad_norm": 6.942634126152633, + "learning_rate": 4.861911478256878e-06, + "loss": 0.8904, + "step": 3685 + }, + { + "epoch": 0.2662958079722578, + "grad_norm": 7.088140075952797, + "learning_rate": 4.8618155945201225e-06, + "loss": 0.9096, + "step": 3686 + }, + { + "epoch": 0.26636805317246737, + "grad_norm": 5.9308469741933845, + "learning_rate": 4.861719678451946e-06, + "loss": 0.9705, + "step": 3687 + }, + { + "epoch": 0.26644029837267685, + "grad_norm": 6.726687546474891, + "learning_rate": 4.861623730053661e-06, + "loss": 0.9989, + "step": 3688 + }, + { + "epoch": 0.2665125435728864, + "grad_norm": 5.684742017812421, + "learning_rate": 4.86152774932658e-06, + "loss": 0.8825, + "step": 3689 + }, + { + "epoch": 0.2665847887730959, + "grad_norm": 7.917117644732653, + "learning_rate": 4.8614317362720175e-06, + "loss": 0.919, + "step": 3690 + }, + { + "epoch": 0.2666570339733054, + "grad_norm": 8.327498364089077, + "learning_rate": 4.861335690891289e-06, + "loss": 0.9007, + "step": 3691 + }, + { + "epoch": 0.2667292791735149, + "grad_norm": 6.563919640439131, + "learning_rate": 4.8612396131857075e-06, + "loss": 0.9453, + "step": 3692 + }, + { + "epoch": 0.2668015243737244, + "grad_norm": 8.288439354227727, + "learning_rate": 4.86114350315659e-06, + "loss": 0.971, + "step": 3693 + }, + { + "epoch": 0.26687376957393394, + "grad_norm": 6.053644063315294, + "learning_rate": 4.861047360805251e-06, + "loss": 0.903, + "step": 3694 + }, + { + "epoch": 0.2669460147741434, + "grad_norm": 6.502854820699275, + "learning_rate": 4.8609511861330065e-06, + "loss": 0.8859, + "step": 3695 + }, + { + "epoch": 0.26701825997435297, + "grad_norm": 9.107498734730903, + "learning_rate": 4.860854979141173e-06, + "loss": 0.9931, + "step": 3696 + }, + { + "epoch": 0.26709050517456245, + "grad_norm": 6.586554628466942, + "learning_rate": 4.8607587398310685e-06, + "loss": 0.9529, + "step": 3697 + }, + { + "epoch": 0.267162750374772, + "grad_norm": 7.454503985456675, + "learning_rate": 4.86066246820401e-06, + "loss": 0.9427, + "step": 3698 + }, + { + "epoch": 0.2672349955749815, + "grad_norm": 6.555409115522463, + "learning_rate": 4.860566164261315e-06, + "loss": 0.9883, + "step": 3699 + }, + { + "epoch": 0.267307240775191, + "grad_norm": 5.166976960411555, + "learning_rate": 4.860469828004303e-06, + "loss": 0.9314, + "step": 3700 + }, + { + "epoch": 0.2673794859754005, + "grad_norm": 6.355267672180188, + "learning_rate": 4.86037345943429e-06, + "loss": 0.9308, + "step": 3701 + }, + { + "epoch": 0.26745173117561, + "grad_norm": 7.237685432759839, + "learning_rate": 4.860277058552599e-06, + "loss": 0.9113, + "step": 3702 + }, + { + "epoch": 0.26752397637581954, + "grad_norm": 5.7571718022519125, + "learning_rate": 4.860180625360546e-06, + "loss": 0.9178, + "step": 3703 + }, + { + "epoch": 0.267596221576029, + "grad_norm": 7.745799249003254, + "learning_rate": 4.860084159859453e-06, + "loss": 0.9812, + "step": 3704 + }, + { + "epoch": 0.26766846677623857, + "grad_norm": 5.845919772338579, + "learning_rate": 4.859987662050641e-06, + "loss": 0.8664, + "step": 3705 + }, + { + "epoch": 0.26774071197644805, + "grad_norm": 7.258440826122257, + "learning_rate": 4.85989113193543e-06, + "loss": 0.9628, + "step": 3706 + }, + { + "epoch": 0.2678129571766576, + "grad_norm": 6.749901099716723, + "learning_rate": 4.859794569515143e-06, + "loss": 1.1135, + "step": 3707 + }, + { + "epoch": 0.2678852023768671, + "grad_norm": 6.629061911199032, + "learning_rate": 4.859697974791099e-06, + "loss": 0.8898, + "step": 3708 + }, + { + "epoch": 0.2679574475770766, + "grad_norm": 6.3258887590052195, + "learning_rate": 4.859601347764622e-06, + "loss": 0.9103, + "step": 3709 + }, + { + "epoch": 0.2680296927772861, + "grad_norm": 7.037051371881411, + "learning_rate": 4.859504688437036e-06, + "loss": 0.953, + "step": 3710 + }, + { + "epoch": 0.2681019379774956, + "grad_norm": 7.617285906816262, + "learning_rate": 4.859407996809661e-06, + "loss": 0.9497, + "step": 3711 + }, + { + "epoch": 0.26817418317770514, + "grad_norm": 6.644020845880022, + "learning_rate": 4.859311272883823e-06, + "loss": 0.8473, + "step": 3712 + }, + { + "epoch": 0.2682464283779146, + "grad_norm": 7.422149203405131, + "learning_rate": 4.859214516660846e-06, + "loss": 0.9505, + "step": 3713 + }, + { + "epoch": 0.26831867357812417, + "grad_norm": 6.418571420653606, + "learning_rate": 4.859117728142055e-06, + "loss": 0.8984, + "step": 3714 + }, + { + "epoch": 0.26839091877833365, + "grad_norm": 6.273041633096851, + "learning_rate": 4.859020907328773e-06, + "loss": 0.9082, + "step": 3715 + }, + { + "epoch": 0.2684631639785432, + "grad_norm": 11.375131459576906, + "learning_rate": 4.858924054222326e-06, + "loss": 0.9287, + "step": 3716 + }, + { + "epoch": 0.2685354091787527, + "grad_norm": 6.63445629154521, + "learning_rate": 4.858827168824042e-06, + "loss": 0.9456, + "step": 3717 + }, + { + "epoch": 0.2686076543789622, + "grad_norm": 8.699398537376556, + "learning_rate": 4.858730251135244e-06, + "loss": 0.9166, + "step": 3718 + }, + { + "epoch": 0.2686798995791717, + "grad_norm": 6.794906436368636, + "learning_rate": 4.858633301157261e-06, + "loss": 0.9543, + "step": 3719 + }, + { + "epoch": 0.2687521447793812, + "grad_norm": 7.591844610089186, + "learning_rate": 4.858536318891419e-06, + "loss": 0.8879, + "step": 3720 + }, + { + "epoch": 0.26882438997959074, + "grad_norm": 6.663353796210776, + "learning_rate": 4.858439304339046e-06, + "loss": 0.9072, + "step": 3721 + }, + { + "epoch": 0.2688966351798002, + "grad_norm": 7.497470937928862, + "learning_rate": 4.858342257501471e-06, + "loss": 0.9669, + "step": 3722 + }, + { + "epoch": 0.26896888038000977, + "grad_norm": 7.341500185212994, + "learning_rate": 4.85824517838002e-06, + "loss": 1.0237, + "step": 3723 + }, + { + "epoch": 0.26904112558021925, + "grad_norm": 6.2609111910049, + "learning_rate": 4.858148066976025e-06, + "loss": 0.8525, + "step": 3724 + }, + { + "epoch": 0.2691133707804288, + "grad_norm": 7.257708693502125, + "learning_rate": 4.858050923290814e-06, + "loss": 0.8885, + "step": 3725 + }, + { + "epoch": 0.2691856159806383, + "grad_norm": 6.181537259700568, + "learning_rate": 4.857953747325716e-06, + "loss": 0.9375, + "step": 3726 + }, + { + "epoch": 0.2692578611808478, + "grad_norm": 6.322377248396755, + "learning_rate": 4.857856539082062e-06, + "loss": 0.8079, + "step": 3727 + }, + { + "epoch": 0.2693301063810573, + "grad_norm": 6.692025053024139, + "learning_rate": 4.857759298561183e-06, + "loss": 0.8941, + "step": 3728 + }, + { + "epoch": 0.2694023515812668, + "grad_norm": 6.642792902107005, + "learning_rate": 4.85766202576441e-06, + "loss": 0.8747, + "step": 3729 + }, + { + "epoch": 0.26947459678147634, + "grad_norm": 9.283283871348283, + "learning_rate": 4.857564720693074e-06, + "loss": 0.8887, + "step": 3730 + }, + { + "epoch": 0.2695468419816858, + "grad_norm": 6.441459549127579, + "learning_rate": 4.857467383348509e-06, + "loss": 0.9455, + "step": 3731 + }, + { + "epoch": 0.26961908718189537, + "grad_norm": 7.422504342149523, + "learning_rate": 4.857370013732045e-06, + "loss": 0.9513, + "step": 3732 + }, + { + "epoch": 0.26969133238210485, + "grad_norm": 7.202888534537155, + "learning_rate": 4.857272611845015e-06, + "loss": 1.0435, + "step": 3733 + }, + { + "epoch": 0.2697635775823144, + "grad_norm": 8.05290325245852, + "learning_rate": 4.857175177688755e-06, + "loss": 0.8898, + "step": 3734 + }, + { + "epoch": 0.2698358227825239, + "grad_norm": 7.294905402842849, + "learning_rate": 4.857077711264596e-06, + "loss": 0.8357, + "step": 3735 + }, + { + "epoch": 0.2699080679827334, + "grad_norm": 8.494653254320204, + "learning_rate": 4.856980212573873e-06, + "loss": 0.9694, + "step": 3736 + }, + { + "epoch": 0.2699803131829429, + "grad_norm": 7.451975095960168, + "learning_rate": 4.856882681617922e-06, + "loss": 0.9619, + "step": 3737 + }, + { + "epoch": 0.2700525583831524, + "grad_norm": 7.400950664593441, + "learning_rate": 4.856785118398075e-06, + "loss": 0.8767, + "step": 3738 + }, + { + "epoch": 0.27012480358336194, + "grad_norm": 7.545446920234362, + "learning_rate": 4.856687522915672e-06, + "loss": 0.8873, + "step": 3739 + }, + { + "epoch": 0.2701970487835714, + "grad_norm": 7.464803634687477, + "learning_rate": 4.856589895172046e-06, + "loss": 0.9067, + "step": 3740 + }, + { + "epoch": 0.27026929398378097, + "grad_norm": 6.87152538928127, + "learning_rate": 4.856492235168533e-06, + "loss": 0.9262, + "step": 3741 + }, + { + "epoch": 0.27034153918399045, + "grad_norm": 5.93606646448646, + "learning_rate": 4.8563945429064715e-06, + "loss": 0.8624, + "step": 3742 + }, + { + "epoch": 0.2704137843842, + "grad_norm": 6.610485659949682, + "learning_rate": 4.8562968183872e-06, + "loss": 0.8865, + "step": 3743 + }, + { + "epoch": 0.2704860295844095, + "grad_norm": 10.11179272190551, + "learning_rate": 4.8561990616120525e-06, + "loss": 0.9726, + "step": 3744 + }, + { + "epoch": 0.270558274784619, + "grad_norm": 9.579938558170419, + "learning_rate": 4.85610127258237e-06, + "loss": 0.9339, + "step": 3745 + }, + { + "epoch": 0.2706305199848285, + "grad_norm": 7.488506665963625, + "learning_rate": 4.856003451299491e-06, + "loss": 0.9676, + "step": 3746 + }, + { + "epoch": 0.270702765185038, + "grad_norm": 6.208503908549201, + "learning_rate": 4.855905597764753e-06, + "loss": 0.8631, + "step": 3747 + }, + { + "epoch": 0.27077501038524754, + "grad_norm": 8.297027184191627, + "learning_rate": 4.855807711979498e-06, + "loss": 0.973, + "step": 3748 + }, + { + "epoch": 0.270847255585457, + "grad_norm": 7.6270561572746205, + "learning_rate": 4.855709793945064e-06, + "loss": 0.9455, + "step": 3749 + }, + { + "epoch": 0.27091950078566657, + "grad_norm": 8.398012798127873, + "learning_rate": 4.855611843662792e-06, + "loss": 0.9938, + "step": 3750 + }, + { + "epoch": 0.27099174598587605, + "grad_norm": 7.17430721753074, + "learning_rate": 4.855513861134022e-06, + "loss": 0.895, + "step": 3751 + }, + { + "epoch": 0.2710639911860856, + "grad_norm": 7.111018332564909, + "learning_rate": 4.855415846360098e-06, + "loss": 0.9865, + "step": 3752 + }, + { + "epoch": 0.2711362363862951, + "grad_norm": 5.774749895765407, + "learning_rate": 4.855317799342359e-06, + "loss": 0.845, + "step": 3753 + }, + { + "epoch": 0.2712084815865046, + "grad_norm": 6.863793412076307, + "learning_rate": 4.855219720082147e-06, + "loss": 0.9082, + "step": 3754 + }, + { + "epoch": 0.2712807267867141, + "grad_norm": 8.109965808198963, + "learning_rate": 4.855121608580807e-06, + "loss": 0.9025, + "step": 3755 + }, + { + "epoch": 0.2713529719869236, + "grad_norm": 8.11494645435392, + "learning_rate": 4.8550234648396795e-06, + "loss": 0.9945, + "step": 3756 + }, + { + "epoch": 0.27142521718713314, + "grad_norm": 7.448994449625189, + "learning_rate": 4.854925288860111e-06, + "loss": 0.9118, + "step": 3757 + }, + { + "epoch": 0.2714974623873426, + "grad_norm": 7.085837097466055, + "learning_rate": 4.854827080643443e-06, + "loss": 0.8763, + "step": 3758 + }, + { + "epoch": 0.27156970758755217, + "grad_norm": 6.834068941049368, + "learning_rate": 4.854728840191021e-06, + "loss": 0.8374, + "step": 3759 + }, + { + "epoch": 0.27164195278776165, + "grad_norm": 8.984575829956485, + "learning_rate": 4.854630567504189e-06, + "loss": 0.9267, + "step": 3760 + }, + { + "epoch": 0.2717141979879712, + "grad_norm": 8.320656352930019, + "learning_rate": 4.854532262584294e-06, + "loss": 0.938, + "step": 3761 + }, + { + "epoch": 0.2717864431881807, + "grad_norm": 7.908284645480052, + "learning_rate": 4.854433925432679e-06, + "loss": 1.048, + "step": 3762 + }, + { + "epoch": 0.2718586883883902, + "grad_norm": 5.803484087276326, + "learning_rate": 4.8543355560506924e-06, + "loss": 0.8115, + "step": 3763 + }, + { + "epoch": 0.2719309335885997, + "grad_norm": 5.661593952911714, + "learning_rate": 4.8542371544396795e-06, + "loss": 0.8758, + "step": 3764 + }, + { + "epoch": 0.2720031787888092, + "grad_norm": 5.385961311980471, + "learning_rate": 4.854138720600988e-06, + "loss": 0.8645, + "step": 3765 + }, + { + "epoch": 0.27207542398901874, + "grad_norm": 6.468036999315304, + "learning_rate": 4.854040254535966e-06, + "loss": 0.8779, + "step": 3766 + }, + { + "epoch": 0.2721476691892282, + "grad_norm": 10.654116598784166, + "learning_rate": 4.85394175624596e-06, + "loss": 0.9939, + "step": 3767 + }, + { + "epoch": 0.27221991438943777, + "grad_norm": 7.39916808117352, + "learning_rate": 4.85384322573232e-06, + "loss": 0.9572, + "step": 3768 + }, + { + "epoch": 0.27229215958964725, + "grad_norm": 8.102994727850334, + "learning_rate": 4.853744662996393e-06, + "loss": 0.8966, + "step": 3769 + }, + { + "epoch": 0.2723644047898568, + "grad_norm": 6.973019510845557, + "learning_rate": 4.85364606803953e-06, + "loss": 0.9677, + "step": 3770 + }, + { + "epoch": 0.2724366499900663, + "grad_norm": 9.869642518593276, + "learning_rate": 4.853547440863079e-06, + "loss": 0.9118, + "step": 3771 + }, + { + "epoch": 0.27250889519027577, + "grad_norm": 8.894771849100943, + "learning_rate": 4.85344878146839e-06, + "loss": 0.9586, + "step": 3772 + }, + { + "epoch": 0.2725811403904853, + "grad_norm": 10.783861634541204, + "learning_rate": 4.853350089856817e-06, + "loss": 0.9409, + "step": 3773 + }, + { + "epoch": 0.2726533855906948, + "grad_norm": 5.572047579965415, + "learning_rate": 4.853251366029707e-06, + "loss": 0.8593, + "step": 3774 + }, + { + "epoch": 0.27272563079090434, + "grad_norm": 5.74637854600991, + "learning_rate": 4.853152609988413e-06, + "loss": 0.8472, + "step": 3775 + }, + { + "epoch": 0.2727978759911138, + "grad_norm": 8.062916826222507, + "learning_rate": 4.853053821734287e-06, + "loss": 0.8866, + "step": 3776 + }, + { + "epoch": 0.27287012119132337, + "grad_norm": 9.903281744898893, + "learning_rate": 4.852955001268681e-06, + "loss": 0.9421, + "step": 3777 + }, + { + "epoch": 0.27294236639153285, + "grad_norm": 6.37858540556784, + "learning_rate": 4.852856148592948e-06, + "loss": 0.942, + "step": 3778 + }, + { + "epoch": 0.2730146115917424, + "grad_norm": 8.815106378477074, + "learning_rate": 4.852757263708442e-06, + "loss": 0.9444, + "step": 3779 + }, + { + "epoch": 0.2730868567919519, + "grad_norm": 6.56395131366348, + "learning_rate": 4.852658346616515e-06, + "loss": 0.8775, + "step": 3780 + }, + { + "epoch": 0.27315910199216137, + "grad_norm": 6.807337493469822, + "learning_rate": 4.852559397318522e-06, + "loss": 0.9536, + "step": 3781 + }, + { + "epoch": 0.2732313471923709, + "grad_norm": 9.494181005301394, + "learning_rate": 4.852460415815818e-06, + "loss": 0.9732, + "step": 3782 + }, + { + "epoch": 0.2733035923925804, + "grad_norm": 7.793431563246441, + "learning_rate": 4.852361402109757e-06, + "loss": 0.9377, + "step": 3783 + }, + { + "epoch": 0.27337583759278994, + "grad_norm": 7.449091493652803, + "learning_rate": 4.852262356201695e-06, + "loss": 0.8968, + "step": 3784 + }, + { + "epoch": 0.2734480827929994, + "grad_norm": 8.00227943371474, + "learning_rate": 4.852163278092988e-06, + "loss": 0.9955, + "step": 3785 + }, + { + "epoch": 0.27352032799320897, + "grad_norm": 6.686282323337315, + "learning_rate": 4.852064167784992e-06, + "loss": 0.8666, + "step": 3786 + }, + { + "epoch": 0.27359257319341845, + "grad_norm": 6.511333854664076, + "learning_rate": 4.851965025279064e-06, + "loss": 0.8784, + "step": 3787 + }, + { + "epoch": 0.273664818393628, + "grad_norm": 7.4157627997955835, + "learning_rate": 4.851865850576561e-06, + "loss": 0.973, + "step": 3788 + }, + { + "epoch": 0.2737370635938375, + "grad_norm": 7.437385365860926, + "learning_rate": 4.85176664367884e-06, + "loss": 0.9201, + "step": 3789 + }, + { + "epoch": 0.27380930879404697, + "grad_norm": 7.01006111804042, + "learning_rate": 4.851667404587259e-06, + "loss": 0.8216, + "step": 3790 + }, + { + "epoch": 0.2738815539942565, + "grad_norm": 7.179019070829659, + "learning_rate": 4.851568133303179e-06, + "loss": 0.9103, + "step": 3791 + }, + { + "epoch": 0.273953799194466, + "grad_norm": 7.658769714519367, + "learning_rate": 4.8514688298279564e-06, + "loss": 0.9526, + "step": 3792 + }, + { + "epoch": 0.27402604439467554, + "grad_norm": 7.125668143014731, + "learning_rate": 4.851369494162952e-06, + "loss": 0.9116, + "step": 3793 + }, + { + "epoch": 0.274098289594885, + "grad_norm": 6.749317275947712, + "learning_rate": 4.8512701263095235e-06, + "loss": 0.9065, + "step": 3794 + }, + { + "epoch": 0.27417053479509457, + "grad_norm": 8.06478589203551, + "learning_rate": 4.851170726269033e-06, + "loss": 0.8962, + "step": 3795 + }, + { + "epoch": 0.27424277999530405, + "grad_norm": 7.574635502572132, + "learning_rate": 4.8510712940428415e-06, + "loss": 0.9079, + "step": 3796 + }, + { + "epoch": 0.2743150251955136, + "grad_norm": 6.6569024424407655, + "learning_rate": 4.850971829632309e-06, + "loss": 0.882, + "step": 3797 + }, + { + "epoch": 0.2743872703957231, + "grad_norm": 7.5958480134516275, + "learning_rate": 4.850872333038797e-06, + "loss": 0.8896, + "step": 3798 + }, + { + "epoch": 0.27445951559593257, + "grad_norm": 7.2825858401089025, + "learning_rate": 4.850772804263669e-06, + "loss": 0.8768, + "step": 3799 + }, + { + "epoch": 0.2745317607961421, + "grad_norm": 6.711635211384284, + "learning_rate": 4.850673243308287e-06, + "loss": 0.9992, + "step": 3800 + }, + { + "epoch": 0.2746040059963516, + "grad_norm": 6.099957712996197, + "learning_rate": 4.8505736501740124e-06, + "loss": 0.9036, + "step": 3801 + }, + { + "epoch": 0.27467625119656114, + "grad_norm": 7.272515423897038, + "learning_rate": 4.85047402486221e-06, + "loss": 0.9075, + "step": 3802 + }, + { + "epoch": 0.2747484963967706, + "grad_norm": 7.260650441270553, + "learning_rate": 4.850374367374243e-06, + "loss": 0.8421, + "step": 3803 + }, + { + "epoch": 0.27482074159698017, + "grad_norm": 7.605156012846569, + "learning_rate": 4.850274677711476e-06, + "loss": 1.0071, + "step": 3804 + }, + { + "epoch": 0.27489298679718965, + "grad_norm": 6.168595845654929, + "learning_rate": 4.850174955875274e-06, + "loss": 0.9223, + "step": 3805 + }, + { + "epoch": 0.2749652319973992, + "grad_norm": 6.319438180699114, + "learning_rate": 4.850075201867001e-06, + "loss": 0.9271, + "step": 3806 + }, + { + "epoch": 0.2750374771976087, + "grad_norm": 7.820741746461377, + "learning_rate": 4.849975415688024e-06, + "loss": 1.0551, + "step": 3807 + }, + { + "epoch": 0.27510972239781817, + "grad_norm": 8.861054976022096, + "learning_rate": 4.849875597339708e-06, + "loss": 1.0196, + "step": 3808 + }, + { + "epoch": 0.2751819675980277, + "grad_norm": 6.823040848007157, + "learning_rate": 4.849775746823419e-06, + "loss": 1.0459, + "step": 3809 + }, + { + "epoch": 0.2752542127982372, + "grad_norm": 7.1524584191722695, + "learning_rate": 4.849675864140525e-06, + "loss": 0.9335, + "step": 3810 + }, + { + "epoch": 0.27532645799844674, + "grad_norm": 7.503393041485283, + "learning_rate": 4.849575949292392e-06, + "loss": 0.8488, + "step": 3811 + }, + { + "epoch": 0.2753987031986562, + "grad_norm": 7.243332789784303, + "learning_rate": 4.84947600228039e-06, + "loss": 0.9365, + "step": 3812 + }, + { + "epoch": 0.27547094839886577, + "grad_norm": 6.29924689741634, + "learning_rate": 4.849376023105885e-06, + "loss": 0.9296, + "step": 3813 + }, + { + "epoch": 0.27554319359907525, + "grad_norm": 5.846698853343607, + "learning_rate": 4.849276011770247e-06, + "loss": 0.9548, + "step": 3814 + }, + { + "epoch": 0.2756154387992848, + "grad_norm": 7.1651414830360345, + "learning_rate": 4.849175968274843e-06, + "loss": 0.9619, + "step": 3815 + }, + { + "epoch": 0.2756876839994943, + "grad_norm": 6.789235559503137, + "learning_rate": 4.8490758926210455e-06, + "loss": 1.0991, + "step": 3816 + }, + { + "epoch": 0.27575992919970377, + "grad_norm": 6.349555004477307, + "learning_rate": 4.848975784810222e-06, + "loss": 0.9523, + "step": 3817 + }, + { + "epoch": 0.2758321743999133, + "grad_norm": 8.244691007525407, + "learning_rate": 4.848875644843744e-06, + "loss": 0.9373, + "step": 3818 + }, + { + "epoch": 0.2759044196001228, + "grad_norm": 7.216369764932263, + "learning_rate": 4.848775472722983e-06, + "loss": 0.9854, + "step": 3819 + }, + { + "epoch": 0.27597666480033234, + "grad_norm": 7.459526208144858, + "learning_rate": 4.848675268449309e-06, + "loss": 0.9155, + "step": 3820 + }, + { + "epoch": 0.2760489100005418, + "grad_norm": 21.236608257286225, + "learning_rate": 4.8485750320240935e-06, + "loss": 0.8735, + "step": 3821 + }, + { + "epoch": 0.27612115520075137, + "grad_norm": 6.325321885916101, + "learning_rate": 4.848474763448711e-06, + "loss": 0.9107, + "step": 3822 + }, + { + "epoch": 0.27619340040096085, + "grad_norm": 6.533330120682737, + "learning_rate": 4.848374462724531e-06, + "loss": 0.929, + "step": 3823 + }, + { + "epoch": 0.2762656456011704, + "grad_norm": 7.529818703544724, + "learning_rate": 4.848274129852929e-06, + "loss": 0.8528, + "step": 3824 + }, + { + "epoch": 0.2763378908013799, + "grad_norm": 7.095456359251667, + "learning_rate": 4.848173764835277e-06, + "loss": 0.9204, + "step": 3825 + }, + { + "epoch": 0.27641013600158937, + "grad_norm": 7.7383238913041446, + "learning_rate": 4.848073367672949e-06, + "loss": 0.9061, + "step": 3826 + }, + { + "epoch": 0.2764823812017989, + "grad_norm": 6.678452976653872, + "learning_rate": 4.84797293836732e-06, + "loss": 0.93, + "step": 3827 + }, + { + "epoch": 0.2765546264020084, + "grad_norm": 6.22212901878185, + "learning_rate": 4.847872476919765e-06, + "loss": 0.8809, + "step": 3828 + }, + { + "epoch": 0.27662687160221794, + "grad_norm": 7.772081726539787, + "learning_rate": 4.847771983331658e-06, + "loss": 0.848, + "step": 3829 + }, + { + "epoch": 0.2766991168024274, + "grad_norm": 7.521916220805208, + "learning_rate": 4.847671457604376e-06, + "loss": 0.8628, + "step": 3830 + }, + { + "epoch": 0.27677136200263697, + "grad_norm": 6.288243893249238, + "learning_rate": 4.847570899739294e-06, + "loss": 0.8848, + "step": 3831 + }, + { + "epoch": 0.27684360720284645, + "grad_norm": 6.27892173014298, + "learning_rate": 4.847470309737791e-06, + "loss": 1.0144, + "step": 3832 + }, + { + "epoch": 0.276915852403056, + "grad_norm": 7.33897393936414, + "learning_rate": 4.84736968760124e-06, + "loss": 0.9656, + "step": 3833 + }, + { + "epoch": 0.2769880976032655, + "grad_norm": 5.702357075868112, + "learning_rate": 4.847269033331021e-06, + "loss": 0.8992, + "step": 3834 + }, + { + "epoch": 0.27706034280347497, + "grad_norm": 7.633870952391941, + "learning_rate": 4.8471683469285125e-06, + "loss": 0.9166, + "step": 3835 + }, + { + "epoch": 0.2771325880036845, + "grad_norm": 6.785399964588923, + "learning_rate": 4.847067628395091e-06, + "loss": 0.9929, + "step": 3836 + }, + { + "epoch": 0.277204833203894, + "grad_norm": 6.9888612134446495, + "learning_rate": 4.846966877732137e-06, + "loss": 0.9059, + "step": 3837 + }, + { + "epoch": 0.27727707840410354, + "grad_norm": 6.351775105989688, + "learning_rate": 4.8468660949410275e-06, + "loss": 0.9336, + "step": 3838 + }, + { + "epoch": 0.277349323604313, + "grad_norm": 6.197839286598981, + "learning_rate": 4.846765280023144e-06, + "loss": 0.9643, + "step": 3839 + }, + { + "epoch": 0.27742156880452257, + "grad_norm": 7.131180876414415, + "learning_rate": 4.846664432979867e-06, + "loss": 0.9979, + "step": 3840 + }, + { + "epoch": 0.27749381400473205, + "grad_norm": 10.173962544406209, + "learning_rate": 4.846563553812574e-06, + "loss": 0.9121, + "step": 3841 + }, + { + "epoch": 0.2775660592049416, + "grad_norm": 6.6493287973907975, + "learning_rate": 4.846462642522649e-06, + "loss": 0.9136, + "step": 3842 + }, + { + "epoch": 0.2776383044051511, + "grad_norm": 6.344269228577688, + "learning_rate": 4.846361699111471e-06, + "loss": 0.8863, + "step": 3843 + }, + { + "epoch": 0.27771054960536057, + "grad_norm": 6.944671772944748, + "learning_rate": 4.846260723580425e-06, + "loss": 0.8868, + "step": 3844 + }, + { + "epoch": 0.2777827948055701, + "grad_norm": 5.90815437738154, + "learning_rate": 4.84615971593089e-06, + "loss": 0.9266, + "step": 3845 + }, + { + "epoch": 0.2778550400057796, + "grad_norm": 6.153050554637893, + "learning_rate": 4.846058676164251e-06, + "loss": 0.9665, + "step": 3846 + }, + { + "epoch": 0.27792728520598914, + "grad_norm": 6.111403340760372, + "learning_rate": 4.845957604281891e-06, + "loss": 0.8377, + "step": 3847 + }, + { + "epoch": 0.2779995304061986, + "grad_norm": 5.442183658796735, + "learning_rate": 4.845856500285192e-06, + "loss": 0.8627, + "step": 3848 + }, + { + "epoch": 0.27807177560640817, + "grad_norm": 7.309738289757445, + "learning_rate": 4.845755364175539e-06, + "loss": 1.0454, + "step": 3849 + }, + { + "epoch": 0.27814402080661765, + "grad_norm": 6.034856636980389, + "learning_rate": 4.8456541959543165e-06, + "loss": 0.8905, + "step": 3850 + }, + { + "epoch": 0.2782162660068272, + "grad_norm": 6.666992688154552, + "learning_rate": 4.845552995622909e-06, + "loss": 0.9143, + "step": 3851 + }, + { + "epoch": 0.2782885112070367, + "grad_norm": 6.335671077070382, + "learning_rate": 4.8454517631827025e-06, + "loss": 0.9292, + "step": 3852 + }, + { + "epoch": 0.27836075640724617, + "grad_norm": 6.376434763101238, + "learning_rate": 4.845350498635083e-06, + "loss": 0.9752, + "step": 3853 + }, + { + "epoch": 0.2784330016074557, + "grad_norm": 8.250012946841167, + "learning_rate": 4.845249201981436e-06, + "loss": 0.9456, + "step": 3854 + }, + { + "epoch": 0.2785052468076652, + "grad_norm": 7.15073632136888, + "learning_rate": 4.845147873223147e-06, + "loss": 0.9182, + "step": 3855 + }, + { + "epoch": 0.27857749200787474, + "grad_norm": 6.757818991597316, + "learning_rate": 4.845046512361606e-06, + "loss": 0.9025, + "step": 3856 + }, + { + "epoch": 0.2786497372080842, + "grad_norm": 5.92486480687419, + "learning_rate": 4.8449451193981985e-06, + "loss": 0.888, + "step": 3857 + }, + { + "epoch": 0.27872198240829377, + "grad_norm": 6.500655508000051, + "learning_rate": 4.844843694334314e-06, + "loss": 0.9314, + "step": 3858 + }, + { + "epoch": 0.27879422760850325, + "grad_norm": 6.716346204257879, + "learning_rate": 4.84474223717134e-06, + "loss": 0.9315, + "step": 3859 + }, + { + "epoch": 0.2788664728087128, + "grad_norm": 7.248808401115592, + "learning_rate": 4.844640747910664e-06, + "loss": 0.9088, + "step": 3860 + }, + { + "epoch": 0.2789387180089223, + "grad_norm": 6.9338206939091815, + "learning_rate": 4.844539226553677e-06, + "loss": 0.9232, + "step": 3861 + }, + { + "epoch": 0.27901096320913177, + "grad_norm": 7.259400160208497, + "learning_rate": 4.844437673101769e-06, + "loss": 0.8401, + "step": 3862 + }, + { + "epoch": 0.2790832084093413, + "grad_norm": 7.469362880188477, + "learning_rate": 4.844336087556329e-06, + "loss": 0.9084, + "step": 3863 + }, + { + "epoch": 0.2791554536095508, + "grad_norm": 6.0204383518144295, + "learning_rate": 4.844234469918748e-06, + "loss": 0.8566, + "step": 3864 + }, + { + "epoch": 0.27922769880976034, + "grad_norm": 6.194581266449767, + "learning_rate": 4.844132820190418e-06, + "loss": 0.8317, + "step": 3865 + }, + { + "epoch": 0.2792999440099698, + "grad_norm": 7.641839615401486, + "learning_rate": 4.84403113837273e-06, + "loss": 0.8624, + "step": 3866 + }, + { + "epoch": 0.27937218921017937, + "grad_norm": 6.930813978427386, + "learning_rate": 4.843929424467075e-06, + "loss": 0.9179, + "step": 3867 + }, + { + "epoch": 0.27944443441038885, + "grad_norm": 6.913338316152623, + "learning_rate": 4.843827678474846e-06, + "loss": 0.9896, + "step": 3868 + }, + { + "epoch": 0.2795166796105984, + "grad_norm": 5.866564274385651, + "learning_rate": 4.8437259003974366e-06, + "loss": 0.9855, + "step": 3869 + }, + { + "epoch": 0.2795889248108079, + "grad_norm": 7.224714977093846, + "learning_rate": 4.843624090236239e-06, + "loss": 1.015, + "step": 3870 + }, + { + "epoch": 0.27966117001101737, + "grad_norm": 6.260358237841768, + "learning_rate": 4.8435222479926474e-06, + "loss": 0.8875, + "step": 3871 + }, + { + "epoch": 0.2797334152112269, + "grad_norm": 7.946471903332272, + "learning_rate": 4.843420373668056e-06, + "loss": 1.0021, + "step": 3872 + }, + { + "epoch": 0.2798056604114364, + "grad_norm": 10.18191206566764, + "learning_rate": 4.843318467263859e-06, + "loss": 0.9636, + "step": 3873 + }, + { + "epoch": 0.27987790561164594, + "grad_norm": 8.41183206930613, + "learning_rate": 4.843216528781452e-06, + "loss": 0.9618, + "step": 3874 + }, + { + "epoch": 0.2799501508118554, + "grad_norm": 7.626094051730535, + "learning_rate": 4.84311455822223e-06, + "loss": 0.9882, + "step": 3875 + }, + { + "epoch": 0.28002239601206497, + "grad_norm": 10.874273166264091, + "learning_rate": 4.843012555587588e-06, + "loss": 0.8766, + "step": 3876 + }, + { + "epoch": 0.28009464121227445, + "grad_norm": 6.96393203116722, + "learning_rate": 4.842910520878925e-06, + "loss": 0.9655, + "step": 3877 + }, + { + "epoch": 0.280166886412484, + "grad_norm": 6.579273694427934, + "learning_rate": 4.842808454097635e-06, + "loss": 0.9288, + "step": 3878 + }, + { + "epoch": 0.2802391316126935, + "grad_norm": 6.515148703458259, + "learning_rate": 4.842706355245117e-06, + "loss": 0.9023, + "step": 3879 + }, + { + "epoch": 0.28031137681290297, + "grad_norm": 8.112758393720524, + "learning_rate": 4.842604224322768e-06, + "loss": 0.9429, + "step": 3880 + }, + { + "epoch": 0.2803836220131125, + "grad_norm": 6.376322889511524, + "learning_rate": 4.842502061331986e-06, + "loss": 0.8521, + "step": 3881 + }, + { + "epoch": 0.280455867213322, + "grad_norm": 6.213650707003297, + "learning_rate": 4.842399866274169e-06, + "loss": 0.8794, + "step": 3882 + }, + { + "epoch": 0.28052811241353154, + "grad_norm": 6.876904033213729, + "learning_rate": 4.8422976391507175e-06, + "loss": 0.8596, + "step": 3883 + }, + { + "epoch": 0.280600357613741, + "grad_norm": 6.474289738056576, + "learning_rate": 4.842195379963029e-06, + "loss": 0.9859, + "step": 3884 + }, + { + "epoch": 0.28067260281395057, + "grad_norm": 6.835783759431898, + "learning_rate": 4.842093088712505e-06, + "loss": 0.8786, + "step": 3885 + }, + { + "epoch": 0.28074484801416005, + "grad_norm": 6.826530916433862, + "learning_rate": 4.841990765400545e-06, + "loss": 0.8285, + "step": 3886 + }, + { + "epoch": 0.2808170932143696, + "grad_norm": 6.273259028536231, + "learning_rate": 4.84188841002855e-06, + "loss": 0.9921, + "step": 3887 + }, + { + "epoch": 0.2808893384145791, + "grad_norm": 7.558601067906665, + "learning_rate": 4.841786022597921e-06, + "loss": 0.9485, + "step": 3888 + }, + { + "epoch": 0.28096158361478857, + "grad_norm": 7.090776668992703, + "learning_rate": 4.841683603110059e-06, + "loss": 0.9286, + "step": 3889 + }, + { + "epoch": 0.2810338288149981, + "grad_norm": 8.346689228031432, + "learning_rate": 4.841581151566367e-06, + "loss": 0.9544, + "step": 3890 + }, + { + "epoch": 0.2811060740152076, + "grad_norm": 7.058873688762694, + "learning_rate": 4.8414786679682475e-06, + "loss": 1.0053, + "step": 3891 + }, + { + "epoch": 0.28117831921541714, + "grad_norm": 6.4627873310724, + "learning_rate": 4.8413761523171035e-06, + "loss": 0.8743, + "step": 3892 + }, + { + "epoch": 0.2812505644156266, + "grad_norm": 8.448912185983758, + "learning_rate": 4.841273604614337e-06, + "loss": 0.9126, + "step": 3893 + }, + { + "epoch": 0.28132280961583617, + "grad_norm": 7.885388500679001, + "learning_rate": 4.841171024861353e-06, + "loss": 0.9243, + "step": 3894 + }, + { + "epoch": 0.28139505481604565, + "grad_norm": 5.856167090611748, + "learning_rate": 4.8410684130595555e-06, + "loss": 0.8908, + "step": 3895 + }, + { + "epoch": 0.2814673000162552, + "grad_norm": 7.2982226154541605, + "learning_rate": 4.840965769210349e-06, + "loss": 0.9774, + "step": 3896 + }, + { + "epoch": 0.2815395452164647, + "grad_norm": 6.325744331801465, + "learning_rate": 4.840863093315139e-06, + "loss": 0.9247, + "step": 3897 + }, + { + "epoch": 0.28161179041667417, + "grad_norm": 6.41638483526729, + "learning_rate": 4.8407603853753305e-06, + "loss": 0.9083, + "step": 3898 + }, + { + "epoch": 0.2816840356168837, + "grad_norm": 6.701558390193861, + "learning_rate": 4.84065764539233e-06, + "loss": 0.9781, + "step": 3899 + }, + { + "epoch": 0.2817562808170932, + "grad_norm": 7.68270327397507, + "learning_rate": 4.8405548733675445e-06, + "loss": 0.9103, + "step": 3900 + }, + { + "epoch": 0.28182852601730274, + "grad_norm": 7.349585239878655, + "learning_rate": 4.840452069302379e-06, + "loss": 0.9792, + "step": 3901 + }, + { + "epoch": 0.2819007712175122, + "grad_norm": 6.751354222968466, + "learning_rate": 4.840349233198242e-06, + "loss": 0.8866, + "step": 3902 + }, + { + "epoch": 0.28197301641772177, + "grad_norm": 9.36082831866515, + "learning_rate": 4.840246365056542e-06, + "loss": 1.019, + "step": 3903 + }, + { + "epoch": 0.28204526161793125, + "grad_norm": 5.612011473194665, + "learning_rate": 4.840143464878686e-06, + "loss": 0.935, + "step": 3904 + }, + { + "epoch": 0.2821175068181408, + "grad_norm": 6.47386431657732, + "learning_rate": 4.8400405326660825e-06, + "loss": 0.9679, + "step": 3905 + }, + { + "epoch": 0.2821897520183503, + "grad_norm": 7.388655353702121, + "learning_rate": 4.839937568420141e-06, + "loss": 0.92, + "step": 3906 + }, + { + "epoch": 0.28226199721855977, + "grad_norm": 6.150875940978536, + "learning_rate": 4.839834572142272e-06, + "loss": 0.8136, + "step": 3907 + }, + { + "epoch": 0.2823342424187693, + "grad_norm": 7.332927894944227, + "learning_rate": 4.839731543833883e-06, + "loss": 0.9043, + "step": 3908 + }, + { + "epoch": 0.2824064876189788, + "grad_norm": 7.597940426883377, + "learning_rate": 4.839628483496388e-06, + "loss": 0.9552, + "step": 3909 + }, + { + "epoch": 0.28247873281918834, + "grad_norm": 8.051418051598214, + "learning_rate": 4.839525391131194e-06, + "loss": 0.8599, + "step": 3910 + }, + { + "epoch": 0.2825509780193978, + "grad_norm": 6.749203387690151, + "learning_rate": 4.839422266739714e-06, + "loss": 0.8835, + "step": 3911 + }, + { + "epoch": 0.28262322321960737, + "grad_norm": 6.09410336765901, + "learning_rate": 4.83931911032336e-06, + "loss": 0.9571, + "step": 3912 + }, + { + "epoch": 0.28269546841981685, + "grad_norm": 7.395170656702119, + "learning_rate": 4.839215921883543e-06, + "loss": 0.9421, + "step": 3913 + }, + { + "epoch": 0.2827677136200264, + "grad_norm": 6.245079850479671, + "learning_rate": 4.839112701421678e-06, + "loss": 0.8654, + "step": 3914 + }, + { + "epoch": 0.2828399588202359, + "grad_norm": 8.058861673018328, + "learning_rate": 4.839009448939175e-06, + "loss": 0.9328, + "step": 3915 + }, + { + "epoch": 0.28291220402044537, + "grad_norm": 6.439444535272554, + "learning_rate": 4.838906164437449e-06, + "loss": 0.8851, + "step": 3916 + }, + { + "epoch": 0.2829844492206549, + "grad_norm": 6.112646295399891, + "learning_rate": 4.8388028479179135e-06, + "loss": 0.881, + "step": 3917 + }, + { + "epoch": 0.2830566944208644, + "grad_norm": 9.485420282571127, + "learning_rate": 4.838699499381983e-06, + "loss": 0.95, + "step": 3918 + }, + { + "epoch": 0.28312893962107394, + "grad_norm": 7.206997374581634, + "learning_rate": 4.8385961188310726e-06, + "loss": 0.8776, + "step": 3919 + }, + { + "epoch": 0.2832011848212834, + "grad_norm": 6.911557466839992, + "learning_rate": 4.838492706266597e-06, + "loss": 0.9371, + "step": 3920 + }, + { + "epoch": 0.28327343002149297, + "grad_norm": 8.260609134261406, + "learning_rate": 4.838389261689972e-06, + "loss": 0.9814, + "step": 3921 + }, + { + "epoch": 0.28334567522170245, + "grad_norm": 8.878831063391514, + "learning_rate": 4.838285785102613e-06, + "loss": 0.9985, + "step": 3922 + }, + { + "epoch": 0.283417920421912, + "grad_norm": 6.122343148854385, + "learning_rate": 4.838182276505938e-06, + "loss": 0.8884, + "step": 3923 + }, + { + "epoch": 0.2834901656221215, + "grad_norm": 9.28875079967468, + "learning_rate": 4.8380787359013624e-06, + "loss": 0.9448, + "step": 3924 + }, + { + "epoch": 0.28356241082233097, + "grad_norm": 6.523735140103609, + "learning_rate": 4.837975163290305e-06, + "loss": 0.9652, + "step": 3925 + }, + { + "epoch": 0.2836346560225405, + "grad_norm": 6.373371290202949, + "learning_rate": 4.837871558674183e-06, + "loss": 0.9431, + "step": 3926 + }, + { + "epoch": 0.28370690122275, + "grad_norm": 8.335585124639852, + "learning_rate": 4.837767922054414e-06, + "loss": 1.0193, + "step": 3927 + }, + { + "epoch": 0.28377914642295954, + "grad_norm": 7.785785624725963, + "learning_rate": 4.837664253432418e-06, + "loss": 0.949, + "step": 3928 + }, + { + "epoch": 0.283851391623169, + "grad_norm": 7.011014175080778, + "learning_rate": 4.837560552809613e-06, + "loss": 0.8532, + "step": 3929 + }, + { + "epoch": 0.28392363682337857, + "grad_norm": 6.984182197247969, + "learning_rate": 4.837456820187419e-06, + "loss": 0.9409, + "step": 3930 + }, + { + "epoch": 0.28399588202358805, + "grad_norm": 7.752872303938821, + "learning_rate": 4.837353055567256e-06, + "loss": 0.812, + "step": 3931 + }, + { + "epoch": 0.2840681272237976, + "grad_norm": 8.77832112729791, + "learning_rate": 4.837249258950545e-06, + "loss": 0.9598, + "step": 3932 + }, + { + "epoch": 0.2841403724240071, + "grad_norm": 7.160393593907931, + "learning_rate": 4.837145430338705e-06, + "loss": 0.8653, + "step": 3933 + }, + { + "epoch": 0.28421261762421657, + "grad_norm": 6.034070240070907, + "learning_rate": 4.837041569733161e-06, + "loss": 0.9029, + "step": 3934 + }, + { + "epoch": 0.2842848628244261, + "grad_norm": 9.655043838377747, + "learning_rate": 4.836937677135331e-06, + "loss": 0.8711, + "step": 3935 + }, + { + "epoch": 0.2843571080246356, + "grad_norm": 7.951606050845439, + "learning_rate": 4.836833752546638e-06, + "loss": 0.897, + "step": 3936 + }, + { + "epoch": 0.28442935322484514, + "grad_norm": 6.544970251238703, + "learning_rate": 4.836729795968506e-06, + "loss": 0.9034, + "step": 3937 + }, + { + "epoch": 0.2845015984250546, + "grad_norm": 6.214465328182751, + "learning_rate": 4.836625807402359e-06, + "loss": 0.8897, + "step": 3938 + }, + { + "epoch": 0.28457384362526417, + "grad_norm": 7.236911930313189, + "learning_rate": 4.8365217868496175e-06, + "loss": 0.9798, + "step": 3939 + }, + { + "epoch": 0.28464608882547365, + "grad_norm": 8.682444466793054, + "learning_rate": 4.8364177343117066e-06, + "loss": 0.9832, + "step": 3940 + }, + { + "epoch": 0.2847183340256832, + "grad_norm": 6.6798482596700355, + "learning_rate": 4.836313649790052e-06, + "loss": 1.025, + "step": 3941 + }, + { + "epoch": 0.2847905792258927, + "grad_norm": 7.863686078827021, + "learning_rate": 4.836209533286077e-06, + "loss": 1.0105, + "step": 3942 + }, + { + "epoch": 0.28486282442610217, + "grad_norm": 6.677676392242418, + "learning_rate": 4.836105384801208e-06, + "loss": 0.9376, + "step": 3943 + }, + { + "epoch": 0.2849350696263117, + "grad_norm": 6.2997934095630335, + "learning_rate": 4.83600120433687e-06, + "loss": 0.9118, + "step": 3944 + }, + { + "epoch": 0.2850073148265212, + "grad_norm": 6.06139371550917, + "learning_rate": 4.835896991894488e-06, + "loss": 0.8706, + "step": 3945 + }, + { + "epoch": 0.28507956002673074, + "grad_norm": 6.454128676865612, + "learning_rate": 4.835792747475492e-06, + "loss": 0.8921, + "step": 3946 + }, + { + "epoch": 0.2851518052269402, + "grad_norm": 7.233202713638132, + "learning_rate": 4.835688471081305e-06, + "loss": 0.8455, + "step": 3947 + }, + { + "epoch": 0.28522405042714977, + "grad_norm": 7.514986197432928, + "learning_rate": 4.835584162713358e-06, + "loss": 0.9537, + "step": 3948 + }, + { + "epoch": 0.28529629562735925, + "grad_norm": 6.444443892701809, + "learning_rate": 4.835479822373076e-06, + "loss": 0.8929, + "step": 3949 + }, + { + "epoch": 0.2853685408275688, + "grad_norm": 5.686258369149494, + "learning_rate": 4.83537545006189e-06, + "loss": 0.9945, + "step": 3950 + }, + { + "epoch": 0.2854407860277783, + "grad_norm": 5.963726706042559, + "learning_rate": 4.835271045781226e-06, + "loss": 0.9059, + "step": 3951 + }, + { + "epoch": 0.28551303122798777, + "grad_norm": 6.910858135901832, + "learning_rate": 4.835166609532515e-06, + "loss": 1.0059, + "step": 3952 + }, + { + "epoch": 0.2855852764281973, + "grad_norm": 7.051900556065487, + "learning_rate": 4.835062141317187e-06, + "loss": 0.8689, + "step": 3953 + }, + { + "epoch": 0.2856575216284068, + "grad_norm": 6.21153323597335, + "learning_rate": 4.834957641136671e-06, + "loss": 0.8517, + "step": 3954 + }, + { + "epoch": 0.28572976682861634, + "grad_norm": 6.180422347367994, + "learning_rate": 4.834853108992396e-06, + "loss": 0.9637, + "step": 3955 + }, + { + "epoch": 0.2858020120288258, + "grad_norm": 6.362396047526134, + "learning_rate": 4.834748544885798e-06, + "loss": 0.9024, + "step": 3956 + }, + { + "epoch": 0.28587425722903537, + "grad_norm": 7.913646510075225, + "learning_rate": 4.8346439488183025e-06, + "loss": 0.9109, + "step": 3957 + }, + { + "epoch": 0.28594650242924485, + "grad_norm": 6.8382618816131515, + "learning_rate": 4.834539320791346e-06, + "loss": 0.945, + "step": 3958 + }, + { + "epoch": 0.2860187476294544, + "grad_norm": 6.851657427453143, + "learning_rate": 4.834434660806358e-06, + "loss": 0.9158, + "step": 3959 + }, + { + "epoch": 0.2860909928296639, + "grad_norm": 7.726934749943396, + "learning_rate": 4.834329968864772e-06, + "loss": 0.8787, + "step": 3960 + }, + { + "epoch": 0.28616323802987337, + "grad_norm": 7.893742132598267, + "learning_rate": 4.834225244968021e-06, + "loss": 0.9904, + "step": 3961 + }, + { + "epoch": 0.2862354832300829, + "grad_norm": 6.546659115529933, + "learning_rate": 4.8341204891175395e-06, + "loss": 0.9018, + "step": 3962 + }, + { + "epoch": 0.2863077284302924, + "grad_norm": 6.543051830162838, + "learning_rate": 4.834015701314761e-06, + "loss": 0.9077, + "step": 3963 + }, + { + "epoch": 0.28637997363050194, + "grad_norm": 8.223604922263743, + "learning_rate": 4.833910881561119e-06, + "loss": 1.0087, + "step": 3964 + }, + { + "epoch": 0.2864522188307114, + "grad_norm": 6.7556676735413825, + "learning_rate": 4.833806029858049e-06, + "loss": 0.8724, + "step": 3965 + }, + { + "epoch": 0.28652446403092097, + "grad_norm": 6.56640770235335, + "learning_rate": 4.8337011462069874e-06, + "loss": 0.9308, + "step": 3966 + }, + { + "epoch": 0.28659670923113045, + "grad_norm": 7.774191477645989, + "learning_rate": 4.833596230609369e-06, + "loss": 0.9498, + "step": 3967 + }, + { + "epoch": 0.28666895443134, + "grad_norm": 5.779325917677955, + "learning_rate": 4.8334912830666295e-06, + "loss": 0.9375, + "step": 3968 + }, + { + "epoch": 0.2867411996315495, + "grad_norm": 6.112782652390897, + "learning_rate": 4.833386303580207e-06, + "loss": 0.9138, + "step": 3969 + }, + { + "epoch": 0.28681344483175897, + "grad_norm": 6.316976338682658, + "learning_rate": 4.833281292151537e-06, + "loss": 0.8497, + "step": 3970 + }, + { + "epoch": 0.2868856900319685, + "grad_norm": 7.532732172697746, + "learning_rate": 4.833176248782058e-06, + "loss": 0.8852, + "step": 3971 + }, + { + "epoch": 0.286957935232178, + "grad_norm": 7.016149148710646, + "learning_rate": 4.833071173473208e-06, + "loss": 1.0117, + "step": 3972 + }, + { + "epoch": 0.28703018043238754, + "grad_norm": 6.6675207862487085, + "learning_rate": 4.832966066226425e-06, + "loss": 0.9806, + "step": 3973 + }, + { + "epoch": 0.287102425632597, + "grad_norm": 6.187497533932589, + "learning_rate": 4.832860927043148e-06, + "loss": 0.9323, + "step": 3974 + }, + { + "epoch": 0.28717467083280657, + "grad_norm": 7.8337194739867435, + "learning_rate": 4.832755755924816e-06, + "loss": 1.0534, + "step": 3975 + }, + { + "epoch": 0.28724691603301605, + "grad_norm": 6.931506067745126, + "learning_rate": 4.83265055287287e-06, + "loss": 0.8683, + "step": 3976 + }, + { + "epoch": 0.2873191612332256, + "grad_norm": 5.969231231622882, + "learning_rate": 4.832545317888748e-06, + "loss": 0.9802, + "step": 3977 + }, + { + "epoch": 0.2873914064334351, + "grad_norm": 8.065156247729673, + "learning_rate": 4.832440050973892e-06, + "loss": 0.8274, + "step": 3978 + }, + { + "epoch": 0.28746365163364457, + "grad_norm": 5.793818488019722, + "learning_rate": 4.832334752129743e-06, + "loss": 0.8849, + "step": 3979 + }, + { + "epoch": 0.2875358968338541, + "grad_norm": 6.025818587921563, + "learning_rate": 4.832229421357742e-06, + "loss": 0.9004, + "step": 3980 + }, + { + "epoch": 0.2876081420340636, + "grad_norm": 5.460223018856411, + "learning_rate": 4.832124058659331e-06, + "loss": 0.9289, + "step": 3981 + }, + { + "epoch": 0.28768038723427314, + "grad_norm": 6.360786899853788, + "learning_rate": 4.832018664035952e-06, + "loss": 0.9054, + "step": 3982 + }, + { + "epoch": 0.2877526324344826, + "grad_norm": 6.629763348298592, + "learning_rate": 4.831913237489049e-06, + "loss": 0.8959, + "step": 3983 + }, + { + "epoch": 0.28782487763469217, + "grad_norm": 6.513115195946105, + "learning_rate": 4.831807779020063e-06, + "loss": 0.9414, + "step": 3984 + }, + { + "epoch": 0.28789712283490165, + "grad_norm": 6.649714310324583, + "learning_rate": 4.831702288630441e-06, + "loss": 0.903, + "step": 3985 + }, + { + "epoch": 0.28796936803511114, + "grad_norm": 8.069410098234945, + "learning_rate": 4.831596766321624e-06, + "loss": 1.0241, + "step": 3986 + }, + { + "epoch": 0.2880416132353207, + "grad_norm": 7.111116502018051, + "learning_rate": 4.8314912120950576e-06, + "loss": 0.9516, + "step": 3987 + }, + { + "epoch": 0.28811385843553017, + "grad_norm": 6.129115687646043, + "learning_rate": 4.831385625952188e-06, + "loss": 0.9374, + "step": 3988 + }, + { + "epoch": 0.2881861036357397, + "grad_norm": 7.500431557001221, + "learning_rate": 4.831280007894458e-06, + "loss": 0.9887, + "step": 3989 + }, + { + "epoch": 0.2882583488359492, + "grad_norm": 5.964364402911084, + "learning_rate": 4.831174357923315e-06, + "loss": 0.9425, + "step": 3990 + }, + { + "epoch": 0.28833059403615874, + "grad_norm": 5.729724463849952, + "learning_rate": 4.831068676040205e-06, + "loss": 0.8829, + "step": 3991 + }, + { + "epoch": 0.2884028392363682, + "grad_norm": 7.586451158507509, + "learning_rate": 4.830962962246575e-06, + "loss": 0.9314, + "step": 3992 + }, + { + "epoch": 0.28847508443657777, + "grad_norm": 7.256179018026663, + "learning_rate": 4.830857216543872e-06, + "loss": 0.9891, + "step": 3993 + }, + { + "epoch": 0.28854732963678725, + "grad_norm": 6.764000677829485, + "learning_rate": 4.830751438933543e-06, + "loss": 0.84, + "step": 3994 + }, + { + "epoch": 0.28861957483699674, + "grad_norm": 8.215271884647025, + "learning_rate": 4.830645629417038e-06, + "loss": 0.9421, + "step": 3995 + }, + { + "epoch": 0.2886918200372063, + "grad_norm": 7.104100410696886, + "learning_rate": 4.830539787995803e-06, + "loss": 0.8859, + "step": 3996 + }, + { + "epoch": 0.28876406523741577, + "grad_norm": 6.683947894347001, + "learning_rate": 4.8304339146712875e-06, + "loss": 0.9022, + "step": 3997 + }, + { + "epoch": 0.2888363104376253, + "grad_norm": 6.962590664559208, + "learning_rate": 4.830328009444941e-06, + "loss": 1.0064, + "step": 3998 + }, + { + "epoch": 0.2889085556378348, + "grad_norm": 7.782972520501586, + "learning_rate": 4.8302220723182146e-06, + "loss": 0.9284, + "step": 3999 + }, + { + "epoch": 0.28898080083804434, + "grad_norm": 6.876570920932557, + "learning_rate": 4.830116103292556e-06, + "loss": 0.9257, + "step": 4000 + }, + { + "epoch": 0.2890530460382538, + "grad_norm": 7.496989854105353, + "learning_rate": 4.830010102369418e-06, + "loss": 0.9969, + "step": 4001 + }, + { + "epoch": 0.28912529123846337, + "grad_norm": 6.5100671130118135, + "learning_rate": 4.829904069550251e-06, + "loss": 0.9333, + "step": 4002 + }, + { + "epoch": 0.28919753643867285, + "grad_norm": 5.64106643090695, + "learning_rate": 4.829798004836506e-06, + "loss": 0.8665, + "step": 4003 + }, + { + "epoch": 0.28926978163888234, + "grad_norm": 8.208132772044786, + "learning_rate": 4.829691908229634e-06, + "loss": 0.8693, + "step": 4004 + }, + { + "epoch": 0.2893420268390919, + "grad_norm": 7.641360631685092, + "learning_rate": 4.829585779731091e-06, + "loss": 0.9784, + "step": 4005 + }, + { + "epoch": 0.28941427203930137, + "grad_norm": 6.35968451719133, + "learning_rate": 4.829479619342326e-06, + "loss": 0.8544, + "step": 4006 + }, + { + "epoch": 0.2894865172395109, + "grad_norm": 7.269855276816012, + "learning_rate": 4.829373427064794e-06, + "loss": 0.978, + "step": 4007 + }, + { + "epoch": 0.2895587624397204, + "grad_norm": 6.9765020797180695, + "learning_rate": 4.829267202899949e-06, + "loss": 0.9441, + "step": 4008 + }, + { + "epoch": 0.28963100763992994, + "grad_norm": 6.729857733074443, + "learning_rate": 4.8291609468492436e-06, + "loss": 0.8894, + "step": 4009 + }, + { + "epoch": 0.2897032528401394, + "grad_norm": 6.598006045530382, + "learning_rate": 4.829054658914134e-06, + "loss": 0.8939, + "step": 4010 + }, + { + "epoch": 0.28977549804034897, + "grad_norm": 7.286942119411966, + "learning_rate": 4.8289483390960745e-06, + "loss": 0.9604, + "step": 4011 + }, + { + "epoch": 0.28984774324055845, + "grad_norm": 6.886391651956115, + "learning_rate": 4.828841987396521e-06, + "loss": 0.9083, + "step": 4012 + }, + { + "epoch": 0.28991998844076794, + "grad_norm": 8.373534572556142, + "learning_rate": 4.828735603816927e-06, + "loss": 0.8917, + "step": 4013 + }, + { + "epoch": 0.2899922336409775, + "grad_norm": 7.004478112111342, + "learning_rate": 4.8286291883587526e-06, + "loss": 1.021, + "step": 4014 + }, + { + "epoch": 0.29006447884118697, + "grad_norm": 6.745882261812341, + "learning_rate": 4.8285227410234525e-06, + "loss": 0.8346, + "step": 4015 + }, + { + "epoch": 0.2901367240413965, + "grad_norm": 7.140648238730648, + "learning_rate": 4.828416261812484e-06, + "loss": 0.9282, + "step": 4016 + }, + { + "epoch": 0.290208969241606, + "grad_norm": 7.831129399837673, + "learning_rate": 4.828309750727304e-06, + "loss": 0.9028, + "step": 4017 + }, + { + "epoch": 0.29028121444181554, + "grad_norm": 6.846173697717317, + "learning_rate": 4.828203207769372e-06, + "loss": 0.9284, + "step": 4018 + }, + { + "epoch": 0.290353459642025, + "grad_norm": 7.023321810608645, + "learning_rate": 4.828096632940146e-06, + "loss": 0.9947, + "step": 4019 + }, + { + "epoch": 0.29042570484223457, + "grad_norm": 7.5885509385309975, + "learning_rate": 4.827990026241084e-06, + "loss": 0.9637, + "step": 4020 + }, + { + "epoch": 0.29049795004244405, + "grad_norm": 7.248592897868071, + "learning_rate": 4.827883387673646e-06, + "loss": 0.9357, + "step": 4021 + }, + { + "epoch": 0.29057019524265354, + "grad_norm": 7.883479079766943, + "learning_rate": 4.827776717239293e-06, + "loss": 0.9157, + "step": 4022 + }, + { + "epoch": 0.2906424404428631, + "grad_norm": 6.790379720191262, + "learning_rate": 4.827670014939483e-06, + "loss": 0.9244, + "step": 4023 + }, + { + "epoch": 0.29071468564307257, + "grad_norm": 7.505092672021112, + "learning_rate": 4.827563280775678e-06, + "loss": 0.968, + "step": 4024 + }, + { + "epoch": 0.2907869308432821, + "grad_norm": 7.2337081964397925, + "learning_rate": 4.82745651474934e-06, + "loss": 0.8778, + "step": 4025 + }, + { + "epoch": 0.2908591760434916, + "grad_norm": 8.198019244381536, + "learning_rate": 4.827349716861929e-06, + "loss": 0.9492, + "step": 4026 + }, + { + "epoch": 0.29093142124370114, + "grad_norm": 6.2646718049301136, + "learning_rate": 4.827242887114907e-06, + "loss": 0.875, + "step": 4027 + }, + { + "epoch": 0.2910036664439106, + "grad_norm": 6.424901986766085, + "learning_rate": 4.8271360255097364e-06, + "loss": 0.8516, + "step": 4028 + }, + { + "epoch": 0.29107591164412017, + "grad_norm": 7.6947659525930705, + "learning_rate": 4.827029132047881e-06, + "loss": 0.9411, + "step": 4029 + }, + { + "epoch": 0.29114815684432965, + "grad_norm": 6.119331695369129, + "learning_rate": 4.8269222067308046e-06, + "loss": 0.9626, + "step": 4030 + }, + { + "epoch": 0.29122040204453914, + "grad_norm": 6.294104030315482, + "learning_rate": 4.826815249559968e-06, + "loss": 0.8731, + "step": 4031 + }, + { + "epoch": 0.2912926472447487, + "grad_norm": 6.543891317887414, + "learning_rate": 4.826708260536839e-06, + "loss": 0.8623, + "step": 4032 + }, + { + "epoch": 0.29136489244495817, + "grad_norm": 8.049589006819598, + "learning_rate": 4.82660123966288e-06, + "loss": 1.0111, + "step": 4033 + }, + { + "epoch": 0.2914371376451677, + "grad_norm": 6.123839599302735, + "learning_rate": 4.826494186939556e-06, + "loss": 0.8675, + "step": 4034 + }, + { + "epoch": 0.2915093828453772, + "grad_norm": 6.845556012149672, + "learning_rate": 4.826387102368333e-06, + "loss": 0.8496, + "step": 4035 + }, + { + "epoch": 0.29158162804558674, + "grad_norm": 6.591031494917331, + "learning_rate": 4.826279985950678e-06, + "loss": 0.9923, + "step": 4036 + }, + { + "epoch": 0.2916538732457962, + "grad_norm": 7.161114634271384, + "learning_rate": 4.826172837688055e-06, + "loss": 0.8888, + "step": 4037 + }, + { + "epoch": 0.29172611844600577, + "grad_norm": 6.71459008892247, + "learning_rate": 4.8260656575819325e-06, + "loss": 0.8525, + "step": 4038 + }, + { + "epoch": 0.29179836364621525, + "grad_norm": 5.507620647146197, + "learning_rate": 4.825958445633777e-06, + "loss": 0.942, + "step": 4039 + }, + { + "epoch": 0.29187060884642474, + "grad_norm": 6.40323981298054, + "learning_rate": 4.825851201845056e-06, + "loss": 0.8745, + "step": 4040 + }, + { + "epoch": 0.2919428540466343, + "grad_norm": 5.993935381078033, + "learning_rate": 4.82574392621724e-06, + "loss": 0.9007, + "step": 4041 + }, + { + "epoch": 0.29201509924684377, + "grad_norm": 6.386971994214693, + "learning_rate": 4.825636618751793e-06, + "loss": 0.8643, + "step": 4042 + }, + { + "epoch": 0.2920873444470533, + "grad_norm": 5.983528097200701, + "learning_rate": 4.825529279450188e-06, + "loss": 0.8686, + "step": 4043 + }, + { + "epoch": 0.2921595896472628, + "grad_norm": 7.143487602748061, + "learning_rate": 4.825421908313892e-06, + "loss": 0.9686, + "step": 4044 + }, + { + "epoch": 0.29223183484747234, + "grad_norm": 7.160056621817786, + "learning_rate": 4.825314505344376e-06, + "loss": 0.909, + "step": 4045 + }, + { + "epoch": 0.2923040800476818, + "grad_norm": 5.126034329874294, + "learning_rate": 4.8252070705431095e-06, + "loss": 0.8166, + "step": 4046 + }, + { + "epoch": 0.29237632524789137, + "grad_norm": 5.878490304014674, + "learning_rate": 4.825099603911564e-06, + "loss": 0.8223, + "step": 4047 + }, + { + "epoch": 0.29244857044810085, + "grad_norm": 5.492903552850529, + "learning_rate": 4.82499210545121e-06, + "loss": 0.8966, + "step": 4048 + }, + { + "epoch": 0.29252081564831034, + "grad_norm": 6.182137371431344, + "learning_rate": 4.8248845751635195e-06, + "loss": 0.9099, + "step": 4049 + }, + { + "epoch": 0.2925930608485199, + "grad_norm": 5.882238271380957, + "learning_rate": 4.824777013049965e-06, + "loss": 0.966, + "step": 4050 + }, + { + "epoch": 0.29266530604872937, + "grad_norm": 6.377224702295227, + "learning_rate": 4.824669419112017e-06, + "loss": 1.0012, + "step": 4051 + }, + { + "epoch": 0.2927375512489389, + "grad_norm": 6.739241609552504, + "learning_rate": 4.82456179335115e-06, + "loss": 0.9939, + "step": 4052 + }, + { + "epoch": 0.2928097964491484, + "grad_norm": 5.533583154362562, + "learning_rate": 4.824454135768838e-06, + "loss": 0.8438, + "step": 4053 + }, + { + "epoch": 0.29288204164935794, + "grad_norm": 5.993411579462889, + "learning_rate": 4.8243464463665525e-06, + "loss": 0.8762, + "step": 4054 + }, + { + "epoch": 0.2929542868495674, + "grad_norm": 7.327438354857602, + "learning_rate": 4.824238725145769e-06, + "loss": 1.0086, + "step": 4055 + }, + { + "epoch": 0.29302653204977697, + "grad_norm": 5.96520699205047, + "learning_rate": 4.824130972107963e-06, + "loss": 0.8994, + "step": 4056 + }, + { + "epoch": 0.29309877724998645, + "grad_norm": 7.088148686832171, + "learning_rate": 4.824023187254607e-06, + "loss": 0.8437, + "step": 4057 + }, + { + "epoch": 0.29317102245019594, + "grad_norm": 6.332828836257612, + "learning_rate": 4.823915370587179e-06, + "loss": 0.8843, + "step": 4058 + }, + { + "epoch": 0.2932432676504055, + "grad_norm": 8.089967764470744, + "learning_rate": 4.823807522107154e-06, + "loss": 0.8942, + "step": 4059 + }, + { + "epoch": 0.29331551285061497, + "grad_norm": 7.242423472423181, + "learning_rate": 4.823699641816009e-06, + "loss": 0.9288, + "step": 4060 + }, + { + "epoch": 0.2933877580508245, + "grad_norm": 5.8794097583384435, + "learning_rate": 4.823591729715219e-06, + "loss": 0.9519, + "step": 4061 + }, + { + "epoch": 0.293460003251034, + "grad_norm": 7.646850795419141, + "learning_rate": 4.823483785806262e-06, + "loss": 1.0168, + "step": 4062 + }, + { + "epoch": 0.29353224845124354, + "grad_norm": 7.319130003044409, + "learning_rate": 4.823375810090617e-06, + "loss": 0.9296, + "step": 4063 + }, + { + "epoch": 0.293604493651453, + "grad_norm": 9.889664589774629, + "learning_rate": 4.823267802569761e-06, + "loss": 0.9184, + "step": 4064 + }, + { + "epoch": 0.29367673885166257, + "grad_norm": 7.09357899195033, + "learning_rate": 4.8231597632451725e-06, + "loss": 0.9118, + "step": 4065 + }, + { + "epoch": 0.29374898405187205, + "grad_norm": 7.370180268483357, + "learning_rate": 4.8230516921183315e-06, + "loss": 0.9908, + "step": 4066 + }, + { + "epoch": 0.29382122925208154, + "grad_norm": 6.315404544778504, + "learning_rate": 4.822943589190715e-06, + "loss": 0.9967, + "step": 4067 + }, + { + "epoch": 0.2938934744522911, + "grad_norm": 6.999168619103456, + "learning_rate": 4.8228354544638055e-06, + "loss": 1.0304, + "step": 4068 + }, + { + "epoch": 0.29396571965250057, + "grad_norm": 5.918608798927844, + "learning_rate": 4.822727287939082e-06, + "loss": 0.905, + "step": 4069 + }, + { + "epoch": 0.2940379648527101, + "grad_norm": 5.73353940608188, + "learning_rate": 4.822619089618025e-06, + "loss": 0.8502, + "step": 4070 + }, + { + "epoch": 0.2941102100529196, + "grad_norm": 7.112953308355517, + "learning_rate": 4.8225108595021166e-06, + "loss": 0.9576, + "step": 4071 + }, + { + "epoch": 0.29418245525312914, + "grad_norm": 7.398453999453619, + "learning_rate": 4.822402597592838e-06, + "loss": 0.9106, + "step": 4072 + }, + { + "epoch": 0.2942547004533386, + "grad_norm": 6.819703379694143, + "learning_rate": 4.8222943038916705e-06, + "loss": 0.821, + "step": 4073 + }, + { + "epoch": 0.29432694565354817, + "grad_norm": 6.125573267675934, + "learning_rate": 4.822185978400097e-06, + "loss": 0.9015, + "step": 4074 + }, + { + "epoch": 0.29439919085375765, + "grad_norm": 6.119048048793006, + "learning_rate": 4.822077621119601e-06, + "loss": 0.9153, + "step": 4075 + }, + { + "epoch": 0.29447143605396714, + "grad_norm": 5.799532957017423, + "learning_rate": 4.8219692320516656e-06, + "loss": 0.9534, + "step": 4076 + }, + { + "epoch": 0.2945436812541767, + "grad_norm": 6.458450742905388, + "learning_rate": 4.8218608111977735e-06, + "loss": 0.8755, + "step": 4077 + }, + { + "epoch": 0.29461592645438617, + "grad_norm": 5.524420018198796, + "learning_rate": 4.82175235855941e-06, + "loss": 0.8652, + "step": 4078 + }, + { + "epoch": 0.2946881716545957, + "grad_norm": 5.882695778224523, + "learning_rate": 4.82164387413806e-06, + "loss": 0.9111, + "step": 4079 + }, + { + "epoch": 0.2947604168548052, + "grad_norm": 5.834418531928686, + "learning_rate": 4.821535357935207e-06, + "loss": 0.854, + "step": 4080 + }, + { + "epoch": 0.29483266205501474, + "grad_norm": 6.754770359340269, + "learning_rate": 4.821426809952338e-06, + "loss": 0.8295, + "step": 4081 + }, + { + "epoch": 0.2949049072552242, + "grad_norm": 6.844746556295703, + "learning_rate": 4.821318230190939e-06, + "loss": 0.8738, + "step": 4082 + }, + { + "epoch": 0.29497715245543377, + "grad_norm": 6.537884276786366, + "learning_rate": 4.8212096186524945e-06, + "loss": 0.9387, + "step": 4083 + }, + { + "epoch": 0.29504939765564325, + "grad_norm": 6.416197854622936, + "learning_rate": 4.821100975338494e-06, + "loss": 0.8931, + "step": 4084 + }, + { + "epoch": 0.29512164285585274, + "grad_norm": 6.67796572975946, + "learning_rate": 4.8209923002504224e-06, + "loss": 0.9614, + "step": 4085 + }, + { + "epoch": 0.2951938880560623, + "grad_norm": 6.672860351985467, + "learning_rate": 4.820883593389769e-06, + "loss": 0.928, + "step": 4086 + }, + { + "epoch": 0.29526613325627177, + "grad_norm": 7.805833825897737, + "learning_rate": 4.820774854758021e-06, + "loss": 0.9141, + "step": 4087 + }, + { + "epoch": 0.2953383784564813, + "grad_norm": 6.619253293246622, + "learning_rate": 4.8206660843566674e-06, + "loss": 0.9651, + "step": 4088 + }, + { + "epoch": 0.2954106236566908, + "grad_norm": 6.43150480041823, + "learning_rate": 4.820557282187197e-06, + "loss": 1.074, + "step": 4089 + }, + { + "epoch": 0.29548286885690034, + "grad_norm": 6.772434956803988, + "learning_rate": 4.820448448251098e-06, + "loss": 0.9018, + "step": 4090 + }, + { + "epoch": 0.2955551140571098, + "grad_norm": 7.6445344935640795, + "learning_rate": 4.820339582549863e-06, + "loss": 0.909, + "step": 4091 + }, + { + "epoch": 0.29562735925731937, + "grad_norm": 8.350105294117647, + "learning_rate": 4.82023068508498e-06, + "loss": 0.9156, + "step": 4092 + }, + { + "epoch": 0.29569960445752885, + "grad_norm": 6.396478411256762, + "learning_rate": 4.82012175585794e-06, + "loss": 0.962, + "step": 4093 + }, + { + "epoch": 0.29577184965773834, + "grad_norm": 7.665643402587598, + "learning_rate": 4.820012794870236e-06, + "loss": 0.9348, + "step": 4094 + }, + { + "epoch": 0.2958440948579479, + "grad_norm": 6.29725725324718, + "learning_rate": 4.819903802123357e-06, + "loss": 0.8994, + "step": 4095 + }, + { + "epoch": 0.29591634005815737, + "grad_norm": 6.96971582126204, + "learning_rate": 4.819794777618797e-06, + "loss": 0.9322, + "step": 4096 + }, + { + "epoch": 0.2959885852583669, + "grad_norm": 6.765025052787591, + "learning_rate": 4.8196857213580476e-06, + "loss": 0.9078, + "step": 4097 + }, + { + "epoch": 0.2960608304585764, + "grad_norm": 7.785719970261085, + "learning_rate": 4.819576633342602e-06, + "loss": 0.8824, + "step": 4098 + }, + { + "epoch": 0.29613307565878594, + "grad_norm": 6.256863297041985, + "learning_rate": 4.8194675135739525e-06, + "loss": 0.8527, + "step": 4099 + }, + { + "epoch": 0.2962053208589954, + "grad_norm": 5.832544037235301, + "learning_rate": 4.819358362053595e-06, + "loss": 0.9995, + "step": 4100 + }, + { + "epoch": 0.29627756605920497, + "grad_norm": 6.99205901811077, + "learning_rate": 4.819249178783021e-06, + "loss": 0.9343, + "step": 4101 + }, + { + "epoch": 0.29634981125941445, + "grad_norm": 7.129130989608214, + "learning_rate": 4.819139963763727e-06, + "loss": 0.8944, + "step": 4102 + }, + { + "epoch": 0.29642205645962394, + "grad_norm": 7.525065181614631, + "learning_rate": 4.819030716997208e-06, + "loss": 0.8713, + "step": 4103 + }, + { + "epoch": 0.2964943016598335, + "grad_norm": 7.734701993561175, + "learning_rate": 4.818921438484958e-06, + "loss": 0.9504, + "step": 4104 + }, + { + "epoch": 0.29656654686004297, + "grad_norm": 8.17215321287882, + "learning_rate": 4.818812128228475e-06, + "loss": 0.8336, + "step": 4105 + }, + { + "epoch": 0.2966387920602525, + "grad_norm": 5.852805236130798, + "learning_rate": 4.818702786229254e-06, + "loss": 0.8384, + "step": 4106 + }, + { + "epoch": 0.296711037260462, + "grad_norm": 7.716499525639797, + "learning_rate": 4.818593412488792e-06, + "loss": 0.9274, + "step": 4107 + }, + { + "epoch": 0.29678328246067154, + "grad_norm": 7.335071906717001, + "learning_rate": 4.818484007008587e-06, + "loss": 0.9138, + "step": 4108 + }, + { + "epoch": 0.296855527660881, + "grad_norm": 9.387909812464788, + "learning_rate": 4.818374569790136e-06, + "loss": 0.8593, + "step": 4109 + }, + { + "epoch": 0.29692777286109057, + "grad_norm": 6.915249169881225, + "learning_rate": 4.8182651008349374e-06, + "loss": 0.8849, + "step": 4110 + }, + { + "epoch": 0.29700001806130005, + "grad_norm": 5.7445543458625545, + "learning_rate": 4.818155600144489e-06, + "loss": 0.8924, + "step": 4111 + }, + { + "epoch": 0.29707226326150954, + "grad_norm": 6.5218695486545855, + "learning_rate": 4.818046067720291e-06, + "loss": 0.8996, + "step": 4112 + }, + { + "epoch": 0.2971445084617191, + "grad_norm": 6.130264511002238, + "learning_rate": 4.817936503563842e-06, + "loss": 0.9916, + "step": 4113 + }, + { + "epoch": 0.29721675366192857, + "grad_norm": 6.478960170910162, + "learning_rate": 4.817826907676642e-06, + "loss": 0.8346, + "step": 4114 + }, + { + "epoch": 0.2972889988621381, + "grad_norm": 9.607187012719347, + "learning_rate": 4.8177172800601915e-06, + "loss": 0.987, + "step": 4115 + }, + { + "epoch": 0.2973612440623476, + "grad_norm": 6.527420600516327, + "learning_rate": 4.8176076207159905e-06, + "loss": 0.9241, + "step": 4116 + }, + { + "epoch": 0.29743348926255714, + "grad_norm": 5.506048864356335, + "learning_rate": 4.817497929645541e-06, + "loss": 0.8394, + "step": 4117 + }, + { + "epoch": 0.2975057344627666, + "grad_norm": 6.667423809289878, + "learning_rate": 4.8173882068503444e-06, + "loss": 1.0013, + "step": 4118 + }, + { + "epoch": 0.29757797966297617, + "grad_norm": 7.2532362456462325, + "learning_rate": 4.817278452331902e-06, + "loss": 0.9559, + "step": 4119 + }, + { + "epoch": 0.29765022486318565, + "grad_norm": 7.437791321763051, + "learning_rate": 4.8171686660917174e-06, + "loss": 0.8554, + "step": 4120 + }, + { + "epoch": 0.29772247006339514, + "grad_norm": 5.817711903278709, + "learning_rate": 4.817058848131293e-06, + "loss": 0.8647, + "step": 4121 + }, + { + "epoch": 0.2977947152636047, + "grad_norm": 7.03814684535963, + "learning_rate": 4.8169489984521314e-06, + "loss": 1.0031, + "step": 4122 + }, + { + "epoch": 0.29786696046381417, + "grad_norm": 7.184557768632009, + "learning_rate": 4.816839117055738e-06, + "loss": 0.9952, + "step": 4123 + }, + { + "epoch": 0.2979392056640237, + "grad_norm": 7.364009102786197, + "learning_rate": 4.816729203943615e-06, + "loss": 0.9382, + "step": 4124 + }, + { + "epoch": 0.2980114508642332, + "grad_norm": 8.696131966664295, + "learning_rate": 4.816619259117269e-06, + "loss": 0.9148, + "step": 4125 + }, + { + "epoch": 0.29808369606444274, + "grad_norm": 5.671424028107209, + "learning_rate": 4.816509282578203e-06, + "loss": 0.9516, + "step": 4126 + }, + { + "epoch": 0.2981559412646522, + "grad_norm": 6.667825725252854, + "learning_rate": 4.8163992743279244e-06, + "loss": 0.9515, + "step": 4127 + }, + { + "epoch": 0.29822818646486177, + "grad_norm": 7.333025174457573, + "learning_rate": 4.816289234367938e-06, + "loss": 0.8993, + "step": 4128 + }, + { + "epoch": 0.29830043166507125, + "grad_norm": 7.071999701573292, + "learning_rate": 4.81617916269975e-06, + "loss": 0.9322, + "step": 4129 + }, + { + "epoch": 0.29837267686528074, + "grad_norm": 7.928358926956966, + "learning_rate": 4.8160690593248685e-06, + "loss": 0.9504, + "step": 4130 + }, + { + "epoch": 0.2984449220654903, + "grad_norm": 7.159328548125848, + "learning_rate": 4.8159589242448e-06, + "loss": 0.9217, + "step": 4131 + }, + { + "epoch": 0.29851716726569977, + "grad_norm": 7.298578818996176, + "learning_rate": 4.815848757461051e-06, + "loss": 0.928, + "step": 4132 + }, + { + "epoch": 0.2985894124659093, + "grad_norm": 8.831691325418959, + "learning_rate": 4.815738558975131e-06, + "loss": 0.9168, + "step": 4133 + }, + { + "epoch": 0.2986616576661188, + "grad_norm": 6.377316521517887, + "learning_rate": 4.815628328788548e-06, + "loss": 0.9028, + "step": 4134 + }, + { + "epoch": 0.29873390286632834, + "grad_norm": 5.9675592163369675, + "learning_rate": 4.815518066902813e-06, + "loss": 0.8826, + "step": 4135 + }, + { + "epoch": 0.2988061480665378, + "grad_norm": 6.374676565305557, + "learning_rate": 4.815407773319431e-06, + "loss": 0.8533, + "step": 4136 + }, + { + "epoch": 0.29887839326674737, + "grad_norm": 7.255425626335585, + "learning_rate": 4.815297448039916e-06, + "loss": 0.9825, + "step": 4137 + }, + { + "epoch": 0.29895063846695685, + "grad_norm": 7.20315836404285, + "learning_rate": 4.815187091065776e-06, + "loss": 1.0064, + "step": 4138 + }, + { + "epoch": 0.29902288366716634, + "grad_norm": 6.472386614122028, + "learning_rate": 4.8150767023985225e-06, + "loss": 0.9237, + "step": 4139 + }, + { + "epoch": 0.2990951288673759, + "grad_norm": 7.070581184466579, + "learning_rate": 4.814966282039667e-06, + "loss": 0.8429, + "step": 4140 + }, + { + "epoch": 0.29916737406758537, + "grad_norm": 7.117920420121156, + "learning_rate": 4.81485582999072e-06, + "loss": 0.942, + "step": 4141 + }, + { + "epoch": 0.2992396192677949, + "grad_norm": 6.830412387713531, + "learning_rate": 4.814745346253193e-06, + "loss": 0.9948, + "step": 4142 + }, + { + "epoch": 0.2993118644680044, + "grad_norm": 8.275956905852007, + "learning_rate": 4.8146348308286015e-06, + "loss": 0.8983, + "step": 4143 + }, + { + "epoch": 0.29938410966821394, + "grad_norm": 6.137259534176777, + "learning_rate": 4.814524283718455e-06, + "loss": 0.8774, + "step": 4144 + }, + { + "epoch": 0.2994563548684234, + "grad_norm": 7.690700206055611, + "learning_rate": 4.8144137049242686e-06, + "loss": 0.8557, + "step": 4145 + }, + { + "epoch": 0.29952860006863297, + "grad_norm": 6.223150948438482, + "learning_rate": 4.8143030944475555e-06, + "loss": 0.9111, + "step": 4146 + }, + { + "epoch": 0.29960084526884245, + "grad_norm": 7.846513360188784, + "learning_rate": 4.814192452289831e-06, + "loss": 0.8984, + "step": 4147 + }, + { + "epoch": 0.29967309046905194, + "grad_norm": 6.4823053573062515, + "learning_rate": 4.814081778452607e-06, + "loss": 0.9412, + "step": 4148 + }, + { + "epoch": 0.2997453356692615, + "grad_norm": 5.692211170131612, + "learning_rate": 4.813971072937401e-06, + "loss": 0.8749, + "step": 4149 + }, + { + "epoch": 0.29981758086947097, + "grad_norm": 7.322813927773994, + "learning_rate": 4.813860335745728e-06, + "loss": 0.9349, + "step": 4150 + }, + { + "epoch": 0.2998898260696805, + "grad_norm": 6.029303834625073, + "learning_rate": 4.813749566879103e-06, + "loss": 0.8732, + "step": 4151 + }, + { + "epoch": 0.29996207126989, + "grad_norm": 7.67793385846485, + "learning_rate": 4.813638766339044e-06, + "loss": 0.9184, + "step": 4152 + }, + { + "epoch": 0.30003431647009954, + "grad_norm": 5.812415665865848, + "learning_rate": 4.813527934127066e-06, + "loss": 0.9353, + "step": 4153 + }, + { + "epoch": 0.300106561670309, + "grad_norm": 8.056123326195733, + "learning_rate": 4.8134170702446865e-06, + "loss": 1.007, + "step": 4154 + }, + { + "epoch": 0.30017880687051857, + "grad_norm": 6.274715839654271, + "learning_rate": 4.813306174693424e-06, + "loss": 0.862, + "step": 4155 + }, + { + "epoch": 0.30025105207072805, + "grad_norm": 8.39732051074948, + "learning_rate": 4.813195247474796e-06, + "loss": 0.9868, + "step": 4156 + }, + { + "epoch": 0.30032329727093754, + "grad_norm": 5.577496319559714, + "learning_rate": 4.813084288590321e-06, + "loss": 0.8387, + "step": 4157 + }, + { + "epoch": 0.3003955424711471, + "grad_norm": 8.206310300467475, + "learning_rate": 4.812973298041518e-06, + "loss": 0.9131, + "step": 4158 + }, + { + "epoch": 0.30046778767135657, + "grad_norm": 6.828235057930245, + "learning_rate": 4.812862275829907e-06, + "loss": 0.9195, + "step": 4159 + }, + { + "epoch": 0.3005400328715661, + "grad_norm": 8.007154603310005, + "learning_rate": 4.812751221957007e-06, + "loss": 0.9504, + "step": 4160 + }, + { + "epoch": 0.3006122780717756, + "grad_norm": 6.907290833350956, + "learning_rate": 4.812640136424338e-06, + "loss": 0.9497, + "step": 4161 + }, + { + "epoch": 0.30068452327198514, + "grad_norm": 6.573085712207722, + "learning_rate": 4.812529019233422e-06, + "loss": 0.8145, + "step": 4162 + }, + { + "epoch": 0.3007567684721946, + "grad_norm": 6.382694462743483, + "learning_rate": 4.812417870385779e-06, + "loss": 0.9311, + "step": 4163 + }, + { + "epoch": 0.30082901367240417, + "grad_norm": 7.803896874446891, + "learning_rate": 4.8123066898829316e-06, + "loss": 0.9332, + "step": 4164 + }, + { + "epoch": 0.30090125887261365, + "grad_norm": 6.896028575457427, + "learning_rate": 4.8121954777264e-06, + "loss": 0.8035, + "step": 4165 + }, + { + "epoch": 0.30097350407282314, + "grad_norm": 7.188843178673366, + "learning_rate": 4.812084233917708e-06, + "loss": 0.9455, + "step": 4166 + }, + { + "epoch": 0.3010457492730327, + "grad_norm": 7.463486097350339, + "learning_rate": 4.811972958458377e-06, + "loss": 0.8545, + "step": 4167 + }, + { + "epoch": 0.30111799447324217, + "grad_norm": 6.052436262423038, + "learning_rate": 4.8118616513499326e-06, + "loss": 0.9347, + "step": 4168 + }, + { + "epoch": 0.3011902396734517, + "grad_norm": 7.066944985337105, + "learning_rate": 4.811750312593897e-06, + "loss": 0.8448, + "step": 4169 + }, + { + "epoch": 0.3012624848736612, + "grad_norm": 7.515275530710118, + "learning_rate": 4.811638942191794e-06, + "loss": 0.9517, + "step": 4170 + }, + { + "epoch": 0.30133473007387074, + "grad_norm": 6.384011575043015, + "learning_rate": 4.81152754014515e-06, + "loss": 0.8664, + "step": 4171 + }, + { + "epoch": 0.3014069752740802, + "grad_norm": 6.3052734707750595, + "learning_rate": 4.811416106455488e-06, + "loss": 0.911, + "step": 4172 + }, + { + "epoch": 0.30147922047428977, + "grad_norm": 6.626390994892129, + "learning_rate": 4.811304641124334e-06, + "loss": 0.9455, + "step": 4173 + }, + { + "epoch": 0.30155146567449925, + "grad_norm": 6.581024183436687, + "learning_rate": 4.811193144153214e-06, + "loss": 0.9522, + "step": 4174 + }, + { + "epoch": 0.30162371087470874, + "grad_norm": 7.861376407072229, + "learning_rate": 4.811081615543655e-06, + "loss": 0.8681, + "step": 4175 + }, + { + "epoch": 0.3016959560749183, + "grad_norm": 6.226143433984386, + "learning_rate": 4.810970055297182e-06, + "loss": 0.8966, + "step": 4176 + }, + { + "epoch": 0.30176820127512777, + "grad_norm": 7.2341537936111875, + "learning_rate": 4.8108584634153246e-06, + "loss": 0.9367, + "step": 4177 + }, + { + "epoch": 0.3018404464753373, + "grad_norm": 6.955808383191121, + "learning_rate": 4.810746839899608e-06, + "loss": 0.9133, + "step": 4178 + }, + { + "epoch": 0.3019126916755468, + "grad_norm": 6.407001432682721, + "learning_rate": 4.810635184751562e-06, + "loss": 0.9386, + "step": 4179 + }, + { + "epoch": 0.30198493687575634, + "grad_norm": 5.801457570527165, + "learning_rate": 4.810523497972715e-06, + "loss": 0.9408, + "step": 4180 + }, + { + "epoch": 0.3020571820759658, + "grad_norm": 5.847588079408086, + "learning_rate": 4.810411779564594e-06, + "loss": 0.8928, + "step": 4181 + }, + { + "epoch": 0.30212942727617537, + "grad_norm": 8.25210035926342, + "learning_rate": 4.81030002952873e-06, + "loss": 0.9551, + "step": 4182 + }, + { + "epoch": 0.30220167247638485, + "grad_norm": 7.155516740971421, + "learning_rate": 4.810188247866653e-06, + "loss": 0.8699, + "step": 4183 + }, + { + "epoch": 0.30227391767659434, + "grad_norm": 6.707168319981138, + "learning_rate": 4.810076434579892e-06, + "loss": 0.8889, + "step": 4184 + }, + { + "epoch": 0.3023461628768039, + "grad_norm": 5.237984486024188, + "learning_rate": 4.809964589669978e-06, + "loss": 0.8742, + "step": 4185 + }, + { + "epoch": 0.30241840807701337, + "grad_norm": 6.8706695790051775, + "learning_rate": 4.8098527131384435e-06, + "loss": 0.9148, + "step": 4186 + }, + { + "epoch": 0.3024906532772229, + "grad_norm": 6.216577505063767, + "learning_rate": 4.809740804986819e-06, + "loss": 0.8905, + "step": 4187 + }, + { + "epoch": 0.3025628984774324, + "grad_norm": 9.162231574608764, + "learning_rate": 4.809628865216635e-06, + "loss": 0.9219, + "step": 4188 + }, + { + "epoch": 0.30263514367764194, + "grad_norm": 6.664554483872561, + "learning_rate": 4.809516893829425e-06, + "loss": 0.9186, + "step": 4189 + }, + { + "epoch": 0.3027073888778514, + "grad_norm": 6.081287022445597, + "learning_rate": 4.8094048908267234e-06, + "loss": 0.9791, + "step": 4190 + }, + { + "epoch": 0.30277963407806097, + "grad_norm": 6.152656862072055, + "learning_rate": 4.809292856210062e-06, + "loss": 0.8453, + "step": 4191 + }, + { + "epoch": 0.30285187927827045, + "grad_norm": 6.10565556236113, + "learning_rate": 4.809180789980973e-06, + "loss": 0.8973, + "step": 4192 + }, + { + "epoch": 0.30292412447847994, + "grad_norm": 7.882076172327755, + "learning_rate": 4.809068692140993e-06, + "loss": 0.9464, + "step": 4193 + }, + { + "epoch": 0.3029963696786895, + "grad_norm": 6.04972988818267, + "learning_rate": 4.808956562691655e-06, + "loss": 0.8423, + "step": 4194 + }, + { + "epoch": 0.30306861487889897, + "grad_norm": 6.039501810812109, + "learning_rate": 4.808844401634495e-06, + "loss": 0.9763, + "step": 4195 + }, + { + "epoch": 0.3031408600791085, + "grad_norm": 5.63585327863889, + "learning_rate": 4.808732208971046e-06, + "loss": 0.888, + "step": 4196 + }, + { + "epoch": 0.303213105279318, + "grad_norm": 7.161273642268068, + "learning_rate": 4.808619984702848e-06, + "loss": 0.8663, + "step": 4197 + }, + { + "epoch": 0.30328535047952754, + "grad_norm": 8.503538404997853, + "learning_rate": 4.808507728831434e-06, + "loss": 1.0018, + "step": 4198 + }, + { + "epoch": 0.303357595679737, + "grad_norm": 6.255017212280693, + "learning_rate": 4.808395441358341e-06, + "loss": 0.9043, + "step": 4199 + }, + { + "epoch": 0.30342984087994657, + "grad_norm": 7.127453297997589, + "learning_rate": 4.808283122285108e-06, + "loss": 0.9725, + "step": 4200 + }, + { + "epoch": 0.30350208608015605, + "grad_norm": 7.252207452869307, + "learning_rate": 4.80817077161327e-06, + "loss": 0.9715, + "step": 4201 + }, + { + "epoch": 0.30357433128036554, + "grad_norm": 6.221728661050583, + "learning_rate": 4.8080583893443675e-06, + "loss": 0.8512, + "step": 4202 + }, + { + "epoch": 0.3036465764805751, + "grad_norm": 9.035977825541178, + "learning_rate": 4.807945975479937e-06, + "loss": 1.0117, + "step": 4203 + }, + { + "epoch": 0.30371882168078457, + "grad_norm": 5.279529539591935, + "learning_rate": 4.807833530021518e-06, + "loss": 0.8448, + "step": 4204 + }, + { + "epoch": 0.3037910668809941, + "grad_norm": 6.1913554192511215, + "learning_rate": 4.807721052970651e-06, + "loss": 0.9157, + "step": 4205 + }, + { + "epoch": 0.3038633120812036, + "grad_norm": 6.620882878279197, + "learning_rate": 4.807608544328873e-06, + "loss": 0.8626, + "step": 4206 + }, + { + "epoch": 0.30393555728141314, + "grad_norm": 5.9110488368859535, + "learning_rate": 4.807496004097728e-06, + "loss": 0.962, + "step": 4207 + }, + { + "epoch": 0.3040078024816226, + "grad_norm": 6.641260884076991, + "learning_rate": 4.8073834322787526e-06, + "loss": 0.9156, + "step": 4208 + }, + { + "epoch": 0.3040800476818321, + "grad_norm": 7.487987307578521, + "learning_rate": 4.80727082887349e-06, + "loss": 0.8965, + "step": 4209 + }, + { + "epoch": 0.30415229288204165, + "grad_norm": 7.79779695552655, + "learning_rate": 4.807158193883481e-06, + "loss": 0.9325, + "step": 4210 + }, + { + "epoch": 0.30422453808225114, + "grad_norm": 5.473269268285117, + "learning_rate": 4.807045527310268e-06, + "loss": 0.9394, + "step": 4211 + }, + { + "epoch": 0.3042967832824607, + "grad_norm": 6.2773337230590425, + "learning_rate": 4.806932829155393e-06, + "loss": 0.956, + "step": 4212 + }, + { + "epoch": 0.30436902848267017, + "grad_norm": 10.167428868806809, + "learning_rate": 4.806820099420398e-06, + "loss": 0.99, + "step": 4213 + }, + { + "epoch": 0.3044412736828797, + "grad_norm": 5.823197232369867, + "learning_rate": 4.806707338106829e-06, + "loss": 0.8073, + "step": 4214 + }, + { + "epoch": 0.3045135188830892, + "grad_norm": 7.2987358775445665, + "learning_rate": 4.806594545216225e-06, + "loss": 0.9003, + "step": 4215 + }, + { + "epoch": 0.30458576408329874, + "grad_norm": 8.275006859537376, + "learning_rate": 4.806481720750134e-06, + "loss": 0.9037, + "step": 4216 + }, + { + "epoch": 0.3046580092835082, + "grad_norm": 7.117699882077646, + "learning_rate": 4.8063688647101e-06, + "loss": 0.8696, + "step": 4217 + }, + { + "epoch": 0.3047302544837177, + "grad_norm": 8.784334231634542, + "learning_rate": 4.806255977097666e-06, + "loss": 0.8815, + "step": 4218 + }, + { + "epoch": 0.30480249968392725, + "grad_norm": 7.108873018421487, + "learning_rate": 4.806143057914378e-06, + "loss": 0.9526, + "step": 4219 + }, + { + "epoch": 0.30487474488413674, + "grad_norm": 7.842411489379399, + "learning_rate": 4.806030107161784e-06, + "loss": 0.8845, + "step": 4220 + }, + { + "epoch": 0.3049469900843463, + "grad_norm": 8.292137043197252, + "learning_rate": 4.805917124841426e-06, + "loss": 0.9844, + "step": 4221 + }, + { + "epoch": 0.30501923528455577, + "grad_norm": 5.214892577896406, + "learning_rate": 4.805804110954854e-06, + "loss": 0.7989, + "step": 4222 + }, + { + "epoch": 0.3050914804847653, + "grad_norm": 6.248509343719883, + "learning_rate": 4.805691065503614e-06, + "loss": 0.8924, + "step": 4223 + }, + { + "epoch": 0.3051637256849748, + "grad_norm": 7.283101775491536, + "learning_rate": 4.805577988489253e-06, + "loss": 1.0139, + "step": 4224 + }, + { + "epoch": 0.30523597088518434, + "grad_norm": 7.235833369135752, + "learning_rate": 4.805464879913321e-06, + "loss": 0.9561, + "step": 4225 + }, + { + "epoch": 0.3053082160853938, + "grad_norm": 6.420434052242064, + "learning_rate": 4.805351739777363e-06, + "loss": 1.0233, + "step": 4226 + }, + { + "epoch": 0.3053804612856033, + "grad_norm": 5.612024898003442, + "learning_rate": 4.805238568082931e-06, + "loss": 0.8432, + "step": 4227 + }, + { + "epoch": 0.30545270648581285, + "grad_norm": 7.405245040764656, + "learning_rate": 4.805125364831572e-06, + "loss": 0.9266, + "step": 4228 + }, + { + "epoch": 0.30552495168602234, + "grad_norm": 7.217950421592976, + "learning_rate": 4.805012130024838e-06, + "loss": 0.9654, + "step": 4229 + }, + { + "epoch": 0.3055971968862319, + "grad_norm": 6.721601581909458, + "learning_rate": 4.8048988636642764e-06, + "loss": 0.9542, + "step": 4230 + }, + { + "epoch": 0.30566944208644137, + "grad_norm": 6.098201123757791, + "learning_rate": 4.8047855657514395e-06, + "loss": 0.9087, + "step": 4231 + }, + { + "epoch": 0.3057416872866509, + "grad_norm": 7.641876804647672, + "learning_rate": 4.804672236287877e-06, + "loss": 0.7996, + "step": 4232 + }, + { + "epoch": 0.3058139324868604, + "grad_norm": 6.14429502039147, + "learning_rate": 4.804558875275141e-06, + "loss": 0.8706, + "step": 4233 + }, + { + "epoch": 0.30588617768706994, + "grad_norm": 7.388458386008366, + "learning_rate": 4.8044454827147846e-06, + "loss": 0.9359, + "step": 4234 + }, + { + "epoch": 0.3059584228872794, + "grad_norm": 6.273874688027664, + "learning_rate": 4.8043320586083585e-06, + "loss": 1.0096, + "step": 4235 + }, + { + "epoch": 0.3060306680874889, + "grad_norm": 6.4113594334708965, + "learning_rate": 4.804218602957416e-06, + "loss": 0.8319, + "step": 4236 + }, + { + "epoch": 0.30610291328769845, + "grad_norm": 7.457914401032957, + "learning_rate": 4.804105115763509e-06, + "loss": 0.9501, + "step": 4237 + }, + { + "epoch": 0.30617515848790794, + "grad_norm": 6.519874120327928, + "learning_rate": 4.803991597028193e-06, + "loss": 0.897, + "step": 4238 + }, + { + "epoch": 0.3062474036881175, + "grad_norm": 5.953300253223357, + "learning_rate": 4.80387804675302e-06, + "loss": 0.9234, + "step": 4239 + }, + { + "epoch": 0.30631964888832697, + "grad_norm": 5.855472658854599, + "learning_rate": 4.803764464939545e-06, + "loss": 0.905, + "step": 4240 + }, + { + "epoch": 0.3063918940885365, + "grad_norm": 7.484515672608056, + "learning_rate": 4.803650851589324e-06, + "loss": 0.9019, + "step": 4241 + }, + { + "epoch": 0.306464139288746, + "grad_norm": 5.9204512120032655, + "learning_rate": 4.803537206703912e-06, + "loss": 0.9142, + "step": 4242 + }, + { + "epoch": 0.30653638448895554, + "grad_norm": 7.8822349132579905, + "learning_rate": 4.803423530284864e-06, + "loss": 0.9244, + "step": 4243 + }, + { + "epoch": 0.306608629689165, + "grad_norm": 7.1504314372469535, + "learning_rate": 4.803309822333736e-06, + "loss": 0.8779, + "step": 4244 + }, + { + "epoch": 0.3066808748893745, + "grad_norm": 6.366114688665925, + "learning_rate": 4.803196082852085e-06, + "loss": 0.9106, + "step": 4245 + }, + { + "epoch": 0.30675312008958405, + "grad_norm": 8.05891563523408, + "learning_rate": 4.803082311841468e-06, + "loss": 0.971, + "step": 4246 + }, + { + "epoch": 0.30682536528979354, + "grad_norm": 7.23441085641762, + "learning_rate": 4.8029685093034415e-06, + "loss": 0.8967, + "step": 4247 + }, + { + "epoch": 0.3068976104900031, + "grad_norm": 6.426011287200827, + "learning_rate": 4.802854675239566e-06, + "loss": 0.9262, + "step": 4248 + }, + { + "epoch": 0.30696985569021257, + "grad_norm": 5.932519369692555, + "learning_rate": 4.802740809651397e-06, + "loss": 0.9095, + "step": 4249 + }, + { + "epoch": 0.3070421008904221, + "grad_norm": 7.306425000159721, + "learning_rate": 4.802626912540494e-06, + "loss": 0.9321, + "step": 4250 + }, + { + "epoch": 0.3071143460906316, + "grad_norm": 6.131741608831108, + "learning_rate": 4.802512983908417e-06, + "loss": 0.9014, + "step": 4251 + }, + { + "epoch": 0.30718659129084114, + "grad_norm": 6.485952603647194, + "learning_rate": 4.802399023756724e-06, + "loss": 0.8687, + "step": 4252 + }, + { + "epoch": 0.3072588364910506, + "grad_norm": 6.992914428431764, + "learning_rate": 4.802285032086977e-06, + "loss": 0.9631, + "step": 4253 + }, + { + "epoch": 0.3073310816912601, + "grad_norm": 6.109868880892553, + "learning_rate": 4.802171008900736e-06, + "loss": 0.9342, + "step": 4254 + }, + { + "epoch": 0.30740332689146965, + "grad_norm": 6.1470326778200475, + "learning_rate": 4.802056954199561e-06, + "loss": 0.8696, + "step": 4255 + }, + { + "epoch": 0.30747557209167914, + "grad_norm": 6.351305141138285, + "learning_rate": 4.801942867985013e-06, + "loss": 0.9277, + "step": 4256 + }, + { + "epoch": 0.3075478172918887, + "grad_norm": 7.14790757513251, + "learning_rate": 4.801828750258656e-06, + "loss": 0.8974, + "step": 4257 + }, + { + "epoch": 0.30762006249209817, + "grad_norm": 7.785605318599316, + "learning_rate": 4.801714601022049e-06, + "loss": 0.9658, + "step": 4258 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 6.997661472432321, + "learning_rate": 4.801600420276757e-06, + "loss": 0.903, + "step": 4259 + }, + { + "epoch": 0.3077645528925172, + "grad_norm": 6.66066713583076, + "learning_rate": 4.801486208024343e-06, + "loss": 0.9772, + "step": 4260 + }, + { + "epoch": 0.30783679809272674, + "grad_norm": 6.474980546487578, + "learning_rate": 4.801371964266369e-06, + "loss": 0.7631, + "step": 4261 + }, + { + "epoch": 0.3079090432929362, + "grad_norm": 5.634744638175822, + "learning_rate": 4.8012576890044e-06, + "loss": 0.8155, + "step": 4262 + }, + { + "epoch": 0.3079812884931457, + "grad_norm": 7.481783041770916, + "learning_rate": 4.80114338224e-06, + "loss": 0.9829, + "step": 4263 + }, + { + "epoch": 0.30805353369335525, + "grad_norm": 6.857350669845354, + "learning_rate": 4.801029043974733e-06, + "loss": 0.9063, + "step": 4264 + }, + { + "epoch": 0.30812577889356474, + "grad_norm": 6.446787236587549, + "learning_rate": 4.800914674210166e-06, + "loss": 0.8139, + "step": 4265 + }, + { + "epoch": 0.3081980240937743, + "grad_norm": 7.897659640774201, + "learning_rate": 4.800800272947863e-06, + "loss": 0.9484, + "step": 4266 + }, + { + "epoch": 0.30827026929398377, + "grad_norm": 7.621979662437402, + "learning_rate": 4.800685840189392e-06, + "loss": 0.9624, + "step": 4267 + }, + { + "epoch": 0.3083425144941933, + "grad_norm": 6.749974568636988, + "learning_rate": 4.800571375936317e-06, + "loss": 0.903, + "step": 4268 + }, + { + "epoch": 0.3084147596944028, + "grad_norm": 6.419996743364786, + "learning_rate": 4.800456880190207e-06, + "loss": 0.8461, + "step": 4269 + }, + { + "epoch": 0.30848700489461234, + "grad_norm": 7.559118097930276, + "learning_rate": 4.800342352952627e-06, + "loss": 1.0204, + "step": 4270 + }, + { + "epoch": 0.3085592500948218, + "grad_norm": 6.958584474459126, + "learning_rate": 4.800227794225147e-06, + "loss": 0.9452, + "step": 4271 + }, + { + "epoch": 0.3086314952950313, + "grad_norm": 5.9739711753576366, + "learning_rate": 4.800113204009336e-06, + "loss": 0.8881, + "step": 4272 + }, + { + "epoch": 0.30870374049524085, + "grad_norm": 6.503191531206253, + "learning_rate": 4.79999858230676e-06, + "loss": 0.9188, + "step": 4273 + }, + { + "epoch": 0.30877598569545034, + "grad_norm": 7.183779342270177, + "learning_rate": 4.799883929118988e-06, + "loss": 0.9652, + "step": 4274 + }, + { + "epoch": 0.3088482308956599, + "grad_norm": 7.2537187707160715, + "learning_rate": 4.7997692444475925e-06, + "loss": 1.0026, + "step": 4275 + }, + { + "epoch": 0.30892047609586937, + "grad_norm": 7.928650255437358, + "learning_rate": 4.799654528294141e-06, + "loss": 0.9672, + "step": 4276 + }, + { + "epoch": 0.3089927212960789, + "grad_norm": 5.259989906852525, + "learning_rate": 4.799539780660205e-06, + "loss": 0.8666, + "step": 4277 + }, + { + "epoch": 0.3090649664962884, + "grad_norm": 7.49081086846017, + "learning_rate": 4.799425001547354e-06, + "loss": 0.9368, + "step": 4278 + }, + { + "epoch": 0.30913721169649794, + "grad_norm": 5.990939610935417, + "learning_rate": 4.799310190957161e-06, + "loss": 0.8665, + "step": 4279 + }, + { + "epoch": 0.3092094568967074, + "grad_norm": 8.11766119888258, + "learning_rate": 4.799195348891197e-06, + "loss": 0.9205, + "step": 4280 + }, + { + "epoch": 0.3092817020969169, + "grad_norm": 7.1683061483944295, + "learning_rate": 4.799080475351032e-06, + "loss": 0.9352, + "step": 4281 + }, + { + "epoch": 0.30935394729712645, + "grad_norm": 6.221264814712149, + "learning_rate": 4.798965570338243e-06, + "loss": 0.8914, + "step": 4282 + }, + { + "epoch": 0.30942619249733594, + "grad_norm": 8.497990538985295, + "learning_rate": 4.798850633854399e-06, + "loss": 0.8713, + "step": 4283 + }, + { + "epoch": 0.3094984376975455, + "grad_norm": 7.662824524919452, + "learning_rate": 4.798735665901074e-06, + "loss": 0.8973, + "step": 4284 + }, + { + "epoch": 0.30957068289775497, + "grad_norm": 6.392078635389732, + "learning_rate": 4.798620666479844e-06, + "loss": 0.826, + "step": 4285 + }, + { + "epoch": 0.3096429280979645, + "grad_norm": 7.30740857365016, + "learning_rate": 4.798505635592281e-06, + "loss": 0.99, + "step": 4286 + }, + { + "epoch": 0.309715173298174, + "grad_norm": 7.020701994832436, + "learning_rate": 4.798390573239962e-06, + "loss": 0.9204, + "step": 4287 + }, + { + "epoch": 0.30978741849838354, + "grad_norm": 7.331076997372641, + "learning_rate": 4.798275479424459e-06, + "loss": 0.9785, + "step": 4288 + }, + { + "epoch": 0.309859663698593, + "grad_norm": 8.78285110375567, + "learning_rate": 4.798160354147349e-06, + "loss": 0.9907, + "step": 4289 + }, + { + "epoch": 0.3099319088988025, + "grad_norm": 7.614982904125718, + "learning_rate": 4.79804519741021e-06, + "loss": 0.946, + "step": 4290 + }, + { + "epoch": 0.31000415409901205, + "grad_norm": 7.048783477362756, + "learning_rate": 4.797930009214615e-06, + "loss": 1.0755, + "step": 4291 + }, + { + "epoch": 0.31007639929922154, + "grad_norm": 7.73511799027567, + "learning_rate": 4.797814789562142e-06, + "loss": 0.9444, + "step": 4292 + }, + { + "epoch": 0.3101486444994311, + "grad_norm": 7.622621008793574, + "learning_rate": 4.79769953845437e-06, + "loss": 0.9546, + "step": 4293 + }, + { + "epoch": 0.31022088969964057, + "grad_norm": 6.667442975939303, + "learning_rate": 4.797584255892875e-06, + "loss": 0.8289, + "step": 4294 + }, + { + "epoch": 0.3102931348998501, + "grad_norm": 5.547412427199836, + "learning_rate": 4.7974689418792356e-06, + "loss": 0.8791, + "step": 4295 + }, + { + "epoch": 0.3103653801000596, + "grad_norm": 6.424980062609661, + "learning_rate": 4.79735359641503e-06, + "loss": 0.9233, + "step": 4296 + }, + { + "epoch": 0.31043762530026914, + "grad_norm": 9.008670868550537, + "learning_rate": 4.797238219501837e-06, + "loss": 0.9512, + "step": 4297 + }, + { + "epoch": 0.3105098705004786, + "grad_norm": 6.135029891235854, + "learning_rate": 4.797122811141237e-06, + "loss": 0.9532, + "step": 4298 + }, + { + "epoch": 0.3105821157006881, + "grad_norm": 6.927153974665259, + "learning_rate": 4.797007371334809e-06, + "loss": 0.9871, + "step": 4299 + }, + { + "epoch": 0.31065436090089765, + "grad_norm": 7.2573389200645435, + "learning_rate": 4.796891900084134e-06, + "loss": 0.9623, + "step": 4300 + }, + { + "epoch": 0.31072660610110714, + "grad_norm": 7.665922073473762, + "learning_rate": 4.796776397390792e-06, + "loss": 1.0481, + "step": 4301 + }, + { + "epoch": 0.3107988513013167, + "grad_norm": 6.87019013431866, + "learning_rate": 4.796660863256365e-06, + "loss": 0.8725, + "step": 4302 + }, + { + "epoch": 0.31087109650152617, + "grad_norm": 7.98201470471822, + "learning_rate": 4.796545297682433e-06, + "loss": 1.0227, + "step": 4303 + }, + { + "epoch": 0.3109433417017357, + "grad_norm": 7.70088407717376, + "learning_rate": 4.796429700670579e-06, + "loss": 0.8846, + "step": 4304 + }, + { + "epoch": 0.3110155869019452, + "grad_norm": 6.861585812571409, + "learning_rate": 4.796314072222386e-06, + "loss": 0.9203, + "step": 4305 + }, + { + "epoch": 0.31108783210215474, + "grad_norm": 6.405505923346976, + "learning_rate": 4.796198412339437e-06, + "loss": 0.8288, + "step": 4306 + }, + { + "epoch": 0.3111600773023642, + "grad_norm": 6.267724196840988, + "learning_rate": 4.796082721023314e-06, + "loss": 0.9118, + "step": 4307 + }, + { + "epoch": 0.3112323225025737, + "grad_norm": 6.495679446443402, + "learning_rate": 4.795966998275602e-06, + "loss": 0.9283, + "step": 4308 + }, + { + "epoch": 0.31130456770278325, + "grad_norm": 6.812522118208791, + "learning_rate": 4.795851244097883e-06, + "loss": 0.8473, + "step": 4309 + }, + { + "epoch": 0.31137681290299274, + "grad_norm": 5.90436180651941, + "learning_rate": 4.795735458491745e-06, + "loss": 0.8978, + "step": 4310 + }, + { + "epoch": 0.3114490581032023, + "grad_norm": 6.222308650249105, + "learning_rate": 4.79561964145877e-06, + "loss": 0.8457, + "step": 4311 + }, + { + "epoch": 0.31152130330341177, + "grad_norm": 6.208900203896287, + "learning_rate": 4.795503793000544e-06, + "loss": 0.8992, + "step": 4312 + }, + { + "epoch": 0.3115935485036213, + "grad_norm": 7.205400550186162, + "learning_rate": 4.7953879131186544e-06, + "loss": 0.9245, + "step": 4313 + }, + { + "epoch": 0.3116657937038308, + "grad_norm": 6.415976086731295, + "learning_rate": 4.795272001814686e-06, + "loss": 0.9259, + "step": 4314 + }, + { + "epoch": 0.31173803890404034, + "grad_norm": 6.552285228179996, + "learning_rate": 4.795156059090225e-06, + "loss": 0.9431, + "step": 4315 + }, + { + "epoch": 0.3118102841042498, + "grad_norm": 6.230614773465157, + "learning_rate": 4.795040084946862e-06, + "loss": 0.8699, + "step": 4316 + }, + { + "epoch": 0.3118825293044593, + "grad_norm": 8.992926466975568, + "learning_rate": 4.79492407938618e-06, + "loss": 1.0185, + "step": 4317 + }, + { + "epoch": 0.31195477450466885, + "grad_norm": 7.039822248468996, + "learning_rate": 4.794808042409771e-06, + "loss": 0.9162, + "step": 4318 + }, + { + "epoch": 0.31202701970487834, + "grad_norm": 5.758169590139612, + "learning_rate": 4.794691974019221e-06, + "loss": 0.8987, + "step": 4319 + }, + { + "epoch": 0.3120992649050879, + "grad_norm": 6.823800050657087, + "learning_rate": 4.79457587421612e-06, + "loss": 0.9665, + "step": 4320 + }, + { + "epoch": 0.31217151010529737, + "grad_norm": 7.722757758484998, + "learning_rate": 4.794459743002056e-06, + "loss": 0.8884, + "step": 4321 + }, + { + "epoch": 0.3122437553055069, + "grad_norm": 5.929783032518487, + "learning_rate": 4.7943435803786204e-06, + "loss": 0.8389, + "step": 4322 + }, + { + "epoch": 0.3123160005057164, + "grad_norm": 6.7958672138104745, + "learning_rate": 4.794227386347402e-06, + "loss": 0.9172, + "step": 4323 + }, + { + "epoch": 0.31238824570592594, + "grad_norm": 6.243369590924195, + "learning_rate": 4.794111160909993e-06, + "loss": 0.9618, + "step": 4324 + }, + { + "epoch": 0.3124604909061354, + "grad_norm": 6.193480404742214, + "learning_rate": 4.793994904067982e-06, + "loss": 0.8618, + "step": 4325 + }, + { + "epoch": 0.3125327361063449, + "grad_norm": 6.783285512294805, + "learning_rate": 4.793878615822964e-06, + "loss": 0.9588, + "step": 4326 + }, + { + "epoch": 0.31260498130655445, + "grad_norm": 7.523803068253666, + "learning_rate": 4.793762296176527e-06, + "loss": 0.828, + "step": 4327 + }, + { + "epoch": 0.31267722650676394, + "grad_norm": 5.920725366125673, + "learning_rate": 4.7936459451302655e-06, + "loss": 0.9042, + "step": 4328 + }, + { + "epoch": 0.3127494717069735, + "grad_norm": 6.770580423962465, + "learning_rate": 4.7935295626857725e-06, + "loss": 0.985, + "step": 4329 + }, + { + "epoch": 0.31282171690718297, + "grad_norm": 6.453046673537207, + "learning_rate": 4.79341314884464e-06, + "loss": 0.8557, + "step": 4330 + }, + { + "epoch": 0.3128939621073925, + "grad_norm": 5.863304346535955, + "learning_rate": 4.793296703608463e-06, + "loss": 0.8685, + "step": 4331 + }, + { + "epoch": 0.312966207307602, + "grad_norm": 6.522762642862086, + "learning_rate": 4.793180226978834e-06, + "loss": 0.8483, + "step": 4332 + }, + { + "epoch": 0.31303845250781154, + "grad_norm": 6.422761804835997, + "learning_rate": 4.793063718957348e-06, + "loss": 0.9671, + "step": 4333 + }, + { + "epoch": 0.313110697708021, + "grad_norm": 6.072903545315299, + "learning_rate": 4.7929471795456015e-06, + "loss": 1.005, + "step": 4334 + }, + { + "epoch": 0.3131829429082305, + "grad_norm": 6.419485422588037, + "learning_rate": 4.792830608745187e-06, + "loss": 0.8755, + "step": 4335 + }, + { + "epoch": 0.31325518810844005, + "grad_norm": 7.0060210899233795, + "learning_rate": 4.792714006557703e-06, + "loss": 0.8056, + "step": 4336 + }, + { + "epoch": 0.31332743330864954, + "grad_norm": 7.737462392786485, + "learning_rate": 4.792597372984743e-06, + "loss": 0.922, + "step": 4337 + }, + { + "epoch": 0.3133996785088591, + "grad_norm": 8.24826493947174, + "learning_rate": 4.792480708027906e-06, + "loss": 0.9316, + "step": 4338 + }, + { + "epoch": 0.31347192370906857, + "grad_norm": 7.204417346529733, + "learning_rate": 4.792364011688788e-06, + "loss": 0.9202, + "step": 4339 + }, + { + "epoch": 0.3135441689092781, + "grad_norm": 7.306736166175676, + "learning_rate": 4.792247283968986e-06, + "loss": 0.9152, + "step": 4340 + }, + { + "epoch": 0.3136164141094876, + "grad_norm": 7.149517773992507, + "learning_rate": 4.7921305248701e-06, + "loss": 0.9663, + "step": 4341 + }, + { + "epoch": 0.31368865930969714, + "grad_norm": 7.6137071375876495, + "learning_rate": 4.7920137343937256e-06, + "loss": 1.0001, + "step": 4342 + }, + { + "epoch": 0.3137609045099066, + "grad_norm": 8.099795642676733, + "learning_rate": 4.791896912541463e-06, + "loss": 0.9373, + "step": 4343 + }, + { + "epoch": 0.3138331497101161, + "grad_norm": 7.223991835850425, + "learning_rate": 4.791780059314911e-06, + "loss": 0.9701, + "step": 4344 + }, + { + "epoch": 0.31390539491032565, + "grad_norm": 8.547524380466427, + "learning_rate": 4.79166317471567e-06, + "loss": 0.855, + "step": 4345 + }, + { + "epoch": 0.31397764011053514, + "grad_norm": 8.562867985078528, + "learning_rate": 4.791546258745339e-06, + "loss": 0.9034, + "step": 4346 + }, + { + "epoch": 0.3140498853107447, + "grad_norm": 6.618685322014161, + "learning_rate": 4.791429311405518e-06, + "loss": 0.8505, + "step": 4347 + }, + { + "epoch": 0.31412213051095417, + "grad_norm": 7.263732269848409, + "learning_rate": 4.791312332697811e-06, + "loss": 0.9553, + "step": 4348 + }, + { + "epoch": 0.3141943757111637, + "grad_norm": 6.733855260632097, + "learning_rate": 4.791195322623816e-06, + "loss": 0.919, + "step": 4349 + }, + { + "epoch": 0.3142666209113732, + "grad_norm": 6.060143081436318, + "learning_rate": 4.791078281185137e-06, + "loss": 0.9405, + "step": 4350 + }, + { + "epoch": 0.31433886611158274, + "grad_norm": 7.304225837684006, + "learning_rate": 4.790961208383374e-06, + "loss": 0.9026, + "step": 4351 + }, + { + "epoch": 0.3144111113117922, + "grad_norm": 7.845773139624116, + "learning_rate": 4.790844104220132e-06, + "loss": 0.9337, + "step": 4352 + }, + { + "epoch": 0.3144833565120017, + "grad_norm": 6.242038386047512, + "learning_rate": 4.7907269686970125e-06, + "loss": 0.9252, + "step": 4353 + }, + { + "epoch": 0.31455560171221125, + "grad_norm": 8.039887173748246, + "learning_rate": 4.79060980181562e-06, + "loss": 0.8475, + "step": 4354 + }, + { + "epoch": 0.31462784691242074, + "grad_norm": 6.256695022532301, + "learning_rate": 4.790492603577557e-06, + "loss": 0.8585, + "step": 4355 + }, + { + "epoch": 0.3147000921126303, + "grad_norm": 6.346978930209816, + "learning_rate": 4.790375373984429e-06, + "loss": 0.9449, + "step": 4356 + }, + { + "epoch": 0.31477233731283977, + "grad_norm": 6.1641011786848905, + "learning_rate": 4.79025811303784e-06, + "loss": 0.979, + "step": 4357 + }, + { + "epoch": 0.3148445825130493, + "grad_norm": 7.970064660627295, + "learning_rate": 4.790140820739397e-06, + "loss": 0.8178, + "step": 4358 + }, + { + "epoch": 0.3149168277132588, + "grad_norm": 7.1398877320425616, + "learning_rate": 4.790023497090702e-06, + "loss": 0.8821, + "step": 4359 + }, + { + "epoch": 0.31498907291346834, + "grad_norm": 6.491272275384374, + "learning_rate": 4.789906142093366e-06, + "loss": 0.9122, + "step": 4360 + }, + { + "epoch": 0.3150613181136778, + "grad_norm": 6.5561117414037025, + "learning_rate": 4.789788755748991e-06, + "loss": 0.9251, + "step": 4361 + }, + { + "epoch": 0.3151335633138873, + "grad_norm": 7.508802207789411, + "learning_rate": 4.7896713380591865e-06, + "loss": 1.0746, + "step": 4362 + }, + { + "epoch": 0.31520580851409685, + "grad_norm": 6.379116449671287, + "learning_rate": 4.78955388902556e-06, + "loss": 0.8595, + "step": 4363 + }, + { + "epoch": 0.31527805371430634, + "grad_norm": 6.809683401223756, + "learning_rate": 4.789436408649718e-06, + "loss": 1.014, + "step": 4364 + }, + { + "epoch": 0.3153502989145159, + "grad_norm": 6.090558434049723, + "learning_rate": 4.7893188969332685e-06, + "loss": 0.9111, + "step": 4365 + }, + { + "epoch": 0.31542254411472537, + "grad_norm": 7.374583087063825, + "learning_rate": 4.789201353877822e-06, + "loss": 0.9684, + "step": 4366 + }, + { + "epoch": 0.3154947893149349, + "grad_norm": 6.95887336995735, + "learning_rate": 4.789083779484985e-06, + "loss": 0.9622, + "step": 4367 + }, + { + "epoch": 0.3155670345151444, + "grad_norm": 7.4654002319071004, + "learning_rate": 4.788966173756369e-06, + "loss": 0.8949, + "step": 4368 + }, + { + "epoch": 0.31563927971535394, + "grad_norm": 6.371729591585102, + "learning_rate": 4.788848536693584e-06, + "loss": 0.9534, + "step": 4369 + }, + { + "epoch": 0.3157115249155634, + "grad_norm": 7.078911541822186, + "learning_rate": 4.78873086829824e-06, + "loss": 0.8916, + "step": 4370 + }, + { + "epoch": 0.3157837701157729, + "grad_norm": 6.917929874873126, + "learning_rate": 4.788613168571946e-06, + "loss": 0.8797, + "step": 4371 + }, + { + "epoch": 0.31585601531598245, + "grad_norm": 6.014903632174476, + "learning_rate": 4.788495437516315e-06, + "loss": 0.8881, + "step": 4372 + }, + { + "epoch": 0.31592826051619194, + "grad_norm": 6.8737642044500555, + "learning_rate": 4.788377675132959e-06, + "loss": 0.9434, + "step": 4373 + }, + { + "epoch": 0.3160005057164015, + "grad_norm": 7.04712103099127, + "learning_rate": 4.788259881423489e-06, + "loss": 0.9124, + "step": 4374 + }, + { + "epoch": 0.31607275091661097, + "grad_norm": 7.036591666704524, + "learning_rate": 4.788142056389518e-06, + "loss": 0.8038, + "step": 4375 + }, + { + "epoch": 0.3161449961168205, + "grad_norm": 7.70400964945885, + "learning_rate": 4.788024200032659e-06, + "loss": 0.8363, + "step": 4376 + }, + { + "epoch": 0.31621724131703, + "grad_norm": 6.1433906830408125, + "learning_rate": 4.787906312354525e-06, + "loss": 0.9214, + "step": 4377 + }, + { + "epoch": 0.31628948651723954, + "grad_norm": 7.298322186940281, + "learning_rate": 4.787788393356731e-06, + "loss": 0.8739, + "step": 4378 + }, + { + "epoch": 0.316361731717449, + "grad_norm": 8.240524861924754, + "learning_rate": 4.787670443040889e-06, + "loss": 0.9824, + "step": 4379 + }, + { + "epoch": 0.3164339769176585, + "grad_norm": 6.1379749122124325, + "learning_rate": 4.787552461408616e-06, + "loss": 0.8883, + "step": 4380 + }, + { + "epoch": 0.31650622211786805, + "grad_norm": 6.583862701894123, + "learning_rate": 4.787434448461525e-06, + "loss": 0.8615, + "step": 4381 + }, + { + "epoch": 0.31657846731807754, + "grad_norm": 8.276373582685583, + "learning_rate": 4.7873164042012335e-06, + "loss": 0.9503, + "step": 4382 + }, + { + "epoch": 0.3166507125182871, + "grad_norm": 6.6752955224843795, + "learning_rate": 4.787198328629356e-06, + "loss": 0.9234, + "step": 4383 + }, + { + "epoch": 0.31672295771849657, + "grad_norm": 5.899488989473077, + "learning_rate": 4.787080221747509e-06, + "loss": 0.897, + "step": 4384 + }, + { + "epoch": 0.3167952029187061, + "grad_norm": 6.7292254025127045, + "learning_rate": 4.786962083557309e-06, + "loss": 0.8015, + "step": 4385 + }, + { + "epoch": 0.3168674481189156, + "grad_norm": 7.262796355879905, + "learning_rate": 4.786843914060375e-06, + "loss": 0.8814, + "step": 4386 + }, + { + "epoch": 0.31693969331912514, + "grad_norm": 7.984946631705289, + "learning_rate": 4.786725713258324e-06, + "loss": 0.9795, + "step": 4387 + }, + { + "epoch": 0.3170119385193346, + "grad_norm": 6.742084241586892, + "learning_rate": 4.786607481152772e-06, + "loss": 0.9014, + "step": 4388 + }, + { + "epoch": 0.3170841837195441, + "grad_norm": 6.677238506194272, + "learning_rate": 4.786489217745339e-06, + "loss": 0.9821, + "step": 4389 + }, + { + "epoch": 0.31715642891975365, + "grad_norm": 5.932083068163147, + "learning_rate": 4.786370923037644e-06, + "loss": 0.8552, + "step": 4390 + }, + { + "epoch": 0.31722867411996314, + "grad_norm": 6.886710164221109, + "learning_rate": 4.786252597031307e-06, + "loss": 0.8668, + "step": 4391 + }, + { + "epoch": 0.3173009193201727, + "grad_norm": 6.3634302204976345, + "learning_rate": 4.786134239727947e-06, + "loss": 0.8804, + "step": 4392 + }, + { + "epoch": 0.31737316452038217, + "grad_norm": 7.967202189600843, + "learning_rate": 4.786015851129184e-06, + "loss": 0.8521, + "step": 4393 + }, + { + "epoch": 0.3174454097205917, + "grad_norm": 6.653180341369469, + "learning_rate": 4.785897431236639e-06, + "loss": 0.9712, + "step": 4394 + }, + { + "epoch": 0.3175176549208012, + "grad_norm": 6.765921006984482, + "learning_rate": 4.785778980051934e-06, + "loss": 1.0132, + "step": 4395 + }, + { + "epoch": 0.31758990012101074, + "grad_norm": 5.715585635692597, + "learning_rate": 4.785660497576689e-06, + "loss": 0.8951, + "step": 4396 + }, + { + "epoch": 0.3176621453212202, + "grad_norm": 6.735979566130826, + "learning_rate": 4.785541983812525e-06, + "loss": 0.9498, + "step": 4397 + }, + { + "epoch": 0.3177343905214297, + "grad_norm": 5.572635631664857, + "learning_rate": 4.785423438761067e-06, + "loss": 0.8084, + "step": 4398 + }, + { + "epoch": 0.31780663572163925, + "grad_norm": 8.661107211968169, + "learning_rate": 4.785304862423937e-06, + "loss": 0.9183, + "step": 4399 + }, + { + "epoch": 0.31787888092184874, + "grad_norm": 6.50812697366318, + "learning_rate": 4.7851862548027575e-06, + "loss": 0.8426, + "step": 4400 + }, + { + "epoch": 0.3179511261220583, + "grad_norm": 7.5381511552472515, + "learning_rate": 4.785067615899153e-06, + "loss": 0.9728, + "step": 4401 + }, + { + "epoch": 0.31802337132226777, + "grad_norm": 6.672711143540304, + "learning_rate": 4.784948945714747e-06, + "loss": 0.9388, + "step": 4402 + }, + { + "epoch": 0.3180956165224773, + "grad_norm": 7.552450264972475, + "learning_rate": 4.7848302442511626e-06, + "loss": 0.8919, + "step": 4403 + }, + { + "epoch": 0.3181678617226868, + "grad_norm": 8.239154970115687, + "learning_rate": 4.784711511510028e-06, + "loss": 0.9715, + "step": 4404 + }, + { + "epoch": 0.31824010692289634, + "grad_norm": 7.455335501189919, + "learning_rate": 4.784592747492966e-06, + "loss": 1.0081, + "step": 4405 + }, + { + "epoch": 0.3183123521231058, + "grad_norm": 7.238230656191233, + "learning_rate": 4.784473952201604e-06, + "loss": 0.9194, + "step": 4406 + }, + { + "epoch": 0.3183845973233153, + "grad_norm": 7.157216102653873, + "learning_rate": 4.784355125637567e-06, + "loss": 0.9227, + "step": 4407 + }, + { + "epoch": 0.31845684252352485, + "grad_norm": 6.080860454005172, + "learning_rate": 4.784236267802481e-06, + "loss": 0.9383, + "step": 4408 + }, + { + "epoch": 0.31852908772373434, + "grad_norm": 9.564204444324265, + "learning_rate": 4.7841173786979744e-06, + "loss": 0.9048, + "step": 4409 + }, + { + "epoch": 0.3186013329239439, + "grad_norm": 6.314733129134398, + "learning_rate": 4.783998458325675e-06, + "loss": 0.9295, + "step": 4410 + }, + { + "epoch": 0.31867357812415337, + "grad_norm": 6.735040830900469, + "learning_rate": 4.78387950668721e-06, + "loss": 0.993, + "step": 4411 + }, + { + "epoch": 0.3187458233243629, + "grad_norm": 9.477961626415084, + "learning_rate": 4.7837605237842076e-06, + "loss": 1.0559, + "step": 4412 + }, + { + "epoch": 0.3188180685245724, + "grad_norm": 7.869922757187745, + "learning_rate": 4.783641509618297e-06, + "loss": 0.9046, + "step": 4413 + }, + { + "epoch": 0.31889031372478194, + "grad_norm": 6.866428927901594, + "learning_rate": 4.783522464191107e-06, + "loss": 0.9316, + "step": 4414 + }, + { + "epoch": 0.3189625589249914, + "grad_norm": 7.139047794907158, + "learning_rate": 4.783403387504268e-06, + "loss": 0.9148, + "step": 4415 + }, + { + "epoch": 0.3190348041252009, + "grad_norm": 7.344852186238693, + "learning_rate": 4.783284279559409e-06, + "loss": 0.8412, + "step": 4416 + }, + { + "epoch": 0.31910704932541045, + "grad_norm": 6.022296485740154, + "learning_rate": 4.7831651403581615e-06, + "loss": 1.02, + "step": 4417 + }, + { + "epoch": 0.31917929452561994, + "grad_norm": 6.26193714772362, + "learning_rate": 4.783045969902156e-06, + "loss": 0.8657, + "step": 4418 + }, + { + "epoch": 0.3192515397258295, + "grad_norm": 7.385179638701989, + "learning_rate": 4.782926768193024e-06, + "loss": 0.8774, + "step": 4419 + }, + { + "epoch": 0.31932378492603897, + "grad_norm": 6.432732159027387, + "learning_rate": 4.782807535232396e-06, + "loss": 0.904, + "step": 4420 + }, + { + "epoch": 0.3193960301262485, + "grad_norm": 6.25816483285598, + "learning_rate": 4.782688271021907e-06, + "loss": 0.9265, + "step": 4421 + }, + { + "epoch": 0.319468275326458, + "grad_norm": 7.090358377069752, + "learning_rate": 4.782568975563187e-06, + "loss": 0.8973, + "step": 4422 + }, + { + "epoch": 0.3195405205266675, + "grad_norm": 7.486716905506269, + "learning_rate": 4.78244964885787e-06, + "loss": 1.0153, + "step": 4423 + }, + { + "epoch": 0.319612765726877, + "grad_norm": 6.217996676262832, + "learning_rate": 4.782330290907589e-06, + "loss": 0.9349, + "step": 4424 + }, + { + "epoch": 0.3196850109270865, + "grad_norm": 6.046021078372697, + "learning_rate": 4.7822109017139795e-06, + "loss": 0.8841, + "step": 4425 + }, + { + "epoch": 0.31975725612729605, + "grad_norm": 6.529910009444641, + "learning_rate": 4.782091481278674e-06, + "loss": 0.8892, + "step": 4426 + }, + { + "epoch": 0.31982950132750554, + "grad_norm": 7.327995380483445, + "learning_rate": 4.781972029603309e-06, + "loss": 0.9197, + "step": 4427 + }, + { + "epoch": 0.3199017465277151, + "grad_norm": 5.623955014967311, + "learning_rate": 4.781852546689518e-06, + "loss": 0.9416, + "step": 4428 + }, + { + "epoch": 0.31997399172792457, + "grad_norm": 7.078633473302967, + "learning_rate": 4.781733032538938e-06, + "loss": 1.022, + "step": 4429 + }, + { + "epoch": 0.3200462369281341, + "grad_norm": 5.682478480031877, + "learning_rate": 4.781613487153204e-06, + "loss": 0.77, + "step": 4430 + }, + { + "epoch": 0.3201184821283436, + "grad_norm": 8.947412502546346, + "learning_rate": 4.781493910533953e-06, + "loss": 0.9351, + "step": 4431 + }, + { + "epoch": 0.3201907273285531, + "grad_norm": 7.0119184120249525, + "learning_rate": 4.781374302682822e-06, + "loss": 0.9445, + "step": 4432 + }, + { + "epoch": 0.3202629725287626, + "grad_norm": 7.412847614339443, + "learning_rate": 4.7812546636014474e-06, + "loss": 0.8986, + "step": 4433 + }, + { + "epoch": 0.3203352177289721, + "grad_norm": 6.406645674232813, + "learning_rate": 4.781134993291468e-06, + "loss": 0.8933, + "step": 4434 + }, + { + "epoch": 0.32040746292918165, + "grad_norm": 6.672965539075813, + "learning_rate": 4.781015291754523e-06, + "loss": 0.8576, + "step": 4435 + }, + { + "epoch": 0.32047970812939114, + "grad_norm": 8.43512833783124, + "learning_rate": 4.780895558992248e-06, + "loss": 0.95, + "step": 4436 + }, + { + "epoch": 0.3205519533296007, + "grad_norm": 6.559566459751176, + "learning_rate": 4.7807757950062846e-06, + "loss": 0.8535, + "step": 4437 + }, + { + "epoch": 0.32062419852981017, + "grad_norm": 7.653453580906327, + "learning_rate": 4.780655999798272e-06, + "loss": 0.9002, + "step": 4438 + }, + { + "epoch": 0.3206964437300197, + "grad_norm": 6.3945175291926715, + "learning_rate": 4.7805361733698485e-06, + "loss": 0.9394, + "step": 4439 + }, + { + "epoch": 0.3207686889302292, + "grad_norm": 5.8840661403898595, + "learning_rate": 4.7804163157226555e-06, + "loss": 0.8854, + "step": 4440 + }, + { + "epoch": 0.3208409341304387, + "grad_norm": 8.057177302200913, + "learning_rate": 4.780296426858335e-06, + "loss": 0.8932, + "step": 4441 + }, + { + "epoch": 0.3209131793306482, + "grad_norm": 7.594408548315693, + "learning_rate": 4.780176506778526e-06, + "loss": 0.9099, + "step": 4442 + }, + { + "epoch": 0.3209854245308577, + "grad_norm": 8.094199149729903, + "learning_rate": 4.780056555484871e-06, + "loss": 0.9489, + "step": 4443 + }, + { + "epoch": 0.32105766973106725, + "grad_norm": 6.350614394854932, + "learning_rate": 4.779936572979012e-06, + "loss": 0.8506, + "step": 4444 + }, + { + "epoch": 0.32112991493127674, + "grad_norm": 5.644639211499963, + "learning_rate": 4.779816559262593e-06, + "loss": 0.8372, + "step": 4445 + }, + { + "epoch": 0.3212021601314863, + "grad_norm": 5.5544991548479015, + "learning_rate": 4.779696514337255e-06, + "loss": 0.983, + "step": 4446 + }, + { + "epoch": 0.32127440533169577, + "grad_norm": 6.245640874861774, + "learning_rate": 4.779576438204641e-06, + "loss": 0.9362, + "step": 4447 + }, + { + "epoch": 0.3213466505319053, + "grad_norm": 6.4046810554942875, + "learning_rate": 4.779456330866396e-06, + "loss": 0.8656, + "step": 4448 + }, + { + "epoch": 0.3214188957321148, + "grad_norm": 7.692929715169576, + "learning_rate": 4.779336192324163e-06, + "loss": 0.9145, + "step": 4449 + }, + { + "epoch": 0.3214911409323243, + "grad_norm": 7.003754834880314, + "learning_rate": 4.779216022579587e-06, + "loss": 0.8829, + "step": 4450 + }, + { + "epoch": 0.3215633861325338, + "grad_norm": 6.8978665246257655, + "learning_rate": 4.779095821634314e-06, + "loss": 0.9175, + "step": 4451 + }, + { + "epoch": 0.3216356313327433, + "grad_norm": 9.00507593066765, + "learning_rate": 4.778975589489989e-06, + "loss": 1.0036, + "step": 4452 + }, + { + "epoch": 0.32170787653295285, + "grad_norm": 5.629203243216786, + "learning_rate": 4.778855326148259e-06, + "loss": 0.8968, + "step": 4453 + }, + { + "epoch": 0.32178012173316234, + "grad_norm": 6.9777221399398695, + "learning_rate": 4.778735031610767e-06, + "loss": 0.9308, + "step": 4454 + }, + { + "epoch": 0.3218523669333719, + "grad_norm": 6.2030542787608995, + "learning_rate": 4.778614705879162e-06, + "loss": 0.8868, + "step": 4455 + }, + { + "epoch": 0.32192461213358137, + "grad_norm": 5.833963777897186, + "learning_rate": 4.778494348955092e-06, + "loss": 0.8668, + "step": 4456 + }, + { + "epoch": 0.3219968573337909, + "grad_norm": 7.066678321876751, + "learning_rate": 4.778373960840203e-06, + "loss": 0.9945, + "step": 4457 + }, + { + "epoch": 0.3220691025340004, + "grad_norm": 7.94976915708116, + "learning_rate": 4.778253541536143e-06, + "loss": 0.9241, + "step": 4458 + }, + { + "epoch": 0.3221413477342099, + "grad_norm": 5.859873025709807, + "learning_rate": 4.7781330910445615e-06, + "loss": 0.9077, + "step": 4459 + }, + { + "epoch": 0.3222135929344194, + "grad_norm": 6.985346852389891, + "learning_rate": 4.778012609367106e-06, + "loss": 0.8862, + "step": 4460 + }, + { + "epoch": 0.3222858381346289, + "grad_norm": 7.259220705268764, + "learning_rate": 4.777892096505427e-06, + "loss": 1.0466, + "step": 4461 + }, + { + "epoch": 0.32235808333483845, + "grad_norm": 6.437073924954725, + "learning_rate": 4.777771552461174e-06, + "loss": 0.9445, + "step": 4462 + }, + { + "epoch": 0.32243032853504794, + "grad_norm": 7.162126685195198, + "learning_rate": 4.777650977235997e-06, + "loss": 0.9816, + "step": 4463 + }, + { + "epoch": 0.3225025737352575, + "grad_norm": 6.702197315249609, + "learning_rate": 4.777530370831547e-06, + "loss": 0.8693, + "step": 4464 + }, + { + "epoch": 0.32257481893546697, + "grad_norm": 6.025637213856125, + "learning_rate": 4.777409733249475e-06, + "loss": 0.8574, + "step": 4465 + }, + { + "epoch": 0.3226470641356765, + "grad_norm": 6.2627266056689335, + "learning_rate": 4.777289064491431e-06, + "loss": 0.9002, + "step": 4466 + }, + { + "epoch": 0.322719309335886, + "grad_norm": 7.539317768103906, + "learning_rate": 4.777168364559069e-06, + "loss": 0.9931, + "step": 4467 + }, + { + "epoch": 0.3227915545360955, + "grad_norm": 7.327980804651432, + "learning_rate": 4.777047633454038e-06, + "loss": 0.9416, + "step": 4468 + }, + { + "epoch": 0.322863799736305, + "grad_norm": 8.72352780926185, + "learning_rate": 4.776926871177995e-06, + "loss": 0.9503, + "step": 4469 + }, + { + "epoch": 0.3229360449365145, + "grad_norm": 7.183165993330806, + "learning_rate": 4.776806077732591e-06, + "loss": 0.8904, + "step": 4470 + }, + { + "epoch": 0.32300829013672405, + "grad_norm": 8.320208882864192, + "learning_rate": 4.776685253119478e-06, + "loss": 0.9694, + "step": 4471 + }, + { + "epoch": 0.32308053533693354, + "grad_norm": 6.306199054493331, + "learning_rate": 4.776564397340313e-06, + "loss": 0.8731, + "step": 4472 + }, + { + "epoch": 0.3231527805371431, + "grad_norm": 5.3968074120650185, + "learning_rate": 4.776443510396749e-06, + "loss": 0.8529, + "step": 4473 + }, + { + "epoch": 0.32322502573735257, + "grad_norm": 6.966870520619299, + "learning_rate": 4.776322592290441e-06, + "loss": 0.8997, + "step": 4474 + }, + { + "epoch": 0.3232972709375621, + "grad_norm": 7.795978135259823, + "learning_rate": 4.776201643023044e-06, + "loss": 0.9264, + "step": 4475 + }, + { + "epoch": 0.3233695161377716, + "grad_norm": 8.381435825185278, + "learning_rate": 4.776080662596213e-06, + "loss": 0.8795, + "step": 4476 + }, + { + "epoch": 0.3234417613379811, + "grad_norm": 6.909541799808492, + "learning_rate": 4.775959651011606e-06, + "loss": 0.856, + "step": 4477 + }, + { + "epoch": 0.3235140065381906, + "grad_norm": 6.067451264225367, + "learning_rate": 4.775838608270878e-06, + "loss": 0.9753, + "step": 4478 + }, + { + "epoch": 0.3235862517384001, + "grad_norm": 6.930652159903299, + "learning_rate": 4.775717534375686e-06, + "loss": 0.9508, + "step": 4479 + }, + { + "epoch": 0.32365849693860965, + "grad_norm": 6.582688151103448, + "learning_rate": 4.775596429327689e-06, + "loss": 0.9438, + "step": 4480 + }, + { + "epoch": 0.32373074213881914, + "grad_norm": 5.376350410621779, + "learning_rate": 4.775475293128544e-06, + "loss": 0.9571, + "step": 4481 + }, + { + "epoch": 0.3238029873390287, + "grad_norm": 8.429596993258338, + "learning_rate": 4.7753541257799085e-06, + "loss": 0.9121, + "step": 4482 + }, + { + "epoch": 0.32387523253923817, + "grad_norm": 6.692273584240139, + "learning_rate": 4.775232927283442e-06, + "loss": 0.8995, + "step": 4483 + }, + { + "epoch": 0.3239474777394477, + "grad_norm": 6.448731633348924, + "learning_rate": 4.775111697640803e-06, + "loss": 0.9495, + "step": 4484 + }, + { + "epoch": 0.3240197229396572, + "grad_norm": 8.307280084647752, + "learning_rate": 4.774990436853651e-06, + "loss": 0.9413, + "step": 4485 + }, + { + "epoch": 0.3240919681398667, + "grad_norm": 7.292416372359135, + "learning_rate": 4.774869144923646e-06, + "loss": 0.8757, + "step": 4486 + }, + { + "epoch": 0.3241642133400762, + "grad_norm": 8.375286495945089, + "learning_rate": 4.7747478218524505e-06, + "loss": 0.977, + "step": 4487 + }, + { + "epoch": 0.3242364585402857, + "grad_norm": 6.412759590471605, + "learning_rate": 4.774626467641722e-06, + "loss": 0.9213, + "step": 4488 + }, + { + "epoch": 0.32430870374049525, + "grad_norm": 6.512406466758755, + "learning_rate": 4.774505082293124e-06, + "loss": 0.904, + "step": 4489 + }, + { + "epoch": 0.32438094894070474, + "grad_norm": 6.413459108246836, + "learning_rate": 4.774383665808317e-06, + "loss": 0.9579, + "step": 4490 + }, + { + "epoch": 0.3244531941409143, + "grad_norm": 6.203187111031929, + "learning_rate": 4.774262218188964e-06, + "loss": 0.9124, + "step": 4491 + }, + { + "epoch": 0.32452543934112377, + "grad_norm": 6.085360185029558, + "learning_rate": 4.774140739436727e-06, + "loss": 0.848, + "step": 4492 + }, + { + "epoch": 0.3245976845413333, + "grad_norm": 6.965707431201072, + "learning_rate": 4.774019229553268e-06, + "loss": 0.9196, + "step": 4493 + }, + { + "epoch": 0.3246699297415428, + "grad_norm": 7.074802182940126, + "learning_rate": 4.773897688540253e-06, + "loss": 0.902, + "step": 4494 + }, + { + "epoch": 0.3247421749417523, + "grad_norm": 5.584103583350674, + "learning_rate": 4.773776116399343e-06, + "loss": 0.8599, + "step": 4495 + }, + { + "epoch": 0.3248144201419618, + "grad_norm": 32.23463629923176, + "learning_rate": 4.773654513132204e-06, + "loss": 0.9569, + "step": 4496 + }, + { + "epoch": 0.3248866653421713, + "grad_norm": 6.267827966643117, + "learning_rate": 4.773532878740501e-06, + "loss": 0.9222, + "step": 4497 + }, + { + "epoch": 0.32495891054238085, + "grad_norm": 6.070513276421441, + "learning_rate": 4.773411213225897e-06, + "loss": 0.972, + "step": 4498 + }, + { + "epoch": 0.32503115574259034, + "grad_norm": 8.01980142932606, + "learning_rate": 4.773289516590059e-06, + "loss": 0.9311, + "step": 4499 + }, + { + "epoch": 0.3251034009427999, + "grad_norm": 6.577840549725071, + "learning_rate": 4.773167788834653e-06, + "loss": 0.9293, + "step": 4500 + }, + { + "epoch": 0.32517564614300937, + "grad_norm": 7.457140978190202, + "learning_rate": 4.773046029961343e-06, + "loss": 0.9481, + "step": 4501 + }, + { + "epoch": 0.3252478913432189, + "grad_norm": 6.55479312852539, + "learning_rate": 4.7729242399718e-06, + "loss": 0.9891, + "step": 4502 + }, + { + "epoch": 0.3253201365434284, + "grad_norm": 6.125988316110397, + "learning_rate": 4.772802418867688e-06, + "loss": 0.8283, + "step": 4503 + }, + { + "epoch": 0.3253923817436379, + "grad_norm": 7.981916015366686, + "learning_rate": 4.7726805666506755e-06, + "loss": 0.9402, + "step": 4504 + }, + { + "epoch": 0.3254646269438474, + "grad_norm": 8.516978193086084, + "learning_rate": 4.7725586833224316e-06, + "loss": 0.9394, + "step": 4505 + }, + { + "epoch": 0.3255368721440569, + "grad_norm": 9.60040032823644, + "learning_rate": 4.772436768884623e-06, + "loss": 0.8856, + "step": 4506 + }, + { + "epoch": 0.32560911734426645, + "grad_norm": 6.247839897712742, + "learning_rate": 4.77231482333892e-06, + "loss": 0.9325, + "step": 4507 + }, + { + "epoch": 0.32568136254447594, + "grad_norm": 6.45682388221919, + "learning_rate": 4.772192846686991e-06, + "loss": 0.9791, + "step": 4508 + }, + { + "epoch": 0.3257536077446855, + "grad_norm": 8.183157897850458, + "learning_rate": 4.772070838930506e-06, + "loss": 0.9308, + "step": 4509 + }, + { + "epoch": 0.32582585294489497, + "grad_norm": 8.478696717931829, + "learning_rate": 4.7719488000711355e-06, + "loss": 0.9333, + "step": 4510 + }, + { + "epoch": 0.3258980981451045, + "grad_norm": 8.832953342925615, + "learning_rate": 4.7718267301105505e-06, + "loss": 0.8417, + "step": 4511 + }, + { + "epoch": 0.325970343345314, + "grad_norm": 6.55535761575667, + "learning_rate": 4.771704629050421e-06, + "loss": 1.007, + "step": 4512 + }, + { + "epoch": 0.3260425885455235, + "grad_norm": 8.182628317904024, + "learning_rate": 4.77158249689242e-06, + "loss": 0.8943, + "step": 4513 + }, + { + "epoch": 0.326114833745733, + "grad_norm": 7.148988729600644, + "learning_rate": 4.771460333638217e-06, + "loss": 0.9332, + "step": 4514 + }, + { + "epoch": 0.3261870789459425, + "grad_norm": 6.875526269363887, + "learning_rate": 4.771338139289486e-06, + "loss": 0.9713, + "step": 4515 + }, + { + "epoch": 0.32625932414615205, + "grad_norm": 6.710204185414996, + "learning_rate": 4.7712159138479e-06, + "loss": 0.942, + "step": 4516 + }, + { + "epoch": 0.32633156934636154, + "grad_norm": 8.691327685740422, + "learning_rate": 4.771093657315131e-06, + "loss": 0.8681, + "step": 4517 + }, + { + "epoch": 0.3264038145465711, + "grad_norm": 7.96503460605381, + "learning_rate": 4.770971369692854e-06, + "loss": 0.8776, + "step": 4518 + }, + { + "epoch": 0.32647605974678057, + "grad_norm": 6.7203662436455405, + "learning_rate": 4.770849050982741e-06, + "loss": 0.9634, + "step": 4519 + }, + { + "epoch": 0.3265483049469901, + "grad_norm": 5.691384466649287, + "learning_rate": 4.7707267011864685e-06, + "loss": 0.8189, + "step": 4520 + }, + { + "epoch": 0.3266205501471996, + "grad_norm": 6.220540311350511, + "learning_rate": 4.770604320305711e-06, + "loss": 1.0182, + "step": 4521 + }, + { + "epoch": 0.3266927953474091, + "grad_norm": 6.9153994889342965, + "learning_rate": 4.770481908342142e-06, + "loss": 0.8471, + "step": 4522 + }, + { + "epoch": 0.3267650405476186, + "grad_norm": 6.916163721144532, + "learning_rate": 4.770359465297441e-06, + "loss": 0.9513, + "step": 4523 + }, + { + "epoch": 0.3268372857478281, + "grad_norm": 8.631936642503542, + "learning_rate": 4.77023699117328e-06, + "loss": 0.8723, + "step": 4524 + }, + { + "epoch": 0.32690953094803765, + "grad_norm": 7.328905011209388, + "learning_rate": 4.770114485971338e-06, + "loss": 0.9335, + "step": 4525 + }, + { + "epoch": 0.32698177614824714, + "grad_norm": 6.283032216648998, + "learning_rate": 4.76999194969329e-06, + "loss": 0.8017, + "step": 4526 + }, + { + "epoch": 0.3270540213484567, + "grad_norm": 8.702844913160266, + "learning_rate": 4.769869382340816e-06, + "loss": 0.9257, + "step": 4527 + }, + { + "epoch": 0.32712626654866617, + "grad_norm": 7.126904149513143, + "learning_rate": 4.769746783915592e-06, + "loss": 0.8975, + "step": 4528 + }, + { + "epoch": 0.3271985117488757, + "grad_norm": 6.943244274274534, + "learning_rate": 4.769624154419298e-06, + "loss": 0.9148, + "step": 4529 + }, + { + "epoch": 0.3272707569490852, + "grad_norm": 5.95917417911011, + "learning_rate": 4.76950149385361e-06, + "loss": 0.8842, + "step": 4530 + }, + { + "epoch": 0.3273430021492947, + "grad_norm": 7.108893141293934, + "learning_rate": 4.7693788022202095e-06, + "loss": 0.8635, + "step": 4531 + }, + { + "epoch": 0.3274152473495042, + "grad_norm": 5.625714066004546, + "learning_rate": 4.769256079520775e-06, + "loss": 0.8527, + "step": 4532 + }, + { + "epoch": 0.3274874925497137, + "grad_norm": 5.528390608682419, + "learning_rate": 4.769133325756987e-06, + "loss": 0.8587, + "step": 4533 + }, + { + "epoch": 0.32755973774992325, + "grad_norm": 5.675984892896961, + "learning_rate": 4.769010540930525e-06, + "loss": 0.886, + "step": 4534 + }, + { + "epoch": 0.32763198295013274, + "grad_norm": 8.236545573442694, + "learning_rate": 4.768887725043071e-06, + "loss": 0.8884, + "step": 4535 + }, + { + "epoch": 0.3277042281503423, + "grad_norm": 7.376388677687504, + "learning_rate": 4.768764878096306e-06, + "loss": 0.9797, + "step": 4536 + }, + { + "epoch": 0.32777647335055177, + "grad_norm": 6.049255060409395, + "learning_rate": 4.768642000091911e-06, + "loss": 0.8931, + "step": 4537 + }, + { + "epoch": 0.3278487185507613, + "grad_norm": 6.294098272603118, + "learning_rate": 4.768519091031569e-06, + "loss": 0.9396, + "step": 4538 + }, + { + "epoch": 0.3279209637509708, + "grad_norm": 9.876957180330885, + "learning_rate": 4.768396150916961e-06, + "loss": 0.9466, + "step": 4539 + }, + { + "epoch": 0.3279932089511803, + "grad_norm": 7.470656147728116, + "learning_rate": 4.7682731797497715e-06, + "loss": 0.9287, + "step": 4540 + }, + { + "epoch": 0.3280654541513898, + "grad_norm": 7.159669550510677, + "learning_rate": 4.768150177531684e-06, + "loss": 0.9516, + "step": 4541 + }, + { + "epoch": 0.3281376993515993, + "grad_norm": 6.503581600850105, + "learning_rate": 4.768027144264381e-06, + "loss": 0.8792, + "step": 4542 + }, + { + "epoch": 0.32820994455180885, + "grad_norm": 5.988933848674472, + "learning_rate": 4.767904079949548e-06, + "loss": 0.9939, + "step": 4543 + }, + { + "epoch": 0.32828218975201834, + "grad_norm": 5.321456539705975, + "learning_rate": 4.767780984588868e-06, + "loss": 0.9366, + "step": 4544 + }, + { + "epoch": 0.3283544349522279, + "grad_norm": 7.869188374374241, + "learning_rate": 4.767657858184028e-06, + "loss": 0.895, + "step": 4545 + }, + { + "epoch": 0.32842668015243737, + "grad_norm": 6.980947451409152, + "learning_rate": 4.767534700736713e-06, + "loss": 0.9036, + "step": 4546 + }, + { + "epoch": 0.3284989253526469, + "grad_norm": 5.636990461984645, + "learning_rate": 4.767411512248607e-06, + "loss": 0.8743, + "step": 4547 + }, + { + "epoch": 0.3285711705528564, + "grad_norm": 6.704229232783803, + "learning_rate": 4.767288292721399e-06, + "loss": 0.9043, + "step": 4548 + }, + { + "epoch": 0.3286434157530659, + "grad_norm": 6.13892572007367, + "learning_rate": 4.767165042156775e-06, + "loss": 0.842, + "step": 4549 + }, + { + "epoch": 0.3287156609532754, + "grad_norm": 6.293138176160448, + "learning_rate": 4.767041760556421e-06, + "loss": 0.9651, + "step": 4550 + }, + { + "epoch": 0.3287879061534849, + "grad_norm": 6.5989885537755955, + "learning_rate": 4.7669184479220264e-06, + "loss": 0.9518, + "step": 4551 + }, + { + "epoch": 0.32886015135369445, + "grad_norm": 6.0093999980182415, + "learning_rate": 4.766795104255279e-06, + "loss": 0.9077, + "step": 4552 + }, + { + "epoch": 0.32893239655390394, + "grad_norm": 10.430072631583045, + "learning_rate": 4.766671729557865e-06, + "loss": 0.9961, + "step": 4553 + }, + { + "epoch": 0.3290046417541135, + "grad_norm": 6.6506060101869195, + "learning_rate": 4.7665483238314756e-06, + "loss": 0.9559, + "step": 4554 + }, + { + "epoch": 0.32907688695432297, + "grad_norm": 5.379474108494246, + "learning_rate": 4.7664248870778e-06, + "loss": 0.848, + "step": 4555 + }, + { + "epoch": 0.3291491321545325, + "grad_norm": 6.887043339982468, + "learning_rate": 4.766301419298527e-06, + "loss": 0.9566, + "step": 4556 + }, + { + "epoch": 0.329221377354742, + "grad_norm": 7.310631146405149, + "learning_rate": 4.766177920495347e-06, + "loss": 0.8964, + "step": 4557 + }, + { + "epoch": 0.3292936225549515, + "grad_norm": 7.597839510161291, + "learning_rate": 4.766054390669952e-06, + "loss": 0.8927, + "step": 4558 + }, + { + "epoch": 0.329365867755161, + "grad_norm": 7.663353190131538, + "learning_rate": 4.76593082982403e-06, + "loss": 0.9656, + "step": 4559 + }, + { + "epoch": 0.3294381129553705, + "grad_norm": 6.721099868411576, + "learning_rate": 4.765807237959276e-06, + "loss": 0.9115, + "step": 4560 + }, + { + "epoch": 0.32951035815558005, + "grad_norm": 5.765048362615097, + "learning_rate": 4.765683615077379e-06, + "loss": 0.9201, + "step": 4561 + }, + { + "epoch": 0.32958260335578954, + "grad_norm": 6.1681719137099345, + "learning_rate": 4.765559961180033e-06, + "loss": 0.9141, + "step": 4562 + }, + { + "epoch": 0.3296548485559991, + "grad_norm": 7.636233200676704, + "learning_rate": 4.765436276268931e-06, + "loss": 1.0125, + "step": 4563 + }, + { + "epoch": 0.32972709375620857, + "grad_norm": 7.7780698842494065, + "learning_rate": 4.7653125603457645e-06, + "loss": 0.9935, + "step": 4564 + }, + { + "epoch": 0.3297993389564181, + "grad_norm": 6.962303293075891, + "learning_rate": 4.765188813412228e-06, + "loss": 0.9501, + "step": 4565 + }, + { + "epoch": 0.3298715841566276, + "grad_norm": 6.671179706159548, + "learning_rate": 4.765065035470016e-06, + "loss": 0.9106, + "step": 4566 + }, + { + "epoch": 0.3299438293568371, + "grad_norm": 8.511046916607805, + "learning_rate": 4.764941226520822e-06, + "loss": 1.0572, + "step": 4567 + }, + { + "epoch": 0.3300160745570466, + "grad_norm": 6.662059558960627, + "learning_rate": 4.7648173865663405e-06, + "loss": 0.9862, + "step": 4568 + }, + { + "epoch": 0.3300883197572561, + "grad_norm": 6.3439252857364155, + "learning_rate": 4.7646935156082685e-06, + "loss": 0.8924, + "step": 4569 + }, + { + "epoch": 0.33016056495746565, + "grad_norm": 7.619011700636349, + "learning_rate": 4.7645696136483e-06, + "loss": 0.8756, + "step": 4570 + }, + { + "epoch": 0.33023281015767514, + "grad_norm": 9.099840569409547, + "learning_rate": 4.764445680688132e-06, + "loss": 0.9397, + "step": 4571 + }, + { + "epoch": 0.3303050553578847, + "grad_norm": 7.4647775723943885, + "learning_rate": 4.764321716729462e-06, + "loss": 0.9566, + "step": 4572 + }, + { + "epoch": 0.33037730055809417, + "grad_norm": 6.724941505830064, + "learning_rate": 4.7641977217739846e-06, + "loss": 0.9446, + "step": 4573 + }, + { + "epoch": 0.3304495457583037, + "grad_norm": 5.682255265503632, + "learning_rate": 4.764073695823399e-06, + "loss": 0.9142, + "step": 4574 + }, + { + "epoch": 0.3305217909585132, + "grad_norm": 5.786242169984508, + "learning_rate": 4.7639496388794016e-06, + "loss": 0.8468, + "step": 4575 + }, + { + "epoch": 0.3305940361587227, + "grad_norm": 6.94421886162472, + "learning_rate": 4.763825550943692e-06, + "loss": 0.9181, + "step": 4576 + }, + { + "epoch": 0.3306662813589322, + "grad_norm": 7.0148313621431155, + "learning_rate": 4.763701432017969e-06, + "loss": 0.924, + "step": 4577 + }, + { + "epoch": 0.3307385265591417, + "grad_norm": 8.175697264260458, + "learning_rate": 4.76357728210393e-06, + "loss": 0.8624, + "step": 4578 + }, + { + "epoch": 0.33081077175935125, + "grad_norm": 12.823833235524356, + "learning_rate": 4.763453101203276e-06, + "loss": 0.8512, + "step": 4579 + }, + { + "epoch": 0.33088301695956074, + "grad_norm": 7.232174499550143, + "learning_rate": 4.763328889317707e-06, + "loss": 0.8609, + "step": 4580 + }, + { + "epoch": 0.3309552621597703, + "grad_norm": 6.4668290239120845, + "learning_rate": 4.763204646448922e-06, + "loss": 0.8468, + "step": 4581 + }, + { + "epoch": 0.33102750735997977, + "grad_norm": 6.92764682246621, + "learning_rate": 4.763080372598623e-06, + "loss": 0.8925, + "step": 4582 + }, + { + "epoch": 0.3310997525601893, + "grad_norm": 8.159491468553087, + "learning_rate": 4.76295606776851e-06, + "loss": 0.9046, + "step": 4583 + }, + { + "epoch": 0.3311719977603988, + "grad_norm": 6.051961331825747, + "learning_rate": 4.762831731960287e-06, + "loss": 0.8321, + "step": 4584 + }, + { + "epoch": 0.3312442429606083, + "grad_norm": 7.095002588939665, + "learning_rate": 4.762707365175654e-06, + "loss": 0.9224, + "step": 4585 + }, + { + "epoch": 0.3313164881608178, + "grad_norm": 6.820716033218987, + "learning_rate": 4.762582967416314e-06, + "loss": 0.8255, + "step": 4586 + }, + { + "epoch": 0.3313887333610273, + "grad_norm": 6.335385676012026, + "learning_rate": 4.76245853868397e-06, + "loss": 0.9201, + "step": 4587 + }, + { + "epoch": 0.33146097856123685, + "grad_norm": 6.450798742236638, + "learning_rate": 4.762334078980325e-06, + "loss": 0.9333, + "step": 4588 + }, + { + "epoch": 0.33153322376144634, + "grad_norm": 6.803681151367446, + "learning_rate": 4.762209588307084e-06, + "loss": 0.828, + "step": 4589 + }, + { + "epoch": 0.3316054689616559, + "grad_norm": 6.073122451811528, + "learning_rate": 4.762085066665949e-06, + "loss": 0.9103, + "step": 4590 + }, + { + "epoch": 0.33167771416186537, + "grad_norm": 6.348368950374429, + "learning_rate": 4.7619605140586266e-06, + "loss": 0.8629, + "step": 4591 + }, + { + "epoch": 0.3317499593620749, + "grad_norm": 5.905782691189766, + "learning_rate": 4.76183593048682e-06, + "loss": 0.8705, + "step": 4592 + }, + { + "epoch": 0.3318222045622844, + "grad_norm": 7.975331898064014, + "learning_rate": 4.761711315952236e-06, + "loss": 0.9896, + "step": 4593 + }, + { + "epoch": 0.3318944497624939, + "grad_norm": 6.904683345354034, + "learning_rate": 4.761586670456582e-06, + "loss": 0.8771, + "step": 4594 + }, + { + "epoch": 0.3319666949627034, + "grad_norm": 6.400778913782681, + "learning_rate": 4.761461994001561e-06, + "loss": 0.9009, + "step": 4595 + }, + { + "epoch": 0.3320389401629129, + "grad_norm": 9.017416737983492, + "learning_rate": 4.7613372865888814e-06, + "loss": 1.0099, + "step": 4596 + }, + { + "epoch": 0.33211118536312245, + "grad_norm": 9.53451412153191, + "learning_rate": 4.76121254822025e-06, + "loss": 0.911, + "step": 4597 + }, + { + "epoch": 0.33218343056333194, + "grad_norm": 7.146466760022019, + "learning_rate": 4.761087778897375e-06, + "loss": 0.8891, + "step": 4598 + }, + { + "epoch": 0.3322556757635415, + "grad_norm": 6.923483771408653, + "learning_rate": 4.7609629786219636e-06, + "loss": 1.0629, + "step": 4599 + }, + { + "epoch": 0.33232792096375097, + "grad_norm": 9.095520057007288, + "learning_rate": 4.760838147395724e-06, + "loss": 0.9414, + "step": 4600 + }, + { + "epoch": 0.3324001661639605, + "grad_norm": 7.7375010944914155, + "learning_rate": 4.7607132852203664e-06, + "loss": 0.8828, + "step": 4601 + }, + { + "epoch": 0.33247241136417, + "grad_norm": 7.436322030779664, + "learning_rate": 4.760588392097599e-06, + "loss": 0.9731, + "step": 4602 + }, + { + "epoch": 0.3325446565643795, + "grad_norm": 8.40670468827235, + "learning_rate": 4.760463468029132e-06, + "loss": 0.9671, + "step": 4603 + }, + { + "epoch": 0.332616901764589, + "grad_norm": 6.147884358556335, + "learning_rate": 4.760338513016675e-06, + "loss": 0.8898, + "step": 4604 + }, + { + "epoch": 0.3326891469647985, + "grad_norm": 6.673310529661644, + "learning_rate": 4.760213527061939e-06, + "loss": 0.9019, + "step": 4605 + }, + { + "epoch": 0.33276139216500805, + "grad_norm": 8.07564307330155, + "learning_rate": 4.760088510166635e-06, + "loss": 0.874, + "step": 4606 + }, + { + "epoch": 0.33283363736521754, + "grad_norm": 7.948656546842253, + "learning_rate": 4.759963462332473e-06, + "loss": 0.8461, + "step": 4607 + }, + { + "epoch": 0.3329058825654271, + "grad_norm": 7.5483154438615685, + "learning_rate": 4.7598383835611675e-06, + "loss": 0.8723, + "step": 4608 + }, + { + "epoch": 0.33297812776563657, + "grad_norm": 7.269199599500816, + "learning_rate": 4.759713273854428e-06, + "loss": 0.8442, + "step": 4609 + }, + { + "epoch": 0.3330503729658461, + "grad_norm": 6.997603687473545, + "learning_rate": 4.759588133213968e-06, + "loss": 0.8175, + "step": 4610 + }, + { + "epoch": 0.3331226181660556, + "grad_norm": 6.909865869746723, + "learning_rate": 4.759462961641503e-06, + "loss": 0.8517, + "step": 4611 + }, + { + "epoch": 0.3331948633662651, + "grad_norm": 7.552928573838356, + "learning_rate": 4.759337759138742e-06, + "loss": 0.9204, + "step": 4612 + }, + { + "epoch": 0.3332671085664746, + "grad_norm": 9.006056125521047, + "learning_rate": 4.759212525707403e-06, + "loss": 0.8718, + "step": 4613 + }, + { + "epoch": 0.3333393537666841, + "grad_norm": 8.78090376527716, + "learning_rate": 4.759087261349198e-06, + "loss": 0.904, + "step": 4614 + }, + { + "epoch": 0.33341159896689365, + "grad_norm": 7.293927465272214, + "learning_rate": 4.758961966065842e-06, + "loss": 0.9572, + "step": 4615 + }, + { + "epoch": 0.33348384416710314, + "grad_norm": 7.014592627879927, + "learning_rate": 4.758836639859051e-06, + "loss": 1.0019, + "step": 4616 + }, + { + "epoch": 0.3335560893673127, + "grad_norm": 6.893316111048146, + "learning_rate": 4.75871128273054e-06, + "loss": 0.8535, + "step": 4617 + }, + { + "epoch": 0.33362833456752217, + "grad_norm": 8.852052491540494, + "learning_rate": 4.758585894682026e-06, + "loss": 0.935, + "step": 4618 + }, + { + "epoch": 0.3337005797677317, + "grad_norm": 9.88984896491489, + "learning_rate": 4.758460475715225e-06, + "loss": 0.8213, + "step": 4619 + }, + { + "epoch": 0.3337728249679412, + "grad_norm": 8.541322679493755, + "learning_rate": 4.7583350258318526e-06, + "loss": 0.8586, + "step": 4620 + }, + { + "epoch": 0.3338450701681507, + "grad_norm": 6.28264818714276, + "learning_rate": 4.758209545033629e-06, + "loss": 0.9701, + "step": 4621 + }, + { + "epoch": 0.3339173153683602, + "grad_norm": 6.57091251858555, + "learning_rate": 4.758084033322268e-06, + "loss": 0.9482, + "step": 4622 + }, + { + "epoch": 0.3339895605685697, + "grad_norm": 7.047404402578056, + "learning_rate": 4.757958490699492e-06, + "loss": 0.8956, + "step": 4623 + }, + { + "epoch": 0.33406180576877925, + "grad_norm": 8.766227279888211, + "learning_rate": 4.757832917167015e-06, + "loss": 0.8761, + "step": 4624 + }, + { + "epoch": 0.33413405096898874, + "grad_norm": 8.699786163859144, + "learning_rate": 4.75770731272656e-06, + "loss": 0.9168, + "step": 4625 + }, + { + "epoch": 0.3342062961691983, + "grad_norm": 5.887906770361994, + "learning_rate": 4.757581677379845e-06, + "loss": 0.903, + "step": 4626 + }, + { + "epoch": 0.33427854136940777, + "grad_norm": 6.325491651643354, + "learning_rate": 4.7574560111285885e-06, + "loss": 0.8813, + "step": 4627 + }, + { + "epoch": 0.3343507865696173, + "grad_norm": 6.506562368203225, + "learning_rate": 4.7573303139745134e-06, + "loss": 1.0088, + "step": 4628 + }, + { + "epoch": 0.3344230317698268, + "grad_norm": 9.460490738316148, + "learning_rate": 4.757204585919338e-06, + "loss": 0.8936, + "step": 4629 + }, + { + "epoch": 0.3344952769700363, + "grad_norm": 7.448751449930505, + "learning_rate": 4.757078826964785e-06, + "loss": 0.9217, + "step": 4630 + }, + { + "epoch": 0.3345675221702458, + "grad_norm": 6.258743883560023, + "learning_rate": 4.756953037112575e-06, + "loss": 0.9815, + "step": 4631 + }, + { + "epoch": 0.3346397673704553, + "grad_norm": 8.193029967677985, + "learning_rate": 4.75682721636443e-06, + "loss": 0.9477, + "step": 4632 + }, + { + "epoch": 0.33471201257066485, + "grad_norm": 6.395993242118276, + "learning_rate": 4.7567013647220736e-06, + "loss": 0.862, + "step": 4633 + }, + { + "epoch": 0.33478425777087434, + "grad_norm": 6.437868570683813, + "learning_rate": 4.756575482187228e-06, + "loss": 0.9263, + "step": 4634 + }, + { + "epoch": 0.3348565029710839, + "grad_norm": 6.908680759922932, + "learning_rate": 4.756449568761615e-06, + "loss": 0.9042, + "step": 4635 + }, + { + "epoch": 0.33492874817129337, + "grad_norm": 8.868417367215025, + "learning_rate": 4.7563236244469605e-06, + "loss": 0.8425, + "step": 4636 + }, + { + "epoch": 0.33500099337150285, + "grad_norm": 7.622948542553719, + "learning_rate": 4.756197649244987e-06, + "loss": 0.9332, + "step": 4637 + }, + { + "epoch": 0.3350732385717124, + "grad_norm": 6.088726144241528, + "learning_rate": 4.756071643157419e-06, + "loss": 0.9039, + "step": 4638 + }, + { + "epoch": 0.3351454837719219, + "grad_norm": 8.073689111650388, + "learning_rate": 4.755945606185982e-06, + "loss": 0.9388, + "step": 4639 + }, + { + "epoch": 0.3352177289721314, + "grad_norm": 6.39175755694379, + "learning_rate": 4.755819538332403e-06, + "loss": 0.8881, + "step": 4640 + }, + { + "epoch": 0.3352899741723409, + "grad_norm": 6.350774474485868, + "learning_rate": 4.755693439598405e-06, + "loss": 0.8921, + "step": 4641 + }, + { + "epoch": 0.33536221937255045, + "grad_norm": 8.917256938850453, + "learning_rate": 4.755567309985714e-06, + "loss": 0.8154, + "step": 4642 + }, + { + "epoch": 0.33543446457275994, + "grad_norm": 6.56027212927606, + "learning_rate": 4.755441149496059e-06, + "loss": 0.9723, + "step": 4643 + }, + { + "epoch": 0.3355067097729695, + "grad_norm": 6.886204138281266, + "learning_rate": 4.755314958131166e-06, + "loss": 0.8996, + "step": 4644 + }, + { + "epoch": 0.33557895497317897, + "grad_norm": 6.041045619570703, + "learning_rate": 4.755188735892762e-06, + "loss": 0.8751, + "step": 4645 + }, + { + "epoch": 0.33565120017338845, + "grad_norm": 7.844403121930659, + "learning_rate": 4.755062482782575e-06, + "loss": 0.9456, + "step": 4646 + }, + { + "epoch": 0.335723445373598, + "grad_norm": 7.813785538764216, + "learning_rate": 4.754936198802334e-06, + "loss": 0.8417, + "step": 4647 + }, + { + "epoch": 0.3357956905738075, + "grad_norm": 7.471886905802675, + "learning_rate": 4.754809883953767e-06, + "loss": 0.8902, + "step": 4648 + }, + { + "epoch": 0.335867935774017, + "grad_norm": 7.246150606245942, + "learning_rate": 4.754683538238604e-06, + "loss": 0.8454, + "step": 4649 + }, + { + "epoch": 0.3359401809742265, + "grad_norm": 6.766201496891236, + "learning_rate": 4.754557161658574e-06, + "loss": 0.9435, + "step": 4650 + }, + { + "epoch": 0.33601242617443605, + "grad_norm": 8.549249573730187, + "learning_rate": 4.7544307542154065e-06, + "loss": 0.9082, + "step": 4651 + }, + { + "epoch": 0.33608467137464554, + "grad_norm": 6.314611100811529, + "learning_rate": 4.754304315910832e-06, + "loss": 0.9901, + "step": 4652 + }, + { + "epoch": 0.3361569165748551, + "grad_norm": 6.312663463326768, + "learning_rate": 4.754177846746583e-06, + "loss": 0.8354, + "step": 4653 + }, + { + "epoch": 0.33622916177506457, + "grad_norm": 5.468282450716626, + "learning_rate": 4.754051346724389e-06, + "loss": 0.851, + "step": 4654 + }, + { + "epoch": 0.33630140697527405, + "grad_norm": 6.618234887038955, + "learning_rate": 4.753924815845982e-06, + "loss": 0.9072, + "step": 4655 + }, + { + "epoch": 0.3363736521754836, + "grad_norm": 6.746021935235876, + "learning_rate": 4.753798254113094e-06, + "loss": 0.9499, + "step": 4656 + }, + { + "epoch": 0.3364458973756931, + "grad_norm": 5.905960318106848, + "learning_rate": 4.753671661527459e-06, + "loss": 0.9066, + "step": 4657 + }, + { + "epoch": 0.3365181425759026, + "grad_norm": 6.733565492058412, + "learning_rate": 4.7535450380908074e-06, + "loss": 0.88, + "step": 4658 + }, + { + "epoch": 0.3365903877761121, + "grad_norm": 6.168734367064792, + "learning_rate": 4.753418383804876e-06, + "loss": 0.9302, + "step": 4659 + }, + { + "epoch": 0.33666263297632165, + "grad_norm": 5.705202492439364, + "learning_rate": 4.753291698671395e-06, + "loss": 0.8838, + "step": 4660 + }, + { + "epoch": 0.33673487817653114, + "grad_norm": 5.992058584723924, + "learning_rate": 4.7531649826921e-06, + "loss": 0.9271, + "step": 4661 + }, + { + "epoch": 0.3368071233767407, + "grad_norm": 6.91635042256569, + "learning_rate": 4.753038235868726e-06, + "loss": 0.9169, + "step": 4662 + }, + { + "epoch": 0.33687936857695017, + "grad_norm": 5.888133202455875, + "learning_rate": 4.752911458203009e-06, + "loss": 0.9322, + "step": 4663 + }, + { + "epoch": 0.33695161377715965, + "grad_norm": 6.13507155099197, + "learning_rate": 4.752784649696682e-06, + "loss": 0.8698, + "step": 4664 + }, + { + "epoch": 0.3370238589773692, + "grad_norm": 6.405683092294392, + "learning_rate": 4.752657810351483e-06, + "loss": 0.9707, + "step": 4665 + }, + { + "epoch": 0.3370961041775787, + "grad_norm": 7.611332635679957, + "learning_rate": 4.752530940169148e-06, + "loss": 0.9492, + "step": 4666 + }, + { + "epoch": 0.3371683493777882, + "grad_norm": 5.908880061942853, + "learning_rate": 4.7524040391514134e-06, + "loss": 0.902, + "step": 4667 + }, + { + "epoch": 0.3372405945779977, + "grad_norm": 6.9288763111210505, + "learning_rate": 4.752277107300016e-06, + "loss": 0.9526, + "step": 4668 + }, + { + "epoch": 0.33731283977820725, + "grad_norm": 8.253871125102735, + "learning_rate": 4.752150144616694e-06, + "loss": 0.9096, + "step": 4669 + }, + { + "epoch": 0.33738508497841674, + "grad_norm": 5.593747783639138, + "learning_rate": 4.752023151103184e-06, + "loss": 0.8567, + "step": 4670 + }, + { + "epoch": 0.3374573301786263, + "grad_norm": 6.668302271792145, + "learning_rate": 4.7518961267612276e-06, + "loss": 0.9121, + "step": 4671 + }, + { + "epoch": 0.33752957537883577, + "grad_norm": 6.165203573883175, + "learning_rate": 4.751769071592561e-06, + "loss": 0.987, + "step": 4672 + }, + { + "epoch": 0.33760182057904525, + "grad_norm": 7.398327932382314, + "learning_rate": 4.751641985598924e-06, + "loss": 0.9055, + "step": 4673 + }, + { + "epoch": 0.3376740657792548, + "grad_norm": 6.862778221285516, + "learning_rate": 4.7515148687820565e-06, + "loss": 0.8558, + "step": 4674 + }, + { + "epoch": 0.3377463109794643, + "grad_norm": 6.729327441047722, + "learning_rate": 4.751387721143698e-06, + "loss": 0.7887, + "step": 4675 + }, + { + "epoch": 0.3378185561796738, + "grad_norm": 6.665789864419787, + "learning_rate": 4.751260542685591e-06, + "loss": 0.8645, + "step": 4676 + }, + { + "epoch": 0.3378908013798833, + "grad_norm": 6.490522664658773, + "learning_rate": 4.751133333409474e-06, + "loss": 1.0351, + "step": 4677 + }, + { + "epoch": 0.33796304658009285, + "grad_norm": 6.647204059441934, + "learning_rate": 4.75100609331709e-06, + "loss": 0.8491, + "step": 4678 + }, + { + "epoch": 0.33803529178030234, + "grad_norm": 8.049667673774062, + "learning_rate": 4.750878822410181e-06, + "loss": 0.9304, + "step": 4679 + }, + { + "epoch": 0.3381075369805119, + "grad_norm": 6.009663746800595, + "learning_rate": 4.7507515206904875e-06, + "loss": 0.8, + "step": 4680 + }, + { + "epoch": 0.33817978218072137, + "grad_norm": 7.036728822453794, + "learning_rate": 4.750624188159754e-06, + "loss": 0.9536, + "step": 4681 + }, + { + "epoch": 0.33825202738093085, + "grad_norm": 6.04311741040075, + "learning_rate": 4.750496824819723e-06, + "loss": 0.8914, + "step": 4682 + }, + { + "epoch": 0.3383242725811404, + "grad_norm": 6.572899126693448, + "learning_rate": 4.750369430672137e-06, + "loss": 0.8721, + "step": 4683 + }, + { + "epoch": 0.3383965177813499, + "grad_norm": 7.142173014984764, + "learning_rate": 4.750242005718742e-06, + "loss": 0.8689, + "step": 4684 + }, + { + "epoch": 0.3384687629815594, + "grad_norm": 5.724448444833667, + "learning_rate": 4.75011454996128e-06, + "loss": 0.9143, + "step": 4685 + }, + { + "epoch": 0.3385410081817689, + "grad_norm": 6.198560904564676, + "learning_rate": 4.7499870634014975e-06, + "loss": 0.8788, + "step": 4686 + }, + { + "epoch": 0.33861325338197845, + "grad_norm": 5.457298102984559, + "learning_rate": 4.749859546041139e-06, + "loss": 0.7786, + "step": 4687 + }, + { + "epoch": 0.33868549858218794, + "grad_norm": 6.446283958512788, + "learning_rate": 4.74973199788195e-06, + "loss": 0.8519, + "step": 4688 + }, + { + "epoch": 0.3387577437823975, + "grad_norm": 5.892891445307782, + "learning_rate": 4.749604418925677e-06, + "loss": 0.8952, + "step": 4689 + }, + { + "epoch": 0.33882998898260697, + "grad_norm": 6.655723013634846, + "learning_rate": 4.749476809174066e-06, + "loss": 0.8691, + "step": 4690 + }, + { + "epoch": 0.33890223418281645, + "grad_norm": 8.259396836839748, + "learning_rate": 4.749349168628864e-06, + "loss": 0.9287, + "step": 4691 + }, + { + "epoch": 0.338974479383026, + "grad_norm": 6.07880811603494, + "learning_rate": 4.7492214972918195e-06, + "loss": 0.9129, + "step": 4692 + }, + { + "epoch": 0.3390467245832355, + "grad_norm": 7.2553885593379945, + "learning_rate": 4.749093795164677e-06, + "loss": 0.9494, + "step": 4693 + }, + { + "epoch": 0.339118969783445, + "grad_norm": 9.288093689622594, + "learning_rate": 4.748966062249189e-06, + "loss": 0.9486, + "step": 4694 + }, + { + "epoch": 0.3391912149836545, + "grad_norm": 7.706859990811352, + "learning_rate": 4.748838298547101e-06, + "loss": 0.9275, + "step": 4695 + }, + { + "epoch": 0.33926346018386405, + "grad_norm": 6.12590642956128, + "learning_rate": 4.748710504060162e-06, + "loss": 0.8555, + "step": 4696 + }, + { + "epoch": 0.33933570538407354, + "grad_norm": 7.682569706082618, + "learning_rate": 4.748582678790123e-06, + "loss": 0.953, + "step": 4697 + }, + { + "epoch": 0.3394079505842831, + "grad_norm": 6.197705724209281, + "learning_rate": 4.748454822738733e-06, + "loss": 0.9044, + "step": 4698 + }, + { + "epoch": 0.33948019578449257, + "grad_norm": 7.293869412449017, + "learning_rate": 4.748326935907742e-06, + "loss": 0.8726, + "step": 4699 + }, + { + "epoch": 0.33955244098470205, + "grad_norm": 5.961991723107009, + "learning_rate": 4.748199018298901e-06, + "loss": 0.8354, + "step": 4700 + }, + { + "epoch": 0.3396246861849116, + "grad_norm": 7.015330285082026, + "learning_rate": 4.748071069913962e-06, + "loss": 0.8635, + "step": 4701 + }, + { + "epoch": 0.3396969313851211, + "grad_norm": 5.918049001621709, + "learning_rate": 4.747943090754675e-06, + "loss": 0.8834, + "step": 4702 + }, + { + "epoch": 0.3397691765853306, + "grad_norm": 5.810893503316402, + "learning_rate": 4.747815080822792e-06, + "loss": 0.8249, + "step": 4703 + }, + { + "epoch": 0.3398414217855401, + "grad_norm": 8.857823473186226, + "learning_rate": 4.747687040120066e-06, + "loss": 0.9339, + "step": 4704 + }, + { + "epoch": 0.33991366698574965, + "grad_norm": 6.383215005573057, + "learning_rate": 4.74755896864825e-06, + "loss": 0.864, + "step": 4705 + }, + { + "epoch": 0.33998591218595914, + "grad_norm": 8.806623948268198, + "learning_rate": 4.747430866409096e-06, + "loss": 0.9873, + "step": 4706 + }, + { + "epoch": 0.3400581573861687, + "grad_norm": 8.304719653924113, + "learning_rate": 4.747302733404359e-06, + "loss": 0.9586, + "step": 4707 + }, + { + "epoch": 0.34013040258637817, + "grad_norm": 6.400421915452198, + "learning_rate": 4.747174569635793e-06, + "loss": 0.913, + "step": 4708 + }, + { + "epoch": 0.34020264778658765, + "grad_norm": 6.764633141246059, + "learning_rate": 4.747046375105151e-06, + "loss": 0.9683, + "step": 4709 + }, + { + "epoch": 0.3402748929867972, + "grad_norm": 5.634527318400954, + "learning_rate": 4.74691814981419e-06, + "loss": 0.8647, + "step": 4710 + }, + { + "epoch": 0.3403471381870067, + "grad_norm": 5.70614786380475, + "learning_rate": 4.746789893764664e-06, + "loss": 0.8682, + "step": 4711 + }, + { + "epoch": 0.3404193833872162, + "grad_norm": 5.736774659692827, + "learning_rate": 4.746661606958328e-06, + "loss": 0.91, + "step": 4712 + }, + { + "epoch": 0.3404916285874257, + "grad_norm": 6.326015996941099, + "learning_rate": 4.746533289396939e-06, + "loss": 0.8407, + "step": 4713 + }, + { + "epoch": 0.34056387378763525, + "grad_norm": 8.022983913359383, + "learning_rate": 4.7464049410822545e-06, + "loss": 0.8569, + "step": 4714 + }, + { + "epoch": 0.34063611898784474, + "grad_norm": 9.013791854902998, + "learning_rate": 4.746276562016029e-06, + "loss": 0.8877, + "step": 4715 + }, + { + "epoch": 0.3407083641880543, + "grad_norm": 5.875087980869647, + "learning_rate": 4.746148152200023e-06, + "loss": 0.7604, + "step": 4716 + }, + { + "epoch": 0.34078060938826377, + "grad_norm": 6.2975513483114325, + "learning_rate": 4.746019711635992e-06, + "loss": 0.855, + "step": 4717 + }, + { + "epoch": 0.34085285458847325, + "grad_norm": 6.266088302111875, + "learning_rate": 4.745891240325696e-06, + "loss": 0.8928, + "step": 4718 + }, + { + "epoch": 0.3409250997886828, + "grad_norm": 6.746715452950137, + "learning_rate": 4.745762738270892e-06, + "loss": 0.9546, + "step": 4719 + }, + { + "epoch": 0.3409973449888923, + "grad_norm": 6.097724127284761, + "learning_rate": 4.74563420547334e-06, + "loss": 0.8528, + "step": 4720 + }, + { + "epoch": 0.3410695901891018, + "grad_norm": 6.839299117647531, + "learning_rate": 4.745505641934799e-06, + "loss": 0.8877, + "step": 4721 + }, + { + "epoch": 0.3411418353893113, + "grad_norm": 7.194454643147366, + "learning_rate": 4.7453770476570305e-06, + "loss": 0.9382, + "step": 4722 + }, + { + "epoch": 0.34121408058952085, + "grad_norm": 8.802052899972043, + "learning_rate": 4.745248422641793e-06, + "loss": 0.9408, + "step": 4723 + }, + { + "epoch": 0.34128632578973034, + "grad_norm": 7.63198757496908, + "learning_rate": 4.745119766890847e-06, + "loss": 0.9871, + "step": 4724 + }, + { + "epoch": 0.3413585709899399, + "grad_norm": 7.757157251012987, + "learning_rate": 4.744991080405955e-06, + "loss": 0.8698, + "step": 4725 + }, + { + "epoch": 0.34143081619014937, + "grad_norm": 7.777635830386212, + "learning_rate": 4.744862363188879e-06, + "loss": 0.9681, + "step": 4726 + }, + { + "epoch": 0.34150306139035885, + "grad_norm": 8.30135338101824, + "learning_rate": 4.74473361524138e-06, + "loss": 0.9716, + "step": 4727 + }, + { + "epoch": 0.3415753065905684, + "grad_norm": 5.93776758494489, + "learning_rate": 4.7446048365652205e-06, + "loss": 0.9634, + "step": 4728 + }, + { + "epoch": 0.3416475517907779, + "grad_norm": 8.78739741171892, + "learning_rate": 4.744476027162164e-06, + "loss": 0.9651, + "step": 4729 + }, + { + "epoch": 0.3417197969909874, + "grad_norm": 7.551232132570863, + "learning_rate": 4.744347187033973e-06, + "loss": 0.9158, + "step": 4730 + }, + { + "epoch": 0.3417920421911969, + "grad_norm": 8.599694765573668, + "learning_rate": 4.744218316182411e-06, + "loss": 0.9641, + "step": 4731 + }, + { + "epoch": 0.34186428739140645, + "grad_norm": 8.248917942178652, + "learning_rate": 4.744089414609244e-06, + "loss": 0.9556, + "step": 4732 + }, + { + "epoch": 0.34193653259161594, + "grad_norm": 5.723685047774995, + "learning_rate": 4.743960482316234e-06, + "loss": 0.9496, + "step": 4733 + }, + { + "epoch": 0.3420087777918255, + "grad_norm": 7.598791136720495, + "learning_rate": 4.743831519305149e-06, + "loss": 0.8765, + "step": 4734 + }, + { + "epoch": 0.34208102299203497, + "grad_norm": 5.9543145060176546, + "learning_rate": 4.743702525577752e-06, + "loss": 0.9256, + "step": 4735 + }, + { + "epoch": 0.34215326819224445, + "grad_norm": 11.859226391731948, + "learning_rate": 4.743573501135809e-06, + "loss": 0.9687, + "step": 4736 + }, + { + "epoch": 0.342225513392454, + "grad_norm": 7.232358053998726, + "learning_rate": 4.743444445981087e-06, + "loss": 0.941, + "step": 4737 + }, + { + "epoch": 0.3422977585926635, + "grad_norm": 7.939735157967306, + "learning_rate": 4.743315360115352e-06, + "loss": 1.0065, + "step": 4738 + }, + { + "epoch": 0.342370003792873, + "grad_norm": 5.297435089476951, + "learning_rate": 4.743186243540373e-06, + "loss": 0.8728, + "step": 4739 + }, + { + "epoch": 0.3424422489930825, + "grad_norm": 6.219597686807579, + "learning_rate": 4.743057096257915e-06, + "loss": 0.9657, + "step": 4740 + }, + { + "epoch": 0.34251449419329205, + "grad_norm": 7.215688873840963, + "learning_rate": 4.742927918269748e-06, + "loss": 0.8286, + "step": 4741 + }, + { + "epoch": 0.34258673939350154, + "grad_norm": 7.171193341873666, + "learning_rate": 4.742798709577638e-06, + "loss": 0.8916, + "step": 4742 + }, + { + "epoch": 0.3426589845937111, + "grad_norm": 7.806766450777952, + "learning_rate": 4.742669470183356e-06, + "loss": 0.9031, + "step": 4743 + }, + { + "epoch": 0.34273122979392057, + "grad_norm": 6.930360987125074, + "learning_rate": 4.74254020008867e-06, + "loss": 0.8826, + "step": 4744 + }, + { + "epoch": 0.34280347499413005, + "grad_norm": 7.1875615656330645, + "learning_rate": 4.74241089929535e-06, + "loss": 0.9451, + "step": 4745 + }, + { + "epoch": 0.3428757201943396, + "grad_norm": 5.938651204187698, + "learning_rate": 4.742281567805165e-06, + "loss": 0.8556, + "step": 4746 + }, + { + "epoch": 0.3429479653945491, + "grad_norm": 6.36124656936514, + "learning_rate": 4.742152205619887e-06, + "loss": 0.9079, + "step": 4747 + }, + { + "epoch": 0.3430202105947586, + "grad_norm": 6.468430958127217, + "learning_rate": 4.742022812741287e-06, + "loss": 0.9156, + "step": 4748 + }, + { + "epoch": 0.3430924557949681, + "grad_norm": 7.663848718428299, + "learning_rate": 4.741893389171134e-06, + "loss": 0.9361, + "step": 4749 + }, + { + "epoch": 0.34316470099517765, + "grad_norm": 7.126372355755827, + "learning_rate": 4.741763934911202e-06, + "loss": 0.8589, + "step": 4750 + }, + { + "epoch": 0.34323694619538714, + "grad_norm": 6.923181552585386, + "learning_rate": 4.741634449963262e-06, + "loss": 0.8968, + "step": 4751 + }, + { + "epoch": 0.3433091913955967, + "grad_norm": 7.798393025287676, + "learning_rate": 4.741504934329087e-06, + "loss": 0.9613, + "step": 4752 + }, + { + "epoch": 0.34338143659580617, + "grad_norm": 6.490537357978344, + "learning_rate": 4.741375388010451e-06, + "loss": 0.9241, + "step": 4753 + }, + { + "epoch": 0.34345368179601565, + "grad_norm": 5.708863765083316, + "learning_rate": 4.741245811009125e-06, + "loss": 0.785, + "step": 4754 + }, + { + "epoch": 0.3435259269962252, + "grad_norm": 7.0510652860032765, + "learning_rate": 4.741116203326885e-06, + "loss": 0.9242, + "step": 4755 + }, + { + "epoch": 0.3435981721964347, + "grad_norm": 5.572461413270852, + "learning_rate": 4.740986564965503e-06, + "loss": 0.9212, + "step": 4756 + }, + { + "epoch": 0.3436704173966442, + "grad_norm": 5.513802116388157, + "learning_rate": 4.7408568959267555e-06, + "loss": 0.837, + "step": 4757 + }, + { + "epoch": 0.3437426625968537, + "grad_norm": 7.283550373861911, + "learning_rate": 4.740727196212418e-06, + "loss": 0.9051, + "step": 4758 + }, + { + "epoch": 0.34381490779706325, + "grad_norm": 6.470470844570698, + "learning_rate": 4.7405974658242634e-06, + "loss": 0.9448, + "step": 4759 + }, + { + "epoch": 0.34388715299727274, + "grad_norm": 7.869287992694313, + "learning_rate": 4.74046770476407e-06, + "loss": 0.9147, + "step": 4760 + }, + { + "epoch": 0.3439593981974823, + "grad_norm": 6.825514933964289, + "learning_rate": 4.740337913033614e-06, + "loss": 0.9089, + "step": 4761 + }, + { + "epoch": 0.34403164339769177, + "grad_norm": 7.1371209626842065, + "learning_rate": 4.74020809063467e-06, + "loss": 0.9267, + "step": 4762 + }, + { + "epoch": 0.34410388859790125, + "grad_norm": 7.247300533353916, + "learning_rate": 4.7400782375690176e-06, + "loss": 1.001, + "step": 4763 + }, + { + "epoch": 0.3441761337981108, + "grad_norm": 7.396895154209182, + "learning_rate": 4.7399483538384335e-06, + "loss": 0.919, + "step": 4764 + }, + { + "epoch": 0.3442483789983203, + "grad_norm": 6.620262611458682, + "learning_rate": 4.739818439444695e-06, + "loss": 0.9751, + "step": 4765 + }, + { + "epoch": 0.3443206241985298, + "grad_norm": 5.551886215671014, + "learning_rate": 4.739688494389582e-06, + "loss": 0.8822, + "step": 4766 + }, + { + "epoch": 0.3443928693987393, + "grad_norm": 5.974918712877045, + "learning_rate": 4.739558518674872e-06, + "loss": 0.8527, + "step": 4767 + }, + { + "epoch": 0.34446511459894885, + "grad_norm": 7.593633203432431, + "learning_rate": 4.739428512302345e-06, + "loss": 0.9646, + "step": 4768 + }, + { + "epoch": 0.34453735979915834, + "grad_norm": 6.184839158183312, + "learning_rate": 4.739298475273781e-06, + "loss": 0.8128, + "step": 4769 + }, + { + "epoch": 0.3446096049993679, + "grad_norm": 5.537779149582069, + "learning_rate": 4.7391684075909585e-06, + "loss": 0.8214, + "step": 4770 + }, + { + "epoch": 0.34468185019957737, + "grad_norm": 7.2764533619788105, + "learning_rate": 4.739038309255659e-06, + "loss": 0.9267, + "step": 4771 + }, + { + "epoch": 0.34475409539978685, + "grad_norm": 6.2085337083811885, + "learning_rate": 4.738908180269665e-06, + "loss": 0.9266, + "step": 4772 + }, + { + "epoch": 0.3448263405999964, + "grad_norm": 6.3424985055166125, + "learning_rate": 4.738778020634755e-06, + "loss": 0.8218, + "step": 4773 + }, + { + "epoch": 0.3448985858002059, + "grad_norm": 5.143467450637894, + "learning_rate": 4.738647830352713e-06, + "loss": 0.9342, + "step": 4774 + }, + { + "epoch": 0.3449708310004154, + "grad_norm": 5.329779672846147, + "learning_rate": 4.738517609425319e-06, + "loss": 0.8666, + "step": 4775 + }, + { + "epoch": 0.3450430762006249, + "grad_norm": 6.605101446026752, + "learning_rate": 4.738387357854359e-06, + "loss": 0.8882, + "step": 4776 + }, + { + "epoch": 0.34511532140083445, + "grad_norm": 7.093025876363604, + "learning_rate": 4.738257075641613e-06, + "loss": 0.8746, + "step": 4777 + }, + { + "epoch": 0.34518756660104394, + "grad_norm": 6.329158444260092, + "learning_rate": 4.738126762788866e-06, + "loss": 0.894, + "step": 4778 + }, + { + "epoch": 0.3452598118012535, + "grad_norm": 7.466997140428408, + "learning_rate": 4.7379964192979015e-06, + "loss": 0.8719, + "step": 4779 + }, + { + "epoch": 0.34533205700146297, + "grad_norm": 6.0327775829206765, + "learning_rate": 4.737866045170503e-06, + "loss": 0.9509, + "step": 4780 + }, + { + "epoch": 0.34540430220167245, + "grad_norm": 6.984185747488387, + "learning_rate": 4.737735640408456e-06, + "loss": 0.9864, + "step": 4781 + }, + { + "epoch": 0.345476547401882, + "grad_norm": 6.133565091163463, + "learning_rate": 4.737605205013546e-06, + "loss": 0.8378, + "step": 4782 + }, + { + "epoch": 0.3455487926020915, + "grad_norm": 8.150637979636787, + "learning_rate": 4.737474738987558e-06, + "loss": 0.8201, + "step": 4783 + }, + { + "epoch": 0.345621037802301, + "grad_norm": 5.63423399111414, + "learning_rate": 4.737344242332278e-06, + "loss": 0.9026, + "step": 4784 + }, + { + "epoch": 0.3456932830025105, + "grad_norm": 6.743355765853028, + "learning_rate": 4.737213715049492e-06, + "loss": 0.8988, + "step": 4785 + }, + { + "epoch": 0.34576552820272005, + "grad_norm": 7.829797744964475, + "learning_rate": 4.737083157140988e-06, + "loss": 0.9473, + "step": 4786 + }, + { + "epoch": 0.34583777340292954, + "grad_norm": 7.5027183692623565, + "learning_rate": 4.736952568608553e-06, + "loss": 1.0386, + "step": 4787 + }, + { + "epoch": 0.3459100186031391, + "grad_norm": 7.166156159190252, + "learning_rate": 4.736821949453973e-06, + "loss": 0.9248, + "step": 4788 + }, + { + "epoch": 0.34598226380334857, + "grad_norm": 6.480498476517867, + "learning_rate": 4.736691299679038e-06, + "loss": 0.921, + "step": 4789 + }, + { + "epoch": 0.34605450900355805, + "grad_norm": 8.547341844825484, + "learning_rate": 4.736560619285537e-06, + "loss": 0.9479, + "step": 4790 + }, + { + "epoch": 0.3461267542037676, + "grad_norm": 7.3208953397408605, + "learning_rate": 4.736429908275255e-06, + "loss": 0.9253, + "step": 4791 + }, + { + "epoch": 0.3461989994039771, + "grad_norm": 7.282421885476452, + "learning_rate": 4.7362991666499856e-06, + "loss": 0.9073, + "step": 4792 + }, + { + "epoch": 0.3462712446041866, + "grad_norm": 7.866684171150689, + "learning_rate": 4.736168394411517e-06, + "loss": 0.9267, + "step": 4793 + }, + { + "epoch": 0.3463434898043961, + "grad_norm": 6.391222489833286, + "learning_rate": 4.736037591561639e-06, + "loss": 0.8795, + "step": 4794 + }, + { + "epoch": 0.34641573500460565, + "grad_norm": 8.767495674804456, + "learning_rate": 4.735906758102144e-06, + "loss": 0.8775, + "step": 4795 + }, + { + "epoch": 0.34648798020481514, + "grad_norm": 7.6276968267371, + "learning_rate": 4.7357758940348195e-06, + "loss": 0.9336, + "step": 4796 + }, + { + "epoch": 0.3465602254050247, + "grad_norm": 7.446086895308853, + "learning_rate": 4.73564499936146e-06, + "loss": 0.8151, + "step": 4797 + }, + { + "epoch": 0.34663247060523417, + "grad_norm": 7.0886666658037285, + "learning_rate": 4.735514074083855e-06, + "loss": 0.9652, + "step": 4798 + }, + { + "epoch": 0.34670471580544365, + "grad_norm": 7.789517709789422, + "learning_rate": 4.7353831182038e-06, + "loss": 0.9289, + "step": 4799 + }, + { + "epoch": 0.3467769610056532, + "grad_norm": 7.102468636325011, + "learning_rate": 4.735252131723085e-06, + "loss": 0.9773, + "step": 4800 + }, + { + "epoch": 0.3468492062058627, + "grad_norm": 6.688518981308947, + "learning_rate": 4.7351211146435036e-06, + "loss": 0.8854, + "step": 4801 + }, + { + "epoch": 0.3469214514060722, + "grad_norm": 6.512356091335268, + "learning_rate": 4.7349900669668506e-06, + "loss": 0.9496, + "step": 4802 + }, + { + "epoch": 0.3469936966062817, + "grad_norm": 8.53447106445957, + "learning_rate": 4.734858988694918e-06, + "loss": 0.9007, + "step": 4803 + }, + { + "epoch": 0.34706594180649125, + "grad_norm": 7.611924764512946, + "learning_rate": 4.734727879829502e-06, + "loss": 1.0246, + "step": 4804 + }, + { + "epoch": 0.34713818700670074, + "grad_norm": 5.731649889457857, + "learning_rate": 4.7345967403723955e-06, + "loss": 0.8358, + "step": 4805 + }, + { + "epoch": 0.3472104322069103, + "grad_norm": 5.734139502572373, + "learning_rate": 4.734465570325394e-06, + "loss": 0.9071, + "step": 4806 + }, + { + "epoch": 0.34728267740711977, + "grad_norm": 7.836994803419747, + "learning_rate": 4.734334369690296e-06, + "loss": 0.9454, + "step": 4807 + }, + { + "epoch": 0.34735492260732925, + "grad_norm": 6.87729869907075, + "learning_rate": 4.734203138468893e-06, + "loss": 0.8749, + "step": 4808 + }, + { + "epoch": 0.3474271678075388, + "grad_norm": 6.687434971573438, + "learning_rate": 4.7340718766629856e-06, + "loss": 0.8804, + "step": 4809 + }, + { + "epoch": 0.3474994130077483, + "grad_norm": 6.478084003559828, + "learning_rate": 4.733940584274368e-06, + "loss": 0.939, + "step": 4810 + }, + { + "epoch": 0.3475716582079578, + "grad_norm": 7.367200703775946, + "learning_rate": 4.733809261304838e-06, + "loss": 0.9452, + "step": 4811 + }, + { + "epoch": 0.3476439034081673, + "grad_norm": 6.609927323442475, + "learning_rate": 4.733677907756194e-06, + "loss": 1.0468, + "step": 4812 + }, + { + "epoch": 0.34771614860837685, + "grad_norm": 6.919729481519643, + "learning_rate": 4.733546523630234e-06, + "loss": 0.8875, + "step": 4813 + }, + { + "epoch": 0.34778839380858634, + "grad_norm": 5.820319709517347, + "learning_rate": 4.733415108928756e-06, + "loss": 0.8658, + "step": 4814 + }, + { + "epoch": 0.3478606390087959, + "grad_norm": 7.105193868377313, + "learning_rate": 4.73328366365356e-06, + "loss": 0.8159, + "step": 4815 + }, + { + "epoch": 0.34793288420900537, + "grad_norm": 6.5540068390071085, + "learning_rate": 4.733152187806444e-06, + "loss": 0.8662, + "step": 4816 + }, + { + "epoch": 0.34800512940921485, + "grad_norm": 7.057643875023051, + "learning_rate": 4.7330206813892085e-06, + "loss": 0.9895, + "step": 4817 + }, + { + "epoch": 0.3480773746094244, + "grad_norm": 5.906394351224589, + "learning_rate": 4.732889144403654e-06, + "loss": 0.8879, + "step": 4818 + }, + { + "epoch": 0.3481496198096339, + "grad_norm": 5.732092795601168, + "learning_rate": 4.732757576851581e-06, + "loss": 0.9363, + "step": 4819 + }, + { + "epoch": 0.3482218650098434, + "grad_norm": 5.664929890480773, + "learning_rate": 4.73262597873479e-06, + "loss": 0.8782, + "step": 4820 + }, + { + "epoch": 0.3482941102100529, + "grad_norm": 6.024748623617686, + "learning_rate": 4.732494350055083e-06, + "loss": 0.9051, + "step": 4821 + }, + { + "epoch": 0.34836635541026245, + "grad_norm": 6.895924024955538, + "learning_rate": 4.732362690814262e-06, + "loss": 0.9158, + "step": 4822 + }, + { + "epoch": 0.34843860061047194, + "grad_norm": 5.5850359849445015, + "learning_rate": 4.732231001014129e-06, + "loss": 1.0064, + "step": 4823 + }, + { + "epoch": 0.3485108458106815, + "grad_norm": 7.8584129396885265, + "learning_rate": 4.732099280656486e-06, + "loss": 0.9082, + "step": 4824 + }, + { + "epoch": 0.34858309101089097, + "grad_norm": 7.697970075684421, + "learning_rate": 4.731967529743138e-06, + "loss": 0.9528, + "step": 4825 + }, + { + "epoch": 0.34865533621110045, + "grad_norm": 8.030673826456507, + "learning_rate": 4.731835748275887e-06, + "loss": 0.8399, + "step": 4826 + }, + { + "epoch": 0.34872758141131, + "grad_norm": 5.638044702304652, + "learning_rate": 4.731703936256537e-06, + "loss": 0.8677, + "step": 4827 + }, + { + "epoch": 0.3487998266115195, + "grad_norm": 6.656935105862315, + "learning_rate": 4.731572093686894e-06, + "loss": 0.9461, + "step": 4828 + }, + { + "epoch": 0.348872071811729, + "grad_norm": 7.309242240153499, + "learning_rate": 4.731440220568761e-06, + "loss": 0.8635, + "step": 4829 + }, + { + "epoch": 0.3489443170119385, + "grad_norm": 6.656515053499895, + "learning_rate": 4.731308316903945e-06, + "loss": 0.9509, + "step": 4830 + }, + { + "epoch": 0.34901656221214805, + "grad_norm": 6.465496922252779, + "learning_rate": 4.73117638269425e-06, + "loss": 0.9387, + "step": 4831 + }, + { + "epoch": 0.34908880741235754, + "grad_norm": 6.052854120104628, + "learning_rate": 4.731044417941483e-06, + "loss": 0.902, + "step": 4832 + }, + { + "epoch": 0.3491610526125671, + "grad_norm": 5.137125882451521, + "learning_rate": 4.730912422647449e-06, + "loss": 0.9325, + "step": 4833 + }, + { + "epoch": 0.34923329781277657, + "grad_norm": 7.37473878559517, + "learning_rate": 4.730780396813957e-06, + "loss": 0.9161, + "step": 4834 + }, + { + "epoch": 0.34930554301298605, + "grad_norm": 6.348134597360939, + "learning_rate": 4.730648340442814e-06, + "loss": 0.9087, + "step": 4835 + }, + { + "epoch": 0.3493777882131956, + "grad_norm": 6.084119179886959, + "learning_rate": 4.7305162535358265e-06, + "loss": 0.9036, + "step": 4836 + }, + { + "epoch": 0.3494500334134051, + "grad_norm": 6.644831503173456, + "learning_rate": 4.730384136094803e-06, + "loss": 0.8998, + "step": 4837 + }, + { + "epoch": 0.3495222786136146, + "grad_norm": 6.313926611110418, + "learning_rate": 4.730251988121554e-06, + "loss": 0.8668, + "step": 4838 + }, + { + "epoch": 0.3495945238138241, + "grad_norm": 5.982793613980866, + "learning_rate": 4.730119809617886e-06, + "loss": 0.8702, + "step": 4839 + }, + { + "epoch": 0.34966676901403365, + "grad_norm": 6.363790792597847, + "learning_rate": 4.7299876005856085e-06, + "loss": 0.9335, + "step": 4840 + }, + { + "epoch": 0.34973901421424314, + "grad_norm": 5.777259232816549, + "learning_rate": 4.729855361026533e-06, + "loss": 0.9321, + "step": 4841 + }, + { + "epoch": 0.3498112594144527, + "grad_norm": 7.729196251935109, + "learning_rate": 4.729723090942469e-06, + "loss": 0.9696, + "step": 4842 + }, + { + "epoch": 0.34988350461466217, + "grad_norm": 7.4130467642597235, + "learning_rate": 4.729590790335228e-06, + "loss": 0.9163, + "step": 4843 + }, + { + "epoch": 0.34995574981487165, + "grad_norm": 6.715547872023216, + "learning_rate": 4.729458459206619e-06, + "loss": 0.9378, + "step": 4844 + }, + { + "epoch": 0.3500279950150812, + "grad_norm": 7.241359165395393, + "learning_rate": 4.7293260975584555e-06, + "loss": 0.8755, + "step": 4845 + }, + { + "epoch": 0.3501002402152907, + "grad_norm": 6.153964320295394, + "learning_rate": 4.729193705392548e-06, + "loss": 0.9362, + "step": 4846 + }, + { + "epoch": 0.3501724854155002, + "grad_norm": 6.541357660790818, + "learning_rate": 4.72906128271071e-06, + "loss": 0.9262, + "step": 4847 + }, + { + "epoch": 0.3502447306157097, + "grad_norm": 6.240764661056168, + "learning_rate": 4.728928829514754e-06, + "loss": 0.8284, + "step": 4848 + }, + { + "epoch": 0.35031697581591925, + "grad_norm": 9.163450983299896, + "learning_rate": 4.728796345806492e-06, + "loss": 1.0413, + "step": 4849 + }, + { + "epoch": 0.35038922101612874, + "grad_norm": 8.588068967402812, + "learning_rate": 4.72866383158774e-06, + "loss": 0.8553, + "step": 4850 + }, + { + "epoch": 0.3504614662163382, + "grad_norm": 5.773281234670593, + "learning_rate": 4.728531286860309e-06, + "loss": 0.9089, + "step": 4851 + }, + { + "epoch": 0.35053371141654777, + "grad_norm": 7.070594402615584, + "learning_rate": 4.728398711626016e-06, + "loss": 0.9063, + "step": 4852 + }, + { + "epoch": 0.35060595661675725, + "grad_norm": 6.371737075218412, + "learning_rate": 4.728266105886675e-06, + "loss": 0.9184, + "step": 4853 + }, + { + "epoch": 0.3506782018169668, + "grad_norm": 8.023578230265317, + "learning_rate": 4.728133469644101e-06, + "loss": 0.9957, + "step": 4854 + }, + { + "epoch": 0.3507504470171763, + "grad_norm": 8.93158873228693, + "learning_rate": 4.728000802900109e-06, + "loss": 0.9389, + "step": 4855 + }, + { + "epoch": 0.3508226922173858, + "grad_norm": 7.792955533353454, + "learning_rate": 4.7278681056565165e-06, + "loss": 0.791, + "step": 4856 + }, + { + "epoch": 0.3508949374175953, + "grad_norm": 6.759743334169402, + "learning_rate": 4.72773537791514e-06, + "loss": 0.8517, + "step": 4857 + }, + { + "epoch": 0.35096718261780485, + "grad_norm": 9.683335719762127, + "learning_rate": 4.7276026196777955e-06, + "loss": 0.9379, + "step": 4858 + }, + { + "epoch": 0.35103942781801434, + "grad_norm": 8.912872315236815, + "learning_rate": 4.727469830946301e-06, + "loss": 1.0097, + "step": 4859 + }, + { + "epoch": 0.3511116730182238, + "grad_norm": 6.490038060339724, + "learning_rate": 4.7273370117224735e-06, + "loss": 0.9321, + "step": 4860 + }, + { + "epoch": 0.35118391821843337, + "grad_norm": 6.323096116194099, + "learning_rate": 4.727204162008132e-06, + "loss": 0.9648, + "step": 4861 + }, + { + "epoch": 0.35125616341864285, + "grad_norm": 7.922158621802766, + "learning_rate": 4.727071281805095e-06, + "loss": 0.9139, + "step": 4862 + }, + { + "epoch": 0.3513284086188524, + "grad_norm": 7.620837717140178, + "learning_rate": 4.726938371115182e-06, + "loss": 0.8704, + "step": 4863 + }, + { + "epoch": 0.3514006538190619, + "grad_norm": 6.160224677112821, + "learning_rate": 4.72680542994021e-06, + "loss": 0.9316, + "step": 4864 + }, + { + "epoch": 0.3514728990192714, + "grad_norm": 8.205823588859584, + "learning_rate": 4.7266724582820025e-06, + "loss": 0.9798, + "step": 4865 + }, + { + "epoch": 0.3515451442194809, + "grad_norm": 9.939693184920326, + "learning_rate": 4.726539456142377e-06, + "loss": 0.9463, + "step": 4866 + }, + { + "epoch": 0.35161738941969045, + "grad_norm": 5.252251823332373, + "learning_rate": 4.726406423523156e-06, + "loss": 0.8108, + "step": 4867 + }, + { + "epoch": 0.35168963461989994, + "grad_norm": 7.858255416575531, + "learning_rate": 4.726273360426158e-06, + "loss": 0.9343, + "step": 4868 + }, + { + "epoch": 0.3517618798201094, + "grad_norm": 5.891706048743006, + "learning_rate": 4.7261402668532075e-06, + "loss": 0.8706, + "step": 4869 + }, + { + "epoch": 0.35183412502031897, + "grad_norm": 6.899268302268688, + "learning_rate": 4.726007142806125e-06, + "loss": 0.9625, + "step": 4870 + }, + { + "epoch": 0.35190637022052845, + "grad_norm": 9.246484036703661, + "learning_rate": 4.725873988286733e-06, + "loss": 0.8858, + "step": 4871 + }, + { + "epoch": 0.351978615420738, + "grad_norm": 10.018311329243488, + "learning_rate": 4.725740803296855e-06, + "loss": 0.9347, + "step": 4872 + }, + { + "epoch": 0.3520508606209475, + "grad_norm": 9.381572809834738, + "learning_rate": 4.7256075878383125e-06, + "loss": 0.8859, + "step": 4873 + }, + { + "epoch": 0.352123105821157, + "grad_norm": 6.214186637852806, + "learning_rate": 4.725474341912931e-06, + "loss": 0.9023, + "step": 4874 + }, + { + "epoch": 0.3521953510213665, + "grad_norm": 7.045548612352585, + "learning_rate": 4.725341065522534e-06, + "loss": 0.85, + "step": 4875 + }, + { + "epoch": 0.35226759622157605, + "grad_norm": 7.861548667325699, + "learning_rate": 4.725207758668945e-06, + "loss": 0.911, + "step": 4876 + }, + { + "epoch": 0.35233984142178554, + "grad_norm": 6.849743313400339, + "learning_rate": 4.7250744213539905e-06, + "loss": 0.7814, + "step": 4877 + }, + { + "epoch": 0.352412086621995, + "grad_norm": 8.005900591120424, + "learning_rate": 4.724941053579493e-06, + "loss": 1.021, + "step": 4878 + }, + { + "epoch": 0.35248433182220457, + "grad_norm": 8.794414169292613, + "learning_rate": 4.724807655347281e-06, + "loss": 0.9711, + "step": 4879 + }, + { + "epoch": 0.35255657702241405, + "grad_norm": 8.534585042385078, + "learning_rate": 4.724674226659181e-06, + "loss": 0.9037, + "step": 4880 + }, + { + "epoch": 0.3526288222226236, + "grad_norm": 6.61368068833947, + "learning_rate": 4.724540767517017e-06, + "loss": 0.8663, + "step": 4881 + }, + { + "epoch": 0.3527010674228331, + "grad_norm": 8.228293791885772, + "learning_rate": 4.724407277922616e-06, + "loss": 0.9321, + "step": 4882 + }, + { + "epoch": 0.3527733126230426, + "grad_norm": 7.757025702800414, + "learning_rate": 4.724273757877808e-06, + "loss": 0.9164, + "step": 4883 + }, + { + "epoch": 0.3528455578232521, + "grad_norm": 8.151308162992391, + "learning_rate": 4.724140207384419e-06, + "loss": 0.8744, + "step": 4884 + }, + { + "epoch": 0.35291780302346165, + "grad_norm": 5.991667365585661, + "learning_rate": 4.724006626444277e-06, + "loss": 0.9936, + "step": 4885 + }, + { + "epoch": 0.35299004822367114, + "grad_norm": 7.456715868379862, + "learning_rate": 4.723873015059212e-06, + "loss": 0.9514, + "step": 4886 + }, + { + "epoch": 0.3530622934238806, + "grad_norm": 7.361815742340331, + "learning_rate": 4.723739373231051e-06, + "loss": 0.9352, + "step": 4887 + }, + { + "epoch": 0.35313453862409017, + "grad_norm": 7.040250315984422, + "learning_rate": 4.723605700961625e-06, + "loss": 0.8654, + "step": 4888 + }, + { + "epoch": 0.35320678382429965, + "grad_norm": 7.982055805008355, + "learning_rate": 4.723471998252764e-06, + "loss": 0.8885, + "step": 4889 + }, + { + "epoch": 0.3532790290245092, + "grad_norm": 7.045594904779572, + "learning_rate": 4.723338265106298e-06, + "loss": 0.9284, + "step": 4890 + }, + { + "epoch": 0.3533512742247187, + "grad_norm": 6.9757819171554445, + "learning_rate": 4.723204501524057e-06, + "loss": 0.9254, + "step": 4891 + }, + { + "epoch": 0.3534235194249282, + "grad_norm": 6.671180563885735, + "learning_rate": 4.723070707507873e-06, + "loss": 0.9172, + "step": 4892 + }, + { + "epoch": 0.3534957646251377, + "grad_norm": 7.615900832836879, + "learning_rate": 4.722936883059575e-06, + "loss": 0.8363, + "step": 4893 + }, + { + "epoch": 0.35356800982534725, + "grad_norm": 8.518594934363396, + "learning_rate": 4.722803028181e-06, + "loss": 0.9305, + "step": 4894 + }, + { + "epoch": 0.35364025502555674, + "grad_norm": 7.708305922450911, + "learning_rate": 4.722669142873976e-06, + "loss": 0.9412, + "step": 4895 + }, + { + "epoch": 0.3537125002257662, + "grad_norm": 6.018964677668047, + "learning_rate": 4.722535227140337e-06, + "loss": 0.9591, + "step": 4896 + }, + { + "epoch": 0.35378474542597577, + "grad_norm": 6.310936158102884, + "learning_rate": 4.722401280981917e-06, + "loss": 0.8832, + "step": 4897 + }, + { + "epoch": 0.35385699062618525, + "grad_norm": 5.459292009724019, + "learning_rate": 4.722267304400549e-06, + "loss": 0.8813, + "step": 4898 + }, + { + "epoch": 0.3539292358263948, + "grad_norm": 6.946128359613895, + "learning_rate": 4.722133297398067e-06, + "loss": 0.817, + "step": 4899 + }, + { + "epoch": 0.3540014810266043, + "grad_norm": 9.00629501596249, + "learning_rate": 4.721999259976305e-06, + "loss": 0.8911, + "step": 4900 + }, + { + "epoch": 0.3540737262268138, + "grad_norm": 7.3039210932335275, + "learning_rate": 4.7218651921370995e-06, + "loss": 0.8221, + "step": 4901 + }, + { + "epoch": 0.3541459714270233, + "grad_norm": 6.351632468576023, + "learning_rate": 4.721731093882284e-06, + "loss": 0.8819, + "step": 4902 + }, + { + "epoch": 0.35421821662723285, + "grad_norm": 7.255725047124695, + "learning_rate": 4.721596965213695e-06, + "loss": 0.9731, + "step": 4903 + }, + { + "epoch": 0.35429046182744234, + "grad_norm": 6.738085651599497, + "learning_rate": 4.721462806133168e-06, + "loss": 0.9219, + "step": 4904 + }, + { + "epoch": 0.3543627070276518, + "grad_norm": 6.936539867048155, + "learning_rate": 4.721328616642541e-06, + "loss": 0.8716, + "step": 4905 + }, + { + "epoch": 0.35443495222786137, + "grad_norm": 8.337714251692192, + "learning_rate": 4.721194396743649e-06, + "loss": 1.0583, + "step": 4906 + }, + { + "epoch": 0.35450719742807085, + "grad_norm": 7.4452289844804405, + "learning_rate": 4.721060146438331e-06, + "loss": 0.948, + "step": 4907 + }, + { + "epoch": 0.3545794426282804, + "grad_norm": 6.721874273593638, + "learning_rate": 4.720925865728424e-06, + "loss": 0.873, + "step": 4908 + }, + { + "epoch": 0.3546516878284899, + "grad_norm": 5.672519951924406, + "learning_rate": 4.720791554615767e-06, + "loss": 0.8823, + "step": 4909 + }, + { + "epoch": 0.3547239330286994, + "grad_norm": 9.500782181512577, + "learning_rate": 4.720657213102196e-06, + "loss": 0.9975, + "step": 4910 + }, + { + "epoch": 0.3547961782289089, + "grad_norm": 5.80890766006183, + "learning_rate": 4.720522841189553e-06, + "loss": 0.8909, + "step": 4911 + }, + { + "epoch": 0.35486842342911845, + "grad_norm": 7.082382976132078, + "learning_rate": 4.720388438879677e-06, + "loss": 1.0269, + "step": 4912 + }, + { + "epoch": 0.35494066862932794, + "grad_norm": 6.0012857331108185, + "learning_rate": 4.720254006174407e-06, + "loss": 0.8668, + "step": 4913 + }, + { + "epoch": 0.3550129138295374, + "grad_norm": 6.580836373832006, + "learning_rate": 4.720119543075584e-06, + "loss": 0.8437, + "step": 4914 + }, + { + "epoch": 0.35508515902974697, + "grad_norm": 6.576531803986631, + "learning_rate": 4.719985049585047e-06, + "loss": 0.8884, + "step": 4915 + }, + { + "epoch": 0.35515740422995645, + "grad_norm": 6.354360642104119, + "learning_rate": 4.71985052570464e-06, + "loss": 0.9202, + "step": 4916 + }, + { + "epoch": 0.355229649430166, + "grad_norm": 7.059490001293942, + "learning_rate": 4.719715971436202e-06, + "loss": 0.8587, + "step": 4917 + }, + { + "epoch": 0.3553018946303755, + "grad_norm": 6.259889331850962, + "learning_rate": 4.719581386781576e-06, + "loss": 0.8806, + "step": 4918 + }, + { + "epoch": 0.355374139830585, + "grad_norm": 6.222224965927488, + "learning_rate": 4.719446771742604e-06, + "loss": 0.8756, + "step": 4919 + }, + { + "epoch": 0.3554463850307945, + "grad_norm": 9.801724683069573, + "learning_rate": 4.71931212632113e-06, + "loss": 0.9288, + "step": 4920 + }, + { + "epoch": 0.35551863023100405, + "grad_norm": 5.880368316429272, + "learning_rate": 4.719177450518995e-06, + "loss": 0.9322, + "step": 4921 + }, + { + "epoch": 0.35559087543121354, + "grad_norm": 6.541198746130326, + "learning_rate": 4.719042744338044e-06, + "loss": 0.883, + "step": 4922 + }, + { + "epoch": 0.355663120631423, + "grad_norm": 7.341911702680111, + "learning_rate": 4.7189080077801205e-06, + "loss": 1.006, + "step": 4923 + }, + { + "epoch": 0.35573536583163257, + "grad_norm": 7.210970034376091, + "learning_rate": 4.71877324084707e-06, + "loss": 0.925, + "step": 4924 + }, + { + "epoch": 0.35580761103184205, + "grad_norm": 7.007113248541146, + "learning_rate": 4.718638443540736e-06, + "loss": 1.0031, + "step": 4925 + }, + { + "epoch": 0.3558798562320516, + "grad_norm": 5.589575376539317, + "learning_rate": 4.718503615862965e-06, + "loss": 0.9169, + "step": 4926 + }, + { + "epoch": 0.3559521014322611, + "grad_norm": 6.620192889028193, + "learning_rate": 4.718368757815601e-06, + "loss": 0.9676, + "step": 4927 + }, + { + "epoch": 0.3560243466324706, + "grad_norm": 7.052375220189145, + "learning_rate": 4.718233869400492e-06, + "loss": 0.8658, + "step": 4928 + }, + { + "epoch": 0.3560965918326801, + "grad_norm": 6.481501740116167, + "learning_rate": 4.718098950619484e-06, + "loss": 0.907, + "step": 4929 + }, + { + "epoch": 0.35616883703288965, + "grad_norm": 6.52317551869151, + "learning_rate": 4.717964001474422e-06, + "loss": 0.9071, + "step": 4930 + }, + { + "epoch": 0.35624108223309914, + "grad_norm": 5.323341168561284, + "learning_rate": 4.717829021967157e-06, + "loss": 0.9234, + "step": 4931 + }, + { + "epoch": 0.3563133274333086, + "grad_norm": 6.301280106569244, + "learning_rate": 4.717694012099533e-06, + "loss": 0.9423, + "step": 4932 + }, + { + "epoch": 0.35638557263351817, + "grad_norm": 7.1438605993325, + "learning_rate": 4.717558971873401e-06, + "loss": 0.8649, + "step": 4933 + }, + { + "epoch": 0.35645781783372765, + "grad_norm": 6.237873978956593, + "learning_rate": 4.717423901290608e-06, + "loss": 0.8732, + "step": 4934 + }, + { + "epoch": 0.3565300630339372, + "grad_norm": 7.462036690954316, + "learning_rate": 4.717288800353004e-06, + "loss": 0.8712, + "step": 4935 + }, + { + "epoch": 0.3566023082341467, + "grad_norm": 5.857999513031538, + "learning_rate": 4.717153669062437e-06, + "loss": 0.8249, + "step": 4936 + }, + { + "epoch": 0.3566745534343562, + "grad_norm": 5.871740532716421, + "learning_rate": 4.717018507420759e-06, + "loss": 0.9077, + "step": 4937 + }, + { + "epoch": 0.3567467986345657, + "grad_norm": 6.929176079458273, + "learning_rate": 4.716883315429819e-06, + "loss": 0.9238, + "step": 4938 + }, + { + "epoch": 0.35681904383477525, + "grad_norm": 7.624888434922804, + "learning_rate": 4.716748093091467e-06, + "loss": 0.8851, + "step": 4939 + }, + { + "epoch": 0.35689128903498474, + "grad_norm": 6.647667738285555, + "learning_rate": 4.716612840407555e-06, + "loss": 0.8153, + "step": 4940 + }, + { + "epoch": 0.3569635342351942, + "grad_norm": 6.544773246969527, + "learning_rate": 4.7164775573799335e-06, + "loss": 0.8422, + "step": 4941 + }, + { + "epoch": 0.35703577943540377, + "grad_norm": 7.085958226351303, + "learning_rate": 4.716342244010457e-06, + "loss": 0.8414, + "step": 4942 + }, + { + "epoch": 0.35710802463561325, + "grad_norm": 9.525553417398637, + "learning_rate": 4.716206900300974e-06, + "loss": 0.9852, + "step": 4943 + }, + { + "epoch": 0.3571802698358228, + "grad_norm": 6.616585624192105, + "learning_rate": 4.716071526253341e-06, + "loss": 0.9624, + "step": 4944 + }, + { + "epoch": 0.3572525150360323, + "grad_norm": 7.109905382430848, + "learning_rate": 4.715936121869408e-06, + "loss": 0.9158, + "step": 4945 + }, + { + "epoch": 0.3573247602362418, + "grad_norm": 5.886601781663906, + "learning_rate": 4.715800687151031e-06, + "loss": 1.0144, + "step": 4946 + }, + { + "epoch": 0.3573970054364513, + "grad_norm": 5.509353487056074, + "learning_rate": 4.715665222100063e-06, + "loss": 0.876, + "step": 4947 + }, + { + "epoch": 0.35746925063666085, + "grad_norm": 5.679100793394427, + "learning_rate": 4.715529726718359e-06, + "loss": 0.7607, + "step": 4948 + }, + { + "epoch": 0.35754149583687034, + "grad_norm": 7.393595124893546, + "learning_rate": 4.715394201007773e-06, + "loss": 0.9082, + "step": 4949 + }, + { + "epoch": 0.3576137410370798, + "grad_norm": 7.3679941813790775, + "learning_rate": 4.71525864497016e-06, + "loss": 0.905, + "step": 4950 + }, + { + "epoch": 0.35768598623728937, + "grad_norm": 7.920610616630715, + "learning_rate": 4.715123058607376e-06, + "loss": 0.9713, + "step": 4951 + }, + { + "epoch": 0.35775823143749885, + "grad_norm": 7.555563275327042, + "learning_rate": 4.714987441921277e-06, + "loss": 0.894, + "step": 4952 + }, + { + "epoch": 0.3578304766377084, + "grad_norm": 6.832243979857055, + "learning_rate": 4.7148517949137205e-06, + "loss": 0.9417, + "step": 4953 + }, + { + "epoch": 0.3579027218379179, + "grad_norm": 7.254915084079018, + "learning_rate": 4.714716117586563e-06, + "loss": 0.8664, + "step": 4954 + }, + { + "epoch": 0.3579749670381274, + "grad_norm": 6.613445931111876, + "learning_rate": 4.714580409941661e-06, + "loss": 0.9687, + "step": 4955 + }, + { + "epoch": 0.3580472122383369, + "grad_norm": 7.129334854410234, + "learning_rate": 4.714444671980873e-06, + "loss": 0.9598, + "step": 4956 + }, + { + "epoch": 0.35811945743854645, + "grad_norm": 6.245145209207071, + "learning_rate": 4.714308903706057e-06, + "loss": 0.9256, + "step": 4957 + }, + { + "epoch": 0.35819170263875594, + "grad_norm": 8.629167517406128, + "learning_rate": 4.714173105119071e-06, + "loss": 0.9485, + "step": 4958 + }, + { + "epoch": 0.3582639478389654, + "grad_norm": 8.052011217301843, + "learning_rate": 4.714037276221774e-06, + "loss": 0.9336, + "step": 4959 + }, + { + "epoch": 0.35833619303917497, + "grad_norm": 7.75341106534365, + "learning_rate": 4.713901417016026e-06, + "loss": 0.9014, + "step": 4960 + }, + { + "epoch": 0.35840843823938445, + "grad_norm": 6.764616505649515, + "learning_rate": 4.713765527503686e-06, + "loss": 0.9146, + "step": 4961 + }, + { + "epoch": 0.358480683439594, + "grad_norm": 6.2252059473493775, + "learning_rate": 4.713629607686616e-06, + "loss": 0.8882, + "step": 4962 + }, + { + "epoch": 0.3585529286398035, + "grad_norm": 8.1681396816763, + "learning_rate": 4.713493657566674e-06, + "loss": 0.9399, + "step": 4963 + }, + { + "epoch": 0.358625173840013, + "grad_norm": 9.081185696525822, + "learning_rate": 4.7133576771457246e-06, + "loss": 0.8967, + "step": 4964 + }, + { + "epoch": 0.3586974190402225, + "grad_norm": 7.250726334592307, + "learning_rate": 4.713221666425626e-06, + "loss": 0.927, + "step": 4965 + }, + { + "epoch": 0.35876966424043205, + "grad_norm": 6.880336008516436, + "learning_rate": 4.713085625408242e-06, + "loss": 0.9904, + "step": 4966 + }, + { + "epoch": 0.35884190944064154, + "grad_norm": 7.48058603857136, + "learning_rate": 4.712949554095433e-06, + "loss": 0.8682, + "step": 4967 + }, + { + "epoch": 0.358914154640851, + "grad_norm": 6.579162370882893, + "learning_rate": 4.7128134524890625e-06, + "loss": 0.9681, + "step": 4968 + }, + { + "epoch": 0.35898639984106057, + "grad_norm": 6.895976023815985, + "learning_rate": 4.712677320590995e-06, + "loss": 0.9969, + "step": 4969 + }, + { + "epoch": 0.35905864504127005, + "grad_norm": 7.56386484487934, + "learning_rate": 4.712541158403093e-06, + "loss": 0.9243, + "step": 4970 + }, + { + "epoch": 0.3591308902414796, + "grad_norm": 5.997600711487352, + "learning_rate": 4.71240496592722e-06, + "loss": 0.8798, + "step": 4971 + }, + { + "epoch": 0.3592031354416891, + "grad_norm": 7.335537088047281, + "learning_rate": 4.7122687431652404e-06, + "loss": 0.9176, + "step": 4972 + }, + { + "epoch": 0.3592753806418986, + "grad_norm": 6.059490912176211, + "learning_rate": 4.71213249011902e-06, + "loss": 0.8983, + "step": 4973 + }, + { + "epoch": 0.3593476258421081, + "grad_norm": 7.2123159824743315, + "learning_rate": 4.711996206790425e-06, + "loss": 0.8341, + "step": 4974 + }, + { + "epoch": 0.35941987104231765, + "grad_norm": 6.637187471952393, + "learning_rate": 4.711859893181317e-06, + "loss": 0.8325, + "step": 4975 + }, + { + "epoch": 0.35949211624252714, + "grad_norm": 7.252367619618292, + "learning_rate": 4.7117235492935654e-06, + "loss": 0.875, + "step": 4976 + }, + { + "epoch": 0.3595643614427366, + "grad_norm": 6.852940628487041, + "learning_rate": 4.711587175129036e-06, + "loss": 0.9612, + "step": 4977 + }, + { + "epoch": 0.35963660664294617, + "grad_norm": 5.521938438893067, + "learning_rate": 4.711450770689595e-06, + "loss": 0.9368, + "step": 4978 + }, + { + "epoch": 0.35970885184315565, + "grad_norm": 8.146482736079312, + "learning_rate": 4.711314335977109e-06, + "loss": 0.9944, + "step": 4979 + }, + { + "epoch": 0.3597810970433652, + "grad_norm": 8.039947431401963, + "learning_rate": 4.711177870993449e-06, + "loss": 0.8816, + "step": 4980 + }, + { + "epoch": 0.3598533422435747, + "grad_norm": 6.22656770751429, + "learning_rate": 4.71104137574048e-06, + "loss": 0.8752, + "step": 4981 + }, + { + "epoch": 0.3599255874437842, + "grad_norm": 5.752992929147599, + "learning_rate": 4.710904850220071e-06, + "loss": 0.8335, + "step": 4982 + }, + { + "epoch": 0.3599978326439937, + "grad_norm": 5.681863527983998, + "learning_rate": 4.71076829443409e-06, + "loss": 0.8607, + "step": 4983 + }, + { + "epoch": 0.36007007784420325, + "grad_norm": 5.768033483442453, + "learning_rate": 4.710631708384409e-06, + "loss": 0.9113, + "step": 4984 + }, + { + "epoch": 0.36014232304441274, + "grad_norm": 8.599244514815586, + "learning_rate": 4.710495092072896e-06, + "loss": 0.8845, + "step": 4985 + }, + { + "epoch": 0.3602145682446222, + "grad_norm": 6.6664333620574014, + "learning_rate": 4.710358445501422e-06, + "loss": 1.0013, + "step": 4986 + }, + { + "epoch": 0.36028681344483177, + "grad_norm": 6.990984969917284, + "learning_rate": 4.710221768671857e-06, + "loss": 0.8581, + "step": 4987 + }, + { + "epoch": 0.36035905864504125, + "grad_norm": 9.010902582785217, + "learning_rate": 4.710085061586071e-06, + "loss": 0.9043, + "step": 4988 + }, + { + "epoch": 0.3604313038452508, + "grad_norm": 5.49456223525628, + "learning_rate": 4.709948324245938e-06, + "loss": 0.9596, + "step": 4989 + }, + { + "epoch": 0.3605035490454603, + "grad_norm": 6.709405122359751, + "learning_rate": 4.709811556653328e-06, + "loss": 0.9269, + "step": 4990 + }, + { + "epoch": 0.3605757942456698, + "grad_norm": 7.094159219508558, + "learning_rate": 4.7096747588101134e-06, + "loss": 0.9666, + "step": 4991 + }, + { + "epoch": 0.3606480394458793, + "grad_norm": 6.690142813442611, + "learning_rate": 4.709537930718167e-06, + "loss": 0.8876, + "step": 4992 + }, + { + "epoch": 0.36072028464608885, + "grad_norm": 7.140229930007096, + "learning_rate": 4.709401072379361e-06, + "loss": 0.9425, + "step": 4993 + }, + { + "epoch": 0.36079252984629834, + "grad_norm": 5.609426854140983, + "learning_rate": 4.709264183795572e-06, + "loss": 0.8603, + "step": 4994 + }, + { + "epoch": 0.3608647750465078, + "grad_norm": 7.312993742095219, + "learning_rate": 4.7091272649686704e-06, + "loss": 0.9317, + "step": 4995 + }, + { + "epoch": 0.36093702024671737, + "grad_norm": 7.883961256422651, + "learning_rate": 4.708990315900531e-06, + "loss": 0.9281, + "step": 4996 + }, + { + "epoch": 0.36100926544692685, + "grad_norm": 6.757226254827905, + "learning_rate": 4.7088533365930315e-06, + "loss": 0.892, + "step": 4997 + }, + { + "epoch": 0.3610815106471364, + "grad_norm": 7.787677117502665, + "learning_rate": 4.708716327048043e-06, + "loss": 0.9, + "step": 4998 + }, + { + "epoch": 0.3611537558473459, + "grad_norm": 6.424814410071113, + "learning_rate": 4.708579287267444e-06, + "loss": 0.8679, + "step": 4999 + }, + { + "epoch": 0.3612260010475554, + "grad_norm": 6.434901926142485, + "learning_rate": 4.7084422172531085e-06, + "loss": 0.8629, + "step": 5000 + }, + { + "epoch": 0.3612982462477649, + "grad_norm": 7.9968300738009965, + "learning_rate": 4.708305117006914e-06, + "loss": 0.9542, + "step": 5001 + }, + { + "epoch": 0.36137049144797445, + "grad_norm": 6.459383905752508, + "learning_rate": 4.708167986530737e-06, + "loss": 0.9646, + "step": 5002 + }, + { + "epoch": 0.36144273664818394, + "grad_norm": 6.632019930194133, + "learning_rate": 4.708030825826456e-06, + "loss": 0.9413, + "step": 5003 + }, + { + "epoch": 0.3615149818483934, + "grad_norm": 6.486362235494471, + "learning_rate": 4.7078936348959456e-06, + "loss": 0.9766, + "step": 5004 + }, + { + "epoch": 0.36158722704860297, + "grad_norm": 6.505022602672438, + "learning_rate": 4.707756413741087e-06, + "loss": 0.8711, + "step": 5005 + }, + { + "epoch": 0.36165947224881245, + "grad_norm": 5.717785696640359, + "learning_rate": 4.707619162363757e-06, + "loss": 1.0074, + "step": 5006 + }, + { + "epoch": 0.361731717449022, + "grad_norm": 6.40044128565547, + "learning_rate": 4.707481880765835e-06, + "loss": 0.8916, + "step": 5007 + }, + { + "epoch": 0.3618039626492315, + "grad_norm": 5.483510180155958, + "learning_rate": 4.7073445689492e-06, + "loss": 0.886, + "step": 5008 + }, + { + "epoch": 0.361876207849441, + "grad_norm": 6.2171024459539295, + "learning_rate": 4.707207226915731e-06, + "loss": 0.859, + "step": 5009 + }, + { + "epoch": 0.3619484530496505, + "grad_norm": 7.26783374315313, + "learning_rate": 4.707069854667309e-06, + "loss": 0.8519, + "step": 5010 + }, + { + "epoch": 0.36202069824986005, + "grad_norm": 5.7918989102936305, + "learning_rate": 4.706932452205815e-06, + "loss": 0.8291, + "step": 5011 + }, + { + "epoch": 0.36209294345006954, + "grad_norm": 5.779093902210485, + "learning_rate": 4.706795019533129e-06, + "loss": 0.8611, + "step": 5012 + }, + { + "epoch": 0.362165188650279, + "grad_norm": 6.268945285117184, + "learning_rate": 4.706657556651133e-06, + "loss": 0.8412, + "step": 5013 + }, + { + "epoch": 0.36223743385048857, + "grad_norm": 6.754185544489044, + "learning_rate": 4.706520063561708e-06, + "loss": 0.8861, + "step": 5014 + }, + { + "epoch": 0.36230967905069805, + "grad_norm": 7.938975760172434, + "learning_rate": 4.706382540266736e-06, + "loss": 0.9324, + "step": 5015 + }, + { + "epoch": 0.3623819242509076, + "grad_norm": 6.856251902827761, + "learning_rate": 4.706244986768102e-06, + "loss": 0.8514, + "step": 5016 + }, + { + "epoch": 0.3624541694511171, + "grad_norm": 7.1145598903137595, + "learning_rate": 4.706107403067686e-06, + "loss": 0.8715, + "step": 5017 + }, + { + "epoch": 0.3625264146513266, + "grad_norm": 6.519998157711588, + "learning_rate": 4.705969789167372e-06, + "loss": 0.9317, + "step": 5018 + }, + { + "epoch": 0.3625986598515361, + "grad_norm": 6.571639608465659, + "learning_rate": 4.705832145069045e-06, + "loss": 0.9178, + "step": 5019 + }, + { + "epoch": 0.36267090505174565, + "grad_norm": 6.5244058038835995, + "learning_rate": 4.705694470774589e-06, + "loss": 0.957, + "step": 5020 + }, + { + "epoch": 0.36274315025195514, + "grad_norm": 7.021474324140929, + "learning_rate": 4.7055567662858876e-06, + "loss": 0.9329, + "step": 5021 + }, + { + "epoch": 0.3628153954521646, + "grad_norm": 5.924442751147241, + "learning_rate": 4.705419031604826e-06, + "loss": 0.9623, + "step": 5022 + }, + { + "epoch": 0.36288764065237417, + "grad_norm": 6.038704963947284, + "learning_rate": 4.705281266733292e-06, + "loss": 0.8708, + "step": 5023 + }, + { + "epoch": 0.36295988585258365, + "grad_norm": 7.551878224360889, + "learning_rate": 4.705143471673169e-06, + "loss": 0.9237, + "step": 5024 + }, + { + "epoch": 0.3630321310527932, + "grad_norm": 6.412808963701604, + "learning_rate": 4.705005646426344e-06, + "loss": 0.8918, + "step": 5025 + }, + { + "epoch": 0.3631043762530027, + "grad_norm": 6.1287945262482655, + "learning_rate": 4.704867790994704e-06, + "loss": 0.7919, + "step": 5026 + }, + { + "epoch": 0.3631766214532122, + "grad_norm": 7.174600187320572, + "learning_rate": 4.704729905380135e-06, + "loss": 0.8449, + "step": 5027 + }, + { + "epoch": 0.3632488666534217, + "grad_norm": 6.916511196855715, + "learning_rate": 4.704591989584527e-06, + "loss": 0.9097, + "step": 5028 + }, + { + "epoch": 0.36332111185363125, + "grad_norm": 7.9823590325853955, + "learning_rate": 4.704454043609765e-06, + "loss": 0.9288, + "step": 5029 + }, + { + "epoch": 0.36339335705384074, + "grad_norm": 6.431356220461797, + "learning_rate": 4.704316067457739e-06, + "loss": 0.8204, + "step": 5030 + }, + { + "epoch": 0.3634656022540502, + "grad_norm": 7.920124409787002, + "learning_rate": 4.704178061130338e-06, + "loss": 0.9231, + "step": 5031 + }, + { + "epoch": 0.36353784745425977, + "grad_norm": 8.974143715816506, + "learning_rate": 4.704040024629451e-06, + "loss": 0.8762, + "step": 5032 + }, + { + "epoch": 0.36361009265446925, + "grad_norm": 6.119944140110771, + "learning_rate": 4.703901957956967e-06, + "loss": 0.8917, + "step": 5033 + }, + { + "epoch": 0.3636823378546788, + "grad_norm": 6.571206266777593, + "learning_rate": 4.703763861114776e-06, + "loss": 0.9474, + "step": 5034 + }, + { + "epoch": 0.3637545830548883, + "grad_norm": 6.701470444292257, + "learning_rate": 4.70362573410477e-06, + "loss": 0.9708, + "step": 5035 + }, + { + "epoch": 0.3638268282550978, + "grad_norm": 7.626118312187047, + "learning_rate": 4.703487576928838e-06, + "loss": 0.9127, + "step": 5036 + }, + { + "epoch": 0.3638990734553073, + "grad_norm": 6.045129805087031, + "learning_rate": 4.7033493895888715e-06, + "loss": 0.9084, + "step": 5037 + }, + { + "epoch": 0.36397131865551685, + "grad_norm": 6.783132827882214, + "learning_rate": 4.703211172086764e-06, + "loss": 0.7618, + "step": 5038 + }, + { + "epoch": 0.36404356385572634, + "grad_norm": 7.623692134569398, + "learning_rate": 4.703072924424405e-06, + "loss": 0.8703, + "step": 5039 + }, + { + "epoch": 0.3641158090559358, + "grad_norm": 6.008494404165058, + "learning_rate": 4.702934646603689e-06, + "loss": 0.9508, + "step": 5040 + }, + { + "epoch": 0.36418805425614537, + "grad_norm": 6.6059339146347416, + "learning_rate": 4.702796338626507e-06, + "loss": 0.8222, + "step": 5041 + }, + { + "epoch": 0.36426029945635485, + "grad_norm": 7.10213831548948, + "learning_rate": 4.7026580004947545e-06, + "loss": 0.9414, + "step": 5042 + }, + { + "epoch": 0.3643325446565644, + "grad_norm": 8.03145756434722, + "learning_rate": 4.702519632210324e-06, + "loss": 0.9593, + "step": 5043 + }, + { + "epoch": 0.3644047898567739, + "grad_norm": 5.963620523211805, + "learning_rate": 4.70238123377511e-06, + "loss": 0.9251, + "step": 5044 + }, + { + "epoch": 0.3644770350569834, + "grad_norm": 6.666621907401829, + "learning_rate": 4.7022428051910066e-06, + "loss": 0.9167, + "step": 5045 + }, + { + "epoch": 0.3645492802571929, + "grad_norm": 7.752112808178339, + "learning_rate": 4.70210434645991e-06, + "loss": 0.9508, + "step": 5046 + }, + { + "epoch": 0.36462152545740245, + "grad_norm": 7.42533713451984, + "learning_rate": 4.7019658575837134e-06, + "loss": 0.9227, + "step": 5047 + }, + { + "epoch": 0.36469377065761194, + "grad_norm": 5.92707246355916, + "learning_rate": 4.701827338564316e-06, + "loss": 0.8724, + "step": 5048 + }, + { + "epoch": 0.3647660158578214, + "grad_norm": 7.168652311156676, + "learning_rate": 4.70168878940361e-06, + "loss": 0.9482, + "step": 5049 + }, + { + "epoch": 0.36483826105803097, + "grad_norm": 5.7039286008114, + "learning_rate": 4.7015502101034935e-06, + "loss": 1.0019, + "step": 5050 + }, + { + "epoch": 0.36491050625824045, + "grad_norm": 7.276956626763879, + "learning_rate": 4.701411600665866e-06, + "loss": 0.8795, + "step": 5051 + }, + { + "epoch": 0.36498275145845, + "grad_norm": 7.253462655139403, + "learning_rate": 4.701272961092622e-06, + "loss": 0.8687, + "step": 5052 + }, + { + "epoch": 0.3650549966586595, + "grad_norm": 6.586369872465699, + "learning_rate": 4.70113429138566e-06, + "loss": 1.0009, + "step": 5053 + }, + { + "epoch": 0.365127241858869, + "grad_norm": 7.464548118083085, + "learning_rate": 4.700995591546879e-06, + "loss": 0.8832, + "step": 5054 + }, + { + "epoch": 0.3651994870590785, + "grad_norm": 7.28006901802631, + "learning_rate": 4.700856861578177e-06, + "loss": 0.8895, + "step": 5055 + }, + { + "epoch": 0.36527173225928805, + "grad_norm": 6.862759322250542, + "learning_rate": 4.7007181014814544e-06, + "loss": 0.8947, + "step": 5056 + }, + { + "epoch": 0.36534397745949754, + "grad_norm": 7.35593290074971, + "learning_rate": 4.700579311258609e-06, + "loss": 0.9413, + "step": 5057 + }, + { + "epoch": 0.365416222659707, + "grad_norm": 8.277262177364689, + "learning_rate": 4.7004404909115405e-06, + "loss": 0.864, + "step": 5058 + }, + { + "epoch": 0.36548846785991657, + "grad_norm": 6.390918976753785, + "learning_rate": 4.700301640442152e-06, + "loss": 0.8117, + "step": 5059 + }, + { + "epoch": 0.36556071306012605, + "grad_norm": 5.943007635212768, + "learning_rate": 4.700162759852342e-06, + "loss": 0.8244, + "step": 5060 + }, + { + "epoch": 0.3656329582603356, + "grad_norm": 8.200134778659653, + "learning_rate": 4.700023849144011e-06, + "loss": 0.8931, + "step": 5061 + }, + { + "epoch": 0.3657052034605451, + "grad_norm": 4.96972953689855, + "learning_rate": 4.699884908319063e-06, + "loss": 0.8491, + "step": 5062 + }, + { + "epoch": 0.3657774486607546, + "grad_norm": 7.384574752475754, + "learning_rate": 4.699745937379399e-06, + "loss": 0.8493, + "step": 5063 + }, + { + "epoch": 0.3658496938609641, + "grad_norm": 6.414743143411688, + "learning_rate": 4.69960693632692e-06, + "loss": 0.8667, + "step": 5064 + }, + { + "epoch": 0.3659219390611736, + "grad_norm": 6.419975649588446, + "learning_rate": 4.69946790516353e-06, + "loss": 0.8907, + "step": 5065 + }, + { + "epoch": 0.36599418426138314, + "grad_norm": 6.246172228732276, + "learning_rate": 4.699328843891132e-06, + "loss": 0.9023, + "step": 5066 + }, + { + "epoch": 0.3660664294615926, + "grad_norm": 6.645849277712391, + "learning_rate": 4.699189752511631e-06, + "loss": 0.9341, + "step": 5067 + }, + { + "epoch": 0.36613867466180217, + "grad_norm": 8.65545804914606, + "learning_rate": 4.699050631026929e-06, + "loss": 0.9438, + "step": 5068 + }, + { + "epoch": 0.36621091986201165, + "grad_norm": 7.766229924486542, + "learning_rate": 4.6989114794389315e-06, + "loss": 0.8394, + "step": 5069 + }, + { + "epoch": 0.3662831650622212, + "grad_norm": 7.421230440761344, + "learning_rate": 4.698772297749543e-06, + "loss": 0.8711, + "step": 5070 + }, + { + "epoch": 0.3663554102624307, + "grad_norm": 7.081762999952766, + "learning_rate": 4.698633085960669e-06, + "loss": 0.9101, + "step": 5071 + }, + { + "epoch": 0.3664276554626402, + "grad_norm": 7.229764123975974, + "learning_rate": 4.6984938440742154e-06, + "loss": 0.8247, + "step": 5072 + }, + { + "epoch": 0.3664999006628497, + "grad_norm": 7.225530965229627, + "learning_rate": 4.6983545720920875e-06, + "loss": 0.8749, + "step": 5073 + }, + { + "epoch": 0.3665721458630592, + "grad_norm": 7.320512343440108, + "learning_rate": 4.6982152700161935e-06, + "loss": 0.8496, + "step": 5074 + }, + { + "epoch": 0.36664439106326874, + "grad_norm": 6.671605696592146, + "learning_rate": 4.698075937848438e-06, + "loss": 0.8011, + "step": 5075 + }, + { + "epoch": 0.3667166362634782, + "grad_norm": 7.420610500833346, + "learning_rate": 4.69793657559073e-06, + "loss": 0.976, + "step": 5076 + }, + { + "epoch": 0.36678888146368777, + "grad_norm": 6.78803922362451, + "learning_rate": 4.697797183244978e-06, + "loss": 0.8946, + "step": 5077 + }, + { + "epoch": 0.36686112666389725, + "grad_norm": 6.529413430477025, + "learning_rate": 4.697657760813087e-06, + "loss": 1.0076, + "step": 5078 + }, + { + "epoch": 0.3669333718641068, + "grad_norm": 6.971961132630625, + "learning_rate": 4.697518308296969e-06, + "loss": 1.0296, + "step": 5079 + }, + { + "epoch": 0.3670056170643163, + "grad_norm": 10.202471818798566, + "learning_rate": 4.697378825698532e-06, + "loss": 0.8563, + "step": 5080 + }, + { + "epoch": 0.3670778622645258, + "grad_norm": 6.511243632374442, + "learning_rate": 4.6972393130196845e-06, + "loss": 0.8924, + "step": 5081 + }, + { + "epoch": 0.3671501074647353, + "grad_norm": 7.225060813146289, + "learning_rate": 4.697099770262336e-06, + "loss": 0.8592, + "step": 5082 + }, + { + "epoch": 0.3672223526649448, + "grad_norm": 6.043441546832439, + "learning_rate": 4.696960197428398e-06, + "loss": 0.9356, + "step": 5083 + }, + { + "epoch": 0.36729459786515434, + "grad_norm": 5.995754647373749, + "learning_rate": 4.696820594519782e-06, + "loss": 0.9047, + "step": 5084 + }, + { + "epoch": 0.3673668430653638, + "grad_norm": 5.133989219544241, + "learning_rate": 4.696680961538397e-06, + "loss": 0.8688, + "step": 5085 + }, + { + "epoch": 0.36743908826557337, + "grad_norm": 5.168807949862192, + "learning_rate": 4.696541298486155e-06, + "loss": 0.9125, + "step": 5086 + }, + { + "epoch": 0.36751133346578285, + "grad_norm": 6.578675308223925, + "learning_rate": 4.696401605364968e-06, + "loss": 0.9334, + "step": 5087 + }, + { + "epoch": 0.3675835786659924, + "grad_norm": 6.805889877446283, + "learning_rate": 4.6962618821767485e-06, + "loss": 0.8058, + "step": 5088 + }, + { + "epoch": 0.3676558238662019, + "grad_norm": 6.837610007452434, + "learning_rate": 4.6961221289234095e-06, + "loss": 0.8759, + "step": 5089 + }, + { + "epoch": 0.3677280690664114, + "grad_norm": 7.091554814617222, + "learning_rate": 4.695982345606864e-06, + "loss": 0.8883, + "step": 5090 + }, + { + "epoch": 0.3678003142666209, + "grad_norm": 11.533417288708007, + "learning_rate": 4.695842532229025e-06, + "loss": 0.9535, + "step": 5091 + }, + { + "epoch": 0.3678725594668304, + "grad_norm": 6.664302629301291, + "learning_rate": 4.695702688791806e-06, + "loss": 0.9188, + "step": 5092 + }, + { + "epoch": 0.36794480466703994, + "grad_norm": 8.262444560316558, + "learning_rate": 4.695562815297122e-06, + "loss": 0.8967, + "step": 5093 + }, + { + "epoch": 0.3680170498672494, + "grad_norm": 5.867434236136001, + "learning_rate": 4.695422911746889e-06, + "loss": 0.875, + "step": 5094 + }, + { + "epoch": 0.36808929506745897, + "grad_norm": 7.964415085479196, + "learning_rate": 4.6952829781430194e-06, + "loss": 0.991, + "step": 5095 + }, + { + "epoch": 0.36816154026766845, + "grad_norm": 6.394462347387637, + "learning_rate": 4.695143014487432e-06, + "loss": 0.8164, + "step": 5096 + }, + { + "epoch": 0.368233785467878, + "grad_norm": 7.274515729693799, + "learning_rate": 4.69500302078204e-06, + "loss": 0.8488, + "step": 5097 + }, + { + "epoch": 0.3683060306680875, + "grad_norm": 6.662723742193948, + "learning_rate": 4.694862997028762e-06, + "loss": 0.9688, + "step": 5098 + }, + { + "epoch": 0.368378275868297, + "grad_norm": 9.377027775491749, + "learning_rate": 4.694722943229513e-06, + "loss": 0.9626, + "step": 5099 + }, + { + "epoch": 0.3684505210685065, + "grad_norm": 7.243423636245577, + "learning_rate": 4.694582859386211e-06, + "loss": 0.9235, + "step": 5100 + }, + { + "epoch": 0.368522766268716, + "grad_norm": 9.052727422125162, + "learning_rate": 4.694442745500773e-06, + "loss": 0.8921, + "step": 5101 + }, + { + "epoch": 0.36859501146892554, + "grad_norm": 6.3811735292461345, + "learning_rate": 4.694302601575119e-06, + "loss": 0.9253, + "step": 5102 + }, + { + "epoch": 0.368667256669135, + "grad_norm": 6.61165065682988, + "learning_rate": 4.694162427611165e-06, + "loss": 0.8607, + "step": 5103 + }, + { + "epoch": 0.36873950186934457, + "grad_norm": 5.746179057511724, + "learning_rate": 4.694022223610832e-06, + "loss": 0.8858, + "step": 5104 + }, + { + "epoch": 0.36881174706955405, + "grad_norm": 6.592532655695097, + "learning_rate": 4.693881989576038e-06, + "loss": 0.8893, + "step": 5105 + }, + { + "epoch": 0.3688839922697636, + "grad_norm": 6.267846529388446, + "learning_rate": 4.693741725508703e-06, + "loss": 0.9237, + "step": 5106 + }, + { + "epoch": 0.3689562374699731, + "grad_norm": 6.59901803543394, + "learning_rate": 4.693601431410747e-06, + "loss": 0.8916, + "step": 5107 + }, + { + "epoch": 0.3690284826701826, + "grad_norm": 8.260869120023877, + "learning_rate": 4.693461107284091e-06, + "loss": 0.938, + "step": 5108 + }, + { + "epoch": 0.3691007278703921, + "grad_norm": 8.338718454526882, + "learning_rate": 4.693320753130655e-06, + "loss": 0.9511, + "step": 5109 + }, + { + "epoch": 0.3691729730706016, + "grad_norm": 6.723385936939159, + "learning_rate": 4.693180368952361e-06, + "loss": 0.9767, + "step": 5110 + }, + { + "epoch": 0.36924521827081114, + "grad_norm": 6.968546637745322, + "learning_rate": 4.693039954751131e-06, + "loss": 0.9438, + "step": 5111 + }, + { + "epoch": 0.3693174634710206, + "grad_norm": 7.601604653660099, + "learning_rate": 4.692899510528886e-06, + "loss": 1.011, + "step": 5112 + }, + { + "epoch": 0.36938970867123017, + "grad_norm": 10.02838987652679, + "learning_rate": 4.69275903628755e-06, + "loss": 0.9059, + "step": 5113 + }, + { + "epoch": 0.36946195387143965, + "grad_norm": 7.4327120716622925, + "learning_rate": 4.692618532029046e-06, + "loss": 0.9632, + "step": 5114 + }, + { + "epoch": 0.3695341990716492, + "grad_norm": 6.9158631129929296, + "learning_rate": 4.692477997755296e-06, + "loss": 0.9647, + "step": 5115 + }, + { + "epoch": 0.3696064442718587, + "grad_norm": 6.823168039908594, + "learning_rate": 4.692337433468224e-06, + "loss": 0.8878, + "step": 5116 + }, + { + "epoch": 0.3696786894720682, + "grad_norm": 6.277819555513047, + "learning_rate": 4.692196839169756e-06, + "loss": 0.896, + "step": 5117 + }, + { + "epoch": 0.3697509346722777, + "grad_norm": 5.803460424011385, + "learning_rate": 4.692056214861815e-06, + "loss": 0.9324, + "step": 5118 + }, + { + "epoch": 0.3698231798724872, + "grad_norm": 5.827552910510954, + "learning_rate": 4.691915560546326e-06, + "loss": 0.9059, + "step": 5119 + }, + { + "epoch": 0.36989542507269674, + "grad_norm": 7.751625136753079, + "learning_rate": 4.691774876225215e-06, + "loss": 0.8827, + "step": 5120 + }, + { + "epoch": 0.3699676702729062, + "grad_norm": 5.202894784227545, + "learning_rate": 4.691634161900408e-06, + "loss": 0.9443, + "step": 5121 + }, + { + "epoch": 0.37003991547311577, + "grad_norm": 7.302734897365587, + "learning_rate": 4.69149341757383e-06, + "loss": 0.9627, + "step": 5122 + }, + { + "epoch": 0.37011216067332525, + "grad_norm": 8.094212817064404, + "learning_rate": 4.69135264324741e-06, + "loss": 0.9108, + "step": 5123 + }, + { + "epoch": 0.3701844058735348, + "grad_norm": 8.001072334899915, + "learning_rate": 4.691211838923073e-06, + "loss": 0.9936, + "step": 5124 + }, + { + "epoch": 0.3702566510737443, + "grad_norm": 7.057774946456157, + "learning_rate": 4.691071004602748e-06, + "loss": 0.9693, + "step": 5125 + }, + { + "epoch": 0.3703288962739538, + "grad_norm": 8.28314026186061, + "learning_rate": 4.690930140288362e-06, + "loss": 0.9636, + "step": 5126 + }, + { + "epoch": 0.3704011414741633, + "grad_norm": 6.2135374373501255, + "learning_rate": 4.690789245981843e-06, + "loss": 0.9795, + "step": 5127 + }, + { + "epoch": 0.3704733866743728, + "grad_norm": 6.704372618905871, + "learning_rate": 4.690648321685121e-06, + "loss": 0.945, + "step": 5128 + }, + { + "epoch": 0.37054563187458234, + "grad_norm": 6.921217773097917, + "learning_rate": 4.690507367400124e-06, + "loss": 0.9778, + "step": 5129 + }, + { + "epoch": 0.3706178770747918, + "grad_norm": 8.479396307707528, + "learning_rate": 4.690366383128782e-06, + "loss": 0.8512, + "step": 5130 + }, + { + "epoch": 0.37069012227500137, + "grad_norm": 5.849313627227383, + "learning_rate": 4.690225368873025e-06, + "loss": 0.9582, + "step": 5131 + }, + { + "epoch": 0.37076236747521085, + "grad_norm": 7.8086357630810745, + "learning_rate": 4.690084324634783e-06, + "loss": 0.9135, + "step": 5132 + }, + { + "epoch": 0.3708346126754204, + "grad_norm": 6.740914623562459, + "learning_rate": 4.689943250415988e-06, + "loss": 0.8988, + "step": 5133 + }, + { + "epoch": 0.3709068578756299, + "grad_norm": 6.4966877053893715, + "learning_rate": 4.6898021462185696e-06, + "loss": 0.919, + "step": 5134 + }, + { + "epoch": 0.3709791030758394, + "grad_norm": 6.945389943501024, + "learning_rate": 4.68966101204446e-06, + "loss": 0.8766, + "step": 5135 + }, + { + "epoch": 0.3710513482760489, + "grad_norm": 6.791957014309537, + "learning_rate": 4.689519847895591e-06, + "loss": 1.0238, + "step": 5136 + }, + { + "epoch": 0.3711235934762584, + "grad_norm": 7.453350967404564, + "learning_rate": 4.689378653773896e-06, + "loss": 0.9656, + "step": 5137 + }, + { + "epoch": 0.37119583867646794, + "grad_norm": 6.517584563227267, + "learning_rate": 4.689237429681307e-06, + "loss": 0.9392, + "step": 5138 + }, + { + "epoch": 0.3712680838766774, + "grad_norm": 5.981430883273412, + "learning_rate": 4.689096175619757e-06, + "loss": 0.8759, + "step": 5139 + }, + { + "epoch": 0.37134032907688697, + "grad_norm": 5.178580279413186, + "learning_rate": 4.688954891591181e-06, + "loss": 0.83, + "step": 5140 + }, + { + "epoch": 0.37141257427709645, + "grad_norm": 7.447807542406068, + "learning_rate": 4.688813577597511e-06, + "loss": 0.9338, + "step": 5141 + }, + { + "epoch": 0.371484819477306, + "grad_norm": 6.980011876486919, + "learning_rate": 4.688672233640683e-06, + "loss": 0.9568, + "step": 5142 + }, + { + "epoch": 0.3715570646775155, + "grad_norm": 6.205145134592536, + "learning_rate": 4.688530859722632e-06, + "loss": 0.8556, + "step": 5143 + }, + { + "epoch": 0.371629309877725, + "grad_norm": 6.674487993698643, + "learning_rate": 4.6883894558452926e-06, + "loss": 0.8532, + "step": 5144 + }, + { + "epoch": 0.3717015550779345, + "grad_norm": 5.766907091953759, + "learning_rate": 4.688248022010601e-06, + "loss": 0.9644, + "step": 5145 + }, + { + "epoch": 0.371773800278144, + "grad_norm": 7.451814100353716, + "learning_rate": 4.688106558220492e-06, + "loss": 0.9803, + "step": 5146 + }, + { + "epoch": 0.37184604547835354, + "grad_norm": 7.183477453687505, + "learning_rate": 4.687965064476904e-06, + "loss": 0.886, + "step": 5147 + }, + { + "epoch": 0.371918290678563, + "grad_norm": 5.773376051558987, + "learning_rate": 4.687823540781773e-06, + "loss": 0.8708, + "step": 5148 + }, + { + "epoch": 0.37199053587877257, + "grad_norm": 5.524377378706426, + "learning_rate": 4.687681987137036e-06, + "loss": 0.889, + "step": 5149 + }, + { + "epoch": 0.37206278107898205, + "grad_norm": 10.651660098501262, + "learning_rate": 4.687540403544633e-06, + "loss": 0.9234, + "step": 5150 + }, + { + "epoch": 0.3721350262791916, + "grad_norm": 6.7471324339709655, + "learning_rate": 4.687398790006498e-06, + "loss": 1.0042, + "step": 5151 + }, + { + "epoch": 0.3722072714794011, + "grad_norm": 6.832052188271946, + "learning_rate": 4.687257146524573e-06, + "loss": 0.9489, + "step": 5152 + }, + { + "epoch": 0.3722795166796106, + "grad_norm": 7.208090041213159, + "learning_rate": 4.687115473100796e-06, + "loss": 0.9724, + "step": 5153 + }, + { + "epoch": 0.3723517618798201, + "grad_norm": 6.55278647753131, + "learning_rate": 4.686973769737106e-06, + "loss": 0.8545, + "step": 5154 + }, + { + "epoch": 0.3724240070800296, + "grad_norm": 7.946023764792948, + "learning_rate": 4.686832036435444e-06, + "loss": 0.9161, + "step": 5155 + }, + { + "epoch": 0.37249625228023914, + "grad_norm": 6.953969016979536, + "learning_rate": 4.686690273197749e-06, + "loss": 0.9249, + "step": 5156 + }, + { + "epoch": 0.3725684974804486, + "grad_norm": 6.706705910862999, + "learning_rate": 4.686548480025962e-06, + "loss": 0.9076, + "step": 5157 + }, + { + "epoch": 0.37264074268065817, + "grad_norm": 7.413722907791714, + "learning_rate": 4.6864066569220235e-06, + "loss": 0.9018, + "step": 5158 + }, + { + "epoch": 0.37271298788086765, + "grad_norm": 6.420228473246232, + "learning_rate": 4.686264803887877e-06, + "loss": 0.8885, + "step": 5159 + }, + { + "epoch": 0.3727852330810772, + "grad_norm": 7.977526091864991, + "learning_rate": 4.6861229209254615e-06, + "loss": 0.9132, + "step": 5160 + }, + { + "epoch": 0.3728574782812867, + "grad_norm": 8.086695126040269, + "learning_rate": 4.685981008036721e-06, + "loss": 0.9636, + "step": 5161 + }, + { + "epoch": 0.3729297234814962, + "grad_norm": 5.537066488755875, + "learning_rate": 4.685839065223597e-06, + "loss": 0.7833, + "step": 5162 + }, + { + "epoch": 0.3730019686817057, + "grad_norm": 8.636725816935146, + "learning_rate": 4.685697092488034e-06, + "loss": 0.9737, + "step": 5163 + }, + { + "epoch": 0.3730742138819152, + "grad_norm": 7.015888166695513, + "learning_rate": 4.685555089831976e-06, + "loss": 0.9113, + "step": 5164 + }, + { + "epoch": 0.37314645908212474, + "grad_norm": 7.070644577405248, + "learning_rate": 4.6854130572573645e-06, + "loss": 0.8787, + "step": 5165 + }, + { + "epoch": 0.3732187042823342, + "grad_norm": 6.5957421642500265, + "learning_rate": 4.685270994766146e-06, + "loss": 0.9693, + "step": 5166 + }, + { + "epoch": 0.37329094948254377, + "grad_norm": 5.848307907702943, + "learning_rate": 4.685128902360263e-06, + "loss": 0.8612, + "step": 5167 + }, + { + "epoch": 0.37336319468275325, + "grad_norm": 7.805717270496901, + "learning_rate": 4.684986780041663e-06, + "loss": 0.9352, + "step": 5168 + }, + { + "epoch": 0.3734354398829628, + "grad_norm": 7.859752122692915, + "learning_rate": 4.68484462781229e-06, + "loss": 0.9766, + "step": 5169 + }, + { + "epoch": 0.3735076850831723, + "grad_norm": 6.392596325497311, + "learning_rate": 4.6847024456740905e-06, + "loss": 0.8656, + "step": 5170 + }, + { + "epoch": 0.3735799302833818, + "grad_norm": 6.339049152588703, + "learning_rate": 4.684560233629011e-06, + "loss": 0.9277, + "step": 5171 + }, + { + "epoch": 0.3736521754835913, + "grad_norm": 6.063789466504572, + "learning_rate": 4.684417991678999e-06, + "loss": 0.8256, + "step": 5172 + }, + { + "epoch": 0.3737244206838008, + "grad_norm": 7.572334389599429, + "learning_rate": 4.684275719825999e-06, + "loss": 0.8787, + "step": 5173 + }, + { + "epoch": 0.37379666588401034, + "grad_norm": 6.023949825204559, + "learning_rate": 4.684133418071962e-06, + "loss": 0.9036, + "step": 5174 + }, + { + "epoch": 0.3738689110842198, + "grad_norm": 7.1291034326325935, + "learning_rate": 4.683991086418833e-06, + "loss": 1.0101, + "step": 5175 + }, + { + "epoch": 0.37394115628442937, + "grad_norm": 6.404376691138799, + "learning_rate": 4.683848724868563e-06, + "loss": 0.8408, + "step": 5176 + }, + { + "epoch": 0.37401340148463885, + "grad_norm": 6.152167346181337, + "learning_rate": 4.6837063334230995e-06, + "loss": 0.9046, + "step": 5177 + }, + { + "epoch": 0.3740856466848484, + "grad_norm": 10.09554877064485, + "learning_rate": 4.6835639120843915e-06, + "loss": 1.0101, + "step": 5178 + }, + { + "epoch": 0.3741578918850579, + "grad_norm": 6.303982871385725, + "learning_rate": 4.683421460854388e-06, + "loss": 0.9325, + "step": 5179 + }, + { + "epoch": 0.3742301370852674, + "grad_norm": 5.2066898448654495, + "learning_rate": 4.683278979735041e-06, + "loss": 0.9144, + "step": 5180 + }, + { + "epoch": 0.3743023822854769, + "grad_norm": 6.49747241834457, + "learning_rate": 4.6831364687283e-06, + "loss": 0.8527, + "step": 5181 + }, + { + "epoch": 0.3743746274856864, + "grad_norm": 7.981557569200936, + "learning_rate": 4.682993927836116e-06, + "loss": 0.9736, + "step": 5182 + }, + { + "epoch": 0.37444687268589594, + "grad_norm": 7.689783765037047, + "learning_rate": 4.682851357060439e-06, + "loss": 0.9701, + "step": 5183 + }, + { + "epoch": 0.3745191178861054, + "grad_norm": 7.064428555649147, + "learning_rate": 4.682708756403223e-06, + "loss": 0.8188, + "step": 5184 + }, + { + "epoch": 0.37459136308631497, + "grad_norm": 6.158399797148886, + "learning_rate": 4.682566125866419e-06, + "loss": 0.9297, + "step": 5185 + }, + { + "epoch": 0.37466360828652445, + "grad_norm": 6.214285776924421, + "learning_rate": 4.682423465451979e-06, + "loss": 0.8724, + "step": 5186 + }, + { + "epoch": 0.374735853486734, + "grad_norm": 8.720623293657559, + "learning_rate": 4.682280775161856e-06, + "loss": 0.9322, + "step": 5187 + }, + { + "epoch": 0.3748080986869435, + "grad_norm": 7.08422777101162, + "learning_rate": 4.682138054998004e-06, + "loss": 0.9724, + "step": 5188 + }, + { + "epoch": 0.374880343887153, + "grad_norm": 7.39011295794929, + "learning_rate": 4.681995304962375e-06, + "loss": 0.8678, + "step": 5189 + }, + { + "epoch": 0.3749525890873625, + "grad_norm": 5.602174009944292, + "learning_rate": 4.681852525056927e-06, + "loss": 0.864, + "step": 5190 + }, + { + "epoch": 0.375024834287572, + "grad_norm": 8.76544308941145, + "learning_rate": 4.68170971528361e-06, + "loss": 0.8856, + "step": 5191 + }, + { + "epoch": 0.37509707948778154, + "grad_norm": 7.705475918517907, + "learning_rate": 4.6815668756443824e-06, + "loss": 0.9837, + "step": 5192 + }, + { + "epoch": 0.375169324687991, + "grad_norm": 6.387727187683236, + "learning_rate": 4.681424006141197e-06, + "loss": 0.9513, + "step": 5193 + }, + { + "epoch": 0.37524156988820057, + "grad_norm": 8.688880865554596, + "learning_rate": 4.681281106776011e-06, + "loss": 0.9268, + "step": 5194 + }, + { + "epoch": 0.37531381508841005, + "grad_norm": 7.763398432684578, + "learning_rate": 4.6811381775507805e-06, + "loss": 0.9226, + "step": 5195 + }, + { + "epoch": 0.3753860602886196, + "grad_norm": 5.467472297727353, + "learning_rate": 4.680995218467462e-06, + "loss": 0.8596, + "step": 5196 + }, + { + "epoch": 0.3754583054888291, + "grad_norm": 8.207812555771701, + "learning_rate": 4.680852229528012e-06, + "loss": 0.9231, + "step": 5197 + }, + { + "epoch": 0.3755305506890386, + "grad_norm": 8.9850033747646, + "learning_rate": 4.680709210734389e-06, + "loss": 0.9454, + "step": 5198 + }, + { + "epoch": 0.3756027958892481, + "grad_norm": 7.227321275621066, + "learning_rate": 4.680566162088549e-06, + "loss": 0.9379, + "step": 5199 + }, + { + "epoch": 0.3756750410894576, + "grad_norm": 7.9634467934911, + "learning_rate": 4.680423083592452e-06, + "loss": 0.8719, + "step": 5200 + }, + { + "epoch": 0.37574728628966714, + "grad_norm": 7.294698059652239, + "learning_rate": 4.680279975248057e-06, + "loss": 0.9425, + "step": 5201 + }, + { + "epoch": 0.3758195314898766, + "grad_norm": 6.788698105421446, + "learning_rate": 4.68013683705732e-06, + "loss": 0.9084, + "step": 5202 + }, + { + "epoch": 0.37589177669008617, + "grad_norm": 6.136327739572049, + "learning_rate": 4.679993669022204e-06, + "loss": 1.0004, + "step": 5203 + }, + { + "epoch": 0.37596402189029565, + "grad_norm": 6.032596734255846, + "learning_rate": 4.679850471144667e-06, + "loss": 0.8435, + "step": 5204 + }, + { + "epoch": 0.3760362670905052, + "grad_norm": 7.674036409110242, + "learning_rate": 4.679707243426669e-06, + "loss": 0.9929, + "step": 5205 + }, + { + "epoch": 0.3761085122907147, + "grad_norm": 7.77518113100094, + "learning_rate": 4.6795639858701715e-06, + "loss": 0.852, + "step": 5206 + }, + { + "epoch": 0.3761807574909242, + "grad_norm": 8.1826217911768, + "learning_rate": 4.679420698477135e-06, + "loss": 1.0343, + "step": 5207 + }, + { + "epoch": 0.3762530026911337, + "grad_norm": 6.965133481249379, + "learning_rate": 4.679277381249523e-06, + "loss": 0.8963, + "step": 5208 + }, + { + "epoch": 0.3763252478913432, + "grad_norm": 6.234743647563232, + "learning_rate": 4.679134034189294e-06, + "loss": 0.9972, + "step": 5209 + }, + { + "epoch": 0.37639749309155274, + "grad_norm": 6.87425810538939, + "learning_rate": 4.678990657298413e-06, + "loss": 0.8532, + "step": 5210 + }, + { + "epoch": 0.3764697382917622, + "grad_norm": 6.23533160296825, + "learning_rate": 4.678847250578841e-06, + "loss": 0.8139, + "step": 5211 + }, + { + "epoch": 0.37654198349197177, + "grad_norm": 7.27191323146106, + "learning_rate": 4.6787038140325424e-06, + "loss": 0.8056, + "step": 5212 + }, + { + "epoch": 0.37661422869218125, + "grad_norm": 7.137676272912936, + "learning_rate": 4.678560347661481e-06, + "loss": 0.9202, + "step": 5213 + }, + { + "epoch": 0.3766864738923908, + "grad_norm": 6.236934585783608, + "learning_rate": 4.678416851467618e-06, + "loss": 0.8631, + "step": 5214 + }, + { + "epoch": 0.3767587190926003, + "grad_norm": 9.12725258810826, + "learning_rate": 4.6782733254529215e-06, + "loss": 0.9854, + "step": 5215 + }, + { + "epoch": 0.3768309642928098, + "grad_norm": 6.477478330716541, + "learning_rate": 4.678129769619354e-06, + "loss": 0.9014, + "step": 5216 + }, + { + "epoch": 0.3769032094930193, + "grad_norm": 7.813736962621977, + "learning_rate": 4.6779861839688815e-06, + "loss": 0.9765, + "step": 5217 + }, + { + "epoch": 0.3769754546932288, + "grad_norm": 6.84066633313946, + "learning_rate": 4.6778425685034685e-06, + "loss": 0.8257, + "step": 5218 + }, + { + "epoch": 0.37704769989343834, + "grad_norm": 6.065330237999914, + "learning_rate": 4.677698923225082e-06, + "loss": 0.9208, + "step": 5219 + }, + { + "epoch": 0.3771199450936478, + "grad_norm": 6.236672190927614, + "learning_rate": 4.677555248135688e-06, + "loss": 0.9505, + "step": 5220 + }, + { + "epoch": 0.37719219029385737, + "grad_norm": 8.981060444975782, + "learning_rate": 4.6774115432372534e-06, + "loss": 0.9996, + "step": 5221 + }, + { + "epoch": 0.37726443549406685, + "grad_norm": 6.894792676081173, + "learning_rate": 4.6772678085317455e-06, + "loss": 1.0062, + "step": 5222 + }, + { + "epoch": 0.3773366806942764, + "grad_norm": 5.651377734432659, + "learning_rate": 4.677124044021132e-06, + "loss": 0.9562, + "step": 5223 + }, + { + "epoch": 0.3774089258944859, + "grad_norm": 6.2394350402566126, + "learning_rate": 4.676980249707381e-06, + "loss": 0.9509, + "step": 5224 + }, + { + "epoch": 0.3774811710946954, + "grad_norm": 7.718039290724301, + "learning_rate": 4.676836425592461e-06, + "loss": 0.9725, + "step": 5225 + }, + { + "epoch": 0.3775534162949049, + "grad_norm": 6.788129419750857, + "learning_rate": 4.6766925716783394e-06, + "loss": 0.8483, + "step": 5226 + }, + { + "epoch": 0.3776256614951144, + "grad_norm": 5.179892603523089, + "learning_rate": 4.6765486879669865e-06, + "loss": 0.9069, + "step": 5227 + }, + { + "epoch": 0.37769790669532394, + "grad_norm": 6.379885129462498, + "learning_rate": 4.676404774460373e-06, + "loss": 0.9014, + "step": 5228 + }, + { + "epoch": 0.3777701518955334, + "grad_norm": 7.919990751858331, + "learning_rate": 4.676260831160467e-06, + "loss": 0.904, + "step": 5229 + }, + { + "epoch": 0.37784239709574297, + "grad_norm": 6.18926483570892, + "learning_rate": 4.6761168580692415e-06, + "loss": 0.8886, + "step": 5230 + }, + { + "epoch": 0.37791464229595245, + "grad_norm": 5.997025706279517, + "learning_rate": 4.675972855188665e-06, + "loss": 0.9297, + "step": 5231 + }, + { + "epoch": 0.377986887496162, + "grad_norm": 7.5248844581427585, + "learning_rate": 4.675828822520709e-06, + "loss": 0.9755, + "step": 5232 + }, + { + "epoch": 0.3780591326963715, + "grad_norm": 7.152870946316465, + "learning_rate": 4.675684760067347e-06, + "loss": 0.8706, + "step": 5233 + }, + { + "epoch": 0.378131377896581, + "grad_norm": 6.5987622343059416, + "learning_rate": 4.6755406678305495e-06, + "loss": 0.8689, + "step": 5234 + }, + { + "epoch": 0.3782036230967905, + "grad_norm": 6.649839080928658, + "learning_rate": 4.67539654581229e-06, + "loss": 0.8952, + "step": 5235 + }, + { + "epoch": 0.378275868297, + "grad_norm": 5.754754795713634, + "learning_rate": 4.675252394014539e-06, + "loss": 1.0599, + "step": 5236 + }, + { + "epoch": 0.37834811349720954, + "grad_norm": 5.384715149552601, + "learning_rate": 4.675108212439273e-06, + "loss": 0.9237, + "step": 5237 + }, + { + "epoch": 0.378420358697419, + "grad_norm": 6.296313693868167, + "learning_rate": 4.6749640010884644e-06, + "loss": 0.9189, + "step": 5238 + }, + { + "epoch": 0.37849260389762857, + "grad_norm": 7.818078086761677, + "learning_rate": 4.674819759964088e-06, + "loss": 0.9619, + "step": 5239 + }, + { + "epoch": 0.37856484909783805, + "grad_norm": 7.330105446841567, + "learning_rate": 4.6746754890681165e-06, + "loss": 0.8886, + "step": 5240 + }, + { + "epoch": 0.3786370942980476, + "grad_norm": 6.538299114919882, + "learning_rate": 4.674531188402527e-06, + "loss": 0.8499, + "step": 5241 + }, + { + "epoch": 0.3787093394982571, + "grad_norm": 8.482710364150735, + "learning_rate": 4.674386857969293e-06, + "loss": 0.8991, + "step": 5242 + }, + { + "epoch": 0.3787815846984666, + "grad_norm": 7.74690430863758, + "learning_rate": 4.674242497770393e-06, + "loss": 0.9199, + "step": 5243 + }, + { + "epoch": 0.3788538298986761, + "grad_norm": 8.44429838600778, + "learning_rate": 4.6740981078078e-06, + "loss": 1.0164, + "step": 5244 + }, + { + "epoch": 0.3789260750988856, + "grad_norm": 7.340236131575411, + "learning_rate": 4.673953688083492e-06, + "loss": 0.8953, + "step": 5245 + }, + { + "epoch": 0.37899832029909514, + "grad_norm": 5.460622797508534, + "learning_rate": 4.673809238599446e-06, + "loss": 0.9007, + "step": 5246 + }, + { + "epoch": 0.3790705654993046, + "grad_norm": 7.939321954636003, + "learning_rate": 4.67366475935764e-06, + "loss": 0.8335, + "step": 5247 + }, + { + "epoch": 0.37914281069951417, + "grad_norm": 5.618651304131356, + "learning_rate": 4.673520250360051e-06, + "loss": 0.8569, + "step": 5248 + }, + { + "epoch": 0.37921505589972365, + "grad_norm": 6.728843027683952, + "learning_rate": 4.673375711608656e-06, + "loss": 0.8603, + "step": 5249 + }, + { + "epoch": 0.3792873010999332, + "grad_norm": 6.681867277287038, + "learning_rate": 4.6732311431054365e-06, + "loss": 0.9007, + "step": 5250 + }, + { + "epoch": 0.3793595463001427, + "grad_norm": 5.777967686262714, + "learning_rate": 4.67308654485237e-06, + "loss": 0.8322, + "step": 5251 + }, + { + "epoch": 0.3794317915003522, + "grad_norm": 5.537389246950336, + "learning_rate": 4.672941916851436e-06, + "loss": 0.9093, + "step": 5252 + }, + { + "epoch": 0.3795040367005617, + "grad_norm": 6.999095040816409, + "learning_rate": 4.6727972591046135e-06, + "loss": 0.8817, + "step": 5253 + }, + { + "epoch": 0.3795762819007712, + "grad_norm": 6.767179033145967, + "learning_rate": 4.672652571613885e-06, + "loss": 0.8812, + "step": 5254 + }, + { + "epoch": 0.37964852710098074, + "grad_norm": 6.796395681734202, + "learning_rate": 4.672507854381229e-06, + "loss": 0.9239, + "step": 5255 + }, + { + "epoch": 0.3797207723011902, + "grad_norm": 7.088823801032163, + "learning_rate": 4.672363107408627e-06, + "loss": 0.9581, + "step": 5256 + }, + { + "epoch": 0.37979301750139977, + "grad_norm": 6.376696790885593, + "learning_rate": 4.67221833069806e-06, + "loss": 0.8994, + "step": 5257 + }, + { + "epoch": 0.37986526270160925, + "grad_norm": 8.602248549928449, + "learning_rate": 4.672073524251513e-06, + "loss": 0.8846, + "step": 5258 + }, + { + "epoch": 0.3799375079018188, + "grad_norm": 5.956914061347313, + "learning_rate": 4.671928688070964e-06, + "loss": 0.8963, + "step": 5259 + }, + { + "epoch": 0.3800097531020283, + "grad_norm": 7.609100274173148, + "learning_rate": 4.671783822158398e-06, + "loss": 0.9859, + "step": 5260 + }, + { + "epoch": 0.3800819983022378, + "grad_norm": 7.458661915735435, + "learning_rate": 4.671638926515798e-06, + "loss": 0.8197, + "step": 5261 + }, + { + "epoch": 0.3801542435024473, + "grad_norm": 8.7500762936127, + "learning_rate": 4.671494001145147e-06, + "loss": 0.9667, + "step": 5262 + }, + { + "epoch": 0.3802264887026568, + "grad_norm": 5.85459082029613, + "learning_rate": 4.67134904604843e-06, + "loss": 0.8026, + "step": 5263 + }, + { + "epoch": 0.38029873390286634, + "grad_norm": 5.757862189193667, + "learning_rate": 4.67120406122763e-06, + "loss": 0.9325, + "step": 5264 + }, + { + "epoch": 0.3803709791030758, + "grad_norm": 7.325228055456917, + "learning_rate": 4.671059046684733e-06, + "loss": 0.8296, + "step": 5265 + }, + { + "epoch": 0.38044322430328537, + "grad_norm": 6.316151686370755, + "learning_rate": 4.670914002421722e-06, + "loss": 0.9203, + "step": 5266 + }, + { + "epoch": 0.38051546950349485, + "grad_norm": 6.912438015375346, + "learning_rate": 4.670768928440584e-06, + "loss": 0.8814, + "step": 5267 + }, + { + "epoch": 0.3805877147037044, + "grad_norm": 7.514442779126188, + "learning_rate": 4.670623824743306e-06, + "loss": 0.8536, + "step": 5268 + }, + { + "epoch": 0.3806599599039139, + "grad_norm": 6.007059077331013, + "learning_rate": 4.670478691331872e-06, + "loss": 0.8721, + "step": 5269 + }, + { + "epoch": 0.3807322051041234, + "grad_norm": 7.491724662287539, + "learning_rate": 4.6703335282082715e-06, + "loss": 0.87, + "step": 5270 + }, + { + "epoch": 0.3808044503043329, + "grad_norm": 8.457503054102686, + "learning_rate": 4.670188335374489e-06, + "loss": 0.8924, + "step": 5271 + }, + { + "epoch": 0.3808766955045424, + "grad_norm": 7.70937875067995, + "learning_rate": 4.670043112832513e-06, + "loss": 0.9104, + "step": 5272 + }, + { + "epoch": 0.38094894070475194, + "grad_norm": 8.094087925046212, + "learning_rate": 4.669897860584333e-06, + "loss": 0.9031, + "step": 5273 + }, + { + "epoch": 0.3810211859049614, + "grad_norm": 8.740633065278278, + "learning_rate": 4.669752578631935e-06, + "loss": 0.9094, + "step": 5274 + }, + { + "epoch": 0.38109343110517097, + "grad_norm": 8.063126414013325, + "learning_rate": 4.669607266977309e-06, + "loss": 0.8882, + "step": 5275 + }, + { + "epoch": 0.38116567630538045, + "grad_norm": 7.878681835937013, + "learning_rate": 4.6694619256224445e-06, + "loss": 0.854, + "step": 5276 + }, + { + "epoch": 0.38123792150559, + "grad_norm": 7.753062227756895, + "learning_rate": 4.669316554569331e-06, + "loss": 0.9796, + "step": 5277 + }, + { + "epoch": 0.3813101667057995, + "grad_norm": 7.7244022076360075, + "learning_rate": 4.669171153819957e-06, + "loss": 0.9053, + "step": 5278 + }, + { + "epoch": 0.38138241190600897, + "grad_norm": 6.622597888699732, + "learning_rate": 4.669025723376315e-06, + "loss": 0.9277, + "step": 5279 + }, + { + "epoch": 0.3814546571062185, + "grad_norm": 5.399036512251063, + "learning_rate": 4.668880263240395e-06, + "loss": 0.8608, + "step": 5280 + }, + { + "epoch": 0.381526902306428, + "grad_norm": 7.258471570923072, + "learning_rate": 4.668734773414188e-06, + "loss": 0.8997, + "step": 5281 + }, + { + "epoch": 0.38159914750663754, + "grad_norm": 6.977178702315354, + "learning_rate": 4.668589253899686e-06, + "loss": 0.853, + "step": 5282 + }, + { + "epoch": 0.381671392706847, + "grad_norm": 6.9911257486472405, + "learning_rate": 4.66844370469888e-06, + "loss": 0.9587, + "step": 5283 + }, + { + "epoch": 0.38174363790705657, + "grad_norm": 6.159866742265672, + "learning_rate": 4.668298125813765e-06, + "loss": 0.8299, + "step": 5284 + }, + { + "epoch": 0.38181588310726605, + "grad_norm": 6.270549104486571, + "learning_rate": 4.668152517246332e-06, + "loss": 0.8714, + "step": 5285 + }, + { + "epoch": 0.3818881283074756, + "grad_norm": 5.831314318793861, + "learning_rate": 4.668006878998574e-06, + "loss": 0.8402, + "step": 5286 + }, + { + "epoch": 0.3819603735076851, + "grad_norm": 6.487775898911888, + "learning_rate": 4.6678612110724855e-06, + "loss": 0.8742, + "step": 5287 + }, + { + "epoch": 0.38203261870789457, + "grad_norm": 6.858721795403867, + "learning_rate": 4.667715513470059e-06, + "loss": 0.8911, + "step": 5288 + }, + { + "epoch": 0.3821048639081041, + "grad_norm": 7.387530269657609, + "learning_rate": 4.6675697861932915e-06, + "loss": 0.9566, + "step": 5289 + }, + { + "epoch": 0.3821771091083136, + "grad_norm": 7.7016223734105, + "learning_rate": 4.667424029244176e-06, + "loss": 0.9973, + "step": 5290 + }, + { + "epoch": 0.38224935430852314, + "grad_norm": 7.762902133618123, + "learning_rate": 4.667278242624709e-06, + "loss": 0.8858, + "step": 5291 + }, + { + "epoch": 0.3823215995087326, + "grad_norm": 5.79128075647287, + "learning_rate": 4.667132426336886e-06, + "loss": 0.861, + "step": 5292 + }, + { + "epoch": 0.38239384470894217, + "grad_norm": 8.592887918407891, + "learning_rate": 4.666986580382702e-06, + "loss": 0.9645, + "step": 5293 + }, + { + "epoch": 0.38246608990915165, + "grad_norm": 5.9884513338125585, + "learning_rate": 4.666840704764154e-06, + "loss": 0.9016, + "step": 5294 + }, + { + "epoch": 0.3825383351093612, + "grad_norm": 5.702560104193849, + "learning_rate": 4.66669479948324e-06, + "loss": 0.8322, + "step": 5295 + }, + { + "epoch": 0.3826105803095707, + "grad_norm": 6.47259142054136, + "learning_rate": 4.666548864541956e-06, + "loss": 0.8425, + "step": 5296 + }, + { + "epoch": 0.38268282550978017, + "grad_norm": 7.7901195547309765, + "learning_rate": 4.6664028999423e-06, + "loss": 0.8839, + "step": 5297 + }, + { + "epoch": 0.3827550707099897, + "grad_norm": 8.547896134155614, + "learning_rate": 4.666256905686271e-06, + "loss": 0.8388, + "step": 5298 + }, + { + "epoch": 0.3828273159101992, + "grad_norm": 5.064805812596523, + "learning_rate": 4.666110881775867e-06, + "loss": 0.809, + "step": 5299 + }, + { + "epoch": 0.38289956111040874, + "grad_norm": 5.94027869806983, + "learning_rate": 4.665964828213086e-06, + "loss": 0.8777, + "step": 5300 + }, + { + "epoch": 0.3829718063106182, + "grad_norm": 8.467201232054283, + "learning_rate": 4.665818744999929e-06, + "loss": 0.9039, + "step": 5301 + }, + { + "epoch": 0.38304405151082777, + "grad_norm": 8.16075413304552, + "learning_rate": 4.665672632138395e-06, + "loss": 0.9486, + "step": 5302 + }, + { + "epoch": 0.38311629671103725, + "grad_norm": 7.824605855064482, + "learning_rate": 4.665526489630484e-06, + "loss": 0.9313, + "step": 5303 + }, + { + "epoch": 0.3831885419112468, + "grad_norm": 5.581113383467594, + "learning_rate": 4.6653803174781974e-06, + "loss": 0.9061, + "step": 5304 + }, + { + "epoch": 0.3832607871114563, + "grad_norm": 5.795127913846406, + "learning_rate": 4.665234115683535e-06, + "loss": 0.8592, + "step": 5305 + }, + { + "epoch": 0.38333303231166577, + "grad_norm": 6.805796273401009, + "learning_rate": 4.665087884248499e-06, + "loss": 0.9627, + "step": 5306 + }, + { + "epoch": 0.3834052775118753, + "grad_norm": 8.110773395957159, + "learning_rate": 4.66494162317509e-06, + "loss": 0.8043, + "step": 5307 + }, + { + "epoch": 0.3834775227120848, + "grad_norm": 7.518235227192382, + "learning_rate": 4.664795332465313e-06, + "loss": 0.9383, + "step": 5308 + }, + { + "epoch": 0.38354976791229434, + "grad_norm": 7.681522681763637, + "learning_rate": 4.664649012121168e-06, + "loss": 0.9536, + "step": 5309 + }, + { + "epoch": 0.3836220131125038, + "grad_norm": 6.630927547053831, + "learning_rate": 4.664502662144658e-06, + "loss": 0.8503, + "step": 5310 + }, + { + "epoch": 0.38369425831271337, + "grad_norm": 6.562735417094396, + "learning_rate": 4.664356282537787e-06, + "loss": 0.8419, + "step": 5311 + }, + { + "epoch": 0.38376650351292285, + "grad_norm": 6.9281393599028, + "learning_rate": 4.66420987330256e-06, + "loss": 0.8683, + "step": 5312 + }, + { + "epoch": 0.3838387487131324, + "grad_norm": 6.031197819459346, + "learning_rate": 4.66406343444098e-06, + "loss": 0.8993, + "step": 5313 + }, + { + "epoch": 0.3839109939133419, + "grad_norm": 6.532367227448123, + "learning_rate": 4.663916965955052e-06, + "loss": 0.8755, + "step": 5314 + }, + { + "epoch": 0.38398323911355137, + "grad_norm": 6.318095229698819, + "learning_rate": 4.66377046784678e-06, + "loss": 0.8935, + "step": 5315 + }, + { + "epoch": 0.3840554843137609, + "grad_norm": 5.787537491019902, + "learning_rate": 4.663623940118172e-06, + "loss": 0.9163, + "step": 5316 + }, + { + "epoch": 0.3841277295139704, + "grad_norm": 5.410911534880761, + "learning_rate": 4.66347738277123e-06, + "loss": 0.8645, + "step": 5317 + }, + { + "epoch": 0.38419997471417994, + "grad_norm": 6.507170903153059, + "learning_rate": 4.663330795807964e-06, + "loss": 0.8351, + "step": 5318 + }, + { + "epoch": 0.3842722199143894, + "grad_norm": 6.471244589007722, + "learning_rate": 4.6631841792303785e-06, + "loss": 0.9064, + "step": 5319 + }, + { + "epoch": 0.38434446511459897, + "grad_norm": 5.86102060225009, + "learning_rate": 4.663037533040482e-06, + "loss": 0.925, + "step": 5320 + }, + { + "epoch": 0.38441671031480845, + "grad_norm": 6.815366369462466, + "learning_rate": 4.662890857240281e-06, + "loss": 0.8939, + "step": 5321 + }, + { + "epoch": 0.384488955515018, + "grad_norm": 6.3613101348763115, + "learning_rate": 4.662744151831783e-06, + "loss": 0.9133, + "step": 5322 + }, + { + "epoch": 0.3845612007152275, + "grad_norm": 6.008624236550697, + "learning_rate": 4.662597416816997e-06, + "loss": 0.8555, + "step": 5323 + }, + { + "epoch": 0.38463344591543697, + "grad_norm": 6.166314123575401, + "learning_rate": 4.662450652197932e-06, + "loss": 0.9034, + "step": 5324 + }, + { + "epoch": 0.3847056911156465, + "grad_norm": 7.329202991574133, + "learning_rate": 4.662303857976595e-06, + "loss": 0.877, + "step": 5325 + }, + { + "epoch": 0.384777936315856, + "grad_norm": 7.52133805403313, + "learning_rate": 4.662157034154998e-06, + "loss": 0.7772, + "step": 5326 + }, + { + "epoch": 0.38485018151606554, + "grad_norm": 6.472393391992109, + "learning_rate": 4.662010180735151e-06, + "loss": 0.9287, + "step": 5327 + }, + { + "epoch": 0.384922426716275, + "grad_norm": 6.4271934001118565, + "learning_rate": 4.661863297719063e-06, + "loss": 0.9184, + "step": 5328 + }, + { + "epoch": 0.38499467191648457, + "grad_norm": 6.860303950568769, + "learning_rate": 4.661716385108744e-06, + "loss": 0.9197, + "step": 5329 + }, + { + "epoch": 0.38506691711669405, + "grad_norm": 6.344296286226838, + "learning_rate": 4.661569442906208e-06, + "loss": 0.8996, + "step": 5330 + }, + { + "epoch": 0.3851391623169036, + "grad_norm": 6.327791030924986, + "learning_rate": 4.6614224711134624e-06, + "loss": 0.9865, + "step": 5331 + }, + { + "epoch": 0.3852114075171131, + "grad_norm": 5.3622435964183035, + "learning_rate": 4.661275469732522e-06, + "loss": 0.8947, + "step": 5332 + }, + { + "epoch": 0.38528365271732257, + "grad_norm": 7.7788481778081175, + "learning_rate": 4.6611284387653995e-06, + "loss": 0.994, + "step": 5333 + }, + { + "epoch": 0.3853558979175321, + "grad_norm": 8.029318491898048, + "learning_rate": 4.660981378214106e-06, + "loss": 0.9198, + "step": 5334 + }, + { + "epoch": 0.3854281431177416, + "grad_norm": 5.866793805969968, + "learning_rate": 4.6608342880806555e-06, + "loss": 0.814, + "step": 5335 + }, + { + "epoch": 0.38550038831795114, + "grad_norm": 5.943539088023998, + "learning_rate": 4.660687168367062e-06, + "loss": 0.9264, + "step": 5336 + }, + { + "epoch": 0.3855726335181606, + "grad_norm": 6.8892826494188775, + "learning_rate": 4.660540019075338e-06, + "loss": 0.953, + "step": 5337 + }, + { + "epoch": 0.38564487871837017, + "grad_norm": 6.1608163386830785, + "learning_rate": 4.660392840207498e-06, + "loss": 0.9115, + "step": 5338 + }, + { + "epoch": 0.38571712391857965, + "grad_norm": 9.23372228287518, + "learning_rate": 4.6602456317655584e-06, + "loss": 0.9477, + "step": 5339 + }, + { + "epoch": 0.3857893691187892, + "grad_norm": 7.302657848037782, + "learning_rate": 4.660098393751534e-06, + "loss": 1.0181, + "step": 5340 + }, + { + "epoch": 0.3858616143189987, + "grad_norm": 7.534534545373702, + "learning_rate": 4.659951126167439e-06, + "loss": 0.9636, + "step": 5341 + }, + { + "epoch": 0.38593385951920817, + "grad_norm": 5.9968565017709095, + "learning_rate": 4.65980382901529e-06, + "loss": 0.8788, + "step": 5342 + }, + { + "epoch": 0.3860061047194177, + "grad_norm": 6.843273128808941, + "learning_rate": 4.659656502297104e-06, + "loss": 0.8988, + "step": 5343 + }, + { + "epoch": 0.3860783499196272, + "grad_norm": 6.661932440482471, + "learning_rate": 4.6595091460148976e-06, + "loss": 0.9442, + "step": 5344 + }, + { + "epoch": 0.38615059511983674, + "grad_norm": 8.468702252805898, + "learning_rate": 4.659361760170687e-06, + "loss": 0.925, + "step": 5345 + }, + { + "epoch": 0.3862228403200462, + "grad_norm": 7.59064460962306, + "learning_rate": 4.659214344766492e-06, + "loss": 0.9951, + "step": 5346 + }, + { + "epoch": 0.38629508552025577, + "grad_norm": 6.198030085020115, + "learning_rate": 4.6590668998043275e-06, + "loss": 1.0044, + "step": 5347 + }, + { + "epoch": 0.38636733072046525, + "grad_norm": 5.778117883078534, + "learning_rate": 4.658919425286214e-06, + "loss": 0.9512, + "step": 5348 + }, + { + "epoch": 0.3864395759206748, + "grad_norm": 8.185005565121763, + "learning_rate": 4.65877192121417e-06, + "loss": 0.9772, + "step": 5349 + }, + { + "epoch": 0.3865118211208843, + "grad_norm": 7.5840814521858, + "learning_rate": 4.6586243875902145e-06, + "loss": 0.9522, + "step": 5350 + }, + { + "epoch": 0.38658406632109377, + "grad_norm": 6.195329432807093, + "learning_rate": 4.658476824416367e-06, + "loss": 0.9954, + "step": 5351 + }, + { + "epoch": 0.3866563115213033, + "grad_norm": 7.163867344584944, + "learning_rate": 4.658329231694648e-06, + "loss": 0.9203, + "step": 5352 + }, + { + "epoch": 0.3867285567215128, + "grad_norm": 7.872058621847324, + "learning_rate": 4.6581816094270785e-06, + "loss": 0.9247, + "step": 5353 + }, + { + "epoch": 0.38680080192172234, + "grad_norm": 6.727424734680501, + "learning_rate": 4.658033957615677e-06, + "loss": 0.9036, + "step": 5354 + }, + { + "epoch": 0.3868730471219318, + "grad_norm": 6.109693435994636, + "learning_rate": 4.657886276262466e-06, + "loss": 0.9675, + "step": 5355 + }, + { + "epoch": 0.38694529232214137, + "grad_norm": 6.0243702919517474, + "learning_rate": 4.657738565369469e-06, + "loss": 0.8901, + "step": 5356 + }, + { + "epoch": 0.38701753752235085, + "grad_norm": 5.9982214516885, + "learning_rate": 4.6575908249387055e-06, + "loss": 0.9202, + "step": 5357 + }, + { + "epoch": 0.3870897827225604, + "grad_norm": 7.117761783481973, + "learning_rate": 4.657443054972199e-06, + "loss": 0.9406, + "step": 5358 + }, + { + "epoch": 0.3871620279227699, + "grad_norm": 5.743297153092525, + "learning_rate": 4.6572952554719715e-06, + "loss": 0.8916, + "step": 5359 + }, + { + "epoch": 0.38723427312297937, + "grad_norm": 7.320967507548578, + "learning_rate": 4.657147426440049e-06, + "loss": 0.9024, + "step": 5360 + }, + { + "epoch": 0.3873065183231889, + "grad_norm": 7.04047924014468, + "learning_rate": 4.656999567878451e-06, + "loss": 0.9487, + "step": 5361 + }, + { + "epoch": 0.3873787635233984, + "grad_norm": 6.554062423636937, + "learning_rate": 4.656851679789205e-06, + "loss": 0.937, + "step": 5362 + }, + { + "epoch": 0.38745100872360794, + "grad_norm": 5.479716961087754, + "learning_rate": 4.6567037621743335e-06, + "loss": 0.8606, + "step": 5363 + }, + { + "epoch": 0.3875232539238174, + "grad_norm": 7.690059328879684, + "learning_rate": 4.6565558150358625e-06, + "loss": 1.0513, + "step": 5364 + }, + { + "epoch": 0.38759549912402697, + "grad_norm": 6.83853856764813, + "learning_rate": 4.656407838375817e-06, + "loss": 0.9258, + "step": 5365 + }, + { + "epoch": 0.38766774432423645, + "grad_norm": 6.895854323740319, + "learning_rate": 4.656259832196222e-06, + "loss": 0.9639, + "step": 5366 + }, + { + "epoch": 0.387739989524446, + "grad_norm": 7.278346452828578, + "learning_rate": 4.656111796499104e-06, + "loss": 1.0007, + "step": 5367 + }, + { + "epoch": 0.3878122347246555, + "grad_norm": 5.491362812317087, + "learning_rate": 4.65596373128649e-06, + "loss": 0.8086, + "step": 5368 + }, + { + "epoch": 0.38788447992486497, + "grad_norm": 6.573785868472691, + "learning_rate": 4.655815636560407e-06, + "loss": 1.0255, + "step": 5369 + }, + { + "epoch": 0.3879567251250745, + "grad_norm": 6.349719916947648, + "learning_rate": 4.655667512322881e-06, + "loss": 0.8685, + "step": 5370 + }, + { + "epoch": 0.388028970325284, + "grad_norm": 6.5107582267173445, + "learning_rate": 4.655519358575941e-06, + "loss": 0.8321, + "step": 5371 + }, + { + "epoch": 0.38810121552549354, + "grad_norm": 6.989757128249455, + "learning_rate": 4.655371175321615e-06, + "loss": 0.9576, + "step": 5372 + }, + { + "epoch": 0.388173460725703, + "grad_norm": 6.29066391926687, + "learning_rate": 4.655222962561929e-06, + "loss": 0.8825, + "step": 5373 + }, + { + "epoch": 0.38824570592591257, + "grad_norm": 7.221366903406396, + "learning_rate": 4.6550747202989166e-06, + "loss": 0.9607, + "step": 5374 + }, + { + "epoch": 0.38831795112612205, + "grad_norm": 7.155207795458592, + "learning_rate": 4.6549264485346035e-06, + "loss": 0.8403, + "step": 5375 + }, + { + "epoch": 0.3883901963263316, + "grad_norm": 5.873524257286058, + "learning_rate": 4.65477814727102e-06, + "loss": 0.9241, + "step": 5376 + }, + { + "epoch": 0.3884624415265411, + "grad_norm": 7.325577738505056, + "learning_rate": 4.654629816510198e-06, + "loss": 0.9468, + "step": 5377 + }, + { + "epoch": 0.38853468672675057, + "grad_norm": 7.114753717380832, + "learning_rate": 4.654481456254166e-06, + "loss": 0.8085, + "step": 5378 + }, + { + "epoch": 0.3886069319269601, + "grad_norm": 6.809219550449997, + "learning_rate": 4.654333066504956e-06, + "loss": 1.0039, + "step": 5379 + }, + { + "epoch": 0.3886791771271696, + "grad_norm": 7.61792965134567, + "learning_rate": 4.654184647264599e-06, + "loss": 0.9537, + "step": 5380 + }, + { + "epoch": 0.38875142232737914, + "grad_norm": 6.0460734464504355, + "learning_rate": 4.654036198535127e-06, + "loss": 0.8863, + "step": 5381 + }, + { + "epoch": 0.3888236675275886, + "grad_norm": 5.938572515695158, + "learning_rate": 4.653887720318572e-06, + "loss": 0.9434, + "step": 5382 + }, + { + "epoch": 0.38889591272779817, + "grad_norm": 6.359037873051114, + "learning_rate": 4.653739212616966e-06, + "loss": 0.9238, + "step": 5383 + }, + { + "epoch": 0.38896815792800765, + "grad_norm": 5.6649517755568555, + "learning_rate": 4.6535906754323425e-06, + "loss": 0.8632, + "step": 5384 + }, + { + "epoch": 0.3890404031282172, + "grad_norm": 6.48468472763678, + "learning_rate": 4.653442108766735e-06, + "loss": 0.877, + "step": 5385 + }, + { + "epoch": 0.3891126483284267, + "grad_norm": 6.488755991342682, + "learning_rate": 4.653293512622176e-06, + "loss": 0.8993, + "step": 5386 + }, + { + "epoch": 0.38918489352863617, + "grad_norm": 7.235435589356359, + "learning_rate": 4.6531448870007025e-06, + "loss": 0.9405, + "step": 5387 + }, + { + "epoch": 0.3892571387288457, + "grad_norm": 7.95080964386143, + "learning_rate": 4.652996231904346e-06, + "loss": 0.8874, + "step": 5388 + }, + { + "epoch": 0.3893293839290552, + "grad_norm": 6.525655147942103, + "learning_rate": 4.652847547335144e-06, + "loss": 0.9027, + "step": 5389 + }, + { + "epoch": 0.38940162912926474, + "grad_norm": 6.030187009122539, + "learning_rate": 4.65269883329513e-06, + "loss": 0.8401, + "step": 5390 + }, + { + "epoch": 0.3894738743294742, + "grad_norm": 6.384571444737618, + "learning_rate": 4.65255008978634e-06, + "loss": 0.9341, + "step": 5391 + }, + { + "epoch": 0.38954611952968377, + "grad_norm": 5.661552851875542, + "learning_rate": 4.652401316810811e-06, + "loss": 0.8544, + "step": 5392 + }, + { + "epoch": 0.38961836472989325, + "grad_norm": 7.102634328064015, + "learning_rate": 4.652252514370579e-06, + "loss": 0.9567, + "step": 5393 + }, + { + "epoch": 0.3896906099301028, + "grad_norm": 7.187888756894763, + "learning_rate": 4.652103682467682e-06, + "loss": 0.8806, + "step": 5394 + }, + { + "epoch": 0.3897628551303123, + "grad_norm": 8.329896548797702, + "learning_rate": 4.651954821104156e-06, + "loss": 0.9739, + "step": 5395 + }, + { + "epoch": 0.38983510033052177, + "grad_norm": 5.85167984278206, + "learning_rate": 4.651805930282039e-06, + "loss": 0.8509, + "step": 5396 + }, + { + "epoch": 0.3899073455307313, + "grad_norm": 7.5454550092348125, + "learning_rate": 4.651657010003371e-06, + "loss": 0.9159, + "step": 5397 + }, + { + "epoch": 0.3899795907309408, + "grad_norm": 6.054661038248627, + "learning_rate": 4.651508060270188e-06, + "loss": 0.8189, + "step": 5398 + }, + { + "epoch": 0.39005183593115034, + "grad_norm": 6.492642860647578, + "learning_rate": 4.65135908108453e-06, + "loss": 0.9586, + "step": 5399 + }, + { + "epoch": 0.3901240811313598, + "grad_norm": 7.630617917538972, + "learning_rate": 4.651210072448437e-06, + "loss": 0.926, + "step": 5400 + }, + { + "epoch": 0.39019632633156937, + "grad_norm": 6.0301522160133745, + "learning_rate": 4.651061034363948e-06, + "loss": 0.8986, + "step": 5401 + }, + { + "epoch": 0.39026857153177885, + "grad_norm": 8.387800313061636, + "learning_rate": 4.650911966833104e-06, + "loss": 0.9049, + "step": 5402 + }, + { + "epoch": 0.3903408167319884, + "grad_norm": 6.855989007146473, + "learning_rate": 4.650762869857946e-06, + "loss": 0.8529, + "step": 5403 + }, + { + "epoch": 0.3904130619321979, + "grad_norm": 6.37455718054571, + "learning_rate": 4.650613743440513e-06, + "loss": 0.9229, + "step": 5404 + }, + { + "epoch": 0.39048530713240737, + "grad_norm": 7.312490870804264, + "learning_rate": 4.650464587582848e-06, + "loss": 0.8041, + "step": 5405 + }, + { + "epoch": 0.3905575523326169, + "grad_norm": 5.636749204049748, + "learning_rate": 4.6503154022869925e-06, + "loss": 0.9189, + "step": 5406 + }, + { + "epoch": 0.3906297975328264, + "grad_norm": 7.21063410326322, + "learning_rate": 4.65016618755499e-06, + "loss": 0.908, + "step": 5407 + }, + { + "epoch": 0.39070204273303594, + "grad_norm": 6.344234654746953, + "learning_rate": 4.65001694338888e-06, + "loss": 0.8697, + "step": 5408 + }, + { + "epoch": 0.3907742879332454, + "grad_norm": 7.878407059204757, + "learning_rate": 4.649867669790708e-06, + "loss": 0.8492, + "step": 5409 + }, + { + "epoch": 0.39084653313345497, + "grad_norm": 8.25508977923218, + "learning_rate": 4.649718366762518e-06, + "loss": 0.918, + "step": 5410 + }, + { + "epoch": 0.39091877833366445, + "grad_norm": 6.339138215035714, + "learning_rate": 4.649569034306352e-06, + "loss": 0.8712, + "step": 5411 + }, + { + "epoch": 0.390991023533874, + "grad_norm": 6.066433918159699, + "learning_rate": 4.649419672424254e-06, + "loss": 0.9045, + "step": 5412 + }, + { + "epoch": 0.3910632687340835, + "grad_norm": 5.088075624935569, + "learning_rate": 4.649270281118271e-06, + "loss": 0.9452, + "step": 5413 + }, + { + "epoch": 0.39113551393429297, + "grad_norm": 8.197601842267986, + "learning_rate": 4.649120860390446e-06, + "loss": 0.8363, + "step": 5414 + }, + { + "epoch": 0.3912077591345025, + "grad_norm": 8.000304693141722, + "learning_rate": 4.648971410242825e-06, + "loss": 0.9318, + "step": 5415 + }, + { + "epoch": 0.391280004334712, + "grad_norm": 7.108924264557807, + "learning_rate": 4.648821930677454e-06, + "loss": 0.8719, + "step": 5416 + }, + { + "epoch": 0.39135224953492154, + "grad_norm": 7.098718726372552, + "learning_rate": 4.6486724216963795e-06, + "loss": 0.8669, + "step": 5417 + }, + { + "epoch": 0.391424494735131, + "grad_norm": 6.608068170154289, + "learning_rate": 4.6485228833016485e-06, + "loss": 0.8829, + "step": 5418 + }, + { + "epoch": 0.39149673993534057, + "grad_norm": 7.031555169158799, + "learning_rate": 4.648373315495306e-06, + "loss": 0.9997, + "step": 5419 + }, + { + "epoch": 0.39156898513555005, + "grad_norm": 4.6371101400602734, + "learning_rate": 4.648223718279402e-06, + "loss": 0.7336, + "step": 5420 + }, + { + "epoch": 0.3916412303357596, + "grad_norm": 6.987321543589906, + "learning_rate": 4.648074091655983e-06, + "loss": 0.8773, + "step": 5421 + }, + { + "epoch": 0.3917134755359691, + "grad_norm": 7.684623117928401, + "learning_rate": 4.647924435627097e-06, + "loss": 0.919, + "step": 5422 + }, + { + "epoch": 0.39178572073617857, + "grad_norm": 6.8070511331555705, + "learning_rate": 4.647774750194794e-06, + "loss": 0.9248, + "step": 5423 + }, + { + "epoch": 0.3918579659363881, + "grad_norm": 5.989237669769854, + "learning_rate": 4.647625035361122e-06, + "loss": 0.9633, + "step": 5424 + }, + { + "epoch": 0.3919302111365976, + "grad_norm": 7.603835202313077, + "learning_rate": 4.647475291128131e-06, + "loss": 0.9416, + "step": 5425 + }, + { + "epoch": 0.39200245633680714, + "grad_norm": 5.84857826889014, + "learning_rate": 4.64732551749787e-06, + "loss": 0.8919, + "step": 5426 + }, + { + "epoch": 0.3920747015370166, + "grad_norm": 7.913283284304088, + "learning_rate": 4.64717571447239e-06, + "loss": 0.9226, + "step": 5427 + }, + { + "epoch": 0.39214694673722617, + "grad_norm": 8.450211507644397, + "learning_rate": 4.647025882053743e-06, + "loss": 1.0459, + "step": 5428 + }, + { + "epoch": 0.39221919193743565, + "grad_norm": 7.110302673850779, + "learning_rate": 4.646876020243978e-06, + "loss": 0.922, + "step": 5429 + }, + { + "epoch": 0.3922914371376452, + "grad_norm": 6.095315428935787, + "learning_rate": 4.646726129045146e-06, + "loss": 0.9363, + "step": 5430 + }, + { + "epoch": 0.3923636823378547, + "grad_norm": 5.7580655791565265, + "learning_rate": 4.646576208459302e-06, + "loss": 0.8129, + "step": 5431 + }, + { + "epoch": 0.39243592753806417, + "grad_norm": 8.32649968016742, + "learning_rate": 4.646426258488495e-06, + "loss": 0.8697, + "step": 5432 + }, + { + "epoch": 0.3925081727382737, + "grad_norm": 7.759136352064683, + "learning_rate": 4.64627627913478e-06, + "loss": 0.9647, + "step": 5433 + }, + { + "epoch": 0.3925804179384832, + "grad_norm": 6.813514283994227, + "learning_rate": 4.646126270400209e-06, + "loss": 0.9098, + "step": 5434 + }, + { + "epoch": 0.39265266313869274, + "grad_norm": 5.47423779481711, + "learning_rate": 4.645976232286835e-06, + "loss": 0.9386, + "step": 5435 + }, + { + "epoch": 0.3927249083389022, + "grad_norm": 7.5562291904664605, + "learning_rate": 4.645826164796714e-06, + "loss": 0.8995, + "step": 5436 + }, + { + "epoch": 0.39279715353911177, + "grad_norm": 7.097073083327919, + "learning_rate": 4.645676067931898e-06, + "loss": 0.9003, + "step": 5437 + }, + { + "epoch": 0.39286939873932125, + "grad_norm": 8.017406600251107, + "learning_rate": 4.645525941694442e-06, + "loss": 0.8731, + "step": 5438 + }, + { + "epoch": 0.3929416439395308, + "grad_norm": 7.478111879852939, + "learning_rate": 4.645375786086404e-06, + "loss": 0.87, + "step": 5439 + }, + { + "epoch": 0.3930138891397403, + "grad_norm": 6.7101902573470245, + "learning_rate": 4.645225601109835e-06, + "loss": 0.8424, + "step": 5440 + }, + { + "epoch": 0.39308613433994977, + "grad_norm": 7.264725036668521, + "learning_rate": 4.6450753867667944e-06, + "loss": 0.8467, + "step": 5441 + }, + { + "epoch": 0.3931583795401593, + "grad_norm": 5.521103512852642, + "learning_rate": 4.644925143059337e-06, + "loss": 0.8625, + "step": 5442 + }, + { + "epoch": 0.3932306247403688, + "grad_norm": 6.547916739165738, + "learning_rate": 4.644774869989519e-06, + "loss": 0.8882, + "step": 5443 + }, + { + "epoch": 0.39330286994057834, + "grad_norm": 7.325407455621096, + "learning_rate": 4.6446245675593994e-06, + "loss": 0.8846, + "step": 5444 + }, + { + "epoch": 0.3933751151407878, + "grad_norm": 6.144614129983793, + "learning_rate": 4.644474235771035e-06, + "loss": 0.8964, + "step": 5445 + }, + { + "epoch": 0.39344736034099737, + "grad_norm": 5.672547523834089, + "learning_rate": 4.644323874626482e-06, + "loss": 0.8125, + "step": 5446 + }, + { + "epoch": 0.39351960554120685, + "grad_norm": 5.382637275756039, + "learning_rate": 4.644173484127801e-06, + "loss": 0.8375, + "step": 5447 + }, + { + "epoch": 0.3935918507414164, + "grad_norm": 9.218907655563994, + "learning_rate": 4.64402306427705e-06, + "loss": 0.9101, + "step": 5448 + }, + { + "epoch": 0.3936640959416259, + "grad_norm": 6.195100989946392, + "learning_rate": 4.643872615076287e-06, + "loss": 0.8902, + "step": 5449 + }, + { + "epoch": 0.39373634114183537, + "grad_norm": 6.110752896685929, + "learning_rate": 4.643722136527573e-06, + "loss": 0.8366, + "step": 5450 + }, + { + "epoch": 0.3938085863420449, + "grad_norm": 7.129826834824256, + "learning_rate": 4.643571628632968e-06, + "loss": 0.8957, + "step": 5451 + }, + { + "epoch": 0.3938808315422544, + "grad_norm": 6.951785389299425, + "learning_rate": 4.643421091394531e-06, + "loss": 0.9546, + "step": 5452 + }, + { + "epoch": 0.39395307674246394, + "grad_norm": 10.147748470041543, + "learning_rate": 4.643270524814324e-06, + "loss": 0.9208, + "step": 5453 + }, + { + "epoch": 0.3940253219426734, + "grad_norm": 8.477941276114738, + "learning_rate": 4.6431199288944074e-06, + "loss": 0.8326, + "step": 5454 + }, + { + "epoch": 0.39409756714288297, + "grad_norm": 6.889411663742396, + "learning_rate": 4.642969303636843e-06, + "loss": 0.906, + "step": 5455 + }, + { + "epoch": 0.39416981234309245, + "grad_norm": 8.363908853728415, + "learning_rate": 4.642818649043693e-06, + "loss": 0.8627, + "step": 5456 + }, + { + "epoch": 0.394242057543302, + "grad_norm": 7.55940448050331, + "learning_rate": 4.6426679651170195e-06, + "loss": 0.9284, + "step": 5457 + }, + { + "epoch": 0.3943143027435115, + "grad_norm": 8.48514425538352, + "learning_rate": 4.6425172518588855e-06, + "loss": 0.9706, + "step": 5458 + }, + { + "epoch": 0.39438654794372097, + "grad_norm": 8.56335489502248, + "learning_rate": 4.642366509271353e-06, + "loss": 0.9906, + "step": 5459 + }, + { + "epoch": 0.3944587931439305, + "grad_norm": 6.095702186298389, + "learning_rate": 4.6422157373564865e-06, + "loss": 0.8714, + "step": 5460 + }, + { + "epoch": 0.39453103834414, + "grad_norm": 5.977757552599972, + "learning_rate": 4.642064936116351e-06, + "loss": 0.8376, + "step": 5461 + }, + { + "epoch": 0.39460328354434954, + "grad_norm": 6.106393071727004, + "learning_rate": 4.641914105553009e-06, + "loss": 0.9621, + "step": 5462 + }, + { + "epoch": 0.394675528744559, + "grad_norm": 7.106585078865537, + "learning_rate": 4.6417632456685256e-06, + "loss": 0.9677, + "step": 5463 + }, + { + "epoch": 0.39474777394476857, + "grad_norm": 5.49539807003279, + "learning_rate": 4.641612356464967e-06, + "loss": 0.8622, + "step": 5464 + }, + { + "epoch": 0.39482001914497805, + "grad_norm": 6.626020244996945, + "learning_rate": 4.641461437944398e-06, + "loss": 0.9309, + "step": 5465 + }, + { + "epoch": 0.3948922643451876, + "grad_norm": 6.291953006653773, + "learning_rate": 4.641310490108885e-06, + "loss": 0.9521, + "step": 5466 + }, + { + "epoch": 0.3949645095453971, + "grad_norm": 5.377253038028371, + "learning_rate": 4.641159512960493e-06, + "loss": 0.8483, + "step": 5467 + }, + { + "epoch": 0.39503675474560657, + "grad_norm": 6.0356318709227805, + "learning_rate": 4.64100850650129e-06, + "loss": 0.8133, + "step": 5468 + }, + { + "epoch": 0.3951089999458161, + "grad_norm": 5.443484290872491, + "learning_rate": 4.640857470733343e-06, + "loss": 0.9509, + "step": 5469 + }, + { + "epoch": 0.3951812451460256, + "grad_norm": 6.028023444744606, + "learning_rate": 4.64070640565872e-06, + "loss": 0.8208, + "step": 5470 + }, + { + "epoch": 0.39525349034623514, + "grad_norm": 6.513253711277544, + "learning_rate": 4.640555311279489e-06, + "loss": 0.9199, + "step": 5471 + }, + { + "epoch": 0.3953257355464446, + "grad_norm": 6.721526951559956, + "learning_rate": 4.640404187597717e-06, + "loss": 0.8222, + "step": 5472 + }, + { + "epoch": 0.39539798074665417, + "grad_norm": 6.111697952915849, + "learning_rate": 4.640253034615473e-06, + "loss": 0.8855, + "step": 5473 + }, + { + "epoch": 0.39547022594686365, + "grad_norm": 7.047840128441778, + "learning_rate": 4.640101852334827e-06, + "loss": 0.9382, + "step": 5474 + }, + { + "epoch": 0.3955424711470732, + "grad_norm": 6.714048364032711, + "learning_rate": 4.639950640757849e-06, + "loss": 0.905, + "step": 5475 + }, + { + "epoch": 0.3956147163472827, + "grad_norm": 6.8464759727796904, + "learning_rate": 4.639799399886607e-06, + "loss": 0.8658, + "step": 5476 + }, + { + "epoch": 0.39568696154749217, + "grad_norm": 6.764960487628137, + "learning_rate": 4.639648129723175e-06, + "loss": 0.8674, + "step": 5477 + }, + { + "epoch": 0.3957592067477017, + "grad_norm": 5.742067919673245, + "learning_rate": 4.63949683026962e-06, + "loss": 0.8564, + "step": 5478 + }, + { + "epoch": 0.3958314519479112, + "grad_norm": 9.191216579933164, + "learning_rate": 4.6393455015280145e-06, + "loss": 0.8443, + "step": 5479 + }, + { + "epoch": 0.39590369714812074, + "grad_norm": 7.3592669242163815, + "learning_rate": 4.6391941435004305e-06, + "loss": 0.9716, + "step": 5480 + }, + { + "epoch": 0.3959759423483302, + "grad_norm": 6.776921028783727, + "learning_rate": 4.639042756188939e-06, + "loss": 0.9247, + "step": 5481 + }, + { + "epoch": 0.39604818754853977, + "grad_norm": 7.056156516659594, + "learning_rate": 4.638891339595614e-06, + "loss": 0.9134, + "step": 5482 + }, + { + "epoch": 0.39612043274874925, + "grad_norm": 6.527836687981032, + "learning_rate": 4.638739893722527e-06, + "loss": 0.9092, + "step": 5483 + }, + { + "epoch": 0.3961926779489588, + "grad_norm": 5.3372248915946034, + "learning_rate": 4.638588418571751e-06, + "loss": 0.8256, + "step": 5484 + }, + { + "epoch": 0.3962649231491683, + "grad_norm": 7.3330792758744785, + "learning_rate": 4.63843691414536e-06, + "loss": 0.8914, + "step": 5485 + }, + { + "epoch": 0.39633716834937777, + "grad_norm": 5.742910243631381, + "learning_rate": 4.638285380445428e-06, + "loss": 0.7993, + "step": 5486 + }, + { + "epoch": 0.3964094135495873, + "grad_norm": 6.81222981608362, + "learning_rate": 4.63813381747403e-06, + "loss": 0.944, + "step": 5487 + }, + { + "epoch": 0.3964816587497968, + "grad_norm": 8.098543728502115, + "learning_rate": 4.63798222523324e-06, + "loss": 0.9342, + "step": 5488 + }, + { + "epoch": 0.39655390395000634, + "grad_norm": 6.531870835288059, + "learning_rate": 4.637830603725133e-06, + "loss": 0.8763, + "step": 5489 + }, + { + "epoch": 0.3966261491502158, + "grad_norm": 6.204331134215682, + "learning_rate": 4.637678952951786e-06, + "loss": 0.9467, + "step": 5490 + }, + { + "epoch": 0.39669839435042537, + "grad_norm": 8.322282355577528, + "learning_rate": 4.637527272915273e-06, + "loss": 0.97, + "step": 5491 + }, + { + "epoch": 0.39677063955063485, + "grad_norm": 6.449210764777764, + "learning_rate": 4.637375563617671e-06, + "loss": 0.9058, + "step": 5492 + }, + { + "epoch": 0.3968428847508444, + "grad_norm": 5.956077664454983, + "learning_rate": 4.637223825061058e-06, + "loss": 0.8645, + "step": 5493 + }, + { + "epoch": 0.3969151299510539, + "grad_norm": 6.552287556953213, + "learning_rate": 4.6370720572475104e-06, + "loss": 0.9296, + "step": 5494 + }, + { + "epoch": 0.39698737515126337, + "grad_norm": 7.065603738893681, + "learning_rate": 4.636920260179105e-06, + "loss": 0.9559, + "step": 5495 + }, + { + "epoch": 0.3970596203514729, + "grad_norm": 6.126263546526864, + "learning_rate": 4.63676843385792e-06, + "loss": 0.8418, + "step": 5496 + }, + { + "epoch": 0.3971318655516824, + "grad_norm": 6.492005934397017, + "learning_rate": 4.636616578286036e-06, + "loss": 0.8432, + "step": 5497 + }, + { + "epoch": 0.39720411075189194, + "grad_norm": 6.90642537028119, + "learning_rate": 4.636464693465529e-06, + "loss": 0.9155, + "step": 5498 + }, + { + "epoch": 0.3972763559521014, + "grad_norm": 6.483976420826195, + "learning_rate": 4.636312779398479e-06, + "loss": 0.9359, + "step": 5499 + }, + { + "epoch": 0.39734860115231097, + "grad_norm": 8.571883135050141, + "learning_rate": 4.636160836086966e-06, + "loss": 0.8205, + "step": 5500 + }, + { + "epoch": 0.39742084635252045, + "grad_norm": 6.6812732264344294, + "learning_rate": 4.636008863533069e-06, + "loss": 0.8874, + "step": 5501 + }, + { + "epoch": 0.39749309155272994, + "grad_norm": 5.885037228216167, + "learning_rate": 4.63585686173887e-06, + "loss": 0.8865, + "step": 5502 + }, + { + "epoch": 0.3975653367529395, + "grad_norm": 8.029636324845438, + "learning_rate": 4.635704830706449e-06, + "loss": 0.959, + "step": 5503 + }, + { + "epoch": 0.39763758195314897, + "grad_norm": 6.489787074565187, + "learning_rate": 4.635552770437887e-06, + "loss": 0.8183, + "step": 5504 + }, + { + "epoch": 0.3977098271533585, + "grad_norm": 7.6528462220177635, + "learning_rate": 4.6354006809352655e-06, + "loss": 0.8803, + "step": 5505 + }, + { + "epoch": 0.397782072353568, + "grad_norm": 6.638771570393667, + "learning_rate": 4.635248562200666e-06, + "loss": 0.9064, + "step": 5506 + }, + { + "epoch": 0.39785431755377754, + "grad_norm": 6.299958110851716, + "learning_rate": 4.635096414236173e-06, + "loss": 0.8671, + "step": 5507 + }, + { + "epoch": 0.397926562753987, + "grad_norm": 10.309982454893648, + "learning_rate": 4.6349442370438676e-06, + "loss": 0.9873, + "step": 5508 + }, + { + "epoch": 0.39799880795419657, + "grad_norm": 8.089849880037514, + "learning_rate": 4.634792030625833e-06, + "loss": 0.8818, + "step": 5509 + }, + { + "epoch": 0.39807105315440605, + "grad_norm": 6.712638026688445, + "learning_rate": 4.634639794984153e-06, + "loss": 0.9052, + "step": 5510 + }, + { + "epoch": 0.39814329835461554, + "grad_norm": 7.819170492133971, + "learning_rate": 4.634487530120911e-06, + "loss": 0.8318, + "step": 5511 + }, + { + "epoch": 0.3982155435548251, + "grad_norm": 10.202889455789977, + "learning_rate": 4.634335236038193e-06, + "loss": 0.9469, + "step": 5512 + }, + { + "epoch": 0.39828778875503457, + "grad_norm": 5.805467843220054, + "learning_rate": 4.634182912738084e-06, + "loss": 0.9502, + "step": 5513 + }, + { + "epoch": 0.3983600339552441, + "grad_norm": 7.5900430304673305, + "learning_rate": 4.634030560222665e-06, + "loss": 0.9215, + "step": 5514 + }, + { + "epoch": 0.3984322791554536, + "grad_norm": 6.692724736016703, + "learning_rate": 4.633878178494027e-06, + "loss": 0.9141, + "step": 5515 + }, + { + "epoch": 0.39850452435566314, + "grad_norm": 7.795924310307969, + "learning_rate": 4.633725767554253e-06, + "loss": 0.8769, + "step": 5516 + }, + { + "epoch": 0.3985767695558726, + "grad_norm": 6.3086521016070725, + "learning_rate": 4.633573327405429e-06, + "loss": 0.8888, + "step": 5517 + }, + { + "epoch": 0.39864901475608217, + "grad_norm": 7.02412752319875, + "learning_rate": 4.633420858049644e-06, + "loss": 0.9647, + "step": 5518 + }, + { + "epoch": 0.39872125995629165, + "grad_norm": 6.146740690159342, + "learning_rate": 4.633268359488983e-06, + "loss": 0.8807, + "step": 5519 + }, + { + "epoch": 0.39879350515650114, + "grad_norm": 7.806168821354075, + "learning_rate": 4.6331158317255355e-06, + "loss": 0.9375, + "step": 5520 + }, + { + "epoch": 0.3988657503567107, + "grad_norm": 5.707069349847156, + "learning_rate": 4.632963274761388e-06, + "loss": 0.9119, + "step": 5521 + }, + { + "epoch": 0.39893799555692017, + "grad_norm": 8.263458833448004, + "learning_rate": 4.632810688598629e-06, + "loss": 0.9217, + "step": 5522 + }, + { + "epoch": 0.3990102407571297, + "grad_norm": 10.448967943835822, + "learning_rate": 4.632658073239348e-06, + "loss": 0.9123, + "step": 5523 + }, + { + "epoch": 0.3990824859573392, + "grad_norm": 8.137029106239037, + "learning_rate": 4.632505428685634e-06, + "loss": 0.9093, + "step": 5524 + }, + { + "epoch": 0.39915473115754874, + "grad_norm": 6.345999182748246, + "learning_rate": 4.632352754939577e-06, + "loss": 0.9565, + "step": 5525 + }, + { + "epoch": 0.3992269763577582, + "grad_norm": 7.181042764327857, + "learning_rate": 4.632200052003265e-06, + "loss": 0.9087, + "step": 5526 + }, + { + "epoch": 0.39929922155796777, + "grad_norm": 6.561009991747948, + "learning_rate": 4.632047319878792e-06, + "loss": 0.8092, + "step": 5527 + }, + { + "epoch": 0.39937146675817725, + "grad_norm": 7.678729504957362, + "learning_rate": 4.631894558568245e-06, + "loss": 0.8661, + "step": 5528 + }, + { + "epoch": 0.39944371195838674, + "grad_norm": 7.670379928071803, + "learning_rate": 4.631741768073717e-06, + "loss": 0.9024, + "step": 5529 + }, + { + "epoch": 0.3995159571585963, + "grad_norm": 5.688559632574785, + "learning_rate": 4.6315889483973e-06, + "loss": 0.8961, + "step": 5530 + }, + { + "epoch": 0.39958820235880577, + "grad_norm": 6.793696219223508, + "learning_rate": 4.631436099541085e-06, + "loss": 1.0175, + "step": 5531 + }, + { + "epoch": 0.3996604475590153, + "grad_norm": 6.596136013981322, + "learning_rate": 4.631283221507164e-06, + "loss": 0.9409, + "step": 5532 + }, + { + "epoch": 0.3997326927592248, + "grad_norm": 7.079141101787255, + "learning_rate": 4.631130314297631e-06, + "loss": 0.9124, + "step": 5533 + }, + { + "epoch": 0.39980493795943434, + "grad_norm": 6.968460419116476, + "learning_rate": 4.630977377914579e-06, + "loss": 0.8944, + "step": 5534 + }, + { + "epoch": 0.3998771831596438, + "grad_norm": 9.234028404689541, + "learning_rate": 4.630824412360101e-06, + "loss": 0.9817, + "step": 5535 + }, + { + "epoch": 0.39994942835985337, + "grad_norm": 6.403883185117668, + "learning_rate": 4.630671417636292e-06, + "loss": 0.9314, + "step": 5536 + }, + { + "epoch": 0.40002167356006285, + "grad_norm": 6.116741295687528, + "learning_rate": 4.630518393745245e-06, + "loss": 0.7783, + "step": 5537 + }, + { + "epoch": 0.40009391876027234, + "grad_norm": 6.617422124493077, + "learning_rate": 4.630365340689056e-06, + "loss": 0.7966, + "step": 5538 + }, + { + "epoch": 0.4001661639604819, + "grad_norm": 6.501428593854302, + "learning_rate": 4.630212258469818e-06, + "loss": 0.9144, + "step": 5539 + }, + { + "epoch": 0.40023840916069137, + "grad_norm": 8.723117185817236, + "learning_rate": 4.63005914708963e-06, + "loss": 0.9708, + "step": 5540 + }, + { + "epoch": 0.4003106543609009, + "grad_norm": 5.57062579928452, + "learning_rate": 4.629906006550585e-06, + "loss": 0.8318, + "step": 5541 + }, + { + "epoch": 0.4003828995611104, + "grad_norm": 6.803217452073343, + "learning_rate": 4.629752836854781e-06, + "loss": 0.8456, + "step": 5542 + }, + { + "epoch": 0.40045514476131994, + "grad_norm": 6.723085220189439, + "learning_rate": 4.629599638004315e-06, + "loss": 0.8344, + "step": 5543 + }, + { + "epoch": 0.4005273899615294, + "grad_norm": 4.997738517500355, + "learning_rate": 4.629446410001283e-06, + "loss": 0.8527, + "step": 5544 + }, + { + "epoch": 0.40059963516173897, + "grad_norm": 6.417070582504126, + "learning_rate": 4.629293152847782e-06, + "loss": 0.8677, + "step": 5545 + }, + { + "epoch": 0.40067188036194845, + "grad_norm": 6.7729770807508975, + "learning_rate": 4.629139866545913e-06, + "loss": 0.8178, + "step": 5546 + }, + { + "epoch": 0.40074412556215794, + "grad_norm": 5.469681142827177, + "learning_rate": 4.6289865510977706e-06, + "loss": 0.8566, + "step": 5547 + }, + { + "epoch": 0.4008163707623675, + "grad_norm": 6.062310913912562, + "learning_rate": 4.628833206505457e-06, + "loss": 0.915, + "step": 5548 + }, + { + "epoch": 0.40088861596257697, + "grad_norm": 7.414869998093226, + "learning_rate": 4.6286798327710684e-06, + "loss": 0.941, + "step": 5549 + }, + { + "epoch": 0.4009608611627865, + "grad_norm": 7.946228754706168, + "learning_rate": 4.628526429896706e-06, + "loss": 0.9561, + "step": 5550 + }, + { + "epoch": 0.401033106362996, + "grad_norm": 6.7076031147256865, + "learning_rate": 4.628372997884469e-06, + "loss": 0.9674, + "step": 5551 + }, + { + "epoch": 0.40110535156320554, + "grad_norm": 6.509033381690775, + "learning_rate": 4.628219536736459e-06, + "loss": 0.8085, + "step": 5552 + }, + { + "epoch": 0.401177596763415, + "grad_norm": 5.869169873055243, + "learning_rate": 4.628066046454776e-06, + "loss": 0.9059, + "step": 5553 + }, + { + "epoch": 0.40124984196362457, + "grad_norm": 5.918323589300099, + "learning_rate": 4.627912527041521e-06, + "loss": 0.8669, + "step": 5554 + }, + { + "epoch": 0.40132208716383405, + "grad_norm": 6.812178857915897, + "learning_rate": 4.627758978498796e-06, + "loss": 0.8827, + "step": 5555 + }, + { + "epoch": 0.40139433236404354, + "grad_norm": 5.8602502601486455, + "learning_rate": 4.627605400828702e-06, + "loss": 0.8602, + "step": 5556 + }, + { + "epoch": 0.4014665775642531, + "grad_norm": 6.490771565001591, + "learning_rate": 4.627451794033342e-06, + "loss": 0.83, + "step": 5557 + }, + { + "epoch": 0.40153882276446257, + "grad_norm": 7.349024659922461, + "learning_rate": 4.62729815811482e-06, + "loss": 0.9682, + "step": 5558 + }, + { + "epoch": 0.4016110679646721, + "grad_norm": 7.400995764879722, + "learning_rate": 4.627144493075237e-06, + "loss": 0.9374, + "step": 5559 + }, + { + "epoch": 0.4016833131648816, + "grad_norm": 7.29701275256416, + "learning_rate": 4.626990798916697e-06, + "loss": 0.9461, + "step": 5560 + }, + { + "epoch": 0.40175555836509114, + "grad_norm": 7.187670896405807, + "learning_rate": 4.626837075641306e-06, + "loss": 0.9592, + "step": 5561 + }, + { + "epoch": 0.4018278035653006, + "grad_norm": 7.538586094484501, + "learning_rate": 4.626683323251166e-06, + "loss": 0.916, + "step": 5562 + }, + { + "epoch": 0.40190004876551016, + "grad_norm": 6.172441751074704, + "learning_rate": 4.626529541748382e-06, + "loss": 0.9277, + "step": 5563 + }, + { + "epoch": 0.40197229396571965, + "grad_norm": 8.197631158829417, + "learning_rate": 4.626375731135061e-06, + "loss": 0.9218, + "step": 5564 + }, + { + "epoch": 0.40204453916592914, + "grad_norm": 7.173403242590685, + "learning_rate": 4.626221891413306e-06, + "loss": 0.9231, + "step": 5565 + }, + { + "epoch": 0.4021167843661387, + "grad_norm": 7.634251719481501, + "learning_rate": 4.626068022585225e-06, + "loss": 0.8548, + "step": 5566 + }, + { + "epoch": 0.40218902956634817, + "grad_norm": 7.4103202097866365, + "learning_rate": 4.6259141246529235e-06, + "loss": 0.9286, + "step": 5567 + }, + { + "epoch": 0.4022612747665577, + "grad_norm": 7.441882469177983, + "learning_rate": 4.625760197618508e-06, + "loss": 0.878, + "step": 5568 + }, + { + "epoch": 0.4023335199667672, + "grad_norm": 7.465341468559057, + "learning_rate": 4.625606241484086e-06, + "loss": 0.9622, + "step": 5569 + }, + { + "epoch": 0.40240576516697674, + "grad_norm": 6.153285208224934, + "learning_rate": 4.625452256251765e-06, + "loss": 0.8877, + "step": 5570 + }, + { + "epoch": 0.4024780103671862, + "grad_norm": 6.630019679435879, + "learning_rate": 4.6252982419236524e-06, + "loss": 0.8446, + "step": 5571 + }, + { + "epoch": 0.40255025556739576, + "grad_norm": 6.126404581756353, + "learning_rate": 4.625144198501857e-06, + "loss": 0.8953, + "step": 5572 + }, + { + "epoch": 0.40262250076760525, + "grad_norm": 8.05419446766399, + "learning_rate": 4.6249901259884886e-06, + "loss": 1.0324, + "step": 5573 + }, + { + "epoch": 0.40269474596781474, + "grad_norm": 6.99474600755489, + "learning_rate": 4.624836024385655e-06, + "loss": 0.9612, + "step": 5574 + }, + { + "epoch": 0.4027669911680243, + "grad_norm": 6.0767431576103705, + "learning_rate": 4.624681893695466e-06, + "loss": 0.9443, + "step": 5575 + }, + { + "epoch": 0.40283923636823377, + "grad_norm": 8.835896486006714, + "learning_rate": 4.624527733920032e-06, + "loss": 0.9713, + "step": 5576 + }, + { + "epoch": 0.4029114815684433, + "grad_norm": 8.091403930463363, + "learning_rate": 4.624373545061463e-06, + "loss": 0.9119, + "step": 5577 + }, + { + "epoch": 0.4029837267686528, + "grad_norm": 5.963162827987002, + "learning_rate": 4.624219327121869e-06, + "loss": 0.9103, + "step": 5578 + }, + { + "epoch": 0.40305597196886234, + "grad_norm": 6.315556758642176, + "learning_rate": 4.624065080103362e-06, + "loss": 0.9285, + "step": 5579 + }, + { + "epoch": 0.4031282171690718, + "grad_norm": 6.237798759323389, + "learning_rate": 4.6239108040080524e-06, + "loss": 0.905, + "step": 5580 + }, + { + "epoch": 0.40320046236928136, + "grad_norm": 5.534730495104742, + "learning_rate": 4.623756498838054e-06, + "loss": 0.9302, + "step": 5581 + }, + { + "epoch": 0.40327270756949085, + "grad_norm": 5.529668032182635, + "learning_rate": 4.623602164595478e-06, + "loss": 0.8555, + "step": 5582 + }, + { + "epoch": 0.40334495276970034, + "grad_norm": 8.157475288112852, + "learning_rate": 4.623447801282437e-06, + "loss": 0.9006, + "step": 5583 + }, + { + "epoch": 0.4034171979699099, + "grad_norm": 6.6751723706566946, + "learning_rate": 4.623293408901044e-06, + "loss": 0.8753, + "step": 5584 + }, + { + "epoch": 0.40348944317011937, + "grad_norm": 7.313075898796791, + "learning_rate": 4.623138987453414e-06, + "loss": 0.7452, + "step": 5585 + }, + { + "epoch": 0.4035616883703289, + "grad_norm": 6.418016300036879, + "learning_rate": 4.622984536941658e-06, + "loss": 0.9538, + "step": 5586 + }, + { + "epoch": 0.4036339335705384, + "grad_norm": 7.515944761583848, + "learning_rate": 4.622830057367894e-06, + "loss": 0.9723, + "step": 5587 + }, + { + "epoch": 0.40370617877074794, + "grad_norm": 7.090206387075635, + "learning_rate": 4.622675548734233e-06, + "loss": 0.9453, + "step": 5588 + }, + { + "epoch": 0.4037784239709574, + "grad_norm": 9.00426424518783, + "learning_rate": 4.622521011042793e-06, + "loss": 0.9632, + "step": 5589 + }, + { + "epoch": 0.40385066917116696, + "grad_norm": 5.671026833750753, + "learning_rate": 4.622366444295688e-06, + "loss": 0.8742, + "step": 5590 + }, + { + "epoch": 0.40392291437137645, + "grad_norm": 6.435321920859722, + "learning_rate": 4.622211848495035e-06, + "loss": 0.9493, + "step": 5591 + }, + { + "epoch": 0.40399515957158594, + "grad_norm": 8.117544656483265, + "learning_rate": 4.622057223642949e-06, + "loss": 0.8736, + "step": 5592 + }, + { + "epoch": 0.4040674047717955, + "grad_norm": 7.0815222126421755, + "learning_rate": 4.621902569741548e-06, + "loss": 0.9734, + "step": 5593 + }, + { + "epoch": 0.40413964997200497, + "grad_norm": 5.959196583918108, + "learning_rate": 4.621747886792948e-06, + "loss": 0.8622, + "step": 5594 + }, + { + "epoch": 0.4042118951722145, + "grad_norm": 7.082149750801196, + "learning_rate": 4.621593174799266e-06, + "loss": 0.9013, + "step": 5595 + }, + { + "epoch": 0.404284140372424, + "grad_norm": 7.772356826577665, + "learning_rate": 4.621438433762621e-06, + "loss": 0.8985, + "step": 5596 + }, + { + "epoch": 0.40435638557263354, + "grad_norm": 8.497358023750033, + "learning_rate": 4.621283663685132e-06, + "loss": 0.937, + "step": 5597 + }, + { + "epoch": 0.404428630772843, + "grad_norm": 7.7951868710437076, + "learning_rate": 4.621128864568916e-06, + "loss": 0.8825, + "step": 5598 + }, + { + "epoch": 0.40450087597305256, + "grad_norm": 9.242016207864266, + "learning_rate": 4.620974036416093e-06, + "loss": 0.9692, + "step": 5599 + }, + { + "epoch": 0.40457312117326205, + "grad_norm": 6.379384048050371, + "learning_rate": 4.620819179228782e-06, + "loss": 0.8563, + "step": 5600 + }, + { + "epoch": 0.40464536637347154, + "grad_norm": 6.636669605560392, + "learning_rate": 4.6206642930091035e-06, + "loss": 0.8873, + "step": 5601 + }, + { + "epoch": 0.4047176115736811, + "grad_norm": 7.533459098013695, + "learning_rate": 4.620509377759177e-06, + "loss": 1.0536, + "step": 5602 + }, + { + "epoch": 0.40478985677389057, + "grad_norm": 6.238078229320084, + "learning_rate": 4.620354433481123e-06, + "loss": 0.9183, + "step": 5603 + }, + { + "epoch": 0.4048621019741001, + "grad_norm": 6.342563762532238, + "learning_rate": 4.620199460177065e-06, + "loss": 0.8271, + "step": 5604 + }, + { + "epoch": 0.4049343471743096, + "grad_norm": 7.4911619882746034, + "learning_rate": 4.620044457849121e-06, + "loss": 0.8676, + "step": 5605 + }, + { + "epoch": 0.40500659237451914, + "grad_norm": 6.955758202674405, + "learning_rate": 4.619889426499416e-06, + "loss": 0.9174, + "step": 5606 + }, + { + "epoch": 0.4050788375747286, + "grad_norm": 5.602389009968743, + "learning_rate": 4.61973436613007e-06, + "loss": 0.8616, + "step": 5607 + }, + { + "epoch": 0.40515108277493816, + "grad_norm": 6.212934218646179, + "learning_rate": 4.619579276743206e-06, + "loss": 0.9593, + "step": 5608 + }, + { + "epoch": 0.40522332797514765, + "grad_norm": 8.840229101880249, + "learning_rate": 4.619424158340947e-06, + "loss": 0.9436, + "step": 5609 + }, + { + "epoch": 0.40529557317535714, + "grad_norm": 6.4986755782420955, + "learning_rate": 4.619269010925418e-06, + "loss": 0.8593, + "step": 5610 + }, + { + "epoch": 0.4053678183755667, + "grad_norm": 6.364003889196564, + "learning_rate": 4.619113834498741e-06, + "loss": 0.7917, + "step": 5611 + }, + { + "epoch": 0.40544006357577617, + "grad_norm": 5.983026975193131, + "learning_rate": 4.618958629063042e-06, + "loss": 0.9161, + "step": 5612 + }, + { + "epoch": 0.4055123087759857, + "grad_norm": 8.499910241943367, + "learning_rate": 4.618803394620444e-06, + "loss": 0.916, + "step": 5613 + }, + { + "epoch": 0.4055845539761952, + "grad_norm": 9.858988468676099, + "learning_rate": 4.618648131173072e-06, + "loss": 0.9655, + "step": 5614 + }, + { + "epoch": 0.40565679917640474, + "grad_norm": 7.1429594468556115, + "learning_rate": 4.618492838723052e-06, + "loss": 0.9562, + "step": 5615 + }, + { + "epoch": 0.4057290443766142, + "grad_norm": 5.988261502128234, + "learning_rate": 4.618337517272511e-06, + "loss": 0.9303, + "step": 5616 + }, + { + "epoch": 0.40580128957682376, + "grad_norm": 6.103498124974912, + "learning_rate": 4.6181821668235735e-06, + "loss": 0.9027, + "step": 5617 + }, + { + "epoch": 0.40587353477703325, + "grad_norm": 8.297048333435407, + "learning_rate": 4.618026787378368e-06, + "loss": 0.9848, + "step": 5618 + }, + { + "epoch": 0.40594577997724274, + "grad_norm": 8.245688496673864, + "learning_rate": 4.6178713789390195e-06, + "loss": 0.9003, + "step": 5619 + }, + { + "epoch": 0.4060180251774523, + "grad_norm": 9.320195442737257, + "learning_rate": 4.617715941507656e-06, + "loss": 0.9867, + "step": 5620 + }, + { + "epoch": 0.40609027037766177, + "grad_norm": 6.474226397987066, + "learning_rate": 4.617560475086405e-06, + "loss": 0.8244, + "step": 5621 + }, + { + "epoch": 0.4061625155778713, + "grad_norm": 7.794613313314623, + "learning_rate": 4.617404979677396e-06, + "loss": 0.9339, + "step": 5622 + }, + { + "epoch": 0.4062347607780808, + "grad_norm": 6.668217319711498, + "learning_rate": 4.617249455282757e-06, + "loss": 0.817, + "step": 5623 + }, + { + "epoch": 0.40630700597829034, + "grad_norm": 7.567796097990598, + "learning_rate": 4.617093901904618e-06, + "loss": 0.933, + "step": 5624 + }, + { + "epoch": 0.4063792511784998, + "grad_norm": 6.442326190278117, + "learning_rate": 4.616938319545106e-06, + "loss": 0.9007, + "step": 5625 + }, + { + "epoch": 0.40645149637870936, + "grad_norm": 5.743985555009026, + "learning_rate": 4.616782708206352e-06, + "loss": 0.9312, + "step": 5626 + }, + { + "epoch": 0.40652374157891885, + "grad_norm": 6.367015550784039, + "learning_rate": 4.616627067890487e-06, + "loss": 0.9469, + "step": 5627 + }, + { + "epoch": 0.40659598677912834, + "grad_norm": 6.693292978679291, + "learning_rate": 4.61647139859964e-06, + "loss": 0.8868, + "step": 5628 + }, + { + "epoch": 0.4066682319793379, + "grad_norm": 5.956098799972938, + "learning_rate": 4.6163157003359425e-06, + "loss": 0.9342, + "step": 5629 + }, + { + "epoch": 0.40674047717954737, + "grad_norm": 7.152450685727765, + "learning_rate": 4.616159973101527e-06, + "loss": 0.8848, + "step": 5630 + }, + { + "epoch": 0.4068127223797569, + "grad_norm": 5.4510250912363345, + "learning_rate": 4.616004216898525e-06, + "loss": 0.8754, + "step": 5631 + }, + { + "epoch": 0.4068849675799664, + "grad_norm": 7.897028796468699, + "learning_rate": 4.6158484317290675e-06, + "loss": 0.9678, + "step": 5632 + }, + { + "epoch": 0.40695721278017594, + "grad_norm": 6.4364095708758695, + "learning_rate": 4.615692617595289e-06, + "loss": 0.9087, + "step": 5633 + }, + { + "epoch": 0.4070294579803854, + "grad_norm": 7.17787335112055, + "learning_rate": 4.61553677449932e-06, + "loss": 0.9296, + "step": 5634 + }, + { + "epoch": 0.40710170318059496, + "grad_norm": 7.859793134301543, + "learning_rate": 4.615380902443296e-06, + "loss": 1.0561, + "step": 5635 + }, + { + "epoch": 0.40717394838080445, + "grad_norm": 7.041194681755839, + "learning_rate": 4.615225001429349e-06, + "loss": 0.905, + "step": 5636 + }, + { + "epoch": 0.40724619358101394, + "grad_norm": 6.4257649244663435, + "learning_rate": 4.615069071459615e-06, + "loss": 0.9216, + "step": 5637 + }, + { + "epoch": 0.4073184387812235, + "grad_norm": 6.370332093497584, + "learning_rate": 4.6149131125362275e-06, + "loss": 0.8841, + "step": 5638 + }, + { + "epoch": 0.40739068398143297, + "grad_norm": 7.543755875958902, + "learning_rate": 4.614757124661321e-06, + "loss": 0.9115, + "step": 5639 + }, + { + "epoch": 0.4074629291816425, + "grad_norm": 6.668563986045746, + "learning_rate": 4.614601107837033e-06, + "loss": 0.96, + "step": 5640 + }, + { + "epoch": 0.407535174381852, + "grad_norm": 7.835378609804658, + "learning_rate": 4.6144450620654956e-06, + "loss": 0.8862, + "step": 5641 + }, + { + "epoch": 0.40760741958206154, + "grad_norm": 8.066726404814826, + "learning_rate": 4.614288987348848e-06, + "loss": 0.9638, + "step": 5642 + }, + { + "epoch": 0.407679664782271, + "grad_norm": 5.806985187283676, + "learning_rate": 4.614132883689226e-06, + "loss": 0.8306, + "step": 5643 + }, + { + "epoch": 0.40775190998248056, + "grad_norm": 7.663365634730115, + "learning_rate": 4.613976751088767e-06, + "loss": 0.9867, + "step": 5644 + }, + { + "epoch": 0.40782415518269005, + "grad_norm": 5.525772572352422, + "learning_rate": 4.6138205895496065e-06, + "loss": 0.943, + "step": 5645 + }, + { + "epoch": 0.40789640038289954, + "grad_norm": 7.2149235876402935, + "learning_rate": 4.613664399073884e-06, + "loss": 0.8635, + "step": 5646 + }, + { + "epoch": 0.4079686455831091, + "grad_norm": 8.41347218942492, + "learning_rate": 4.613508179663737e-06, + "loss": 0.8695, + "step": 5647 + }, + { + "epoch": 0.40804089078331857, + "grad_norm": 6.434853315245088, + "learning_rate": 4.613351931321303e-06, + "loss": 0.9537, + "step": 5648 + }, + { + "epoch": 0.4081131359835281, + "grad_norm": 5.79342211275955, + "learning_rate": 4.613195654048723e-06, + "loss": 0.881, + "step": 5649 + }, + { + "epoch": 0.4081853811837376, + "grad_norm": 5.939288703125339, + "learning_rate": 4.613039347848135e-06, + "loss": 1.0039, + "step": 5650 + }, + { + "epoch": 0.40825762638394714, + "grad_norm": 6.6776232647417535, + "learning_rate": 4.61288301272168e-06, + "loss": 0.9311, + "step": 5651 + }, + { + "epoch": 0.4083298715841566, + "grad_norm": 6.871145104004368, + "learning_rate": 4.612726648671496e-06, + "loss": 0.8714, + "step": 5652 + }, + { + "epoch": 0.40840211678436616, + "grad_norm": 5.9055298058343295, + "learning_rate": 4.6125702556997245e-06, + "loss": 0.8291, + "step": 5653 + }, + { + "epoch": 0.40847436198457565, + "grad_norm": 6.189506744797721, + "learning_rate": 4.612413833808507e-06, + "loss": 0.8848, + "step": 5654 + }, + { + "epoch": 0.40854660718478514, + "grad_norm": 8.352032497445093, + "learning_rate": 4.612257382999984e-06, + "loss": 0.8819, + "step": 5655 + }, + { + "epoch": 0.4086188523849947, + "grad_norm": 5.786835153180699, + "learning_rate": 4.612100903276298e-06, + "loss": 0.9971, + "step": 5656 + }, + { + "epoch": 0.40869109758520417, + "grad_norm": 6.067091628620597, + "learning_rate": 4.61194439463959e-06, + "loss": 0.993, + "step": 5657 + }, + { + "epoch": 0.4087633427854137, + "grad_norm": 6.210889592975616, + "learning_rate": 4.611787857092004e-06, + "loss": 0.8889, + "step": 5658 + }, + { + "epoch": 0.4088355879856232, + "grad_norm": 5.805061092137383, + "learning_rate": 4.611631290635681e-06, + "loss": 0.871, + "step": 5659 + }, + { + "epoch": 0.40890783318583274, + "grad_norm": 6.50292257482435, + "learning_rate": 4.611474695272765e-06, + "loss": 0.9146, + "step": 5660 + }, + { + "epoch": 0.4089800783860422, + "grad_norm": 7.0957265108512235, + "learning_rate": 4.611318071005401e-06, + "loss": 0.9087, + "step": 5661 + }, + { + "epoch": 0.40905232358625176, + "grad_norm": 6.499163647177456, + "learning_rate": 4.61116141783573e-06, + "loss": 0.9171, + "step": 5662 + }, + { + "epoch": 0.40912456878646125, + "grad_norm": 4.8832027187824805, + "learning_rate": 4.6110047357659e-06, + "loss": 0.8315, + "step": 5663 + }, + { + "epoch": 0.40919681398667074, + "grad_norm": 8.078252027682344, + "learning_rate": 4.610848024798054e-06, + "loss": 0.8823, + "step": 5664 + }, + { + "epoch": 0.4092690591868803, + "grad_norm": 6.371749348358015, + "learning_rate": 4.610691284934337e-06, + "loss": 0.9087, + "step": 5665 + }, + { + "epoch": 0.40934130438708977, + "grad_norm": 7.4012666185318405, + "learning_rate": 4.6105345161768965e-06, + "loss": 0.9569, + "step": 5666 + }, + { + "epoch": 0.4094135495872993, + "grad_norm": 6.826755272816775, + "learning_rate": 4.610377718527876e-06, + "loss": 0.8434, + "step": 5667 + }, + { + "epoch": 0.4094857947875088, + "grad_norm": 5.165793724558454, + "learning_rate": 4.610220891989423e-06, + "loss": 0.8206, + "step": 5668 + }, + { + "epoch": 0.40955803998771834, + "grad_norm": 5.820709010193488, + "learning_rate": 4.610064036563685e-06, + "loss": 0.8519, + "step": 5669 + }, + { + "epoch": 0.4096302851879278, + "grad_norm": 6.4499574881638635, + "learning_rate": 4.609907152252808e-06, + "loss": 0.9419, + "step": 5670 + }, + { + "epoch": 0.40970253038813736, + "grad_norm": 7.367186723306561, + "learning_rate": 4.6097502390589415e-06, + "loss": 0.9374, + "step": 5671 + }, + { + "epoch": 0.40977477558834685, + "grad_norm": 7.023249843241781, + "learning_rate": 4.609593296984231e-06, + "loss": 0.9079, + "step": 5672 + }, + { + "epoch": 0.40984702078855634, + "grad_norm": 7.620905042352304, + "learning_rate": 4.609436326030828e-06, + "loss": 0.9444, + "step": 5673 + }, + { + "epoch": 0.4099192659887659, + "grad_norm": 6.164499090903336, + "learning_rate": 4.609279326200879e-06, + "loss": 0.8966, + "step": 5674 + }, + { + "epoch": 0.40999151118897537, + "grad_norm": 5.940321723625395, + "learning_rate": 4.609122297496533e-06, + "loss": 0.8615, + "step": 5675 + }, + { + "epoch": 0.4100637563891849, + "grad_norm": 6.709422747686848, + "learning_rate": 4.6089652399199405e-06, + "loss": 0.8882, + "step": 5676 + }, + { + "epoch": 0.4101360015893944, + "grad_norm": 7.599219593335514, + "learning_rate": 4.608808153473252e-06, + "loss": 0.9666, + "step": 5677 + }, + { + "epoch": 0.41020824678960394, + "grad_norm": 6.54826190979764, + "learning_rate": 4.608651038158616e-06, + "loss": 0.8825, + "step": 5678 + }, + { + "epoch": 0.4102804919898134, + "grad_norm": 7.318968430671098, + "learning_rate": 4.608493893978186e-06, + "loss": 0.8835, + "step": 5679 + }, + { + "epoch": 0.41035273719002296, + "grad_norm": 5.729147542285789, + "learning_rate": 4.60833672093411e-06, + "loss": 0.8695, + "step": 5680 + }, + { + "epoch": 0.41042498239023245, + "grad_norm": 6.714585543955756, + "learning_rate": 4.608179519028543e-06, + "loss": 0.8074, + "step": 5681 + }, + { + "epoch": 0.41049722759044194, + "grad_norm": 8.184991117273357, + "learning_rate": 4.608022288263635e-06, + "loss": 0.9151, + "step": 5682 + }, + { + "epoch": 0.4105694727906515, + "grad_norm": 6.2188484529149175, + "learning_rate": 4.607865028641539e-06, + "loss": 0.9211, + "step": 5683 + }, + { + "epoch": 0.41064171799086097, + "grad_norm": 6.547714871452619, + "learning_rate": 4.607707740164406e-06, + "loss": 0.8554, + "step": 5684 + }, + { + "epoch": 0.4107139631910705, + "grad_norm": 8.75973361720741, + "learning_rate": 4.6075504228343915e-06, + "loss": 0.9122, + "step": 5685 + }, + { + "epoch": 0.41078620839128, + "grad_norm": 8.24992185613441, + "learning_rate": 4.607393076653648e-06, + "loss": 0.9138, + "step": 5686 + }, + { + "epoch": 0.41085845359148954, + "grad_norm": 8.0287093956966, + "learning_rate": 4.60723570162433e-06, + "loss": 0.8847, + "step": 5687 + }, + { + "epoch": 0.410930698791699, + "grad_norm": 8.039701178771839, + "learning_rate": 4.60707829774859e-06, + "loss": 0.9224, + "step": 5688 + }, + { + "epoch": 0.41100294399190856, + "grad_norm": 6.451564794176451, + "learning_rate": 4.606920865028585e-06, + "loss": 0.885, + "step": 5689 + }, + { + "epoch": 0.41107518919211805, + "grad_norm": 7.6941544179351675, + "learning_rate": 4.6067634034664695e-06, + "loss": 0.9924, + "step": 5690 + }, + { + "epoch": 0.41114743439232754, + "grad_norm": 7.423125151701423, + "learning_rate": 4.606605913064399e-06, + "loss": 0.9151, + "step": 5691 + }, + { + "epoch": 0.4112196795925371, + "grad_norm": 8.558405210297789, + "learning_rate": 4.606448393824528e-06, + "loss": 0.8031, + "step": 5692 + }, + { + "epoch": 0.41129192479274657, + "grad_norm": 9.292144033260746, + "learning_rate": 4.606290845749015e-06, + "loss": 0.9703, + "step": 5693 + }, + { + "epoch": 0.4113641699929561, + "grad_norm": 9.014964061290938, + "learning_rate": 4.606133268840016e-06, + "loss": 0.9491, + "step": 5694 + }, + { + "epoch": 0.4114364151931656, + "grad_norm": 7.556086066576254, + "learning_rate": 4.605975663099688e-06, + "loss": 1.0175, + "step": 5695 + }, + { + "epoch": 0.41150866039337514, + "grad_norm": 6.622601344772109, + "learning_rate": 4.605818028530188e-06, + "loss": 0.9073, + "step": 5696 + }, + { + "epoch": 0.4115809055935846, + "grad_norm": 7.133836854412978, + "learning_rate": 4.6056603651336736e-06, + "loss": 0.889, + "step": 5697 + }, + { + "epoch": 0.41165315079379416, + "grad_norm": 6.3784182490921335, + "learning_rate": 4.605502672912304e-06, + "loss": 0.9724, + "step": 5698 + }, + { + "epoch": 0.41172539599400365, + "grad_norm": 7.238275452810685, + "learning_rate": 4.605344951868238e-06, + "loss": 0.8679, + "step": 5699 + }, + { + "epoch": 0.41179764119421314, + "grad_norm": 5.4504592632006315, + "learning_rate": 4.605187202003635e-06, + "loss": 0.859, + "step": 5700 + }, + { + "epoch": 0.4118698863944227, + "grad_norm": 8.783433527317696, + "learning_rate": 4.6050294233206524e-06, + "loss": 0.8966, + "step": 5701 + }, + { + "epoch": 0.41194213159463217, + "grad_norm": 9.75404000333804, + "learning_rate": 4.604871615821452e-06, + "loss": 0.8976, + "step": 5702 + }, + { + "epoch": 0.4120143767948417, + "grad_norm": 8.178268703062537, + "learning_rate": 4.604713779508194e-06, + "loss": 0.9137, + "step": 5703 + }, + { + "epoch": 0.4120866219950512, + "grad_norm": 7.0825930339440415, + "learning_rate": 4.6045559143830375e-06, + "loss": 0.8668, + "step": 5704 + }, + { + "epoch": 0.41215886719526074, + "grad_norm": 7.887944315684774, + "learning_rate": 4.604398020448145e-06, + "loss": 0.9372, + "step": 5705 + }, + { + "epoch": 0.4122311123954702, + "grad_norm": 8.880745760558169, + "learning_rate": 4.6042400977056775e-06, + "loss": 0.8793, + "step": 5706 + }, + { + "epoch": 0.41230335759567976, + "grad_norm": 7.572948205629462, + "learning_rate": 4.604082146157798e-06, + "loss": 0.9512, + "step": 5707 + }, + { + "epoch": 0.41237560279588925, + "grad_norm": 7.846526972786621, + "learning_rate": 4.603924165806667e-06, + "loss": 0.8988, + "step": 5708 + }, + { + "epoch": 0.41244784799609874, + "grad_norm": 6.374854291858885, + "learning_rate": 4.603766156654448e-06, + "loss": 0.8962, + "step": 5709 + }, + { + "epoch": 0.4125200931963083, + "grad_norm": 7.3645926281673875, + "learning_rate": 4.603608118703302e-06, + "loss": 0.9129, + "step": 5710 + }, + { + "epoch": 0.41259233839651777, + "grad_norm": 6.905307342959647, + "learning_rate": 4.603450051955396e-06, + "loss": 0.8882, + "step": 5711 + }, + { + "epoch": 0.4126645835967273, + "grad_norm": 7.905933238080767, + "learning_rate": 4.603291956412892e-06, + "loss": 0.9315, + "step": 5712 + }, + { + "epoch": 0.4127368287969368, + "grad_norm": 6.428993308135784, + "learning_rate": 4.603133832077953e-06, + "loss": 0.8942, + "step": 5713 + }, + { + "epoch": 0.41280907399714634, + "grad_norm": 8.562372137075656, + "learning_rate": 4.602975678952746e-06, + "loss": 0.8836, + "step": 5714 + }, + { + "epoch": 0.4128813191973558, + "grad_norm": 7.5289903471975155, + "learning_rate": 4.602817497039435e-06, + "loss": 0.8982, + "step": 5715 + }, + { + "epoch": 0.4129535643975653, + "grad_norm": 8.018929973197114, + "learning_rate": 4.6026592863401844e-06, + "loss": 0.9593, + "step": 5716 + }, + { + "epoch": 0.41302580959777485, + "grad_norm": 6.864700648260955, + "learning_rate": 4.602501046857161e-06, + "loss": 0.8924, + "step": 5717 + }, + { + "epoch": 0.41309805479798434, + "grad_norm": 5.432770875865136, + "learning_rate": 4.60234277859253e-06, + "loss": 0.8489, + "step": 5718 + }, + { + "epoch": 0.4131702999981939, + "grad_norm": 6.930790311328111, + "learning_rate": 4.6021844815484594e-06, + "loss": 0.8626, + "step": 5719 + }, + { + "epoch": 0.41324254519840337, + "grad_norm": 10.60004429358099, + "learning_rate": 4.602026155727116e-06, + "loss": 0.9441, + "step": 5720 + }, + { + "epoch": 0.4133147903986129, + "grad_norm": 5.981451291451093, + "learning_rate": 4.601867801130666e-06, + "loss": 0.905, + "step": 5721 + }, + { + "epoch": 0.4133870355988224, + "grad_norm": 7.501425544046351, + "learning_rate": 4.601709417761278e-06, + "loss": 0.9701, + "step": 5722 + }, + { + "epoch": 0.41345928079903194, + "grad_norm": 6.73221222385346, + "learning_rate": 4.60155100562112e-06, + "loss": 0.9316, + "step": 5723 + }, + { + "epoch": 0.4135315259992414, + "grad_norm": 6.609860954726681, + "learning_rate": 4.60139256471236e-06, + "loss": 0.8143, + "step": 5724 + }, + { + "epoch": 0.4136037711994509, + "grad_norm": 7.012746649499635, + "learning_rate": 4.6012340950371684e-06, + "loss": 0.8702, + "step": 5725 + }, + { + "epoch": 0.41367601639966045, + "grad_norm": 5.634675753453576, + "learning_rate": 4.601075596597713e-06, + "loss": 0.8777, + "step": 5726 + }, + { + "epoch": 0.41374826159986994, + "grad_norm": 7.247567393531384, + "learning_rate": 4.6009170693961635e-06, + "loss": 0.9442, + "step": 5727 + }, + { + "epoch": 0.4138205068000795, + "grad_norm": 8.934756411416467, + "learning_rate": 4.600758513434691e-06, + "loss": 0.8589, + "step": 5728 + }, + { + "epoch": 0.41389275200028897, + "grad_norm": 6.517096703170559, + "learning_rate": 4.600599928715466e-06, + "loss": 0.8288, + "step": 5729 + }, + { + "epoch": 0.4139649972004985, + "grad_norm": 5.519498939716559, + "learning_rate": 4.600441315240659e-06, + "loss": 0.8202, + "step": 5730 + }, + { + "epoch": 0.414037242400708, + "grad_norm": 7.0642646677931635, + "learning_rate": 4.60028267301244e-06, + "loss": 0.927, + "step": 5731 + }, + { + "epoch": 0.41410948760091754, + "grad_norm": 9.581440437350336, + "learning_rate": 4.600124002032983e-06, + "loss": 0.936, + "step": 5732 + }, + { + "epoch": 0.414181732801127, + "grad_norm": 7.419951179528865, + "learning_rate": 4.59996530230446e-06, + "loss": 0.9257, + "step": 5733 + }, + { + "epoch": 0.4142539780013365, + "grad_norm": 7.407887981861405, + "learning_rate": 4.599806573829041e-06, + "loss": 0.9422, + "step": 5734 + }, + { + "epoch": 0.41432622320154605, + "grad_norm": 7.369303167933058, + "learning_rate": 4.599647816608901e-06, + "loss": 0.8648, + "step": 5735 + }, + { + "epoch": 0.41439846840175554, + "grad_norm": 6.331092873192917, + "learning_rate": 4.5994890306462124e-06, + "loss": 0.9296, + "step": 5736 + }, + { + "epoch": 0.4144707136019651, + "grad_norm": 6.5896718218161885, + "learning_rate": 4.599330215943149e-06, + "loss": 0.9285, + "step": 5737 + }, + { + "epoch": 0.41454295880217457, + "grad_norm": 7.778973717553535, + "learning_rate": 4.5991713725018855e-06, + "loss": 0.8403, + "step": 5738 + }, + { + "epoch": 0.4146152040023841, + "grad_norm": 5.966526436179014, + "learning_rate": 4.599012500324595e-06, + "loss": 0.8787, + "step": 5739 + }, + { + "epoch": 0.4146874492025936, + "grad_norm": 7.411428451924639, + "learning_rate": 4.598853599413455e-06, + "loss": 0.9196, + "step": 5740 + }, + { + "epoch": 0.41475969440280314, + "grad_norm": 7.3218175751489305, + "learning_rate": 4.598694669770637e-06, + "loss": 0.876, + "step": 5741 + }, + { + "epoch": 0.4148319396030126, + "grad_norm": 7.230292270182158, + "learning_rate": 4.5985357113983195e-06, + "loss": 0.8706, + "step": 5742 + }, + { + "epoch": 0.4149041848032221, + "grad_norm": 6.207448533605678, + "learning_rate": 4.598376724298676e-06, + "loss": 0.8566, + "step": 5743 + }, + { + "epoch": 0.41497643000343165, + "grad_norm": 6.224988712162676, + "learning_rate": 4.598217708473887e-06, + "loss": 0.9033, + "step": 5744 + }, + { + "epoch": 0.41504867520364114, + "grad_norm": 7.155066246509767, + "learning_rate": 4.598058663926125e-06, + "loss": 0.9444, + "step": 5745 + }, + { + "epoch": 0.4151209204038507, + "grad_norm": 6.053237603616749, + "learning_rate": 4.597899590657569e-06, + "loss": 0.8554, + "step": 5746 + }, + { + "epoch": 0.41519316560406017, + "grad_norm": 7.436312284120597, + "learning_rate": 4.597740488670397e-06, + "loss": 0.8965, + "step": 5747 + }, + { + "epoch": 0.4152654108042697, + "grad_norm": 6.596550658550294, + "learning_rate": 4.597581357966786e-06, + "loss": 0.8864, + "step": 5748 + }, + { + "epoch": 0.4153376560044792, + "grad_norm": 6.1325723977681, + "learning_rate": 4.597422198548915e-06, + "loss": 0.926, + "step": 5749 + }, + { + "epoch": 0.41540990120468874, + "grad_norm": 6.721094760276136, + "learning_rate": 4.597263010418962e-06, + "loss": 0.8972, + "step": 5750 + }, + { + "epoch": 0.4154821464048982, + "grad_norm": 6.742182124943541, + "learning_rate": 4.597103793579109e-06, + "loss": 0.902, + "step": 5751 + }, + { + "epoch": 0.4155543916051077, + "grad_norm": 5.777308754753713, + "learning_rate": 4.596944548031531e-06, + "loss": 0.8902, + "step": 5752 + }, + { + "epoch": 0.41562663680531725, + "grad_norm": 7.258790310629013, + "learning_rate": 4.5967852737784114e-06, + "loss": 1.0126, + "step": 5753 + }, + { + "epoch": 0.41569888200552674, + "grad_norm": 9.541943364247343, + "learning_rate": 4.59662597082193e-06, + "loss": 0.9309, + "step": 5754 + }, + { + "epoch": 0.4157711272057363, + "grad_norm": 6.658199114475178, + "learning_rate": 4.596466639164266e-06, + "loss": 0.8019, + "step": 5755 + }, + { + "epoch": 0.41584337240594577, + "grad_norm": 5.962814815375448, + "learning_rate": 4.596307278807601e-06, + "loss": 0.9559, + "step": 5756 + }, + { + "epoch": 0.4159156176061553, + "grad_norm": 6.916682446320087, + "learning_rate": 4.596147889754118e-06, + "loss": 0.9581, + "step": 5757 + }, + { + "epoch": 0.4159878628063648, + "grad_norm": 7.560390461925071, + "learning_rate": 4.595988472005998e-06, + "loss": 0.9539, + "step": 5758 + }, + { + "epoch": 0.41606010800657434, + "grad_norm": 7.988243524513608, + "learning_rate": 4.595829025565422e-06, + "loss": 0.8257, + "step": 5759 + }, + { + "epoch": 0.4161323532067838, + "grad_norm": 6.809365767691079, + "learning_rate": 4.595669550434576e-06, + "loss": 0.8793, + "step": 5760 + }, + { + "epoch": 0.4162045984069933, + "grad_norm": 6.536452853946492, + "learning_rate": 4.59551004661564e-06, + "loss": 0.9553, + "step": 5761 + }, + { + "epoch": 0.41627684360720285, + "grad_norm": 6.635965736783579, + "learning_rate": 4.595350514110798e-06, + "loss": 0.812, + "step": 5762 + }, + { + "epoch": 0.41634908880741234, + "grad_norm": 5.929047038555217, + "learning_rate": 4.595190952922235e-06, + "loss": 0.8482, + "step": 5763 + }, + { + "epoch": 0.4164213340076219, + "grad_norm": 7.774589169555591, + "learning_rate": 4.5950313630521345e-06, + "loss": 0.9062, + "step": 5764 + }, + { + "epoch": 0.41649357920783137, + "grad_norm": 9.115041655143436, + "learning_rate": 4.594871744502682e-06, + "loss": 0.9507, + "step": 5765 + }, + { + "epoch": 0.4165658244080409, + "grad_norm": 8.177005477502236, + "learning_rate": 4.59471209727606e-06, + "loss": 1.0295, + "step": 5766 + }, + { + "epoch": 0.4166380696082504, + "grad_norm": 6.931944951406219, + "learning_rate": 4.594552421374457e-06, + "loss": 0.8787, + "step": 5767 + }, + { + "epoch": 0.41671031480845994, + "grad_norm": 6.421483828471903, + "learning_rate": 4.594392716800059e-06, + "loss": 0.9063, + "step": 5768 + }, + { + "epoch": 0.4167825600086694, + "grad_norm": 7.476837759023253, + "learning_rate": 4.5942329835550496e-06, + "loss": 0.9172, + "step": 5769 + }, + { + "epoch": 0.4168548052088789, + "grad_norm": 6.133762864262034, + "learning_rate": 4.594073221641616e-06, + "loss": 0.9438, + "step": 5770 + }, + { + "epoch": 0.41692705040908845, + "grad_norm": 7.005197502799094, + "learning_rate": 4.593913431061947e-06, + "loss": 0.91, + "step": 5771 + }, + { + "epoch": 0.41699929560929794, + "grad_norm": 6.549501170025994, + "learning_rate": 4.593753611818229e-06, + "loss": 0.9825, + "step": 5772 + }, + { + "epoch": 0.4170715408095075, + "grad_norm": 5.700402014760258, + "learning_rate": 4.593593763912649e-06, + "loss": 0.8863, + "step": 5773 + }, + { + "epoch": 0.41714378600971697, + "grad_norm": 6.8739315243047105, + "learning_rate": 4.593433887347397e-06, + "loss": 0.9772, + "step": 5774 + }, + { + "epoch": 0.4172160312099265, + "grad_norm": 8.523940983210402, + "learning_rate": 4.59327398212466e-06, + "loss": 0.9745, + "step": 5775 + }, + { + "epoch": 0.417288276410136, + "grad_norm": 7.29952582295523, + "learning_rate": 4.593114048246627e-06, + "loss": 0.8881, + "step": 5776 + }, + { + "epoch": 0.41736052161034554, + "grad_norm": 5.464785114052833, + "learning_rate": 4.592954085715488e-06, + "loss": 0.8658, + "step": 5777 + }, + { + "epoch": 0.417432766810555, + "grad_norm": 6.231409423992603, + "learning_rate": 4.592794094533433e-06, + "loss": 0.7857, + "step": 5778 + }, + { + "epoch": 0.4175050120107645, + "grad_norm": 7.924803664606372, + "learning_rate": 4.5926340747026515e-06, + "loss": 0.8025, + "step": 5779 + }, + { + "epoch": 0.41757725721097405, + "grad_norm": 6.379053958600624, + "learning_rate": 4.5924740262253346e-06, + "loss": 0.8745, + "step": 5780 + }, + { + "epoch": 0.41764950241118354, + "grad_norm": 6.816733068812751, + "learning_rate": 4.592313949103673e-06, + "loss": 0.886, + "step": 5781 + }, + { + "epoch": 0.4177217476113931, + "grad_norm": 6.9881955477905855, + "learning_rate": 4.592153843339859e-06, + "loss": 0.8944, + "step": 5782 + }, + { + "epoch": 0.41779399281160257, + "grad_norm": 6.3291045007756255, + "learning_rate": 4.591993708936081e-06, + "loss": 0.9153, + "step": 5783 + }, + { + "epoch": 0.4178662380118121, + "grad_norm": 7.027795103886908, + "learning_rate": 4.591833545894535e-06, + "loss": 0.9938, + "step": 5784 + }, + { + "epoch": 0.4179384832120216, + "grad_norm": 6.734669265159499, + "learning_rate": 4.591673354217412e-06, + "loss": 0.8341, + "step": 5785 + }, + { + "epoch": 0.41801072841223114, + "grad_norm": 7.688880362288513, + "learning_rate": 4.591513133906904e-06, + "loss": 0.91, + "step": 5786 + }, + { + "epoch": 0.4180829736124406, + "grad_norm": 5.39778594792795, + "learning_rate": 4.591352884965206e-06, + "loss": 0.8266, + "step": 5787 + }, + { + "epoch": 0.4181552188126501, + "grad_norm": 5.797521508864969, + "learning_rate": 4.59119260739451e-06, + "loss": 0.8984, + "step": 5788 + }, + { + "epoch": 0.41822746401285965, + "grad_norm": 5.602793282999247, + "learning_rate": 4.591032301197012e-06, + "loss": 0.8808, + "step": 5789 + }, + { + "epoch": 0.41829970921306914, + "grad_norm": 6.983201169361895, + "learning_rate": 4.590871966374905e-06, + "loss": 0.8656, + "step": 5790 + }, + { + "epoch": 0.4183719544132787, + "grad_norm": 8.659667294214275, + "learning_rate": 4.590711602930384e-06, + "loss": 0.9395, + "step": 5791 + }, + { + "epoch": 0.41844419961348817, + "grad_norm": 5.3287140193460925, + "learning_rate": 4.590551210865644e-06, + "loss": 0.8581, + "step": 5792 + }, + { + "epoch": 0.4185164448136977, + "grad_norm": 5.90617830747659, + "learning_rate": 4.590390790182882e-06, + "loss": 0.8789, + "step": 5793 + }, + { + "epoch": 0.4185886900139072, + "grad_norm": 5.988148428382666, + "learning_rate": 4.590230340884293e-06, + "loss": 0.8446, + "step": 5794 + }, + { + "epoch": 0.41866093521411674, + "grad_norm": 8.566072707112353, + "learning_rate": 4.590069862972073e-06, + "loss": 0.8625, + "step": 5795 + }, + { + "epoch": 0.4187331804143262, + "grad_norm": 6.093207232438546, + "learning_rate": 4.5899093564484205e-06, + "loss": 0.902, + "step": 5796 + }, + { + "epoch": 0.4188054256145357, + "grad_norm": 6.613169922091601, + "learning_rate": 4.58974882131553e-06, + "loss": 0.9502, + "step": 5797 + }, + { + "epoch": 0.41887767081474525, + "grad_norm": 6.220594583040493, + "learning_rate": 4.589588257575602e-06, + "loss": 0.8516, + "step": 5798 + }, + { + "epoch": 0.41894991601495474, + "grad_norm": 7.744209341198913, + "learning_rate": 4.589427665230834e-06, + "loss": 0.9553, + "step": 5799 + }, + { + "epoch": 0.4190221612151643, + "grad_norm": 6.459633710651189, + "learning_rate": 4.589267044283422e-06, + "loss": 0.8186, + "step": 5800 + }, + { + "epoch": 0.41909440641537377, + "grad_norm": 6.593743636141775, + "learning_rate": 4.589106394735567e-06, + "loss": 0.8798, + "step": 5801 + }, + { + "epoch": 0.4191666516155833, + "grad_norm": 8.294495322135385, + "learning_rate": 4.588945716589467e-06, + "loss": 0.9831, + "step": 5802 + }, + { + "epoch": 0.4192388968157928, + "grad_norm": 6.3772964829078, + "learning_rate": 4.588785009847323e-06, + "loss": 0.89, + "step": 5803 + }, + { + "epoch": 0.41931114201600234, + "grad_norm": 6.622079745444727, + "learning_rate": 4.588624274511333e-06, + "loss": 0.8526, + "step": 5804 + }, + { + "epoch": 0.4193833872162118, + "grad_norm": 7.615322287659392, + "learning_rate": 4.588463510583699e-06, + "loss": 0.8975, + "step": 5805 + }, + { + "epoch": 0.4194556324164213, + "grad_norm": 8.20638142209597, + "learning_rate": 4.588302718066621e-06, + "loss": 0.9152, + "step": 5806 + }, + { + "epoch": 0.41952787761663085, + "grad_norm": 5.569720093630084, + "learning_rate": 4.5881418969623e-06, + "loss": 0.8867, + "step": 5807 + }, + { + "epoch": 0.41960012281684034, + "grad_norm": 5.946464116249239, + "learning_rate": 4.587981047272939e-06, + "loss": 0.8953, + "step": 5808 + }, + { + "epoch": 0.4196723680170499, + "grad_norm": 6.020321446958379, + "learning_rate": 4.587820169000737e-06, + "loss": 0.9589, + "step": 5809 + }, + { + "epoch": 0.41974461321725937, + "grad_norm": 5.579396335393931, + "learning_rate": 4.5876592621478995e-06, + "loss": 0.9012, + "step": 5810 + }, + { + "epoch": 0.4198168584174689, + "grad_norm": 7.586142163015433, + "learning_rate": 4.587498326716627e-06, + "loss": 0.9919, + "step": 5811 + }, + { + "epoch": 0.4198891036176784, + "grad_norm": 6.259563605847019, + "learning_rate": 4.587337362709123e-06, + "loss": 0.9207, + "step": 5812 + }, + { + "epoch": 0.41996134881788794, + "grad_norm": 5.688416124276571, + "learning_rate": 4.5871763701275915e-06, + "loss": 0.9768, + "step": 5813 + }, + { + "epoch": 0.4200335940180974, + "grad_norm": 7.677161234887857, + "learning_rate": 4.587015348974236e-06, + "loss": 0.9622, + "step": 5814 + }, + { + "epoch": 0.4201058392183069, + "grad_norm": 6.693025961932286, + "learning_rate": 4.586854299251261e-06, + "loss": 0.9955, + "step": 5815 + }, + { + "epoch": 0.42017808441851645, + "grad_norm": 5.8426058868097535, + "learning_rate": 4.586693220960871e-06, + "loss": 0.8921, + "step": 5816 + }, + { + "epoch": 0.42025032961872594, + "grad_norm": 4.940743069381034, + "learning_rate": 4.58653211410527e-06, + "loss": 0.8029, + "step": 5817 + }, + { + "epoch": 0.4203225748189355, + "grad_norm": 6.129390155271331, + "learning_rate": 4.586370978686665e-06, + "loss": 0.8373, + "step": 5818 + }, + { + "epoch": 0.42039482001914497, + "grad_norm": 5.847545023887929, + "learning_rate": 4.586209814707262e-06, + "loss": 0.895, + "step": 5819 + }, + { + "epoch": 0.4204670652193545, + "grad_norm": 6.703663339760698, + "learning_rate": 4.586048622169265e-06, + "loss": 0.9475, + "step": 5820 + }, + { + "epoch": 0.420539310419564, + "grad_norm": 6.197599241532368, + "learning_rate": 4.585887401074884e-06, + "loss": 0.8869, + "step": 5821 + }, + { + "epoch": 0.42061155561977354, + "grad_norm": 5.540185123201283, + "learning_rate": 4.585726151426323e-06, + "loss": 0.861, + "step": 5822 + }, + { + "epoch": 0.420683800819983, + "grad_norm": 6.075400872210544, + "learning_rate": 4.585564873225791e-06, + "loss": 0.9641, + "step": 5823 + }, + { + "epoch": 0.4207560460201925, + "grad_norm": 7.792043041039877, + "learning_rate": 4.585403566475494e-06, + "loss": 0.9429, + "step": 5824 + }, + { + "epoch": 0.42082829122040205, + "grad_norm": 6.359410691219715, + "learning_rate": 4.585242231177643e-06, + "loss": 0.8128, + "step": 5825 + }, + { + "epoch": 0.42090053642061154, + "grad_norm": 6.9533299105164295, + "learning_rate": 4.585080867334444e-06, + "loss": 0.8864, + "step": 5826 + }, + { + "epoch": 0.4209727816208211, + "grad_norm": 5.735071789559089, + "learning_rate": 4.584919474948108e-06, + "loss": 0.8428, + "step": 5827 + }, + { + "epoch": 0.42104502682103057, + "grad_norm": 6.072627152815277, + "learning_rate": 4.5847580540208415e-06, + "loss": 0.8651, + "step": 5828 + }, + { + "epoch": 0.4211172720212401, + "grad_norm": 7.297442983866555, + "learning_rate": 4.584596604554857e-06, + "loss": 0.9429, + "step": 5829 + }, + { + "epoch": 0.4211895172214496, + "grad_norm": 5.201809186921285, + "learning_rate": 4.584435126552363e-06, + "loss": 0.8717, + "step": 5830 + }, + { + "epoch": 0.42126176242165914, + "grad_norm": 6.659597495348261, + "learning_rate": 4.58427362001557e-06, + "loss": 0.8428, + "step": 5831 + }, + { + "epoch": 0.4213340076218686, + "grad_norm": 7.640345156796926, + "learning_rate": 4.5841120849466915e-06, + "loss": 0.8676, + "step": 5832 + }, + { + "epoch": 0.4214062528220781, + "grad_norm": 6.727059836615856, + "learning_rate": 4.583950521347935e-06, + "loss": 0.8832, + "step": 5833 + }, + { + "epoch": 0.42147849802228765, + "grad_norm": 7.089144787343391, + "learning_rate": 4.583788929221514e-06, + "loss": 0.8699, + "step": 5834 + }, + { + "epoch": 0.42155074322249714, + "grad_norm": 5.777192872755217, + "learning_rate": 4.5836273085696415e-06, + "loss": 0.9505, + "step": 5835 + }, + { + "epoch": 0.4216229884227067, + "grad_norm": 5.629518855456949, + "learning_rate": 4.583465659394529e-06, + "loss": 0.9284, + "step": 5836 + }, + { + "epoch": 0.42169523362291617, + "grad_norm": 6.135570202532531, + "learning_rate": 4.5833039816983886e-06, + "loss": 0.8805, + "step": 5837 + }, + { + "epoch": 0.4217674788231257, + "grad_norm": 7.792043530603028, + "learning_rate": 4.583142275483434e-06, + "loss": 0.8546, + "step": 5838 + }, + { + "epoch": 0.4218397240233352, + "grad_norm": 6.6524873943130585, + "learning_rate": 4.58298054075188e-06, + "loss": 0.8428, + "step": 5839 + }, + { + "epoch": 0.42191196922354474, + "grad_norm": 6.485512948437132, + "learning_rate": 4.58281877750594e-06, + "loss": 0.9374, + "step": 5840 + }, + { + "epoch": 0.4219842144237542, + "grad_norm": 6.839923781923984, + "learning_rate": 4.582656985747827e-06, + "loss": 0.9151, + "step": 5841 + }, + { + "epoch": 0.4220564596239637, + "grad_norm": 8.560174110994874, + "learning_rate": 4.582495165479758e-06, + "loss": 0.9689, + "step": 5842 + }, + { + "epoch": 0.42212870482417325, + "grad_norm": 5.470293099816534, + "learning_rate": 4.582333316703946e-06, + "loss": 0.783, + "step": 5843 + }, + { + "epoch": 0.42220095002438274, + "grad_norm": 7.442475778209057, + "learning_rate": 4.582171439422608e-06, + "loss": 0.9724, + "step": 5844 + }, + { + "epoch": 0.4222731952245923, + "grad_norm": 6.903996578953891, + "learning_rate": 4.58200953363796e-06, + "loss": 0.8868, + "step": 5845 + }, + { + "epoch": 0.42234544042480177, + "grad_norm": 6.641419127608531, + "learning_rate": 4.581847599352219e-06, + "loss": 0.8183, + "step": 5846 + }, + { + "epoch": 0.4224176856250113, + "grad_norm": 6.278851558859579, + "learning_rate": 4.581685636567599e-06, + "loss": 0.8349, + "step": 5847 + }, + { + "epoch": 0.4224899308252208, + "grad_norm": 7.791326777206739, + "learning_rate": 4.58152364528632e-06, + "loss": 0.8718, + "step": 5848 + }, + { + "epoch": 0.42256217602543034, + "grad_norm": 6.926232613164961, + "learning_rate": 4.581361625510599e-06, + "loss": 0.8519, + "step": 5849 + }, + { + "epoch": 0.4226344212256398, + "grad_norm": 8.388217800938204, + "learning_rate": 4.581199577242652e-06, + "loss": 0.925, + "step": 5850 + }, + { + "epoch": 0.4227066664258493, + "grad_norm": 5.45141994632154, + "learning_rate": 4.581037500484699e-06, + "loss": 0.8803, + "step": 5851 + }, + { + "epoch": 0.42277891162605885, + "grad_norm": 6.258148679620854, + "learning_rate": 4.580875395238959e-06, + "loss": 0.914, + "step": 5852 + }, + { + "epoch": 0.42285115682626834, + "grad_norm": 6.217387754671841, + "learning_rate": 4.580713261507651e-06, + "loss": 0.9347, + "step": 5853 + }, + { + "epoch": 0.4229234020264779, + "grad_norm": 5.775975476072878, + "learning_rate": 4.580551099292993e-06, + "loss": 0.8423, + "step": 5854 + }, + { + "epoch": 0.42299564722668737, + "grad_norm": 8.043174114819882, + "learning_rate": 4.580388908597207e-06, + "loss": 1.0322, + "step": 5855 + }, + { + "epoch": 0.4230678924268969, + "grad_norm": 6.798121967151916, + "learning_rate": 4.580226689422511e-06, + "loss": 0.8463, + "step": 5856 + }, + { + "epoch": 0.4231401376271064, + "grad_norm": 6.291939062154467, + "learning_rate": 4.5800644417711274e-06, + "loss": 0.8952, + "step": 5857 + }, + { + "epoch": 0.42321238282731594, + "grad_norm": 5.303426556170486, + "learning_rate": 4.5799021656452766e-06, + "loss": 0.8556, + "step": 5858 + }, + { + "epoch": 0.4232846280275254, + "grad_norm": 5.820986223590842, + "learning_rate": 4.57973986104718e-06, + "loss": 0.8654, + "step": 5859 + }, + { + "epoch": 0.4233568732277349, + "grad_norm": 7.164770659179037, + "learning_rate": 4.57957752797906e-06, + "loss": 0.9833, + "step": 5860 + }, + { + "epoch": 0.42342911842794445, + "grad_norm": 8.84767780764482, + "learning_rate": 4.579415166443137e-06, + "loss": 0.8602, + "step": 5861 + }, + { + "epoch": 0.42350136362815394, + "grad_norm": 6.744182198774989, + "learning_rate": 4.579252776441636e-06, + "loss": 0.8993, + "step": 5862 + }, + { + "epoch": 0.4235736088283635, + "grad_norm": 6.857461371290416, + "learning_rate": 4.57909035797678e-06, + "loss": 0.9733, + "step": 5863 + }, + { + "epoch": 0.42364585402857297, + "grad_norm": 5.858130727260683, + "learning_rate": 4.578927911050789e-06, + "loss": 0.8843, + "step": 5864 + }, + { + "epoch": 0.4237180992287825, + "grad_norm": 5.611716629106518, + "learning_rate": 4.578765435665891e-06, + "loss": 0.8714, + "step": 5865 + }, + { + "epoch": 0.423790344428992, + "grad_norm": 7.314197832243753, + "learning_rate": 4.578602931824307e-06, + "loss": 0.9429, + "step": 5866 + }, + { + "epoch": 0.42386258962920154, + "grad_norm": 7.669077811972179, + "learning_rate": 4.578440399528264e-06, + "loss": 0.8986, + "step": 5867 + }, + { + "epoch": 0.423934834829411, + "grad_norm": 9.182713098460216, + "learning_rate": 4.5782778387799845e-06, + "loss": 0.8974, + "step": 5868 + }, + { + "epoch": 0.4240070800296205, + "grad_norm": 7.105354664679918, + "learning_rate": 4.578115249581695e-06, + "loss": 1.0272, + "step": 5869 + }, + { + "epoch": 0.42407932522983005, + "grad_norm": 6.059715024586488, + "learning_rate": 4.577952631935622e-06, + "loss": 0.8887, + "step": 5870 + }, + { + "epoch": 0.42415157043003954, + "grad_norm": 6.3825378732420495, + "learning_rate": 4.57778998584399e-06, + "loss": 0.8912, + "step": 5871 + }, + { + "epoch": 0.4242238156302491, + "grad_norm": 6.278521955975001, + "learning_rate": 4.577627311309028e-06, + "loss": 0.8794, + "step": 5872 + }, + { + "epoch": 0.42429606083045857, + "grad_norm": 6.0921328232750955, + "learning_rate": 4.5774646083329595e-06, + "loss": 0.9806, + "step": 5873 + }, + { + "epoch": 0.4243683060306681, + "grad_norm": 10.727143229568428, + "learning_rate": 4.577301876918016e-06, + "loss": 0.8466, + "step": 5874 + }, + { + "epoch": 0.4244405512308776, + "grad_norm": 7.191729022920102, + "learning_rate": 4.57713911706642e-06, + "loss": 0.8938, + "step": 5875 + }, + { + "epoch": 0.42451279643108714, + "grad_norm": 6.668297123212005, + "learning_rate": 4.576976328780404e-06, + "loss": 0.9059, + "step": 5876 + }, + { + "epoch": 0.4245850416312966, + "grad_norm": 7.435437597699756, + "learning_rate": 4.576813512062194e-06, + "loss": 0.9032, + "step": 5877 + }, + { + "epoch": 0.4246572868315061, + "grad_norm": 5.642583026401904, + "learning_rate": 4.57665066691402e-06, + "loss": 0.855, + "step": 5878 + }, + { + "epoch": 0.42472953203171565, + "grad_norm": 5.580018649053002, + "learning_rate": 4.57648779333811e-06, + "loss": 0.7558, + "step": 5879 + }, + { + "epoch": 0.42480177723192514, + "grad_norm": 6.152663372153959, + "learning_rate": 4.576324891336695e-06, + "loss": 0.792, + "step": 5880 + }, + { + "epoch": 0.4248740224321347, + "grad_norm": 5.87867589353508, + "learning_rate": 4.576161960912004e-06, + "loss": 0.8974, + "step": 5881 + }, + { + "epoch": 0.42494626763234417, + "grad_norm": 5.858804334189559, + "learning_rate": 4.575999002066268e-06, + "loss": 0.8967, + "step": 5882 + }, + { + "epoch": 0.4250185128325537, + "grad_norm": 6.711841242625673, + "learning_rate": 4.575836014801718e-06, + "loss": 0.9108, + "step": 5883 + }, + { + "epoch": 0.4250907580327632, + "grad_norm": 6.723446646047405, + "learning_rate": 4.575672999120585e-06, + "loss": 0.8467, + "step": 5884 + }, + { + "epoch": 0.42516300323297274, + "grad_norm": 4.831452738269701, + "learning_rate": 4.5755099550250996e-06, + "loss": 0.809, + "step": 5885 + }, + { + "epoch": 0.4252352484331822, + "grad_norm": 7.098089697225128, + "learning_rate": 4.5753468825174944e-06, + "loss": 0.8239, + "step": 5886 + }, + { + "epoch": 0.4253074936333917, + "grad_norm": 6.415570879753006, + "learning_rate": 4.575183781600002e-06, + "loss": 0.8744, + "step": 5887 + }, + { + "epoch": 0.42537973883360125, + "grad_norm": 6.573632670325822, + "learning_rate": 4.575020652274855e-06, + "loss": 0.9449, + "step": 5888 + }, + { + "epoch": 0.42545198403381074, + "grad_norm": 8.283970108987692, + "learning_rate": 4.574857494544286e-06, + "loss": 0.9328, + "step": 5889 + }, + { + "epoch": 0.4255242292340203, + "grad_norm": 5.748386322482163, + "learning_rate": 4.574694308410529e-06, + "loss": 0.8943, + "step": 5890 + }, + { + "epoch": 0.42559647443422977, + "grad_norm": 5.907090868962461, + "learning_rate": 4.574531093875818e-06, + "loss": 0.8806, + "step": 5891 + }, + { + "epoch": 0.4256687196344393, + "grad_norm": 6.30228678057027, + "learning_rate": 4.5743678509423875e-06, + "loss": 0.9413, + "step": 5892 + }, + { + "epoch": 0.4257409648346488, + "grad_norm": 5.663860112224667, + "learning_rate": 4.574204579612471e-06, + "loss": 0.93, + "step": 5893 + }, + { + "epoch": 0.42581321003485834, + "grad_norm": 7.074728312852981, + "learning_rate": 4.574041279888305e-06, + "loss": 0.9032, + "step": 5894 + }, + { + "epoch": 0.4258854552350678, + "grad_norm": 6.755328300624534, + "learning_rate": 4.573877951772124e-06, + "loss": 0.8869, + "step": 5895 + }, + { + "epoch": 0.4259577004352773, + "grad_norm": 6.353609287850962, + "learning_rate": 4.573714595266164e-06, + "loss": 0.8606, + "step": 5896 + }, + { + "epoch": 0.42602994563548685, + "grad_norm": 7.7970061205157055, + "learning_rate": 4.573551210372661e-06, + "loss": 0.8324, + "step": 5897 + }, + { + "epoch": 0.42610219083569634, + "grad_norm": 6.087674756058086, + "learning_rate": 4.573387797093852e-06, + "loss": 0.9385, + "step": 5898 + }, + { + "epoch": 0.4261744360359059, + "grad_norm": 5.656495652739248, + "learning_rate": 4.573224355431974e-06, + "loss": 0.9198, + "step": 5899 + }, + { + "epoch": 0.42624668123611537, + "grad_norm": 7.043731868148524, + "learning_rate": 4.573060885389266e-06, + "loss": 0.9297, + "step": 5900 + }, + { + "epoch": 0.4263189264363249, + "grad_norm": 6.451869888724908, + "learning_rate": 4.5728973869679625e-06, + "loss": 0.9722, + "step": 5901 + }, + { + "epoch": 0.4263911716365344, + "grad_norm": 8.445477818398121, + "learning_rate": 4.5727338601703035e-06, + "loss": 0.9034, + "step": 5902 + }, + { + "epoch": 0.42646341683674394, + "grad_norm": 7.393629177263942, + "learning_rate": 4.572570304998527e-06, + "loss": 0.9194, + "step": 5903 + }, + { + "epoch": 0.4265356620369534, + "grad_norm": 6.608952790611036, + "learning_rate": 4.5724067214548725e-06, + "loss": 0.9225, + "step": 5904 + }, + { + "epoch": 0.4266079072371629, + "grad_norm": 7.214606082390934, + "learning_rate": 4.572243109541579e-06, + "loss": 0.899, + "step": 5905 + }, + { + "epoch": 0.42668015243737245, + "grad_norm": 6.3477914649060105, + "learning_rate": 4.572079469260886e-06, + "loss": 0.9228, + "step": 5906 + }, + { + "epoch": 0.42675239763758194, + "grad_norm": 5.662943720326898, + "learning_rate": 4.571915800615035e-06, + "loss": 0.8434, + "step": 5907 + }, + { + "epoch": 0.4268246428377915, + "grad_norm": 6.188596069665281, + "learning_rate": 4.571752103606265e-06, + "loss": 0.893, + "step": 5908 + }, + { + "epoch": 0.42689688803800097, + "grad_norm": 6.399199388018566, + "learning_rate": 4.571588378236817e-06, + "loss": 0.924, + "step": 5909 + }, + { + "epoch": 0.4269691332382105, + "grad_norm": 6.352445008551426, + "learning_rate": 4.5714246245089325e-06, + "loss": 1.0217, + "step": 5910 + }, + { + "epoch": 0.42704137843842, + "grad_norm": 6.045887316886276, + "learning_rate": 4.571260842424854e-06, + "loss": 0.8557, + "step": 5911 + }, + { + "epoch": 0.42711362363862954, + "grad_norm": 6.541668189305738, + "learning_rate": 4.571097031986822e-06, + "loss": 0.8527, + "step": 5912 + }, + { + "epoch": 0.427185868838839, + "grad_norm": 6.117646142907743, + "learning_rate": 4.57093319319708e-06, + "loss": 0.8592, + "step": 5913 + }, + { + "epoch": 0.4272581140390485, + "grad_norm": 6.741198134517213, + "learning_rate": 4.57076932605787e-06, + "loss": 0.9666, + "step": 5914 + }, + { + "epoch": 0.42733035923925805, + "grad_norm": 6.066180812677744, + "learning_rate": 4.570605430571437e-06, + "loss": 0.9324, + "step": 5915 + }, + { + "epoch": 0.42740260443946754, + "grad_norm": 6.60259589960838, + "learning_rate": 4.570441506740023e-06, + "loss": 0.8148, + "step": 5916 + }, + { + "epoch": 0.4274748496396771, + "grad_norm": 7.186416677020733, + "learning_rate": 4.570277554565872e-06, + "loss": 0.9751, + "step": 5917 + }, + { + "epoch": 0.42754709483988657, + "grad_norm": 6.175544053794091, + "learning_rate": 4.5701135740512285e-06, + "loss": 0.8791, + "step": 5918 + }, + { + "epoch": 0.4276193400400961, + "grad_norm": 6.566482062415567, + "learning_rate": 4.569949565198338e-06, + "loss": 0.7983, + "step": 5919 + }, + { + "epoch": 0.4276915852403056, + "grad_norm": 7.3947566865067795, + "learning_rate": 4.569785528009445e-06, + "loss": 0.8925, + "step": 5920 + }, + { + "epoch": 0.42776383044051514, + "grad_norm": 7.378391585164968, + "learning_rate": 4.569621462486795e-06, + "loss": 0.9575, + "step": 5921 + }, + { + "epoch": 0.4278360756407246, + "grad_norm": 8.67635591227699, + "learning_rate": 4.569457368632635e-06, + "loss": 0.8661, + "step": 5922 + }, + { + "epoch": 0.4279083208409341, + "grad_norm": 6.111996607734578, + "learning_rate": 4.569293246449209e-06, + "loss": 0.9155, + "step": 5923 + }, + { + "epoch": 0.42798056604114365, + "grad_norm": 7.453858921215427, + "learning_rate": 4.569129095938767e-06, + "loss": 0.8795, + "step": 5924 + }, + { + "epoch": 0.42805281124135314, + "grad_norm": 5.849783164676317, + "learning_rate": 4.568964917103553e-06, + "loss": 0.9694, + "step": 5925 + }, + { + "epoch": 0.4281250564415627, + "grad_norm": 6.90883812373475, + "learning_rate": 4.568800709945816e-06, + "loss": 0.8678, + "step": 5926 + }, + { + "epoch": 0.42819730164177217, + "grad_norm": 8.130866177858673, + "learning_rate": 4.568636474467803e-06, + "loss": 1.0266, + "step": 5927 + }, + { + "epoch": 0.4282695468419817, + "grad_norm": 5.344638862078526, + "learning_rate": 4.568472210671764e-06, + "loss": 0.8706, + "step": 5928 + }, + { + "epoch": 0.4283417920421912, + "grad_norm": 8.272785510487932, + "learning_rate": 4.568307918559946e-06, + "loss": 0.8789, + "step": 5929 + }, + { + "epoch": 0.4284140372424007, + "grad_norm": 5.6185015970383825, + "learning_rate": 4.568143598134598e-06, + "loss": 0.913, + "step": 5930 + }, + { + "epoch": 0.4284862824426102, + "grad_norm": 8.455956737454558, + "learning_rate": 4.5679792493979705e-06, + "loss": 0.9069, + "step": 5931 + }, + { + "epoch": 0.4285585276428197, + "grad_norm": 5.862107435800472, + "learning_rate": 4.567814872352313e-06, + "loss": 0.8495, + "step": 5932 + }, + { + "epoch": 0.42863077284302925, + "grad_norm": 6.58237955773517, + "learning_rate": 4.567650466999874e-06, + "loss": 1.0102, + "step": 5933 + }, + { + "epoch": 0.42870301804323874, + "grad_norm": 6.147452793009895, + "learning_rate": 4.567486033342907e-06, + "loss": 0.9204, + "step": 5934 + }, + { + "epoch": 0.4287752632434483, + "grad_norm": 6.3731644923830935, + "learning_rate": 4.567321571383662e-06, + "loss": 0.9532, + "step": 5935 + }, + { + "epoch": 0.42884750844365777, + "grad_norm": 7.464268061837986, + "learning_rate": 4.567157081124388e-06, + "loss": 0.9647, + "step": 5936 + }, + { + "epoch": 0.4289197536438673, + "grad_norm": 7.579364478612848, + "learning_rate": 4.56699256256734e-06, + "loss": 0.9151, + "step": 5937 + }, + { + "epoch": 0.4289919988440768, + "grad_norm": 7.871589043039531, + "learning_rate": 4.5668280157147685e-06, + "loss": 0.8963, + "step": 5938 + }, + { + "epoch": 0.4290642440442863, + "grad_norm": 6.430898299762991, + "learning_rate": 4.566663440568926e-06, + "loss": 0.8002, + "step": 5939 + }, + { + "epoch": 0.4291364892444958, + "grad_norm": 6.942325322965828, + "learning_rate": 4.566498837132066e-06, + "loss": 0.9851, + "step": 5940 + }, + { + "epoch": 0.4292087344447053, + "grad_norm": 7.599120701568815, + "learning_rate": 4.56633420540644e-06, + "loss": 0.9092, + "step": 5941 + }, + { + "epoch": 0.42928097964491485, + "grad_norm": 5.887792093275932, + "learning_rate": 4.566169545394305e-06, + "loss": 0.9069, + "step": 5942 + }, + { + "epoch": 0.42935322484512434, + "grad_norm": 6.132651707187513, + "learning_rate": 4.566004857097913e-06, + "loss": 0.8764, + "step": 5943 + }, + { + "epoch": 0.4294254700453339, + "grad_norm": 6.677899180384419, + "learning_rate": 4.565840140519518e-06, + "loss": 0.9776, + "step": 5944 + }, + { + "epoch": 0.42949771524554337, + "grad_norm": 6.629071406126165, + "learning_rate": 4.565675395661376e-06, + "loss": 0.9317, + "step": 5945 + }, + { + "epoch": 0.4295699604457529, + "grad_norm": 6.378464000763287, + "learning_rate": 4.565510622525741e-06, + "loss": 0.9041, + "step": 5946 + }, + { + "epoch": 0.4296422056459624, + "grad_norm": 6.531251752205208, + "learning_rate": 4.565345821114871e-06, + "loss": 0.8668, + "step": 5947 + }, + { + "epoch": 0.4297144508461719, + "grad_norm": 6.992860150082603, + "learning_rate": 4.565180991431019e-06, + "loss": 0.9191, + "step": 5948 + }, + { + "epoch": 0.4297866960463814, + "grad_norm": 6.536708175524792, + "learning_rate": 4.565016133476442e-06, + "loss": 0.9275, + "step": 5949 + }, + { + "epoch": 0.4298589412465909, + "grad_norm": 6.134624471075882, + "learning_rate": 4.564851247253398e-06, + "loss": 0.8569, + "step": 5950 + }, + { + "epoch": 0.42993118644680045, + "grad_norm": 7.107250390810484, + "learning_rate": 4.5646863327641445e-06, + "loss": 0.861, + "step": 5951 + }, + { + "epoch": 0.43000343164700994, + "grad_norm": 10.466502571844416, + "learning_rate": 4.564521390010938e-06, + "loss": 0.9202, + "step": 5952 + }, + { + "epoch": 0.4300756768472195, + "grad_norm": 6.266455388396667, + "learning_rate": 4.564356418996036e-06, + "loss": 0.9597, + "step": 5953 + }, + { + "epoch": 0.43014792204742897, + "grad_norm": 6.908263590437248, + "learning_rate": 4.564191419721698e-06, + "loss": 0.9821, + "step": 5954 + }, + { + "epoch": 0.4302201672476385, + "grad_norm": 6.943433543907662, + "learning_rate": 4.5640263921901825e-06, + "loss": 0.8765, + "step": 5955 + }, + { + "epoch": 0.430292412447848, + "grad_norm": 6.378772591699074, + "learning_rate": 4.5638613364037475e-06, + "loss": 0.8461, + "step": 5956 + }, + { + "epoch": 0.4303646576480575, + "grad_norm": 6.2565892389567415, + "learning_rate": 4.563696252364654e-06, + "loss": 0.9562, + "step": 5957 + }, + { + "epoch": 0.430436902848267, + "grad_norm": 5.756243425959872, + "learning_rate": 4.563531140075161e-06, + "loss": 0.8601, + "step": 5958 + }, + { + "epoch": 0.4305091480484765, + "grad_norm": 5.776411021204141, + "learning_rate": 4.563365999537529e-06, + "loss": 0.9299, + "step": 5959 + }, + { + "epoch": 0.43058139324868605, + "grad_norm": 6.823454841956867, + "learning_rate": 4.563200830754018e-06, + "loss": 0.9929, + "step": 5960 + }, + { + "epoch": 0.43065363844889554, + "grad_norm": 6.6953506735711645, + "learning_rate": 4.56303563372689e-06, + "loss": 0.8947, + "step": 5961 + }, + { + "epoch": 0.4307258836491051, + "grad_norm": 6.241364273195617, + "learning_rate": 4.562870408458406e-06, + "loss": 0.875, + "step": 5962 + }, + { + "epoch": 0.43079812884931457, + "grad_norm": 5.083013722528985, + "learning_rate": 4.562705154950828e-06, + "loss": 0.7822, + "step": 5963 + }, + { + "epoch": 0.4308703740495241, + "grad_norm": 6.47849236646036, + "learning_rate": 4.562539873206418e-06, + "loss": 0.964, + "step": 5964 + }, + { + "epoch": 0.4309426192497336, + "grad_norm": 7.856604267833394, + "learning_rate": 4.562374563227438e-06, + "loss": 0.877, + "step": 5965 + }, + { + "epoch": 0.4310148644499431, + "grad_norm": 6.389247456684092, + "learning_rate": 4.562209225016152e-06, + "loss": 0.9991, + "step": 5966 + }, + { + "epoch": 0.4310871096501526, + "grad_norm": 6.864369166674858, + "learning_rate": 4.562043858574823e-06, + "loss": 0.8891, + "step": 5967 + }, + { + "epoch": 0.4311593548503621, + "grad_norm": 7.291990552702542, + "learning_rate": 4.561878463905714e-06, + "loss": 0.866, + "step": 5968 + }, + { + "epoch": 0.43123160005057165, + "grad_norm": 7.628737487332669, + "learning_rate": 4.561713041011091e-06, + "loss": 0.9795, + "step": 5969 + }, + { + "epoch": 0.43130384525078114, + "grad_norm": 7.856266324580917, + "learning_rate": 4.561547589893217e-06, + "loss": 0.9339, + "step": 5970 + }, + { + "epoch": 0.4313760904509907, + "grad_norm": 5.486846280099383, + "learning_rate": 4.5613821105543566e-06, + "loss": 0.7987, + "step": 5971 + }, + { + "epoch": 0.43144833565120017, + "grad_norm": 6.527816234863124, + "learning_rate": 4.561216602996775e-06, + "loss": 0.8781, + "step": 5972 + }, + { + "epoch": 0.4315205808514097, + "grad_norm": 7.728313251551811, + "learning_rate": 4.561051067222741e-06, + "loss": 0.8695, + "step": 5973 + }, + { + "epoch": 0.4315928260516192, + "grad_norm": 7.1722779004987265, + "learning_rate": 4.560885503234516e-06, + "loss": 0.8922, + "step": 5974 + }, + { + "epoch": 0.4316650712518287, + "grad_norm": 7.079008539898719, + "learning_rate": 4.560719911034369e-06, + "loss": 0.8895, + "step": 5975 + }, + { + "epoch": 0.4317373164520382, + "grad_norm": 7.737112096575227, + "learning_rate": 4.5605542906245665e-06, + "loss": 0.9311, + "step": 5976 + }, + { + "epoch": 0.4318095616522477, + "grad_norm": 7.688011834698161, + "learning_rate": 4.5603886420073765e-06, + "loss": 0.9371, + "step": 5977 + }, + { + "epoch": 0.43188180685245725, + "grad_norm": 7.889594949398617, + "learning_rate": 4.560222965185065e-06, + "loss": 0.8365, + "step": 5978 + }, + { + "epoch": 0.43195405205266674, + "grad_norm": 6.997450773087106, + "learning_rate": 4.5600572601599005e-06, + "loss": 0.917, + "step": 5979 + }, + { + "epoch": 0.4320262972528763, + "grad_norm": 7.6364320198729345, + "learning_rate": 4.559891526934152e-06, + "loss": 0.9752, + "step": 5980 + }, + { + "epoch": 0.43209854245308577, + "grad_norm": 7.3364925804207255, + "learning_rate": 4.559725765510088e-06, + "loss": 0.9058, + "step": 5981 + }, + { + "epoch": 0.4321707876532953, + "grad_norm": 6.4673197911315405, + "learning_rate": 4.559559975889978e-06, + "loss": 0.8693, + "step": 5982 + }, + { + "epoch": 0.4322430328535048, + "grad_norm": 6.713786150079114, + "learning_rate": 4.55939415807609e-06, + "loss": 0.8955, + "step": 5983 + }, + { + "epoch": 0.4323152780537143, + "grad_norm": 7.9589105633080415, + "learning_rate": 4.559228312070696e-06, + "loss": 0.8678, + "step": 5984 + }, + { + "epoch": 0.4323875232539238, + "grad_norm": 8.045236483666903, + "learning_rate": 4.559062437876064e-06, + "loss": 0.8012, + "step": 5985 + }, + { + "epoch": 0.4324597684541333, + "grad_norm": 6.260156542493442, + "learning_rate": 4.558896535494467e-06, + "loss": 0.886, + "step": 5986 + }, + { + "epoch": 0.43253201365434285, + "grad_norm": 6.761373474720091, + "learning_rate": 4.558730604928175e-06, + "loss": 0.8764, + "step": 5987 + }, + { + "epoch": 0.43260425885455234, + "grad_norm": 6.5116742275022625, + "learning_rate": 4.55856464617946e-06, + "loss": 0.8838, + "step": 5988 + }, + { + "epoch": 0.4326765040547619, + "grad_norm": 6.812947389803511, + "learning_rate": 4.558398659250593e-06, + "loss": 0.8793, + "step": 5989 + }, + { + "epoch": 0.43274874925497137, + "grad_norm": 5.960725353060577, + "learning_rate": 4.558232644143847e-06, + "loss": 0.9521, + "step": 5990 + }, + { + "epoch": 0.4328209944551809, + "grad_norm": 5.9447106195832555, + "learning_rate": 4.558066600861493e-06, + "loss": 0.7735, + "step": 5991 + }, + { + "epoch": 0.4328932396553904, + "grad_norm": 5.561723290554229, + "learning_rate": 4.5579005294058056e-06, + "loss": 0.8324, + "step": 5992 + }, + { + "epoch": 0.4329654848555999, + "grad_norm": 6.984805646416577, + "learning_rate": 4.5577344297790576e-06, + "loss": 0.9293, + "step": 5993 + }, + { + "epoch": 0.4330377300558094, + "grad_norm": 6.636153135844803, + "learning_rate": 4.557568301983524e-06, + "loss": 0.949, + "step": 5994 + }, + { + "epoch": 0.4331099752560189, + "grad_norm": 6.837975421675018, + "learning_rate": 4.557402146021477e-06, + "loss": 0.8056, + "step": 5995 + }, + { + "epoch": 0.43318222045622845, + "grad_norm": 7.307547693713564, + "learning_rate": 4.557235961895192e-06, + "loss": 0.8756, + "step": 5996 + }, + { + "epoch": 0.43325446565643794, + "grad_norm": 7.054436325383941, + "learning_rate": 4.557069749606945e-06, + "loss": 0.9268, + "step": 5997 + }, + { + "epoch": 0.4333267108566475, + "grad_norm": 5.702093161687254, + "learning_rate": 4.5569035091590095e-06, + "loss": 0.8716, + "step": 5998 + }, + { + "epoch": 0.43339895605685697, + "grad_norm": 6.4371693350300765, + "learning_rate": 4.5567372405536626e-06, + "loss": 0.9662, + "step": 5999 + }, + { + "epoch": 0.4334712012570665, + "grad_norm": 5.722262945102281, + "learning_rate": 4.556570943793179e-06, + "loss": 0.8742, + "step": 6000 + }, + { + "epoch": 0.433543446457276, + "grad_norm": 6.4138599879533045, + "learning_rate": 4.556404618879837e-06, + "loss": 0.7794, + "step": 6001 + }, + { + "epoch": 0.4336156916574855, + "grad_norm": 7.1211968695393, + "learning_rate": 4.556238265815912e-06, + "loss": 0.9067, + "step": 6002 + }, + { + "epoch": 0.433687936857695, + "grad_norm": 7.42731092049913, + "learning_rate": 4.5560718846036825e-06, + "loss": 0.8693, + "step": 6003 + }, + { + "epoch": 0.4337601820579045, + "grad_norm": 5.947241250976326, + "learning_rate": 4.555905475245424e-06, + "loss": 0.8699, + "step": 6004 + }, + { + "epoch": 0.43383242725811405, + "grad_norm": 7.64555739512039, + "learning_rate": 4.5557390377434166e-06, + "loss": 0.965, + "step": 6005 + }, + { + "epoch": 0.43390467245832354, + "grad_norm": 6.089493580575572, + "learning_rate": 4.555572572099938e-06, + "loss": 0.9113, + "step": 6006 + }, + { + "epoch": 0.4339769176585331, + "grad_norm": 7.63478611055067, + "learning_rate": 4.555406078317266e-06, + "loss": 0.9329, + "step": 6007 + }, + { + "epoch": 0.43404916285874257, + "grad_norm": 6.704520838241176, + "learning_rate": 4.555239556397681e-06, + "loss": 0.9105, + "step": 6008 + }, + { + "epoch": 0.4341214080589521, + "grad_norm": 6.386014212382553, + "learning_rate": 4.555073006343464e-06, + "loss": 0.8303, + "step": 6009 + }, + { + "epoch": 0.4341936532591616, + "grad_norm": 6.975515870190064, + "learning_rate": 4.554906428156891e-06, + "loss": 0.9068, + "step": 6010 + }, + { + "epoch": 0.4342658984593711, + "grad_norm": 9.127314861826235, + "learning_rate": 4.554739821840246e-06, + "loss": 0.9624, + "step": 6011 + }, + { + "epoch": 0.4343381436595806, + "grad_norm": 7.336729419283318, + "learning_rate": 4.554573187395807e-06, + "loss": 0.8545, + "step": 6012 + }, + { + "epoch": 0.4344103888597901, + "grad_norm": 7.01039169632053, + "learning_rate": 4.5544065248258575e-06, + "loss": 0.8818, + "step": 6013 + }, + { + "epoch": 0.43448263405999965, + "grad_norm": 5.612879941182678, + "learning_rate": 4.554239834132677e-06, + "loss": 0.9153, + "step": 6014 + }, + { + "epoch": 0.43455487926020914, + "grad_norm": 6.609998019819786, + "learning_rate": 4.554073115318549e-06, + "loss": 0.9227, + "step": 6015 + }, + { + "epoch": 0.4346271244604187, + "grad_norm": 7.301384718738984, + "learning_rate": 4.553906368385754e-06, + "loss": 0.8434, + "step": 6016 + }, + { + "epoch": 0.43469936966062817, + "grad_norm": 7.697118680800506, + "learning_rate": 4.553739593336577e-06, + "loss": 0.8887, + "step": 6017 + }, + { + "epoch": 0.4347716148608377, + "grad_norm": 6.5607832343363715, + "learning_rate": 4.5535727901732975e-06, + "loss": 0.8829, + "step": 6018 + }, + { + "epoch": 0.4348438600610472, + "grad_norm": 8.658879180336678, + "learning_rate": 4.553405958898202e-06, + "loss": 0.9778, + "step": 6019 + }, + { + "epoch": 0.4349161052612567, + "grad_norm": 6.008513450656389, + "learning_rate": 4.553239099513574e-06, + "loss": 0.9005, + "step": 6020 + }, + { + "epoch": 0.4349883504614662, + "grad_norm": 6.057890722730555, + "learning_rate": 4.553072212021696e-06, + "loss": 0.835, + "step": 6021 + }, + { + "epoch": 0.4350605956616757, + "grad_norm": 6.565443559524427, + "learning_rate": 4.552905296424855e-06, + "loss": 0.8997, + "step": 6022 + }, + { + "epoch": 0.43513284086188525, + "grad_norm": 5.957286111351376, + "learning_rate": 4.552738352725333e-06, + "loss": 0.86, + "step": 6023 + }, + { + "epoch": 0.43520508606209474, + "grad_norm": 5.759953384210783, + "learning_rate": 4.552571380925417e-06, + "loss": 0.9201, + "step": 6024 + }, + { + "epoch": 0.4352773312623043, + "grad_norm": 6.908755577199723, + "learning_rate": 4.5524043810273926e-06, + "loss": 0.8864, + "step": 6025 + }, + { + "epoch": 0.43534957646251377, + "grad_norm": 6.520532895643121, + "learning_rate": 4.552237353033546e-06, + "loss": 0.9671, + "step": 6026 + }, + { + "epoch": 0.4354218216627233, + "grad_norm": 6.073202223522837, + "learning_rate": 4.552070296946164e-06, + "loss": 0.854, + "step": 6027 + }, + { + "epoch": 0.4354940668629328, + "grad_norm": 6.492702495838146, + "learning_rate": 4.551903212767532e-06, + "loss": 1.0154, + "step": 6028 + }, + { + "epoch": 0.4355663120631423, + "grad_norm": 7.159393286546376, + "learning_rate": 4.551736100499938e-06, + "loss": 0.8935, + "step": 6029 + }, + { + "epoch": 0.4356385572633518, + "grad_norm": 6.898873234398272, + "learning_rate": 4.551568960145671e-06, + "loss": 0.9499, + "step": 6030 + }, + { + "epoch": 0.4357108024635613, + "grad_norm": 6.05545137570482, + "learning_rate": 4.551401791707017e-06, + "loss": 0.9041, + "step": 6031 + }, + { + "epoch": 0.43578304766377085, + "grad_norm": 7.478324084694555, + "learning_rate": 4.551234595186266e-06, + "loss": 0.9441, + "step": 6032 + }, + { + "epoch": 0.43585529286398034, + "grad_norm": 5.477766179521073, + "learning_rate": 4.551067370585706e-06, + "loss": 0.7793, + "step": 6033 + }, + { + "epoch": 0.4359275380641899, + "grad_norm": 5.914822931420567, + "learning_rate": 4.550900117907625e-06, + "loss": 0.8628, + "step": 6034 + }, + { + "epoch": 0.43599978326439937, + "grad_norm": 5.789554714019127, + "learning_rate": 4.550732837154315e-06, + "loss": 0.9009, + "step": 6035 + }, + { + "epoch": 0.4360720284646089, + "grad_norm": 6.558226729047254, + "learning_rate": 4.550565528328065e-06, + "loss": 0.8532, + "step": 6036 + }, + { + "epoch": 0.4361442736648184, + "grad_norm": 6.469477663254706, + "learning_rate": 4.550398191431163e-06, + "loss": 0.8684, + "step": 6037 + }, + { + "epoch": 0.4362165188650279, + "grad_norm": 5.545773163012163, + "learning_rate": 4.550230826465904e-06, + "loss": 0.8881, + "step": 6038 + }, + { + "epoch": 0.4362887640652374, + "grad_norm": 6.920584737074415, + "learning_rate": 4.550063433434576e-06, + "loss": 0.8684, + "step": 6039 + }, + { + "epoch": 0.4363610092654469, + "grad_norm": 5.424888630791801, + "learning_rate": 4.549896012339472e-06, + "loss": 0.8811, + "step": 6040 + }, + { + "epoch": 0.43643325446565645, + "grad_norm": 8.57097345687963, + "learning_rate": 4.549728563182882e-06, + "loss": 1.0096, + "step": 6041 + }, + { + "epoch": 0.43650549966586594, + "grad_norm": 7.932012628634697, + "learning_rate": 4.549561085967101e-06, + "loss": 0.9863, + "step": 6042 + }, + { + "epoch": 0.4365777448660755, + "grad_norm": 6.6842378164430665, + "learning_rate": 4.54939358069442e-06, + "loss": 0.9024, + "step": 6043 + }, + { + "epoch": 0.43664999006628497, + "grad_norm": 8.321727249146038, + "learning_rate": 4.549226047367131e-06, + "loss": 0.8562, + "step": 6044 + }, + { + "epoch": 0.4367222352664945, + "grad_norm": 7.044871787751667, + "learning_rate": 4.549058485987529e-06, + "loss": 0.861, + "step": 6045 + }, + { + "epoch": 0.436794480466704, + "grad_norm": 7.085710582864031, + "learning_rate": 4.548890896557907e-06, + "loss": 0.8628, + "step": 6046 + }, + { + "epoch": 0.4368667256669135, + "grad_norm": 6.658072208763538, + "learning_rate": 4.54872327908056e-06, + "loss": 0.9425, + "step": 6047 + }, + { + "epoch": 0.436938970867123, + "grad_norm": 7.189630607273434, + "learning_rate": 4.5485556335577825e-06, + "loss": 0.9214, + "step": 6048 + }, + { + "epoch": 0.4370112160673325, + "grad_norm": 6.545131988476782, + "learning_rate": 4.548387959991868e-06, + "loss": 0.9027, + "step": 6049 + }, + { + "epoch": 0.43708346126754205, + "grad_norm": 5.938005606556472, + "learning_rate": 4.548220258385114e-06, + "loss": 0.8639, + "step": 6050 + }, + { + "epoch": 0.43715570646775154, + "grad_norm": 6.596850493370663, + "learning_rate": 4.548052528739813e-06, + "loss": 0.9846, + "step": 6051 + }, + { + "epoch": 0.4372279516679611, + "grad_norm": 5.958818891609965, + "learning_rate": 4.547884771058265e-06, + "loss": 0.848, + "step": 6052 + }, + { + "epoch": 0.43730019686817057, + "grad_norm": 6.070261912092136, + "learning_rate": 4.5477169853427635e-06, + "loss": 0.8548, + "step": 6053 + }, + { + "epoch": 0.4373724420683801, + "grad_norm": 6.552149284609039, + "learning_rate": 4.547549171595608e-06, + "loss": 0.9037, + "step": 6054 + }, + { + "epoch": 0.4374446872685896, + "grad_norm": 6.034943867622263, + "learning_rate": 4.547381329819092e-06, + "loss": 0.9019, + "step": 6055 + }, + { + "epoch": 0.4375169324687991, + "grad_norm": 6.283158804630755, + "learning_rate": 4.547213460015516e-06, + "loss": 0.8743, + "step": 6056 + }, + { + "epoch": 0.4375891776690086, + "grad_norm": 6.479613391975841, + "learning_rate": 4.547045562187178e-06, + "loss": 0.9086, + "step": 6057 + }, + { + "epoch": 0.4376614228692181, + "grad_norm": 6.953313177755567, + "learning_rate": 4.546877636336376e-06, + "loss": 0.9487, + "step": 6058 + }, + { + "epoch": 0.43773366806942765, + "grad_norm": 6.211280208705343, + "learning_rate": 4.546709682465407e-06, + "loss": 0.8995, + "step": 6059 + }, + { + "epoch": 0.43780591326963714, + "grad_norm": 6.52484079217886, + "learning_rate": 4.5465417005765724e-06, + "loss": 0.9199, + "step": 6060 + }, + { + "epoch": 0.4378781584698467, + "grad_norm": 6.182022907314603, + "learning_rate": 4.54637369067217e-06, + "loss": 0.8675, + "step": 6061 + }, + { + "epoch": 0.43795040367005617, + "grad_norm": 6.350934249756217, + "learning_rate": 4.546205652754502e-06, + "loss": 0.8176, + "step": 6062 + }, + { + "epoch": 0.4380226488702657, + "grad_norm": 9.63365359010488, + "learning_rate": 4.546037586825866e-06, + "loss": 0.9469, + "step": 6063 + }, + { + "epoch": 0.4380948940704752, + "grad_norm": 5.540657792959832, + "learning_rate": 4.545869492888566e-06, + "loss": 0.9483, + "step": 6064 + }, + { + "epoch": 0.4381671392706847, + "grad_norm": 6.767858264403928, + "learning_rate": 4.5457013709449e-06, + "loss": 0.965, + "step": 6065 + }, + { + "epoch": 0.4382393844708942, + "grad_norm": 6.10195605074829, + "learning_rate": 4.545533220997171e-06, + "loss": 0.8925, + "step": 6066 + }, + { + "epoch": 0.4383116296711037, + "grad_norm": 6.535839165976747, + "learning_rate": 4.5453650430476796e-06, + "loss": 0.9646, + "step": 6067 + }, + { + "epoch": 0.43838387487131325, + "grad_norm": 6.974262332534375, + "learning_rate": 4.54519683709873e-06, + "loss": 0.8618, + "step": 6068 + }, + { + "epoch": 0.43845612007152274, + "grad_norm": 7.1109955334542745, + "learning_rate": 4.545028603152623e-06, + "loss": 0.9215, + "step": 6069 + }, + { + "epoch": 0.4385283652717323, + "grad_norm": 6.54730849549311, + "learning_rate": 4.544860341211662e-06, + "loss": 0.929, + "step": 6070 + }, + { + "epoch": 0.43860061047194177, + "grad_norm": 5.633049694045919, + "learning_rate": 4.544692051278152e-06, + "loss": 0.8609, + "step": 6071 + }, + { + "epoch": 0.4386728556721513, + "grad_norm": 7.2806068758526505, + "learning_rate": 4.544523733354394e-06, + "loss": 0.9521, + "step": 6072 + }, + { + "epoch": 0.4387451008723608, + "grad_norm": 6.165294528927062, + "learning_rate": 4.544355387442694e-06, + "loss": 0.824, + "step": 6073 + }, + { + "epoch": 0.4388173460725703, + "grad_norm": 8.572843443942942, + "learning_rate": 4.544187013545356e-06, + "loss": 0.9474, + "step": 6074 + }, + { + "epoch": 0.4388895912727798, + "grad_norm": 6.939139232627842, + "learning_rate": 4.544018611664685e-06, + "loss": 1.039, + "step": 6075 + }, + { + "epoch": 0.4389618364729893, + "grad_norm": 6.462782609024821, + "learning_rate": 4.5438501818029875e-06, + "loss": 0.9111, + "step": 6076 + }, + { + "epoch": 0.43903408167319885, + "grad_norm": 6.738949242251973, + "learning_rate": 4.543681723962567e-06, + "loss": 0.9374, + "step": 6077 + }, + { + "epoch": 0.43910632687340834, + "grad_norm": 6.846784084592225, + "learning_rate": 4.54351323814573e-06, + "loss": 0.9556, + "step": 6078 + }, + { + "epoch": 0.4391785720736179, + "grad_norm": 5.7085941375778235, + "learning_rate": 4.543344724354784e-06, + "loss": 0.9148, + "step": 6079 + }, + { + "epoch": 0.43925081727382737, + "grad_norm": 6.543371606068057, + "learning_rate": 4.543176182592035e-06, + "loss": 0.8541, + "step": 6080 + }, + { + "epoch": 0.4393230624740369, + "grad_norm": 6.021666190897633, + "learning_rate": 4.543007612859791e-06, + "loss": 0.8775, + "step": 6081 + }, + { + "epoch": 0.4393953076742464, + "grad_norm": 6.674723461580578, + "learning_rate": 4.542839015160358e-06, + "loss": 0.8819, + "step": 6082 + }, + { + "epoch": 0.4394675528744559, + "grad_norm": 8.805022401710794, + "learning_rate": 4.542670389496047e-06, + "loss": 0.9435, + "step": 6083 + }, + { + "epoch": 0.4395397980746654, + "grad_norm": 5.126743485297278, + "learning_rate": 4.542501735869163e-06, + "loss": 0.9085, + "step": 6084 + }, + { + "epoch": 0.4396120432748749, + "grad_norm": 5.752830140588617, + "learning_rate": 4.542333054282016e-06, + "loss": 0.8669, + "step": 6085 + }, + { + "epoch": 0.43968428847508445, + "grad_norm": 5.749508380605453, + "learning_rate": 4.542164344736916e-06, + "loss": 0.8152, + "step": 6086 + }, + { + "epoch": 0.43975653367529394, + "grad_norm": 6.5108458191889325, + "learning_rate": 4.541995607236171e-06, + "loss": 0.8819, + "step": 6087 + }, + { + "epoch": 0.4398287788755035, + "grad_norm": 6.553893631156645, + "learning_rate": 4.541826841782092e-06, + "loss": 0.8443, + "step": 6088 + }, + { + "epoch": 0.43990102407571297, + "grad_norm": 6.654763499012057, + "learning_rate": 4.541658048376989e-06, + "loss": 0.8745, + "step": 6089 + }, + { + "epoch": 0.4399732692759225, + "grad_norm": 5.2577517365661395, + "learning_rate": 4.541489227023173e-06, + "loss": 0.8394, + "step": 6090 + }, + { + "epoch": 0.440045514476132, + "grad_norm": 6.781673295130396, + "learning_rate": 4.541320377722953e-06, + "loss": 0.896, + "step": 6091 + }, + { + "epoch": 0.4401177596763415, + "grad_norm": 5.862612712240226, + "learning_rate": 4.5411515004786436e-06, + "loss": 0.8479, + "step": 6092 + }, + { + "epoch": 0.440190004876551, + "grad_norm": 5.679701940211182, + "learning_rate": 4.540982595292555e-06, + "loss": 0.8568, + "step": 6093 + }, + { + "epoch": 0.4402622500767605, + "grad_norm": 6.873082396363168, + "learning_rate": 4.540813662166998e-06, + "loss": 0.8422, + "step": 6094 + }, + { + "epoch": 0.44033449527697005, + "grad_norm": 6.275312511671481, + "learning_rate": 4.540644701104287e-06, + "loss": 0.8848, + "step": 6095 + }, + { + "epoch": 0.44040674047717954, + "grad_norm": 6.431378166619524, + "learning_rate": 4.5404757121067354e-06, + "loss": 0.8704, + "step": 6096 + }, + { + "epoch": 0.4404789856773891, + "grad_norm": 5.874244154337273, + "learning_rate": 4.540306695176655e-06, + "loss": 0.8108, + "step": 6097 + }, + { + "epoch": 0.44055123087759857, + "grad_norm": 5.5792310456979015, + "learning_rate": 4.540137650316361e-06, + "loss": 0.9614, + "step": 6098 + }, + { + "epoch": 0.4406234760778081, + "grad_norm": 6.610726292769949, + "learning_rate": 4.539968577528165e-06, + "loss": 0.953, + "step": 6099 + }, + { + "epoch": 0.4406957212780176, + "grad_norm": 6.343100528637888, + "learning_rate": 4.539799476814384e-06, + "loss": 0.7811, + "step": 6100 + }, + { + "epoch": 0.4407679664782271, + "grad_norm": 8.210195520560996, + "learning_rate": 4.539630348177332e-06, + "loss": 0.8738, + "step": 6101 + }, + { + "epoch": 0.4408402116784366, + "grad_norm": 6.323325666075411, + "learning_rate": 4.539461191619324e-06, + "loss": 0.9039, + "step": 6102 + }, + { + "epoch": 0.4409124568786461, + "grad_norm": 6.82272970705403, + "learning_rate": 4.539292007142676e-06, + "loss": 0.8662, + "step": 6103 + }, + { + "epoch": 0.44098470207885565, + "grad_norm": 6.414851076295902, + "learning_rate": 4.539122794749704e-06, + "loss": 0.9335, + "step": 6104 + }, + { + "epoch": 0.44105694727906514, + "grad_norm": 9.393766617141615, + "learning_rate": 4.538953554442725e-06, + "loss": 0.9171, + "step": 6105 + }, + { + "epoch": 0.4411291924792747, + "grad_norm": 7.438011344193262, + "learning_rate": 4.538784286224054e-06, + "loss": 0.9353, + "step": 6106 + }, + { + "epoch": 0.44120143767948417, + "grad_norm": 7.641683119073571, + "learning_rate": 4.538614990096008e-06, + "loss": 0.8706, + "step": 6107 + }, + { + "epoch": 0.4412736828796937, + "grad_norm": 6.72794213584665, + "learning_rate": 4.5384456660609075e-06, + "loss": 0.8595, + "step": 6108 + }, + { + "epoch": 0.4413459280799032, + "grad_norm": 7.063835625400395, + "learning_rate": 4.538276314121069e-06, + "loss": 0.9184, + "step": 6109 + }, + { + "epoch": 0.4414181732801127, + "grad_norm": 6.785066295460131, + "learning_rate": 4.538106934278808e-06, + "loss": 0.8905, + "step": 6110 + }, + { + "epoch": 0.4414904184803222, + "grad_norm": 6.854674934008741, + "learning_rate": 4.537937526536447e-06, + "loss": 0.9945, + "step": 6111 + }, + { + "epoch": 0.4415626636805317, + "grad_norm": 5.999690365749159, + "learning_rate": 4.537768090896304e-06, + "loss": 0.9253, + "step": 6112 + }, + { + "epoch": 0.44163490888074125, + "grad_norm": 7.257678996686566, + "learning_rate": 4.537598627360698e-06, + "loss": 0.8992, + "step": 6113 + }, + { + "epoch": 0.44170715408095074, + "grad_norm": 7.482670408705204, + "learning_rate": 4.5374291359319474e-06, + "loss": 0.9138, + "step": 6114 + }, + { + "epoch": 0.4417793992811603, + "grad_norm": 6.596050710785057, + "learning_rate": 4.537259616612375e-06, + "loss": 0.8442, + "step": 6115 + }, + { + "epoch": 0.44185164448136977, + "grad_norm": 7.552902563100198, + "learning_rate": 4.537090069404301e-06, + "loss": 0.9189, + "step": 6116 + }, + { + "epoch": 0.4419238896815793, + "grad_norm": 10.57482371803334, + "learning_rate": 4.536920494310044e-06, + "loss": 0.9085, + "step": 6117 + }, + { + "epoch": 0.4419961348817888, + "grad_norm": 6.3314833030381115, + "learning_rate": 4.536750891331928e-06, + "loss": 0.8833, + "step": 6118 + }, + { + "epoch": 0.4420683800819983, + "grad_norm": 7.440698785267815, + "learning_rate": 4.5365812604722735e-06, + "loss": 0.8726, + "step": 6119 + }, + { + "epoch": 0.4421406252822078, + "grad_norm": 7.299560836750649, + "learning_rate": 4.536411601733403e-06, + "loss": 0.9283, + "step": 6120 + }, + { + "epoch": 0.4422128704824173, + "grad_norm": 5.350281232055466, + "learning_rate": 4.536241915117639e-06, + "loss": 0.8233, + "step": 6121 + }, + { + "epoch": 0.44228511568262685, + "grad_norm": 7.36279321161155, + "learning_rate": 4.536072200627304e-06, + "loss": 1.0036, + "step": 6122 + }, + { + "epoch": 0.44235736088283634, + "grad_norm": 6.993862185971867, + "learning_rate": 4.535902458264722e-06, + "loss": 0.8822, + "step": 6123 + }, + { + "epoch": 0.4424296060830459, + "grad_norm": 7.214699934273189, + "learning_rate": 4.535732688032215e-06, + "loss": 0.9247, + "step": 6124 + }, + { + "epoch": 0.44250185128325537, + "grad_norm": 6.247263500046534, + "learning_rate": 4.535562889932109e-06, + "loss": 0.9227, + "step": 6125 + }, + { + "epoch": 0.4425740964834649, + "grad_norm": 7.719719829713225, + "learning_rate": 4.535393063966727e-06, + "loss": 0.9839, + "step": 6126 + }, + { + "epoch": 0.4426463416836744, + "grad_norm": 7.467191014725057, + "learning_rate": 4.5352232101383945e-06, + "loss": 0.9508, + "step": 6127 + }, + { + "epoch": 0.4427185868838839, + "grad_norm": 6.539003287609002, + "learning_rate": 4.535053328449437e-06, + "loss": 0.8762, + "step": 6128 + }, + { + "epoch": 0.4427908320840934, + "grad_norm": 5.585866989010828, + "learning_rate": 4.534883418902179e-06, + "loss": 0.7791, + "step": 6129 + }, + { + "epoch": 0.4428630772843029, + "grad_norm": 4.916808799801322, + "learning_rate": 4.5347134814989465e-06, + "loss": 0.8592, + "step": 6130 + }, + { + "epoch": 0.44293532248451245, + "grad_norm": 6.176004539665965, + "learning_rate": 4.534543516242068e-06, + "loss": 0.8767, + "step": 6131 + }, + { + "epoch": 0.44300756768472194, + "grad_norm": 5.518120652254435, + "learning_rate": 4.534373523133867e-06, + "loss": 0.8716, + "step": 6132 + }, + { + "epoch": 0.4430798128849315, + "grad_norm": 6.949214084016843, + "learning_rate": 4.534203502176673e-06, + "loss": 0.8765, + "step": 6133 + }, + { + "epoch": 0.44315205808514097, + "grad_norm": 5.2176466820592315, + "learning_rate": 4.534033453372812e-06, + "loss": 0.8502, + "step": 6134 + }, + { + "epoch": 0.4432243032853505, + "grad_norm": 5.600699939581875, + "learning_rate": 4.533863376724612e-06, + "loss": 0.8969, + "step": 6135 + }, + { + "epoch": 0.44329654848556, + "grad_norm": 5.248996457052752, + "learning_rate": 4.533693272234402e-06, + "loss": 0.8636, + "step": 6136 + }, + { + "epoch": 0.4433687936857695, + "grad_norm": 5.98980800925981, + "learning_rate": 4.53352313990451e-06, + "loss": 0.8799, + "step": 6137 + }, + { + "epoch": 0.443441038885979, + "grad_norm": 5.332762369746799, + "learning_rate": 4.533352979737265e-06, + "loss": 0.8994, + "step": 6138 + }, + { + "epoch": 0.4435132840861885, + "grad_norm": 6.665461939995994, + "learning_rate": 4.533182791734997e-06, + "loss": 0.8881, + "step": 6139 + }, + { + "epoch": 0.44358552928639805, + "grad_norm": 7.734348859646483, + "learning_rate": 4.533012575900035e-06, + "loss": 0.8694, + "step": 6140 + }, + { + "epoch": 0.44365777448660754, + "grad_norm": 6.689479356700393, + "learning_rate": 4.532842332234709e-06, + "loss": 0.8748, + "step": 6141 + }, + { + "epoch": 0.4437300196868171, + "grad_norm": 7.079519374503795, + "learning_rate": 4.532672060741351e-06, + "loss": 0.9392, + "step": 6142 + }, + { + "epoch": 0.44380226488702657, + "grad_norm": 8.391563320439017, + "learning_rate": 4.532501761422289e-06, + "loss": 0.8815, + "step": 6143 + }, + { + "epoch": 0.44387451008723605, + "grad_norm": 7.738747580282492, + "learning_rate": 4.532331434279857e-06, + "loss": 0.8877, + "step": 6144 + }, + { + "epoch": 0.4439467552874456, + "grad_norm": 6.605070836440328, + "learning_rate": 4.532161079316386e-06, + "loss": 0.897, + "step": 6145 + }, + { + "epoch": 0.4440190004876551, + "grad_norm": 5.977236162503033, + "learning_rate": 4.531990696534208e-06, + "loss": 0.7865, + "step": 6146 + }, + { + "epoch": 0.4440912456878646, + "grad_norm": 6.828744541580111, + "learning_rate": 4.531820285935654e-06, + "loss": 0.9396, + "step": 6147 + }, + { + "epoch": 0.4441634908880741, + "grad_norm": 7.935229359648931, + "learning_rate": 4.531649847523059e-06, + "loss": 0.8895, + "step": 6148 + }, + { + "epoch": 0.44423573608828365, + "grad_norm": 7.196516932310467, + "learning_rate": 4.531479381298754e-06, + "loss": 0.9263, + "step": 6149 + }, + { + "epoch": 0.44430798128849314, + "grad_norm": 8.09193806636716, + "learning_rate": 4.531308887265074e-06, + "loss": 0.9549, + "step": 6150 + }, + { + "epoch": 0.4443802264887027, + "grad_norm": 7.635521055776933, + "learning_rate": 4.531138365424352e-06, + "loss": 0.7982, + "step": 6151 + }, + { + "epoch": 0.44445247168891217, + "grad_norm": 6.599704458382078, + "learning_rate": 4.530967815778924e-06, + "loss": 0.8944, + "step": 6152 + }, + { + "epoch": 0.44452471688912165, + "grad_norm": 6.357657385710717, + "learning_rate": 4.530797238331122e-06, + "loss": 0.8836, + "step": 6153 + }, + { + "epoch": 0.4445969620893312, + "grad_norm": 7.1001098624312675, + "learning_rate": 4.530626633083284e-06, + "loss": 0.9502, + "step": 6154 + }, + { + "epoch": 0.4446692072895407, + "grad_norm": 8.29332753580919, + "learning_rate": 4.530456000037744e-06, + "loss": 0.9841, + "step": 6155 + }, + { + "epoch": 0.4447414524897502, + "grad_norm": 5.591765152830908, + "learning_rate": 4.530285339196838e-06, + "loss": 0.8682, + "step": 6156 + }, + { + "epoch": 0.4448136976899597, + "grad_norm": 7.823062196098369, + "learning_rate": 4.530114650562901e-06, + "loss": 0.9151, + "step": 6157 + }, + { + "epoch": 0.44488594289016925, + "grad_norm": 7.132998343947365, + "learning_rate": 4.529943934138271e-06, + "loss": 0.8584, + "step": 6158 + }, + { + "epoch": 0.44495818809037874, + "grad_norm": 7.778855043315315, + "learning_rate": 4.529773189925286e-06, + "loss": 0.8731, + "step": 6159 + }, + { + "epoch": 0.4450304332905883, + "grad_norm": 6.371328755341587, + "learning_rate": 4.529602417926281e-06, + "loss": 0.9377, + "step": 6160 + }, + { + "epoch": 0.44510267849079777, + "grad_norm": 5.189021427459497, + "learning_rate": 4.529431618143595e-06, + "loss": 0.8654, + "step": 6161 + }, + { + "epoch": 0.44517492369100725, + "grad_norm": 6.41193208614943, + "learning_rate": 4.529260790579566e-06, + "loss": 0.9692, + "step": 6162 + }, + { + "epoch": 0.4452471688912168, + "grad_norm": 7.940047928980149, + "learning_rate": 4.529089935236532e-06, + "loss": 0.8753, + "step": 6163 + }, + { + "epoch": 0.4453194140914263, + "grad_norm": 5.663050151987512, + "learning_rate": 4.528919052116832e-06, + "loss": 0.8744, + "step": 6164 + }, + { + "epoch": 0.4453916592916358, + "grad_norm": 5.928933800605543, + "learning_rate": 4.5287481412228065e-06, + "loss": 0.8196, + "step": 6165 + }, + { + "epoch": 0.4454639044918453, + "grad_norm": 6.567147489394534, + "learning_rate": 4.528577202556794e-06, + "loss": 0.7633, + "step": 6166 + }, + { + "epoch": 0.44553614969205485, + "grad_norm": 7.020439551957238, + "learning_rate": 4.528406236121134e-06, + "loss": 1.0152, + "step": 6167 + }, + { + "epoch": 0.44560839489226434, + "grad_norm": 7.4177990452546805, + "learning_rate": 4.528235241918168e-06, + "loss": 0.9086, + "step": 6168 + }, + { + "epoch": 0.4456806400924739, + "grad_norm": 6.792760405984114, + "learning_rate": 4.528064219950236e-06, + "loss": 0.8932, + "step": 6169 + }, + { + "epoch": 0.44575288529268337, + "grad_norm": 6.279488843294224, + "learning_rate": 4.52789317021968e-06, + "loss": 0.9091, + "step": 6170 + }, + { + "epoch": 0.44582513049289285, + "grad_norm": 8.132750116085186, + "learning_rate": 4.527722092728841e-06, + "loss": 0.9439, + "step": 6171 + }, + { + "epoch": 0.4458973756931024, + "grad_norm": 5.752039340398006, + "learning_rate": 4.527550987480061e-06, + "loss": 0.7933, + "step": 6172 + }, + { + "epoch": 0.4459696208933119, + "grad_norm": 8.24596792004896, + "learning_rate": 4.527379854475682e-06, + "loss": 0.8677, + "step": 6173 + }, + { + "epoch": 0.4460418660935214, + "grad_norm": 6.688589702489752, + "learning_rate": 4.527208693718047e-06, + "loss": 0.9229, + "step": 6174 + }, + { + "epoch": 0.4461141112937309, + "grad_norm": 7.714891712667366, + "learning_rate": 4.527037505209499e-06, + "loss": 0.8027, + "step": 6175 + }, + { + "epoch": 0.44618635649394045, + "grad_norm": 6.433972920198048, + "learning_rate": 4.526866288952382e-06, + "loss": 0.882, + "step": 6176 + }, + { + "epoch": 0.44625860169414994, + "grad_norm": 6.302497719836551, + "learning_rate": 4.526695044949039e-06, + "loss": 0.8577, + "step": 6177 + }, + { + "epoch": 0.4463308468943595, + "grad_norm": 6.094775304368667, + "learning_rate": 4.5265237732018144e-06, + "loss": 0.8627, + "step": 6178 + }, + { + "epoch": 0.44640309209456897, + "grad_norm": 7.1913669964300775, + "learning_rate": 4.526352473713053e-06, + "loss": 0.9448, + "step": 6179 + }, + { + "epoch": 0.44647533729477845, + "grad_norm": 5.494394393468362, + "learning_rate": 4.526181146485098e-06, + "loss": 0.7972, + "step": 6180 + }, + { + "epoch": 0.446547582494988, + "grad_norm": 7.270948465596182, + "learning_rate": 4.5260097915202985e-06, + "loss": 0.8411, + "step": 6181 + }, + { + "epoch": 0.4466198276951975, + "grad_norm": 6.113987579003652, + "learning_rate": 4.525838408820997e-06, + "loss": 0.8969, + "step": 6182 + }, + { + "epoch": 0.446692072895407, + "grad_norm": 7.83969266600497, + "learning_rate": 4.525666998389541e-06, + "loss": 0.8819, + "step": 6183 + }, + { + "epoch": 0.4467643180956165, + "grad_norm": 5.3890564336435665, + "learning_rate": 4.525495560228276e-06, + "loss": 0.8676, + "step": 6184 + }, + { + "epoch": 0.44683656329582605, + "grad_norm": 7.698344947006188, + "learning_rate": 4.52532409433955e-06, + "loss": 0.9224, + "step": 6185 + }, + { + "epoch": 0.44690880849603554, + "grad_norm": 11.138446011595274, + "learning_rate": 4.525152600725709e-06, + "loss": 1.0183, + "step": 6186 + }, + { + "epoch": 0.4469810536962451, + "grad_norm": 7.188347011912535, + "learning_rate": 4.524981079389103e-06, + "loss": 0.9561, + "step": 6187 + }, + { + "epoch": 0.44705329889645457, + "grad_norm": 4.940694427484992, + "learning_rate": 4.5248095303320775e-06, + "loss": 0.8682, + "step": 6188 + }, + { + "epoch": 0.44712554409666405, + "grad_norm": 4.866996455231836, + "learning_rate": 4.5246379535569815e-06, + "loss": 0.9103, + "step": 6189 + }, + { + "epoch": 0.4471977892968736, + "grad_norm": 7.0189035482077236, + "learning_rate": 4.524466349066164e-06, + "loss": 0.8297, + "step": 6190 + }, + { + "epoch": 0.4472700344970831, + "grad_norm": 6.708580091530727, + "learning_rate": 4.524294716861974e-06, + "loss": 0.9415, + "step": 6191 + }, + { + "epoch": 0.4473422796972926, + "grad_norm": 9.741275111395758, + "learning_rate": 4.524123056946761e-06, + "loss": 0.8434, + "step": 6192 + }, + { + "epoch": 0.4474145248975021, + "grad_norm": 5.254388700548816, + "learning_rate": 4.523951369322874e-06, + "loss": 0.8971, + "step": 6193 + }, + { + "epoch": 0.44748677009771165, + "grad_norm": 6.611444099795203, + "learning_rate": 4.523779653992666e-06, + "loss": 0.9844, + "step": 6194 + }, + { + "epoch": 0.44755901529792114, + "grad_norm": 6.580346536526133, + "learning_rate": 4.523607910958485e-06, + "loss": 0.9253, + "step": 6195 + }, + { + "epoch": 0.4476312604981307, + "grad_norm": 7.007730030607969, + "learning_rate": 4.523436140222683e-06, + "loss": 0.9449, + "step": 6196 + }, + { + "epoch": 0.44770350569834017, + "grad_norm": 9.64507825187869, + "learning_rate": 4.523264341787612e-06, + "loss": 0.8767, + "step": 6197 + }, + { + "epoch": 0.44777575089854965, + "grad_norm": 5.6915667739673665, + "learning_rate": 4.523092515655623e-06, + "loss": 0.9531, + "step": 6198 + }, + { + "epoch": 0.4478479960987592, + "grad_norm": 6.830024786662621, + "learning_rate": 4.522920661829068e-06, + "loss": 0.8257, + "step": 6199 + }, + { + "epoch": 0.4479202412989687, + "grad_norm": 6.4155090410741, + "learning_rate": 4.522748780310299e-06, + "loss": 0.8122, + "step": 6200 + }, + { + "epoch": 0.4479924864991782, + "grad_norm": 6.770783534722718, + "learning_rate": 4.52257687110167e-06, + "loss": 0.9523, + "step": 6201 + }, + { + "epoch": 0.4480647316993877, + "grad_norm": 8.18410322474379, + "learning_rate": 4.5224049342055355e-06, + "loss": 0.9358, + "step": 6202 + }, + { + "epoch": 0.44813697689959725, + "grad_norm": 6.039141142457708, + "learning_rate": 4.5222329696242465e-06, + "loss": 0.8116, + "step": 6203 + }, + { + "epoch": 0.44820922209980674, + "grad_norm": 5.7697828234124815, + "learning_rate": 4.522060977360159e-06, + "loss": 0.8405, + "step": 6204 + }, + { + "epoch": 0.4482814673000163, + "grad_norm": 9.147115584201513, + "learning_rate": 4.521888957415627e-06, + "loss": 0.9484, + "step": 6205 + }, + { + "epoch": 0.44835371250022577, + "grad_norm": 6.220564227746824, + "learning_rate": 4.521716909793004e-06, + "loss": 0.856, + "step": 6206 + }, + { + "epoch": 0.44842595770043525, + "grad_norm": 7.019885020822379, + "learning_rate": 4.5215448344946465e-06, + "loss": 0.8909, + "step": 6207 + }, + { + "epoch": 0.4484982029006448, + "grad_norm": 9.978050939402765, + "learning_rate": 4.52137273152291e-06, + "loss": 0.9277, + "step": 6208 + }, + { + "epoch": 0.4485704481008543, + "grad_norm": 6.49540724255085, + "learning_rate": 4.52120060088015e-06, + "loss": 0.9275, + "step": 6209 + }, + { + "epoch": 0.4486426933010638, + "grad_norm": 6.242095832008394, + "learning_rate": 4.521028442568723e-06, + "loss": 0.8101, + "step": 6210 + }, + { + "epoch": 0.4487149385012733, + "grad_norm": 5.825260389017259, + "learning_rate": 4.5208562565909875e-06, + "loss": 0.98, + "step": 6211 + }, + { + "epoch": 0.44878718370148285, + "grad_norm": 6.366666845085315, + "learning_rate": 4.520684042949297e-06, + "loss": 0.9113, + "step": 6212 + }, + { + "epoch": 0.44885942890169234, + "grad_norm": 7.602848081488628, + "learning_rate": 4.520511801646013e-06, + "loss": 0.8322, + "step": 6213 + }, + { + "epoch": 0.4489316741019019, + "grad_norm": 7.981629976623837, + "learning_rate": 4.520339532683489e-06, + "loss": 0.8635, + "step": 6214 + }, + { + "epoch": 0.44900391930211137, + "grad_norm": 9.095396751402859, + "learning_rate": 4.520167236064087e-06, + "loss": 0.8675, + "step": 6215 + }, + { + "epoch": 0.44907616450232085, + "grad_norm": 7.634751634836248, + "learning_rate": 4.519994911790163e-06, + "loss": 0.8403, + "step": 6216 + }, + { + "epoch": 0.4491484097025304, + "grad_norm": 6.6685022052464635, + "learning_rate": 4.5198225598640775e-06, + "loss": 0.8453, + "step": 6217 + }, + { + "epoch": 0.4492206549027399, + "grad_norm": 8.494879582782593, + "learning_rate": 4.51965018028819e-06, + "loss": 0.9268, + "step": 6218 + }, + { + "epoch": 0.4492929001029494, + "grad_norm": 6.616573805201431, + "learning_rate": 4.519477773064858e-06, + "loss": 0.8786, + "step": 6219 + }, + { + "epoch": 0.4493651453031589, + "grad_norm": 8.678062966784047, + "learning_rate": 4.519305338196446e-06, + "loss": 0.907, + "step": 6220 + }, + { + "epoch": 0.44943739050336845, + "grad_norm": 7.902455501171648, + "learning_rate": 4.51913287568531e-06, + "loss": 0.9519, + "step": 6221 + }, + { + "epoch": 0.44950963570357794, + "grad_norm": 5.139252360552015, + "learning_rate": 4.518960385533813e-06, + "loss": 0.8221, + "step": 6222 + }, + { + "epoch": 0.4495818809037875, + "grad_norm": 8.663607668826854, + "learning_rate": 4.518787867744317e-06, + "loss": 0.9122, + "step": 6223 + }, + { + "epoch": 0.44965412610399697, + "grad_norm": 6.9938016424241125, + "learning_rate": 4.518615322319181e-06, + "loss": 0.9225, + "step": 6224 + }, + { + "epoch": 0.44972637130420645, + "grad_norm": 6.625418883702485, + "learning_rate": 4.518442749260768e-06, + "loss": 0.9358, + "step": 6225 + }, + { + "epoch": 0.449798616504416, + "grad_norm": 5.760148091001953, + "learning_rate": 4.518270148571443e-06, + "loss": 0.8724, + "step": 6226 + }, + { + "epoch": 0.4498708617046255, + "grad_norm": 5.976251332279481, + "learning_rate": 4.5180975202535656e-06, + "loss": 0.8015, + "step": 6227 + }, + { + "epoch": 0.449943106904835, + "grad_norm": 6.018772956273556, + "learning_rate": 4.517924864309501e-06, + "loss": 0.874, + "step": 6228 + }, + { + "epoch": 0.4500153521050445, + "grad_norm": 5.619208681863235, + "learning_rate": 4.517752180741611e-06, + "loss": 0.9214, + "step": 6229 + }, + { + "epoch": 0.45008759730525405, + "grad_norm": 7.489845522419473, + "learning_rate": 4.517579469552261e-06, + "loss": 0.9379, + "step": 6230 + }, + { + "epoch": 0.45015984250546354, + "grad_norm": 6.315925197648792, + "learning_rate": 4.517406730743814e-06, + "loss": 0.8289, + "step": 6231 + }, + { + "epoch": 0.4502320877056731, + "grad_norm": 6.0214745555182665, + "learning_rate": 4.517233964318635e-06, + "loss": 0.8802, + "step": 6232 + }, + { + "epoch": 0.45030433290588257, + "grad_norm": 6.420665172212464, + "learning_rate": 4.5170611702790905e-06, + "loss": 0.9613, + "step": 6233 + }, + { + "epoch": 0.45037657810609205, + "grad_norm": 5.296656421213742, + "learning_rate": 4.516888348627543e-06, + "loss": 0.8017, + "step": 6234 + }, + { + "epoch": 0.4504488233063016, + "grad_norm": 6.6893376472696975, + "learning_rate": 4.516715499366361e-06, + "loss": 0.8587, + "step": 6235 + }, + { + "epoch": 0.4505210685065111, + "grad_norm": 5.308832855116964, + "learning_rate": 4.51654262249791e-06, + "loss": 0.8902, + "step": 6236 + }, + { + "epoch": 0.4505933137067206, + "grad_norm": 6.961353978842765, + "learning_rate": 4.516369718024556e-06, + "loss": 0.8864, + "step": 6237 + }, + { + "epoch": 0.4506655589069301, + "grad_norm": 5.658555714764997, + "learning_rate": 4.5161967859486665e-06, + "loss": 0.7665, + "step": 6238 + }, + { + "epoch": 0.45073780410713965, + "grad_norm": 6.520948545110483, + "learning_rate": 4.516023826272608e-06, + "loss": 0.8931, + "step": 6239 + }, + { + "epoch": 0.45081004930734914, + "grad_norm": 6.754400443711041, + "learning_rate": 4.515850838998748e-06, + "loss": 0.8645, + "step": 6240 + }, + { + "epoch": 0.4508822945075587, + "grad_norm": 6.112660024845624, + "learning_rate": 4.515677824129456e-06, + "loss": 0.8089, + "step": 6241 + }, + { + "epoch": 0.45095453970776816, + "grad_norm": 6.817880168368639, + "learning_rate": 4.515504781667101e-06, + "loss": 0.9194, + "step": 6242 + }, + { + "epoch": 0.45102678490797765, + "grad_norm": 6.188919983349496, + "learning_rate": 4.515331711614048e-06, + "loss": 0.8037, + "step": 6243 + }, + { + "epoch": 0.4510990301081872, + "grad_norm": 5.545496293776338, + "learning_rate": 4.51515861397267e-06, + "loss": 0.8148, + "step": 6244 + }, + { + "epoch": 0.4511712753083967, + "grad_norm": 7.064053254943077, + "learning_rate": 4.514985488745335e-06, + "loss": 0.9026, + "step": 6245 + }, + { + "epoch": 0.4512435205086062, + "grad_norm": 8.226725258806612, + "learning_rate": 4.514812335934413e-06, + "loss": 0.976, + "step": 6246 + }, + { + "epoch": 0.4513157657088157, + "grad_norm": 5.311038006838494, + "learning_rate": 4.514639155542275e-06, + "loss": 0.8527, + "step": 6247 + }, + { + "epoch": 0.45138801090902525, + "grad_norm": 7.007440155241918, + "learning_rate": 4.514465947571291e-06, + "loss": 0.9033, + "step": 6248 + }, + { + "epoch": 0.45146025610923474, + "grad_norm": 6.585648463593783, + "learning_rate": 4.514292712023832e-06, + "loss": 0.7723, + "step": 6249 + }, + { + "epoch": 0.4515325013094443, + "grad_norm": 6.201576942971025, + "learning_rate": 4.51411944890227e-06, + "loss": 0.9084, + "step": 6250 + }, + { + "epoch": 0.45160474650965376, + "grad_norm": 6.106451481410805, + "learning_rate": 4.5139461582089775e-06, + "loss": 0.8142, + "step": 6251 + }, + { + "epoch": 0.45167699170986325, + "grad_norm": 5.414363880218708, + "learning_rate": 4.513772839946324e-06, + "loss": 0.8422, + "step": 6252 + }, + { + "epoch": 0.4517492369100728, + "grad_norm": 6.9124631249546225, + "learning_rate": 4.513599494116685e-06, + "loss": 0.9021, + "step": 6253 + }, + { + "epoch": 0.4518214821102823, + "grad_norm": 6.403858464141737, + "learning_rate": 4.5134261207224324e-06, + "loss": 0.9085, + "step": 6254 + }, + { + "epoch": 0.4518937273104918, + "grad_norm": 8.270666881458986, + "learning_rate": 4.5132527197659395e-06, + "loss": 0.9188, + "step": 6255 + }, + { + "epoch": 0.4519659725107013, + "grad_norm": 6.4595584158654775, + "learning_rate": 4.51307929124958e-06, + "loss": 0.8419, + "step": 6256 + }, + { + "epoch": 0.45203821771091085, + "grad_norm": 7.273547637793941, + "learning_rate": 4.5129058351757285e-06, + "loss": 0.849, + "step": 6257 + }, + { + "epoch": 0.45211046291112034, + "grad_norm": 6.910324620793592, + "learning_rate": 4.512732351546758e-06, + "loss": 0.8812, + "step": 6258 + }, + { + "epoch": 0.4521827081113299, + "grad_norm": 7.254166030956798, + "learning_rate": 4.512558840365045e-06, + "loss": 0.9098, + "step": 6259 + }, + { + "epoch": 0.45225495331153936, + "grad_norm": 6.91431518782602, + "learning_rate": 4.512385301632964e-06, + "loss": 0.9458, + "step": 6260 + }, + { + "epoch": 0.45232719851174885, + "grad_norm": 6.5694755175310195, + "learning_rate": 4.512211735352891e-06, + "loss": 0.8922, + "step": 6261 + }, + { + "epoch": 0.4523994437119584, + "grad_norm": 6.9404502300464594, + "learning_rate": 4.512038141527202e-06, + "loss": 0.9862, + "step": 6262 + }, + { + "epoch": 0.4524716889121679, + "grad_norm": 5.948895892113517, + "learning_rate": 4.511864520158272e-06, + "loss": 0.9084, + "step": 6263 + }, + { + "epoch": 0.4525439341123774, + "grad_norm": 7.369358038304864, + "learning_rate": 4.51169087124848e-06, + "loss": 0.9262, + "step": 6264 + }, + { + "epoch": 0.4526161793125869, + "grad_norm": 7.038122184176297, + "learning_rate": 4.511517194800202e-06, + "loss": 0.7959, + "step": 6265 + }, + { + "epoch": 0.45268842451279645, + "grad_norm": 6.359724105512926, + "learning_rate": 4.511343490815814e-06, + "loss": 0.8856, + "step": 6266 + }, + { + "epoch": 0.45276066971300594, + "grad_norm": 6.0978792725270035, + "learning_rate": 4.511169759297696e-06, + "loss": 0.9006, + "step": 6267 + }, + { + "epoch": 0.4528329149132155, + "grad_norm": 6.823239601782285, + "learning_rate": 4.510996000248226e-06, + "loss": 0.994, + "step": 6268 + }, + { + "epoch": 0.45290516011342496, + "grad_norm": 6.100651202853426, + "learning_rate": 4.510822213669782e-06, + "loss": 0.9572, + "step": 6269 + }, + { + "epoch": 0.45297740531363445, + "grad_norm": 7.293979241724707, + "learning_rate": 4.5106483995647435e-06, + "loss": 0.97, + "step": 6270 + }, + { + "epoch": 0.453049650513844, + "grad_norm": 8.133322810598768, + "learning_rate": 4.510474557935489e-06, + "loss": 1.0196, + "step": 6271 + }, + { + "epoch": 0.4531218957140535, + "grad_norm": 6.927093398772841, + "learning_rate": 4.510300688784399e-06, + "loss": 0.8924, + "step": 6272 + }, + { + "epoch": 0.453194140914263, + "grad_norm": 7.06416800727515, + "learning_rate": 4.510126792113853e-06, + "loss": 0.8992, + "step": 6273 + }, + { + "epoch": 0.4532663861144725, + "grad_norm": 6.24845104096823, + "learning_rate": 4.5099528679262325e-06, + "loss": 0.9025, + "step": 6274 + }, + { + "epoch": 0.45333863131468205, + "grad_norm": 6.346775179237189, + "learning_rate": 4.509778916223918e-06, + "loss": 1.0398, + "step": 6275 + }, + { + "epoch": 0.45341087651489154, + "grad_norm": 6.214681089755393, + "learning_rate": 4.509604937009291e-06, + "loss": 0.8336, + "step": 6276 + }, + { + "epoch": 0.4534831217151011, + "grad_norm": 7.174896866858006, + "learning_rate": 4.5094309302847315e-06, + "loss": 0.834, + "step": 6277 + }, + { + "epoch": 0.45355536691531056, + "grad_norm": 5.822690500997813, + "learning_rate": 4.509256896052624e-06, + "loss": 0.8548, + "step": 6278 + }, + { + "epoch": 0.45362761211552005, + "grad_norm": 5.841115102518452, + "learning_rate": 4.5090828343153495e-06, + "loss": 0.8776, + "step": 6279 + }, + { + "epoch": 0.4536998573157296, + "grad_norm": 5.691177017947111, + "learning_rate": 4.50890874507529e-06, + "loss": 0.817, + "step": 6280 + }, + { + "epoch": 0.4537721025159391, + "grad_norm": 7.0001465918314105, + "learning_rate": 4.508734628334831e-06, + "loss": 0.894, + "step": 6281 + }, + { + "epoch": 0.4538443477161486, + "grad_norm": 6.202236313663592, + "learning_rate": 4.508560484096353e-06, + "loss": 0.9569, + "step": 6282 + }, + { + "epoch": 0.4539165929163581, + "grad_norm": 6.1121978872329805, + "learning_rate": 4.508386312362243e-06, + "loss": 0.7981, + "step": 6283 + }, + { + "epoch": 0.45398883811656765, + "grad_norm": 6.759231471329253, + "learning_rate": 4.508212113134883e-06, + "loss": 0.9524, + "step": 6284 + }, + { + "epoch": 0.45406108331677714, + "grad_norm": 6.008154414136785, + "learning_rate": 4.508037886416658e-06, + "loss": 0.8478, + "step": 6285 + }, + { + "epoch": 0.4541333285169867, + "grad_norm": 6.61759592572572, + "learning_rate": 4.507863632209955e-06, + "loss": 0.8172, + "step": 6286 + }, + { + "epoch": 0.45420557371719616, + "grad_norm": 7.391817476554021, + "learning_rate": 4.507689350517157e-06, + "loss": 0.982, + "step": 6287 + }, + { + "epoch": 0.45427781891740565, + "grad_norm": 7.31391775619168, + "learning_rate": 4.50751504134065e-06, + "loss": 0.9557, + "step": 6288 + }, + { + "epoch": 0.4543500641176152, + "grad_norm": 4.948934135894405, + "learning_rate": 4.507340704682822e-06, + "loss": 0.8358, + "step": 6289 + }, + { + "epoch": 0.4544223093178247, + "grad_norm": 5.998163896001094, + "learning_rate": 4.507166340546058e-06, + "loss": 0.7975, + "step": 6290 + }, + { + "epoch": 0.4544945545180342, + "grad_norm": 7.852054895552098, + "learning_rate": 4.5069919489327444e-06, + "loss": 0.9864, + "step": 6291 + }, + { + "epoch": 0.4545667997182437, + "grad_norm": 6.369255319900288, + "learning_rate": 4.5068175298452704e-06, + "loss": 0.8543, + "step": 6292 + }, + { + "epoch": 0.45463904491845325, + "grad_norm": 6.317533695567354, + "learning_rate": 4.506643083286022e-06, + "loss": 0.9242, + "step": 6293 + }, + { + "epoch": 0.45471129011866274, + "grad_norm": 6.724238651104802, + "learning_rate": 4.506468609257389e-06, + "loss": 0.9896, + "step": 6294 + }, + { + "epoch": 0.4547835353188723, + "grad_norm": 5.941676548133255, + "learning_rate": 4.506294107761757e-06, + "loss": 0.9677, + "step": 6295 + }, + { + "epoch": 0.45485578051908176, + "grad_norm": 5.3921985927877785, + "learning_rate": 4.506119578801518e-06, + "loss": 0.8729, + "step": 6296 + }, + { + "epoch": 0.45492802571929125, + "grad_norm": 7.7066226470024475, + "learning_rate": 4.505945022379058e-06, + "loss": 0.8978, + "step": 6297 + }, + { + "epoch": 0.4550002709195008, + "grad_norm": 5.884742288578024, + "learning_rate": 4.505770438496769e-06, + "loss": 0.9168, + "step": 6298 + }, + { + "epoch": 0.4550725161197103, + "grad_norm": 7.202571293200962, + "learning_rate": 4.50559582715704e-06, + "loss": 0.8641, + "step": 6299 + }, + { + "epoch": 0.4551447613199198, + "grad_norm": 5.2674852475474365, + "learning_rate": 4.505421188362261e-06, + "loss": 0.895, + "step": 6300 + }, + { + "epoch": 0.4552170065201293, + "grad_norm": 6.136913624272963, + "learning_rate": 4.505246522114824e-06, + "loss": 0.8583, + "step": 6301 + }, + { + "epoch": 0.45528925172033885, + "grad_norm": 6.443697418383155, + "learning_rate": 4.505071828417119e-06, + "loss": 0.902, + "step": 6302 + }, + { + "epoch": 0.45536149692054834, + "grad_norm": 6.360966813599245, + "learning_rate": 4.504897107271537e-06, + "loss": 0.7956, + "step": 6303 + }, + { + "epoch": 0.4554337421207579, + "grad_norm": 6.338452763013627, + "learning_rate": 4.50472235868047e-06, + "loss": 0.8242, + "step": 6304 + }, + { + "epoch": 0.45550598732096736, + "grad_norm": 6.8803102272529975, + "learning_rate": 4.5045475826463105e-06, + "loss": 0.8627, + "step": 6305 + }, + { + "epoch": 0.45557823252117685, + "grad_norm": 6.185288467526286, + "learning_rate": 4.504372779171452e-06, + "loss": 0.8986, + "step": 6306 + }, + { + "epoch": 0.4556504777213864, + "grad_norm": 5.782616036352142, + "learning_rate": 4.5041979482582855e-06, + "loss": 0.9138, + "step": 6307 + }, + { + "epoch": 0.4557227229215959, + "grad_norm": 5.702974500368686, + "learning_rate": 4.504023089909206e-06, + "loss": 0.8729, + "step": 6308 + }, + { + "epoch": 0.4557949681218054, + "grad_norm": 7.443937705081156, + "learning_rate": 4.503848204126606e-06, + "loss": 0.8947, + "step": 6309 + }, + { + "epoch": 0.4558672133220149, + "grad_norm": 5.517995179204989, + "learning_rate": 4.50367329091288e-06, + "loss": 0.8739, + "step": 6310 + }, + { + "epoch": 0.45593945852222445, + "grad_norm": 7.073479144540681, + "learning_rate": 4.503498350270422e-06, + "loss": 0.8834, + "step": 6311 + }, + { + "epoch": 0.45601170372243394, + "grad_norm": 5.626161074017179, + "learning_rate": 4.503323382201628e-06, + "loss": 0.8714, + "step": 6312 + }, + { + "epoch": 0.4560839489226435, + "grad_norm": 7.20345333782585, + "learning_rate": 4.503148386708892e-06, + "loss": 0.9213, + "step": 6313 + }, + { + "epoch": 0.45615619412285296, + "grad_norm": 8.431602847306497, + "learning_rate": 4.502973363794609e-06, + "loss": 0.8787, + "step": 6314 + }, + { + "epoch": 0.45622843932306245, + "grad_norm": 5.939985859144442, + "learning_rate": 4.502798313461178e-06, + "loss": 0.8513, + "step": 6315 + }, + { + "epoch": 0.456300684523272, + "grad_norm": 6.19203035596558, + "learning_rate": 4.502623235710991e-06, + "loss": 0.8283, + "step": 6316 + }, + { + "epoch": 0.4563729297234815, + "grad_norm": 7.220985495814385, + "learning_rate": 4.502448130546448e-06, + "loss": 0.9485, + "step": 6317 + }, + { + "epoch": 0.456445174923691, + "grad_norm": 9.072817907368881, + "learning_rate": 4.5022729979699445e-06, + "loss": 0.905, + "step": 6318 + }, + { + "epoch": 0.4565174201239005, + "grad_norm": 6.751319191205019, + "learning_rate": 4.502097837983879e-06, + "loss": 0.9533, + "step": 6319 + }, + { + "epoch": 0.45658966532411005, + "grad_norm": 5.957036052764457, + "learning_rate": 4.501922650590648e-06, + "loss": 0.9422, + "step": 6320 + }, + { + "epoch": 0.45666191052431954, + "grad_norm": 6.480449619000544, + "learning_rate": 4.50174743579265e-06, + "loss": 0.8369, + "step": 6321 + }, + { + "epoch": 0.4567341557245291, + "grad_norm": 6.1629512350841615, + "learning_rate": 4.501572193592284e-06, + "loss": 0.9178, + "step": 6322 + }, + { + "epoch": 0.45680640092473856, + "grad_norm": 6.407944171467061, + "learning_rate": 4.501396923991949e-06, + "loss": 0.9337, + "step": 6323 + }, + { + "epoch": 0.45687864612494805, + "grad_norm": 5.1932126498288875, + "learning_rate": 4.5012216269940445e-06, + "loss": 0.9388, + "step": 6324 + }, + { + "epoch": 0.4569508913251576, + "grad_norm": 7.098595934373357, + "learning_rate": 4.501046302600969e-06, + "loss": 0.8906, + "step": 6325 + }, + { + "epoch": 0.4570231365253671, + "grad_norm": 5.356492048423047, + "learning_rate": 4.500870950815124e-06, + "loss": 0.9426, + "step": 6326 + }, + { + "epoch": 0.4570953817255766, + "grad_norm": 7.334830853696883, + "learning_rate": 4.5006955716389086e-06, + "loss": 0.9633, + "step": 6327 + }, + { + "epoch": 0.4571676269257861, + "grad_norm": 7.012280182954601, + "learning_rate": 4.500520165074725e-06, + "loss": 0.9877, + "step": 6328 + }, + { + "epoch": 0.45723987212599565, + "grad_norm": 6.555219408170699, + "learning_rate": 4.5003447311249734e-06, + "loss": 0.7926, + "step": 6329 + }, + { + "epoch": 0.45731211732620514, + "grad_norm": 6.170736396160383, + "learning_rate": 4.500169269792055e-06, + "loss": 0.9285, + "step": 6330 + }, + { + "epoch": 0.4573843625264147, + "grad_norm": 6.230943849440694, + "learning_rate": 4.499993781078374e-06, + "loss": 0.8224, + "step": 6331 + }, + { + "epoch": 0.45745660772662416, + "grad_norm": 6.2750304445538845, + "learning_rate": 4.499818264986329e-06, + "loss": 0.8951, + "step": 6332 + }, + { + "epoch": 0.45752885292683365, + "grad_norm": 6.3127396037640935, + "learning_rate": 4.499642721518326e-06, + "loss": 0.9234, + "step": 6333 + }, + { + "epoch": 0.4576010981270432, + "grad_norm": 5.5155715723664995, + "learning_rate": 4.499467150676766e-06, + "loss": 0.7933, + "step": 6334 + }, + { + "epoch": 0.4576733433272527, + "grad_norm": 6.2866090905528615, + "learning_rate": 4.499291552464053e-06, + "loss": 0.8262, + "step": 6335 + }, + { + "epoch": 0.4577455885274622, + "grad_norm": 7.300993122845274, + "learning_rate": 4.4991159268825925e-06, + "loss": 0.9554, + "step": 6336 + }, + { + "epoch": 0.4578178337276717, + "grad_norm": 6.020685460299785, + "learning_rate": 4.498940273934786e-06, + "loss": 0.9465, + "step": 6337 + }, + { + "epoch": 0.45789007892788125, + "grad_norm": 6.460230132453318, + "learning_rate": 4.498764593623039e-06, + "loss": 0.801, + "step": 6338 + }, + { + "epoch": 0.45796232412809074, + "grad_norm": 8.319210699495335, + "learning_rate": 4.4985888859497575e-06, + "loss": 0.939, + "step": 6339 + }, + { + "epoch": 0.4580345693283003, + "grad_norm": 6.52678531470045, + "learning_rate": 4.498413150917345e-06, + "loss": 0.9729, + "step": 6340 + }, + { + "epoch": 0.45810681452850976, + "grad_norm": 6.6748417639103765, + "learning_rate": 4.498237388528209e-06, + "loss": 0.8403, + "step": 6341 + }, + { + "epoch": 0.45817905972871925, + "grad_norm": 7.5604177083063515, + "learning_rate": 4.498061598784754e-06, + "loss": 0.9156, + "step": 6342 + }, + { + "epoch": 0.4582513049289288, + "grad_norm": 6.808533239834383, + "learning_rate": 4.497885781689388e-06, + "loss": 0.9234, + "step": 6343 + }, + { + "epoch": 0.4583235501291383, + "grad_norm": 6.2837449617746985, + "learning_rate": 4.497709937244516e-06, + "loss": 0.8794, + "step": 6344 + }, + { + "epoch": 0.4583957953293478, + "grad_norm": 7.744182987746679, + "learning_rate": 4.497534065452547e-06, + "loss": 1.0452, + "step": 6345 + }, + { + "epoch": 0.4584680405295573, + "grad_norm": 6.481778353396434, + "learning_rate": 4.497358166315887e-06, + "loss": 0.9663, + "step": 6346 + }, + { + "epoch": 0.45854028572976685, + "grad_norm": 6.964426658238172, + "learning_rate": 4.497182239836944e-06, + "loss": 0.876, + "step": 6347 + }, + { + "epoch": 0.45861253092997634, + "grad_norm": 7.657868879788246, + "learning_rate": 4.497006286018128e-06, + "loss": 0.8959, + "step": 6348 + }, + { + "epoch": 0.4586847761301859, + "grad_norm": 6.356695186962418, + "learning_rate": 4.4968303048618455e-06, + "loss": 0.9085, + "step": 6349 + }, + { + "epoch": 0.45875702133039536, + "grad_norm": 6.071354014475832, + "learning_rate": 4.496654296370507e-06, + "loss": 0.8736, + "step": 6350 + }, + { + "epoch": 0.45882926653060485, + "grad_norm": 6.814824538907348, + "learning_rate": 4.496478260546522e-06, + "loss": 0.9647, + "step": 6351 + }, + { + "epoch": 0.4589015117308144, + "grad_norm": 6.24144342258379, + "learning_rate": 4.496302197392299e-06, + "loss": 0.9202, + "step": 6352 + }, + { + "epoch": 0.4589737569310239, + "grad_norm": 5.702220270168585, + "learning_rate": 4.496126106910249e-06, + "loss": 1.0396, + "step": 6353 + }, + { + "epoch": 0.4590460021312334, + "grad_norm": 7.16559986188283, + "learning_rate": 4.495949989102783e-06, + "loss": 0.8792, + "step": 6354 + }, + { + "epoch": 0.4591182473314429, + "grad_norm": 5.167484136740702, + "learning_rate": 4.495773843972311e-06, + "loss": 0.8392, + "step": 6355 + }, + { + "epoch": 0.45919049253165245, + "grad_norm": 6.620932427975709, + "learning_rate": 4.495597671521245e-06, + "loss": 0.9156, + "step": 6356 + }, + { + "epoch": 0.45926273773186194, + "grad_norm": 5.904500066281212, + "learning_rate": 4.4954214717519965e-06, + "loss": 0.9036, + "step": 6357 + }, + { + "epoch": 0.4593349829320714, + "grad_norm": 6.098596767932915, + "learning_rate": 4.495245244666977e-06, + "loss": 0.9322, + "step": 6358 + }, + { + "epoch": 0.45940722813228096, + "grad_norm": 5.2860639854135085, + "learning_rate": 4.495068990268599e-06, + "loss": 0.9454, + "step": 6359 + }, + { + "epoch": 0.45947947333249045, + "grad_norm": 6.1991799827474035, + "learning_rate": 4.4948927085592765e-06, + "loss": 0.8238, + "step": 6360 + }, + { + "epoch": 0.4595517185327, + "grad_norm": 6.087715173321937, + "learning_rate": 4.494716399541421e-06, + "loss": 0.8949, + "step": 6361 + }, + { + "epoch": 0.4596239637329095, + "grad_norm": 5.260935520481047, + "learning_rate": 4.494540063217447e-06, + "loss": 0.8627, + "step": 6362 + }, + { + "epoch": 0.459696208933119, + "grad_norm": 6.717182491111618, + "learning_rate": 4.494363699589768e-06, + "loss": 1.0549, + "step": 6363 + }, + { + "epoch": 0.4597684541333285, + "grad_norm": 7.748860675373988, + "learning_rate": 4.494187308660799e-06, + "loss": 0.885, + "step": 6364 + }, + { + "epoch": 0.45984069933353805, + "grad_norm": 7.292919843251254, + "learning_rate": 4.494010890432954e-06, + "loss": 0.9774, + "step": 6365 + }, + { + "epoch": 0.45991294453374754, + "grad_norm": 6.585715076315271, + "learning_rate": 4.493834444908648e-06, + "loss": 0.9206, + "step": 6366 + }, + { + "epoch": 0.459985189733957, + "grad_norm": 6.306671473290922, + "learning_rate": 4.4936579720902965e-06, + "loss": 0.9827, + "step": 6367 + }, + { + "epoch": 0.46005743493416656, + "grad_norm": 6.901270387326117, + "learning_rate": 4.493481471980316e-06, + "loss": 0.8804, + "step": 6368 + }, + { + "epoch": 0.46012968013437605, + "grad_norm": 5.4949020688022765, + "learning_rate": 4.493304944581121e-06, + "loss": 0.8008, + "step": 6369 + }, + { + "epoch": 0.4602019253345856, + "grad_norm": 8.590623099450726, + "learning_rate": 4.49312838989513e-06, + "loss": 0.9922, + "step": 6370 + }, + { + "epoch": 0.4602741705347951, + "grad_norm": 6.479610153998675, + "learning_rate": 4.492951807924758e-06, + "loss": 0.8164, + "step": 6371 + }, + { + "epoch": 0.4603464157350046, + "grad_norm": 6.351993410102251, + "learning_rate": 4.4927751986724235e-06, + "loss": 0.8335, + "step": 6372 + }, + { + "epoch": 0.4604186609352141, + "grad_norm": 5.764157653833902, + "learning_rate": 4.492598562140544e-06, + "loss": 0.8227, + "step": 6373 + }, + { + "epoch": 0.46049090613542365, + "grad_norm": 5.579403856215023, + "learning_rate": 4.492421898331536e-06, + "loss": 0.7902, + "step": 6374 + }, + { + "epoch": 0.46056315133563314, + "grad_norm": 5.45852000764201, + "learning_rate": 4.492245207247821e-06, + "loss": 0.8584, + "step": 6375 + }, + { + "epoch": 0.4606353965358426, + "grad_norm": 5.702090485688773, + "learning_rate": 4.492068488891815e-06, + "loss": 0.8968, + "step": 6376 + }, + { + "epoch": 0.46070764173605216, + "grad_norm": 7.036425775150974, + "learning_rate": 4.491891743265939e-06, + "loss": 0.9068, + "step": 6377 + }, + { + "epoch": 0.46077988693626165, + "grad_norm": 6.6534495305456085, + "learning_rate": 4.491714970372611e-06, + "loss": 0.8857, + "step": 6378 + }, + { + "epoch": 0.4608521321364712, + "grad_norm": 6.686885342025282, + "learning_rate": 4.491538170214251e-06, + "loss": 0.8231, + "step": 6379 + }, + { + "epoch": 0.4609243773366807, + "grad_norm": 9.5448829083382, + "learning_rate": 4.4913613427932816e-06, + "loss": 1.0138, + "step": 6380 + }, + { + "epoch": 0.4609966225368902, + "grad_norm": 7.15730404483319, + "learning_rate": 4.49118448811212e-06, + "loss": 0.9243, + "step": 6381 + }, + { + "epoch": 0.4610688677370997, + "grad_norm": 7.0528582358965854, + "learning_rate": 4.491007606173189e-06, + "loss": 0.9342, + "step": 6382 + }, + { + "epoch": 0.46114111293730925, + "grad_norm": 7.930171436895591, + "learning_rate": 4.49083069697891e-06, + "loss": 0.8767, + "step": 6383 + }, + { + "epoch": 0.46121335813751874, + "grad_norm": 7.648175649086346, + "learning_rate": 4.490653760531705e-06, + "loss": 0.9223, + "step": 6384 + }, + { + "epoch": 0.4612856033377282, + "grad_norm": 5.786379626196919, + "learning_rate": 4.490476796833995e-06, + "loss": 0.9062, + "step": 6385 + }, + { + "epoch": 0.46135784853793776, + "grad_norm": 6.030544417621049, + "learning_rate": 4.490299805888204e-06, + "loss": 0.8931, + "step": 6386 + }, + { + "epoch": 0.46143009373814725, + "grad_norm": 6.929865304758083, + "learning_rate": 4.4901227876967525e-06, + "loss": 0.9567, + "step": 6387 + }, + { + "epoch": 0.4615023389383568, + "grad_norm": 6.906259942371237, + "learning_rate": 4.489945742262066e-06, + "loss": 0.8744, + "step": 6388 + }, + { + "epoch": 0.4615745841385663, + "grad_norm": 6.873935408962743, + "learning_rate": 4.489768669586568e-06, + "loss": 0.767, + "step": 6389 + }, + { + "epoch": 0.4616468293387758, + "grad_norm": 6.2206298440542405, + "learning_rate": 4.489591569672682e-06, + "loss": 0.8997, + "step": 6390 + }, + { + "epoch": 0.4617190745389853, + "grad_norm": 5.644646476440676, + "learning_rate": 4.489414442522831e-06, + "loss": 0.9193, + "step": 6391 + }, + { + "epoch": 0.46179131973919485, + "grad_norm": 5.7766454557076, + "learning_rate": 4.489237288139442e-06, + "loss": 0.8163, + "step": 6392 + }, + { + "epoch": 0.46186356493940434, + "grad_norm": 5.702023250814752, + "learning_rate": 4.489060106524938e-06, + "loss": 0.808, + "step": 6393 + }, + { + "epoch": 0.4619358101396138, + "grad_norm": 7.2897273799956475, + "learning_rate": 4.488882897681747e-06, + "loss": 0.9502, + "step": 6394 + }, + { + "epoch": 0.46200805533982336, + "grad_norm": 6.579499814580026, + "learning_rate": 4.488705661612293e-06, + "loss": 0.8737, + "step": 6395 + }, + { + "epoch": 0.46208030054003285, + "grad_norm": 6.793499970243845, + "learning_rate": 4.488528398319002e-06, + "loss": 0.8723, + "step": 6396 + }, + { + "epoch": 0.4621525457402424, + "grad_norm": 6.57270557127058, + "learning_rate": 4.488351107804302e-06, + "loss": 0.9206, + "step": 6397 + }, + { + "epoch": 0.4622247909404519, + "grad_norm": 6.865054896302581, + "learning_rate": 4.488173790070618e-06, + "loss": 0.8634, + "step": 6398 + }, + { + "epoch": 0.4622970361406614, + "grad_norm": 7.632273722348509, + "learning_rate": 4.487996445120379e-06, + "loss": 0.8621, + "step": 6399 + }, + { + "epoch": 0.4623692813408709, + "grad_norm": 9.034359512131095, + "learning_rate": 4.487819072956012e-06, + "loss": 0.9834, + "step": 6400 + }, + { + "epoch": 0.46244152654108045, + "grad_norm": 7.79886236453165, + "learning_rate": 4.487641673579946e-06, + "loss": 0.8949, + "step": 6401 + }, + { + "epoch": 0.46251377174128994, + "grad_norm": 6.138755766210099, + "learning_rate": 4.48746424699461e-06, + "loss": 0.8522, + "step": 6402 + }, + { + "epoch": 0.4625860169414994, + "grad_norm": 8.427315902487132, + "learning_rate": 4.487286793202429e-06, + "loss": 0.9731, + "step": 6403 + }, + { + "epoch": 0.46265826214170896, + "grad_norm": 6.6074348650293535, + "learning_rate": 4.487109312205836e-06, + "loss": 0.7772, + "step": 6404 + }, + { + "epoch": 0.46273050734191845, + "grad_norm": 5.269148284592068, + "learning_rate": 4.48693180400726e-06, + "loss": 0.8452, + "step": 6405 + }, + { + "epoch": 0.462802752542128, + "grad_norm": 6.717464732172289, + "learning_rate": 4.486754268609129e-06, + "loss": 0.8279, + "step": 6406 + }, + { + "epoch": 0.4628749977423375, + "grad_norm": 5.871713246487316, + "learning_rate": 4.486576706013876e-06, + "loss": 0.9118, + "step": 6407 + }, + { + "epoch": 0.462947242942547, + "grad_norm": 7.467181052927354, + "learning_rate": 4.48639911622393e-06, + "loss": 0.869, + "step": 6408 + }, + { + "epoch": 0.4630194881427565, + "grad_norm": 6.585908539008091, + "learning_rate": 4.486221499241722e-06, + "loss": 0.9367, + "step": 6409 + }, + { + "epoch": 0.46309173334296605, + "grad_norm": 8.004850347717655, + "learning_rate": 4.486043855069685e-06, + "loss": 0.8752, + "step": 6410 + }, + { + "epoch": 0.46316397854317554, + "grad_norm": 6.725528296385238, + "learning_rate": 4.485866183710248e-06, + "loss": 0.8256, + "step": 6411 + }, + { + "epoch": 0.463236223743385, + "grad_norm": 8.467122840106498, + "learning_rate": 4.485688485165845e-06, + "loss": 0.925, + "step": 6412 + }, + { + "epoch": 0.46330846894359456, + "grad_norm": 7.883854081722008, + "learning_rate": 4.48551075943891e-06, + "loss": 0.9308, + "step": 6413 + }, + { + "epoch": 0.46338071414380405, + "grad_norm": 7.4062355781768865, + "learning_rate": 4.485333006531874e-06, + "loss": 0.9383, + "step": 6414 + }, + { + "epoch": 0.4634529593440136, + "grad_norm": 5.740704860142156, + "learning_rate": 4.48515522644717e-06, + "loss": 0.831, + "step": 6415 + }, + { + "epoch": 0.4635252045442231, + "grad_norm": 5.751063372927966, + "learning_rate": 4.484977419187232e-06, + "loss": 0.8431, + "step": 6416 + }, + { + "epoch": 0.4635974497444326, + "grad_norm": 6.581392541339798, + "learning_rate": 4.4847995847544954e-06, + "loss": 0.893, + "step": 6417 + }, + { + "epoch": 0.4636696949446421, + "grad_norm": 6.705619152364138, + "learning_rate": 4.484621723151393e-06, + "loss": 1.0039, + "step": 6418 + }, + { + "epoch": 0.46374194014485165, + "grad_norm": 6.587296785698295, + "learning_rate": 4.4844438343803595e-06, + "loss": 0.8979, + "step": 6419 + }, + { + "epoch": 0.46381418534506114, + "grad_norm": 7.334524260254892, + "learning_rate": 4.484265918443832e-06, + "loss": 0.9199, + "step": 6420 + }, + { + "epoch": 0.4638864305452706, + "grad_norm": 5.911765777614587, + "learning_rate": 4.484087975344244e-06, + "loss": 0.9327, + "step": 6421 + }, + { + "epoch": 0.46395867574548016, + "grad_norm": 5.9529837043394345, + "learning_rate": 4.483910005084033e-06, + "loss": 0.7932, + "step": 6422 + }, + { + "epoch": 0.46403092094568965, + "grad_norm": 5.17805006287244, + "learning_rate": 4.483732007665633e-06, + "loss": 0.8756, + "step": 6423 + }, + { + "epoch": 0.4641031661458992, + "grad_norm": 8.076603818180006, + "learning_rate": 4.483553983091483e-06, + "loss": 0.8805, + "step": 6424 + }, + { + "epoch": 0.4641754113461087, + "grad_norm": 5.633307701067573, + "learning_rate": 4.483375931364019e-06, + "loss": 0.871, + "step": 6425 + }, + { + "epoch": 0.4642476565463182, + "grad_norm": 7.134897952937248, + "learning_rate": 4.4831978524856785e-06, + "loss": 0.8718, + "step": 6426 + }, + { + "epoch": 0.4643199017465277, + "grad_norm": 5.833293515024071, + "learning_rate": 4.483019746458899e-06, + "loss": 0.8519, + "step": 6427 + }, + { + "epoch": 0.46439214694673725, + "grad_norm": 5.644610658502851, + "learning_rate": 4.482841613286119e-06, + "loss": 0.8503, + "step": 6428 + }, + { + "epoch": 0.46446439214694674, + "grad_norm": 5.905762667456775, + "learning_rate": 4.482663452969778e-06, + "loss": 0.936, + "step": 6429 + }, + { + "epoch": 0.4645366373471562, + "grad_norm": 5.587230612727544, + "learning_rate": 4.482485265512312e-06, + "loss": 0.9204, + "step": 6430 + }, + { + "epoch": 0.46460888254736576, + "grad_norm": 5.29881228688197, + "learning_rate": 4.482307050916163e-06, + "loss": 0.8683, + "step": 6431 + }, + { + "epoch": 0.46468112774757525, + "grad_norm": 6.55732130347461, + "learning_rate": 4.4821288091837685e-06, + "loss": 0.7811, + "step": 6432 + }, + { + "epoch": 0.4647533729477848, + "grad_norm": 6.807106332705246, + "learning_rate": 4.4819505403175715e-06, + "loss": 0.849, + "step": 6433 + }, + { + "epoch": 0.4648256181479943, + "grad_norm": 5.707311311360645, + "learning_rate": 4.48177224432001e-06, + "loss": 0.8602, + "step": 6434 + }, + { + "epoch": 0.4648978633482038, + "grad_norm": 7.012449637655467, + "learning_rate": 4.481593921193524e-06, + "loss": 0.8725, + "step": 6435 + }, + { + "epoch": 0.4649701085484133, + "grad_norm": 9.583902977861499, + "learning_rate": 4.481415570940557e-06, + "loss": 1.0169, + "step": 6436 + }, + { + "epoch": 0.46504235374862285, + "grad_norm": 5.79543597104501, + "learning_rate": 4.481237193563548e-06, + "loss": 0.9231, + "step": 6437 + }, + { + "epoch": 0.46511459894883234, + "grad_norm": 5.716579004325516, + "learning_rate": 4.481058789064942e-06, + "loss": 0.9228, + "step": 6438 + }, + { + "epoch": 0.4651868441490418, + "grad_norm": 6.848075440410161, + "learning_rate": 4.480880357447178e-06, + "loss": 0.9324, + "step": 6439 + }, + { + "epoch": 0.46525908934925136, + "grad_norm": 6.505740052099244, + "learning_rate": 4.4807018987127e-06, + "loss": 0.883, + "step": 6440 + }, + { + "epoch": 0.46533133454946085, + "grad_norm": 6.05435325438295, + "learning_rate": 4.480523412863952e-06, + "loss": 0.8924, + "step": 6441 + }, + { + "epoch": 0.4654035797496704, + "grad_norm": 7.646446459835756, + "learning_rate": 4.480344899903375e-06, + "loss": 0.8487, + "step": 6442 + }, + { + "epoch": 0.4654758249498799, + "grad_norm": 7.449879065434526, + "learning_rate": 4.480166359833415e-06, + "loss": 0.911, + "step": 6443 + }, + { + "epoch": 0.4655480701500894, + "grad_norm": 6.252109019163053, + "learning_rate": 4.479987792656514e-06, + "loss": 0.9483, + "step": 6444 + }, + { + "epoch": 0.4656203153502989, + "grad_norm": 6.927485755952572, + "learning_rate": 4.479809198375118e-06, + "loss": 0.9609, + "step": 6445 + }, + { + "epoch": 0.46569256055050845, + "grad_norm": 6.492093779877838, + "learning_rate": 4.479630576991672e-06, + "loss": 0.7989, + "step": 6446 + }, + { + "epoch": 0.46576480575071794, + "grad_norm": 6.239465303742888, + "learning_rate": 4.479451928508619e-06, + "loss": 0.8819, + "step": 6447 + }, + { + "epoch": 0.4658370509509274, + "grad_norm": 5.552526898823472, + "learning_rate": 4.479273252928407e-06, + "loss": 0.8776, + "step": 6448 + }, + { + "epoch": 0.46590929615113696, + "grad_norm": 5.687046913292886, + "learning_rate": 4.479094550253481e-06, + "loss": 0.8985, + "step": 6449 + }, + { + "epoch": 0.46598154135134645, + "grad_norm": 8.043790177820695, + "learning_rate": 4.478915820486287e-06, + "loss": 0.8419, + "step": 6450 + }, + { + "epoch": 0.466053786551556, + "grad_norm": 5.498292831421276, + "learning_rate": 4.478737063629271e-06, + "loss": 0.8333, + "step": 6451 + }, + { + "epoch": 0.4661260317517655, + "grad_norm": 6.344665160666603, + "learning_rate": 4.4785582796848835e-06, + "loss": 0.868, + "step": 6452 + }, + { + "epoch": 0.466198276951975, + "grad_norm": 7.954771459762423, + "learning_rate": 4.478379468655567e-06, + "loss": 0.9126, + "step": 6453 + }, + { + "epoch": 0.4662705221521845, + "grad_norm": 6.329589674247134, + "learning_rate": 4.478200630543773e-06, + "loss": 0.8001, + "step": 6454 + }, + { + "epoch": 0.46634276735239405, + "grad_norm": 7.240383479041975, + "learning_rate": 4.4780217653519475e-06, + "loss": 0.9547, + "step": 6455 + }, + { + "epoch": 0.46641501255260354, + "grad_norm": 7.449523695498587, + "learning_rate": 4.4778428730825405e-06, + "loss": 0.8696, + "step": 6456 + }, + { + "epoch": 0.466487257752813, + "grad_norm": 7.68643329942146, + "learning_rate": 4.477663953738001e-06, + "loss": 0.8413, + "step": 6457 + }, + { + "epoch": 0.46655950295302256, + "grad_norm": 7.753486341353588, + "learning_rate": 4.477485007320776e-06, + "loss": 0.9209, + "step": 6458 + }, + { + "epoch": 0.46663174815323205, + "grad_norm": 5.8246071351472795, + "learning_rate": 4.4773060338333174e-06, + "loss": 0.8479, + "step": 6459 + }, + { + "epoch": 0.4667039933534416, + "grad_norm": 5.996479273029717, + "learning_rate": 4.477127033278074e-06, + "loss": 0.892, + "step": 6460 + }, + { + "epoch": 0.4667762385536511, + "grad_norm": 7.208682748194106, + "learning_rate": 4.476948005657497e-06, + "loss": 0.9346, + "step": 6461 + }, + { + "epoch": 0.4668484837538606, + "grad_norm": 7.9179010600060495, + "learning_rate": 4.476768950974037e-06, + "loss": 0.9081, + "step": 6462 + }, + { + "epoch": 0.4669207289540701, + "grad_norm": 8.194003485185638, + "learning_rate": 4.476589869230145e-06, + "loss": 0.9229, + "step": 6463 + }, + { + "epoch": 0.46699297415427965, + "grad_norm": 7.491892947074185, + "learning_rate": 4.476410760428272e-06, + "loss": 0.8638, + "step": 6464 + }, + { + "epoch": 0.46706521935448914, + "grad_norm": 8.75439517125273, + "learning_rate": 4.4762316245708705e-06, + "loss": 0.9064, + "step": 6465 + }, + { + "epoch": 0.4671374645546986, + "grad_norm": 8.555876683185136, + "learning_rate": 4.476052461660392e-06, + "loss": 0.8534, + "step": 6466 + }, + { + "epoch": 0.46720970975490816, + "grad_norm": 8.97683277171454, + "learning_rate": 4.47587327169929e-06, + "loss": 0.9391, + "step": 6467 + }, + { + "epoch": 0.46728195495511765, + "grad_norm": 6.015559684101579, + "learning_rate": 4.475694054690018e-06, + "loss": 0.7995, + "step": 6468 + }, + { + "epoch": 0.4673542001553272, + "grad_norm": 8.807624063172973, + "learning_rate": 4.4755148106350264e-06, + "loss": 0.9035, + "step": 6469 + }, + { + "epoch": 0.4674264453555367, + "grad_norm": 5.810446755965177, + "learning_rate": 4.475335539536773e-06, + "loss": 0.9572, + "step": 6470 + }, + { + "epoch": 0.4674986905557462, + "grad_norm": 7.291519454196617, + "learning_rate": 4.475156241397708e-06, + "loss": 0.8776, + "step": 6471 + }, + { + "epoch": 0.4675709357559557, + "grad_norm": 7.376789716385806, + "learning_rate": 4.47497691622029e-06, + "loss": 0.9185, + "step": 6472 + }, + { + "epoch": 0.46764318095616525, + "grad_norm": 6.624714107462694, + "learning_rate": 4.4747975640069685e-06, + "loss": 0.9058, + "step": 6473 + }, + { + "epoch": 0.46771542615637474, + "grad_norm": 6.3269213626726355, + "learning_rate": 4.474618184760203e-06, + "loss": 0.8637, + "step": 6474 + }, + { + "epoch": 0.4677876713565842, + "grad_norm": 7.802424411083636, + "learning_rate": 4.4744387784824485e-06, + "loss": 0.8511, + "step": 6475 + }, + { + "epoch": 0.46785991655679376, + "grad_norm": 8.915165237033651, + "learning_rate": 4.47425934517616e-06, + "loss": 0.8432, + "step": 6476 + }, + { + "epoch": 0.46793216175700325, + "grad_norm": 6.3023230977333, + "learning_rate": 4.474079884843793e-06, + "loss": 0.8072, + "step": 6477 + }, + { + "epoch": 0.4680044069572128, + "grad_norm": 7.124393905827402, + "learning_rate": 4.4739003974878055e-06, + "loss": 0.8921, + "step": 6478 + }, + { + "epoch": 0.4680766521574223, + "grad_norm": 8.671131988861761, + "learning_rate": 4.473720883110655e-06, + "loss": 0.8354, + "step": 6479 + }, + { + "epoch": 0.4681488973576318, + "grad_norm": 7.212506918019591, + "learning_rate": 4.473541341714798e-06, + "loss": 0.9661, + "step": 6480 + }, + { + "epoch": 0.4682211425578413, + "grad_norm": 5.986876759117179, + "learning_rate": 4.473361773302691e-06, + "loss": 0.8476, + "step": 6481 + }, + { + "epoch": 0.46829338775805085, + "grad_norm": 5.75205724651091, + "learning_rate": 4.473182177876795e-06, + "loss": 0.9299, + "step": 6482 + }, + { + "epoch": 0.46836563295826034, + "grad_norm": 6.750668810453356, + "learning_rate": 4.473002555439567e-06, + "loss": 0.8658, + "step": 6483 + }, + { + "epoch": 0.4684378781584698, + "grad_norm": 6.067888519711489, + "learning_rate": 4.472822905993465e-06, + "loss": 0.822, + "step": 6484 + }, + { + "epoch": 0.46851012335867936, + "grad_norm": 5.858056166682081, + "learning_rate": 4.472643229540949e-06, + "loss": 0.8561, + "step": 6485 + }, + { + "epoch": 0.46858236855888885, + "grad_norm": 6.0647676297272834, + "learning_rate": 4.4724635260844805e-06, + "loss": 0.848, + "step": 6486 + }, + { + "epoch": 0.4686546137590984, + "grad_norm": 6.761630176160195, + "learning_rate": 4.4722837956265165e-06, + "loss": 0.9346, + "step": 6487 + }, + { + "epoch": 0.4687268589593079, + "grad_norm": 6.058622398284138, + "learning_rate": 4.472104038169519e-06, + "loss": 0.9212, + "step": 6488 + }, + { + "epoch": 0.4687991041595174, + "grad_norm": 8.821749114902842, + "learning_rate": 4.471924253715949e-06, + "loss": 0.8671, + "step": 6489 + }, + { + "epoch": 0.4688713493597269, + "grad_norm": 5.918622011696133, + "learning_rate": 4.471744442268266e-06, + "loss": 0.883, + "step": 6490 + }, + { + "epoch": 0.46894359455993645, + "grad_norm": 6.436156790142307, + "learning_rate": 4.471564603828934e-06, + "loss": 0.9719, + "step": 6491 + }, + { + "epoch": 0.46901583976014594, + "grad_norm": 6.9048466011703225, + "learning_rate": 4.471384738400412e-06, + "loss": 0.8932, + "step": 6492 + }, + { + "epoch": 0.4690880849603554, + "grad_norm": 7.901902522593056, + "learning_rate": 4.471204845985164e-06, + "loss": 0.8316, + "step": 6493 + }, + { + "epoch": 0.46916033016056496, + "grad_norm": 10.51274425390663, + "learning_rate": 4.471024926585653e-06, + "loss": 0.8985, + "step": 6494 + }, + { + "epoch": 0.46923257536077445, + "grad_norm": 6.617326719222465, + "learning_rate": 4.4708449802043405e-06, + "loss": 0.8923, + "step": 6495 + }, + { + "epoch": 0.469304820560984, + "grad_norm": 7.093310909713829, + "learning_rate": 4.470665006843691e-06, + "loss": 0.8834, + "step": 6496 + }, + { + "epoch": 0.4693770657611935, + "grad_norm": 6.550594320342776, + "learning_rate": 4.470485006506166e-06, + "loss": 0.8782, + "step": 6497 + }, + { + "epoch": 0.469449310961403, + "grad_norm": 8.15430412611953, + "learning_rate": 4.470304979194233e-06, + "loss": 0.8651, + "step": 6498 + }, + { + "epoch": 0.4695215561616125, + "grad_norm": 7.9944927809986455, + "learning_rate": 4.470124924910354e-06, + "loss": 0.8714, + "step": 6499 + }, + { + "epoch": 0.46959380136182205, + "grad_norm": 5.2192002519094105, + "learning_rate": 4.469944843656995e-06, + "loss": 0.8696, + "step": 6500 + }, + { + "epoch": 0.46966604656203154, + "grad_norm": 5.532424026291681, + "learning_rate": 4.4697647354366205e-06, + "loss": 0.843, + "step": 6501 + }, + { + "epoch": 0.469738291762241, + "grad_norm": 6.880340166775686, + "learning_rate": 4.469584600251695e-06, + "loss": 0.823, + "step": 6502 + }, + { + "epoch": 0.46981053696245056, + "grad_norm": 6.877039104120291, + "learning_rate": 4.4694044381046875e-06, + "loss": 0.9217, + "step": 6503 + }, + { + "epoch": 0.46988278216266005, + "grad_norm": 6.6564901245933354, + "learning_rate": 4.469224248998061e-06, + "loss": 0.8476, + "step": 6504 + }, + { + "epoch": 0.4699550273628696, + "grad_norm": 8.572234252121333, + "learning_rate": 4.4690440329342845e-06, + "loss": 0.8972, + "step": 6505 + }, + { + "epoch": 0.4700272725630791, + "grad_norm": 8.958139769362912, + "learning_rate": 4.4688637899158225e-06, + "loss": 0.9891, + "step": 6506 + }, + { + "epoch": 0.4700995177632886, + "grad_norm": 7.012544291147311, + "learning_rate": 4.468683519945146e-06, + "loss": 0.847, + "step": 6507 + }, + { + "epoch": 0.4701717629634981, + "grad_norm": 5.525990199773094, + "learning_rate": 4.468503223024719e-06, + "loss": 0.8997, + "step": 6508 + }, + { + "epoch": 0.47024400816370765, + "grad_norm": 6.096151807822183, + "learning_rate": 4.468322899157013e-06, + "loss": 0.864, + "step": 6509 + }, + { + "epoch": 0.47031625336391714, + "grad_norm": 8.16802619465094, + "learning_rate": 4.468142548344493e-06, + "loss": 0.9341, + "step": 6510 + }, + { + "epoch": 0.4703884985641266, + "grad_norm": 8.548667705757257, + "learning_rate": 4.467962170589631e-06, + "loss": 0.8962, + "step": 6511 + }, + { + "epoch": 0.47046074376433616, + "grad_norm": 7.4565969253755116, + "learning_rate": 4.467781765894894e-06, + "loss": 1.0679, + "step": 6512 + }, + { + "epoch": 0.47053298896454565, + "grad_norm": 6.338162371389183, + "learning_rate": 4.467601334262753e-06, + "loss": 0.8375, + "step": 6513 + }, + { + "epoch": 0.4706052341647552, + "grad_norm": 6.337805758291586, + "learning_rate": 4.467420875695677e-06, + "loss": 0.8471, + "step": 6514 + }, + { + "epoch": 0.4706774793649647, + "grad_norm": 7.403685564401249, + "learning_rate": 4.467240390196138e-06, + "loss": 0.8915, + "step": 6515 + }, + { + "epoch": 0.4707497245651742, + "grad_norm": 5.710806243682311, + "learning_rate": 4.467059877766604e-06, + "loss": 0.86, + "step": 6516 + }, + { + "epoch": 0.4708219697653837, + "grad_norm": 6.023076503044224, + "learning_rate": 4.466879338409549e-06, + "loss": 0.8572, + "step": 6517 + }, + { + "epoch": 0.47089421496559325, + "grad_norm": 7.9191733424014625, + "learning_rate": 4.466698772127442e-06, + "loss": 0.8607, + "step": 6518 + }, + { + "epoch": 0.47096646016580274, + "grad_norm": 6.963389708621144, + "learning_rate": 4.466518178922756e-06, + "loss": 0.9057, + "step": 6519 + }, + { + "epoch": 0.4710387053660122, + "grad_norm": 7.056821447569089, + "learning_rate": 4.4663375587979635e-06, + "loss": 0.8687, + "step": 6520 + }, + { + "epoch": 0.47111095056622176, + "grad_norm": 8.47338470099261, + "learning_rate": 4.466156911755536e-06, + "loss": 0.865, + "step": 6521 + }, + { + "epoch": 0.47118319576643125, + "grad_norm": 5.989106461711697, + "learning_rate": 4.465976237797948e-06, + "loss": 0.8765, + "step": 6522 + }, + { + "epoch": 0.4712554409666408, + "grad_norm": 6.47513136564754, + "learning_rate": 4.465795536927671e-06, + "loss": 0.9902, + "step": 6523 + }, + { + "epoch": 0.4713276861668503, + "grad_norm": 7.523203750100574, + "learning_rate": 4.4656148091471795e-06, + "loss": 0.8984, + "step": 6524 + }, + { + "epoch": 0.4713999313670598, + "grad_norm": 7.636660805406767, + "learning_rate": 4.465434054458947e-06, + "loss": 0.894, + "step": 6525 + }, + { + "epoch": 0.4714721765672693, + "grad_norm": 8.781275630384082, + "learning_rate": 4.465253272865449e-06, + "loss": 0.8998, + "step": 6526 + }, + { + "epoch": 0.47154442176747885, + "grad_norm": 7.838434736128689, + "learning_rate": 4.46507246436916e-06, + "loss": 0.9479, + "step": 6527 + }, + { + "epoch": 0.47161666696768834, + "grad_norm": 7.299370610526257, + "learning_rate": 4.464891628972554e-06, + "loss": 0.93, + "step": 6528 + }, + { + "epoch": 0.4716889121678978, + "grad_norm": 7.415895772010173, + "learning_rate": 4.4647107666781076e-06, + "loss": 0.9104, + "step": 6529 + }, + { + "epoch": 0.47176115736810736, + "grad_norm": 7.782972275434723, + "learning_rate": 4.464529877488296e-06, + "loss": 0.9063, + "step": 6530 + }, + { + "epoch": 0.47183340256831685, + "grad_norm": 6.5329710230632125, + "learning_rate": 4.464348961405596e-06, + "loss": 1.0224, + "step": 6531 + }, + { + "epoch": 0.4719056477685264, + "grad_norm": 6.076643030089617, + "learning_rate": 4.464168018432483e-06, + "loss": 0.8976, + "step": 6532 + }, + { + "epoch": 0.4719778929687359, + "grad_norm": 6.020205173481261, + "learning_rate": 4.463987048571437e-06, + "loss": 0.8266, + "step": 6533 + }, + { + "epoch": 0.4720501381689454, + "grad_norm": 6.295977128526571, + "learning_rate": 4.463806051824932e-06, + "loss": 0.8839, + "step": 6534 + }, + { + "epoch": 0.4721223833691549, + "grad_norm": 7.051295482026939, + "learning_rate": 4.463625028195447e-06, + "loss": 0.875, + "step": 6535 + }, + { + "epoch": 0.47219462856936445, + "grad_norm": 5.77843707960966, + "learning_rate": 4.463443977685459e-06, + "loss": 0.8274, + "step": 6536 + }, + { + "epoch": 0.47226687376957394, + "grad_norm": 7.814303990938218, + "learning_rate": 4.463262900297449e-06, + "loss": 0.9985, + "step": 6537 + }, + { + "epoch": 0.4723391189697834, + "grad_norm": 7.325760775172678, + "learning_rate": 4.463081796033893e-06, + "loss": 0.9261, + "step": 6538 + }, + { + "epoch": 0.47241136416999296, + "grad_norm": 8.184818673242994, + "learning_rate": 4.462900664897273e-06, + "loss": 0.9211, + "step": 6539 + }, + { + "epoch": 0.47248360937020245, + "grad_norm": 6.731210341199471, + "learning_rate": 4.4627195068900655e-06, + "loss": 0.884, + "step": 6540 + }, + { + "epoch": 0.472555854570412, + "grad_norm": 9.296130238682695, + "learning_rate": 4.462538322014753e-06, + "loss": 0.9596, + "step": 6541 + }, + { + "epoch": 0.4726280997706215, + "grad_norm": 5.66379478084968, + "learning_rate": 4.462357110273814e-06, + "loss": 0.8923, + "step": 6542 + }, + { + "epoch": 0.472700344970831, + "grad_norm": 7.082048486603404, + "learning_rate": 4.46217587166973e-06, + "loss": 0.7627, + "step": 6543 + }, + { + "epoch": 0.4727725901710405, + "grad_norm": 6.7454694449109, + "learning_rate": 4.461994606204983e-06, + "loss": 0.9484, + "step": 6544 + }, + { + "epoch": 0.47284483537125005, + "grad_norm": 6.534692468506761, + "learning_rate": 4.4618133138820515e-06, + "loss": 0.8844, + "step": 6545 + }, + { + "epoch": 0.47291708057145954, + "grad_norm": 8.986969734009346, + "learning_rate": 4.461631994703419e-06, + "loss": 0.9046, + "step": 6546 + }, + { + "epoch": 0.472989325771669, + "grad_norm": 9.422747873642507, + "learning_rate": 4.4614506486715685e-06, + "loss": 1.0268, + "step": 6547 + }, + { + "epoch": 0.47306157097187856, + "grad_norm": 6.174004218858489, + "learning_rate": 4.461269275788981e-06, + "loss": 0.8263, + "step": 6548 + }, + { + "epoch": 0.47313381617208805, + "grad_norm": 6.041618645137368, + "learning_rate": 4.46108787605814e-06, + "loss": 0.8328, + "step": 6549 + }, + { + "epoch": 0.4732060613722976, + "grad_norm": 8.738207909280959, + "learning_rate": 4.460906449481529e-06, + "loss": 0.8535, + "step": 6550 + }, + { + "epoch": 0.4732783065725071, + "grad_norm": 6.166847054532336, + "learning_rate": 4.460724996061632e-06, + "loss": 0.8676, + "step": 6551 + }, + { + "epoch": 0.4733505517727166, + "grad_norm": 5.986665212776897, + "learning_rate": 4.460543515800931e-06, + "loss": 0.8796, + "step": 6552 + }, + { + "epoch": 0.4734227969729261, + "grad_norm": 7.3127394416928215, + "learning_rate": 4.4603620087019116e-06, + "loss": 0.9552, + "step": 6553 + }, + { + "epoch": 0.47349504217313565, + "grad_norm": 6.3108185898830955, + "learning_rate": 4.460180474767059e-06, + "loss": 0.8109, + "step": 6554 + }, + { + "epoch": 0.47356728737334514, + "grad_norm": 6.117572562812745, + "learning_rate": 4.459998913998858e-06, + "loss": 0.8914, + "step": 6555 + }, + { + "epoch": 0.4736395325735546, + "grad_norm": 6.271088688953981, + "learning_rate": 4.459817326399792e-06, + "loss": 0.8432, + "step": 6556 + }, + { + "epoch": 0.47371177777376416, + "grad_norm": 6.65077062745099, + "learning_rate": 4.45963571197235e-06, + "loss": 0.9525, + "step": 6557 + }, + { + "epoch": 0.47378402297397365, + "grad_norm": 6.3415809267975956, + "learning_rate": 4.4594540707190165e-06, + "loss": 0.9308, + "step": 6558 + }, + { + "epoch": 0.4738562681741832, + "grad_norm": 6.837803595740083, + "learning_rate": 4.459272402642278e-06, + "loss": 0.7895, + "step": 6559 + }, + { + "epoch": 0.4739285133743927, + "grad_norm": 5.439623538623179, + "learning_rate": 4.459090707744621e-06, + "loss": 0.7817, + "step": 6560 + }, + { + "epoch": 0.4740007585746022, + "grad_norm": 6.180370500439684, + "learning_rate": 4.458908986028535e-06, + "loss": 0.897, + "step": 6561 + }, + { + "epoch": 0.4740730037748117, + "grad_norm": 8.710634453809357, + "learning_rate": 4.458727237496504e-06, + "loss": 0.8878, + "step": 6562 + }, + { + "epoch": 0.47414524897502125, + "grad_norm": 6.19107075934501, + "learning_rate": 4.45854546215102e-06, + "loss": 0.9092, + "step": 6563 + }, + { + "epoch": 0.47421749417523074, + "grad_norm": 8.032674343996476, + "learning_rate": 4.458363659994567e-06, + "loss": 0.9604, + "step": 6564 + }, + { + "epoch": 0.4742897393754402, + "grad_norm": 7.600901811200159, + "learning_rate": 4.4581818310296375e-06, + "loss": 0.9959, + "step": 6565 + }, + { + "epoch": 0.47436198457564976, + "grad_norm": 6.489795597660535, + "learning_rate": 4.4579999752587185e-06, + "loss": 0.8803, + "step": 6566 + }, + { + "epoch": 0.47443422977585925, + "grad_norm": 5.760986447106953, + "learning_rate": 4.457818092684301e-06, + "loss": 0.8499, + "step": 6567 + }, + { + "epoch": 0.4745064749760688, + "grad_norm": 6.710017717454604, + "learning_rate": 4.457636183308873e-06, + "loss": 0.8288, + "step": 6568 + }, + { + "epoch": 0.4745787201762783, + "grad_norm": 6.758103204549324, + "learning_rate": 4.457454247134927e-06, + "loss": 0.9005, + "step": 6569 + }, + { + "epoch": 0.4746509653764878, + "grad_norm": 6.2576454702271755, + "learning_rate": 4.457272284164951e-06, + "loss": 0.9116, + "step": 6570 + }, + { + "epoch": 0.4747232105766973, + "grad_norm": 5.487130279946562, + "learning_rate": 4.4570902944014375e-06, + "loss": 0.8248, + "step": 6571 + }, + { + "epoch": 0.4747954557769068, + "grad_norm": 6.180520485005787, + "learning_rate": 4.456908277846878e-06, + "loss": 0.8907, + "step": 6572 + }, + { + "epoch": 0.47486770097711634, + "grad_norm": 5.958707499815659, + "learning_rate": 4.456726234503763e-06, + "loss": 0.8082, + "step": 6573 + }, + { + "epoch": 0.4749399461773258, + "grad_norm": 7.1004147583074495, + "learning_rate": 4.456544164374585e-06, + "loss": 0.9299, + "step": 6574 + }, + { + "epoch": 0.47501219137753536, + "grad_norm": 4.839352882252928, + "learning_rate": 4.456362067461837e-06, + "loss": 0.7849, + "step": 6575 + }, + { + "epoch": 0.47508443657774485, + "grad_norm": 5.851105166645236, + "learning_rate": 4.456179943768011e-06, + "loss": 0.8305, + "step": 6576 + }, + { + "epoch": 0.4751566817779544, + "grad_norm": 6.0796957070347455, + "learning_rate": 4.4559977932956e-06, + "loss": 0.9285, + "step": 6577 + }, + { + "epoch": 0.4752289269781639, + "grad_norm": 8.936756449894254, + "learning_rate": 4.455815616047099e-06, + "loss": 0.9392, + "step": 6578 + }, + { + "epoch": 0.4753011721783734, + "grad_norm": 6.642864684351143, + "learning_rate": 4.455633412025e-06, + "loss": 0.8639, + "step": 6579 + }, + { + "epoch": 0.4753734173785829, + "grad_norm": 6.947795479864768, + "learning_rate": 4.455451181231799e-06, + "loss": 0.9056, + "step": 6580 + }, + { + "epoch": 0.4754456625787924, + "grad_norm": 6.01772614482427, + "learning_rate": 4.455268923669989e-06, + "loss": 0.8102, + "step": 6581 + }, + { + "epoch": 0.47551790777900194, + "grad_norm": 7.117404838116114, + "learning_rate": 4.455086639342065e-06, + "loss": 0.8841, + "step": 6582 + }, + { + "epoch": 0.4755901529792114, + "grad_norm": 6.133730213482421, + "learning_rate": 4.4549043282505235e-06, + "loss": 0.9679, + "step": 6583 + }, + { + "epoch": 0.47566239817942096, + "grad_norm": 10.494937948086196, + "learning_rate": 4.45472199039786e-06, + "loss": 0.9437, + "step": 6584 + }, + { + "epoch": 0.47573464337963045, + "grad_norm": 5.771185284657195, + "learning_rate": 4.4545396257865696e-06, + "loss": 0.8618, + "step": 6585 + }, + { + "epoch": 0.47580688857984, + "grad_norm": 6.102815272318741, + "learning_rate": 4.45435723441915e-06, + "loss": 0.8617, + "step": 6586 + }, + { + "epoch": 0.4758791337800495, + "grad_norm": 7.205903483493776, + "learning_rate": 4.454174816298097e-06, + "loss": 0.865, + "step": 6587 + }, + { + "epoch": 0.475951378980259, + "grad_norm": 6.983428686154073, + "learning_rate": 4.453992371425908e-06, + "loss": 0.892, + "step": 6588 + }, + { + "epoch": 0.4760236241804685, + "grad_norm": 9.518326198989563, + "learning_rate": 4.45380989980508e-06, + "loss": 0.8771, + "step": 6589 + }, + { + "epoch": 0.476095869380678, + "grad_norm": 7.581595532624548, + "learning_rate": 4.453627401438112e-06, + "loss": 0.9819, + "step": 6590 + }, + { + "epoch": 0.47616811458088754, + "grad_norm": 7.641739528007744, + "learning_rate": 4.453444876327502e-06, + "loss": 0.8642, + "step": 6591 + }, + { + "epoch": 0.476240359781097, + "grad_norm": 8.63663129619902, + "learning_rate": 4.453262324475749e-06, + "loss": 0.8584, + "step": 6592 + }, + { + "epoch": 0.47631260498130656, + "grad_norm": 6.8632868081223934, + "learning_rate": 4.4530797458853505e-06, + "loss": 0.8357, + "step": 6593 + }, + { + "epoch": 0.47638485018151605, + "grad_norm": 7.2145228043463545, + "learning_rate": 4.452897140558807e-06, + "loss": 0.9323, + "step": 6594 + }, + { + "epoch": 0.4764570953817256, + "grad_norm": 7.095267381421731, + "learning_rate": 4.452714508498618e-06, + "loss": 0.897, + "step": 6595 + }, + { + "epoch": 0.4765293405819351, + "grad_norm": 5.820005759534575, + "learning_rate": 4.452531849707283e-06, + "loss": 0.8404, + "step": 6596 + }, + { + "epoch": 0.4766015857821446, + "grad_norm": 6.835410973026129, + "learning_rate": 4.452349164187303e-06, + "loss": 0.7917, + "step": 6597 + }, + { + "epoch": 0.4766738309823541, + "grad_norm": 7.617863301351382, + "learning_rate": 4.45216645194118e-06, + "loss": 0.8767, + "step": 6598 + }, + { + "epoch": 0.4767460761825636, + "grad_norm": 7.090478352711027, + "learning_rate": 4.451983712971413e-06, + "loss": 0.9148, + "step": 6599 + }, + { + "epoch": 0.47681832138277314, + "grad_norm": 9.99812299278751, + "learning_rate": 4.451800947280505e-06, + "loss": 0.9217, + "step": 6600 + }, + { + "epoch": 0.4768905665829826, + "grad_norm": 6.840305245264181, + "learning_rate": 4.451618154870958e-06, + "loss": 0.8282, + "step": 6601 + }, + { + "epoch": 0.47696281178319216, + "grad_norm": 5.976048346450297, + "learning_rate": 4.451435335745272e-06, + "loss": 0.8965, + "step": 6602 + }, + { + "epoch": 0.47703505698340165, + "grad_norm": 6.917915813603225, + "learning_rate": 4.4512524899059525e-06, + "loss": 0.8584, + "step": 6603 + }, + { + "epoch": 0.4771073021836112, + "grad_norm": 7.31824287462316, + "learning_rate": 4.451069617355502e-06, + "loss": 0.8853, + "step": 6604 + }, + { + "epoch": 0.4771795473838207, + "grad_norm": 6.774055285588318, + "learning_rate": 4.450886718096423e-06, + "loss": 0.7718, + "step": 6605 + }, + { + "epoch": 0.4772517925840302, + "grad_norm": 6.093548111138836, + "learning_rate": 4.4507037921312204e-06, + "loss": 0.8862, + "step": 6606 + }, + { + "epoch": 0.4773240377842397, + "grad_norm": 6.417371372647266, + "learning_rate": 4.450520839462396e-06, + "loss": 0.8045, + "step": 6607 + }, + { + "epoch": 0.4773962829844492, + "grad_norm": 9.971908021284639, + "learning_rate": 4.450337860092457e-06, + "loss": 0.9523, + "step": 6608 + }, + { + "epoch": 0.47746852818465874, + "grad_norm": 6.613459774521123, + "learning_rate": 4.450154854023907e-06, + "loss": 0.8481, + "step": 6609 + }, + { + "epoch": 0.4775407733848682, + "grad_norm": 6.306262872416248, + "learning_rate": 4.4499718212592504e-06, + "loss": 0.8633, + "step": 6610 + }, + { + "epoch": 0.47761301858507776, + "grad_norm": 6.236342805533926, + "learning_rate": 4.449788761800994e-06, + "loss": 0.9344, + "step": 6611 + }, + { + "epoch": 0.47768526378528725, + "grad_norm": 6.500720644396617, + "learning_rate": 4.449605675651643e-06, + "loss": 0.9822, + "step": 6612 + }, + { + "epoch": 0.4777575089854968, + "grad_norm": 6.9221166564228716, + "learning_rate": 4.449422562813704e-06, + "loss": 0.9072, + "step": 6613 + }, + { + "epoch": 0.4778297541857063, + "grad_norm": 6.0574546348142, + "learning_rate": 4.4492394232896845e-06, + "loss": 0.8732, + "step": 6614 + }, + { + "epoch": 0.4779019993859158, + "grad_norm": 6.353417157437092, + "learning_rate": 4.4490562570820905e-06, + "loss": 0.7824, + "step": 6615 + }, + { + "epoch": 0.4779742445861253, + "grad_norm": 7.9169371976974245, + "learning_rate": 4.44887306419343e-06, + "loss": 0.8311, + "step": 6616 + }, + { + "epoch": 0.4780464897863348, + "grad_norm": 6.287928736120293, + "learning_rate": 4.448689844626209e-06, + "loss": 0.8532, + "step": 6617 + }, + { + "epoch": 0.47811873498654434, + "grad_norm": 6.071379775163763, + "learning_rate": 4.448506598382939e-06, + "loss": 0.8857, + "step": 6618 + }, + { + "epoch": 0.4781909801867538, + "grad_norm": 7.57998678402516, + "learning_rate": 4.448323325466125e-06, + "loss": 0.805, + "step": 6619 + }, + { + "epoch": 0.47826322538696336, + "grad_norm": 7.365172222788677, + "learning_rate": 4.448140025878279e-06, + "loss": 0.7794, + "step": 6620 + }, + { + "epoch": 0.47833547058717285, + "grad_norm": 7.099287787162945, + "learning_rate": 4.447956699621908e-06, + "loss": 0.8928, + "step": 6621 + }, + { + "epoch": 0.4784077157873824, + "grad_norm": 8.252665695935296, + "learning_rate": 4.447773346699522e-06, + "loss": 0.934, + "step": 6622 + }, + { + "epoch": 0.4784799609875919, + "grad_norm": 7.238178217443302, + "learning_rate": 4.447589967113631e-06, + "loss": 0.8748, + "step": 6623 + }, + { + "epoch": 0.4785522061878014, + "grad_norm": 6.505050164512359, + "learning_rate": 4.447406560866746e-06, + "loss": 0.8776, + "step": 6624 + }, + { + "epoch": 0.4786244513880109, + "grad_norm": 5.348920495760869, + "learning_rate": 4.447223127961377e-06, + "loss": 0.8165, + "step": 6625 + }, + { + "epoch": 0.4786966965882204, + "grad_norm": 6.05744235462779, + "learning_rate": 4.447039668400036e-06, + "loss": 0.8318, + "step": 6626 + }, + { + "epoch": 0.47876894178842994, + "grad_norm": 6.717858543694258, + "learning_rate": 4.446856182185233e-06, + "loss": 0.8662, + "step": 6627 + }, + { + "epoch": 0.4788411869886394, + "grad_norm": 8.442035275614913, + "learning_rate": 4.4466726693194805e-06, + "loss": 0.9801, + "step": 6628 + }, + { + "epoch": 0.47891343218884896, + "grad_norm": 6.895792643109116, + "learning_rate": 4.446489129805291e-06, + "loss": 0.8517, + "step": 6629 + }, + { + "epoch": 0.47898567738905845, + "grad_norm": 6.805185012937028, + "learning_rate": 4.446305563645177e-06, + "loss": 0.8856, + "step": 6630 + }, + { + "epoch": 0.479057922589268, + "grad_norm": 6.93962078498561, + "learning_rate": 4.4461219708416504e-06, + "loss": 0.9403, + "step": 6631 + }, + { + "epoch": 0.4791301677894775, + "grad_norm": 6.320962804056331, + "learning_rate": 4.445938351397225e-06, + "loss": 0.9907, + "step": 6632 + }, + { + "epoch": 0.479202412989687, + "grad_norm": 6.615389204620789, + "learning_rate": 4.445754705314415e-06, + "loss": 0.9554, + "step": 6633 + }, + { + "epoch": 0.4792746581898965, + "grad_norm": 8.276589748574647, + "learning_rate": 4.445571032595734e-06, + "loss": 0.8665, + "step": 6634 + }, + { + "epoch": 0.479346903390106, + "grad_norm": 8.04815770268119, + "learning_rate": 4.445387333243695e-06, + "loss": 0.8669, + "step": 6635 + }, + { + "epoch": 0.47941914859031554, + "grad_norm": 8.06811328033369, + "learning_rate": 4.445203607260815e-06, + "loss": 0.9599, + "step": 6636 + }, + { + "epoch": 0.479491393790525, + "grad_norm": 7.1366262779654965, + "learning_rate": 4.445019854649607e-06, + "loss": 0.8804, + "step": 6637 + }, + { + "epoch": 0.47956363899073456, + "grad_norm": 6.486441041812992, + "learning_rate": 4.444836075412589e-06, + "loss": 0.7644, + "step": 6638 + }, + { + "epoch": 0.47963588419094405, + "grad_norm": 6.707654298649085, + "learning_rate": 4.444652269552274e-06, + "loss": 0.8624, + "step": 6639 + }, + { + "epoch": 0.4797081293911536, + "grad_norm": 8.131296388462989, + "learning_rate": 4.44446843707118e-06, + "loss": 0.9165, + "step": 6640 + }, + { + "epoch": 0.4797803745913631, + "grad_norm": 10.325615328355966, + "learning_rate": 4.444284577971822e-06, + "loss": 1.031, + "step": 6641 + }, + { + "epoch": 0.4798526197915726, + "grad_norm": 6.958549937762043, + "learning_rate": 4.444100692256719e-06, + "loss": 0.8941, + "step": 6642 + }, + { + "epoch": 0.4799248649917821, + "grad_norm": 9.480205191150063, + "learning_rate": 4.443916779928385e-06, + "loss": 0.8816, + "step": 6643 + }, + { + "epoch": 0.4799971101919916, + "grad_norm": 6.334399919879601, + "learning_rate": 4.443732840989341e-06, + "loss": 0.9162, + "step": 6644 + }, + { + "epoch": 0.48006935539220114, + "grad_norm": 9.0863132546711, + "learning_rate": 4.443548875442104e-06, + "loss": 0.9281, + "step": 6645 + }, + { + "epoch": 0.4801416005924106, + "grad_norm": 8.73205863250046, + "learning_rate": 4.443364883289192e-06, + "loss": 0.935, + "step": 6646 + }, + { + "epoch": 0.48021384579262016, + "grad_norm": 7.132873468044474, + "learning_rate": 4.443180864533123e-06, + "loss": 0.8088, + "step": 6647 + }, + { + "epoch": 0.48028609099282965, + "grad_norm": 9.21037755401482, + "learning_rate": 4.442996819176417e-06, + "loss": 0.959, + "step": 6648 + }, + { + "epoch": 0.4803583361930392, + "grad_norm": 7.149827498909338, + "learning_rate": 4.442812747221593e-06, + "loss": 0.95, + "step": 6649 + }, + { + "epoch": 0.4804305813932487, + "grad_norm": 5.906643324576957, + "learning_rate": 4.442628648671171e-06, + "loss": 0.8374, + "step": 6650 + }, + { + "epoch": 0.4805028265934582, + "grad_norm": 7.469561034024397, + "learning_rate": 4.442444523527672e-06, + "loss": 0.8784, + "step": 6651 + }, + { + "epoch": 0.4805750717936677, + "grad_norm": 7.732699619285596, + "learning_rate": 4.442260371793614e-06, + "loss": 0.9121, + "step": 6652 + }, + { + "epoch": 0.4806473169938772, + "grad_norm": 5.505810356124909, + "learning_rate": 4.442076193471521e-06, + "loss": 0.8757, + "step": 6653 + }, + { + "epoch": 0.48071956219408674, + "grad_norm": 9.242563918792705, + "learning_rate": 4.441891988563912e-06, + "loss": 0.9538, + "step": 6654 + }, + { + "epoch": 0.4807918073942962, + "grad_norm": 7.698796354074686, + "learning_rate": 4.44170775707331e-06, + "loss": 0.8303, + "step": 6655 + }, + { + "epoch": 0.48086405259450576, + "grad_norm": 6.2024755642404585, + "learning_rate": 4.441523499002236e-06, + "loss": 0.8062, + "step": 6656 + }, + { + "epoch": 0.48093629779471525, + "grad_norm": 6.079013318260126, + "learning_rate": 4.441339214353213e-06, + "loss": 0.915, + "step": 6657 + }, + { + "epoch": 0.4810085429949248, + "grad_norm": 6.401122781494629, + "learning_rate": 4.441154903128764e-06, + "loss": 0.9039, + "step": 6658 + }, + { + "epoch": 0.4810807881951343, + "grad_norm": 8.65461092968588, + "learning_rate": 4.44097056533141e-06, + "loss": 0.9069, + "step": 6659 + }, + { + "epoch": 0.4811530333953438, + "grad_norm": 8.182988678407021, + "learning_rate": 4.4407862009636785e-06, + "loss": 0.8934, + "step": 6660 + }, + { + "epoch": 0.4812252785955533, + "grad_norm": 9.153222322321026, + "learning_rate": 4.440601810028089e-06, + "loss": 0.8521, + "step": 6661 + }, + { + "epoch": 0.4812975237957628, + "grad_norm": 8.619178023301485, + "learning_rate": 4.440417392527167e-06, + "loss": 0.8776, + "step": 6662 + }, + { + "epoch": 0.48136976899597234, + "grad_norm": 5.997226073865708, + "learning_rate": 4.44023294846344e-06, + "loss": 0.9021, + "step": 6663 + }, + { + "epoch": 0.4814420141961818, + "grad_norm": 6.807002658054981, + "learning_rate": 4.4400484778394285e-06, + "loss": 0.8872, + "step": 6664 + }, + { + "epoch": 0.48151425939639136, + "grad_norm": 8.799891765969251, + "learning_rate": 4.439863980657661e-06, + "loss": 0.9801, + "step": 6665 + }, + { + "epoch": 0.48158650459660085, + "grad_norm": 7.092990649748657, + "learning_rate": 4.4396794569206605e-06, + "loss": 0.9027, + "step": 6666 + }, + { + "epoch": 0.4816587497968104, + "grad_norm": 7.550449575213505, + "learning_rate": 4.439494906630954e-06, + "loss": 0.847, + "step": 6667 + }, + { + "epoch": 0.4817309949970199, + "grad_norm": 7.30226344722215, + "learning_rate": 4.439310329791069e-06, + "loss": 0.8582, + "step": 6668 + }, + { + "epoch": 0.4818032401972294, + "grad_norm": 6.626509872259922, + "learning_rate": 4.4391257264035315e-06, + "loss": 0.8662, + "step": 6669 + }, + { + "epoch": 0.4818754853974389, + "grad_norm": 7.097127639661173, + "learning_rate": 4.438941096470868e-06, + "loss": 0.8918, + "step": 6670 + }, + { + "epoch": 0.4819477305976484, + "grad_norm": 6.051210569218968, + "learning_rate": 4.4387564399956066e-06, + "loss": 0.8123, + "step": 6671 + }, + { + "epoch": 0.48201997579785794, + "grad_norm": 6.370805145593844, + "learning_rate": 4.438571756980275e-06, + "loss": 0.9307, + "step": 6672 + }, + { + "epoch": 0.4820922209980674, + "grad_norm": 6.183067198147424, + "learning_rate": 4.438387047427402e-06, + "loss": 0.9129, + "step": 6673 + }, + { + "epoch": 0.48216446619827696, + "grad_norm": 7.826597144594165, + "learning_rate": 4.438202311339514e-06, + "loss": 1.0619, + "step": 6674 + }, + { + "epoch": 0.48223671139848645, + "grad_norm": 7.403898614322195, + "learning_rate": 4.438017548719141e-06, + "loss": 0.9693, + "step": 6675 + }, + { + "epoch": 0.482308956598696, + "grad_norm": 7.086487131556979, + "learning_rate": 4.437832759568814e-06, + "loss": 0.9086, + "step": 6676 + }, + { + "epoch": 0.4823812017989055, + "grad_norm": 6.210889592975616, + "learning_rate": 4.4376479438910605e-06, + "loss": 0.915, + "step": 6677 + }, + { + "epoch": 0.482453446999115, + "grad_norm": 6.658147836837772, + "learning_rate": 4.437463101688411e-06, + "loss": 0.8949, + "step": 6678 + }, + { + "epoch": 0.4825256921993245, + "grad_norm": 7.314161845418555, + "learning_rate": 4.4372782329633955e-06, + "loss": 0.9342, + "step": 6679 + }, + { + "epoch": 0.482597937399534, + "grad_norm": 5.7580248354886, + "learning_rate": 4.437093337718547e-06, + "loss": 0.9291, + "step": 6680 + }, + { + "epoch": 0.48267018259974354, + "grad_norm": 7.095100173511856, + "learning_rate": 4.436908415956393e-06, + "loss": 0.974, + "step": 6681 + }, + { + "epoch": 0.482742427799953, + "grad_norm": 6.336926628825767, + "learning_rate": 4.436723467679467e-06, + "loss": 0.828, + "step": 6682 + }, + { + "epoch": 0.48281467300016256, + "grad_norm": 5.2829733299724495, + "learning_rate": 4.436538492890301e-06, + "loss": 0.8314, + "step": 6683 + }, + { + "epoch": 0.48288691820037205, + "grad_norm": 8.574197839524617, + "learning_rate": 4.436353491591427e-06, + "loss": 0.9484, + "step": 6684 + }, + { + "epoch": 0.4829591634005816, + "grad_norm": 6.050040114088508, + "learning_rate": 4.436168463785376e-06, + "loss": 0.9023, + "step": 6685 + }, + { + "epoch": 0.4830314086007911, + "grad_norm": 7.899989367127808, + "learning_rate": 4.435983409474682e-06, + "loss": 0.879, + "step": 6686 + }, + { + "epoch": 0.4831036538010006, + "grad_norm": 6.048197441491123, + "learning_rate": 4.43579832866188e-06, + "loss": 0.9236, + "step": 6687 + }, + { + "epoch": 0.4831758990012101, + "grad_norm": 6.230282312551478, + "learning_rate": 4.4356132213495e-06, + "loss": 0.855, + "step": 6688 + }, + { + "epoch": 0.4832481442014196, + "grad_norm": 6.020582499663459, + "learning_rate": 4.435428087540079e-06, + "loss": 0.9903, + "step": 6689 + }, + { + "epoch": 0.48332038940162914, + "grad_norm": 7.306200492958741, + "learning_rate": 4.43524292723615e-06, + "loss": 0.9413, + "step": 6690 + }, + { + "epoch": 0.4833926346018386, + "grad_norm": 6.737738882157769, + "learning_rate": 4.4350577404402485e-06, + "loss": 0.8251, + "step": 6691 + }, + { + "epoch": 0.48346487980204816, + "grad_norm": 5.842584993644421, + "learning_rate": 4.434872527154908e-06, + "loss": 0.9116, + "step": 6692 + }, + { + "epoch": 0.48353712500225765, + "grad_norm": 6.464391152019546, + "learning_rate": 4.434687287382665e-06, + "loss": 0.8953, + "step": 6693 + }, + { + "epoch": 0.4836093702024672, + "grad_norm": 5.677174344047824, + "learning_rate": 4.4345020211260555e-06, + "loss": 0.8363, + "step": 6694 + }, + { + "epoch": 0.4836816154026767, + "grad_norm": 6.756079867300256, + "learning_rate": 4.434316728387616e-06, + "loss": 0.8997, + "step": 6695 + }, + { + "epoch": 0.4837538606028862, + "grad_norm": 6.53949388838021, + "learning_rate": 4.434131409169882e-06, + "loss": 0.8465, + "step": 6696 + }, + { + "epoch": 0.4838261058030957, + "grad_norm": 5.756177817688371, + "learning_rate": 4.43394606347539e-06, + "loss": 0.9172, + "step": 6697 + }, + { + "epoch": 0.4838983510033052, + "grad_norm": 6.387709869102241, + "learning_rate": 4.43376069130668e-06, + "loss": 0.9277, + "step": 6698 + }, + { + "epoch": 0.48397059620351474, + "grad_norm": 6.53577379591186, + "learning_rate": 4.433575292666285e-06, + "loss": 0.9491, + "step": 6699 + }, + { + "epoch": 0.4840428414037242, + "grad_norm": 5.636307265819223, + "learning_rate": 4.4333898675567475e-06, + "loss": 0.9245, + "step": 6700 + }, + { + "epoch": 0.48411508660393376, + "grad_norm": 5.953273020428487, + "learning_rate": 4.433204415980603e-06, + "loss": 0.7505, + "step": 6701 + }, + { + "epoch": 0.48418733180414325, + "grad_norm": 6.237066390689234, + "learning_rate": 4.433018937940392e-06, + "loss": 0.9712, + "step": 6702 + }, + { + "epoch": 0.4842595770043528, + "grad_norm": 8.017967551063196, + "learning_rate": 4.432833433438651e-06, + "loss": 0.9636, + "step": 6703 + }, + { + "epoch": 0.4843318222045623, + "grad_norm": 6.234542959144947, + "learning_rate": 4.432647902477922e-06, + "loss": 0.942, + "step": 6704 + }, + { + "epoch": 0.4844040674047718, + "grad_norm": 5.588673424448919, + "learning_rate": 4.4324623450607446e-06, + "loss": 0.874, + "step": 6705 + }, + { + "epoch": 0.4844763126049813, + "grad_norm": 7.252938562944785, + "learning_rate": 4.432276761189658e-06, + "loss": 0.8436, + "step": 6706 + }, + { + "epoch": 0.4845485578051908, + "grad_norm": 7.201061424854759, + "learning_rate": 4.432091150867201e-06, + "loss": 0.9594, + "step": 6707 + }, + { + "epoch": 0.48462080300540034, + "grad_norm": 6.428863361330084, + "learning_rate": 4.431905514095918e-06, + "loss": 0.8874, + "step": 6708 + }, + { + "epoch": 0.4846930482056098, + "grad_norm": 7.023490727342802, + "learning_rate": 4.431719850878348e-06, + "loss": 0.906, + "step": 6709 + }, + { + "epoch": 0.48476529340581936, + "grad_norm": 5.8015210229041845, + "learning_rate": 4.431534161217034e-06, + "loss": 0.8662, + "step": 6710 + }, + { + "epoch": 0.48483753860602885, + "grad_norm": 6.6092220496117555, + "learning_rate": 4.431348445114516e-06, + "loss": 0.9208, + "step": 6711 + }, + { + "epoch": 0.4849097838062384, + "grad_norm": 6.204735073732405, + "learning_rate": 4.431162702573338e-06, + "loss": 0.8936, + "step": 6712 + }, + { + "epoch": 0.4849820290064479, + "grad_norm": 5.8692309685226425, + "learning_rate": 4.4309769335960415e-06, + "loss": 0.9141, + "step": 6713 + }, + { + "epoch": 0.4850542742066574, + "grad_norm": 5.136607910816998, + "learning_rate": 4.43079113818517e-06, + "loss": 0.8368, + "step": 6714 + }, + { + "epoch": 0.4851265194068669, + "grad_norm": 6.70340128864966, + "learning_rate": 4.4306053163432675e-06, + "loss": 0.8615, + "step": 6715 + }, + { + "epoch": 0.4851987646070764, + "grad_norm": 6.041806168973585, + "learning_rate": 4.430419468072877e-06, + "loss": 0.9302, + "step": 6716 + }, + { + "epoch": 0.48527100980728594, + "grad_norm": 6.087362374819906, + "learning_rate": 4.430233593376543e-06, + "loss": 0.9103, + "step": 6717 + }, + { + "epoch": 0.4853432550074954, + "grad_norm": 7.50788795203046, + "learning_rate": 4.430047692256809e-06, + "loss": 1.0285, + "step": 6718 + }, + { + "epoch": 0.48541550020770496, + "grad_norm": 5.6097833596352915, + "learning_rate": 4.429861764716222e-06, + "loss": 0.8955, + "step": 6719 + }, + { + "epoch": 0.48548774540791445, + "grad_norm": 6.72768244796122, + "learning_rate": 4.429675810757325e-06, + "loss": 0.8994, + "step": 6720 + }, + { + "epoch": 0.485559990608124, + "grad_norm": 6.103549687404953, + "learning_rate": 4.429489830382665e-06, + "loss": 0.8479, + "step": 6721 + }, + { + "epoch": 0.4856322358083335, + "grad_norm": 5.980088254857839, + "learning_rate": 4.4293038235947875e-06, + "loss": 0.8464, + "step": 6722 + }, + { + "epoch": 0.485704481008543, + "grad_norm": 8.143073067156575, + "learning_rate": 4.429117790396238e-06, + "loss": 0.977, + "step": 6723 + }, + { + "epoch": 0.4857767262087525, + "grad_norm": 5.890450472485006, + "learning_rate": 4.428931730789564e-06, + "loss": 0.8579, + "step": 6724 + }, + { + "epoch": 0.485848971408962, + "grad_norm": 7.284483883181837, + "learning_rate": 4.4287456447773124e-06, + "loss": 0.8213, + "step": 6725 + }, + { + "epoch": 0.48592121660917154, + "grad_norm": 7.070987428939104, + "learning_rate": 4.42855953236203e-06, + "loss": 0.8386, + "step": 6726 + }, + { + "epoch": 0.485993461809381, + "grad_norm": 8.155913247358477, + "learning_rate": 4.428373393546266e-06, + "loss": 0.878, + "step": 6727 + }, + { + "epoch": 0.48606570700959056, + "grad_norm": 9.551675891125157, + "learning_rate": 4.428187228332566e-06, + "loss": 0.8298, + "step": 6728 + }, + { + "epoch": 0.48613795220980005, + "grad_norm": 6.378358741525574, + "learning_rate": 4.428001036723482e-06, + "loss": 0.922, + "step": 6729 + }, + { + "epoch": 0.4862101974100096, + "grad_norm": 7.445149823199838, + "learning_rate": 4.42781481872156e-06, + "loss": 0.8728, + "step": 6730 + }, + { + "epoch": 0.4862824426102191, + "grad_norm": 6.843059626904696, + "learning_rate": 4.4276285743293496e-06, + "loss": 0.8589, + "step": 6731 + }, + { + "epoch": 0.4863546878104286, + "grad_norm": 7.359398066328634, + "learning_rate": 4.427442303549401e-06, + "loss": 0.8656, + "step": 6732 + }, + { + "epoch": 0.4864269330106381, + "grad_norm": 6.505469148740531, + "learning_rate": 4.427256006384264e-06, + "loss": 0.9492, + "step": 6733 + }, + { + "epoch": 0.4864991782108476, + "grad_norm": 6.492677819273999, + "learning_rate": 4.427069682836488e-06, + "loss": 0.8301, + "step": 6734 + }, + { + "epoch": 0.48657142341105714, + "grad_norm": 6.692191216545489, + "learning_rate": 4.426883332908625e-06, + "loss": 0.9164, + "step": 6735 + }, + { + "epoch": 0.4866436686112666, + "grad_norm": 7.135870955733084, + "learning_rate": 4.4266969566032245e-06, + "loss": 0.9245, + "step": 6736 + }, + { + "epoch": 0.48671591381147616, + "grad_norm": 7.286461532697505, + "learning_rate": 4.426510553922839e-06, + "loss": 0.9155, + "step": 6737 + }, + { + "epoch": 0.48678815901168565, + "grad_norm": 7.503349573475658, + "learning_rate": 4.426324124870021e-06, + "loss": 0.8339, + "step": 6738 + }, + { + "epoch": 0.4868604042118952, + "grad_norm": 5.382344395806381, + "learning_rate": 4.42613766944732e-06, + "loss": 0.7906, + "step": 6739 + }, + { + "epoch": 0.4869326494121047, + "grad_norm": 5.891157616939726, + "learning_rate": 4.4259511876572905e-06, + "loss": 0.9265, + "step": 6740 + }, + { + "epoch": 0.4870048946123142, + "grad_norm": 6.826053401153582, + "learning_rate": 4.425764679502485e-06, + "loss": 0.8947, + "step": 6741 + }, + { + "epoch": 0.4870771398125237, + "grad_norm": 5.772939286267412, + "learning_rate": 4.425578144985455e-06, + "loss": 0.7846, + "step": 6742 + }, + { + "epoch": 0.4871493850127332, + "grad_norm": 5.833440652495867, + "learning_rate": 4.425391584108757e-06, + "loss": 0.9238, + "step": 6743 + }, + { + "epoch": 0.48722163021294274, + "grad_norm": 6.35200842386451, + "learning_rate": 4.425204996874942e-06, + "loss": 0.9115, + "step": 6744 + }, + { + "epoch": 0.4872938754131522, + "grad_norm": 6.251244993186857, + "learning_rate": 4.425018383286566e-06, + "loss": 0.834, + "step": 6745 + }, + { + "epoch": 0.48736612061336176, + "grad_norm": 6.711461571868237, + "learning_rate": 4.424831743346182e-06, + "loss": 0.8804, + "step": 6746 + }, + { + "epoch": 0.48743836581357125, + "grad_norm": 8.528842684194514, + "learning_rate": 4.424645077056347e-06, + "loss": 0.9378, + "step": 6747 + }, + { + "epoch": 0.4875106110137808, + "grad_norm": 6.96021380874379, + "learning_rate": 4.4244583844196154e-06, + "loss": 0.7874, + "step": 6748 + }, + { + "epoch": 0.4875828562139903, + "grad_norm": 7.5635294568235985, + "learning_rate": 4.424271665438542e-06, + "loss": 1.0115, + "step": 6749 + }, + { + "epoch": 0.4876551014141998, + "grad_norm": 6.137719784781164, + "learning_rate": 4.4240849201156844e-06, + "loss": 0.8074, + "step": 6750 + }, + { + "epoch": 0.4877273466144093, + "grad_norm": 8.687133342223222, + "learning_rate": 4.423898148453597e-06, + "loss": 0.8541, + "step": 6751 + }, + { + "epoch": 0.4877995918146188, + "grad_norm": 6.8755268241864504, + "learning_rate": 4.423711350454839e-06, + "loss": 0.8498, + "step": 6752 + }, + { + "epoch": 0.48787183701482834, + "grad_norm": 6.19576228121781, + "learning_rate": 4.4235245261219654e-06, + "loss": 0.8648, + "step": 6753 + }, + { + "epoch": 0.4879440822150378, + "grad_norm": 7.425767379820029, + "learning_rate": 4.423337675457535e-06, + "loss": 0.9117, + "step": 6754 + }, + { + "epoch": 0.48801632741524736, + "grad_norm": 6.83186318314577, + "learning_rate": 4.423150798464105e-06, + "loss": 0.9439, + "step": 6755 + }, + { + "epoch": 0.48808857261545685, + "grad_norm": 5.91038634765651, + "learning_rate": 4.422963895144234e-06, + "loss": 0.8176, + "step": 6756 + }, + { + "epoch": 0.4881608178156664, + "grad_norm": 8.017882388029975, + "learning_rate": 4.42277696550048e-06, + "loss": 0.9454, + "step": 6757 + }, + { + "epoch": 0.4882330630158759, + "grad_norm": 6.67967036730259, + "learning_rate": 4.422590009535404e-06, + "loss": 0.8942, + "step": 6758 + }, + { + "epoch": 0.4883053082160854, + "grad_norm": 7.291384475333794, + "learning_rate": 4.4224030272515615e-06, + "loss": 1.0277, + "step": 6759 + }, + { + "epoch": 0.4883775534162949, + "grad_norm": 6.344293580467116, + "learning_rate": 4.422216018651515e-06, + "loss": 0.8549, + "step": 6760 + }, + { + "epoch": 0.4884497986165044, + "grad_norm": 7.640345156796926, + "learning_rate": 4.422028983737823e-06, + "loss": 0.8922, + "step": 6761 + }, + { + "epoch": 0.48852204381671394, + "grad_norm": 6.146083434452542, + "learning_rate": 4.421841922513047e-06, + "loss": 0.8474, + "step": 6762 + }, + { + "epoch": 0.4885942890169234, + "grad_norm": 5.6756085174593425, + "learning_rate": 4.421654834979748e-06, + "loss": 0.9193, + "step": 6763 + }, + { + "epoch": 0.48866653421713296, + "grad_norm": 6.4649709083982385, + "learning_rate": 4.4214677211404855e-06, + "loss": 0.8231, + "step": 6764 + }, + { + "epoch": 0.48873877941734245, + "grad_norm": 5.99931935582315, + "learning_rate": 4.421280580997822e-06, + "loss": 0.9123, + "step": 6765 + }, + { + "epoch": 0.488811024617552, + "grad_norm": 6.58490061714642, + "learning_rate": 4.4210934145543195e-06, + "loss": 0.8517, + "step": 6766 + }, + { + "epoch": 0.4888832698177615, + "grad_norm": 5.922250217437579, + "learning_rate": 4.42090622181254e-06, + "loss": 0.8232, + "step": 6767 + }, + { + "epoch": 0.488955515017971, + "grad_norm": 5.772458211517078, + "learning_rate": 4.4207190027750444e-06, + "loss": 0.8597, + "step": 6768 + }, + { + "epoch": 0.4890277602181805, + "grad_norm": 8.241219211487877, + "learning_rate": 4.4205317574443986e-06, + "loss": 0.885, + "step": 6769 + }, + { + "epoch": 0.48910000541839, + "grad_norm": 5.978619310089953, + "learning_rate": 4.420344485823164e-06, + "loss": 0.8542, + "step": 6770 + }, + { + "epoch": 0.48917225061859954, + "grad_norm": 6.745419396213296, + "learning_rate": 4.420157187913904e-06, + "loss": 0.9047, + "step": 6771 + }, + { + "epoch": 0.489244495818809, + "grad_norm": 6.112176979438547, + "learning_rate": 4.419969863719182e-06, + "loss": 0.8796, + "step": 6772 + }, + { + "epoch": 0.48931674101901856, + "grad_norm": 6.326110066944897, + "learning_rate": 4.419782513241565e-06, + "loss": 0.8915, + "step": 6773 + }, + { + "epoch": 0.48938898621922805, + "grad_norm": 5.95882689381387, + "learning_rate": 4.419595136483615e-06, + "loss": 0.9363, + "step": 6774 + }, + { + "epoch": 0.4894612314194376, + "grad_norm": 6.600920480060491, + "learning_rate": 4.419407733447899e-06, + "loss": 0.8146, + "step": 6775 + }, + { + "epoch": 0.4895334766196471, + "grad_norm": 7.418440817325178, + "learning_rate": 4.41922030413698e-06, + "loss": 0.8698, + "step": 6776 + }, + { + "epoch": 0.4896057218198566, + "grad_norm": 5.355974636767092, + "learning_rate": 4.419032848553426e-06, + "loss": 0.9129, + "step": 6777 + }, + { + "epoch": 0.4896779670200661, + "grad_norm": 7.417707248797125, + "learning_rate": 4.418845366699803e-06, + "loss": 0.9056, + "step": 6778 + }, + { + "epoch": 0.4897502122202756, + "grad_norm": 6.123715947296529, + "learning_rate": 4.418657858578677e-06, + "loss": 0.9025, + "step": 6779 + }, + { + "epoch": 0.48982245742048514, + "grad_norm": 7.321541698259695, + "learning_rate": 4.418470324192613e-06, + "loss": 0.9932, + "step": 6780 + }, + { + "epoch": 0.4898947026206946, + "grad_norm": 7.381348286102747, + "learning_rate": 4.418282763544181e-06, + "loss": 0.9639, + "step": 6781 + }, + { + "epoch": 0.48996694782090416, + "grad_norm": 6.185597136774796, + "learning_rate": 4.418095176635947e-06, + "loss": 0.8707, + "step": 6782 + }, + { + "epoch": 0.49003919302111365, + "grad_norm": 5.362577943746719, + "learning_rate": 4.4179075634704795e-06, + "loss": 0.8793, + "step": 6783 + }, + { + "epoch": 0.4901114382213232, + "grad_norm": 6.347764121679572, + "learning_rate": 4.417719924050347e-06, + "loss": 0.9553, + "step": 6784 + }, + { + "epoch": 0.4901836834215327, + "grad_norm": 6.019336377949666, + "learning_rate": 4.417532258378117e-06, + "loss": 0.8959, + "step": 6785 + }, + { + "epoch": 0.4902559286217422, + "grad_norm": 6.179035909567812, + "learning_rate": 4.4173445664563595e-06, + "loss": 0.9413, + "step": 6786 + }, + { + "epoch": 0.4903281738219517, + "grad_norm": 7.721890324230419, + "learning_rate": 4.417156848287644e-06, + "loss": 0.8818, + "step": 6787 + }, + { + "epoch": 0.4904004190221612, + "grad_norm": 4.8541842393461545, + "learning_rate": 4.41696910387454e-06, + "loss": 0.8295, + "step": 6788 + }, + { + "epoch": 0.49047266422237074, + "grad_norm": 5.488696360246698, + "learning_rate": 4.416781333219617e-06, + "loss": 0.8642, + "step": 6789 + }, + { + "epoch": 0.4905449094225802, + "grad_norm": 6.83080862282427, + "learning_rate": 4.416593536325445e-06, + "loss": 0.9461, + "step": 6790 + }, + { + "epoch": 0.49061715462278976, + "grad_norm": 5.227440456669721, + "learning_rate": 4.416405713194597e-06, + "loss": 0.826, + "step": 6791 + }, + { + "epoch": 0.49068939982299925, + "grad_norm": 6.772850635980738, + "learning_rate": 4.416217863829642e-06, + "loss": 0.8831, + "step": 6792 + }, + { + "epoch": 0.4907616450232088, + "grad_norm": 5.868171457002538, + "learning_rate": 4.416029988233152e-06, + "loss": 0.8857, + "step": 6793 + }, + { + "epoch": 0.4908338902234183, + "grad_norm": 8.130890105109476, + "learning_rate": 4.4158420864077e-06, + "loss": 0.847, + "step": 6794 + }, + { + "epoch": 0.49090613542362777, + "grad_norm": 8.211846647009667, + "learning_rate": 4.415654158355856e-06, + "loss": 0.9166, + "step": 6795 + }, + { + "epoch": 0.4909783806238373, + "grad_norm": 6.165446117685088, + "learning_rate": 4.415466204080196e-06, + "loss": 0.8587, + "step": 6796 + }, + { + "epoch": 0.4910506258240468, + "grad_norm": 6.143856683311253, + "learning_rate": 4.41527822358329e-06, + "loss": 1.0003, + "step": 6797 + }, + { + "epoch": 0.49112287102425634, + "grad_norm": 7.09817837194524, + "learning_rate": 4.415090216867712e-06, + "loss": 0.9167, + "step": 6798 + }, + { + "epoch": 0.4911951162244658, + "grad_norm": 7.399614540577171, + "learning_rate": 4.4149021839360365e-06, + "loss": 0.8473, + "step": 6799 + }, + { + "epoch": 0.49126736142467536, + "grad_norm": 6.5576508544603005, + "learning_rate": 4.414714124790837e-06, + "loss": 0.848, + "step": 6800 + }, + { + "epoch": 0.49133960662488485, + "grad_norm": 5.7932258903862035, + "learning_rate": 4.414526039434687e-06, + "loss": 0.874, + "step": 6801 + }, + { + "epoch": 0.4914118518250944, + "grad_norm": 9.079702742496025, + "learning_rate": 4.4143379278701615e-06, + "loss": 0.9104, + "step": 6802 + }, + { + "epoch": 0.4914840970253039, + "grad_norm": 6.273077207411565, + "learning_rate": 4.4141497900998374e-06, + "loss": 0.8947, + "step": 6803 + }, + { + "epoch": 0.49155634222551337, + "grad_norm": 5.474850287633176, + "learning_rate": 4.413961626126288e-06, + "loss": 0.8295, + "step": 6804 + }, + { + "epoch": 0.4916285874257229, + "grad_norm": 6.787613234444797, + "learning_rate": 4.413773435952092e-06, + "loss": 0.8911, + "step": 6805 + }, + { + "epoch": 0.4917008326259324, + "grad_norm": 5.314632391718777, + "learning_rate": 4.413585219579821e-06, + "loss": 0.9056, + "step": 6806 + }, + { + "epoch": 0.49177307782614194, + "grad_norm": 6.76135513853001, + "learning_rate": 4.413396977012055e-06, + "loss": 0.8994, + "step": 6807 + }, + { + "epoch": 0.4918453230263514, + "grad_norm": 8.563534862047103, + "learning_rate": 4.413208708251371e-06, + "loss": 0.9302, + "step": 6808 + }, + { + "epoch": 0.49191756822656096, + "grad_norm": 8.21554677061937, + "learning_rate": 4.4130204133003445e-06, + "loss": 0.8827, + "step": 6809 + }, + { + "epoch": 0.49198981342677045, + "grad_norm": 5.930019766403114, + "learning_rate": 4.412832092161554e-06, + "loss": 0.8729, + "step": 6810 + }, + { + "epoch": 0.49206205862698, + "grad_norm": 5.825454550104398, + "learning_rate": 4.412643744837578e-06, + "loss": 0.8508, + "step": 6811 + }, + { + "epoch": 0.4921343038271895, + "grad_norm": 6.492250077259334, + "learning_rate": 4.412455371330994e-06, + "loss": 0.9005, + "step": 6812 + }, + { + "epoch": 0.49220654902739897, + "grad_norm": 6.864025164666986, + "learning_rate": 4.41226697164438e-06, + "loss": 0.9436, + "step": 6813 + }, + { + "epoch": 0.4922787942276085, + "grad_norm": 5.685039616885413, + "learning_rate": 4.4120785457803165e-06, + "loss": 0.8036, + "step": 6814 + }, + { + "epoch": 0.492351039427818, + "grad_norm": 4.696357191911987, + "learning_rate": 4.411890093741382e-06, + "loss": 0.8203, + "step": 6815 + }, + { + "epoch": 0.49242328462802754, + "grad_norm": 5.901522566034304, + "learning_rate": 4.411701615530157e-06, + "loss": 0.8386, + "step": 6816 + }, + { + "epoch": 0.492495529828237, + "grad_norm": 7.622059739842924, + "learning_rate": 4.411513111149222e-06, + "loss": 0.8149, + "step": 6817 + }, + { + "epoch": 0.49256777502844656, + "grad_norm": 8.354714493773018, + "learning_rate": 4.411324580601155e-06, + "loss": 0.8629, + "step": 6818 + }, + { + "epoch": 0.49264002022865605, + "grad_norm": 5.559505728288361, + "learning_rate": 4.411136023888539e-06, + "loss": 0.8543, + "step": 6819 + }, + { + "epoch": 0.4927122654288656, + "grad_norm": 7.374130456385551, + "learning_rate": 4.410947441013956e-06, + "loss": 0.9366, + "step": 6820 + }, + { + "epoch": 0.4927845106290751, + "grad_norm": 6.0957372310604665, + "learning_rate": 4.410758831979985e-06, + "loss": 0.9117, + "step": 6821 + }, + { + "epoch": 0.49285675582928457, + "grad_norm": 6.958478945124166, + "learning_rate": 4.41057019678921e-06, + "loss": 0.8542, + "step": 6822 + }, + { + "epoch": 0.4929290010294941, + "grad_norm": 5.8140463464108105, + "learning_rate": 4.410381535444212e-06, + "loss": 0.899, + "step": 6823 + }, + { + "epoch": 0.4930012462297036, + "grad_norm": 7.917032601448748, + "learning_rate": 4.410192847947574e-06, + "loss": 0.9571, + "step": 6824 + }, + { + "epoch": 0.49307349142991314, + "grad_norm": 6.438865146514563, + "learning_rate": 4.410004134301879e-06, + "loss": 0.9269, + "step": 6825 + }, + { + "epoch": 0.4931457366301226, + "grad_norm": 7.302363224805528, + "learning_rate": 4.409815394509711e-06, + "loss": 0.9521, + "step": 6826 + }, + { + "epoch": 0.49321798183033216, + "grad_norm": 7.0835788759046165, + "learning_rate": 4.409626628573654e-06, + "loss": 0.8655, + "step": 6827 + }, + { + "epoch": 0.49329022703054165, + "grad_norm": 7.307293464803622, + "learning_rate": 4.4094378364962886e-06, + "loss": 0.8839, + "step": 6828 + }, + { + "epoch": 0.4933624722307512, + "grad_norm": 6.898276857469184, + "learning_rate": 4.409249018280204e-06, + "loss": 0.9064, + "step": 6829 + }, + { + "epoch": 0.4934347174309607, + "grad_norm": 5.331968828331962, + "learning_rate": 4.4090601739279815e-06, + "loss": 0.8177, + "step": 6830 + }, + { + "epoch": 0.49350696263117017, + "grad_norm": 5.900273281169311, + "learning_rate": 4.408871303442208e-06, + "loss": 0.8224, + "step": 6831 + }, + { + "epoch": 0.4935792078313797, + "grad_norm": 6.708910741499126, + "learning_rate": 4.408682406825469e-06, + "loss": 0.8275, + "step": 6832 + }, + { + "epoch": 0.4936514530315892, + "grad_norm": 5.99746364390082, + "learning_rate": 4.408493484080349e-06, + "loss": 0.8353, + "step": 6833 + }, + { + "epoch": 0.49372369823179874, + "grad_norm": 6.556731095313426, + "learning_rate": 4.4083045352094355e-06, + "loss": 0.8841, + "step": 6834 + }, + { + "epoch": 0.4937959434320082, + "grad_norm": 8.569522400482029, + "learning_rate": 4.408115560215314e-06, + "loss": 0.9538, + "step": 6835 + }, + { + "epoch": 0.49386818863221776, + "grad_norm": 6.412561201431872, + "learning_rate": 4.4079265591005725e-06, + "loss": 0.9109, + "step": 6836 + }, + { + "epoch": 0.49394043383242725, + "grad_norm": 6.884847073986301, + "learning_rate": 4.407737531867798e-06, + "loss": 0.9438, + "step": 6837 + }, + { + "epoch": 0.4940126790326368, + "grad_norm": 5.753066199073149, + "learning_rate": 4.407548478519578e-06, + "loss": 0.7883, + "step": 6838 + }, + { + "epoch": 0.4940849242328463, + "grad_norm": 8.062836869173669, + "learning_rate": 4.4073593990585005e-06, + "loss": 0.8667, + "step": 6839 + }, + { + "epoch": 0.49415716943305577, + "grad_norm": 5.05510595555858, + "learning_rate": 4.407170293487153e-06, + "loss": 0.8769, + "step": 6840 + }, + { + "epoch": 0.4942294146332653, + "grad_norm": 5.3346841214164655, + "learning_rate": 4.406981161808126e-06, + "loss": 0.7886, + "step": 6841 + }, + { + "epoch": 0.4943016598334748, + "grad_norm": 7.518447314326978, + "learning_rate": 4.406792004024007e-06, + "loss": 0.8959, + "step": 6842 + }, + { + "epoch": 0.49437390503368434, + "grad_norm": 6.966425349727097, + "learning_rate": 4.406602820137385e-06, + "loss": 0.9125, + "step": 6843 + }, + { + "epoch": 0.4944461502338938, + "grad_norm": 8.18475948215717, + "learning_rate": 4.406413610150852e-06, + "loss": 0.9365, + "step": 6844 + }, + { + "epoch": 0.49451839543410336, + "grad_norm": 6.3942350529865415, + "learning_rate": 4.406224374066998e-06, + "loss": 0.8499, + "step": 6845 + }, + { + "epoch": 0.49459064063431285, + "grad_norm": 7.677270798139075, + "learning_rate": 4.40603511188841e-06, + "loss": 0.8627, + "step": 6846 + }, + { + "epoch": 0.4946628858345224, + "grad_norm": 6.213125167492107, + "learning_rate": 4.405845823617683e-06, + "loss": 0.9048, + "step": 6847 + }, + { + "epoch": 0.4947351310347319, + "grad_norm": 7.3012365992128725, + "learning_rate": 4.405656509257406e-06, + "loss": 0.9067, + "step": 6848 + }, + { + "epoch": 0.49480737623494137, + "grad_norm": 6.236377059978695, + "learning_rate": 4.405467168810172e-06, + "loss": 0.8654, + "step": 6849 + }, + { + "epoch": 0.4948796214351509, + "grad_norm": 6.50516041070427, + "learning_rate": 4.40527780227857e-06, + "loss": 0.945, + "step": 6850 + }, + { + "epoch": 0.4949518666353604, + "grad_norm": 6.71822876793581, + "learning_rate": 4.4050884096651955e-06, + "loss": 0.8254, + "step": 6851 + }, + { + "epoch": 0.49502411183556994, + "grad_norm": 7.8162390954609435, + "learning_rate": 4.404898990972639e-06, + "loss": 0.9859, + "step": 6852 + }, + { + "epoch": 0.4950963570357794, + "grad_norm": 10.936837661307091, + "learning_rate": 4.404709546203495e-06, + "loss": 0.9774, + "step": 6853 + }, + { + "epoch": 0.49516860223598896, + "grad_norm": 7.734671908758174, + "learning_rate": 4.404520075360357e-06, + "loss": 0.8629, + "step": 6854 + }, + { + "epoch": 0.49524084743619845, + "grad_norm": 7.066311778148209, + "learning_rate": 4.404330578445816e-06, + "loss": 0.8519, + "step": 6855 + }, + { + "epoch": 0.495313092636408, + "grad_norm": 8.340345509642562, + "learning_rate": 4.40414105546247e-06, + "loss": 0.9287, + "step": 6856 + }, + { + "epoch": 0.4953853378366175, + "grad_norm": 11.933157725258576, + "learning_rate": 4.4039515064129105e-06, + "loss": 0.8801, + "step": 6857 + }, + { + "epoch": 0.49545758303682697, + "grad_norm": 7.978280624981615, + "learning_rate": 4.403761931299733e-06, + "loss": 0.9138, + "step": 6858 + }, + { + "epoch": 0.4955298282370365, + "grad_norm": 5.9614484775463294, + "learning_rate": 4.403572330125533e-06, + "loss": 0.8201, + "step": 6859 + }, + { + "epoch": 0.495602073437246, + "grad_norm": 8.154554870544361, + "learning_rate": 4.403382702892905e-06, + "loss": 0.9439, + "step": 6860 + }, + { + "epoch": 0.49567431863745554, + "grad_norm": 6.1424213140755315, + "learning_rate": 4.403193049604447e-06, + "loss": 0.9249, + "step": 6861 + }, + { + "epoch": 0.495746563837665, + "grad_norm": 9.655781854929225, + "learning_rate": 4.403003370262754e-06, + "loss": 0.8806, + "step": 6862 + }, + { + "epoch": 0.49581880903787456, + "grad_norm": 6.034468192472871, + "learning_rate": 4.402813664870421e-06, + "loss": 0.8597, + "step": 6863 + }, + { + "epoch": 0.49589105423808405, + "grad_norm": 5.777000061087982, + "learning_rate": 4.402623933430048e-06, + "loss": 0.8379, + "step": 6864 + }, + { + "epoch": 0.4959632994382936, + "grad_norm": 7.228844920390577, + "learning_rate": 4.4024341759442295e-06, + "loss": 0.929, + "step": 6865 + }, + { + "epoch": 0.4960355446385031, + "grad_norm": 9.20541607199771, + "learning_rate": 4.402244392415565e-06, + "loss": 0.9212, + "step": 6866 + }, + { + "epoch": 0.49610778983871257, + "grad_norm": 7.362879734576672, + "learning_rate": 4.402054582846651e-06, + "loss": 0.8654, + "step": 6867 + }, + { + "epoch": 0.4961800350389221, + "grad_norm": 10.369645908825145, + "learning_rate": 4.401864747240087e-06, + "loss": 0.9, + "step": 6868 + }, + { + "epoch": 0.4962522802391316, + "grad_norm": 6.3725521399488905, + "learning_rate": 4.401674885598471e-06, + "loss": 0.9179, + "step": 6869 + }, + { + "epoch": 0.49632452543934114, + "grad_norm": 5.802611769257578, + "learning_rate": 4.401484997924403e-06, + "loss": 0.9305, + "step": 6870 + }, + { + "epoch": 0.4963967706395506, + "grad_norm": 7.346233840672763, + "learning_rate": 4.401295084220482e-06, + "loss": 0.8811, + "step": 6871 + }, + { + "epoch": 0.49646901583976016, + "grad_norm": 5.909442018868214, + "learning_rate": 4.401105144489307e-06, + "loss": 0.9138, + "step": 6872 + }, + { + "epoch": 0.49654126103996965, + "grad_norm": 6.406726949661546, + "learning_rate": 4.400915178733478e-06, + "loss": 0.8442, + "step": 6873 + }, + { + "epoch": 0.4966135062401792, + "grad_norm": 6.955018340798937, + "learning_rate": 4.400725186955597e-06, + "loss": 0.9645, + "step": 6874 + }, + { + "epoch": 0.4966857514403887, + "grad_norm": 7.772390691921701, + "learning_rate": 4.400535169158264e-06, + "loss": 0.974, + "step": 6875 + }, + { + "epoch": 0.49675799664059817, + "grad_norm": 7.182934181588416, + "learning_rate": 4.40034512534408e-06, + "loss": 0.8561, + "step": 6876 + }, + { + "epoch": 0.4968302418408077, + "grad_norm": 5.762182515763511, + "learning_rate": 4.400155055515647e-06, + "loss": 0.8156, + "step": 6877 + }, + { + "epoch": 0.4969024870410172, + "grad_norm": 6.461004217649783, + "learning_rate": 4.399964959675567e-06, + "loss": 0.9253, + "step": 6878 + }, + { + "epoch": 0.49697473224122674, + "grad_norm": 7.435551235797907, + "learning_rate": 4.3997748378264415e-06, + "loss": 0.9684, + "step": 6879 + }, + { + "epoch": 0.4970469774414362, + "grad_norm": 7.345299090689871, + "learning_rate": 4.3995846899708734e-06, + "loss": 0.8631, + "step": 6880 + }, + { + "epoch": 0.49711922264164576, + "grad_norm": 8.239498043141323, + "learning_rate": 4.399394516111466e-06, + "loss": 0.96, + "step": 6881 + }, + { + "epoch": 0.49719146784185525, + "grad_norm": 7.107311846496509, + "learning_rate": 4.399204316250823e-06, + "loss": 0.8969, + "step": 6882 + }, + { + "epoch": 0.4972637130420648, + "grad_norm": 6.208357365026195, + "learning_rate": 4.399014090391546e-06, + "loss": 0.8763, + "step": 6883 + }, + { + "epoch": 0.4973359582422743, + "grad_norm": 6.6509788309074995, + "learning_rate": 4.398823838536242e-06, + "loss": 0.974, + "step": 6884 + }, + { + "epoch": 0.49740820344248377, + "grad_norm": 7.412578727728657, + "learning_rate": 4.398633560687513e-06, + "loss": 0.8613, + "step": 6885 + }, + { + "epoch": 0.4974804486426933, + "grad_norm": 8.015318985538967, + "learning_rate": 4.398443256847965e-06, + "loss": 0.8943, + "step": 6886 + }, + { + "epoch": 0.4975526938429028, + "grad_norm": 8.603474611494184, + "learning_rate": 4.398252927020203e-06, + "loss": 0.9716, + "step": 6887 + }, + { + "epoch": 0.49762493904311234, + "grad_norm": 6.9925590203324015, + "learning_rate": 4.398062571206833e-06, + "loss": 0.9396, + "step": 6888 + }, + { + "epoch": 0.4976971842433218, + "grad_norm": 7.602065066811076, + "learning_rate": 4.39787218941046e-06, + "loss": 0.9078, + "step": 6889 + }, + { + "epoch": 0.49776942944353136, + "grad_norm": 5.557055434044715, + "learning_rate": 4.39768178163369e-06, + "loss": 0.8748, + "step": 6890 + }, + { + "epoch": 0.49784167464374085, + "grad_norm": 8.329338743929965, + "learning_rate": 4.3974913478791294e-06, + "loss": 0.9315, + "step": 6891 + }, + { + "epoch": 0.4979139198439504, + "grad_norm": 7.341871695049721, + "learning_rate": 4.397300888149386e-06, + "loss": 0.8618, + "step": 6892 + }, + { + "epoch": 0.4979861650441599, + "grad_norm": 9.319434944017864, + "learning_rate": 4.397110402447067e-06, + "loss": 0.8358, + "step": 6893 + }, + { + "epoch": 0.49805841024436937, + "grad_norm": 7.258636592073008, + "learning_rate": 4.396919890774779e-06, + "loss": 0.9757, + "step": 6894 + }, + { + "epoch": 0.4981306554445789, + "grad_norm": 7.678377274501434, + "learning_rate": 4.39672935313513e-06, + "loss": 0.924, + "step": 6895 + }, + { + "epoch": 0.4982029006447884, + "grad_norm": 6.703268409742137, + "learning_rate": 4.39653878953073e-06, + "loss": 0.8243, + "step": 6896 + }, + { + "epoch": 0.49827514584499794, + "grad_norm": 8.347183264121151, + "learning_rate": 4.396348199964187e-06, + "loss": 0.8418, + "step": 6897 + }, + { + "epoch": 0.4983473910452074, + "grad_norm": 7.433979900391433, + "learning_rate": 4.396157584438107e-06, + "loss": 0.919, + "step": 6898 + }, + { + "epoch": 0.49841963624541696, + "grad_norm": 6.443751586558667, + "learning_rate": 4.395966942955105e-06, + "loss": 0.8689, + "step": 6899 + }, + { + "epoch": 0.49849188144562645, + "grad_norm": 6.682437584924831, + "learning_rate": 4.395776275517786e-06, + "loss": 0.9429, + "step": 6900 + }, + { + "epoch": 0.498564126645836, + "grad_norm": 6.461101340784609, + "learning_rate": 4.395585582128762e-06, + "loss": 0.884, + "step": 6901 + }, + { + "epoch": 0.4986363718460455, + "grad_norm": 6.694556960360246, + "learning_rate": 4.395394862790643e-06, + "loss": 0.8094, + "step": 6902 + }, + { + "epoch": 0.49870861704625497, + "grad_norm": 11.271115979348357, + "learning_rate": 4.395204117506041e-06, + "loss": 0.9055, + "step": 6903 + }, + { + "epoch": 0.4987808622464645, + "grad_norm": 10.724243535749386, + "learning_rate": 4.395013346277565e-06, + "loss": 0.8605, + "step": 6904 + }, + { + "epoch": 0.498853107446674, + "grad_norm": 7.317174737737108, + "learning_rate": 4.394822549107828e-06, + "loss": 0.9193, + "step": 6905 + }, + { + "epoch": 0.49892535264688354, + "grad_norm": 5.65324026720949, + "learning_rate": 4.394631725999442e-06, + "loss": 0.8705, + "step": 6906 + }, + { + "epoch": 0.498997597847093, + "grad_norm": 8.621436668389485, + "learning_rate": 4.394440876955018e-06, + "loss": 0.8867, + "step": 6907 + }, + { + "epoch": 0.49906984304730256, + "grad_norm": 10.436369874727006, + "learning_rate": 4.39425000197717e-06, + "loss": 0.9431, + "step": 6908 + }, + { + "epoch": 0.49914208824751205, + "grad_norm": 9.400090935449857, + "learning_rate": 4.39405910106851e-06, + "loss": 0.9361, + "step": 6909 + }, + { + "epoch": 0.4992143334477216, + "grad_norm": 7.700581902648936, + "learning_rate": 4.393868174231651e-06, + "loss": 0.8319, + "step": 6910 + }, + { + "epoch": 0.4992865786479311, + "grad_norm": 5.674280586520246, + "learning_rate": 4.393677221469208e-06, + "loss": 0.8573, + "step": 6911 + }, + { + "epoch": 0.49935882384814057, + "grad_norm": 6.79656125802468, + "learning_rate": 4.3934862427837945e-06, + "loss": 0.9799, + "step": 6912 + }, + { + "epoch": 0.4994310690483501, + "grad_norm": 7.782044151869136, + "learning_rate": 4.393295238178023e-06, + "loss": 0.9293, + "step": 6913 + }, + { + "epoch": 0.4995033142485596, + "grad_norm": 7.540467758847377, + "learning_rate": 4.393104207654512e-06, + "loss": 0.9122, + "step": 6914 + }, + { + "epoch": 0.49957555944876914, + "grad_norm": 7.754810931952631, + "learning_rate": 4.392913151215872e-06, + "loss": 0.8475, + "step": 6915 + }, + { + "epoch": 0.4996478046489786, + "grad_norm": 6.20743655015256, + "learning_rate": 4.392722068864722e-06, + "loss": 0.8711, + "step": 6916 + }, + { + "epoch": 0.49972004984918816, + "grad_norm": 5.388085162253678, + "learning_rate": 4.3925309606036765e-06, + "loss": 0.8052, + "step": 6917 + }, + { + "epoch": 0.49979229504939765, + "grad_norm": 5.415786710813258, + "learning_rate": 4.392339826435351e-06, + "loss": 0.8617, + "step": 6918 + }, + { + "epoch": 0.4998645402496072, + "grad_norm": 9.222734010684464, + "learning_rate": 4.392148666362363e-06, + "loss": 0.984, + "step": 6919 + }, + { + "epoch": 0.4999367854498167, + "grad_norm": 8.38242841909025, + "learning_rate": 4.391957480387329e-06, + "loss": 0.8758, + "step": 6920 + }, + { + "epoch": 0.5000090306500262, + "grad_norm": 5.652109221659154, + "learning_rate": 4.391766268512866e-06, + "loss": 0.812, + "step": 6921 + }, + { + "epoch": 0.5000812758502357, + "grad_norm": 6.648863800044559, + "learning_rate": 4.391575030741592e-06, + "loss": 0.8428, + "step": 6922 + }, + { + "epoch": 0.5001535210504452, + "grad_norm": 6.0846310977497335, + "learning_rate": 4.391383767076124e-06, + "loss": 0.9815, + "step": 6923 + }, + { + "epoch": 0.5002257662506547, + "grad_norm": 6.448927726625736, + "learning_rate": 4.3911924775190805e-06, + "loss": 0.8512, + "step": 6924 + }, + { + "epoch": 0.5002980114508643, + "grad_norm": 5.724476099821856, + "learning_rate": 4.391001162073081e-06, + "loss": 0.8496, + "step": 6925 + }, + { + "epoch": 0.5003702566510737, + "grad_norm": 6.404683437937081, + "learning_rate": 4.390809820740744e-06, + "loss": 0.8255, + "step": 6926 + }, + { + "epoch": 0.5004425018512833, + "grad_norm": 5.974796767349733, + "learning_rate": 4.390618453524688e-06, + "loss": 0.944, + "step": 6927 + }, + { + "epoch": 0.5005147470514928, + "grad_norm": 7.776881450897806, + "learning_rate": 4.390427060427534e-06, + "loss": 0.8524, + "step": 6928 + }, + { + "epoch": 0.5005869922517022, + "grad_norm": 6.398706675027158, + "learning_rate": 4.3902356414519005e-06, + "loss": 0.8422, + "step": 6929 + }, + { + "epoch": 0.5006592374519118, + "grad_norm": 7.206504575098664, + "learning_rate": 4.390044196600409e-06, + "loss": 0.8835, + "step": 6930 + }, + { + "epoch": 0.5007314826521213, + "grad_norm": 7.383609466326121, + "learning_rate": 4.389852725875681e-06, + "loss": 0.9299, + "step": 6931 + }, + { + "epoch": 0.5008037278523308, + "grad_norm": 6.496376788406991, + "learning_rate": 4.389661229280335e-06, + "loss": 0.8886, + "step": 6932 + }, + { + "epoch": 0.5008759730525403, + "grad_norm": 6.872365828363433, + "learning_rate": 4.389469706816995e-06, + "loss": 0.8547, + "step": 6933 + }, + { + "epoch": 0.5009482182527498, + "grad_norm": 5.635072463753499, + "learning_rate": 4.389278158488282e-06, + "loss": 0.8267, + "step": 6934 + }, + { + "epoch": 0.5010204634529594, + "grad_norm": 5.993185305919047, + "learning_rate": 4.389086584296817e-06, + "loss": 0.9444, + "step": 6935 + }, + { + "epoch": 0.5010927086531689, + "grad_norm": 7.452193163953875, + "learning_rate": 4.3888949842452246e-06, + "loss": 0.9232, + "step": 6936 + }, + { + "epoch": 0.5011649538533783, + "grad_norm": 6.3329083484141835, + "learning_rate": 4.388703358336125e-06, + "loss": 0.8767, + "step": 6937 + }, + { + "epoch": 0.5012371990535879, + "grad_norm": 7.826196490087943, + "learning_rate": 4.388511706572145e-06, + "loss": 0.9234, + "step": 6938 + }, + { + "epoch": 0.5013094442537974, + "grad_norm": 6.609748414946421, + "learning_rate": 4.388320028955904e-06, + "loss": 0.9129, + "step": 6939 + }, + { + "epoch": 0.5013816894540069, + "grad_norm": 4.976806826260497, + "learning_rate": 4.388128325490029e-06, + "loss": 0.8158, + "step": 6940 + }, + { + "epoch": 0.5014539346542164, + "grad_norm": 7.709610319469861, + "learning_rate": 4.387936596177143e-06, + "loss": 0.9236, + "step": 6941 + }, + { + "epoch": 0.5015261798544259, + "grad_norm": 6.972884931474068, + "learning_rate": 4.387744841019871e-06, + "loss": 0.9652, + "step": 6942 + }, + { + "epoch": 0.5015984250546355, + "grad_norm": 8.699863336300893, + "learning_rate": 4.387553060020838e-06, + "loss": 0.9957, + "step": 6943 + }, + { + "epoch": 0.5016706702548449, + "grad_norm": 7.482115211266662, + "learning_rate": 4.387361253182669e-06, + "loss": 0.8312, + "step": 6944 + }, + { + "epoch": 0.5017429154550545, + "grad_norm": 6.398150129001522, + "learning_rate": 4.38716942050799e-06, + "loss": 0.8934, + "step": 6945 + }, + { + "epoch": 0.501815160655264, + "grad_norm": 6.128740375284834, + "learning_rate": 4.386977561999427e-06, + "loss": 0.9033, + "step": 6946 + }, + { + "epoch": 0.5018874058554734, + "grad_norm": 8.342905523305461, + "learning_rate": 4.386785677659606e-06, + "loss": 0.894, + "step": 6947 + }, + { + "epoch": 0.501959651055683, + "grad_norm": 6.884172458933313, + "learning_rate": 4.386593767491154e-06, + "loss": 0.9149, + "step": 6948 + }, + { + "epoch": 0.5020318962558925, + "grad_norm": 8.891601079849657, + "learning_rate": 4.386401831496698e-06, + "loss": 0.8203, + "step": 6949 + }, + { + "epoch": 0.502104141456102, + "grad_norm": 6.809842492521337, + "learning_rate": 4.386209869678867e-06, + "loss": 0.8643, + "step": 6950 + }, + { + "epoch": 0.5021763866563115, + "grad_norm": 5.778659879562225, + "learning_rate": 4.386017882040286e-06, + "loss": 0.881, + "step": 6951 + }, + { + "epoch": 0.502248631856521, + "grad_norm": 7.390024689051123, + "learning_rate": 4.385825868583585e-06, + "loss": 0.8683, + "step": 6952 + }, + { + "epoch": 0.5023208770567306, + "grad_norm": 6.5260131867790205, + "learning_rate": 4.385633829311392e-06, + "loss": 0.8349, + "step": 6953 + }, + { + "epoch": 0.5023931222569401, + "grad_norm": 6.36786087313545, + "learning_rate": 4.385441764226337e-06, + "loss": 0.7764, + "step": 6954 + }, + { + "epoch": 0.5024653674571495, + "grad_norm": 7.646469408510257, + "learning_rate": 4.385249673331047e-06, + "loss": 0.9013, + "step": 6955 + }, + { + "epoch": 0.5025376126573591, + "grad_norm": 5.5040630591683595, + "learning_rate": 4.385057556628153e-06, + "loss": 0.8197, + "step": 6956 + }, + { + "epoch": 0.5026098578575686, + "grad_norm": 6.967006585013053, + "learning_rate": 4.384865414120285e-06, + "loss": 0.9262, + "step": 6957 + }, + { + "epoch": 0.502682103057778, + "grad_norm": 6.564813405949299, + "learning_rate": 4.384673245810073e-06, + "loss": 0.8728, + "step": 6958 + }, + { + "epoch": 0.5027543482579876, + "grad_norm": 8.67421711403997, + "learning_rate": 4.3844810517001465e-06, + "loss": 0.8898, + "step": 6959 + }, + { + "epoch": 0.5028265934581971, + "grad_norm": 6.509529171279462, + "learning_rate": 4.384288831793138e-06, + "loss": 0.8841, + "step": 6960 + }, + { + "epoch": 0.5028988386584067, + "grad_norm": 5.929011330230691, + "learning_rate": 4.3840965860916805e-06, + "loss": 0.868, + "step": 6961 + }, + { + "epoch": 0.5029710838586161, + "grad_norm": 6.0895386840283185, + "learning_rate": 4.3839043145984015e-06, + "loss": 0.9412, + "step": 6962 + }, + { + "epoch": 0.5030433290588257, + "grad_norm": 8.507209413169027, + "learning_rate": 4.383712017315936e-06, + "loss": 0.8656, + "step": 6963 + }, + { + "epoch": 0.5031155742590352, + "grad_norm": 6.265541589804845, + "learning_rate": 4.383519694246915e-06, + "loss": 0.8741, + "step": 6964 + }, + { + "epoch": 0.5031878194592446, + "grad_norm": 6.781932603083259, + "learning_rate": 4.383327345393973e-06, + "loss": 0.948, + "step": 6965 + }, + { + "epoch": 0.5032600646594542, + "grad_norm": 7.230451867214217, + "learning_rate": 4.383134970759741e-06, + "loss": 0.8576, + "step": 6966 + }, + { + "epoch": 0.5033323098596637, + "grad_norm": 5.639496666653704, + "learning_rate": 4.382942570346853e-06, + "loss": 0.7842, + "step": 6967 + }, + { + "epoch": 0.5034045550598732, + "grad_norm": 5.627170906764822, + "learning_rate": 4.382750144157945e-06, + "loss": 0.8304, + "step": 6968 + }, + { + "epoch": 0.5034768002600827, + "grad_norm": 6.175456338306354, + "learning_rate": 4.382557692195648e-06, + "loss": 0.8348, + "step": 6969 + }, + { + "epoch": 0.5035490454602922, + "grad_norm": 7.688386943483309, + "learning_rate": 4.382365214462598e-06, + "loss": 0.8531, + "step": 6970 + }, + { + "epoch": 0.5036212906605018, + "grad_norm": 6.782372447697568, + "learning_rate": 4.3821727109614295e-06, + "loss": 0.898, + "step": 6971 + }, + { + "epoch": 0.5036935358607113, + "grad_norm": 5.532809107816314, + "learning_rate": 4.3819801816947785e-06, + "loss": 0.8661, + "step": 6972 + }, + { + "epoch": 0.5037657810609207, + "grad_norm": 5.403434593637166, + "learning_rate": 4.38178762666528e-06, + "loss": 0.9564, + "step": 6973 + }, + { + "epoch": 0.5038380262611303, + "grad_norm": 5.604822207922556, + "learning_rate": 4.381595045875571e-06, + "loss": 0.8037, + "step": 6974 + }, + { + "epoch": 0.5039102714613398, + "grad_norm": 6.886069801120826, + "learning_rate": 4.381402439328285e-06, + "loss": 0.8877, + "step": 6975 + }, + { + "epoch": 0.5039825166615493, + "grad_norm": 7.4156305968162615, + "learning_rate": 4.381209807026061e-06, + "loss": 0.9497, + "step": 6976 + }, + { + "epoch": 0.5040547618617588, + "grad_norm": 5.597528130076006, + "learning_rate": 4.381017148971536e-06, + "loss": 0.8537, + "step": 6977 + }, + { + "epoch": 0.5041270070619683, + "grad_norm": 6.670613010952931, + "learning_rate": 4.380824465167346e-06, + "loss": 0.9299, + "step": 6978 + }, + { + "epoch": 0.5041992522621779, + "grad_norm": 6.178563610169609, + "learning_rate": 4.380631755616129e-06, + "loss": 0.8785, + "step": 6979 + }, + { + "epoch": 0.5042714974623873, + "grad_norm": 6.845471030748176, + "learning_rate": 4.380439020320525e-06, + "loss": 0.9619, + "step": 6980 + }, + { + "epoch": 0.5043437426625969, + "grad_norm": 7.275858311310066, + "learning_rate": 4.38024625928317e-06, + "loss": 0.8834, + "step": 6981 + }, + { + "epoch": 0.5044159878628064, + "grad_norm": 7.557948989377038, + "learning_rate": 4.380053472506703e-06, + "loss": 0.9666, + "step": 6982 + }, + { + "epoch": 0.5044882330630158, + "grad_norm": 5.9294613680737065, + "learning_rate": 4.379860659993765e-06, + "loss": 0.8562, + "step": 6983 + }, + { + "epoch": 0.5045604782632254, + "grad_norm": 5.479131468839932, + "learning_rate": 4.379667821746994e-06, + "loss": 0.9547, + "step": 6984 + }, + { + "epoch": 0.5046327234634349, + "grad_norm": 6.810161504188779, + "learning_rate": 4.3794749577690295e-06, + "loss": 0.9053, + "step": 6985 + }, + { + "epoch": 0.5047049686636444, + "grad_norm": 6.399027404793241, + "learning_rate": 4.379282068062512e-06, + "loss": 0.769, + "step": 6986 + }, + { + "epoch": 0.5047772138638539, + "grad_norm": 7.3276838157605955, + "learning_rate": 4.379089152630083e-06, + "loss": 0.8447, + "step": 6987 + }, + { + "epoch": 0.5048494590640634, + "grad_norm": 5.644466708649173, + "learning_rate": 4.378896211474382e-06, + "loss": 0.8923, + "step": 6988 + }, + { + "epoch": 0.504921704264273, + "grad_norm": 6.751125383164571, + "learning_rate": 4.378703244598051e-06, + "loss": 0.8327, + "step": 6989 + }, + { + "epoch": 0.5049939494644825, + "grad_norm": 6.685795647029446, + "learning_rate": 4.37851025200373e-06, + "loss": 0.9836, + "step": 6990 + }, + { + "epoch": 0.5050661946646919, + "grad_norm": 6.875188928955908, + "learning_rate": 4.378317233694064e-06, + "loss": 0.9376, + "step": 6991 + }, + { + "epoch": 0.5051384398649015, + "grad_norm": 9.008411291622037, + "learning_rate": 4.378124189671693e-06, + "loss": 0.9188, + "step": 6992 + }, + { + "epoch": 0.505210685065111, + "grad_norm": 9.187737494273108, + "learning_rate": 4.3779311199392595e-06, + "loss": 0.8548, + "step": 6993 + }, + { + "epoch": 0.5052829302653205, + "grad_norm": 6.292233708915096, + "learning_rate": 4.3777380244994076e-06, + "loss": 0.9179, + "step": 6994 + }, + { + "epoch": 0.50535517546553, + "grad_norm": 6.557579012011925, + "learning_rate": 4.37754490335478e-06, + "loss": 0.8835, + "step": 6995 + }, + { + "epoch": 0.5054274206657395, + "grad_norm": 5.844451372620345, + "learning_rate": 4.37735175650802e-06, + "loss": 0.8499, + "step": 6996 + }, + { + "epoch": 0.5054996658659491, + "grad_norm": 6.4937567438231945, + "learning_rate": 4.3771585839617725e-06, + "loss": 0.7582, + "step": 6997 + }, + { + "epoch": 0.5055719110661585, + "grad_norm": 7.020290395267614, + "learning_rate": 4.376965385718682e-06, + "loss": 0.9155, + "step": 6998 + }, + { + "epoch": 0.505644156266368, + "grad_norm": 7.301902460387678, + "learning_rate": 4.376772161781392e-06, + "loss": 0.8953, + "step": 6999 + }, + { + "epoch": 0.5057164014665776, + "grad_norm": 7.0987318921282245, + "learning_rate": 4.376578912152549e-06, + "loss": 0.805, + "step": 7000 + }, + { + "epoch": 0.505788646666787, + "grad_norm": 5.669508763584489, + "learning_rate": 4.376385636834797e-06, + "loss": 0.796, + "step": 7001 + }, + { + "epoch": 0.5058608918669966, + "grad_norm": 8.168434833964138, + "learning_rate": 4.376192335830783e-06, + "loss": 0.9204, + "step": 7002 + }, + { + "epoch": 0.5059331370672061, + "grad_norm": 6.813216985367284, + "learning_rate": 4.375999009143152e-06, + "loss": 0.9142, + "step": 7003 + }, + { + "epoch": 0.5060053822674156, + "grad_norm": 7.920470224158244, + "learning_rate": 4.375805656774553e-06, + "loss": 0.8685, + "step": 7004 + }, + { + "epoch": 0.5060776274676251, + "grad_norm": 8.367433232706945, + "learning_rate": 4.375612278727629e-06, + "loss": 0.8903, + "step": 7005 + }, + { + "epoch": 0.5061498726678346, + "grad_norm": 6.0863743675438275, + "learning_rate": 4.375418875005031e-06, + "loss": 0.83, + "step": 7006 + }, + { + "epoch": 0.5062221178680442, + "grad_norm": 5.9642332872578, + "learning_rate": 4.375225445609404e-06, + "loss": 0.9878, + "step": 7007 + }, + { + "epoch": 0.5062943630682537, + "grad_norm": 6.301351541534562, + "learning_rate": 4.375031990543396e-06, + "loss": 0.8208, + "step": 7008 + }, + { + "epoch": 0.5063666082684631, + "grad_norm": 6.400803944638151, + "learning_rate": 4.374838509809657e-06, + "loss": 0.7924, + "step": 7009 + }, + { + "epoch": 0.5064388534686727, + "grad_norm": 9.174021688997433, + "learning_rate": 4.374645003410833e-06, + "loss": 0.9051, + "step": 7010 + }, + { + "epoch": 0.5065110986688822, + "grad_norm": 5.561177471542795, + "learning_rate": 4.374451471349576e-06, + "loss": 0.8258, + "step": 7011 + }, + { + "epoch": 0.5065833438690917, + "grad_norm": 6.762535323581353, + "learning_rate": 4.374257913628533e-06, + "loss": 0.9028, + "step": 7012 + }, + { + "epoch": 0.5066555890693012, + "grad_norm": 5.603554939246453, + "learning_rate": 4.374064330250355e-06, + "loss": 0.8179, + "step": 7013 + }, + { + "epoch": 0.5067278342695107, + "grad_norm": 5.8095396973737365, + "learning_rate": 4.373870721217691e-06, + "loss": 0.8882, + "step": 7014 + }, + { + "epoch": 0.5068000794697203, + "grad_norm": 5.333846166909107, + "learning_rate": 4.373677086533191e-06, + "loss": 0.8338, + "step": 7015 + }, + { + "epoch": 0.5068723246699297, + "grad_norm": 7.363217527078378, + "learning_rate": 4.373483426199508e-06, + "loss": 0.914, + "step": 7016 + }, + { + "epoch": 0.5069445698701392, + "grad_norm": 7.6907765918781985, + "learning_rate": 4.373289740219291e-06, + "loss": 0.8918, + "step": 7017 + }, + { + "epoch": 0.5070168150703488, + "grad_norm": 6.172637660552947, + "learning_rate": 4.3730960285951915e-06, + "loss": 0.9713, + "step": 7018 + }, + { + "epoch": 0.5070890602705582, + "grad_norm": 5.9885099383518, + "learning_rate": 4.372902291329863e-06, + "loss": 0.8277, + "step": 7019 + }, + { + "epoch": 0.5071613054707678, + "grad_norm": 7.125239585729924, + "learning_rate": 4.372708528425955e-06, + "loss": 0.9035, + "step": 7020 + }, + { + "epoch": 0.5072335506709773, + "grad_norm": 5.617767091860312, + "learning_rate": 4.372514739886122e-06, + "loss": 0.8004, + "step": 7021 + }, + { + "epoch": 0.5073057958711868, + "grad_norm": 9.20841001821034, + "learning_rate": 4.3723209257130154e-06, + "loss": 0.8319, + "step": 7022 + }, + { + "epoch": 0.5073780410713963, + "grad_norm": 7.497138431145694, + "learning_rate": 4.37212708590929e-06, + "loss": 0.8341, + "step": 7023 + }, + { + "epoch": 0.5074502862716058, + "grad_norm": 6.742150157413968, + "learning_rate": 4.371933220477598e-06, + "loss": 0.8442, + "step": 7024 + }, + { + "epoch": 0.5075225314718154, + "grad_norm": 6.016278754046119, + "learning_rate": 4.371739329420593e-06, + "loss": 0.8831, + "step": 7025 + }, + { + "epoch": 0.5075947766720249, + "grad_norm": 5.657412920529059, + "learning_rate": 4.371545412740931e-06, + "loss": 0.879, + "step": 7026 + }, + { + "epoch": 0.5076670218722343, + "grad_norm": 9.115973201322312, + "learning_rate": 4.371351470441265e-06, + "loss": 0.9002, + "step": 7027 + }, + { + "epoch": 0.5077392670724439, + "grad_norm": 7.79679060223297, + "learning_rate": 4.371157502524251e-06, + "loss": 0.9576, + "step": 7028 + }, + { + "epoch": 0.5078115122726534, + "grad_norm": 6.064100545809535, + "learning_rate": 4.3709635089925425e-06, + "loss": 0.8515, + "step": 7029 + }, + { + "epoch": 0.5078837574728629, + "grad_norm": 5.5183396183593745, + "learning_rate": 4.370769489848797e-06, + "loss": 0.8282, + "step": 7030 + }, + { + "epoch": 0.5079560026730724, + "grad_norm": 5.348435338894995, + "learning_rate": 4.37057544509567e-06, + "loss": 0.8978, + "step": 7031 + }, + { + "epoch": 0.5080282478732819, + "grad_norm": 6.040553689803157, + "learning_rate": 4.370381374735817e-06, + "loss": 0.8923, + "step": 7032 + }, + { + "epoch": 0.5081004930734915, + "grad_norm": 6.714721323509456, + "learning_rate": 4.370187278771894e-06, + "loss": 0.8762, + "step": 7033 + }, + { + "epoch": 0.5081727382737009, + "grad_norm": 7.274276078639146, + "learning_rate": 4.369993157206561e-06, + "loss": 0.9382, + "step": 7034 + }, + { + "epoch": 0.5082449834739104, + "grad_norm": 9.263166387681855, + "learning_rate": 4.369799010042473e-06, + "loss": 0.9246, + "step": 7035 + }, + { + "epoch": 0.50831722867412, + "grad_norm": 5.214187912294378, + "learning_rate": 4.36960483728229e-06, + "loss": 0.8832, + "step": 7036 + }, + { + "epoch": 0.5083894738743294, + "grad_norm": 7.29081209533763, + "learning_rate": 4.3694106389286665e-06, + "loss": 0.8598, + "step": 7037 + }, + { + "epoch": 0.508461719074539, + "grad_norm": 5.737016365760678, + "learning_rate": 4.369216414984263e-06, + "loss": 0.9093, + "step": 7038 + }, + { + "epoch": 0.5085339642747485, + "grad_norm": 6.1748790528348545, + "learning_rate": 4.369022165451739e-06, + "loss": 0.8043, + "step": 7039 + }, + { + "epoch": 0.508606209474958, + "grad_norm": 5.818135800504766, + "learning_rate": 4.368827890333752e-06, + "loss": 0.8884, + "step": 7040 + }, + { + "epoch": 0.5086784546751675, + "grad_norm": 6.069440506566, + "learning_rate": 4.368633589632962e-06, + "loss": 0.9014, + "step": 7041 + }, + { + "epoch": 0.508750699875377, + "grad_norm": 7.766176384552847, + "learning_rate": 4.3684392633520305e-06, + "loss": 0.8507, + "step": 7042 + }, + { + "epoch": 0.5088229450755866, + "grad_norm": 6.73108821238804, + "learning_rate": 4.368244911493615e-06, + "loss": 0.9461, + "step": 7043 + }, + { + "epoch": 0.5088951902757961, + "grad_norm": 5.810209417721809, + "learning_rate": 4.3680505340603774e-06, + "loss": 0.9193, + "step": 7044 + }, + { + "epoch": 0.5089674354760055, + "grad_norm": 5.4290874780875065, + "learning_rate": 4.3678561310549796e-06, + "loss": 0.8196, + "step": 7045 + }, + { + "epoch": 0.5090396806762151, + "grad_norm": 8.079683666268913, + "learning_rate": 4.3676617024800795e-06, + "loss": 0.936, + "step": 7046 + }, + { + "epoch": 0.5091119258764246, + "grad_norm": 7.596136526591346, + "learning_rate": 4.367467248338343e-06, + "loss": 0.9016, + "step": 7047 + }, + { + "epoch": 0.509184171076634, + "grad_norm": 7.064976080916749, + "learning_rate": 4.367272768632428e-06, + "loss": 0.9115, + "step": 7048 + }, + { + "epoch": 0.5092564162768436, + "grad_norm": 6.684612756096991, + "learning_rate": 4.367078263365e-06, + "loss": 0.8125, + "step": 7049 + }, + { + "epoch": 0.5093286614770531, + "grad_norm": 5.685136240968121, + "learning_rate": 4.36688373253872e-06, + "loss": 0.8414, + "step": 7050 + }, + { + "epoch": 0.5094009066772627, + "grad_norm": 5.964019658507217, + "learning_rate": 4.366689176156251e-06, + "loss": 0.8337, + "step": 7051 + }, + { + "epoch": 0.5094731518774721, + "grad_norm": 5.509441594734535, + "learning_rate": 4.366494594220257e-06, + "loss": 0.8651, + "step": 7052 + }, + { + "epoch": 0.5095453970776816, + "grad_norm": 6.369416428308565, + "learning_rate": 4.3662999867334e-06, + "loss": 0.9146, + "step": 7053 + }, + { + "epoch": 0.5096176422778912, + "grad_norm": 6.26257402144066, + "learning_rate": 4.366105353698346e-06, + "loss": 0.8452, + "step": 7054 + }, + { + "epoch": 0.5096898874781006, + "grad_norm": 6.374742390532679, + "learning_rate": 4.365910695117759e-06, + "loss": 0.866, + "step": 7055 + }, + { + "epoch": 0.5097621326783102, + "grad_norm": 5.918126029263164, + "learning_rate": 4.365716010994303e-06, + "loss": 0.8268, + "step": 7056 + }, + { + "epoch": 0.5098343778785197, + "grad_norm": 5.604470151414903, + "learning_rate": 4.3655213013306444e-06, + "loss": 0.8219, + "step": 7057 + }, + { + "epoch": 0.5099066230787292, + "grad_norm": 5.744919232231492, + "learning_rate": 4.3653265661294465e-06, + "loss": 0.8199, + "step": 7058 + }, + { + "epoch": 0.5099788682789387, + "grad_norm": 5.372711936636576, + "learning_rate": 4.3651318053933765e-06, + "loss": 0.7759, + "step": 7059 + }, + { + "epoch": 0.5100511134791482, + "grad_norm": 9.15778345800825, + "learning_rate": 4.364937019125101e-06, + "loss": 0.8632, + "step": 7060 + }, + { + "epoch": 0.5101233586793578, + "grad_norm": 6.069240637450719, + "learning_rate": 4.3647422073272844e-06, + "loss": 0.8944, + "step": 7061 + }, + { + "epoch": 0.5101956038795673, + "grad_norm": 6.439272738317387, + "learning_rate": 4.364547370002595e-06, + "loss": 0.8323, + "step": 7062 + }, + { + "epoch": 0.5102678490797767, + "grad_norm": 5.877921494039189, + "learning_rate": 4.364352507153701e-06, + "loss": 0.9182, + "step": 7063 + }, + { + "epoch": 0.5103400942799863, + "grad_norm": 8.110014726693628, + "learning_rate": 4.364157618783268e-06, + "loss": 0.9251, + "step": 7064 + }, + { + "epoch": 0.5104123394801958, + "grad_norm": 7.229895768423334, + "learning_rate": 4.363962704893965e-06, + "loss": 0.8967, + "step": 7065 + }, + { + "epoch": 0.5104845846804053, + "grad_norm": 8.14046473720606, + "learning_rate": 4.363767765488459e-06, + "loss": 0.8837, + "step": 7066 + }, + { + "epoch": 0.5105568298806148, + "grad_norm": 7.154548009515275, + "learning_rate": 4.36357280056942e-06, + "loss": 0.9012, + "step": 7067 + }, + { + "epoch": 0.5106290750808243, + "grad_norm": 6.013159308498717, + "learning_rate": 4.363377810139517e-06, + "loss": 0.8475, + "step": 7068 + }, + { + "epoch": 0.5107013202810339, + "grad_norm": 7.120655542263121, + "learning_rate": 4.363182794201418e-06, + "loss": 0.8111, + "step": 7069 + }, + { + "epoch": 0.5107735654812433, + "grad_norm": 8.113577927742531, + "learning_rate": 4.362987752757793e-06, + "loss": 0.8324, + "step": 7070 + }, + { + "epoch": 0.5108458106814528, + "grad_norm": 7.664046323488917, + "learning_rate": 4.362792685811312e-06, + "loss": 0.8789, + "step": 7071 + }, + { + "epoch": 0.5109180558816624, + "grad_norm": 9.321446569599718, + "learning_rate": 4.362597593364646e-06, + "loss": 0.9056, + "step": 7072 + }, + { + "epoch": 0.5109903010818718, + "grad_norm": 7.146618621057357, + "learning_rate": 4.362402475420465e-06, + "loss": 0.8414, + "step": 7073 + }, + { + "epoch": 0.5110625462820814, + "grad_norm": 7.781269854784534, + "learning_rate": 4.36220733198144e-06, + "loss": 0.8839, + "step": 7074 + }, + { + "epoch": 0.5111347914822909, + "grad_norm": 5.686478072360047, + "learning_rate": 4.3620121630502435e-06, + "loss": 0.7543, + "step": 7075 + }, + { + "epoch": 0.5112070366825004, + "grad_norm": 7.768122741468087, + "learning_rate": 4.361816968629545e-06, + "loss": 0.8915, + "step": 7076 + }, + { + "epoch": 0.5112792818827099, + "grad_norm": 7.497144028168431, + "learning_rate": 4.361621748722018e-06, + "loss": 0.8222, + "step": 7077 + }, + { + "epoch": 0.5113515270829194, + "grad_norm": 6.20698730760484, + "learning_rate": 4.361426503330335e-06, + "loss": 0.8611, + "step": 7078 + }, + { + "epoch": 0.511423772283129, + "grad_norm": 7.023714495024494, + "learning_rate": 4.361231232457169e-06, + "loss": 0.8935, + "step": 7079 + }, + { + "epoch": 0.5114960174833385, + "grad_norm": 5.93768599369458, + "learning_rate": 4.3610359361051915e-06, + "loss": 0.9053, + "step": 7080 + }, + { + "epoch": 0.5115682626835479, + "grad_norm": 6.833505705853591, + "learning_rate": 4.360840614277078e-06, + "loss": 0.8486, + "step": 7081 + }, + { + "epoch": 0.5116405078837575, + "grad_norm": 5.440412245725721, + "learning_rate": 4.360645266975501e-06, + "loss": 0.8284, + "step": 7082 + }, + { + "epoch": 0.511712753083967, + "grad_norm": 9.21314423377618, + "learning_rate": 4.360449894203135e-06, + "loss": 0.933, + "step": 7083 + }, + { + "epoch": 0.5117849982841765, + "grad_norm": 5.941949401948194, + "learning_rate": 4.360254495962654e-06, + "loss": 0.8803, + "step": 7084 + }, + { + "epoch": 0.511857243484386, + "grad_norm": 5.633986859908538, + "learning_rate": 4.360059072256734e-06, + "loss": 0.9189, + "step": 7085 + }, + { + "epoch": 0.5119294886845955, + "grad_norm": 6.3291397599767345, + "learning_rate": 4.3598636230880485e-06, + "loss": 0.8541, + "step": 7086 + }, + { + "epoch": 0.5120017338848051, + "grad_norm": 7.053458038025707, + "learning_rate": 4.359668148459275e-06, + "loss": 0.8837, + "step": 7087 + }, + { + "epoch": 0.5120739790850145, + "grad_norm": 6.445658134671936, + "learning_rate": 4.359472648373089e-06, + "loss": 0.8873, + "step": 7088 + }, + { + "epoch": 0.512146224285224, + "grad_norm": 5.597748930670284, + "learning_rate": 4.359277122832165e-06, + "loss": 0.8807, + "step": 7089 + }, + { + "epoch": 0.5122184694854336, + "grad_norm": 8.910229459777055, + "learning_rate": 4.359081571839182e-06, + "loss": 0.9705, + "step": 7090 + }, + { + "epoch": 0.512290714685643, + "grad_norm": 6.17916709743849, + "learning_rate": 4.358885995396815e-06, + "loss": 0.956, + "step": 7091 + }, + { + "epoch": 0.5123629598858526, + "grad_norm": 7.144659392645255, + "learning_rate": 4.358690393507742e-06, + "loss": 0.9109, + "step": 7092 + }, + { + "epoch": 0.5124352050860621, + "grad_norm": 5.668892741062224, + "learning_rate": 4.358494766174641e-06, + "loss": 0.9519, + "step": 7093 + }, + { + "epoch": 0.5125074502862716, + "grad_norm": 6.3187940586565166, + "learning_rate": 4.358299113400189e-06, + "loss": 0.7959, + "step": 7094 + }, + { + "epoch": 0.5125796954864811, + "grad_norm": 6.4000374792908925, + "learning_rate": 4.358103435187067e-06, + "loss": 0.9122, + "step": 7095 + }, + { + "epoch": 0.5126519406866906, + "grad_norm": 5.793181113879739, + "learning_rate": 4.357907731537949e-06, + "loss": 0.947, + "step": 7096 + }, + { + "epoch": 0.5127241858869002, + "grad_norm": 5.940223791870957, + "learning_rate": 4.357712002455519e-06, + "loss": 0.8882, + "step": 7097 + }, + { + "epoch": 0.5127964310871097, + "grad_norm": 6.448944585045395, + "learning_rate": 4.3575162479424524e-06, + "loss": 0.9351, + "step": 7098 + }, + { + "epoch": 0.5128686762873191, + "grad_norm": 5.352205780794559, + "learning_rate": 4.357320468001432e-06, + "loss": 0.8805, + "step": 7099 + }, + { + "epoch": 0.5129409214875287, + "grad_norm": 6.867881835276457, + "learning_rate": 4.357124662635136e-06, + "loss": 0.8823, + "step": 7100 + }, + { + "epoch": 0.5130131666877382, + "grad_norm": 7.584117667190754, + "learning_rate": 4.356928831846246e-06, + "loss": 0.8495, + "step": 7101 + }, + { + "epoch": 0.5130854118879477, + "grad_norm": 6.93071600711917, + "learning_rate": 4.356732975637441e-06, + "loss": 0.85, + "step": 7102 + }, + { + "epoch": 0.5131576570881572, + "grad_norm": 6.377038368017552, + "learning_rate": 4.3565370940114026e-06, + "loss": 0.8552, + "step": 7103 + }, + { + "epoch": 0.5132299022883667, + "grad_norm": 7.955913182371316, + "learning_rate": 4.356341186970814e-06, + "loss": 0.9055, + "step": 7104 + }, + { + "epoch": 0.5133021474885763, + "grad_norm": 6.398376091929071, + "learning_rate": 4.356145254518356e-06, + "loss": 0.9189, + "step": 7105 + }, + { + "epoch": 0.5133743926887857, + "grad_norm": 6.865377454501204, + "learning_rate": 4.355949296656711e-06, + "loss": 0.8804, + "step": 7106 + }, + { + "epoch": 0.5134466378889952, + "grad_norm": 7.463325605617042, + "learning_rate": 4.35575331338856e-06, + "loss": 0.9049, + "step": 7107 + }, + { + "epoch": 0.5135188830892048, + "grad_norm": 7.778730482456708, + "learning_rate": 4.355557304716588e-06, + "loss": 0.8594, + "step": 7108 + }, + { + "epoch": 0.5135911282894142, + "grad_norm": 5.613322024904098, + "learning_rate": 4.355361270643477e-06, + "loss": 0.87, + "step": 7109 + }, + { + "epoch": 0.5136633734896238, + "grad_norm": 6.589316952491406, + "learning_rate": 4.35516521117191e-06, + "loss": 0.921, + "step": 7110 + }, + { + "epoch": 0.5137356186898333, + "grad_norm": 9.235997091432479, + "learning_rate": 4.3549691263045726e-06, + "loss": 0.9076, + "step": 7111 + }, + { + "epoch": 0.5138078638900428, + "grad_norm": 6.210066210014485, + "learning_rate": 4.354773016044148e-06, + "loss": 0.7867, + "step": 7112 + }, + { + "epoch": 0.5138801090902523, + "grad_norm": 6.964675875623375, + "learning_rate": 4.354576880393321e-06, + "loss": 0.8263, + "step": 7113 + }, + { + "epoch": 0.5139523542904618, + "grad_norm": 6.758675827711505, + "learning_rate": 4.354380719354776e-06, + "loss": 0.8358, + "step": 7114 + }, + { + "epoch": 0.5140245994906714, + "grad_norm": 9.671461253645269, + "learning_rate": 4.354184532931199e-06, + "loss": 0.9064, + "step": 7115 + }, + { + "epoch": 0.5140968446908808, + "grad_norm": 7.728043741122152, + "learning_rate": 4.353988321125277e-06, + "loss": 0.8942, + "step": 7116 + }, + { + "epoch": 0.5141690898910903, + "grad_norm": 6.323014670737938, + "learning_rate": 4.3537920839396915e-06, + "loss": 0.8376, + "step": 7117 + }, + { + "epoch": 0.5142413350912999, + "grad_norm": 6.182722308600288, + "learning_rate": 4.353595821377134e-06, + "loss": 0.9012, + "step": 7118 + }, + { + "epoch": 0.5143135802915094, + "grad_norm": 7.188517357516528, + "learning_rate": 4.353399533440288e-06, + "loss": 0.9115, + "step": 7119 + }, + { + "epoch": 0.5143858254917189, + "grad_norm": 8.392961514180536, + "learning_rate": 4.353203220131842e-06, + "loss": 0.9106, + "step": 7120 + }, + { + "epoch": 0.5144580706919284, + "grad_norm": 8.450317142110473, + "learning_rate": 4.353006881454483e-06, + "loss": 0.8678, + "step": 7121 + }, + { + "epoch": 0.5145303158921379, + "grad_norm": 6.071413389555598, + "learning_rate": 4.352810517410897e-06, + "loss": 0.8099, + "step": 7122 + }, + { + "epoch": 0.5146025610923475, + "grad_norm": 6.127164497170826, + "learning_rate": 4.3526141280037744e-06, + "loss": 0.9145, + "step": 7123 + }, + { + "epoch": 0.5146748062925569, + "grad_norm": 5.521448794433467, + "learning_rate": 4.352417713235804e-06, + "loss": 0.9316, + "step": 7124 + }, + { + "epoch": 0.5147470514927664, + "grad_norm": 5.72047873460998, + "learning_rate": 4.352221273109672e-06, + "loss": 0.7782, + "step": 7125 + }, + { + "epoch": 0.514819296692976, + "grad_norm": 8.46272271678861, + "learning_rate": 4.35202480762807e-06, + "loss": 1.0258, + "step": 7126 + }, + { + "epoch": 0.5148915418931854, + "grad_norm": 7.935017355195015, + "learning_rate": 4.351828316793686e-06, + "loss": 0.9316, + "step": 7127 + }, + { + "epoch": 0.514963787093395, + "grad_norm": 8.275148843308273, + "learning_rate": 4.35163180060921e-06, + "loss": 0.9451, + "step": 7128 + }, + { + "epoch": 0.5150360322936045, + "grad_norm": 8.10756608783748, + "learning_rate": 4.351435259077332e-06, + "loss": 0.8839, + "step": 7129 + }, + { + "epoch": 0.515108277493814, + "grad_norm": 6.794892962625324, + "learning_rate": 4.3512386922007435e-06, + "loss": 0.9094, + "step": 7130 + }, + { + "epoch": 0.5151805226940235, + "grad_norm": 9.479453102786557, + "learning_rate": 4.351042099982134e-06, + "loss": 0.9225, + "step": 7131 + }, + { + "epoch": 0.515252767894233, + "grad_norm": 5.8219298298209825, + "learning_rate": 4.350845482424196e-06, + "loss": 0.7787, + "step": 7132 + }, + { + "epoch": 0.5153250130944426, + "grad_norm": 7.88527239554558, + "learning_rate": 4.35064883952962e-06, + "loss": 0.905, + "step": 7133 + }, + { + "epoch": 0.515397258294652, + "grad_norm": 5.69230264635715, + "learning_rate": 4.3504521713010985e-06, + "loss": 0.9376, + "step": 7134 + }, + { + "epoch": 0.5154695034948615, + "grad_norm": 6.405183433508718, + "learning_rate": 4.350255477741324e-06, + "loss": 0.8643, + "step": 7135 + }, + { + "epoch": 0.5155417486950711, + "grad_norm": 5.810768115535957, + "learning_rate": 4.350058758852988e-06, + "loss": 0.8403, + "step": 7136 + }, + { + "epoch": 0.5156139938952806, + "grad_norm": 5.386855601563377, + "learning_rate": 4.349862014638785e-06, + "loss": 0.9154, + "step": 7137 + }, + { + "epoch": 0.51568623909549, + "grad_norm": 5.799789806371926, + "learning_rate": 4.349665245101407e-06, + "loss": 0.8298, + "step": 7138 + }, + { + "epoch": 0.5157584842956996, + "grad_norm": 5.9895169550216805, + "learning_rate": 4.349468450243547e-06, + "loss": 0.928, + "step": 7139 + }, + { + "epoch": 0.5158307294959091, + "grad_norm": 6.676762311799164, + "learning_rate": 4.3492716300679005e-06, + "loss": 0.7919, + "step": 7140 + }, + { + "epoch": 0.5159029746961187, + "grad_norm": 6.0459068765336434, + "learning_rate": 4.349074784577161e-06, + "loss": 0.8583, + "step": 7141 + }, + { + "epoch": 0.5159752198963281, + "grad_norm": 6.274948071268729, + "learning_rate": 4.3488779137740244e-06, + "loss": 0.8467, + "step": 7142 + }, + { + "epoch": 0.5160474650965376, + "grad_norm": 7.009830112112818, + "learning_rate": 4.348681017661183e-06, + "loss": 0.8942, + "step": 7143 + }, + { + "epoch": 0.5161197102967472, + "grad_norm": 5.095046738399903, + "learning_rate": 4.348484096241336e-06, + "loss": 0.7983, + "step": 7144 + }, + { + "epoch": 0.5161919554969566, + "grad_norm": 7.060758933105264, + "learning_rate": 4.348287149517176e-06, + "loss": 0.8089, + "step": 7145 + }, + { + "epoch": 0.5162642006971662, + "grad_norm": 5.661497937730418, + "learning_rate": 4.3480901774913996e-06, + "loss": 0.805, + "step": 7146 + }, + { + "epoch": 0.5163364458973757, + "grad_norm": 5.91741308241115, + "learning_rate": 4.347893180166704e-06, + "loss": 0.9177, + "step": 7147 + }, + { + "epoch": 0.5164086910975852, + "grad_norm": 6.133333414879398, + "learning_rate": 4.347696157545785e-06, + "loss": 0.9734, + "step": 7148 + }, + { + "epoch": 0.5164809362977947, + "grad_norm": 7.918944288849515, + "learning_rate": 4.3474991096313414e-06, + "loss": 0.9476, + "step": 7149 + }, + { + "epoch": 0.5165531814980042, + "grad_norm": 8.802858529951498, + "learning_rate": 4.347302036426069e-06, + "loss": 0.8804, + "step": 7150 + }, + { + "epoch": 0.5166254266982138, + "grad_norm": 6.552274748690276, + "learning_rate": 4.347104937932666e-06, + "loss": 0.905, + "step": 7151 + }, + { + "epoch": 0.5166976718984232, + "grad_norm": 5.629823608616503, + "learning_rate": 4.346907814153831e-06, + "loss": 0.813, + "step": 7152 + }, + { + "epoch": 0.5167699170986327, + "grad_norm": 5.829087410588659, + "learning_rate": 4.346710665092261e-06, + "loss": 0.7986, + "step": 7153 + }, + { + "epoch": 0.5168421622988423, + "grad_norm": 6.790952430411317, + "learning_rate": 4.346513490750657e-06, + "loss": 0.8546, + "step": 7154 + }, + { + "epoch": 0.5169144074990518, + "grad_norm": 6.598896350484193, + "learning_rate": 4.3463162911317154e-06, + "loss": 0.8614, + "step": 7155 + }, + { + "epoch": 0.5169866526992613, + "grad_norm": 5.927371732626383, + "learning_rate": 4.346119066238139e-06, + "loss": 0.832, + "step": 7156 + }, + { + "epoch": 0.5170588978994708, + "grad_norm": 5.88567016415718, + "learning_rate": 4.345921816072626e-06, + "loss": 0.8997, + "step": 7157 + }, + { + "epoch": 0.5171311430996803, + "grad_norm": 7.649680795430137, + "learning_rate": 4.345724540637875e-06, + "loss": 0.9303, + "step": 7158 + }, + { + "epoch": 0.5172033882998899, + "grad_norm": 7.74643231381578, + "learning_rate": 4.34552723993659e-06, + "loss": 0.918, + "step": 7159 + }, + { + "epoch": 0.5172756335000993, + "grad_norm": 7.355300196389261, + "learning_rate": 4.345329913971469e-06, + "loss": 0.9193, + "step": 7160 + }, + { + "epoch": 0.5173478787003088, + "grad_norm": 6.459883505889867, + "learning_rate": 4.345132562745215e-06, + "loss": 0.9212, + "step": 7161 + }, + { + "epoch": 0.5174201239005184, + "grad_norm": 6.234110049782003, + "learning_rate": 4.344935186260528e-06, + "loss": 0.8618, + "step": 7162 + }, + { + "epoch": 0.5174923691007278, + "grad_norm": 8.115491732155698, + "learning_rate": 4.344737784520111e-06, + "loss": 0.95, + "step": 7163 + }, + { + "epoch": 0.5175646143009374, + "grad_norm": 7.3420504287488395, + "learning_rate": 4.344540357526667e-06, + "loss": 0.9226, + "step": 7164 + }, + { + "epoch": 0.5176368595011469, + "grad_norm": 6.898161004509871, + "learning_rate": 4.3443429052828965e-06, + "loss": 0.8582, + "step": 7165 + }, + { + "epoch": 0.5177091047013564, + "grad_norm": 7.449761805470827, + "learning_rate": 4.344145427791504e-06, + "loss": 0.8841, + "step": 7166 + }, + { + "epoch": 0.5177813499015659, + "grad_norm": 6.055317822532915, + "learning_rate": 4.343947925055193e-06, + "loss": 0.8499, + "step": 7167 + }, + { + "epoch": 0.5178535951017754, + "grad_norm": 5.46976378717136, + "learning_rate": 4.343750397076666e-06, + "loss": 0.8896, + "step": 7168 + }, + { + "epoch": 0.517925840301985, + "grad_norm": 7.047492091190357, + "learning_rate": 4.3435528438586275e-06, + "loss": 0.9081, + "step": 7169 + }, + { + "epoch": 0.5179980855021944, + "grad_norm": 7.575506957561487, + "learning_rate": 4.343355265403783e-06, + "loss": 0.7665, + "step": 7170 + }, + { + "epoch": 0.5180703307024039, + "grad_norm": 5.76311423778615, + "learning_rate": 4.3431576617148355e-06, + "loss": 0.8298, + "step": 7171 + }, + { + "epoch": 0.5181425759026135, + "grad_norm": 6.666077142717166, + "learning_rate": 4.34296003279449e-06, + "loss": 0.8588, + "step": 7172 + }, + { + "epoch": 0.518214821102823, + "grad_norm": 7.3333500370644575, + "learning_rate": 4.342762378645454e-06, + "loss": 0.9677, + "step": 7173 + }, + { + "epoch": 0.5182870663030325, + "grad_norm": 8.060904884753024, + "learning_rate": 4.342564699270431e-06, + "loss": 0.8965, + "step": 7174 + }, + { + "epoch": 0.518359311503242, + "grad_norm": 6.042822611052105, + "learning_rate": 4.342366994672129e-06, + "loss": 0.9381, + "step": 7175 + }, + { + "epoch": 0.5184315567034515, + "grad_norm": 6.651136843230114, + "learning_rate": 4.342169264853252e-06, + "loss": 0.9348, + "step": 7176 + }, + { + "epoch": 0.5185038019036611, + "grad_norm": 5.50644791384879, + "learning_rate": 4.341971509816509e-06, + "loss": 0.9152, + "step": 7177 + }, + { + "epoch": 0.5185760471038705, + "grad_norm": 5.537182745876324, + "learning_rate": 4.3417737295646055e-06, + "loss": 0.9246, + "step": 7178 + }, + { + "epoch": 0.51864829230408, + "grad_norm": 8.005753832175126, + "learning_rate": 4.34157592410025e-06, + "loss": 0.8976, + "step": 7179 + }, + { + "epoch": 0.5187205375042896, + "grad_norm": 6.0237193159187505, + "learning_rate": 4.3413780934261504e-06, + "loss": 0.8309, + "step": 7180 + }, + { + "epoch": 0.518792782704499, + "grad_norm": 6.065456338116664, + "learning_rate": 4.341180237545014e-06, + "loss": 0.8438, + "step": 7181 + }, + { + "epoch": 0.5188650279047086, + "grad_norm": 6.902186514461915, + "learning_rate": 4.34098235645955e-06, + "loss": 0.8684, + "step": 7182 + }, + { + "epoch": 0.5189372731049181, + "grad_norm": 6.198764911809833, + "learning_rate": 4.340784450172466e-06, + "loss": 0.8875, + "step": 7183 + }, + { + "epoch": 0.5190095183051276, + "grad_norm": 5.561514606418581, + "learning_rate": 4.340586518686474e-06, + "loss": 0.9162, + "step": 7184 + }, + { + "epoch": 0.5190817635053371, + "grad_norm": 7.672183028678155, + "learning_rate": 4.34038856200428e-06, + "loss": 1.0113, + "step": 7185 + }, + { + "epoch": 0.5191540087055466, + "grad_norm": 6.313133887766304, + "learning_rate": 4.340190580128596e-06, + "loss": 0.9312, + "step": 7186 + }, + { + "epoch": 0.5192262539057562, + "grad_norm": 5.983979772360166, + "learning_rate": 4.339992573062133e-06, + "loss": 0.8613, + "step": 7187 + }, + { + "epoch": 0.5192984991059656, + "grad_norm": 6.188685448112838, + "learning_rate": 4.339794540807599e-06, + "loss": 0.8483, + "step": 7188 + }, + { + "epoch": 0.5193707443061751, + "grad_norm": 7.3356626741485655, + "learning_rate": 4.339596483367706e-06, + "loss": 0.9645, + "step": 7189 + }, + { + "epoch": 0.5194429895063847, + "grad_norm": 6.968550195951529, + "learning_rate": 4.339398400745166e-06, + "loss": 0.8937, + "step": 7190 + }, + { + "epoch": 0.5195152347065942, + "grad_norm": 6.279122215203095, + "learning_rate": 4.339200292942691e-06, + "loss": 0.8996, + "step": 7191 + }, + { + "epoch": 0.5195874799068037, + "grad_norm": 8.658957598458057, + "learning_rate": 4.33900215996299e-06, + "loss": 0.8438, + "step": 7192 + }, + { + "epoch": 0.5196597251070132, + "grad_norm": 6.370274905705999, + "learning_rate": 4.3388040018087795e-06, + "loss": 0.8691, + "step": 7193 + }, + { + "epoch": 0.5197319703072227, + "grad_norm": 6.74428174854157, + "learning_rate": 4.338605818482768e-06, + "loss": 0.9437, + "step": 7194 + }, + { + "epoch": 0.5198042155074323, + "grad_norm": 5.970141183790759, + "learning_rate": 4.338407609987672e-06, + "loss": 0.9394, + "step": 7195 + }, + { + "epoch": 0.5198764607076417, + "grad_norm": 6.318019153803827, + "learning_rate": 4.338209376326202e-06, + "loss": 0.8226, + "step": 7196 + }, + { + "epoch": 0.5199487059078512, + "grad_norm": 8.706675053700044, + "learning_rate": 4.338011117501074e-06, + "loss": 0.9069, + "step": 7197 + }, + { + "epoch": 0.5200209511080608, + "grad_norm": 8.511482112933626, + "learning_rate": 4.337812833515001e-06, + "loss": 0.8298, + "step": 7198 + }, + { + "epoch": 0.5200931963082702, + "grad_norm": 6.621308647876885, + "learning_rate": 4.337614524370697e-06, + "loss": 0.9109, + "step": 7199 + }, + { + "epoch": 0.5201654415084798, + "grad_norm": 6.498096187256533, + "learning_rate": 4.337416190070877e-06, + "loss": 0.9459, + "step": 7200 + }, + { + "epoch": 0.5202376867086893, + "grad_norm": 7.600061878153218, + "learning_rate": 4.337217830618255e-06, + "loss": 0.8507, + "step": 7201 + }, + { + "epoch": 0.5203099319088988, + "grad_norm": 6.536208611286355, + "learning_rate": 4.337019446015548e-06, + "loss": 0.8693, + "step": 7202 + }, + { + "epoch": 0.5203821771091083, + "grad_norm": 6.027131092327556, + "learning_rate": 4.336821036265471e-06, + "loss": 0.8614, + "step": 7203 + }, + { + "epoch": 0.5204544223093178, + "grad_norm": 6.766119183470805, + "learning_rate": 4.336622601370741e-06, + "loss": 0.8992, + "step": 7204 + }, + { + "epoch": 0.5205266675095274, + "grad_norm": 5.70119127908654, + "learning_rate": 4.336424141334073e-06, + "loss": 0.9005, + "step": 7205 + }, + { + "epoch": 0.5205989127097368, + "grad_norm": 10.87799872785625, + "learning_rate": 4.336225656158185e-06, + "loss": 1.027, + "step": 7206 + }, + { + "epoch": 0.5206711579099463, + "grad_norm": 5.82967145514242, + "learning_rate": 4.336027145845792e-06, + "loss": 0.8692, + "step": 7207 + }, + { + "epoch": 0.5207434031101559, + "grad_norm": 6.958573236346349, + "learning_rate": 4.335828610399615e-06, + "loss": 0.8333, + "step": 7208 + }, + { + "epoch": 0.5208156483103654, + "grad_norm": 5.179115045136267, + "learning_rate": 4.335630049822369e-06, + "loss": 0.8444, + "step": 7209 + }, + { + "epoch": 0.5208878935105749, + "grad_norm": 8.303389755582081, + "learning_rate": 4.335431464116772e-06, + "loss": 0.8637, + "step": 7210 + }, + { + "epoch": 0.5209601387107844, + "grad_norm": 7.046801378495689, + "learning_rate": 4.335232853285544e-06, + "loss": 0.9408, + "step": 7211 + }, + { + "epoch": 0.5210323839109939, + "grad_norm": 5.8501382273458065, + "learning_rate": 4.335034217331403e-06, + "loss": 0.8076, + "step": 7212 + }, + { + "epoch": 0.5211046291112035, + "grad_norm": 9.023288689249128, + "learning_rate": 4.334835556257069e-06, + "loss": 0.8367, + "step": 7213 + }, + { + "epoch": 0.5211768743114129, + "grad_norm": 8.279545907362849, + "learning_rate": 4.334636870065261e-06, + "loss": 0.8404, + "step": 7214 + }, + { + "epoch": 0.5212491195116224, + "grad_norm": 5.946288340783541, + "learning_rate": 4.3344381587586985e-06, + "loss": 0.8194, + "step": 7215 + }, + { + "epoch": 0.521321364711832, + "grad_norm": 5.783720900918093, + "learning_rate": 4.334239422340101e-06, + "loss": 0.9317, + "step": 7216 + }, + { + "epoch": 0.5213936099120414, + "grad_norm": 7.755550980710803, + "learning_rate": 4.334040660812191e-06, + "loss": 0.8472, + "step": 7217 + }, + { + "epoch": 0.521465855112251, + "grad_norm": 7.396257185991312, + "learning_rate": 4.333841874177688e-06, + "loss": 0.9214, + "step": 7218 + }, + { + "epoch": 0.5215381003124605, + "grad_norm": 6.799426211663843, + "learning_rate": 4.333643062439314e-06, + "loss": 0.8066, + "step": 7219 + }, + { + "epoch": 0.52161034551267, + "grad_norm": 8.576760994839349, + "learning_rate": 4.33344422559979e-06, + "loss": 0.8509, + "step": 7220 + }, + { + "epoch": 0.5216825907128795, + "grad_norm": 7.237713366906301, + "learning_rate": 4.3332453636618374e-06, + "loss": 0.9054, + "step": 7221 + }, + { + "epoch": 0.521754835913089, + "grad_norm": 6.148198938649349, + "learning_rate": 4.33304647662818e-06, + "loss": 0.8861, + "step": 7222 + }, + { + "epoch": 0.5218270811132986, + "grad_norm": 8.30289632979835, + "learning_rate": 4.33284756450154e-06, + "loss": 0.9402, + "step": 7223 + }, + { + "epoch": 0.521899326313508, + "grad_norm": 9.229951731843887, + "learning_rate": 4.332648627284639e-06, + "loss": 0.9007, + "step": 7224 + }, + { + "epoch": 0.5219715715137175, + "grad_norm": 5.581755324979282, + "learning_rate": 4.332449664980202e-06, + "loss": 0.7919, + "step": 7225 + }, + { + "epoch": 0.5220438167139271, + "grad_norm": 6.824201980159704, + "learning_rate": 4.332250677590951e-06, + "loss": 0.9296, + "step": 7226 + }, + { + "epoch": 0.5221160619141366, + "grad_norm": 5.9210974351403625, + "learning_rate": 4.332051665119612e-06, + "loss": 0.9195, + "step": 7227 + }, + { + "epoch": 0.522188307114346, + "grad_norm": 6.272087740556898, + "learning_rate": 4.331852627568907e-06, + "loss": 0.8652, + "step": 7228 + }, + { + "epoch": 0.5222605523145556, + "grad_norm": 5.895636169947114, + "learning_rate": 4.331653564941563e-06, + "loss": 0.9158, + "step": 7229 + }, + { + "epoch": 0.5223327975147651, + "grad_norm": 6.227340821904499, + "learning_rate": 4.331454477240303e-06, + "loss": 0.8809, + "step": 7230 + }, + { + "epoch": 0.5224050427149747, + "grad_norm": 7.771550640582023, + "learning_rate": 4.331255364467853e-06, + "loss": 0.8523, + "step": 7231 + }, + { + "epoch": 0.5224772879151841, + "grad_norm": 6.651906203498116, + "learning_rate": 4.331056226626941e-06, + "loss": 0.8336, + "step": 7232 + }, + { + "epoch": 0.5225495331153936, + "grad_norm": 7.105472776507243, + "learning_rate": 4.330857063720289e-06, + "loss": 0.9224, + "step": 7233 + }, + { + "epoch": 0.5226217783156032, + "grad_norm": 7.017277784155334, + "learning_rate": 4.330657875750626e-06, + "loss": 0.8876, + "step": 7234 + }, + { + "epoch": 0.5226940235158126, + "grad_norm": 7.7255764940570835, + "learning_rate": 4.330458662720678e-06, + "loss": 0.9062, + "step": 7235 + }, + { + "epoch": 0.5227662687160222, + "grad_norm": 6.050422199744908, + "learning_rate": 4.330259424633172e-06, + "loss": 0.8434, + "step": 7236 + }, + { + "epoch": 0.5228385139162317, + "grad_norm": 6.589833330237761, + "learning_rate": 4.330060161490836e-06, + "loss": 0.9912, + "step": 7237 + }, + { + "epoch": 0.5229107591164412, + "grad_norm": 7.943456382337333, + "learning_rate": 4.329860873296397e-06, + "loss": 0.9264, + "step": 7238 + }, + { + "epoch": 0.5229830043166507, + "grad_norm": 6.479637823933194, + "learning_rate": 4.3296615600525835e-06, + "loss": 0.9117, + "step": 7239 + }, + { + "epoch": 0.5230552495168602, + "grad_norm": 5.944675647046186, + "learning_rate": 4.329462221762124e-06, + "loss": 0.9053, + "step": 7240 + }, + { + "epoch": 0.5231274947170698, + "grad_norm": 5.407012709249746, + "learning_rate": 4.329262858427747e-06, + "loss": 0.8304, + "step": 7241 + }, + { + "epoch": 0.5231997399172792, + "grad_norm": 5.3833553223214965, + "learning_rate": 4.329063470052182e-06, + "loss": 0.8178, + "step": 7242 + }, + { + "epoch": 0.5232719851174887, + "grad_norm": 5.317881114019632, + "learning_rate": 4.328864056638158e-06, + "loss": 0.8871, + "step": 7243 + }, + { + "epoch": 0.5233442303176983, + "grad_norm": 7.513084442204371, + "learning_rate": 4.3286646181884055e-06, + "loss": 0.8636, + "step": 7244 + }, + { + "epoch": 0.5234164755179078, + "grad_norm": 6.63164834453886, + "learning_rate": 4.3284651547056536e-06, + "loss": 0.8259, + "step": 7245 + }, + { + "epoch": 0.5234887207181173, + "grad_norm": 5.197439759214067, + "learning_rate": 4.328265666192634e-06, + "loss": 0.897, + "step": 7246 + }, + { + "epoch": 0.5235609659183268, + "grad_norm": 6.15885444194855, + "learning_rate": 4.328066152652077e-06, + "loss": 0.8765, + "step": 7247 + }, + { + "epoch": 0.5236332111185363, + "grad_norm": 5.891588855681018, + "learning_rate": 4.327866614086713e-06, + "loss": 0.7942, + "step": 7248 + }, + { + "epoch": 0.5237054563187459, + "grad_norm": 5.594716931529719, + "learning_rate": 4.327667050499276e-06, + "loss": 0.8397, + "step": 7249 + }, + { + "epoch": 0.5237777015189553, + "grad_norm": 6.525787551531586, + "learning_rate": 4.327467461892495e-06, + "loss": 0.9515, + "step": 7250 + }, + { + "epoch": 0.5238499467191648, + "grad_norm": 8.458565644580165, + "learning_rate": 4.3272678482691035e-06, + "loss": 0.9404, + "step": 7251 + }, + { + "epoch": 0.5239221919193744, + "grad_norm": 7.571068063672117, + "learning_rate": 4.327068209631833e-06, + "loss": 0.8741, + "step": 7252 + }, + { + "epoch": 0.5239944371195838, + "grad_norm": 5.814705707060057, + "learning_rate": 4.3268685459834185e-06, + "loss": 0.9054, + "step": 7253 + }, + { + "epoch": 0.5240666823197934, + "grad_norm": 6.398131049972108, + "learning_rate": 4.326668857326592e-06, + "loss": 0.85, + "step": 7254 + }, + { + "epoch": 0.5241389275200029, + "grad_norm": 4.87320363130889, + "learning_rate": 4.326469143664087e-06, + "loss": 0.8093, + "step": 7255 + }, + { + "epoch": 0.5242111727202124, + "grad_norm": 7.774032738673838, + "learning_rate": 4.326269404998637e-06, + "loss": 0.8412, + "step": 7256 + }, + { + "epoch": 0.5242834179204219, + "grad_norm": 7.8285808620853485, + "learning_rate": 4.326069641332977e-06, + "loss": 0.9719, + "step": 7257 + }, + { + "epoch": 0.5243556631206314, + "grad_norm": 7.163651948602937, + "learning_rate": 4.325869852669843e-06, + "loss": 0.8918, + "step": 7258 + }, + { + "epoch": 0.524427908320841, + "grad_norm": 5.887356040798199, + "learning_rate": 4.325670039011967e-06, + "loss": 0.9373, + "step": 7259 + }, + { + "epoch": 0.5245001535210504, + "grad_norm": 5.5191495618978, + "learning_rate": 4.325470200362086e-06, + "loss": 0.912, + "step": 7260 + }, + { + "epoch": 0.5245723987212599, + "grad_norm": 5.287063739355597, + "learning_rate": 4.325270336722936e-06, + "loss": 0.8367, + "step": 7261 + }, + { + "epoch": 0.5246446439214695, + "grad_norm": 5.734219000666736, + "learning_rate": 4.325070448097251e-06, + "loss": 0.8927, + "step": 7262 + }, + { + "epoch": 0.524716889121679, + "grad_norm": 6.3193141304339004, + "learning_rate": 4.32487053448777e-06, + "loss": 0.8757, + "step": 7263 + }, + { + "epoch": 0.5247891343218885, + "grad_norm": 6.056249168561282, + "learning_rate": 4.324670595897227e-06, + "loss": 0.8873, + "step": 7264 + }, + { + "epoch": 0.524861379522098, + "grad_norm": 6.471302358213977, + "learning_rate": 4.324470632328361e-06, + "loss": 0.8831, + "step": 7265 + }, + { + "epoch": 0.5249336247223075, + "grad_norm": 6.534670577440309, + "learning_rate": 4.324270643783908e-06, + "loss": 0.8404, + "step": 7266 + }, + { + "epoch": 0.5250058699225171, + "grad_norm": 6.037823351302454, + "learning_rate": 4.324070630266607e-06, + "loss": 1.0025, + "step": 7267 + }, + { + "epoch": 0.5250781151227265, + "grad_norm": 6.357357670566732, + "learning_rate": 4.323870591779196e-06, + "loss": 0.9235, + "step": 7268 + }, + { + "epoch": 0.525150360322936, + "grad_norm": 9.756449986814612, + "learning_rate": 4.3236705283244115e-06, + "loss": 0.9071, + "step": 7269 + }, + { + "epoch": 0.5252226055231456, + "grad_norm": 6.996006780325569, + "learning_rate": 4.323470439904994e-06, + "loss": 0.8561, + "step": 7270 + }, + { + "epoch": 0.525294850723355, + "grad_norm": 7.01999886493728, + "learning_rate": 4.323270326523682e-06, + "loss": 0.9239, + "step": 7271 + }, + { + "epoch": 0.5253670959235646, + "grad_norm": 5.737727461151935, + "learning_rate": 4.323070188183215e-06, + "loss": 0.8552, + "step": 7272 + }, + { + "epoch": 0.5254393411237741, + "grad_norm": 6.515688815822077, + "learning_rate": 4.322870024886332e-06, + "loss": 0.8845, + "step": 7273 + }, + { + "epoch": 0.5255115863239836, + "grad_norm": 5.404689497449671, + "learning_rate": 4.322669836635774e-06, + "loss": 0.8803, + "step": 7274 + }, + { + "epoch": 0.5255838315241931, + "grad_norm": 8.167145799348296, + "learning_rate": 4.322469623434282e-06, + "loss": 0.9162, + "step": 7275 + }, + { + "epoch": 0.5256560767244026, + "grad_norm": 6.028523041124837, + "learning_rate": 4.322269385284596e-06, + "loss": 0.8386, + "step": 7276 + }, + { + "epoch": 0.5257283219246122, + "grad_norm": 6.49944244340969, + "learning_rate": 4.322069122189456e-06, + "loss": 0.9089, + "step": 7277 + }, + { + "epoch": 0.5258005671248216, + "grad_norm": 6.493110233106133, + "learning_rate": 4.321868834151605e-06, + "loss": 0.8154, + "step": 7278 + }, + { + "epoch": 0.5258728123250311, + "grad_norm": 5.592015343760995, + "learning_rate": 4.321668521173783e-06, + "loss": 0.9618, + "step": 7279 + }, + { + "epoch": 0.5259450575252407, + "grad_norm": 6.916018658658611, + "learning_rate": 4.321468183258735e-06, + "loss": 0.8653, + "step": 7280 + }, + { + "epoch": 0.5260173027254502, + "grad_norm": 8.62721467263986, + "learning_rate": 4.321267820409201e-06, + "loss": 0.8319, + "step": 7281 + }, + { + "epoch": 0.5260895479256597, + "grad_norm": 6.378438583208717, + "learning_rate": 4.3210674326279255e-06, + "loss": 0.861, + "step": 7282 + }, + { + "epoch": 0.5261617931258692, + "grad_norm": 6.289979855940301, + "learning_rate": 4.32086701991765e-06, + "loss": 0.7861, + "step": 7283 + }, + { + "epoch": 0.5262340383260787, + "grad_norm": 6.50051496373181, + "learning_rate": 4.320666582281119e-06, + "loss": 0.9049, + "step": 7284 + }, + { + "epoch": 0.5263062835262883, + "grad_norm": 9.066520272737332, + "learning_rate": 4.320466119721077e-06, + "loss": 0.8473, + "step": 7285 + }, + { + "epoch": 0.5263785287264977, + "grad_norm": 4.959600890314758, + "learning_rate": 4.320265632240266e-06, + "loss": 0.8467, + "step": 7286 + }, + { + "epoch": 0.5264507739267072, + "grad_norm": 9.079367468896383, + "learning_rate": 4.320065119841432e-06, + "loss": 1.0124, + "step": 7287 + }, + { + "epoch": 0.5265230191269168, + "grad_norm": 5.352698257102841, + "learning_rate": 4.31986458252732e-06, + "loss": 0.8212, + "step": 7288 + }, + { + "epoch": 0.5265952643271262, + "grad_norm": 5.129928080331633, + "learning_rate": 4.319664020300675e-06, + "loss": 0.8862, + "step": 7289 + }, + { + "epoch": 0.5266675095273358, + "grad_norm": 5.727655532836453, + "learning_rate": 4.319463433164243e-06, + "loss": 0.7974, + "step": 7290 + }, + { + "epoch": 0.5267397547275453, + "grad_norm": 7.185561739757661, + "learning_rate": 4.319262821120769e-06, + "loss": 0.8703, + "step": 7291 + }, + { + "epoch": 0.5268119999277548, + "grad_norm": 8.691608143722508, + "learning_rate": 4.319062184172999e-06, + "loss": 0.8637, + "step": 7292 + }, + { + "epoch": 0.5268842451279643, + "grad_norm": 5.511211585509922, + "learning_rate": 4.3188615223236795e-06, + "loss": 0.8589, + "step": 7293 + }, + { + "epoch": 0.5269564903281738, + "grad_norm": 7.377356976965136, + "learning_rate": 4.318660835575559e-06, + "loss": 0.8898, + "step": 7294 + }, + { + "epoch": 0.5270287355283834, + "grad_norm": 4.973566563807508, + "learning_rate": 4.3184601239313836e-06, + "loss": 0.8106, + "step": 7295 + }, + { + "epoch": 0.5271009807285928, + "grad_norm": 6.244770150761343, + "learning_rate": 4.318259387393902e-06, + "loss": 0.8974, + "step": 7296 + }, + { + "epoch": 0.5271732259288023, + "grad_norm": 4.973346239811272, + "learning_rate": 4.318058625965859e-06, + "loss": 0.846, + "step": 7297 + }, + { + "epoch": 0.5272454711290119, + "grad_norm": 6.731920967911974, + "learning_rate": 4.317857839650007e-06, + "loss": 0.9422, + "step": 7298 + }, + { + "epoch": 0.5273177163292214, + "grad_norm": 6.318076210810949, + "learning_rate": 4.317657028449092e-06, + "loss": 0.8392, + "step": 7299 + }, + { + "epoch": 0.5273899615294309, + "grad_norm": 5.937136990342218, + "learning_rate": 4.317456192365863e-06, + "loss": 0.8187, + "step": 7300 + }, + { + "epoch": 0.5274622067296404, + "grad_norm": 6.39734071080836, + "learning_rate": 4.317255331403071e-06, + "loss": 0.8672, + "step": 7301 + }, + { + "epoch": 0.5275344519298499, + "grad_norm": 5.415345935708406, + "learning_rate": 4.3170544455634645e-06, + "loss": 0.9089, + "step": 7302 + }, + { + "epoch": 0.5276066971300595, + "grad_norm": 5.367952206258162, + "learning_rate": 4.316853534849793e-06, + "loss": 0.7719, + "step": 7303 + }, + { + "epoch": 0.5276789423302689, + "grad_norm": 6.747591224667146, + "learning_rate": 4.3166525992648064e-06, + "loss": 0.8664, + "step": 7304 + }, + { + "epoch": 0.5277511875304784, + "grad_norm": 5.693827033751833, + "learning_rate": 4.316451638811258e-06, + "loss": 0.8466, + "step": 7305 + }, + { + "epoch": 0.527823432730688, + "grad_norm": 6.128253306320308, + "learning_rate": 4.316250653491896e-06, + "loss": 0.9082, + "step": 7306 + }, + { + "epoch": 0.5278956779308974, + "grad_norm": 5.970920667344581, + "learning_rate": 4.316049643309473e-06, + "loss": 0.8895, + "step": 7307 + }, + { + "epoch": 0.527967923131107, + "grad_norm": 6.382374705285025, + "learning_rate": 4.31584860826674e-06, + "loss": 0.9399, + "step": 7308 + }, + { + "epoch": 0.5280401683313165, + "grad_norm": 5.508900117016788, + "learning_rate": 4.31564754836645e-06, + "loss": 0.9212, + "step": 7309 + }, + { + "epoch": 0.528112413531526, + "grad_norm": 6.711229950790882, + "learning_rate": 4.315446463611354e-06, + "loss": 0.8065, + "step": 7310 + }, + { + "epoch": 0.5281846587317355, + "grad_norm": 6.084125763304941, + "learning_rate": 4.315245354004207e-06, + "loss": 0.8895, + "step": 7311 + }, + { + "epoch": 0.528256903931945, + "grad_norm": 7.124210246972391, + "learning_rate": 4.315044219547759e-06, + "loss": 0.909, + "step": 7312 + }, + { + "epoch": 0.5283291491321546, + "grad_norm": 7.66747896065327, + "learning_rate": 4.314843060244767e-06, + "loss": 0.8281, + "step": 7313 + }, + { + "epoch": 0.528401394332364, + "grad_norm": 5.756851095320376, + "learning_rate": 4.3146418760979806e-06, + "loss": 0.8646, + "step": 7314 + }, + { + "epoch": 0.5284736395325735, + "grad_norm": 6.516850857030061, + "learning_rate": 4.314440667110157e-06, + "loss": 0.7838, + "step": 7315 + }, + { + "epoch": 0.5285458847327831, + "grad_norm": 8.402629313632978, + "learning_rate": 4.314239433284049e-06, + "loss": 0.9798, + "step": 7316 + }, + { + "epoch": 0.5286181299329926, + "grad_norm": 6.802650821335862, + "learning_rate": 4.314038174622412e-06, + "loss": 0.8482, + "step": 7317 + }, + { + "epoch": 0.528690375133202, + "grad_norm": 6.008162985550367, + "learning_rate": 4.3138368911280004e-06, + "loss": 0.8883, + "step": 7318 + }, + { + "epoch": 0.5287626203334116, + "grad_norm": 7.335749517110246, + "learning_rate": 4.313635582803571e-06, + "loss": 0.8831, + "step": 7319 + }, + { + "epoch": 0.5288348655336211, + "grad_norm": 6.7637347604822775, + "learning_rate": 4.3134342496518775e-06, + "loss": 0.8523, + "step": 7320 + }, + { + "epoch": 0.5289071107338307, + "grad_norm": 6.2716939172989905, + "learning_rate": 4.313232891675679e-06, + "loss": 0.8968, + "step": 7321 + }, + { + "epoch": 0.5289793559340401, + "grad_norm": 6.247841424119013, + "learning_rate": 4.313031508877729e-06, + "loss": 0.9178, + "step": 7322 + }, + { + "epoch": 0.5290516011342496, + "grad_norm": 7.134713227805135, + "learning_rate": 4.312830101260785e-06, + "loss": 0.8748, + "step": 7323 + }, + { + "epoch": 0.5291238463344592, + "grad_norm": 7.221483910146304, + "learning_rate": 4.312628668827605e-06, + "loss": 0.9294, + "step": 7324 + }, + { + "epoch": 0.5291960915346686, + "grad_norm": 5.016887185117704, + "learning_rate": 4.312427211580945e-06, + "loss": 0.8299, + "step": 7325 + }, + { + "epoch": 0.5292683367348782, + "grad_norm": 6.755903699894344, + "learning_rate": 4.312225729523565e-06, + "loss": 0.9, + "step": 7326 + }, + { + "epoch": 0.5293405819350877, + "grad_norm": 4.805573668595848, + "learning_rate": 4.312024222658221e-06, + "loss": 0.8125, + "step": 7327 + }, + { + "epoch": 0.5294128271352972, + "grad_norm": 6.271085951607677, + "learning_rate": 4.311822690987673e-06, + "loss": 0.8904, + "step": 7328 + }, + { + "epoch": 0.5294850723355067, + "grad_norm": 5.653994790343673, + "learning_rate": 4.3116211345146785e-06, + "loss": 0.8052, + "step": 7329 + }, + { + "epoch": 0.5295573175357162, + "grad_norm": 6.4022987648091645, + "learning_rate": 4.311419553241998e-06, + "loss": 0.8153, + "step": 7330 + }, + { + "epoch": 0.5296295627359258, + "grad_norm": 7.205815340351034, + "learning_rate": 4.31121794717239e-06, + "loss": 0.9124, + "step": 7331 + }, + { + "epoch": 0.5297018079361352, + "grad_norm": 6.626095951047663, + "learning_rate": 4.311016316308615e-06, + "loss": 0.9019, + "step": 7332 + }, + { + "epoch": 0.5297740531363447, + "grad_norm": 6.445012126050008, + "learning_rate": 4.310814660653431e-06, + "loss": 0.955, + "step": 7333 + }, + { + "epoch": 0.5298462983365543, + "grad_norm": 5.267390195827309, + "learning_rate": 4.310612980209603e-06, + "loss": 0.8324, + "step": 7334 + }, + { + "epoch": 0.5299185435367638, + "grad_norm": 5.958341940849151, + "learning_rate": 4.310411274979888e-06, + "loss": 0.9343, + "step": 7335 + }, + { + "epoch": 0.5299907887369733, + "grad_norm": 6.030113627057505, + "learning_rate": 4.310209544967048e-06, + "loss": 0.8028, + "step": 7336 + }, + { + "epoch": 0.5300630339371828, + "grad_norm": 6.816550634215822, + "learning_rate": 4.310007790173845e-06, + "loss": 0.8969, + "step": 7337 + }, + { + "epoch": 0.5301352791373923, + "grad_norm": 7.399016247905364, + "learning_rate": 4.30980601060304e-06, + "loss": 0.8956, + "step": 7338 + }, + { + "epoch": 0.5302075243376018, + "grad_norm": 6.472120208736693, + "learning_rate": 4.309604206257398e-06, + "loss": 0.9429, + "step": 7339 + }, + { + "epoch": 0.5302797695378113, + "grad_norm": 5.698822823486121, + "learning_rate": 4.309402377139678e-06, + "loss": 0.8649, + "step": 7340 + }, + { + "epoch": 0.5303520147380208, + "grad_norm": 5.6641261445777715, + "learning_rate": 4.309200523252644e-06, + "loss": 0.7914, + "step": 7341 + }, + { + "epoch": 0.5304242599382304, + "grad_norm": 7.6040130460367354, + "learning_rate": 4.30899864459906e-06, + "loss": 0.9263, + "step": 7342 + }, + { + "epoch": 0.5304965051384398, + "grad_norm": 8.175800846548421, + "learning_rate": 4.3087967411816895e-06, + "loss": 0.906, + "step": 7343 + }, + { + "epoch": 0.5305687503386494, + "grad_norm": 6.90051390351243, + "learning_rate": 4.308594813003295e-06, + "loss": 0.9095, + "step": 7344 + }, + { + "epoch": 0.5306409955388589, + "grad_norm": 6.475738730689673, + "learning_rate": 4.308392860066644e-06, + "loss": 0.8503, + "step": 7345 + }, + { + "epoch": 0.5307132407390684, + "grad_norm": 7.517239656903734, + "learning_rate": 4.308190882374496e-06, + "loss": 0.8527, + "step": 7346 + }, + { + "epoch": 0.5307854859392779, + "grad_norm": 6.0162182007644125, + "learning_rate": 4.307988879929621e-06, + "loss": 0.813, + "step": 7347 + }, + { + "epoch": 0.5308577311394874, + "grad_norm": 5.965413224391429, + "learning_rate": 4.307786852734782e-06, + "loss": 0.8071, + "step": 7348 + }, + { + "epoch": 0.530929976339697, + "grad_norm": 5.475166959263573, + "learning_rate": 4.307584800792743e-06, + "loss": 0.9125, + "step": 7349 + }, + { + "epoch": 0.5310022215399064, + "grad_norm": 7.9210664535911475, + "learning_rate": 4.307382724106273e-06, + "loss": 0.89, + "step": 7350 + }, + { + "epoch": 0.5310744667401159, + "grad_norm": 7.433673804088119, + "learning_rate": 4.3071806226781365e-06, + "loss": 0.8787, + "step": 7351 + }, + { + "epoch": 0.5311467119403255, + "grad_norm": 5.336739922393406, + "learning_rate": 4.306978496511101e-06, + "loss": 0.8537, + "step": 7352 + }, + { + "epoch": 0.531218957140535, + "grad_norm": 5.35006376567167, + "learning_rate": 4.306776345607932e-06, + "loss": 0.8805, + "step": 7353 + }, + { + "epoch": 0.5312912023407445, + "grad_norm": 6.705070160022942, + "learning_rate": 4.306574169971398e-06, + "loss": 0.889, + "step": 7354 + }, + { + "epoch": 0.531363447540954, + "grad_norm": 5.6492306480867995, + "learning_rate": 4.306371969604266e-06, + "loss": 0.8694, + "step": 7355 + }, + { + "epoch": 0.5314356927411635, + "grad_norm": 6.878267378765459, + "learning_rate": 4.306169744509304e-06, + "loss": 0.7894, + "step": 7356 + }, + { + "epoch": 0.531507937941373, + "grad_norm": 5.878215153523498, + "learning_rate": 4.305967494689282e-06, + "loss": 0.8265, + "step": 7357 + }, + { + "epoch": 0.5315801831415825, + "grad_norm": 6.37797746127893, + "learning_rate": 4.305765220146966e-06, + "loss": 0.8968, + "step": 7358 + }, + { + "epoch": 0.531652428341792, + "grad_norm": 6.538949908491555, + "learning_rate": 4.305562920885127e-06, + "loss": 0.8591, + "step": 7359 + }, + { + "epoch": 0.5317246735420016, + "grad_norm": 6.788332848358824, + "learning_rate": 4.305360596906534e-06, + "loss": 0.893, + "step": 7360 + }, + { + "epoch": 0.531796918742211, + "grad_norm": 5.87859121093176, + "learning_rate": 4.305158248213955e-06, + "loss": 0.8256, + "step": 7361 + }, + { + "epoch": 0.5318691639424206, + "grad_norm": 6.7143335775507165, + "learning_rate": 4.304955874810162e-06, + "loss": 0.9322, + "step": 7362 + }, + { + "epoch": 0.5319414091426301, + "grad_norm": 6.659102280974258, + "learning_rate": 4.304753476697924e-06, + "loss": 0.8537, + "step": 7363 + }, + { + "epoch": 0.5320136543428396, + "grad_norm": 6.485284433590428, + "learning_rate": 4.304551053880012e-06, + "loss": 0.9135, + "step": 7364 + }, + { + "epoch": 0.5320858995430491, + "grad_norm": 5.853659321622098, + "learning_rate": 4.304348606359198e-06, + "loss": 0.8721, + "step": 7365 + }, + { + "epoch": 0.5321581447432586, + "grad_norm": 6.911060987797082, + "learning_rate": 4.304146134138252e-06, + "loss": 0.8583, + "step": 7366 + }, + { + "epoch": 0.5322303899434682, + "grad_norm": 7.6219849175429335, + "learning_rate": 4.303943637219946e-06, + "loss": 0.9286, + "step": 7367 + }, + { + "epoch": 0.5323026351436776, + "grad_norm": 5.750058049447978, + "learning_rate": 4.3037411156070526e-06, + "loss": 0.898, + "step": 7368 + }, + { + "epoch": 0.5323748803438871, + "grad_norm": 6.937449687053621, + "learning_rate": 4.303538569302344e-06, + "loss": 0.823, + "step": 7369 + }, + { + "epoch": 0.5324471255440967, + "grad_norm": 6.016410637618514, + "learning_rate": 4.303335998308592e-06, + "loss": 0.9108, + "step": 7370 + }, + { + "epoch": 0.5325193707443062, + "grad_norm": 6.200267527561216, + "learning_rate": 4.3031334026285715e-06, + "loss": 0.8492, + "step": 7371 + }, + { + "epoch": 0.5325916159445157, + "grad_norm": 6.506411398169237, + "learning_rate": 4.302930782265054e-06, + "loss": 0.8922, + "step": 7372 + }, + { + "epoch": 0.5326638611447252, + "grad_norm": 6.548631819306532, + "learning_rate": 4.302728137220815e-06, + "loss": 0.8562, + "step": 7373 + }, + { + "epoch": 0.5327361063449347, + "grad_norm": 5.538855028234961, + "learning_rate": 4.302525467498626e-06, + "loss": 0.887, + "step": 7374 + }, + { + "epoch": 0.5328083515451442, + "grad_norm": 6.8191515446796345, + "learning_rate": 4.302322773101264e-06, + "loss": 0.9184, + "step": 7375 + }, + { + "epoch": 0.5328805967453537, + "grad_norm": 7.283569752269943, + "learning_rate": 4.302120054031502e-06, + "loss": 0.9642, + "step": 7376 + }, + { + "epoch": 0.5329528419455632, + "grad_norm": 7.726073215833887, + "learning_rate": 4.301917310292116e-06, + "loss": 0.9119, + "step": 7377 + }, + { + "epoch": 0.5330250871457728, + "grad_norm": 7.3666502674777545, + "learning_rate": 4.301714541885882e-06, + "loss": 0.9082, + "step": 7378 + }, + { + "epoch": 0.5330973323459822, + "grad_norm": 6.761188135863764, + "learning_rate": 4.301511748815574e-06, + "loss": 0.8246, + "step": 7379 + }, + { + "epoch": 0.5331695775461918, + "grad_norm": 8.724319702559102, + "learning_rate": 4.301308931083969e-06, + "loss": 0.8662, + "step": 7380 + }, + { + "epoch": 0.5332418227464013, + "grad_norm": 6.7134850033267055, + "learning_rate": 4.301106088693844e-06, + "loss": 0.8763, + "step": 7381 + }, + { + "epoch": 0.5333140679466108, + "grad_norm": 7.843240546593513, + "learning_rate": 4.300903221647974e-06, + "loss": 0.7906, + "step": 7382 + }, + { + "epoch": 0.5333863131468203, + "grad_norm": 7.18937618812406, + "learning_rate": 4.300700329949138e-06, + "loss": 1.0158, + "step": 7383 + }, + { + "epoch": 0.5334585583470298, + "grad_norm": 5.793043160555477, + "learning_rate": 4.300497413600112e-06, + "loss": 0.8557, + "step": 7384 + }, + { + "epoch": 0.5335308035472394, + "grad_norm": 5.90870090878218, + "learning_rate": 4.300294472603674e-06, + "loss": 0.8857, + "step": 7385 + }, + { + "epoch": 0.5336030487474488, + "grad_norm": 5.990207628526855, + "learning_rate": 4.300091506962604e-06, + "loss": 0.8912, + "step": 7386 + }, + { + "epoch": 0.5336752939476583, + "grad_norm": 6.654426718996618, + "learning_rate": 4.299888516679677e-06, + "loss": 0.826, + "step": 7387 + }, + { + "epoch": 0.5337475391478679, + "grad_norm": 5.516754985613288, + "learning_rate": 4.2996855017576755e-06, + "loss": 0.7991, + "step": 7388 + }, + { + "epoch": 0.5338197843480774, + "grad_norm": 6.321030093881369, + "learning_rate": 4.2994824621993765e-06, + "loss": 0.9272, + "step": 7389 + }, + { + "epoch": 0.5338920295482869, + "grad_norm": 6.8891267769597215, + "learning_rate": 4.2992793980075594e-06, + "loss": 0.8925, + "step": 7390 + }, + { + "epoch": 0.5339642747484964, + "grad_norm": 6.412913360062794, + "learning_rate": 4.299076309185005e-06, + "loss": 1.0065, + "step": 7391 + }, + { + "epoch": 0.5340365199487059, + "grad_norm": 6.831234989690737, + "learning_rate": 4.298873195734492e-06, + "loss": 0.9505, + "step": 7392 + }, + { + "epoch": 0.5341087651489154, + "grad_norm": 6.916198193721315, + "learning_rate": 4.298670057658803e-06, + "loss": 0.9057, + "step": 7393 + }, + { + "epoch": 0.5341810103491249, + "grad_norm": 5.464832406785494, + "learning_rate": 4.298466894960716e-06, + "loss": 0.8703, + "step": 7394 + }, + { + "epoch": 0.5342532555493344, + "grad_norm": 5.536818637649305, + "learning_rate": 4.298263707643014e-06, + "loss": 0.8716, + "step": 7395 + }, + { + "epoch": 0.534325500749544, + "grad_norm": 6.113344273836475, + "learning_rate": 4.298060495708478e-06, + "loss": 0.875, + "step": 7396 + }, + { + "epoch": 0.5343977459497534, + "grad_norm": 6.864634519935141, + "learning_rate": 4.29785725915989e-06, + "loss": 0.8497, + "step": 7397 + }, + { + "epoch": 0.534469991149963, + "grad_norm": 6.132973599595863, + "learning_rate": 4.297653998000033e-06, + "loss": 0.9562, + "step": 7398 + }, + { + "epoch": 0.5345422363501725, + "grad_norm": 5.551718389124538, + "learning_rate": 4.297450712231688e-06, + "loss": 0.873, + "step": 7399 + }, + { + "epoch": 0.534614481550382, + "grad_norm": 6.516326061560775, + "learning_rate": 4.297247401857638e-06, + "loss": 0.8954, + "step": 7400 + }, + { + "epoch": 0.5346867267505915, + "grad_norm": 6.137867393296739, + "learning_rate": 4.297044066880667e-06, + "loss": 0.9442, + "step": 7401 + }, + { + "epoch": 0.534758971950801, + "grad_norm": 6.791979761064297, + "learning_rate": 4.296840707303558e-06, + "loss": 0.8094, + "step": 7402 + }, + { + "epoch": 0.5348312171510106, + "grad_norm": 5.944208792098617, + "learning_rate": 4.296637323129093e-06, + "loss": 0.8161, + "step": 7403 + }, + { + "epoch": 0.53490346235122, + "grad_norm": 7.3267748143987745, + "learning_rate": 4.296433914360061e-06, + "loss": 0.9195, + "step": 7404 + }, + { + "epoch": 0.5349757075514295, + "grad_norm": 5.9682709666450995, + "learning_rate": 4.2962304809992415e-06, + "loss": 0.8572, + "step": 7405 + }, + { + "epoch": 0.5350479527516391, + "grad_norm": 5.465346667017, + "learning_rate": 4.296027023049423e-06, + "loss": 0.9439, + "step": 7406 + }, + { + "epoch": 0.5351201979518486, + "grad_norm": 6.803831972984162, + "learning_rate": 4.295823540513388e-06, + "loss": 0.961, + "step": 7407 + }, + { + "epoch": 0.535192443152058, + "grad_norm": 7.102762689509366, + "learning_rate": 4.295620033393924e-06, + "loss": 0.8821, + "step": 7408 + }, + { + "epoch": 0.5352646883522676, + "grad_norm": 7.631418247227138, + "learning_rate": 4.295416501693814e-06, + "loss": 0.9037, + "step": 7409 + }, + { + "epoch": 0.5353369335524771, + "grad_norm": 7.201768859171178, + "learning_rate": 4.295212945415847e-06, + "loss": 0.8977, + "step": 7410 + }, + { + "epoch": 0.5354091787526866, + "grad_norm": 5.445121072170993, + "learning_rate": 4.29500936456281e-06, + "loss": 0.8877, + "step": 7411 + }, + { + "epoch": 0.5354814239528961, + "grad_norm": 7.012690620755463, + "learning_rate": 4.294805759137487e-06, + "loss": 0.939, + "step": 7412 + }, + { + "epoch": 0.5355536691531056, + "grad_norm": 6.926977752416324, + "learning_rate": 4.294602129142667e-06, + "loss": 0.9274, + "step": 7413 + }, + { + "epoch": 0.5356259143533152, + "grad_norm": 7.976034029230894, + "learning_rate": 4.294398474581138e-06, + "loss": 0.9042, + "step": 7414 + }, + { + "epoch": 0.5356981595535246, + "grad_norm": 6.055527600487316, + "learning_rate": 4.294194795455687e-06, + "loss": 0.8251, + "step": 7415 + }, + { + "epoch": 0.5357704047537342, + "grad_norm": 9.657542120643933, + "learning_rate": 4.293991091769102e-06, + "loss": 0.8588, + "step": 7416 + }, + { + "epoch": 0.5358426499539437, + "grad_norm": 7.242806384525991, + "learning_rate": 4.293787363524172e-06, + "loss": 1.0189, + "step": 7417 + }, + { + "epoch": 0.5359148951541532, + "grad_norm": 6.121602575872092, + "learning_rate": 4.293583610723686e-06, + "loss": 0.834, + "step": 7418 + }, + { + "epoch": 0.5359871403543627, + "grad_norm": 6.4235810846766785, + "learning_rate": 4.293379833370433e-06, + "loss": 0.8445, + "step": 7419 + }, + { + "epoch": 0.5360593855545722, + "grad_norm": 8.278130861584845, + "learning_rate": 4.293176031467202e-06, + "loss": 0.9012, + "step": 7420 + }, + { + "epoch": 0.5361316307547818, + "grad_norm": 5.710331625051771, + "learning_rate": 4.292972205016784e-06, + "loss": 0.8148, + "step": 7421 + }, + { + "epoch": 0.5362038759549912, + "grad_norm": 5.085149888611784, + "learning_rate": 4.292768354021969e-06, + "loss": 0.7677, + "step": 7422 + }, + { + "epoch": 0.5362761211552007, + "grad_norm": 6.114921842061555, + "learning_rate": 4.292564478485547e-06, + "loss": 0.9007, + "step": 7423 + }, + { + "epoch": 0.5363483663554103, + "grad_norm": 7.489900273676489, + "learning_rate": 4.292360578410308e-06, + "loss": 0.928, + "step": 7424 + }, + { + "epoch": 0.5364206115556198, + "grad_norm": 6.356631575302352, + "learning_rate": 4.2921566537990455e-06, + "loss": 0.7903, + "step": 7425 + }, + { + "epoch": 0.5364928567558293, + "grad_norm": 10.050860953580202, + "learning_rate": 4.29195270465455e-06, + "loss": 0.887, + "step": 7426 + }, + { + "epoch": 0.5365651019560388, + "grad_norm": 6.994995235416952, + "learning_rate": 4.2917487309796125e-06, + "loss": 0.8224, + "step": 7427 + }, + { + "epoch": 0.5366373471562483, + "grad_norm": 7.275220740466214, + "learning_rate": 4.291544732777027e-06, + "loss": 0.9156, + "step": 7428 + }, + { + "epoch": 0.5367095923564578, + "grad_norm": 6.846531133056229, + "learning_rate": 4.291340710049584e-06, + "loss": 0.8661, + "step": 7429 + }, + { + "epoch": 0.5367818375566673, + "grad_norm": 5.31964008490561, + "learning_rate": 4.291136662800078e-06, + "loss": 0.7623, + "step": 7430 + }, + { + "epoch": 0.5368540827568768, + "grad_norm": 6.263507437621718, + "learning_rate": 4.290932591031303e-06, + "loss": 0.9634, + "step": 7431 + }, + { + "epoch": 0.5369263279570864, + "grad_norm": 6.556894287802421, + "learning_rate": 4.29072849474605e-06, + "loss": 0.9518, + "step": 7432 + }, + { + "epoch": 0.5369985731572958, + "grad_norm": 6.8023311770406085, + "learning_rate": 4.290524373947115e-06, + "loss": 0.8819, + "step": 7433 + }, + { + "epoch": 0.5370708183575054, + "grad_norm": 7.18550732397718, + "learning_rate": 4.290320228637291e-06, + "loss": 0.9039, + "step": 7434 + }, + { + "epoch": 0.5371430635577149, + "grad_norm": 6.2412044433789164, + "learning_rate": 4.290116058819373e-06, + "loss": 0.7918, + "step": 7435 + }, + { + "epoch": 0.5372153087579244, + "grad_norm": 5.3045611134727615, + "learning_rate": 4.289911864496157e-06, + "loss": 0.8477, + "step": 7436 + }, + { + "epoch": 0.5372875539581339, + "grad_norm": 6.672557644014423, + "learning_rate": 4.289707645670437e-06, + "loss": 0.8387, + "step": 7437 + }, + { + "epoch": 0.5373597991583434, + "grad_norm": 6.700114679066806, + "learning_rate": 4.289503402345009e-06, + "loss": 0.8864, + "step": 7438 + }, + { + "epoch": 0.537432044358553, + "grad_norm": 5.816378704182591, + "learning_rate": 4.289299134522669e-06, + "loss": 0.9255, + "step": 7439 + }, + { + "epoch": 0.5375042895587624, + "grad_norm": 6.85869621098643, + "learning_rate": 4.2890948422062126e-06, + "loss": 0.8155, + "step": 7440 + }, + { + "epoch": 0.5375765347589719, + "grad_norm": 5.8420074639199235, + "learning_rate": 4.288890525398437e-06, + "loss": 0.896, + "step": 7441 + }, + { + "epoch": 0.5376487799591815, + "grad_norm": 6.319716153108411, + "learning_rate": 4.28868618410214e-06, + "loss": 0.938, + "step": 7442 + }, + { + "epoch": 0.537721025159391, + "grad_norm": 7.28729337788665, + "learning_rate": 4.288481818320117e-06, + "loss": 0.9202, + "step": 7443 + }, + { + "epoch": 0.5377932703596005, + "grad_norm": 5.939494229757469, + "learning_rate": 4.288277428055166e-06, + "loss": 0.7723, + "step": 7444 + }, + { + "epoch": 0.53786551555981, + "grad_norm": 5.455670219109001, + "learning_rate": 4.288073013310088e-06, + "loss": 0.9015, + "step": 7445 + }, + { + "epoch": 0.5379377607600195, + "grad_norm": 5.562861227243525, + "learning_rate": 4.287868574087676e-06, + "loss": 0.8773, + "step": 7446 + }, + { + "epoch": 0.538010005960229, + "grad_norm": 8.462757425604783, + "learning_rate": 4.287664110390734e-06, + "loss": 0.8373, + "step": 7447 + }, + { + "epoch": 0.5380822511604385, + "grad_norm": 6.5639914134897515, + "learning_rate": 4.287459622222056e-06, + "loss": 0.9372, + "step": 7448 + }, + { + "epoch": 0.538154496360648, + "grad_norm": 5.944483454844658, + "learning_rate": 4.287255109584445e-06, + "loss": 0.9103, + "step": 7449 + }, + { + "epoch": 0.5382267415608576, + "grad_norm": 7.091552662932327, + "learning_rate": 4.287050572480699e-06, + "loss": 0.8758, + "step": 7450 + }, + { + "epoch": 0.538298986761067, + "grad_norm": 8.899747489175956, + "learning_rate": 4.286846010913618e-06, + "loss": 0.9324, + "step": 7451 + }, + { + "epoch": 0.5383712319612766, + "grad_norm": 6.8597893035829305, + "learning_rate": 4.2866414248860025e-06, + "loss": 0.9201, + "step": 7452 + }, + { + "epoch": 0.5384434771614861, + "grad_norm": 6.923705812565629, + "learning_rate": 4.286436814400653e-06, + "loss": 0.8599, + "step": 7453 + }, + { + "epoch": 0.5385157223616956, + "grad_norm": 7.345896825085071, + "learning_rate": 4.286232179460371e-06, + "loss": 0.8944, + "step": 7454 + }, + { + "epoch": 0.5385879675619051, + "grad_norm": 7.365889013104199, + "learning_rate": 4.286027520067957e-06, + "loss": 0.9389, + "step": 7455 + }, + { + "epoch": 0.5386602127621146, + "grad_norm": 5.447388875667326, + "learning_rate": 4.285822836226214e-06, + "loss": 0.82, + "step": 7456 + }, + { + "epoch": 0.5387324579623242, + "grad_norm": 7.177813031031615, + "learning_rate": 4.285618127937942e-06, + "loss": 0.8796, + "step": 7457 + }, + { + "epoch": 0.5388047031625336, + "grad_norm": 5.3059310691431, + "learning_rate": 4.285413395205944e-06, + "loss": 0.8233, + "step": 7458 + }, + { + "epoch": 0.5388769483627431, + "grad_norm": 6.769167776414905, + "learning_rate": 4.2852086380330235e-06, + "loss": 0.8449, + "step": 7459 + }, + { + "epoch": 0.5389491935629527, + "grad_norm": 5.849288845037128, + "learning_rate": 4.2850038564219826e-06, + "loss": 0.8178, + "step": 7460 + }, + { + "epoch": 0.5390214387631622, + "grad_norm": 7.106402033351276, + "learning_rate": 4.284799050375625e-06, + "loss": 0.8545, + "step": 7461 + }, + { + "epoch": 0.5390936839633717, + "grad_norm": 6.002575321638485, + "learning_rate": 4.2845942198967545e-06, + "loss": 0.8896, + "step": 7462 + }, + { + "epoch": 0.5391659291635812, + "grad_norm": 6.622047486158307, + "learning_rate": 4.284389364988174e-06, + "loss": 0.9084, + "step": 7463 + }, + { + "epoch": 0.5392381743637907, + "grad_norm": 6.184819112736879, + "learning_rate": 4.28418448565269e-06, + "loss": 0.9085, + "step": 7464 + }, + { + "epoch": 0.5393104195640002, + "grad_norm": 6.850870130086151, + "learning_rate": 4.283979581893104e-06, + "loss": 0.9149, + "step": 7465 + }, + { + "epoch": 0.5393826647642097, + "grad_norm": 7.6942139126992695, + "learning_rate": 4.283774653712224e-06, + "loss": 0.9392, + "step": 7466 + }, + { + "epoch": 0.5394549099644192, + "grad_norm": 6.684489490586088, + "learning_rate": 4.283569701112853e-06, + "loss": 0.8491, + "step": 7467 + }, + { + "epoch": 0.5395271551646288, + "grad_norm": 7.512068638659633, + "learning_rate": 4.283364724097798e-06, + "loss": 0.9279, + "step": 7468 + }, + { + "epoch": 0.5395994003648382, + "grad_norm": 7.171924998123651, + "learning_rate": 4.283159722669865e-06, + "loss": 0.9556, + "step": 7469 + }, + { + "epoch": 0.5396716455650478, + "grad_norm": 6.18479845043948, + "learning_rate": 4.28295469683186e-06, + "loss": 0.9608, + "step": 7470 + }, + { + "epoch": 0.5397438907652573, + "grad_norm": 6.094989670078952, + "learning_rate": 4.282749646586589e-06, + "loss": 0.9047, + "step": 7471 + }, + { + "epoch": 0.5398161359654668, + "grad_norm": 6.2610263453539945, + "learning_rate": 4.28254457193686e-06, + "loss": 0.8573, + "step": 7472 + }, + { + "epoch": 0.5398883811656763, + "grad_norm": 5.33971962439174, + "learning_rate": 4.28233947288548e-06, + "loss": 0.7763, + "step": 7473 + }, + { + "epoch": 0.5399606263658858, + "grad_norm": 7.743667476482195, + "learning_rate": 4.282134349435256e-06, + "loss": 0.8608, + "step": 7474 + }, + { + "epoch": 0.5400328715660954, + "grad_norm": 6.89107672839031, + "learning_rate": 4.281929201588997e-06, + "loss": 0.8791, + "step": 7475 + }, + { + "epoch": 0.5401051167663048, + "grad_norm": 6.288366736552418, + "learning_rate": 4.28172402934951e-06, + "loss": 0.8631, + "step": 7476 + }, + { + "epoch": 0.5401773619665143, + "grad_norm": 5.53175394812113, + "learning_rate": 4.2815188327196054e-06, + "loss": 0.8042, + "step": 7477 + }, + { + "epoch": 0.5402496071667239, + "grad_norm": 5.926813728444626, + "learning_rate": 4.281313611702091e-06, + "loss": 0.8642, + "step": 7478 + }, + { + "epoch": 0.5403218523669334, + "grad_norm": 6.439799074154295, + "learning_rate": 4.281108366299776e-06, + "loss": 0.8058, + "step": 7479 + }, + { + "epoch": 0.5403940975671429, + "grad_norm": 5.462785880073756, + "learning_rate": 4.2809030965154705e-06, + "loss": 0.8331, + "step": 7480 + }, + { + "epoch": 0.5404663427673524, + "grad_norm": 5.139504725124794, + "learning_rate": 4.280697802351984e-06, + "loss": 0.8293, + "step": 7481 + }, + { + "epoch": 0.5405385879675619, + "grad_norm": 5.682003509273338, + "learning_rate": 4.280492483812128e-06, + "loss": 0.838, + "step": 7482 + }, + { + "epoch": 0.5406108331677714, + "grad_norm": 6.241946715866048, + "learning_rate": 4.280287140898712e-06, + "loss": 0.8657, + "step": 7483 + }, + { + "epoch": 0.5406830783679809, + "grad_norm": 8.754431338139515, + "learning_rate": 4.2800817736145476e-06, + "loss": 0.9277, + "step": 7484 + }, + { + "epoch": 0.5407553235681904, + "grad_norm": 6.943545345171242, + "learning_rate": 4.2798763819624455e-06, + "loss": 0.8852, + "step": 7485 + }, + { + "epoch": 0.5408275687684, + "grad_norm": 6.056472771073755, + "learning_rate": 4.279670965945218e-06, + "loss": 0.835, + "step": 7486 + }, + { + "epoch": 0.5408998139686094, + "grad_norm": 7.236123057148307, + "learning_rate": 4.279465525565677e-06, + "loss": 0.9056, + "step": 7487 + }, + { + "epoch": 0.540972059168819, + "grad_norm": 7.478867832118239, + "learning_rate": 4.279260060826634e-06, + "loss": 0.8649, + "step": 7488 + }, + { + "epoch": 0.5410443043690285, + "grad_norm": 5.732515371362745, + "learning_rate": 4.279054571730903e-06, + "loss": 0.8958, + "step": 7489 + }, + { + "epoch": 0.541116549569238, + "grad_norm": 6.26202304264923, + "learning_rate": 4.278849058281295e-06, + "loss": 0.9092, + "step": 7490 + }, + { + "epoch": 0.5411887947694475, + "grad_norm": 5.453865241495035, + "learning_rate": 4.2786435204806254e-06, + "loss": 0.8678, + "step": 7491 + }, + { + "epoch": 0.541261039969657, + "grad_norm": 5.6919454451390745, + "learning_rate": 4.278437958331707e-06, + "loss": 0.869, + "step": 7492 + }, + { + "epoch": 0.5413332851698666, + "grad_norm": 5.399051526449243, + "learning_rate": 4.2782323718373545e-06, + "loss": 0.8058, + "step": 7493 + }, + { + "epoch": 0.541405530370076, + "grad_norm": 6.258194700992239, + "learning_rate": 4.278026761000381e-06, + "loss": 0.9067, + "step": 7494 + }, + { + "epoch": 0.5414777755702855, + "grad_norm": 6.2982806205848565, + "learning_rate": 4.277821125823602e-06, + "loss": 0.9243, + "step": 7495 + }, + { + "epoch": 0.5415500207704951, + "grad_norm": 6.299363773251542, + "learning_rate": 4.277615466309831e-06, + "loss": 0.8982, + "step": 7496 + }, + { + "epoch": 0.5416222659707046, + "grad_norm": 7.0148536580945455, + "learning_rate": 4.277409782461885e-06, + "loss": 0.8888, + "step": 7497 + }, + { + "epoch": 0.541694511170914, + "grad_norm": 6.828768841659693, + "learning_rate": 4.277204074282579e-06, + "loss": 0.8852, + "step": 7498 + }, + { + "epoch": 0.5417667563711236, + "grad_norm": 7.307904747720636, + "learning_rate": 4.27699834177473e-06, + "loss": 0.8791, + "step": 7499 + }, + { + "epoch": 0.5418390015713331, + "grad_norm": 7.290662452938385, + "learning_rate": 4.276792584941153e-06, + "loss": 0.9229, + "step": 7500 + }, + { + "epoch": 0.5419112467715426, + "grad_norm": 5.873289792739836, + "learning_rate": 4.276586803784665e-06, + "loss": 0.9536, + "step": 7501 + }, + { + "epoch": 0.5419834919717521, + "grad_norm": 6.782398038801489, + "learning_rate": 4.276380998308084e-06, + "loss": 0.9011, + "step": 7502 + }, + { + "epoch": 0.5420557371719616, + "grad_norm": 8.056844457648793, + "learning_rate": 4.276175168514225e-06, + "loss": 0.8784, + "step": 7503 + }, + { + "epoch": 0.5421279823721712, + "grad_norm": 8.659985338863798, + "learning_rate": 4.275969314405908e-06, + "loss": 0.9286, + "step": 7504 + }, + { + "epoch": 0.5422002275723806, + "grad_norm": 9.093141725522026, + "learning_rate": 4.275763435985949e-06, + "loss": 0.8981, + "step": 7505 + }, + { + "epoch": 0.5422724727725902, + "grad_norm": 6.046380705431123, + "learning_rate": 4.275557533257169e-06, + "loss": 0.858, + "step": 7506 + }, + { + "epoch": 0.5423447179727997, + "grad_norm": 6.533429088020575, + "learning_rate": 4.275351606222383e-06, + "loss": 0.8852, + "step": 7507 + }, + { + "epoch": 0.5424169631730092, + "grad_norm": 5.924906978601278, + "learning_rate": 4.275145654884413e-06, + "loss": 0.8176, + "step": 7508 + }, + { + "epoch": 0.5424892083732187, + "grad_norm": 8.196746031676991, + "learning_rate": 4.2749396792460774e-06, + "loss": 0.8808, + "step": 7509 + }, + { + "epoch": 0.5425614535734282, + "grad_norm": 7.807712888817007, + "learning_rate": 4.274733679310196e-06, + "loss": 0.8771, + "step": 7510 + }, + { + "epoch": 0.5426336987736378, + "grad_norm": 10.743161887626473, + "learning_rate": 4.274527655079588e-06, + "loss": 0.9069, + "step": 7511 + }, + { + "epoch": 0.5427059439738472, + "grad_norm": 7.838517468859729, + "learning_rate": 4.274321606557074e-06, + "loss": 0.8744, + "step": 7512 + }, + { + "epoch": 0.5427781891740567, + "grad_norm": 6.441994292632414, + "learning_rate": 4.274115533745475e-06, + "loss": 0.8701, + "step": 7513 + }, + { + "epoch": 0.5428504343742663, + "grad_norm": 7.5818108787526395, + "learning_rate": 4.273909436647613e-06, + "loss": 0.8377, + "step": 7514 + }, + { + "epoch": 0.5429226795744758, + "grad_norm": 7.79247359994939, + "learning_rate": 4.273703315266307e-06, + "loss": 0.8387, + "step": 7515 + }, + { + "epoch": 0.5429949247746853, + "grad_norm": 11.051561367833605, + "learning_rate": 4.27349716960438e-06, + "loss": 0.8516, + "step": 7516 + }, + { + "epoch": 0.5430671699748948, + "grad_norm": 6.142535895125106, + "learning_rate": 4.2732909996646535e-06, + "loss": 0.8307, + "step": 7517 + }, + { + "epoch": 0.5431394151751043, + "grad_norm": 7.74298861426672, + "learning_rate": 4.273084805449951e-06, + "loss": 0.8679, + "step": 7518 + }, + { + "epoch": 0.5432116603753138, + "grad_norm": 5.819456470186751, + "learning_rate": 4.272878586963094e-06, + "loss": 0.9223, + "step": 7519 + }, + { + "epoch": 0.5432839055755233, + "grad_norm": 6.153357121403356, + "learning_rate": 4.272672344206905e-06, + "loss": 0.8705, + "step": 7520 + }, + { + "epoch": 0.5433561507757328, + "grad_norm": 5.6662138963715085, + "learning_rate": 4.272466077184208e-06, + "loss": 0.8532, + "step": 7521 + }, + { + "epoch": 0.5434283959759424, + "grad_norm": 5.608783318811916, + "learning_rate": 4.272259785897828e-06, + "loss": 0.8638, + "step": 7522 + }, + { + "epoch": 0.5435006411761518, + "grad_norm": 7.38237897619355, + "learning_rate": 4.272053470350586e-06, + "loss": 0.871, + "step": 7523 + }, + { + "epoch": 0.5435728863763614, + "grad_norm": 6.75919986661987, + "learning_rate": 4.271847130545309e-06, + "loss": 0.8909, + "step": 7524 + }, + { + "epoch": 0.5436451315765709, + "grad_norm": 7.311257802978985, + "learning_rate": 4.271640766484819e-06, + "loss": 0.8016, + "step": 7525 + }, + { + "epoch": 0.5437173767767804, + "grad_norm": 8.731448316130928, + "learning_rate": 4.271434378171945e-06, + "loss": 0.8887, + "step": 7526 + }, + { + "epoch": 0.5437896219769899, + "grad_norm": 6.186625409207949, + "learning_rate": 4.2712279656095075e-06, + "loss": 0.8708, + "step": 7527 + }, + { + "epoch": 0.5438618671771994, + "grad_norm": 6.100867863188157, + "learning_rate": 4.271021528800336e-06, + "loss": 0.8648, + "step": 7528 + }, + { + "epoch": 0.543934112377409, + "grad_norm": 5.708966667886498, + "learning_rate": 4.270815067747253e-06, + "loss": 0.7941, + "step": 7529 + }, + { + "epoch": 0.5440063575776184, + "grad_norm": 6.022538450758877, + "learning_rate": 4.270608582453088e-06, + "loss": 0.864, + "step": 7530 + }, + { + "epoch": 0.5440786027778279, + "grad_norm": 6.612863039604435, + "learning_rate": 4.270402072920666e-06, + "loss": 0.8889, + "step": 7531 + }, + { + "epoch": 0.5441508479780375, + "grad_norm": 5.556795259546509, + "learning_rate": 4.270195539152815e-06, + "loss": 0.9238, + "step": 7532 + }, + { + "epoch": 0.544223093178247, + "grad_norm": 5.968407426360172, + "learning_rate": 4.269988981152361e-06, + "loss": 0.8267, + "step": 7533 + }, + { + "epoch": 0.5442953383784565, + "grad_norm": 7.147336514428874, + "learning_rate": 4.269782398922132e-06, + "loss": 0.8989, + "step": 7534 + }, + { + "epoch": 0.544367583578666, + "grad_norm": 6.278621294334132, + "learning_rate": 4.269575792464956e-06, + "loss": 0.9284, + "step": 7535 + }, + { + "epoch": 0.5444398287788755, + "grad_norm": 5.44696413931292, + "learning_rate": 4.269369161783661e-06, + "loss": 0.8365, + "step": 7536 + }, + { + "epoch": 0.544512073979085, + "grad_norm": 5.684770537000558, + "learning_rate": 4.269162506881077e-06, + "loss": 0.9147, + "step": 7537 + }, + { + "epoch": 0.5445843191792945, + "grad_norm": 5.714890809038132, + "learning_rate": 4.268955827760031e-06, + "loss": 0.7987, + "step": 7538 + }, + { + "epoch": 0.544656564379504, + "grad_norm": 8.462194854779339, + "learning_rate": 4.268749124423354e-06, + "loss": 0.7508, + "step": 7539 + }, + { + "epoch": 0.5447288095797136, + "grad_norm": 5.855616307451155, + "learning_rate": 4.2685423968738745e-06, + "loss": 0.7776, + "step": 7540 + }, + { + "epoch": 0.544801054779923, + "grad_norm": 6.300127046303158, + "learning_rate": 4.268335645114423e-06, + "loss": 0.8063, + "step": 7541 + }, + { + "epoch": 0.5448732999801326, + "grad_norm": 6.088350535042215, + "learning_rate": 4.2681288691478295e-06, + "loss": 0.8714, + "step": 7542 + }, + { + "epoch": 0.5449455451803421, + "grad_norm": 7.4375410319246305, + "learning_rate": 4.267922068976924e-06, + "loss": 0.9276, + "step": 7543 + }, + { + "epoch": 0.5450177903805515, + "grad_norm": 6.122386141084771, + "learning_rate": 4.267715244604539e-06, + "loss": 0.9237, + "step": 7544 + }, + { + "epoch": 0.5450900355807611, + "grad_norm": 7.756553586171906, + "learning_rate": 4.267508396033504e-06, + "loss": 0.8731, + "step": 7545 + }, + { + "epoch": 0.5451622807809706, + "grad_norm": 5.212232894823286, + "learning_rate": 4.267301523266652e-06, + "loss": 0.864, + "step": 7546 + }, + { + "epoch": 0.5452345259811802, + "grad_norm": 7.157103908183747, + "learning_rate": 4.267094626306815e-06, + "loss": 0.9827, + "step": 7547 + }, + { + "epoch": 0.5453067711813896, + "grad_norm": 5.683301444910406, + "learning_rate": 4.2668877051568234e-06, + "loss": 0.8197, + "step": 7548 + }, + { + "epoch": 0.5453790163815991, + "grad_norm": 5.958724144698862, + "learning_rate": 4.2666807598195115e-06, + "loss": 0.8916, + "step": 7549 + }, + { + "epoch": 0.5454512615818087, + "grad_norm": 5.493113454627989, + "learning_rate": 4.2664737902977115e-06, + "loss": 0.8807, + "step": 7550 + }, + { + "epoch": 0.5455235067820182, + "grad_norm": 7.029062973671654, + "learning_rate": 4.2662667965942575e-06, + "loss": 0.8374, + "step": 7551 + }, + { + "epoch": 0.5455957519822277, + "grad_norm": 7.972967011872814, + "learning_rate": 4.266059778711982e-06, + "loss": 0.8928, + "step": 7552 + }, + { + "epoch": 0.5456679971824372, + "grad_norm": 7.359334050609269, + "learning_rate": 4.2658527366537195e-06, + "loss": 0.9325, + "step": 7553 + }, + { + "epoch": 0.5457402423826467, + "grad_norm": 6.0075569247288385, + "learning_rate": 4.265645670422304e-06, + "loss": 0.8833, + "step": 7554 + }, + { + "epoch": 0.5458124875828562, + "grad_norm": 6.519385846472469, + "learning_rate": 4.26543858002057e-06, + "loss": 0.8767, + "step": 7555 + }, + { + "epoch": 0.5458847327830657, + "grad_norm": 7.093269499919366, + "learning_rate": 4.265231465451353e-06, + "loss": 0.8637, + "step": 7556 + }, + { + "epoch": 0.5459569779832752, + "grad_norm": 6.9684543974581, + "learning_rate": 4.265024326717488e-06, + "loss": 0.9094, + "step": 7557 + }, + { + "epoch": 0.5460292231834848, + "grad_norm": 5.551566877076184, + "learning_rate": 4.264817163821809e-06, + "loss": 0.8862, + "step": 7558 + }, + { + "epoch": 0.5461014683836942, + "grad_norm": 7.597785285697313, + "learning_rate": 4.264609976767156e-06, + "loss": 0.8488, + "step": 7559 + }, + { + "epoch": 0.5461737135839038, + "grad_norm": 5.931789181544394, + "learning_rate": 4.264402765556361e-06, + "loss": 0.7909, + "step": 7560 + }, + { + "epoch": 0.5462459587841133, + "grad_norm": 6.442074529937353, + "learning_rate": 4.2641955301922615e-06, + "loss": 0.9222, + "step": 7561 + }, + { + "epoch": 0.5463182039843227, + "grad_norm": 6.12182877636566, + "learning_rate": 4.263988270677696e-06, + "loss": 0.9164, + "step": 7562 + }, + { + "epoch": 0.5463904491845323, + "grad_norm": 7.699886856238665, + "learning_rate": 4.2637809870155e-06, + "loss": 0.9671, + "step": 7563 + }, + { + "epoch": 0.5464626943847418, + "grad_norm": 6.273913905770582, + "learning_rate": 4.2635736792085125e-06, + "loss": 0.8698, + "step": 7564 + }, + { + "epoch": 0.5465349395849514, + "grad_norm": 7.22825518605064, + "learning_rate": 4.2633663472595696e-06, + "loss": 0.89, + "step": 7565 + }, + { + "epoch": 0.5466071847851608, + "grad_norm": 5.855855713948058, + "learning_rate": 4.2631589911715124e-06, + "loss": 0.8372, + "step": 7566 + }, + { + "epoch": 0.5466794299853703, + "grad_norm": 6.230285680108234, + "learning_rate": 4.262951610947176e-06, + "loss": 0.8801, + "step": 7567 + }, + { + "epoch": 0.5467516751855799, + "grad_norm": 5.704843758559065, + "learning_rate": 4.2627442065894014e-06, + "loss": 0.9643, + "step": 7568 + }, + { + "epoch": 0.5468239203857894, + "grad_norm": 5.690615293357317, + "learning_rate": 4.262536778101028e-06, + "loss": 0.9189, + "step": 7569 + }, + { + "epoch": 0.5468961655859989, + "grad_norm": 5.2562130086461245, + "learning_rate": 4.262329325484893e-06, + "loss": 0.8232, + "step": 7570 + }, + { + "epoch": 0.5469684107862084, + "grad_norm": 6.323809473225937, + "learning_rate": 4.26212184874384e-06, + "loss": 0.8793, + "step": 7571 + }, + { + "epoch": 0.5470406559864179, + "grad_norm": 6.1985399803650205, + "learning_rate": 4.2619143478807045e-06, + "loss": 0.7791, + "step": 7572 + }, + { + "epoch": 0.5471129011866274, + "grad_norm": 5.94818022011003, + "learning_rate": 4.2617068228983316e-06, + "loss": 0.8625, + "step": 7573 + }, + { + "epoch": 0.5471851463868369, + "grad_norm": 5.040056943723124, + "learning_rate": 4.26149927379956e-06, + "loss": 0.8362, + "step": 7574 + }, + { + "epoch": 0.5472573915870464, + "grad_norm": 7.208745455858688, + "learning_rate": 4.26129170058723e-06, + "loss": 0.9337, + "step": 7575 + }, + { + "epoch": 0.547329636787256, + "grad_norm": 5.457734266802147, + "learning_rate": 4.2610841032641855e-06, + "loss": 0.8269, + "step": 7576 + }, + { + "epoch": 0.5474018819874654, + "grad_norm": 5.110490123139468, + "learning_rate": 4.260876481833266e-06, + "loss": 0.8594, + "step": 7577 + }, + { + "epoch": 0.547474127187675, + "grad_norm": 7.068261691090691, + "learning_rate": 4.260668836297315e-06, + "loss": 0.8814, + "step": 7578 + }, + { + "epoch": 0.5475463723878845, + "grad_norm": 5.6795501482489446, + "learning_rate": 4.260461166659175e-06, + "loss": 0.8603, + "step": 7579 + }, + { + "epoch": 0.5476186175880939, + "grad_norm": 5.309129790357303, + "learning_rate": 4.260253472921688e-06, + "loss": 0.8637, + "step": 7580 + }, + { + "epoch": 0.5476908627883035, + "grad_norm": 7.364935262671346, + "learning_rate": 4.2600457550876975e-06, + "loss": 0.9053, + "step": 7581 + }, + { + "epoch": 0.547763107988513, + "grad_norm": 8.52926042434105, + "learning_rate": 4.259838013160048e-06, + "loss": 0.9353, + "step": 7582 + }, + { + "epoch": 0.5478353531887226, + "grad_norm": 5.455239834080521, + "learning_rate": 4.259630247141583e-06, + "loss": 0.8967, + "step": 7583 + }, + { + "epoch": 0.547907598388932, + "grad_norm": 7.9024796372863, + "learning_rate": 4.259422457035145e-06, + "loss": 0.944, + "step": 7584 + }, + { + "epoch": 0.5479798435891415, + "grad_norm": 5.878110346352515, + "learning_rate": 4.25921464284358e-06, + "loss": 0.8829, + "step": 7585 + }, + { + "epoch": 0.5480520887893511, + "grad_norm": 6.436199760603957, + "learning_rate": 4.259006804569732e-06, + "loss": 0.9253, + "step": 7586 + }, + { + "epoch": 0.5481243339895606, + "grad_norm": 6.3518465736366405, + "learning_rate": 4.258798942216448e-06, + "loss": 0.9079, + "step": 7587 + }, + { + "epoch": 0.54819657918977, + "grad_norm": 6.098174538050974, + "learning_rate": 4.2585910557865705e-06, + "loss": 0.9337, + "step": 7588 + }, + { + "epoch": 0.5482688243899796, + "grad_norm": 6.7831058335953935, + "learning_rate": 4.258383145282948e-06, + "loss": 0.8178, + "step": 7589 + }, + { + "epoch": 0.5483410695901891, + "grad_norm": 6.536100347880865, + "learning_rate": 4.258175210708425e-06, + "loss": 0.8603, + "step": 7590 + }, + { + "epoch": 0.5484133147903986, + "grad_norm": 5.806204062486469, + "learning_rate": 4.257967252065849e-06, + "loss": 0.8171, + "step": 7591 + }, + { + "epoch": 0.5484855599906081, + "grad_norm": 6.758941379699838, + "learning_rate": 4.257759269358066e-06, + "loss": 0.9304, + "step": 7592 + }, + { + "epoch": 0.5485578051908176, + "grad_norm": 5.919245234136334, + "learning_rate": 4.257551262587923e-06, + "loss": 1.0344, + "step": 7593 + }, + { + "epoch": 0.5486300503910272, + "grad_norm": 6.694791437480141, + "learning_rate": 4.257343231758269e-06, + "loss": 0.981, + "step": 7594 + }, + { + "epoch": 0.5487022955912366, + "grad_norm": 6.350517984725058, + "learning_rate": 4.257135176871949e-06, + "loss": 0.8644, + "step": 7595 + }, + { + "epoch": 0.5487745407914462, + "grad_norm": 6.402861801703125, + "learning_rate": 4.256927097931814e-06, + "loss": 0.7756, + "step": 7596 + }, + { + "epoch": 0.5488467859916557, + "grad_norm": 6.003576484315607, + "learning_rate": 4.256718994940711e-06, + "loss": 0.8071, + "step": 7597 + }, + { + "epoch": 0.5489190311918651, + "grad_norm": 5.936813796696055, + "learning_rate": 4.256510867901489e-06, + "loss": 0.9321, + "step": 7598 + }, + { + "epoch": 0.5489912763920747, + "grad_norm": 7.289282563415047, + "learning_rate": 4.256302716816997e-06, + "loss": 0.8692, + "step": 7599 + }, + { + "epoch": 0.5490635215922842, + "grad_norm": 9.247197319413065, + "learning_rate": 4.256094541690085e-06, + "loss": 0.8167, + "step": 7600 + }, + { + "epoch": 0.5491357667924938, + "grad_norm": 5.693067906347095, + "learning_rate": 4.255886342523601e-06, + "loss": 0.8801, + "step": 7601 + }, + { + "epoch": 0.5492080119927032, + "grad_norm": 5.795480401020547, + "learning_rate": 4.255678119320397e-06, + "loss": 0.8819, + "step": 7602 + }, + { + "epoch": 0.5492802571929127, + "grad_norm": 5.728810950131253, + "learning_rate": 4.255469872083323e-06, + "loss": 0.8356, + "step": 7603 + }, + { + "epoch": 0.5493525023931223, + "grad_norm": 6.490234962759528, + "learning_rate": 4.255261600815229e-06, + "loss": 0.7917, + "step": 7604 + }, + { + "epoch": 0.5494247475933318, + "grad_norm": 8.158785488027519, + "learning_rate": 4.2550533055189666e-06, + "loss": 0.8632, + "step": 7605 + }, + { + "epoch": 0.5494969927935413, + "grad_norm": 7.7102517979691125, + "learning_rate": 4.254844986197387e-06, + "loss": 0.962, + "step": 7606 + }, + { + "epoch": 0.5495692379937508, + "grad_norm": 7.104351171817857, + "learning_rate": 4.254636642853343e-06, + "loss": 0.9548, + "step": 7607 + }, + { + "epoch": 0.5496414831939603, + "grad_norm": 6.15018222197728, + "learning_rate": 4.254428275489685e-06, + "loss": 0.9217, + "step": 7608 + }, + { + "epoch": 0.5497137283941698, + "grad_norm": 5.4270652531513655, + "learning_rate": 4.254219884109266e-06, + "loss": 0.8413, + "step": 7609 + }, + { + "epoch": 0.5497859735943793, + "grad_norm": 5.757385155231808, + "learning_rate": 4.254011468714939e-06, + "loss": 0.9042, + "step": 7610 + }, + { + "epoch": 0.5498582187945888, + "grad_norm": 7.091629316304067, + "learning_rate": 4.253803029309557e-06, + "loss": 0.8549, + "step": 7611 + }, + { + "epoch": 0.5499304639947984, + "grad_norm": 6.57138767585113, + "learning_rate": 4.2535945658959736e-06, + "loss": 0.8707, + "step": 7612 + }, + { + "epoch": 0.5500027091950078, + "grad_norm": 6.229890745132619, + "learning_rate": 4.253386078477042e-06, + "loss": 0.8077, + "step": 7613 + }, + { + "epoch": 0.5500749543952174, + "grad_norm": 8.14643403657128, + "learning_rate": 4.253177567055616e-06, + "loss": 0.8591, + "step": 7614 + }, + { + "epoch": 0.5501471995954269, + "grad_norm": 7.120026575960431, + "learning_rate": 4.25296903163455e-06, + "loss": 0.9313, + "step": 7615 + }, + { + "epoch": 0.5502194447956363, + "grad_norm": 7.766887354955672, + "learning_rate": 4.252760472216701e-06, + "loss": 0.879, + "step": 7616 + }, + { + "epoch": 0.5502916899958459, + "grad_norm": 5.425345505642089, + "learning_rate": 4.25255188880492e-06, + "loss": 0.8402, + "step": 7617 + }, + { + "epoch": 0.5503639351960554, + "grad_norm": 4.989057010082518, + "learning_rate": 4.252343281402065e-06, + "loss": 0.8615, + "step": 7618 + }, + { + "epoch": 0.550436180396265, + "grad_norm": 5.13422419674258, + "learning_rate": 4.2521346500109916e-06, + "loss": 0.9121, + "step": 7619 + }, + { + "epoch": 0.5505084255964744, + "grad_norm": 5.685838729197599, + "learning_rate": 4.251925994634555e-06, + "loss": 0.9861, + "step": 7620 + }, + { + "epoch": 0.5505806707966839, + "grad_norm": 6.292743245494046, + "learning_rate": 4.251717315275612e-06, + "loss": 0.8, + "step": 7621 + }, + { + "epoch": 0.5506529159968935, + "grad_norm": 7.460331341950534, + "learning_rate": 4.251508611937018e-06, + "loss": 0.8896, + "step": 7622 + }, + { + "epoch": 0.550725161197103, + "grad_norm": 7.149321154399906, + "learning_rate": 4.251299884621633e-06, + "loss": 0.8608, + "step": 7623 + }, + { + "epoch": 0.5507974063973125, + "grad_norm": 6.579180924937876, + "learning_rate": 4.25109113333231e-06, + "loss": 0.9859, + "step": 7624 + }, + { + "epoch": 0.550869651597522, + "grad_norm": 7.746362139986007, + "learning_rate": 4.2508823580719105e-06, + "loss": 0.8105, + "step": 7625 + }, + { + "epoch": 0.5509418967977315, + "grad_norm": 9.024710323513983, + "learning_rate": 4.250673558843291e-06, + "loss": 0.9694, + "step": 7626 + }, + { + "epoch": 0.551014141997941, + "grad_norm": 6.222758012742923, + "learning_rate": 4.25046473564931e-06, + "loss": 0.8866, + "step": 7627 + }, + { + "epoch": 0.5510863871981505, + "grad_norm": 7.515818381250401, + "learning_rate": 4.250255888492825e-06, + "loss": 0.8801, + "step": 7628 + }, + { + "epoch": 0.55115863239836, + "grad_norm": 5.753761388090749, + "learning_rate": 4.2500470173766965e-06, + "loss": 0.809, + "step": 7629 + }, + { + "epoch": 0.5512308775985696, + "grad_norm": 7.635108376004238, + "learning_rate": 4.249838122303783e-06, + "loss": 0.8651, + "step": 7630 + }, + { + "epoch": 0.551303122798779, + "grad_norm": 8.375741043328784, + "learning_rate": 4.2496292032769434e-06, + "loss": 0.9271, + "step": 7631 + }, + { + "epoch": 0.5513753679989886, + "grad_norm": 8.613438915790526, + "learning_rate": 4.2494202602990405e-06, + "loss": 0.9039, + "step": 7632 + }, + { + "epoch": 0.5514476131991981, + "grad_norm": 10.938633800051132, + "learning_rate": 4.249211293372931e-06, + "loss": 0.9204, + "step": 7633 + }, + { + "epoch": 0.5515198583994075, + "grad_norm": 8.99430624677856, + "learning_rate": 4.249002302501478e-06, + "loss": 0.818, + "step": 7634 + }, + { + "epoch": 0.5515921035996171, + "grad_norm": 7.303409762496936, + "learning_rate": 4.248793287687541e-06, + "loss": 0.9245, + "step": 7635 + }, + { + "epoch": 0.5516643487998266, + "grad_norm": 5.610546276674409, + "learning_rate": 4.248584248933982e-06, + "loss": 0.8415, + "step": 7636 + }, + { + "epoch": 0.5517365940000362, + "grad_norm": 8.205526062293897, + "learning_rate": 4.248375186243662e-06, + "loss": 0.9366, + "step": 7637 + }, + { + "epoch": 0.5518088392002456, + "grad_norm": 14.06212184821421, + "learning_rate": 4.248166099619442e-06, + "loss": 0.954, + "step": 7638 + }, + { + "epoch": 0.5518810844004551, + "grad_norm": 9.807430439900967, + "learning_rate": 4.247956989064188e-06, + "loss": 0.9069, + "step": 7639 + }, + { + "epoch": 0.5519533296006647, + "grad_norm": 5.472334727740667, + "learning_rate": 4.247747854580758e-06, + "loss": 0.8661, + "step": 7640 + }, + { + "epoch": 0.5520255748008742, + "grad_norm": 8.49136407504592, + "learning_rate": 4.247538696172018e-06, + "loss": 0.8732, + "step": 7641 + }, + { + "epoch": 0.5520978200010837, + "grad_norm": 6.891146477929013, + "learning_rate": 4.24732951384083e-06, + "loss": 0.8245, + "step": 7642 + }, + { + "epoch": 0.5521700652012932, + "grad_norm": 10.858428957779866, + "learning_rate": 4.247120307590057e-06, + "loss": 0.9333, + "step": 7643 + }, + { + "epoch": 0.5522423104015027, + "grad_norm": 8.321861559878135, + "learning_rate": 4.246911077422564e-06, + "loss": 0.8743, + "step": 7644 + }, + { + "epoch": 0.5523145556017122, + "grad_norm": 6.443997261638758, + "learning_rate": 4.246701823341215e-06, + "loss": 0.8565, + "step": 7645 + }, + { + "epoch": 0.5523868008019217, + "grad_norm": 5.879899275040274, + "learning_rate": 4.246492545348874e-06, + "loss": 0.8463, + "step": 7646 + }, + { + "epoch": 0.5524590460021312, + "grad_norm": 6.588549547354065, + "learning_rate": 4.2462832434484055e-06, + "loss": 0.849, + "step": 7647 + }, + { + "epoch": 0.5525312912023408, + "grad_norm": 5.8093249765656605, + "learning_rate": 4.246073917642677e-06, + "loss": 0.7671, + "step": 7648 + }, + { + "epoch": 0.5526035364025502, + "grad_norm": 8.980397387167525, + "learning_rate": 4.245864567934551e-06, + "loss": 0.8301, + "step": 7649 + }, + { + "epoch": 0.5526757816027598, + "grad_norm": 7.708195315663327, + "learning_rate": 4.245655194326894e-06, + "loss": 0.7983, + "step": 7650 + }, + { + "epoch": 0.5527480268029693, + "grad_norm": 8.223601675161483, + "learning_rate": 4.245445796822575e-06, + "loss": 0.9341, + "step": 7651 + }, + { + "epoch": 0.5528202720031787, + "grad_norm": 7.436849357290836, + "learning_rate": 4.245236375424457e-06, + "loss": 0.9418, + "step": 7652 + }, + { + "epoch": 0.5528925172033883, + "grad_norm": 6.0449335495646555, + "learning_rate": 4.245026930135408e-06, + "loss": 0.935, + "step": 7653 + }, + { + "epoch": 0.5529647624035978, + "grad_norm": 6.436210429087912, + "learning_rate": 4.244817460958295e-06, + "loss": 0.878, + "step": 7654 + }, + { + "epoch": 0.5530370076038074, + "grad_norm": 5.931675352837095, + "learning_rate": 4.244607967895986e-06, + "loss": 0.8099, + "step": 7655 + }, + { + "epoch": 0.5531092528040168, + "grad_norm": 9.19910471333246, + "learning_rate": 4.244398450951348e-06, + "loss": 0.981, + "step": 7656 + }, + { + "epoch": 0.5531814980042263, + "grad_norm": 8.166573141172323, + "learning_rate": 4.24418891012725e-06, + "loss": 0.9486, + "step": 7657 + }, + { + "epoch": 0.5532537432044359, + "grad_norm": 6.9815291168779385, + "learning_rate": 4.243979345426561e-06, + "loss": 0.93, + "step": 7658 + }, + { + "epoch": 0.5533259884046454, + "grad_norm": 7.187636399011467, + "learning_rate": 4.243769756852148e-06, + "loss": 0.8516, + "step": 7659 + }, + { + "epoch": 0.5533982336048549, + "grad_norm": 6.692302084936462, + "learning_rate": 4.243560144406881e-06, + "loss": 0.8925, + "step": 7660 + }, + { + "epoch": 0.5534704788050644, + "grad_norm": 6.597206981262092, + "learning_rate": 4.24335050809363e-06, + "loss": 0.9559, + "step": 7661 + }, + { + "epoch": 0.5535427240052739, + "grad_norm": 5.8244450382557575, + "learning_rate": 4.243140847915264e-06, + "loss": 0.8219, + "step": 7662 + }, + { + "epoch": 0.5536149692054834, + "grad_norm": 7.714696893380729, + "learning_rate": 4.242931163874653e-06, + "loss": 0.8663, + "step": 7663 + }, + { + "epoch": 0.5536872144056929, + "grad_norm": 5.662828866195964, + "learning_rate": 4.242721455974668e-06, + "loss": 0.831, + "step": 7664 + }, + { + "epoch": 0.5537594596059024, + "grad_norm": 7.0686267845417206, + "learning_rate": 4.242511724218178e-06, + "loss": 0.8624, + "step": 7665 + }, + { + "epoch": 0.553831704806112, + "grad_norm": 5.90627454318644, + "learning_rate": 4.2423019686080565e-06, + "loss": 0.8487, + "step": 7666 + }, + { + "epoch": 0.5539039500063214, + "grad_norm": 5.514756261258856, + "learning_rate": 4.2420921891471745e-06, + "loss": 0.9375, + "step": 7667 + }, + { + "epoch": 0.553976195206531, + "grad_norm": 6.623518580223683, + "learning_rate": 4.2418823858384025e-06, + "loss": 0.9167, + "step": 7668 + }, + { + "epoch": 0.5540484404067405, + "grad_norm": 5.341306071773847, + "learning_rate": 4.241672558684613e-06, + "loss": 0.8534, + "step": 7669 + }, + { + "epoch": 0.5541206856069499, + "grad_norm": 5.507815789837227, + "learning_rate": 4.241462707688678e-06, + "loss": 0.8008, + "step": 7670 + }, + { + "epoch": 0.5541929308071595, + "grad_norm": 8.073671157199174, + "learning_rate": 4.241252832853472e-06, + "loss": 0.9813, + "step": 7671 + }, + { + "epoch": 0.554265176007369, + "grad_norm": 6.391633119789361, + "learning_rate": 4.241042934181866e-06, + "loss": 0.8986, + "step": 7672 + }, + { + "epoch": 0.5543374212075786, + "grad_norm": 6.839760649659381, + "learning_rate": 4.240833011676734e-06, + "loss": 0.8271, + "step": 7673 + }, + { + "epoch": 0.554409666407788, + "grad_norm": 6.156097255302912, + "learning_rate": 4.24062306534095e-06, + "loss": 0.93, + "step": 7674 + }, + { + "epoch": 0.5544819116079975, + "grad_norm": 6.688625062799745, + "learning_rate": 4.240413095177387e-06, + "loss": 0.8707, + "step": 7675 + }, + { + "epoch": 0.5545541568082071, + "grad_norm": 5.983986147206638, + "learning_rate": 4.240203101188921e-06, + "loss": 0.8369, + "step": 7676 + }, + { + "epoch": 0.5546264020084166, + "grad_norm": 7.10780722969482, + "learning_rate": 4.239993083378425e-06, + "loss": 0.8717, + "step": 7677 + }, + { + "epoch": 0.554698647208626, + "grad_norm": 6.148759496611326, + "learning_rate": 4.239783041748775e-06, + "loss": 0.8723, + "step": 7678 + }, + { + "epoch": 0.5547708924088356, + "grad_norm": 6.656898144610209, + "learning_rate": 4.239572976302846e-06, + "loss": 0.8901, + "step": 7679 + }, + { + "epoch": 0.5548431376090451, + "grad_norm": 6.527303717243336, + "learning_rate": 4.239362887043514e-06, + "loss": 0.9029, + "step": 7680 + }, + { + "epoch": 0.5549153828092546, + "grad_norm": 7.323054855439189, + "learning_rate": 4.239152773973654e-06, + "loss": 0.8577, + "step": 7681 + }, + { + "epoch": 0.5549876280094641, + "grad_norm": 6.629234256395822, + "learning_rate": 4.238942637096144e-06, + "loss": 0.9163, + "step": 7682 + }, + { + "epoch": 0.5550598732096736, + "grad_norm": 6.204000646634194, + "learning_rate": 4.238732476413858e-06, + "loss": 0.9265, + "step": 7683 + }, + { + "epoch": 0.5551321184098832, + "grad_norm": 5.598677697060834, + "learning_rate": 4.238522291929675e-06, + "loss": 0.9123, + "step": 7684 + }, + { + "epoch": 0.5552043636100926, + "grad_norm": 7.034500388032438, + "learning_rate": 4.238312083646472e-06, + "loss": 0.7851, + "step": 7685 + }, + { + "epoch": 0.5552766088103022, + "grad_norm": 5.342941669206392, + "learning_rate": 4.238101851567126e-06, + "loss": 0.9319, + "step": 7686 + }, + { + "epoch": 0.5553488540105117, + "grad_norm": 6.378306410292172, + "learning_rate": 4.237891595694515e-06, + "loss": 0.8431, + "step": 7687 + }, + { + "epoch": 0.5554210992107211, + "grad_norm": 7.552098206253315, + "learning_rate": 4.237681316031519e-06, + "loss": 0.9447, + "step": 7688 + }, + { + "epoch": 0.5554933444109307, + "grad_norm": 6.060474175260869, + "learning_rate": 4.237471012581014e-06, + "loss": 0.846, + "step": 7689 + }, + { + "epoch": 0.5555655896111402, + "grad_norm": 7.789738571226677, + "learning_rate": 4.237260685345879e-06, + "loss": 0.8252, + "step": 7690 + }, + { + "epoch": 0.5556378348113498, + "grad_norm": 7.226148900748656, + "learning_rate": 4.2370503343289945e-06, + "loss": 0.8922, + "step": 7691 + }, + { + "epoch": 0.5557100800115592, + "grad_norm": 5.304796445286825, + "learning_rate": 4.23683995953324e-06, + "loss": 0.7991, + "step": 7692 + }, + { + "epoch": 0.5557823252117687, + "grad_norm": 6.990006533989424, + "learning_rate": 4.236629560961495e-06, + "loss": 0.9017, + "step": 7693 + }, + { + "epoch": 0.5558545704119783, + "grad_norm": 6.502801438164128, + "learning_rate": 4.2364191386166395e-06, + "loss": 0.8696, + "step": 7694 + }, + { + "epoch": 0.5559268156121878, + "grad_norm": 9.608493671474506, + "learning_rate": 4.2362086925015544e-06, + "loss": 0.9384, + "step": 7695 + }, + { + "epoch": 0.5559990608123972, + "grad_norm": 8.845427337208307, + "learning_rate": 4.235998222619121e-06, + "loss": 1.1045, + "step": 7696 + }, + { + "epoch": 0.5560713060126068, + "grad_norm": 7.715441037005455, + "learning_rate": 4.235787728972218e-06, + "loss": 0.9469, + "step": 7697 + }, + { + "epoch": 0.5561435512128163, + "grad_norm": 5.746760907267064, + "learning_rate": 4.23557721156373e-06, + "loss": 0.8926, + "step": 7698 + }, + { + "epoch": 0.5562157964130258, + "grad_norm": 5.7872566979433895, + "learning_rate": 4.235366670396538e-06, + "loss": 0.7925, + "step": 7699 + }, + { + "epoch": 0.5562880416132353, + "grad_norm": 6.280193183017193, + "learning_rate": 4.235156105473524e-06, + "loss": 0.8686, + "step": 7700 + }, + { + "epoch": 0.5563602868134448, + "grad_norm": 5.879912574792845, + "learning_rate": 4.234945516797569e-06, + "loss": 0.9131, + "step": 7701 + }, + { + "epoch": 0.5564325320136544, + "grad_norm": 6.845277102025573, + "learning_rate": 4.234734904371558e-06, + "loss": 0.8391, + "step": 7702 + }, + { + "epoch": 0.5565047772138638, + "grad_norm": 5.304532347981878, + "learning_rate": 4.234524268198372e-06, + "loss": 0.8461, + "step": 7703 + }, + { + "epoch": 0.5565770224140734, + "grad_norm": 7.797544034030618, + "learning_rate": 4.234313608280895e-06, + "loss": 0.9392, + "step": 7704 + }, + { + "epoch": 0.5566492676142829, + "grad_norm": 7.533942410088186, + "learning_rate": 4.234102924622013e-06, + "loss": 0.8173, + "step": 7705 + }, + { + "epoch": 0.5567215128144923, + "grad_norm": 6.452634039945765, + "learning_rate": 4.233892217224608e-06, + "loss": 0.9187, + "step": 7706 + }, + { + "epoch": 0.5567937580147019, + "grad_norm": 5.835892724372242, + "learning_rate": 4.233681486091564e-06, + "loss": 0.8768, + "step": 7707 + }, + { + "epoch": 0.5568660032149114, + "grad_norm": 6.273670082958973, + "learning_rate": 4.233470731225768e-06, + "loss": 0.8166, + "step": 7708 + }, + { + "epoch": 0.556938248415121, + "grad_norm": 6.696016966749274, + "learning_rate": 4.233259952630102e-06, + "loss": 0.9266, + "step": 7709 + }, + { + "epoch": 0.5570104936153304, + "grad_norm": 6.901749607457733, + "learning_rate": 4.233049150307454e-06, + "loss": 0.898, + "step": 7710 + }, + { + "epoch": 0.5570827388155399, + "grad_norm": 6.341622131966587, + "learning_rate": 4.232838324260709e-06, + "loss": 0.795, + "step": 7711 + }, + { + "epoch": 0.5571549840157495, + "grad_norm": 5.891577524736517, + "learning_rate": 4.232627474492752e-06, + "loss": 0.9564, + "step": 7712 + }, + { + "epoch": 0.557227229215959, + "grad_norm": 6.444663201196434, + "learning_rate": 4.232416601006471e-06, + "loss": 0.8338, + "step": 7713 + }, + { + "epoch": 0.5572994744161684, + "grad_norm": 4.461684182870933, + "learning_rate": 4.2322057038047515e-06, + "loss": 0.8103, + "step": 7714 + }, + { + "epoch": 0.557371719616378, + "grad_norm": 6.642924693711959, + "learning_rate": 4.231994782890481e-06, + "loss": 0.8804, + "step": 7715 + }, + { + "epoch": 0.5574439648165875, + "grad_norm": 5.913461309436557, + "learning_rate": 4.231783838266546e-06, + "loss": 0.8521, + "step": 7716 + }, + { + "epoch": 0.557516210016797, + "grad_norm": 6.499015146618491, + "learning_rate": 4.231572869935836e-06, + "loss": 0.886, + "step": 7717 + }, + { + "epoch": 0.5575884552170065, + "grad_norm": 5.403794806085295, + "learning_rate": 4.231361877901237e-06, + "loss": 0.8468, + "step": 7718 + }, + { + "epoch": 0.557660700417216, + "grad_norm": 6.324475703164799, + "learning_rate": 4.231150862165638e-06, + "loss": 0.871, + "step": 7719 + }, + { + "epoch": 0.5577329456174256, + "grad_norm": 5.629105489620997, + "learning_rate": 4.230939822731929e-06, + "loss": 0.9641, + "step": 7720 + }, + { + "epoch": 0.557805190817635, + "grad_norm": 6.551852061966489, + "learning_rate": 4.2307287596029975e-06, + "loss": 0.8266, + "step": 7721 + }, + { + "epoch": 0.5578774360178446, + "grad_norm": 9.003909215699078, + "learning_rate": 4.230517672781732e-06, + "loss": 0.8832, + "step": 7722 + }, + { + "epoch": 0.5579496812180541, + "grad_norm": 9.49611945954973, + "learning_rate": 4.230306562271024e-06, + "loss": 0.8982, + "step": 7723 + }, + { + "epoch": 0.5580219264182635, + "grad_norm": 9.10117102833004, + "learning_rate": 4.230095428073763e-06, + "loss": 0.93, + "step": 7724 + }, + { + "epoch": 0.5580941716184731, + "grad_norm": 7.612688725579705, + "learning_rate": 4.229884270192839e-06, + "loss": 0.8474, + "step": 7725 + }, + { + "epoch": 0.5581664168186826, + "grad_norm": 10.01899517328093, + "learning_rate": 4.229673088631142e-06, + "loss": 0.9356, + "step": 7726 + }, + { + "epoch": 0.5582386620188922, + "grad_norm": 7.727558993580546, + "learning_rate": 4.229461883391564e-06, + "loss": 0.9601, + "step": 7727 + }, + { + "epoch": 0.5583109072191016, + "grad_norm": 8.311166756772884, + "learning_rate": 4.229250654476994e-06, + "loss": 0.9353, + "step": 7728 + }, + { + "epoch": 0.5583831524193111, + "grad_norm": 6.982257701304205, + "learning_rate": 4.229039401890327e-06, + "loss": 0.8773, + "step": 7729 + }, + { + "epoch": 0.5584553976195207, + "grad_norm": 6.680008444409544, + "learning_rate": 4.228828125634452e-06, + "loss": 0.8816, + "step": 7730 + }, + { + "epoch": 0.5585276428197302, + "grad_norm": 7.062830824403936, + "learning_rate": 4.228616825712263e-06, + "loss": 0.8634, + "step": 7731 + }, + { + "epoch": 0.5585998880199396, + "grad_norm": 9.164921218691639, + "learning_rate": 4.2284055021266525e-06, + "loss": 0.8981, + "step": 7732 + }, + { + "epoch": 0.5586721332201492, + "grad_norm": 6.8397012518173135, + "learning_rate": 4.228194154880511e-06, + "loss": 0.8551, + "step": 7733 + }, + { + "epoch": 0.5587443784203587, + "grad_norm": 9.444677995773475, + "learning_rate": 4.227982783976735e-06, + "loss": 0.8575, + "step": 7734 + }, + { + "epoch": 0.5588166236205682, + "grad_norm": 6.53540257493035, + "learning_rate": 4.227771389418215e-06, + "loss": 0.7458, + "step": 7735 + }, + { + "epoch": 0.5588888688207777, + "grad_norm": 6.663745367234251, + "learning_rate": 4.227559971207847e-06, + "loss": 0.9285, + "step": 7736 + }, + { + "epoch": 0.5589611140209872, + "grad_norm": 6.865046283429764, + "learning_rate": 4.227348529348524e-06, + "loss": 0.8457, + "step": 7737 + }, + { + "epoch": 0.5590333592211968, + "grad_norm": 6.722255058663757, + "learning_rate": 4.227137063843141e-06, + "loss": 0.8621, + "step": 7738 + }, + { + "epoch": 0.5591056044214062, + "grad_norm": 7.25813231980022, + "learning_rate": 4.226925574694592e-06, + "loss": 0.8164, + "step": 7739 + }, + { + "epoch": 0.5591778496216158, + "grad_norm": 7.498055269515358, + "learning_rate": 4.2267140619057745e-06, + "loss": 0.8928, + "step": 7740 + }, + { + "epoch": 0.5592500948218253, + "grad_norm": 5.685750168127794, + "learning_rate": 4.2265025254795814e-06, + "loss": 0.9025, + "step": 7741 + }, + { + "epoch": 0.5593223400220347, + "grad_norm": 7.9797591212733305, + "learning_rate": 4.2262909654189084e-06, + "loss": 0.93, + "step": 7742 + }, + { + "epoch": 0.5593945852222443, + "grad_norm": 6.862147854801173, + "learning_rate": 4.226079381726653e-06, + "loss": 0.9045, + "step": 7743 + }, + { + "epoch": 0.5594668304224538, + "grad_norm": 8.186748441024745, + "learning_rate": 4.225867774405711e-06, + "loss": 0.8793, + "step": 7744 + }, + { + "epoch": 0.5595390756226634, + "grad_norm": 6.394525880991942, + "learning_rate": 4.225656143458979e-06, + "loss": 0.895, + "step": 7745 + }, + { + "epoch": 0.5596113208228728, + "grad_norm": 7.136777813880523, + "learning_rate": 4.225444488889356e-06, + "loss": 0.8487, + "step": 7746 + }, + { + "epoch": 0.5596835660230823, + "grad_norm": 6.497800308103823, + "learning_rate": 4.225232810699736e-06, + "loss": 0.7627, + "step": 7747 + }, + { + "epoch": 0.5597558112232919, + "grad_norm": 6.24407372843036, + "learning_rate": 4.2250211088930185e-06, + "loss": 0.8916, + "step": 7748 + }, + { + "epoch": 0.5598280564235014, + "grad_norm": 6.164530031679188, + "learning_rate": 4.224809383472101e-06, + "loss": 0.7581, + "step": 7749 + }, + { + "epoch": 0.5599003016237108, + "grad_norm": 5.157634942142681, + "learning_rate": 4.2245976344398835e-06, + "loss": 0.9485, + "step": 7750 + }, + { + "epoch": 0.5599725468239204, + "grad_norm": 6.320609445280227, + "learning_rate": 4.224385861799262e-06, + "loss": 0.7883, + "step": 7751 + }, + { + "epoch": 0.5600447920241299, + "grad_norm": 7.064091055917177, + "learning_rate": 4.2241740655531375e-06, + "loss": 0.9419, + "step": 7752 + }, + { + "epoch": 0.5601170372243394, + "grad_norm": 7.740590936541925, + "learning_rate": 4.223962245704409e-06, + "loss": 0.9586, + "step": 7753 + }, + { + "epoch": 0.5601892824245489, + "grad_norm": 6.598487634488679, + "learning_rate": 4.223750402255976e-06, + "loss": 0.8664, + "step": 7754 + }, + { + "epoch": 0.5602615276247584, + "grad_norm": 5.28122002406896, + "learning_rate": 4.223538535210738e-06, + "loss": 0.956, + "step": 7755 + }, + { + "epoch": 0.560333772824968, + "grad_norm": 7.572625309841492, + "learning_rate": 4.223326644571597e-06, + "loss": 1.0034, + "step": 7756 + }, + { + "epoch": 0.5604060180251774, + "grad_norm": 6.289813377146888, + "learning_rate": 4.223114730341451e-06, + "loss": 0.9104, + "step": 7757 + }, + { + "epoch": 0.560478263225387, + "grad_norm": 9.09517655866563, + "learning_rate": 4.222902792523203e-06, + "loss": 0.9693, + "step": 7758 + }, + { + "epoch": 0.5605505084255965, + "grad_norm": 6.216393105604584, + "learning_rate": 4.222690831119752e-06, + "loss": 0.825, + "step": 7759 + }, + { + "epoch": 0.5606227536258059, + "grad_norm": 6.999304328137251, + "learning_rate": 4.222478846134003e-06, + "loss": 0.933, + "step": 7760 + }, + { + "epoch": 0.5606949988260155, + "grad_norm": 5.87219593415541, + "learning_rate": 4.222266837568855e-06, + "loss": 0.9201, + "step": 7761 + }, + { + "epoch": 0.560767244026225, + "grad_norm": 6.461460890321801, + "learning_rate": 4.222054805427211e-06, + "loss": 0.8829, + "step": 7762 + }, + { + "epoch": 0.5608394892264346, + "grad_norm": 6.992483190326037, + "learning_rate": 4.221842749711975e-06, + "loss": 0.7985, + "step": 7763 + }, + { + "epoch": 0.560911734426644, + "grad_norm": 5.210513988482319, + "learning_rate": 4.221630670426048e-06, + "loss": 0.8123, + "step": 7764 + }, + { + "epoch": 0.5609839796268535, + "grad_norm": 5.118062790420275, + "learning_rate": 4.221418567572334e-06, + "loss": 0.9357, + "step": 7765 + }, + { + "epoch": 0.5610562248270631, + "grad_norm": 6.259827478731069, + "learning_rate": 4.221206441153736e-06, + "loss": 0.8687, + "step": 7766 + }, + { + "epoch": 0.5611284700272725, + "grad_norm": 5.404340285446359, + "learning_rate": 4.2209942911731586e-06, + "loss": 0.8609, + "step": 7767 + }, + { + "epoch": 0.561200715227482, + "grad_norm": 6.989721108304322, + "learning_rate": 4.220782117633505e-06, + "loss": 0.8605, + "step": 7768 + }, + { + "epoch": 0.5612729604276916, + "grad_norm": 7.012567682330601, + "learning_rate": 4.220569920537681e-06, + "loss": 0.9187, + "step": 7769 + }, + { + "epoch": 0.5613452056279011, + "grad_norm": 6.392322715834139, + "learning_rate": 4.220357699888591e-06, + "loss": 0.8477, + "step": 7770 + }, + { + "epoch": 0.5614174508281106, + "grad_norm": 5.2648829268347805, + "learning_rate": 4.220145455689139e-06, + "loss": 0.822, + "step": 7771 + }, + { + "epoch": 0.5614896960283201, + "grad_norm": 6.269502957003917, + "learning_rate": 4.219933187942232e-06, + "loss": 0.8922, + "step": 7772 + }, + { + "epoch": 0.5615619412285296, + "grad_norm": 6.576122858166743, + "learning_rate": 4.2197208966507746e-06, + "loss": 0.8836, + "step": 7773 + }, + { + "epoch": 0.5616341864287392, + "grad_norm": 7.486074871792559, + "learning_rate": 4.219508581817674e-06, + "loss": 0.9047, + "step": 7774 + }, + { + "epoch": 0.5617064316289486, + "grad_norm": 5.672063314711301, + "learning_rate": 4.219296243445835e-06, + "loss": 0.8566, + "step": 7775 + }, + { + "epoch": 0.5617786768291582, + "grad_norm": 6.032115182439635, + "learning_rate": 4.219083881538166e-06, + "loss": 0.8723, + "step": 7776 + }, + { + "epoch": 0.5618509220293677, + "grad_norm": 7.242526707940701, + "learning_rate": 4.218871496097574e-06, + "loss": 0.9077, + "step": 7777 + }, + { + "epoch": 0.5619231672295771, + "grad_norm": 5.392749489365672, + "learning_rate": 4.2186590871269654e-06, + "loss": 0.8139, + "step": 7778 + }, + { + "epoch": 0.5619954124297867, + "grad_norm": 6.574436050423148, + "learning_rate": 4.218446654629248e-06, + "loss": 0.8866, + "step": 7779 + }, + { + "epoch": 0.5620676576299962, + "grad_norm": 5.761877315085058, + "learning_rate": 4.2182341986073305e-06, + "loss": 0.9432, + "step": 7780 + }, + { + "epoch": 0.5621399028302058, + "grad_norm": 6.674149065391806, + "learning_rate": 4.218021719064122e-06, + "loss": 0.8559, + "step": 7781 + }, + { + "epoch": 0.5622121480304152, + "grad_norm": 6.774276311869818, + "learning_rate": 4.217809216002528e-06, + "loss": 0.8856, + "step": 7782 + }, + { + "epoch": 0.5622843932306247, + "grad_norm": 6.353582269862578, + "learning_rate": 4.217596689425461e-06, + "loss": 0.8833, + "step": 7783 + }, + { + "epoch": 0.5623566384308343, + "grad_norm": 6.15988036646414, + "learning_rate": 4.217384139335828e-06, + "loss": 0.913, + "step": 7784 + }, + { + "epoch": 0.5624288836310437, + "grad_norm": 7.7266269291581935, + "learning_rate": 4.21717156573654e-06, + "loss": 0.8923, + "step": 7785 + }, + { + "epoch": 0.5625011288312532, + "grad_norm": 5.552062969158489, + "learning_rate": 4.216958968630507e-06, + "loss": 0.9269, + "step": 7786 + }, + { + "epoch": 0.5625733740314628, + "grad_norm": 6.34452176214889, + "learning_rate": 4.216746348020639e-06, + "loss": 0.8827, + "step": 7787 + }, + { + "epoch": 0.5626456192316723, + "grad_norm": 6.020294517333803, + "learning_rate": 4.216533703909846e-06, + "loss": 0.8652, + "step": 7788 + }, + { + "epoch": 0.5627178644318818, + "grad_norm": 6.186116690086693, + "learning_rate": 4.216321036301038e-06, + "loss": 0.8339, + "step": 7789 + }, + { + "epoch": 0.5627901096320913, + "grad_norm": 7.070877642727792, + "learning_rate": 4.216108345197131e-06, + "loss": 0.8313, + "step": 7790 + }, + { + "epoch": 0.5628623548323008, + "grad_norm": 6.435011299319006, + "learning_rate": 4.215895630601031e-06, + "loss": 0.871, + "step": 7791 + }, + { + "epoch": 0.5629346000325104, + "grad_norm": 6.007067650307519, + "learning_rate": 4.215682892515652e-06, + "loss": 0.8293, + "step": 7792 + }, + { + "epoch": 0.5630068452327198, + "grad_norm": 6.379630109055629, + "learning_rate": 4.215470130943907e-06, + "loss": 0.9483, + "step": 7793 + }, + { + "epoch": 0.5630790904329294, + "grad_norm": 6.4609256914946505, + "learning_rate": 4.215257345888708e-06, + "loss": 0.9438, + "step": 7794 + }, + { + "epoch": 0.5631513356331389, + "grad_norm": 8.06114481100932, + "learning_rate": 4.215044537352967e-06, + "loss": 0.8742, + "step": 7795 + }, + { + "epoch": 0.5632235808333483, + "grad_norm": 5.244697981612951, + "learning_rate": 4.214831705339598e-06, + "loss": 0.8815, + "step": 7796 + }, + { + "epoch": 0.5632958260335579, + "grad_norm": 7.336806890709506, + "learning_rate": 4.214618849851515e-06, + "loss": 0.8676, + "step": 7797 + }, + { + "epoch": 0.5633680712337674, + "grad_norm": 5.76505497955018, + "learning_rate": 4.214405970891631e-06, + "loss": 0.9096, + "step": 7798 + }, + { + "epoch": 0.563440316433977, + "grad_norm": 6.762881667671536, + "learning_rate": 4.21419306846286e-06, + "loss": 0.851, + "step": 7799 + }, + { + "epoch": 0.5635125616341864, + "grad_norm": 8.514325355956856, + "learning_rate": 4.2139801425681185e-06, + "loss": 0.9518, + "step": 7800 + }, + { + "epoch": 0.5635848068343959, + "grad_norm": 5.53445307344308, + "learning_rate": 4.213767193210318e-06, + "loss": 0.8705, + "step": 7801 + }, + { + "epoch": 0.5636570520346055, + "grad_norm": 6.993101535491484, + "learning_rate": 4.2135542203923755e-06, + "loss": 0.8268, + "step": 7802 + }, + { + "epoch": 0.5637292972348149, + "grad_norm": 6.3396539101467875, + "learning_rate": 4.213341224117207e-06, + "loss": 0.8814, + "step": 7803 + }, + { + "epoch": 0.5638015424350244, + "grad_norm": 6.702737721457291, + "learning_rate": 4.2131282043877266e-06, + "loss": 0.8265, + "step": 7804 + }, + { + "epoch": 0.563873787635234, + "grad_norm": 4.797631020876406, + "learning_rate": 4.2129151612068516e-06, + "loss": 0.7843, + "step": 7805 + }, + { + "epoch": 0.5639460328354435, + "grad_norm": 6.137076791751229, + "learning_rate": 4.212702094577499e-06, + "loss": 0.8857, + "step": 7806 + }, + { + "epoch": 0.564018278035653, + "grad_norm": 5.669677645015515, + "learning_rate": 4.212489004502583e-06, + "loss": 0.8769, + "step": 7807 + }, + { + "epoch": 0.5640905232358625, + "grad_norm": 8.015960031984628, + "learning_rate": 4.212275890985022e-06, + "loss": 0.8414, + "step": 7808 + }, + { + "epoch": 0.564162768436072, + "grad_norm": 5.0621041979124115, + "learning_rate": 4.2120627540277345e-06, + "loss": 0.842, + "step": 7809 + }, + { + "epoch": 0.5642350136362816, + "grad_norm": 7.391270937929475, + "learning_rate": 4.211849593633638e-06, + "loss": 0.8845, + "step": 7810 + }, + { + "epoch": 0.564307258836491, + "grad_norm": 6.071352129543155, + "learning_rate": 4.2116364098056485e-06, + "loss": 0.8141, + "step": 7811 + }, + { + "epoch": 0.5643795040367006, + "grad_norm": 5.821712289728427, + "learning_rate": 4.211423202546686e-06, + "loss": 0.9096, + "step": 7812 + }, + { + "epoch": 0.5644517492369101, + "grad_norm": 5.198865825502907, + "learning_rate": 4.2112099718596684e-06, + "loss": 0.7861, + "step": 7813 + }, + { + "epoch": 0.5645239944371195, + "grad_norm": 7.5116806624424415, + "learning_rate": 4.210996717747515e-06, + "loss": 0.919, + "step": 7814 + }, + { + "epoch": 0.5645962396373291, + "grad_norm": 5.976895351740257, + "learning_rate": 4.210783440213145e-06, + "loss": 0.8478, + "step": 7815 + }, + { + "epoch": 0.5646684848375386, + "grad_norm": 5.93671837731526, + "learning_rate": 4.210570139259478e-06, + "loss": 0.9793, + "step": 7816 + }, + { + "epoch": 0.5647407300377482, + "grad_norm": 6.007112737612206, + "learning_rate": 4.210356814889434e-06, + "loss": 0.838, + "step": 7817 + }, + { + "epoch": 0.5648129752379576, + "grad_norm": 5.241283264855198, + "learning_rate": 4.210143467105934e-06, + "loss": 0.8965, + "step": 7818 + }, + { + "epoch": 0.5648852204381671, + "grad_norm": 6.8931835726930775, + "learning_rate": 4.209930095911897e-06, + "loss": 0.8474, + "step": 7819 + }, + { + "epoch": 0.5649574656383767, + "grad_norm": 5.818769785584145, + "learning_rate": 4.2097167013102445e-06, + "loss": 0.8418, + "step": 7820 + }, + { + "epoch": 0.5650297108385861, + "grad_norm": 5.827310050051704, + "learning_rate": 4.209503283303899e-06, + "loss": 0.8823, + "step": 7821 + }, + { + "epoch": 0.5651019560387956, + "grad_norm": 6.830842688476788, + "learning_rate": 4.209289841895779e-06, + "loss": 0.8926, + "step": 7822 + }, + { + "epoch": 0.5651742012390052, + "grad_norm": 6.528023100587699, + "learning_rate": 4.20907637708881e-06, + "loss": 0.7664, + "step": 7823 + }, + { + "epoch": 0.5652464464392147, + "grad_norm": 5.79325058316451, + "learning_rate": 4.2088628888859114e-06, + "loss": 0.8134, + "step": 7824 + }, + { + "epoch": 0.5653186916394242, + "grad_norm": 6.3635579067199295, + "learning_rate": 4.208649377290007e-06, + "loss": 0.8704, + "step": 7825 + }, + { + "epoch": 0.5653909368396337, + "grad_norm": 7.371294027982648, + "learning_rate": 4.208435842304019e-06, + "loss": 0.8659, + "step": 7826 + }, + { + "epoch": 0.5654631820398432, + "grad_norm": 7.588769606023152, + "learning_rate": 4.208222283930871e-06, + "loss": 0.9155, + "step": 7827 + }, + { + "epoch": 0.5655354272400528, + "grad_norm": 5.512248013158455, + "learning_rate": 4.2080087021734865e-06, + "loss": 0.9383, + "step": 7828 + }, + { + "epoch": 0.5656076724402622, + "grad_norm": 6.027347547390249, + "learning_rate": 4.207795097034789e-06, + "loss": 0.7954, + "step": 7829 + }, + { + "epoch": 0.5656799176404718, + "grad_norm": 7.419363010343142, + "learning_rate": 4.207581468517703e-06, + "loss": 0.8459, + "step": 7830 + }, + { + "epoch": 0.5657521628406813, + "grad_norm": 6.067219892715485, + "learning_rate": 4.207367816625152e-06, + "loss": 0.8651, + "step": 7831 + }, + { + "epoch": 0.5658244080408907, + "grad_norm": 4.960300771146279, + "learning_rate": 4.2071541413600615e-06, + "loss": 0.8451, + "step": 7832 + }, + { + "epoch": 0.5658966532411003, + "grad_norm": 7.79606694561794, + "learning_rate": 4.206940442725356e-06, + "loss": 0.9602, + "step": 7833 + }, + { + "epoch": 0.5659688984413098, + "grad_norm": 6.489711541817255, + "learning_rate": 4.206726720723962e-06, + "loss": 0.8322, + "step": 7834 + }, + { + "epoch": 0.5660411436415194, + "grad_norm": 5.922113016221854, + "learning_rate": 4.206512975358804e-06, + "loss": 0.8456, + "step": 7835 + }, + { + "epoch": 0.5661133888417288, + "grad_norm": 5.491375316403039, + "learning_rate": 4.206299206632808e-06, + "loss": 0.867, + "step": 7836 + }, + { + "epoch": 0.5661856340419383, + "grad_norm": 5.682040434218481, + "learning_rate": 4.2060854145489e-06, + "loss": 0.8325, + "step": 7837 + }, + { + "epoch": 0.5662578792421479, + "grad_norm": 6.77849498282456, + "learning_rate": 4.205871599110008e-06, + "loss": 0.8557, + "step": 7838 + }, + { + "epoch": 0.5663301244423573, + "grad_norm": 5.753914205697147, + "learning_rate": 4.20565776031906e-06, + "loss": 0.8444, + "step": 7839 + }, + { + "epoch": 0.5664023696425668, + "grad_norm": 6.256505708299043, + "learning_rate": 4.20544389817898e-06, + "loss": 0.9019, + "step": 7840 + }, + { + "epoch": 0.5664746148427764, + "grad_norm": 5.233025858081312, + "learning_rate": 4.205230012692697e-06, + "loss": 0.7972, + "step": 7841 + }, + { + "epoch": 0.5665468600429859, + "grad_norm": 6.043416613877988, + "learning_rate": 4.20501610386314e-06, + "loss": 0.8277, + "step": 7842 + }, + { + "epoch": 0.5666191052431954, + "grad_norm": 5.465004472093768, + "learning_rate": 4.204802171693236e-06, + "loss": 0.8852, + "step": 7843 + }, + { + "epoch": 0.5666913504434049, + "grad_norm": 6.4608787524724836, + "learning_rate": 4.204588216185914e-06, + "loss": 0.8886, + "step": 7844 + }, + { + "epoch": 0.5667635956436144, + "grad_norm": 7.52437724327867, + "learning_rate": 4.2043742373441024e-06, + "loss": 0.9307, + "step": 7845 + }, + { + "epoch": 0.566835840843824, + "grad_norm": 6.363981710698996, + "learning_rate": 4.2041602351707315e-06, + "loss": 0.8015, + "step": 7846 + }, + { + "epoch": 0.5669080860440334, + "grad_norm": 6.561300113520757, + "learning_rate": 4.20394620966873e-06, + "loss": 0.9179, + "step": 7847 + }, + { + "epoch": 0.566980331244243, + "grad_norm": 5.6793120416220555, + "learning_rate": 4.203732160841027e-06, + "loss": 0.8681, + "step": 7848 + }, + { + "epoch": 0.5670525764444525, + "grad_norm": 6.300845425510126, + "learning_rate": 4.2035180886905555e-06, + "loss": 0.9167, + "step": 7849 + }, + { + "epoch": 0.5671248216446619, + "grad_norm": 6.150736707276319, + "learning_rate": 4.203303993220243e-06, + "loss": 0.887, + "step": 7850 + }, + { + "epoch": 0.5671970668448715, + "grad_norm": 6.450405480702629, + "learning_rate": 4.203089874433021e-06, + "loss": 0.8894, + "step": 7851 + }, + { + "epoch": 0.567269312045081, + "grad_norm": 7.247515548684112, + "learning_rate": 4.2028757323318214e-06, + "loss": 0.9686, + "step": 7852 + }, + { + "epoch": 0.5673415572452906, + "grad_norm": 6.64063706340816, + "learning_rate": 4.2026615669195755e-06, + "loss": 0.8299, + "step": 7853 + }, + { + "epoch": 0.5674138024455, + "grad_norm": 9.859677946390692, + "learning_rate": 4.202447378199214e-06, + "loss": 0.8956, + "step": 7854 + }, + { + "epoch": 0.5674860476457095, + "grad_norm": 6.058252478702347, + "learning_rate": 4.20223316617367e-06, + "loss": 0.8211, + "step": 7855 + }, + { + "epoch": 0.5675582928459191, + "grad_norm": 6.205690713289207, + "learning_rate": 4.202018930845876e-06, + "loss": 0.889, + "step": 7856 + }, + { + "epoch": 0.5676305380461285, + "grad_norm": 6.139389884532247, + "learning_rate": 4.201804672218764e-06, + "loss": 0.8239, + "step": 7857 + }, + { + "epoch": 0.567702783246338, + "grad_norm": 7.715603453496773, + "learning_rate": 4.201590390295268e-06, + "loss": 0.8185, + "step": 7858 + }, + { + "epoch": 0.5677750284465476, + "grad_norm": 6.559946780570405, + "learning_rate": 4.2013760850783206e-06, + "loss": 0.8622, + "step": 7859 + }, + { + "epoch": 0.5678472736467571, + "grad_norm": 7.703049727066303, + "learning_rate": 4.201161756570855e-06, + "loss": 0.832, + "step": 7860 + }, + { + "epoch": 0.5679195188469666, + "grad_norm": 7.939942232013094, + "learning_rate": 4.200947404775807e-06, + "loss": 0.8749, + "step": 7861 + }, + { + "epoch": 0.5679917640471761, + "grad_norm": 5.50471537201555, + "learning_rate": 4.200733029696109e-06, + "loss": 0.885, + "step": 7862 + }, + { + "epoch": 0.5680640092473856, + "grad_norm": 7.52338045758165, + "learning_rate": 4.200518631334696e-06, + "loss": 0.8477, + "step": 7863 + }, + { + "epoch": 0.5681362544475952, + "grad_norm": 8.150600537600598, + "learning_rate": 4.200304209694503e-06, + "loss": 0.9013, + "step": 7864 + }, + { + "epoch": 0.5682084996478046, + "grad_norm": 5.633951143474759, + "learning_rate": 4.200089764778467e-06, + "loss": 0.8865, + "step": 7865 + }, + { + "epoch": 0.5682807448480142, + "grad_norm": 7.160455658974548, + "learning_rate": 4.199875296589521e-06, + "loss": 0.9447, + "step": 7866 + }, + { + "epoch": 0.5683529900482237, + "grad_norm": 5.853247772900683, + "learning_rate": 4.199660805130603e-06, + "loss": 0.8154, + "step": 7867 + }, + { + "epoch": 0.5684252352484331, + "grad_norm": 5.3996441123389785, + "learning_rate": 4.199446290404647e-06, + "loss": 0.8792, + "step": 7868 + }, + { + "epoch": 0.5684974804486427, + "grad_norm": 6.504361669959827, + "learning_rate": 4.199231752414592e-06, + "loss": 0.8191, + "step": 7869 + }, + { + "epoch": 0.5685697256488522, + "grad_norm": 7.636094074115394, + "learning_rate": 4.1990171911633725e-06, + "loss": 0.9341, + "step": 7870 + }, + { + "epoch": 0.5686419708490618, + "grad_norm": 6.36920950208081, + "learning_rate": 4.198802606653929e-06, + "loss": 0.8834, + "step": 7871 + }, + { + "epoch": 0.5687142160492712, + "grad_norm": 5.456031352727754, + "learning_rate": 4.198587998889195e-06, + "loss": 0.8685, + "step": 7872 + }, + { + "epoch": 0.5687864612494807, + "grad_norm": 6.8133135668523215, + "learning_rate": 4.19837336787211e-06, + "loss": 0.8961, + "step": 7873 + }, + { + "epoch": 0.5688587064496903, + "grad_norm": 5.643424145320319, + "learning_rate": 4.198158713605614e-06, + "loss": 0.7781, + "step": 7874 + }, + { + "epoch": 0.5689309516498997, + "grad_norm": 6.35773748736545, + "learning_rate": 4.197944036092642e-06, + "loss": 0.8958, + "step": 7875 + }, + { + "epoch": 0.5690031968501092, + "grad_norm": 5.911932255461328, + "learning_rate": 4.197729335336135e-06, + "loss": 0.9362, + "step": 7876 + }, + { + "epoch": 0.5690754420503188, + "grad_norm": 5.87780435063766, + "learning_rate": 4.1975146113390315e-06, + "loss": 0.9232, + "step": 7877 + }, + { + "epoch": 0.5691476872505283, + "grad_norm": 6.2985973794944705, + "learning_rate": 4.197299864104271e-06, + "loss": 0.8295, + "step": 7878 + }, + { + "epoch": 0.5692199324507378, + "grad_norm": 6.566334793818521, + "learning_rate": 4.197085093634794e-06, + "loss": 0.8853, + "step": 7879 + }, + { + "epoch": 0.5692921776509473, + "grad_norm": 6.102715260023351, + "learning_rate": 4.196870299933539e-06, + "loss": 0.8601, + "step": 7880 + }, + { + "epoch": 0.5693644228511568, + "grad_norm": 7.0961560441571, + "learning_rate": 4.196655483003448e-06, + "loss": 0.8824, + "step": 7881 + }, + { + "epoch": 0.5694366680513664, + "grad_norm": 8.680223669870747, + "learning_rate": 4.196440642847459e-06, + "loss": 0.8696, + "step": 7882 + }, + { + "epoch": 0.5695089132515758, + "grad_norm": 8.07277621895638, + "learning_rate": 4.196225779468517e-06, + "loss": 0.8876, + "step": 7883 + }, + { + "epoch": 0.5695811584517854, + "grad_norm": 6.018490591826679, + "learning_rate": 4.19601089286956e-06, + "loss": 0.833, + "step": 7884 + }, + { + "epoch": 0.5696534036519949, + "grad_norm": 6.477404715670153, + "learning_rate": 4.195795983053531e-06, + "loss": 0.9678, + "step": 7885 + }, + { + "epoch": 0.5697256488522043, + "grad_norm": 7.310184470796227, + "learning_rate": 4.195581050023371e-06, + "loss": 0.8375, + "step": 7886 + }, + { + "epoch": 0.5697978940524139, + "grad_norm": 6.1074662214571624, + "learning_rate": 4.195366093782023e-06, + "loss": 0.8461, + "step": 7887 + }, + { + "epoch": 0.5698701392526234, + "grad_norm": 5.507907904481785, + "learning_rate": 4.195151114332431e-06, + "loss": 0.8676, + "step": 7888 + }, + { + "epoch": 0.569942384452833, + "grad_norm": 7.935874231803201, + "learning_rate": 4.1949361116775345e-06, + "loss": 0.8463, + "step": 7889 + }, + { + "epoch": 0.5700146296530424, + "grad_norm": 6.180253534640905, + "learning_rate": 4.19472108582028e-06, + "loss": 0.8808, + "step": 7890 + }, + { + "epoch": 0.5700868748532519, + "grad_norm": 5.021842549093971, + "learning_rate": 4.194506036763609e-06, + "loss": 0.9005, + "step": 7891 + }, + { + "epoch": 0.5701591200534615, + "grad_norm": 6.757408032964608, + "learning_rate": 4.194290964510467e-06, + "loss": 0.9426, + "step": 7892 + }, + { + "epoch": 0.5702313652536709, + "grad_norm": 6.7148340923069005, + "learning_rate": 4.194075869063796e-06, + "loss": 0.8888, + "step": 7893 + }, + { + "epoch": 0.5703036104538804, + "grad_norm": 5.104076379509442, + "learning_rate": 4.193860750426543e-06, + "loss": 0.8645, + "step": 7894 + }, + { + "epoch": 0.57037585565409, + "grad_norm": 6.82724335519503, + "learning_rate": 4.193645608601651e-06, + "loss": 1.0236, + "step": 7895 + }, + { + "epoch": 0.5704481008542995, + "grad_norm": 7.74504250738252, + "learning_rate": 4.193430443592065e-06, + "loss": 0.9778, + "step": 7896 + }, + { + "epoch": 0.570520346054509, + "grad_norm": 6.306419238720333, + "learning_rate": 4.193215255400732e-06, + "loss": 0.9082, + "step": 7897 + }, + { + "epoch": 0.5705925912547185, + "grad_norm": 5.843979122137627, + "learning_rate": 4.193000044030597e-06, + "loss": 0.9411, + "step": 7898 + }, + { + "epoch": 0.570664836454928, + "grad_norm": 6.011505856874493, + "learning_rate": 4.192784809484605e-06, + "loss": 0.8591, + "step": 7899 + }, + { + "epoch": 0.5707370816551376, + "grad_norm": 6.315018255429374, + "learning_rate": 4.192569551765703e-06, + "loss": 0.9386, + "step": 7900 + }, + { + "epoch": 0.570809326855347, + "grad_norm": 5.769408269254634, + "learning_rate": 4.1923542708768386e-06, + "loss": 0.8415, + "step": 7901 + }, + { + "epoch": 0.5708815720555566, + "grad_norm": 6.547297425400816, + "learning_rate": 4.192138966820959e-06, + "loss": 0.7953, + "step": 7902 + }, + { + "epoch": 0.5709538172557661, + "grad_norm": 5.787176939678996, + "learning_rate": 4.19192363960101e-06, + "loss": 0.8381, + "step": 7903 + }, + { + "epoch": 0.5710260624559755, + "grad_norm": 5.285489699846884, + "learning_rate": 4.1917082892199405e-06, + "loss": 0.8902, + "step": 7904 + }, + { + "epoch": 0.5710983076561851, + "grad_norm": 6.054185967131285, + "learning_rate": 4.191492915680697e-06, + "loss": 0.9268, + "step": 7905 + }, + { + "epoch": 0.5711705528563946, + "grad_norm": 5.783926349471722, + "learning_rate": 4.19127751898623e-06, + "loss": 0.8308, + "step": 7906 + }, + { + "epoch": 0.5712427980566042, + "grad_norm": 5.819989701094806, + "learning_rate": 4.191062099139487e-06, + "loss": 0.8796, + "step": 7907 + }, + { + "epoch": 0.5713150432568136, + "grad_norm": 6.448074396409432, + "learning_rate": 4.190846656143417e-06, + "loss": 0.8134, + "step": 7908 + }, + { + "epoch": 0.5713872884570231, + "grad_norm": 5.798330119235266, + "learning_rate": 4.190631190000967e-06, + "loss": 0.839, + "step": 7909 + }, + { + "epoch": 0.5714595336572327, + "grad_norm": 5.972027423057269, + "learning_rate": 4.190415700715092e-06, + "loss": 0.8975, + "step": 7910 + }, + { + "epoch": 0.5715317788574421, + "grad_norm": 7.940598491337713, + "learning_rate": 4.190200188288735e-06, + "loss": 0.8754, + "step": 7911 + }, + { + "epoch": 0.5716040240576516, + "grad_norm": 7.013931851732804, + "learning_rate": 4.189984652724853e-06, + "loss": 0.9844, + "step": 7912 + }, + { + "epoch": 0.5716762692578612, + "grad_norm": 5.965789859785797, + "learning_rate": 4.189769094026392e-06, + "loss": 0.8712, + "step": 7913 + }, + { + "epoch": 0.5717485144580707, + "grad_norm": 6.308034998475531, + "learning_rate": 4.189553512196304e-06, + "loss": 0.8743, + "step": 7914 + }, + { + "epoch": 0.5718207596582802, + "grad_norm": 7.429874133023275, + "learning_rate": 4.1893379072375405e-06, + "loss": 0.8646, + "step": 7915 + }, + { + "epoch": 0.5718930048584897, + "grad_norm": 6.3944850167273355, + "learning_rate": 4.189122279153052e-06, + "loss": 0.8481, + "step": 7916 + }, + { + "epoch": 0.5719652500586992, + "grad_norm": 7.6018713705074115, + "learning_rate": 4.188906627945792e-06, + "loss": 0.9147, + "step": 7917 + }, + { + "epoch": 0.5720374952589088, + "grad_norm": 8.373331843252515, + "learning_rate": 4.188690953618711e-06, + "loss": 0.8879, + "step": 7918 + }, + { + "epoch": 0.5721097404591182, + "grad_norm": 6.474983492207933, + "learning_rate": 4.188475256174762e-06, + "loss": 0.8064, + "step": 7919 + }, + { + "epoch": 0.5721819856593278, + "grad_norm": 6.191067678538504, + "learning_rate": 4.188259535616898e-06, + "loss": 0.8706, + "step": 7920 + }, + { + "epoch": 0.5722542308595373, + "grad_norm": 6.317288838983578, + "learning_rate": 4.188043791948072e-06, + "loss": 0.94, + "step": 7921 + }, + { + "epoch": 0.5723264760597467, + "grad_norm": 7.100593123040822, + "learning_rate": 4.187828025171238e-06, + "loss": 0.8102, + "step": 7922 + }, + { + "epoch": 0.5723987212599563, + "grad_norm": 6.10193510786102, + "learning_rate": 4.187612235289347e-06, + "loss": 0.8835, + "step": 7923 + }, + { + "epoch": 0.5724709664601658, + "grad_norm": 5.44546188959578, + "learning_rate": 4.187396422305356e-06, + "loss": 0.7964, + "step": 7924 + }, + { + "epoch": 0.5725432116603754, + "grad_norm": 6.466159467699413, + "learning_rate": 4.187180586222217e-06, + "loss": 0.8785, + "step": 7925 + }, + { + "epoch": 0.5726154568605848, + "grad_norm": 6.203559764164329, + "learning_rate": 4.186964727042887e-06, + "loss": 0.854, + "step": 7926 + }, + { + "epoch": 0.5726877020607943, + "grad_norm": 5.8719263355717315, + "learning_rate": 4.1867488447703195e-06, + "loss": 0.8647, + "step": 7927 + }, + { + "epoch": 0.5727599472610039, + "grad_norm": 7.225985777240422, + "learning_rate": 4.186532939407471e-06, + "loss": 0.9191, + "step": 7928 + }, + { + "epoch": 0.5728321924612133, + "grad_norm": 7.131501026064964, + "learning_rate": 4.1863170109572935e-06, + "loss": 0.8351, + "step": 7929 + }, + { + "epoch": 0.5729044376614228, + "grad_norm": 5.334479963703519, + "learning_rate": 4.1861010594227475e-06, + "loss": 0.9375, + "step": 7930 + }, + { + "epoch": 0.5729766828616324, + "grad_norm": 7.501926174775856, + "learning_rate": 4.185885084806787e-06, + "loss": 0.821, + "step": 7931 + }, + { + "epoch": 0.5730489280618419, + "grad_norm": 5.963850796834567, + "learning_rate": 4.185669087112367e-06, + "loss": 0.9867, + "step": 7932 + }, + { + "epoch": 0.5731211732620514, + "grad_norm": 7.613300039660626, + "learning_rate": 4.1854530663424484e-06, + "loss": 0.8858, + "step": 7933 + }, + { + "epoch": 0.5731934184622609, + "grad_norm": 5.260339093029766, + "learning_rate": 4.185237022499984e-06, + "loss": 0.8744, + "step": 7934 + }, + { + "epoch": 0.5732656636624704, + "grad_norm": 5.602660174005793, + "learning_rate": 4.185020955587935e-06, + "loss": 0.899, + "step": 7935 + }, + { + "epoch": 0.57333790886268, + "grad_norm": 6.396457538111641, + "learning_rate": 4.184804865609257e-06, + "loss": 0.8173, + "step": 7936 + }, + { + "epoch": 0.5734101540628894, + "grad_norm": 5.613904054371561, + "learning_rate": 4.184588752566907e-06, + "loss": 0.7891, + "step": 7937 + }, + { + "epoch": 0.573482399263099, + "grad_norm": 7.667509060342198, + "learning_rate": 4.184372616463846e-06, + "loss": 0.8981, + "step": 7938 + }, + { + "epoch": 0.5735546444633085, + "grad_norm": 7.0230551202917555, + "learning_rate": 4.184156457303032e-06, + "loss": 0.791, + "step": 7939 + }, + { + "epoch": 0.5736268896635179, + "grad_norm": 6.963251656138028, + "learning_rate": 4.183940275087424e-06, + "loss": 0.9212, + "step": 7940 + }, + { + "epoch": 0.5736991348637275, + "grad_norm": 6.111899242249671, + "learning_rate": 4.183724069819981e-06, + "loss": 0.8941, + "step": 7941 + }, + { + "epoch": 0.573771380063937, + "grad_norm": 4.643285945331947, + "learning_rate": 4.1835078415036625e-06, + "loss": 0.8422, + "step": 7942 + }, + { + "epoch": 0.5738436252641466, + "grad_norm": 6.509460313872484, + "learning_rate": 4.183291590141429e-06, + "loss": 0.8435, + "step": 7943 + }, + { + "epoch": 0.573915870464356, + "grad_norm": 9.072553438316762, + "learning_rate": 4.18307531573624e-06, + "loss": 0.9389, + "step": 7944 + }, + { + "epoch": 0.5739881156645655, + "grad_norm": 7.829414794411471, + "learning_rate": 4.182859018291057e-06, + "loss": 0.9148, + "step": 7945 + }, + { + "epoch": 0.5740603608647751, + "grad_norm": 6.648695979937914, + "learning_rate": 4.182642697808841e-06, + "loss": 0.8387, + "step": 7946 + }, + { + "epoch": 0.5741326060649845, + "grad_norm": 6.623943892457852, + "learning_rate": 4.182426354292554e-06, + "loss": 0.879, + "step": 7947 + }, + { + "epoch": 0.574204851265194, + "grad_norm": 5.132069247937424, + "learning_rate": 4.182209987745155e-06, + "loss": 0.8272, + "step": 7948 + }, + { + "epoch": 0.5742770964654036, + "grad_norm": 6.41351769534595, + "learning_rate": 4.181993598169607e-06, + "loss": 0.9244, + "step": 7949 + }, + { + "epoch": 0.5743493416656131, + "grad_norm": 5.9345323072719625, + "learning_rate": 4.181777185568874e-06, + "loss": 0.8172, + "step": 7950 + }, + { + "epoch": 0.5744215868658226, + "grad_norm": 6.987304346283185, + "learning_rate": 4.181560749945916e-06, + "loss": 0.9383, + "step": 7951 + }, + { + "epoch": 0.5744938320660321, + "grad_norm": 6.723007485558744, + "learning_rate": 4.181344291303698e-06, + "loss": 0.9046, + "step": 7952 + }, + { + "epoch": 0.5745660772662416, + "grad_norm": 7.084922372935188, + "learning_rate": 4.181127809645182e-06, + "loss": 0.8599, + "step": 7953 + }, + { + "epoch": 0.5746383224664512, + "grad_norm": 10.134385268505822, + "learning_rate": 4.180911304973331e-06, + "loss": 0.9025, + "step": 7954 + }, + { + "epoch": 0.5747105676666606, + "grad_norm": 6.0389715387716265, + "learning_rate": 4.18069477729111e-06, + "loss": 0.8507, + "step": 7955 + }, + { + "epoch": 0.5747828128668702, + "grad_norm": 6.908421239841966, + "learning_rate": 4.180478226601482e-06, + "loss": 0.8643, + "step": 7956 + }, + { + "epoch": 0.5748550580670797, + "grad_norm": 5.547368245290062, + "learning_rate": 4.180261652907412e-06, + "loss": 0.8589, + "step": 7957 + }, + { + "epoch": 0.5749273032672891, + "grad_norm": 6.1614410666715305, + "learning_rate": 4.180045056211865e-06, + "loss": 0.8828, + "step": 7958 + }, + { + "epoch": 0.5749995484674987, + "grad_norm": 8.558446662645258, + "learning_rate": 4.179828436517805e-06, + "loss": 0.8823, + "step": 7959 + }, + { + "epoch": 0.5750717936677082, + "grad_norm": 8.405980442646083, + "learning_rate": 4.179611793828198e-06, + "loss": 1.0353, + "step": 7960 + }, + { + "epoch": 0.5751440388679178, + "grad_norm": 9.592430595602208, + "learning_rate": 4.17939512814601e-06, + "loss": 0.887, + "step": 7961 + }, + { + "epoch": 0.5752162840681272, + "grad_norm": 7.006149043530691, + "learning_rate": 4.179178439474206e-06, + "loss": 0.8716, + "step": 7962 + }, + { + "epoch": 0.5752885292683367, + "grad_norm": 7.1040979943261116, + "learning_rate": 4.1789617278157545e-06, + "loss": 0.8607, + "step": 7963 + }, + { + "epoch": 0.5753607744685463, + "grad_norm": 6.737682831217423, + "learning_rate": 4.178744993173619e-06, + "loss": 0.7935, + "step": 7964 + }, + { + "epoch": 0.5754330196687557, + "grad_norm": 7.0686027693656674, + "learning_rate": 4.1785282355507695e-06, + "loss": 0.8542, + "step": 7965 + }, + { + "epoch": 0.5755052648689652, + "grad_norm": 8.448880129310222, + "learning_rate": 4.178311454950171e-06, + "loss": 0.8432, + "step": 7966 + }, + { + "epoch": 0.5755775100691748, + "grad_norm": 5.509932998554481, + "learning_rate": 4.178094651374791e-06, + "loss": 0.8842, + "step": 7967 + }, + { + "epoch": 0.5756497552693843, + "grad_norm": 6.324461528790446, + "learning_rate": 4.177877824827599e-06, + "loss": 0.9471, + "step": 7968 + }, + { + "epoch": 0.5757220004695938, + "grad_norm": 5.3452112531175136, + "learning_rate": 4.177660975311563e-06, + "loss": 0.8691, + "step": 7969 + }, + { + "epoch": 0.5757942456698033, + "grad_norm": 8.669598694767844, + "learning_rate": 4.177444102829649e-06, + "loss": 0.8754, + "step": 7970 + }, + { + "epoch": 0.5758664908700128, + "grad_norm": 8.098757575553062, + "learning_rate": 4.17722720738483e-06, + "loss": 1.0133, + "step": 7971 + }, + { + "epoch": 0.5759387360702223, + "grad_norm": 6.186286884412957, + "learning_rate": 4.177010288980071e-06, + "loss": 0.8515, + "step": 7972 + }, + { + "epoch": 0.5760109812704318, + "grad_norm": 7.551675916183533, + "learning_rate": 4.176793347618344e-06, + "loss": 0.8276, + "step": 7973 + }, + { + "epoch": 0.5760832264706414, + "grad_norm": 6.750573875615797, + "learning_rate": 4.176576383302618e-06, + "loss": 0.8637, + "step": 7974 + }, + { + "epoch": 0.5761554716708509, + "grad_norm": 8.439352100688449, + "learning_rate": 4.1763593960358626e-06, + "loss": 0.9399, + "step": 7975 + }, + { + "epoch": 0.5762277168710603, + "grad_norm": 6.179739969949496, + "learning_rate": 4.17614238582105e-06, + "loss": 0.8545, + "step": 7976 + }, + { + "epoch": 0.5762999620712699, + "grad_norm": 10.664919551182892, + "learning_rate": 4.175925352661148e-06, + "loss": 0.8955, + "step": 7977 + }, + { + "epoch": 0.5763722072714794, + "grad_norm": 7.813952013503534, + "learning_rate": 4.17570829655913e-06, + "loss": 0.88, + "step": 7978 + }, + { + "epoch": 0.576444452471689, + "grad_norm": 5.873409299384448, + "learning_rate": 4.1754912175179665e-06, + "loss": 0.8996, + "step": 7979 + }, + { + "epoch": 0.5765166976718984, + "grad_norm": 5.341345351975229, + "learning_rate": 4.17527411554063e-06, + "loss": 0.8296, + "step": 7980 + }, + { + "epoch": 0.5765889428721079, + "grad_norm": 6.531902955922258, + "learning_rate": 4.17505699063009e-06, + "loss": 0.8531, + "step": 7981 + }, + { + "epoch": 0.5766611880723175, + "grad_norm": 6.5097676759860255, + "learning_rate": 4.174839842789321e-06, + "loss": 0.8958, + "step": 7982 + }, + { + "epoch": 0.5767334332725269, + "grad_norm": 10.568210841164213, + "learning_rate": 4.174622672021295e-06, + "loss": 0.8642, + "step": 7983 + }, + { + "epoch": 0.5768056784727364, + "grad_norm": 6.899393259544894, + "learning_rate": 4.174405478328984e-06, + "loss": 0.7904, + "step": 7984 + }, + { + "epoch": 0.576877923672946, + "grad_norm": 8.996137849978751, + "learning_rate": 4.174188261715364e-06, + "loss": 0.8622, + "step": 7985 + }, + { + "epoch": 0.5769501688731555, + "grad_norm": 5.830253805456085, + "learning_rate": 4.173971022183405e-06, + "loss": 0.8583, + "step": 7986 + }, + { + "epoch": 0.577022414073365, + "grad_norm": 6.165673184484488, + "learning_rate": 4.173753759736084e-06, + "loss": 0.9007, + "step": 7987 + }, + { + "epoch": 0.5770946592735745, + "grad_norm": 7.557801103060846, + "learning_rate": 4.173536474376373e-06, + "loss": 0.8397, + "step": 7988 + }, + { + "epoch": 0.577166904473784, + "grad_norm": 7.990317684346308, + "learning_rate": 4.173319166107246e-06, + "loss": 0.8551, + "step": 7989 + }, + { + "epoch": 0.5772391496739935, + "grad_norm": 5.867142312679685, + "learning_rate": 4.17310183493168e-06, + "loss": 0.8105, + "step": 7990 + }, + { + "epoch": 0.577311394874203, + "grad_norm": 6.376223876756028, + "learning_rate": 4.172884480852649e-06, + "loss": 0.8724, + "step": 7991 + }, + { + "epoch": 0.5773836400744126, + "grad_norm": 8.167366724436082, + "learning_rate": 4.172667103873128e-06, + "loss": 0.9658, + "step": 7992 + }, + { + "epoch": 0.5774558852746221, + "grad_norm": 7.083981682519477, + "learning_rate": 4.172449703996093e-06, + "loss": 0.9647, + "step": 7993 + }, + { + "epoch": 0.5775281304748315, + "grad_norm": 5.748050525079541, + "learning_rate": 4.17223228122452e-06, + "loss": 0.8846, + "step": 7994 + }, + { + "epoch": 0.5776003756750411, + "grad_norm": 6.692190931534405, + "learning_rate": 4.172014835561385e-06, + "loss": 0.834, + "step": 7995 + }, + { + "epoch": 0.5776726208752506, + "grad_norm": 4.940371680234918, + "learning_rate": 4.171797367009667e-06, + "loss": 0.9046, + "step": 7996 + }, + { + "epoch": 0.5777448660754602, + "grad_norm": 6.597075432823049, + "learning_rate": 4.171579875572339e-06, + "loss": 0.8303, + "step": 7997 + }, + { + "epoch": 0.5778171112756696, + "grad_norm": 5.44443954848158, + "learning_rate": 4.171362361252382e-06, + "loss": 0.7713, + "step": 7998 + }, + { + "epoch": 0.5778893564758791, + "grad_norm": 5.741736403412258, + "learning_rate": 4.1711448240527716e-06, + "loss": 0.8744, + "step": 7999 + }, + { + "epoch": 0.5779616016760887, + "grad_norm": 5.603756271518769, + "learning_rate": 4.1709272639764855e-06, + "loss": 0.8894, + "step": 8000 + }, + { + "epoch": 0.5780338468762981, + "grad_norm": 8.324855255609881, + "learning_rate": 4.170709681026503e-06, + "loss": 0.9301, + "step": 8001 + }, + { + "epoch": 0.5781060920765076, + "grad_norm": 8.195962274646089, + "learning_rate": 4.170492075205802e-06, + "loss": 0.9179, + "step": 8002 + }, + { + "epoch": 0.5781783372767172, + "grad_norm": 5.5120006032315, + "learning_rate": 4.170274446517361e-06, + "loss": 0.8161, + "step": 8003 + }, + { + "epoch": 0.5782505824769267, + "grad_norm": 5.26722996190195, + "learning_rate": 4.170056794964161e-06, + "loss": 0.9021, + "step": 8004 + }, + { + "epoch": 0.5783228276771362, + "grad_norm": 5.759269540352121, + "learning_rate": 4.16983912054918e-06, + "loss": 0.919, + "step": 8005 + }, + { + "epoch": 0.5783950728773457, + "grad_norm": 7.05734767164317, + "learning_rate": 4.169621423275398e-06, + "loss": 0.8879, + "step": 8006 + }, + { + "epoch": 0.5784673180775552, + "grad_norm": 5.81826299643238, + "learning_rate": 4.169403703145794e-06, + "loss": 0.8676, + "step": 8007 + }, + { + "epoch": 0.5785395632777647, + "grad_norm": 5.756947508063253, + "learning_rate": 4.16918596016335e-06, + "loss": 0.8968, + "step": 8008 + }, + { + "epoch": 0.5786118084779742, + "grad_norm": 6.781259563105296, + "learning_rate": 4.1689681943310466e-06, + "loss": 0.9194, + "step": 8009 + }, + { + "epoch": 0.5786840536781838, + "grad_norm": 5.591005813652923, + "learning_rate": 4.168750405651864e-06, + "loss": 0.8788, + "step": 8010 + }, + { + "epoch": 0.5787562988783933, + "grad_norm": 5.849766535868754, + "learning_rate": 4.168532594128785e-06, + "loss": 0.834, + "step": 8011 + }, + { + "epoch": 0.5788285440786027, + "grad_norm": 6.310751493413529, + "learning_rate": 4.168314759764789e-06, + "loss": 0.8586, + "step": 8012 + }, + { + "epoch": 0.5789007892788123, + "grad_norm": 5.896439412196077, + "learning_rate": 4.16809690256286e-06, + "loss": 0.9003, + "step": 8013 + }, + { + "epoch": 0.5789730344790218, + "grad_norm": 5.655343336076189, + "learning_rate": 4.167879022525979e-06, + "loss": 0.8915, + "step": 8014 + }, + { + "epoch": 0.5790452796792314, + "grad_norm": 7.60312905395751, + "learning_rate": 4.167661119657131e-06, + "loss": 0.864, + "step": 8015 + }, + { + "epoch": 0.5791175248794408, + "grad_norm": 5.008878073791123, + "learning_rate": 4.167443193959295e-06, + "loss": 0.794, + "step": 8016 + }, + { + "epoch": 0.5791897700796503, + "grad_norm": 5.883066686075597, + "learning_rate": 4.167225245435458e-06, + "loss": 0.8934, + "step": 8017 + }, + { + "epoch": 0.5792620152798599, + "grad_norm": 5.253330627447164, + "learning_rate": 4.1670072740886005e-06, + "loss": 0.7999, + "step": 8018 + }, + { + "epoch": 0.5793342604800693, + "grad_norm": 6.97344702930043, + "learning_rate": 4.166789279921708e-06, + "loss": 0.9073, + "step": 8019 + }, + { + "epoch": 0.5794065056802788, + "grad_norm": 7.706804306006737, + "learning_rate": 4.166571262937766e-06, + "loss": 0.8737, + "step": 8020 + }, + { + "epoch": 0.5794787508804884, + "grad_norm": 6.893514775459081, + "learning_rate": 4.166353223139756e-06, + "loss": 0.8415, + "step": 8021 + }, + { + "epoch": 0.5795509960806979, + "grad_norm": 5.43453442612253, + "learning_rate": 4.166135160530665e-06, + "loss": 0.8389, + "step": 8022 + }, + { + "epoch": 0.5796232412809074, + "grad_norm": 7.521424021115943, + "learning_rate": 4.165917075113476e-06, + "loss": 0.8832, + "step": 8023 + }, + { + "epoch": 0.5796954864811169, + "grad_norm": 8.754356389726194, + "learning_rate": 4.1656989668911765e-06, + "loss": 0.7972, + "step": 8024 + }, + { + "epoch": 0.5797677316813264, + "grad_norm": 5.5840328784182365, + "learning_rate": 4.165480835866751e-06, + "loss": 0.8827, + "step": 8025 + }, + { + "epoch": 0.5798399768815359, + "grad_norm": 7.399215512319796, + "learning_rate": 4.1652626820431865e-06, + "loss": 0.8854, + "step": 8026 + }, + { + "epoch": 0.5799122220817454, + "grad_norm": 6.4820405365766005, + "learning_rate": 4.165044505423469e-06, + "loss": 0.8846, + "step": 8027 + }, + { + "epoch": 0.579984467281955, + "grad_norm": 6.419412628114058, + "learning_rate": 4.164826306010585e-06, + "loss": 0.8737, + "step": 8028 + }, + { + "epoch": 0.5800567124821645, + "grad_norm": 5.379031776176926, + "learning_rate": 4.164608083807522e-06, + "loss": 0.884, + "step": 8029 + }, + { + "epoch": 0.5801289576823739, + "grad_norm": 6.010476185345831, + "learning_rate": 4.164389838817267e-06, + "loss": 0.9398, + "step": 8030 + }, + { + "epoch": 0.5802012028825835, + "grad_norm": 6.682859432830726, + "learning_rate": 4.164171571042806e-06, + "loss": 0.8298, + "step": 8031 + }, + { + "epoch": 0.580273448082793, + "grad_norm": 5.419597928100433, + "learning_rate": 4.163953280487129e-06, + "loss": 0.8863, + "step": 8032 + }, + { + "epoch": 0.5803456932830026, + "grad_norm": 5.569339105376751, + "learning_rate": 4.163734967153223e-06, + "loss": 0.8076, + "step": 8033 + }, + { + "epoch": 0.580417938483212, + "grad_norm": 5.864789490495141, + "learning_rate": 4.163516631044078e-06, + "loss": 0.8616, + "step": 8034 + }, + { + "epoch": 0.5804901836834215, + "grad_norm": 6.999663753608042, + "learning_rate": 4.163298272162682e-06, + "loss": 0.8416, + "step": 8035 + }, + { + "epoch": 0.5805624288836311, + "grad_norm": 6.0281642471546695, + "learning_rate": 4.163079890512024e-06, + "loss": 0.9669, + "step": 8036 + }, + { + "epoch": 0.5806346740838405, + "grad_norm": 6.507599423001092, + "learning_rate": 4.162861486095094e-06, + "loss": 0.9122, + "step": 8037 + }, + { + "epoch": 0.58070691928405, + "grad_norm": 5.947037274967554, + "learning_rate": 4.16264305891488e-06, + "loss": 0.873, + "step": 8038 + }, + { + "epoch": 0.5807791644842596, + "grad_norm": 7.233698704130458, + "learning_rate": 4.162424608974374e-06, + "loss": 0.8113, + "step": 8039 + }, + { + "epoch": 0.5808514096844691, + "grad_norm": 5.588611821578339, + "learning_rate": 4.162206136276566e-06, + "loss": 0.9272, + "step": 8040 + }, + { + "epoch": 0.5809236548846786, + "grad_norm": 5.652965287398052, + "learning_rate": 4.161987640824448e-06, + "loss": 0.7941, + "step": 8041 + }, + { + "epoch": 0.5809959000848881, + "grad_norm": 5.528966917422295, + "learning_rate": 4.161769122621007e-06, + "loss": 0.8222, + "step": 8042 + }, + { + "epoch": 0.5810681452850976, + "grad_norm": 6.081650836589795, + "learning_rate": 4.161550581669239e-06, + "loss": 0.9112, + "step": 8043 + }, + { + "epoch": 0.5811403904853071, + "grad_norm": 6.19179255005448, + "learning_rate": 4.161332017972134e-06, + "loss": 0.9066, + "step": 8044 + }, + { + "epoch": 0.5812126356855166, + "grad_norm": 7.124422551898368, + "learning_rate": 4.161113431532682e-06, + "loss": 0.8726, + "step": 8045 + }, + { + "epoch": 0.5812848808857262, + "grad_norm": 6.317865187880045, + "learning_rate": 4.160894822353877e-06, + "loss": 0.8721, + "step": 8046 + }, + { + "epoch": 0.5813571260859357, + "grad_norm": 5.179584761071021, + "learning_rate": 4.160676190438713e-06, + "loss": 0.8526, + "step": 8047 + }, + { + "epoch": 0.5814293712861451, + "grad_norm": 7.203303998928309, + "learning_rate": 4.16045753579018e-06, + "loss": 0.8631, + "step": 8048 + }, + { + "epoch": 0.5815016164863547, + "grad_norm": 6.426849443413977, + "learning_rate": 4.160238858411273e-06, + "loss": 0.8505, + "step": 8049 + }, + { + "epoch": 0.5815738616865642, + "grad_norm": 5.9601931925958915, + "learning_rate": 4.160020158304985e-06, + "loss": 0.8739, + "step": 8050 + }, + { + "epoch": 0.5816461068867738, + "grad_norm": 6.356109455546583, + "learning_rate": 4.15980143547431e-06, + "loss": 0.9052, + "step": 8051 + }, + { + "epoch": 0.5817183520869832, + "grad_norm": 6.063590040234035, + "learning_rate": 4.159582689922243e-06, + "loss": 0.8578, + "step": 8052 + }, + { + "epoch": 0.5817905972871927, + "grad_norm": 6.88148109817154, + "learning_rate": 4.159363921651777e-06, + "loss": 0.9234, + "step": 8053 + }, + { + "epoch": 0.5818628424874023, + "grad_norm": 5.470471443628214, + "learning_rate": 4.159145130665907e-06, + "loss": 0.9769, + "step": 8054 + }, + { + "epoch": 0.5819350876876117, + "grad_norm": 5.686061132166138, + "learning_rate": 4.158926316967628e-06, + "loss": 0.8289, + "step": 8055 + }, + { + "epoch": 0.5820073328878212, + "grad_norm": 6.2976491750363435, + "learning_rate": 4.158707480559937e-06, + "loss": 0.8533, + "step": 8056 + }, + { + "epoch": 0.5820795780880308, + "grad_norm": 6.475125768905538, + "learning_rate": 4.158488621445827e-06, + "loss": 0.8095, + "step": 8057 + }, + { + "epoch": 0.5821518232882403, + "grad_norm": 6.679114958558407, + "learning_rate": 4.158269739628297e-06, + "loss": 0.8855, + "step": 8058 + }, + { + "epoch": 0.5822240684884498, + "grad_norm": 5.662382900214589, + "learning_rate": 4.15805083511034e-06, + "loss": 0.7995, + "step": 8059 + }, + { + "epoch": 0.5822963136886593, + "grad_norm": 7.865518099240661, + "learning_rate": 4.1578319078949556e-06, + "loss": 0.8956, + "step": 8060 + }, + { + "epoch": 0.5823685588888688, + "grad_norm": 7.63175415073339, + "learning_rate": 4.15761295798514e-06, + "loss": 0.8741, + "step": 8061 + }, + { + "epoch": 0.5824408040890783, + "grad_norm": 6.059245701040367, + "learning_rate": 4.15739398538389e-06, + "loss": 0.8333, + "step": 8062 + }, + { + "epoch": 0.5825130492892878, + "grad_norm": 6.898006991716062, + "learning_rate": 4.157174990094203e-06, + "loss": 0.9129, + "step": 8063 + }, + { + "epoch": 0.5825852944894974, + "grad_norm": 7.82489032120057, + "learning_rate": 4.156955972119077e-06, + "loss": 0.9145, + "step": 8064 + }, + { + "epoch": 0.5826575396897069, + "grad_norm": 6.927847806197108, + "learning_rate": 4.15673693146151e-06, + "loss": 0.8514, + "step": 8065 + }, + { + "epoch": 0.5827297848899163, + "grad_norm": 6.372644026495418, + "learning_rate": 4.156517868124501e-06, + "loss": 0.9049, + "step": 8066 + }, + { + "epoch": 0.5828020300901259, + "grad_norm": 7.395834250293371, + "learning_rate": 4.156298782111049e-06, + "loss": 0.8407, + "step": 8067 + }, + { + "epoch": 0.5828742752903354, + "grad_norm": 6.45272153465942, + "learning_rate": 4.156079673424152e-06, + "loss": 0.8458, + "step": 8068 + }, + { + "epoch": 0.582946520490545, + "grad_norm": 6.1766570674837435, + "learning_rate": 4.155860542066811e-06, + "loss": 0.898, + "step": 8069 + }, + { + "epoch": 0.5830187656907544, + "grad_norm": 6.398287555954456, + "learning_rate": 4.155641388042024e-06, + "loss": 0.8448, + "step": 8070 + }, + { + "epoch": 0.5830910108909639, + "grad_norm": 6.007219739227235, + "learning_rate": 4.155422211352792e-06, + "loss": 0.9381, + "step": 8071 + }, + { + "epoch": 0.5831632560911735, + "grad_norm": 8.78804160737583, + "learning_rate": 4.155203012002116e-06, + "loss": 0.9333, + "step": 8072 + }, + { + "epoch": 0.5832355012913829, + "grad_norm": 6.920911321443666, + "learning_rate": 4.154983789992995e-06, + "loss": 0.7973, + "step": 8073 + }, + { + "epoch": 0.5833077464915924, + "grad_norm": 7.178323211391172, + "learning_rate": 4.154764545328431e-06, + "loss": 0.9236, + "step": 8074 + }, + { + "epoch": 0.583379991691802, + "grad_norm": 6.480327473595687, + "learning_rate": 4.1545452780114255e-06, + "loss": 0.8508, + "step": 8075 + }, + { + "epoch": 0.5834522368920115, + "grad_norm": 6.390028648252548, + "learning_rate": 4.154325988044981e-06, + "loss": 0.8908, + "step": 8076 + }, + { + "epoch": 0.583524482092221, + "grad_norm": 5.604135090149289, + "learning_rate": 4.154106675432097e-06, + "loss": 0.8683, + "step": 8077 + }, + { + "epoch": 0.5835967272924305, + "grad_norm": 6.032189804935968, + "learning_rate": 4.153887340175777e-06, + "loss": 0.7757, + "step": 8078 + }, + { + "epoch": 0.58366897249264, + "grad_norm": 6.582038784305777, + "learning_rate": 4.1536679822790235e-06, + "loss": 0.947, + "step": 8079 + }, + { + "epoch": 0.5837412176928495, + "grad_norm": 8.109649241960883, + "learning_rate": 4.15344860174484e-06, + "loss": 0.8912, + "step": 8080 + }, + { + "epoch": 0.583813462893059, + "grad_norm": 7.319484928541121, + "learning_rate": 4.153229198576228e-06, + "loss": 0.9255, + "step": 8081 + }, + { + "epoch": 0.5838857080932686, + "grad_norm": 6.184427750927753, + "learning_rate": 4.153009772776192e-06, + "loss": 0.9105, + "step": 8082 + }, + { + "epoch": 0.5839579532934781, + "grad_norm": 7.142285978096923, + "learning_rate": 4.152790324347736e-06, + "loss": 0.9111, + "step": 8083 + }, + { + "epoch": 0.5840301984936875, + "grad_norm": 8.926371014432723, + "learning_rate": 4.1525708532938636e-06, + "loss": 0.8721, + "step": 8084 + }, + { + "epoch": 0.5841024436938971, + "grad_norm": 6.336991943219411, + "learning_rate": 4.15235135961758e-06, + "loss": 0.8676, + "step": 8085 + }, + { + "epoch": 0.5841746888941066, + "grad_norm": 7.606950001411512, + "learning_rate": 4.152131843321889e-06, + "loss": 0.9557, + "step": 8086 + }, + { + "epoch": 0.5842469340943162, + "grad_norm": 5.544886272836128, + "learning_rate": 4.151912304409795e-06, + "loss": 0.9165, + "step": 8087 + }, + { + "epoch": 0.5843191792945256, + "grad_norm": 7.211084829294633, + "learning_rate": 4.151692742884305e-06, + "loss": 0.804, + "step": 8088 + }, + { + "epoch": 0.5843914244947351, + "grad_norm": 4.965905676964161, + "learning_rate": 4.151473158748423e-06, + "loss": 0.8058, + "step": 8089 + }, + { + "epoch": 0.5844636696949447, + "grad_norm": 7.6952455779133215, + "learning_rate": 4.151253552005156e-06, + "loss": 0.8911, + "step": 8090 + }, + { + "epoch": 0.5845359148951541, + "grad_norm": 6.217509850531407, + "learning_rate": 4.151033922657512e-06, + "loss": 0.8514, + "step": 8091 + }, + { + "epoch": 0.5846081600953636, + "grad_norm": 6.4846659032074045, + "learning_rate": 4.150814270708493e-06, + "loss": 0.9009, + "step": 8092 + }, + { + "epoch": 0.5846804052955732, + "grad_norm": 5.987346658355788, + "learning_rate": 4.150594596161109e-06, + "loss": 0.8782, + "step": 8093 + }, + { + "epoch": 0.5847526504957827, + "grad_norm": 7.092719318278488, + "learning_rate": 4.150374899018368e-06, + "loss": 0.835, + "step": 8094 + }, + { + "epoch": 0.5848248956959922, + "grad_norm": 5.95777531213807, + "learning_rate": 4.150155179283274e-06, + "loss": 0.845, + "step": 8095 + }, + { + "epoch": 0.5848971408962017, + "grad_norm": 7.813991556835275, + "learning_rate": 4.149935436958838e-06, + "loss": 0.9091, + "step": 8096 + }, + { + "epoch": 0.5849693860964112, + "grad_norm": 5.695458178756718, + "learning_rate": 4.149715672048067e-06, + "loss": 0.8928, + "step": 8097 + }, + { + "epoch": 0.5850416312966207, + "grad_norm": 7.181825736582901, + "learning_rate": 4.1494958845539694e-06, + "loss": 0.8236, + "step": 8098 + }, + { + "epoch": 0.5851138764968302, + "grad_norm": 5.998934969155788, + "learning_rate": 4.149276074479553e-06, + "loss": 0.8274, + "step": 8099 + }, + { + "epoch": 0.5851861216970398, + "grad_norm": 6.004229326558756, + "learning_rate": 4.149056241827828e-06, + "loss": 0.9455, + "step": 8100 + }, + { + "epoch": 0.5852583668972493, + "grad_norm": 6.337976393391158, + "learning_rate": 4.148836386601805e-06, + "loss": 0.7404, + "step": 8101 + }, + { + "epoch": 0.5853306120974587, + "grad_norm": 6.310170263057828, + "learning_rate": 4.14861650880449e-06, + "loss": 0.8815, + "step": 8102 + }, + { + "epoch": 0.5854028572976683, + "grad_norm": 7.017896934293855, + "learning_rate": 4.148396608438896e-06, + "loss": 0.7975, + "step": 8103 + }, + { + "epoch": 0.5854751024978778, + "grad_norm": 6.131193494001692, + "learning_rate": 4.148176685508032e-06, + "loss": 0.7869, + "step": 8104 + }, + { + "epoch": 0.5855473476980874, + "grad_norm": 7.302446545878985, + "learning_rate": 4.14795674001491e-06, + "loss": 0.9093, + "step": 8105 + }, + { + "epoch": 0.5856195928982968, + "grad_norm": 6.975036796007894, + "learning_rate": 4.147736771962539e-06, + "loss": 0.9632, + "step": 8106 + }, + { + "epoch": 0.5856918380985063, + "grad_norm": 5.962120968589265, + "learning_rate": 4.1475167813539305e-06, + "loss": 0.8406, + "step": 8107 + }, + { + "epoch": 0.5857640832987159, + "grad_norm": 6.247058328721701, + "learning_rate": 4.147296768192097e-06, + "loss": 0.8243, + "step": 8108 + }, + { + "epoch": 0.5858363284989253, + "grad_norm": 6.314203013721235, + "learning_rate": 4.1470767324800495e-06, + "loss": 0.8519, + "step": 8109 + }, + { + "epoch": 0.5859085736991348, + "grad_norm": 8.256655233922237, + "learning_rate": 4.1468566742208005e-06, + "loss": 0.8781, + "step": 8110 + }, + { + "epoch": 0.5859808188993444, + "grad_norm": 8.47096004012915, + "learning_rate": 4.146636593417363e-06, + "loss": 0.9122, + "step": 8111 + }, + { + "epoch": 0.5860530640995539, + "grad_norm": 6.13591152542169, + "learning_rate": 4.146416490072748e-06, + "loss": 0.9001, + "step": 8112 + }, + { + "epoch": 0.5861253092997634, + "grad_norm": 6.792969311565532, + "learning_rate": 4.14619636418997e-06, + "loss": 0.8891, + "step": 8113 + }, + { + "epoch": 0.5861975544999729, + "grad_norm": 5.890682311247231, + "learning_rate": 4.145976215772042e-06, + "loss": 0.9171, + "step": 8114 + }, + { + "epoch": 0.5862697997001824, + "grad_norm": 5.4161638857952745, + "learning_rate": 4.145756044821978e-06, + "loss": 0.8837, + "step": 8115 + }, + { + "epoch": 0.5863420449003919, + "grad_norm": 6.265602777684252, + "learning_rate": 4.145535851342791e-06, + "loss": 0.9133, + "step": 8116 + }, + { + "epoch": 0.5864142901006014, + "grad_norm": 5.9714077928954685, + "learning_rate": 4.145315635337497e-06, + "loss": 0.8748, + "step": 8117 + }, + { + "epoch": 0.586486535300811, + "grad_norm": 6.354885306655203, + "learning_rate": 4.145095396809108e-06, + "loss": 0.8751, + "step": 8118 + }, + { + "epoch": 0.5865587805010205, + "grad_norm": 6.4007139522963445, + "learning_rate": 4.1448751357606415e-06, + "loss": 0.8508, + "step": 8119 + }, + { + "epoch": 0.5866310257012299, + "grad_norm": 7.770290516021209, + "learning_rate": 4.144654852195111e-06, + "loss": 0.8909, + "step": 8120 + }, + { + "epoch": 0.5867032709014395, + "grad_norm": 5.560156682228714, + "learning_rate": 4.144434546115532e-06, + "loss": 0.8157, + "step": 8121 + }, + { + "epoch": 0.586775516101649, + "grad_norm": 5.862027069081358, + "learning_rate": 4.144214217524922e-06, + "loss": 0.8936, + "step": 8122 + }, + { + "epoch": 0.5868477613018586, + "grad_norm": 7.828877852551781, + "learning_rate": 4.143993866426296e-06, + "loss": 0.8625, + "step": 8123 + }, + { + "epoch": 0.586920006502068, + "grad_norm": 6.251133320098407, + "learning_rate": 4.14377349282267e-06, + "loss": 0.8474, + "step": 8124 + }, + { + "epoch": 0.5869922517022775, + "grad_norm": 6.405749492166899, + "learning_rate": 4.1435530967170605e-06, + "loss": 0.9522, + "step": 8125 + }, + { + "epoch": 0.5870644969024871, + "grad_norm": 6.149068758791188, + "learning_rate": 4.143332678112486e-06, + "loss": 0.8542, + "step": 8126 + }, + { + "epoch": 0.5871367421026965, + "grad_norm": 8.736939628651845, + "learning_rate": 4.143112237011963e-06, + "loss": 0.8371, + "step": 8127 + }, + { + "epoch": 0.587208987302906, + "grad_norm": 7.2981295763024105, + "learning_rate": 4.142891773418509e-06, + "loss": 0.8674, + "step": 8128 + }, + { + "epoch": 0.5872812325031156, + "grad_norm": 7.041920343227501, + "learning_rate": 4.142671287335143e-06, + "loss": 0.9432, + "step": 8129 + }, + { + "epoch": 0.5873534777033251, + "grad_norm": 5.517387130267302, + "learning_rate": 4.142450778764882e-06, + "loss": 0.8557, + "step": 8130 + }, + { + "epoch": 0.5874257229035346, + "grad_norm": 6.877113156242926, + "learning_rate": 4.142230247710745e-06, + "loss": 0.8749, + "step": 8131 + }, + { + "epoch": 0.5874979681037441, + "grad_norm": 6.059542534308812, + "learning_rate": 4.142009694175752e-06, + "loss": 0.9066, + "step": 8132 + }, + { + "epoch": 0.5875702133039536, + "grad_norm": 5.893512857789625, + "learning_rate": 4.14178911816292e-06, + "loss": 0.7769, + "step": 8133 + }, + { + "epoch": 0.5876424585041631, + "grad_norm": 5.676748992687325, + "learning_rate": 4.14156851967527e-06, + "loss": 0.8257, + "step": 8134 + }, + { + "epoch": 0.5877147037043726, + "grad_norm": 7.021944797578117, + "learning_rate": 4.141347898715822e-06, + "loss": 0.9058, + "step": 8135 + }, + { + "epoch": 0.5877869489045822, + "grad_norm": 6.009201940781146, + "learning_rate": 4.141127255287595e-06, + "loss": 0.8555, + "step": 8136 + }, + { + "epoch": 0.5878591941047917, + "grad_norm": 6.53906483348226, + "learning_rate": 4.140906589393611e-06, + "loss": 0.8676, + "step": 8137 + }, + { + "epoch": 0.5879314393050011, + "grad_norm": 6.701307072466698, + "learning_rate": 4.1406859010368896e-06, + "loss": 0.9122, + "step": 8138 + }, + { + "epoch": 0.5880036845052107, + "grad_norm": 7.615859008750235, + "learning_rate": 4.140465190220451e-06, + "loss": 0.8486, + "step": 8139 + }, + { + "epoch": 0.5880759297054202, + "grad_norm": 6.6669663679786035, + "learning_rate": 4.140244456947319e-06, + "loss": 0.9773, + "step": 8140 + }, + { + "epoch": 0.5881481749056298, + "grad_norm": 6.021709268424351, + "learning_rate": 4.140023701220514e-06, + "loss": 0.7933, + "step": 8141 + }, + { + "epoch": 0.5882204201058392, + "grad_norm": 6.993025711367971, + "learning_rate": 4.139802923043057e-06, + "loss": 0.8766, + "step": 8142 + }, + { + "epoch": 0.5882926653060487, + "grad_norm": 6.634944433857819, + "learning_rate": 4.139582122417971e-06, + "loss": 0.7969, + "step": 8143 + }, + { + "epoch": 0.5883649105062583, + "grad_norm": 7.0056658022289025, + "learning_rate": 4.1393612993482805e-06, + "loss": 0.8927, + "step": 8144 + }, + { + "epoch": 0.5884371557064677, + "grad_norm": 7.612637864040522, + "learning_rate": 4.139140453837005e-06, + "loss": 0.8271, + "step": 8145 + }, + { + "epoch": 0.5885094009066772, + "grad_norm": 6.171873145767126, + "learning_rate": 4.13891958588717e-06, + "loss": 0.8632, + "step": 8146 + }, + { + "epoch": 0.5885816461068868, + "grad_norm": 8.619290438520691, + "learning_rate": 4.138698695501799e-06, + "loss": 0.8209, + "step": 8147 + }, + { + "epoch": 0.5886538913070963, + "grad_norm": 9.512609044576621, + "learning_rate": 4.138477782683914e-06, + "loss": 0.8653, + "step": 8148 + }, + { + "epoch": 0.5887261365073058, + "grad_norm": 5.895252140725991, + "learning_rate": 4.138256847436542e-06, + "loss": 0.9094, + "step": 8149 + }, + { + "epoch": 0.5887983817075153, + "grad_norm": 7.366257998262935, + "learning_rate": 4.138035889762704e-06, + "loss": 0.8401, + "step": 8150 + }, + { + "epoch": 0.5888706269077248, + "grad_norm": 8.048534037107899, + "learning_rate": 4.137814909665428e-06, + "loss": 0.8602, + "step": 8151 + }, + { + "epoch": 0.5889428721079343, + "grad_norm": 4.914699397940747, + "learning_rate": 4.137593907147737e-06, + "loss": 0.7778, + "step": 8152 + }, + { + "epoch": 0.5890151173081438, + "grad_norm": 5.843967372502939, + "learning_rate": 4.137372882212657e-06, + "loss": 0.8156, + "step": 8153 + }, + { + "epoch": 0.5890873625083534, + "grad_norm": 8.164496100925717, + "learning_rate": 4.137151834863213e-06, + "loss": 0.9126, + "step": 8154 + }, + { + "epoch": 0.5891596077085629, + "grad_norm": 6.786658314597956, + "learning_rate": 4.136930765102432e-06, + "loss": 0.8425, + "step": 8155 + }, + { + "epoch": 0.5892318529087723, + "grad_norm": 7.532485543992699, + "learning_rate": 4.13670967293334e-06, + "loss": 0.7785, + "step": 8156 + }, + { + "epoch": 0.5893040981089819, + "grad_norm": 5.981383051334124, + "learning_rate": 4.136488558358963e-06, + "loss": 0.8598, + "step": 8157 + }, + { + "epoch": 0.5893763433091914, + "grad_norm": 6.341508441142536, + "learning_rate": 4.136267421382329e-06, + "loss": 0.9071, + "step": 8158 + }, + { + "epoch": 0.589448588509401, + "grad_norm": 6.316883944328349, + "learning_rate": 4.136046262006463e-06, + "loss": 0.8441, + "step": 8159 + }, + { + "epoch": 0.5895208337096104, + "grad_norm": 6.555662535029801, + "learning_rate": 4.135825080234396e-06, + "loss": 0.8701, + "step": 8160 + }, + { + "epoch": 0.5895930789098199, + "grad_norm": 6.189684549653459, + "learning_rate": 4.1356038760691525e-06, + "loss": 0.9348, + "step": 8161 + }, + { + "epoch": 0.5896653241100295, + "grad_norm": 6.04891799204715, + "learning_rate": 4.135382649513761e-06, + "loss": 0.7873, + "step": 8162 + }, + { + "epoch": 0.5897375693102389, + "grad_norm": 7.250247556471914, + "learning_rate": 4.135161400571253e-06, + "loss": 0.8373, + "step": 8163 + }, + { + "epoch": 0.5898098145104484, + "grad_norm": 5.859149409719803, + "learning_rate": 4.134940129244653e-06, + "loss": 0.9016, + "step": 8164 + }, + { + "epoch": 0.589882059710658, + "grad_norm": 5.0978710143936095, + "learning_rate": 4.134718835536994e-06, + "loss": 0.8197, + "step": 8165 + }, + { + "epoch": 0.5899543049108675, + "grad_norm": 6.658057885167874, + "learning_rate": 4.1344975194513025e-06, + "loss": 0.8267, + "step": 8166 + }, + { + "epoch": 0.590026550111077, + "grad_norm": 5.668062300011919, + "learning_rate": 4.134276180990609e-06, + "loss": 0.8357, + "step": 8167 + }, + { + "epoch": 0.5900987953112865, + "grad_norm": 6.749657798852571, + "learning_rate": 4.134054820157944e-06, + "loss": 0.7833, + "step": 8168 + }, + { + "epoch": 0.590171040511496, + "grad_norm": 5.812899339965579, + "learning_rate": 4.1338334369563365e-06, + "loss": 0.8523, + "step": 8169 + }, + { + "epoch": 0.5902432857117055, + "grad_norm": 6.284138636836589, + "learning_rate": 4.1336120313888184e-06, + "loss": 0.8084, + "step": 8170 + }, + { + "epoch": 0.590315530911915, + "grad_norm": 5.795388578695666, + "learning_rate": 4.133390603458419e-06, + "loss": 0.8624, + "step": 8171 + }, + { + "epoch": 0.5903877761121246, + "grad_norm": 6.291394293070408, + "learning_rate": 4.1331691531681715e-06, + "loss": 0.792, + "step": 8172 + }, + { + "epoch": 0.5904600213123341, + "grad_norm": 6.459017743998321, + "learning_rate": 4.1329476805211065e-06, + "loss": 0.8671, + "step": 8173 + }, + { + "epoch": 0.5905322665125435, + "grad_norm": 6.333991300971391, + "learning_rate": 4.132726185520255e-06, + "loss": 0.8895, + "step": 8174 + }, + { + "epoch": 0.5906045117127531, + "grad_norm": 5.564359889750664, + "learning_rate": 4.1325046681686504e-06, + "loss": 0.8695, + "step": 8175 + }, + { + "epoch": 0.5906767569129626, + "grad_norm": 7.36310898961301, + "learning_rate": 4.132283128469324e-06, + "loss": 0.8867, + "step": 8176 + }, + { + "epoch": 0.5907490021131722, + "grad_norm": 6.406241365752566, + "learning_rate": 4.132061566425309e-06, + "loss": 0.8302, + "step": 8177 + }, + { + "epoch": 0.5908212473133816, + "grad_norm": 5.752020439440579, + "learning_rate": 4.131839982039639e-06, + "loss": 0.7977, + "step": 8178 + }, + { + "epoch": 0.5908934925135911, + "grad_norm": 5.658468075001444, + "learning_rate": 4.131618375315346e-06, + "loss": 0.8721, + "step": 8179 + }, + { + "epoch": 0.5909657377138007, + "grad_norm": 7.162110706554962, + "learning_rate": 4.131396746255464e-06, + "loss": 0.8599, + "step": 8180 + }, + { + "epoch": 0.5910379829140101, + "grad_norm": 6.554979938444373, + "learning_rate": 4.131175094863028e-06, + "loss": 0.8331, + "step": 8181 + }, + { + "epoch": 0.5911102281142196, + "grad_norm": 6.475210014089312, + "learning_rate": 4.130953421141071e-06, + "loss": 0.883, + "step": 8182 + }, + { + "epoch": 0.5911824733144292, + "grad_norm": 6.118053311672177, + "learning_rate": 4.130731725092628e-06, + "loss": 0.8475, + "step": 8183 + }, + { + "epoch": 0.5912547185146387, + "grad_norm": 5.937566495824193, + "learning_rate": 4.130510006720734e-06, + "loss": 0.8165, + "step": 8184 + }, + { + "epoch": 0.5913269637148482, + "grad_norm": 6.90779227732917, + "learning_rate": 4.130288266028424e-06, + "loss": 0.927, + "step": 8185 + }, + { + "epoch": 0.5913992089150577, + "grad_norm": 9.553111135805281, + "learning_rate": 4.1300665030187345e-06, + "loss": 0.8714, + "step": 8186 + }, + { + "epoch": 0.5914714541152672, + "grad_norm": 5.663307465839791, + "learning_rate": 4.1298447176946985e-06, + "loss": 0.9194, + "step": 8187 + }, + { + "epoch": 0.5915436993154767, + "grad_norm": 7.40036356264846, + "learning_rate": 4.129622910059355e-06, + "loss": 0.9394, + "step": 8188 + }, + { + "epoch": 0.5916159445156862, + "grad_norm": 6.574991018775057, + "learning_rate": 4.129401080115739e-06, + "loss": 0.9134, + "step": 8189 + }, + { + "epoch": 0.5916881897158958, + "grad_norm": 6.032454138216676, + "learning_rate": 4.129179227866887e-06, + "loss": 0.8215, + "step": 8190 + }, + { + "epoch": 0.5917604349161053, + "grad_norm": 6.280043149180299, + "learning_rate": 4.128957353315836e-06, + "loss": 0.8263, + "step": 8191 + }, + { + "epoch": 0.5918326801163147, + "grad_norm": 6.251498233509645, + "learning_rate": 4.128735456465625e-06, + "loss": 0.8303, + "step": 8192 + }, + { + "epoch": 0.5919049253165243, + "grad_norm": 5.809149648185159, + "learning_rate": 4.128513537319289e-06, + "loss": 0.9223, + "step": 8193 + }, + { + "epoch": 0.5919771705167338, + "grad_norm": 5.653121841961182, + "learning_rate": 4.128291595879867e-06, + "loss": 0.8553, + "step": 8194 + }, + { + "epoch": 0.5920494157169433, + "grad_norm": 6.445319898196511, + "learning_rate": 4.128069632150399e-06, + "loss": 0.9302, + "step": 8195 + }, + { + "epoch": 0.5921216609171528, + "grad_norm": 5.588958222245875, + "learning_rate": 4.127847646133919e-06, + "loss": 0.9073, + "step": 8196 + }, + { + "epoch": 0.5921939061173623, + "grad_norm": 5.822851336481774, + "learning_rate": 4.127625637833471e-06, + "loss": 0.8238, + "step": 8197 + }, + { + "epoch": 0.5922661513175719, + "grad_norm": 6.877457058272259, + "learning_rate": 4.12740360725209e-06, + "loss": 0.8819, + "step": 8198 + }, + { + "epoch": 0.5923383965177813, + "grad_norm": 5.609543651712566, + "learning_rate": 4.12718155439282e-06, + "loss": 0.8671, + "step": 8199 + }, + { + "epoch": 0.5924106417179908, + "grad_norm": 8.88705270663716, + "learning_rate": 4.126959479258695e-06, + "loss": 0.8624, + "step": 8200 + }, + { + "epoch": 0.5924828869182004, + "grad_norm": 6.85835248064572, + "learning_rate": 4.12673738185276e-06, + "loss": 0.8051, + "step": 8201 + }, + { + "epoch": 0.5925551321184099, + "grad_norm": 7.47826873857954, + "learning_rate": 4.126515262178052e-06, + "loss": 0.9025, + "step": 8202 + }, + { + "epoch": 0.5926273773186194, + "grad_norm": 6.31011494817485, + "learning_rate": 4.126293120237614e-06, + "loss": 0.87, + "step": 8203 + }, + { + "epoch": 0.5926996225188289, + "grad_norm": 6.021864471663895, + "learning_rate": 4.1260709560344855e-06, + "loss": 0.8698, + "step": 8204 + }, + { + "epoch": 0.5927718677190384, + "grad_norm": 6.686388155054909, + "learning_rate": 4.125848769571708e-06, + "loss": 0.9255, + "step": 8205 + }, + { + "epoch": 0.5928441129192479, + "grad_norm": 6.702289235757873, + "learning_rate": 4.125626560852324e-06, + "loss": 0.8174, + "step": 8206 + }, + { + "epoch": 0.5929163581194574, + "grad_norm": 6.787478913051383, + "learning_rate": 4.125404329879373e-06, + "loss": 0.9034, + "step": 8207 + }, + { + "epoch": 0.592988603319667, + "grad_norm": 6.915490230414119, + "learning_rate": 4.1251820766559005e-06, + "loss": 0.8236, + "step": 8208 + }, + { + "epoch": 0.5930608485198765, + "grad_norm": 5.544253823352938, + "learning_rate": 4.124959801184946e-06, + "loss": 0.8816, + "step": 8209 + }, + { + "epoch": 0.5931330937200859, + "grad_norm": 5.996465277577726, + "learning_rate": 4.124737503469555e-06, + "loss": 0.8828, + "step": 8210 + }, + { + "epoch": 0.5932053389202955, + "grad_norm": 5.745094196818906, + "learning_rate": 4.124515183512767e-06, + "loss": 0.845, + "step": 8211 + }, + { + "epoch": 0.593277584120505, + "grad_norm": 8.654711424778496, + "learning_rate": 4.124292841317629e-06, + "loss": 0.8736, + "step": 8212 + }, + { + "epoch": 0.5933498293207145, + "grad_norm": 6.579387625168227, + "learning_rate": 4.124070476887183e-06, + "loss": 0.8873, + "step": 8213 + }, + { + "epoch": 0.593422074520924, + "grad_norm": 6.6357628109732865, + "learning_rate": 4.123848090224473e-06, + "loss": 0.8797, + "step": 8214 + }, + { + "epoch": 0.5934943197211335, + "grad_norm": 7.418523863099252, + "learning_rate": 4.1236256813325435e-06, + "loss": 0.8478, + "step": 8215 + }, + { + "epoch": 0.5935665649213431, + "grad_norm": 5.632749517461685, + "learning_rate": 4.123403250214438e-06, + "loss": 0.8613, + "step": 8216 + }, + { + "epoch": 0.5936388101215525, + "grad_norm": 5.0976064862274075, + "learning_rate": 4.1231807968732045e-06, + "loss": 0.7983, + "step": 8217 + }, + { + "epoch": 0.593711055321762, + "grad_norm": 6.325630355605041, + "learning_rate": 4.122958321311885e-06, + "loss": 0.8407, + "step": 8218 + }, + { + "epoch": 0.5937833005219716, + "grad_norm": 6.015260363340022, + "learning_rate": 4.122735823533527e-06, + "loss": 0.8393, + "step": 8219 + }, + { + "epoch": 0.5938555457221811, + "grad_norm": 7.285773840695556, + "learning_rate": 4.122513303541175e-06, + "loss": 0.9197, + "step": 8220 + }, + { + "epoch": 0.5939277909223906, + "grad_norm": 7.333439768394496, + "learning_rate": 4.122290761337875e-06, + "loss": 0.893, + "step": 8221 + }, + { + "epoch": 0.5940000361226001, + "grad_norm": 7.503097911628528, + "learning_rate": 4.122068196926675e-06, + "loss": 0.8878, + "step": 8222 + }, + { + "epoch": 0.5940722813228096, + "grad_norm": 4.9189297053119985, + "learning_rate": 4.121845610310621e-06, + "loss": 0.7922, + "step": 8223 + }, + { + "epoch": 0.5941445265230191, + "grad_norm": 6.735256057776747, + "learning_rate": 4.121623001492759e-06, + "loss": 0.8134, + "step": 8224 + }, + { + "epoch": 0.5942167717232286, + "grad_norm": 8.404999707350763, + "learning_rate": 4.121400370476138e-06, + "loss": 0.9016, + "step": 8225 + }, + { + "epoch": 0.5942890169234382, + "grad_norm": 6.523390133622947, + "learning_rate": 4.121177717263804e-06, + "loss": 0.7894, + "step": 8226 + }, + { + "epoch": 0.5943612621236477, + "grad_norm": 5.533395297304843, + "learning_rate": 4.1209550418588074e-06, + "loss": 0.7814, + "step": 8227 + }, + { + "epoch": 0.5944335073238571, + "grad_norm": 5.560500053313147, + "learning_rate": 4.120732344264194e-06, + "loss": 0.9132, + "step": 8228 + }, + { + "epoch": 0.5945057525240667, + "grad_norm": 7.160545425951793, + "learning_rate": 4.120509624483013e-06, + "loss": 0.8619, + "step": 8229 + }, + { + "epoch": 0.5945779977242762, + "grad_norm": 6.093248239057982, + "learning_rate": 4.120286882518313e-06, + "loss": 0.8077, + "step": 8230 + }, + { + "epoch": 0.5946502429244857, + "grad_norm": 5.807659471664842, + "learning_rate": 4.1200641183731445e-06, + "loss": 0.8692, + "step": 8231 + }, + { + "epoch": 0.5947224881246952, + "grad_norm": 7.811029402605596, + "learning_rate": 4.1198413320505555e-06, + "loss": 0.8824, + "step": 8232 + }, + { + "epoch": 0.5947947333249047, + "grad_norm": 5.518527296677069, + "learning_rate": 4.119618523553597e-06, + "loss": 0.8183, + "step": 8233 + }, + { + "epoch": 0.5948669785251143, + "grad_norm": 7.86497683182764, + "learning_rate": 4.119395692885319e-06, + "loss": 0.7615, + "step": 8234 + }, + { + "epoch": 0.5949392237253237, + "grad_norm": 6.341307822997229, + "learning_rate": 4.11917284004877e-06, + "loss": 0.8815, + "step": 8235 + }, + { + "epoch": 0.5950114689255332, + "grad_norm": 6.957880823950834, + "learning_rate": 4.118949965047003e-06, + "loss": 0.858, + "step": 8236 + }, + { + "epoch": 0.5950837141257428, + "grad_norm": 6.430125336322793, + "learning_rate": 4.118727067883067e-06, + "loss": 0.8428, + "step": 8237 + }, + { + "epoch": 0.5951559593259523, + "grad_norm": 6.79855262835967, + "learning_rate": 4.118504148560015e-06, + "loss": 0.8192, + "step": 8238 + }, + { + "epoch": 0.5952282045261618, + "grad_norm": 6.905747891638609, + "learning_rate": 4.118281207080898e-06, + "loss": 0.7784, + "step": 8239 + }, + { + "epoch": 0.5953004497263713, + "grad_norm": 7.248630262748622, + "learning_rate": 4.118058243448767e-06, + "loss": 0.8562, + "step": 8240 + }, + { + "epoch": 0.5953726949265808, + "grad_norm": 7.40444112972772, + "learning_rate": 4.117835257666676e-06, + "loss": 0.8664, + "step": 8241 + }, + { + "epoch": 0.5954449401267903, + "grad_norm": 8.161948832995794, + "learning_rate": 4.117612249737676e-06, + "loss": 0.9087, + "step": 8242 + }, + { + "epoch": 0.5955171853269998, + "grad_norm": 6.139443009551379, + "learning_rate": 4.11738921966482e-06, + "loss": 0.8096, + "step": 8243 + }, + { + "epoch": 0.5955894305272094, + "grad_norm": 8.014248556891696, + "learning_rate": 4.117166167451162e-06, + "loss": 0.9069, + "step": 8244 + }, + { + "epoch": 0.5956616757274189, + "grad_norm": 5.754926146987065, + "learning_rate": 4.116943093099754e-06, + "loss": 0.8554, + "step": 8245 + }, + { + "epoch": 0.5957339209276283, + "grad_norm": 6.146513234262564, + "learning_rate": 4.116719996613652e-06, + "loss": 0.9094, + "step": 8246 + }, + { + "epoch": 0.5958061661278379, + "grad_norm": 6.593229300382467, + "learning_rate": 4.116496877995907e-06, + "loss": 0.8704, + "step": 8247 + }, + { + "epoch": 0.5958784113280474, + "grad_norm": 6.523605618623338, + "learning_rate": 4.1162737372495755e-06, + "loss": 0.8792, + "step": 8248 + }, + { + "epoch": 0.5959506565282569, + "grad_norm": 7.5317185125601585, + "learning_rate": 4.116050574377712e-06, + "loss": 0.858, + "step": 8249 + }, + { + "epoch": 0.5960229017284664, + "grad_norm": 6.205294520345389, + "learning_rate": 4.11582738938337e-06, + "loss": 0.8928, + "step": 8250 + }, + { + "epoch": 0.5960951469286759, + "grad_norm": 10.229097219776564, + "learning_rate": 4.1156041822696065e-06, + "loss": 0.9891, + "step": 8251 + }, + { + "epoch": 0.5961673921288855, + "grad_norm": 5.64365159938529, + "learning_rate": 4.115380953039476e-06, + "loss": 0.802, + "step": 8252 + }, + { + "epoch": 0.5962396373290949, + "grad_norm": 6.965896364302524, + "learning_rate": 4.115157701696034e-06, + "loss": 0.8915, + "step": 8253 + }, + { + "epoch": 0.5963118825293044, + "grad_norm": 7.650918653592945, + "learning_rate": 4.114934428242338e-06, + "loss": 0.7705, + "step": 8254 + }, + { + "epoch": 0.596384127729514, + "grad_norm": 5.656966190414451, + "learning_rate": 4.114711132681443e-06, + "loss": 0.8324, + "step": 8255 + }, + { + "epoch": 0.5964563729297235, + "grad_norm": 5.403321989229136, + "learning_rate": 4.114487815016406e-06, + "loss": 0.8871, + "step": 8256 + }, + { + "epoch": 0.596528618129933, + "grad_norm": 6.2265395256202565, + "learning_rate": 4.114264475250284e-06, + "loss": 0.8089, + "step": 8257 + }, + { + "epoch": 0.5966008633301425, + "grad_norm": 6.322507875436846, + "learning_rate": 4.1140411133861355e-06, + "loss": 0.8378, + "step": 8258 + }, + { + "epoch": 0.596673108530352, + "grad_norm": 6.720871133652691, + "learning_rate": 4.113817729427018e-06, + "loss": 0.9816, + "step": 8259 + }, + { + "epoch": 0.5967453537305615, + "grad_norm": 7.711804682272466, + "learning_rate": 4.1135943233759875e-06, + "loss": 0.8951, + "step": 8260 + }, + { + "epoch": 0.596817598930771, + "grad_norm": 6.151781038254446, + "learning_rate": 4.113370895236105e-06, + "loss": 0.8555, + "step": 8261 + }, + { + "epoch": 0.5968898441309806, + "grad_norm": 6.449982919596449, + "learning_rate": 4.113147445010427e-06, + "loss": 0.8335, + "step": 8262 + }, + { + "epoch": 0.5969620893311901, + "grad_norm": 6.8012972772347124, + "learning_rate": 4.1129239727020135e-06, + "loss": 0.9054, + "step": 8263 + }, + { + "epoch": 0.5970343345313995, + "grad_norm": 7.279924132821624, + "learning_rate": 4.112700478313922e-06, + "loss": 0.8106, + "step": 8264 + }, + { + "epoch": 0.5971065797316091, + "grad_norm": 9.005853656675153, + "learning_rate": 4.112476961849213e-06, + "loss": 0.9252, + "step": 8265 + }, + { + "epoch": 0.5971788249318186, + "grad_norm": 5.830636227474551, + "learning_rate": 4.112253423310947e-06, + "loss": 0.7945, + "step": 8266 + }, + { + "epoch": 0.597251070132028, + "grad_norm": 8.436277852025858, + "learning_rate": 4.112029862702184e-06, + "loss": 0.9021, + "step": 8267 + }, + { + "epoch": 0.5973233153322376, + "grad_norm": 7.64151788342762, + "learning_rate": 4.111806280025984e-06, + "loss": 0.8537, + "step": 8268 + }, + { + "epoch": 0.5973955605324471, + "grad_norm": 8.555480753033539, + "learning_rate": 4.111582675285407e-06, + "loss": 0.8317, + "step": 8269 + }, + { + "epoch": 0.5974678057326567, + "grad_norm": 6.5341381641232195, + "learning_rate": 4.111359048483514e-06, + "loss": 0.9092, + "step": 8270 + }, + { + "epoch": 0.5975400509328661, + "grad_norm": 5.949827868111501, + "learning_rate": 4.111135399623367e-06, + "loss": 0.8285, + "step": 8271 + }, + { + "epoch": 0.5976122961330756, + "grad_norm": 9.153751175528866, + "learning_rate": 4.110911728708028e-06, + "loss": 0.9022, + "step": 8272 + }, + { + "epoch": 0.5976845413332852, + "grad_norm": 6.30928063211065, + "learning_rate": 4.110688035740558e-06, + "loss": 0.8777, + "step": 8273 + }, + { + "epoch": 0.5977567865334947, + "grad_norm": 6.462606120022234, + "learning_rate": 4.110464320724019e-06, + "loss": 0.8494, + "step": 8274 + }, + { + "epoch": 0.5978290317337042, + "grad_norm": 6.162745423295023, + "learning_rate": 4.110240583661473e-06, + "loss": 0.8436, + "step": 8275 + }, + { + "epoch": 0.5979012769339137, + "grad_norm": 5.717139846006815, + "learning_rate": 4.110016824555985e-06, + "loss": 0.7976, + "step": 8276 + }, + { + "epoch": 0.5979735221341232, + "grad_norm": 6.0816552273172375, + "learning_rate": 4.109793043410617e-06, + "loss": 0.8146, + "step": 8277 + }, + { + "epoch": 0.5980457673343327, + "grad_norm": 6.338081721406451, + "learning_rate": 4.109569240228432e-06, + "loss": 0.8594, + "step": 8278 + }, + { + "epoch": 0.5981180125345422, + "grad_norm": 6.604792455954272, + "learning_rate": 4.109345415012494e-06, + "loss": 0.9103, + "step": 8279 + }, + { + "epoch": 0.5981902577347518, + "grad_norm": 8.375472781373206, + "learning_rate": 4.109121567765866e-06, + "loss": 1.015, + "step": 8280 + }, + { + "epoch": 0.5982625029349613, + "grad_norm": 8.11948289624662, + "learning_rate": 4.108897698491613e-06, + "loss": 0.8741, + "step": 8281 + }, + { + "epoch": 0.5983347481351707, + "grad_norm": 5.972478051829897, + "learning_rate": 4.1086738071928005e-06, + "loss": 0.8755, + "step": 8282 + }, + { + "epoch": 0.5984069933353803, + "grad_norm": 6.821286476041922, + "learning_rate": 4.108449893872493e-06, + "loss": 0.93, + "step": 8283 + }, + { + "epoch": 0.5984792385355898, + "grad_norm": 7.44486237613299, + "learning_rate": 4.108225958533754e-06, + "loss": 0.85, + "step": 8284 + }, + { + "epoch": 0.5985514837357993, + "grad_norm": 5.540018147187515, + "learning_rate": 4.108002001179651e-06, + "loss": 0.8335, + "step": 8285 + }, + { + "epoch": 0.5986237289360088, + "grad_norm": 6.618840070909505, + "learning_rate": 4.107778021813249e-06, + "loss": 0.9085, + "step": 8286 + }, + { + "epoch": 0.5986959741362183, + "grad_norm": 6.426910876202076, + "learning_rate": 4.107554020437614e-06, + "loss": 0.7824, + "step": 8287 + }, + { + "epoch": 0.5987682193364279, + "grad_norm": 6.578657912462026, + "learning_rate": 4.107329997055814e-06, + "loss": 0.8845, + "step": 8288 + }, + { + "epoch": 0.5988404645366373, + "grad_norm": 6.379000138049102, + "learning_rate": 4.107105951670913e-06, + "loss": 0.8567, + "step": 8289 + }, + { + "epoch": 0.5989127097368468, + "grad_norm": 6.9146577912869605, + "learning_rate": 4.106881884285981e-06, + "loss": 0.9061, + "step": 8290 + }, + { + "epoch": 0.5989849549370564, + "grad_norm": 7.453115275314673, + "learning_rate": 4.1066577949040815e-06, + "loss": 0.8459, + "step": 8291 + }, + { + "epoch": 0.5990572001372659, + "grad_norm": 8.657056419735994, + "learning_rate": 4.106433683528286e-06, + "loss": 0.9416, + "step": 8292 + }, + { + "epoch": 0.5991294453374754, + "grad_norm": 7.854662595382738, + "learning_rate": 4.1062095501616595e-06, + "loss": 0.9607, + "step": 8293 + }, + { + "epoch": 0.5992016905376849, + "grad_norm": 6.7449859080581, + "learning_rate": 4.105985394807271e-06, + "loss": 0.7959, + "step": 8294 + }, + { + "epoch": 0.5992739357378944, + "grad_norm": 8.972011704102636, + "learning_rate": 4.105761217468191e-06, + "loss": 0.9424, + "step": 8295 + }, + { + "epoch": 0.5993461809381039, + "grad_norm": 8.332565068752839, + "learning_rate": 4.1055370181474855e-06, + "loss": 0.8889, + "step": 8296 + }, + { + "epoch": 0.5994184261383134, + "grad_norm": 8.036820079468377, + "learning_rate": 4.105312796848225e-06, + "loss": 0.8753, + "step": 8297 + }, + { + "epoch": 0.599490671338523, + "grad_norm": 5.7025591007767344, + "learning_rate": 4.10508855357348e-06, + "loss": 0.8432, + "step": 8298 + }, + { + "epoch": 0.5995629165387325, + "grad_norm": 8.752373073848494, + "learning_rate": 4.104864288326318e-06, + "loss": 0.9059, + "step": 8299 + }, + { + "epoch": 0.5996351617389419, + "grad_norm": 7.744924790846528, + "learning_rate": 4.1046400011098096e-06, + "loss": 0.7978, + "step": 8300 + }, + { + "epoch": 0.5997074069391515, + "grad_norm": 9.80945048963964, + "learning_rate": 4.104415691927026e-06, + "loss": 1.0002, + "step": 8301 + }, + { + "epoch": 0.599779652139361, + "grad_norm": 8.032152889574744, + "learning_rate": 4.104191360781038e-06, + "loss": 0.8657, + "step": 8302 + }, + { + "epoch": 0.5998518973395705, + "grad_norm": 4.931014038164977, + "learning_rate": 4.1039670076749144e-06, + "loss": 0.8417, + "step": 8303 + }, + { + "epoch": 0.59992414253978, + "grad_norm": 6.996909140349798, + "learning_rate": 4.103742632611729e-06, + "loss": 0.8031, + "step": 8304 + }, + { + "epoch": 0.5999963877399895, + "grad_norm": 7.298380988361017, + "learning_rate": 4.103518235594551e-06, + "loss": 0.9203, + "step": 8305 + }, + { + "epoch": 0.6000686329401991, + "grad_norm": 5.597388762254284, + "learning_rate": 4.103293816626454e-06, + "loss": 0.8526, + "step": 8306 + }, + { + "epoch": 0.6001408781404085, + "grad_norm": 8.212164847910556, + "learning_rate": 4.10306937571051e-06, + "loss": 0.8193, + "step": 8307 + }, + { + "epoch": 0.600213123340618, + "grad_norm": 6.933369824729159, + "learning_rate": 4.10284491284979e-06, + "loss": 0.8584, + "step": 8308 + }, + { + "epoch": 0.6002853685408276, + "grad_norm": 7.1424189678398315, + "learning_rate": 4.102620428047369e-06, + "loss": 0.9425, + "step": 8309 + }, + { + "epoch": 0.6003576137410371, + "grad_norm": 6.847644551294913, + "learning_rate": 4.102395921306318e-06, + "loss": 0.8003, + "step": 8310 + }, + { + "epoch": 0.6004298589412466, + "grad_norm": 5.747752206386893, + "learning_rate": 4.102171392629711e-06, + "loss": 0.8474, + "step": 8311 + }, + { + "epoch": 0.6005021041414561, + "grad_norm": 6.671746925000228, + "learning_rate": 4.101946842020622e-06, + "loss": 0.8547, + "step": 8312 + }, + { + "epoch": 0.6005743493416656, + "grad_norm": 6.6853628576230575, + "learning_rate": 4.101722269482124e-06, + "loss": 0.8323, + "step": 8313 + }, + { + "epoch": 0.6006465945418751, + "grad_norm": 5.900470469192478, + "learning_rate": 4.101497675017292e-06, + "loss": 0.865, + "step": 8314 + }, + { + "epoch": 0.6007188397420846, + "grad_norm": 5.936518216526944, + "learning_rate": 4.1012730586292e-06, + "loss": 0.7994, + "step": 8315 + }, + { + "epoch": 0.6007910849422942, + "grad_norm": 7.756178577226428, + "learning_rate": 4.101048420320923e-06, + "loss": 0.856, + "step": 8316 + }, + { + "epoch": 0.6008633301425037, + "grad_norm": 6.688019919461606, + "learning_rate": 4.100823760095537e-06, + "loss": 0.8797, + "step": 8317 + }, + { + "epoch": 0.6009355753427131, + "grad_norm": 6.880090943999583, + "learning_rate": 4.1005990779561165e-06, + "loss": 0.8126, + "step": 8318 + }, + { + "epoch": 0.6010078205429227, + "grad_norm": 7.092474600121536, + "learning_rate": 4.100374373905738e-06, + "loss": 0.8704, + "step": 8319 + }, + { + "epoch": 0.6010800657431322, + "grad_norm": 5.3273189527877385, + "learning_rate": 4.100149647947476e-06, + "loss": 0.8267, + "step": 8320 + }, + { + "epoch": 0.6011523109433417, + "grad_norm": 6.145968298857911, + "learning_rate": 4.099924900084407e-06, + "loss": 0.8349, + "step": 8321 + }, + { + "epoch": 0.6012245561435512, + "grad_norm": 4.9797889395114, + "learning_rate": 4.09970013031961e-06, + "loss": 0.751, + "step": 8322 + }, + { + "epoch": 0.6012968013437607, + "grad_norm": 8.857525452467709, + "learning_rate": 4.0994753386561596e-06, + "loss": 0.9349, + "step": 8323 + }, + { + "epoch": 0.6013690465439703, + "grad_norm": 5.953096805215385, + "learning_rate": 4.099250525097134e-06, + "loss": 0.8727, + "step": 8324 + }, + { + "epoch": 0.6014412917441797, + "grad_norm": 8.076649632558626, + "learning_rate": 4.099025689645611e-06, + "loss": 0.8026, + "step": 8325 + }, + { + "epoch": 0.6015135369443892, + "grad_norm": 7.219254381137991, + "learning_rate": 4.098800832304667e-06, + "loss": 0.8689, + "step": 8326 + }, + { + "epoch": 0.6015857821445988, + "grad_norm": 6.30755663398419, + "learning_rate": 4.09857595307738e-06, + "loss": 0.8327, + "step": 8327 + }, + { + "epoch": 0.6016580273448083, + "grad_norm": 7.493246471041857, + "learning_rate": 4.098351051966831e-06, + "loss": 0.9223, + "step": 8328 + }, + { + "epoch": 0.6017302725450178, + "grad_norm": 7.342190710871957, + "learning_rate": 4.098126128976097e-06, + "loss": 0.8088, + "step": 8329 + }, + { + "epoch": 0.6018025177452273, + "grad_norm": 6.528337769353063, + "learning_rate": 4.097901184108256e-06, + "loss": 0.8286, + "step": 8330 + }, + { + "epoch": 0.6018747629454368, + "grad_norm": 7.649075131825707, + "learning_rate": 4.097676217366389e-06, + "loss": 0.7741, + "step": 8331 + }, + { + "epoch": 0.6019470081456463, + "grad_norm": 5.82947187226856, + "learning_rate": 4.097451228753576e-06, + "loss": 0.8526, + "step": 8332 + }, + { + "epoch": 0.6020192533458558, + "grad_norm": 8.128406456133531, + "learning_rate": 4.097226218272896e-06, + "loss": 0.8086, + "step": 8333 + }, + { + "epoch": 0.6020914985460654, + "grad_norm": 6.754395925532323, + "learning_rate": 4.0970011859274285e-06, + "loss": 0.9052, + "step": 8334 + }, + { + "epoch": 0.6021637437462749, + "grad_norm": 5.289230186249674, + "learning_rate": 4.096776131720254e-06, + "loss": 0.793, + "step": 8335 + }, + { + "epoch": 0.6022359889464843, + "grad_norm": 5.22795034095161, + "learning_rate": 4.096551055654456e-06, + "loss": 0.829, + "step": 8336 + }, + { + "epoch": 0.6023082341466939, + "grad_norm": 6.0666822971240455, + "learning_rate": 4.096325957733113e-06, + "loss": 0.8136, + "step": 8337 + }, + { + "epoch": 0.6023804793469034, + "grad_norm": 6.5928158934577645, + "learning_rate": 4.096100837959306e-06, + "loss": 0.8552, + "step": 8338 + }, + { + "epoch": 0.6024527245471129, + "grad_norm": 6.337577635761683, + "learning_rate": 4.095875696336119e-06, + "loss": 0.8001, + "step": 8339 + }, + { + "epoch": 0.6025249697473224, + "grad_norm": 6.305803429450457, + "learning_rate": 4.095650532866633e-06, + "loss": 0.8034, + "step": 8340 + }, + { + "epoch": 0.6025972149475319, + "grad_norm": 6.840648488317658, + "learning_rate": 4.0954253475539286e-06, + "loss": 0.8577, + "step": 8341 + }, + { + "epoch": 0.6026694601477415, + "grad_norm": 5.81280385525505, + "learning_rate": 4.095200140401091e-06, + "loss": 0.8628, + "step": 8342 + }, + { + "epoch": 0.6027417053479509, + "grad_norm": 7.003645628778334, + "learning_rate": 4.094974911411202e-06, + "loss": 0.8955, + "step": 8343 + }, + { + "epoch": 0.6028139505481604, + "grad_norm": 7.4689568526859995, + "learning_rate": 4.094749660587345e-06, + "loss": 0.8251, + "step": 8344 + }, + { + "epoch": 0.60288619574837, + "grad_norm": 6.989197979540771, + "learning_rate": 4.094524387932604e-06, + "loss": 0.8787, + "step": 8345 + }, + { + "epoch": 0.6029584409485795, + "grad_norm": 6.125568597053422, + "learning_rate": 4.094299093450061e-06, + "loss": 0.849, + "step": 8346 + }, + { + "epoch": 0.603030686148789, + "grad_norm": 7.546335141793374, + "learning_rate": 4.094073777142802e-06, + "loss": 0.9285, + "step": 8347 + }, + { + "epoch": 0.6031029313489985, + "grad_norm": 5.966362760394564, + "learning_rate": 4.09384843901391e-06, + "loss": 0.8038, + "step": 8348 + }, + { + "epoch": 0.603175176549208, + "grad_norm": 7.7481970381843865, + "learning_rate": 4.093623079066471e-06, + "loss": 0.8501, + "step": 8349 + }, + { + "epoch": 0.6032474217494175, + "grad_norm": 7.41711530150044, + "learning_rate": 4.093397697303569e-06, + "loss": 0.9788, + "step": 8350 + }, + { + "epoch": 0.603319666949627, + "grad_norm": 5.933234037504237, + "learning_rate": 4.09317229372829e-06, + "loss": 0.8806, + "step": 8351 + }, + { + "epoch": 0.6033919121498366, + "grad_norm": 5.730789269218801, + "learning_rate": 4.0929468683437205e-06, + "loss": 0.8434, + "step": 8352 + }, + { + "epoch": 0.6034641573500461, + "grad_norm": 6.341674164388722, + "learning_rate": 4.092721421152943e-06, + "loss": 0.862, + "step": 8353 + }, + { + "epoch": 0.6035364025502555, + "grad_norm": 6.116396720119668, + "learning_rate": 4.092495952159048e-06, + "loss": 0.7748, + "step": 8354 + }, + { + "epoch": 0.6036086477504651, + "grad_norm": 6.389585377788517, + "learning_rate": 4.092270461365119e-06, + "loss": 0.8801, + "step": 8355 + }, + { + "epoch": 0.6036808929506746, + "grad_norm": 7.161990332986017, + "learning_rate": 4.092044948774243e-06, + "loss": 0.9525, + "step": 8356 + }, + { + "epoch": 0.603753138150884, + "grad_norm": 7.383555735158329, + "learning_rate": 4.091819414389509e-06, + "loss": 0.9092, + "step": 8357 + }, + { + "epoch": 0.6038253833510936, + "grad_norm": 6.211380008312647, + "learning_rate": 4.0915938582140015e-06, + "loss": 0.8149, + "step": 8358 + }, + { + "epoch": 0.6038976285513031, + "grad_norm": 7.4265284034464765, + "learning_rate": 4.091368280250811e-06, + "loss": 0.8152, + "step": 8359 + }, + { + "epoch": 0.6039698737515127, + "grad_norm": 7.716072390298332, + "learning_rate": 4.091142680503024e-06, + "loss": 0.9021, + "step": 8360 + }, + { + "epoch": 0.6040421189517221, + "grad_norm": 6.18087382931777, + "learning_rate": 4.09091705897373e-06, + "loss": 0.9304, + "step": 8361 + }, + { + "epoch": 0.6041143641519316, + "grad_norm": 6.520556589252376, + "learning_rate": 4.0906914156660164e-06, + "loss": 0.8895, + "step": 8362 + }, + { + "epoch": 0.6041866093521412, + "grad_norm": 7.431364461853057, + "learning_rate": 4.090465750582973e-06, + "loss": 0.8523, + "step": 8363 + }, + { + "epoch": 0.6042588545523507, + "grad_norm": 6.3381936681210185, + "learning_rate": 4.090240063727686e-06, + "loss": 0.8727, + "step": 8364 + }, + { + "epoch": 0.6043310997525602, + "grad_norm": 6.90206381851356, + "learning_rate": 4.09001435510325e-06, + "loss": 0.9507, + "step": 8365 + }, + { + "epoch": 0.6044033449527697, + "grad_norm": 7.221929997457992, + "learning_rate": 4.089788624712751e-06, + "loss": 0.8699, + "step": 8366 + }, + { + "epoch": 0.6044755901529792, + "grad_norm": 7.126352549887159, + "learning_rate": 4.0895628725592795e-06, + "loss": 0.8588, + "step": 8367 + }, + { + "epoch": 0.6045478353531887, + "grad_norm": 5.7412045422495845, + "learning_rate": 4.0893370986459275e-06, + "loss": 0.8784, + "step": 8368 + }, + { + "epoch": 0.6046200805533982, + "grad_norm": 5.726347665543013, + "learning_rate": 4.089111302975784e-06, + "loss": 0.9466, + "step": 8369 + }, + { + "epoch": 0.6046923257536078, + "grad_norm": 7.532262203844918, + "learning_rate": 4.088885485551941e-06, + "loss": 0.8618, + "step": 8370 + }, + { + "epoch": 0.6047645709538173, + "grad_norm": 6.427838825685826, + "learning_rate": 4.0886596463774886e-06, + "loss": 0.9751, + "step": 8371 + }, + { + "epoch": 0.6048368161540267, + "grad_norm": 6.962508755306855, + "learning_rate": 4.08843378545552e-06, + "loss": 0.8406, + "step": 8372 + }, + { + "epoch": 0.6049090613542363, + "grad_norm": 7.712919311006417, + "learning_rate": 4.088207902789126e-06, + "loss": 0.8174, + "step": 8373 + }, + { + "epoch": 0.6049813065544458, + "grad_norm": 6.7720327713652795, + "learning_rate": 4.087981998381399e-06, + "loss": 0.8235, + "step": 8374 + }, + { + "epoch": 0.6050535517546552, + "grad_norm": 8.01724006785777, + "learning_rate": 4.087756072235431e-06, + "loss": 0.8309, + "step": 8375 + }, + { + "epoch": 0.6051257969548648, + "grad_norm": 7.181851763366922, + "learning_rate": 4.087530124354316e-06, + "loss": 0.9192, + "step": 8376 + }, + { + "epoch": 0.6051980421550743, + "grad_norm": 7.428026079763748, + "learning_rate": 4.0873041547411465e-06, + "loss": 0.9048, + "step": 8377 + }, + { + "epoch": 0.6052702873552839, + "grad_norm": 6.313295219867603, + "learning_rate": 4.087078163399014e-06, + "loss": 0.7764, + "step": 8378 + }, + { + "epoch": 0.6053425325554933, + "grad_norm": 7.451425802521187, + "learning_rate": 4.086852150331015e-06, + "loss": 0.8623, + "step": 8379 + }, + { + "epoch": 0.6054147777557028, + "grad_norm": 7.925330738181866, + "learning_rate": 4.086626115540242e-06, + "loss": 0.8773, + "step": 8380 + }, + { + "epoch": 0.6054870229559124, + "grad_norm": 7.36648481786105, + "learning_rate": 4.086400059029788e-06, + "loss": 0.8567, + "step": 8381 + }, + { + "epoch": 0.6055592681561219, + "grad_norm": 6.734468747081682, + "learning_rate": 4.086173980802751e-06, + "loss": 0.9247, + "step": 8382 + }, + { + "epoch": 0.6056315133563314, + "grad_norm": 6.595339615112468, + "learning_rate": 4.085947880862223e-06, + "loss": 0.8994, + "step": 8383 + }, + { + "epoch": 0.6057037585565409, + "grad_norm": 5.648639089698848, + "learning_rate": 4.0857217592112995e-06, + "loss": 0.7817, + "step": 8384 + }, + { + "epoch": 0.6057760037567504, + "grad_norm": 7.12959944189744, + "learning_rate": 4.0854956158530765e-06, + "loss": 0.8449, + "step": 8385 + }, + { + "epoch": 0.6058482489569599, + "grad_norm": 5.119525683903677, + "learning_rate": 4.08526945079065e-06, + "loss": 0.8254, + "step": 8386 + }, + { + "epoch": 0.6059204941571694, + "grad_norm": 6.575386691597915, + "learning_rate": 4.085043264027115e-06, + "loss": 0.7787, + "step": 8387 + }, + { + "epoch": 0.605992739357379, + "grad_norm": 6.029857731749292, + "learning_rate": 4.08481705556557e-06, + "loss": 0.7934, + "step": 8388 + }, + { + "epoch": 0.6060649845575885, + "grad_norm": 6.205602194484546, + "learning_rate": 4.084590825409108e-06, + "loss": 0.8451, + "step": 8389 + }, + { + "epoch": 0.6061372297577979, + "grad_norm": 6.369608076126586, + "learning_rate": 4.084364573560829e-06, + "loss": 0.8831, + "step": 8390 + }, + { + "epoch": 0.6062094749580075, + "grad_norm": 5.862652403715157, + "learning_rate": 4.084138300023829e-06, + "loss": 0.8748, + "step": 8391 + }, + { + "epoch": 0.606281720158217, + "grad_norm": 7.200497493510621, + "learning_rate": 4.083912004801206e-06, + "loss": 0.9134, + "step": 8392 + }, + { + "epoch": 0.6063539653584264, + "grad_norm": 5.267910696131531, + "learning_rate": 4.0836856878960565e-06, + "loss": 0.8075, + "step": 8393 + }, + { + "epoch": 0.606426210558636, + "grad_norm": 6.162423538445528, + "learning_rate": 4.083459349311481e-06, + "loss": 0.8949, + "step": 8394 + }, + { + "epoch": 0.6064984557588455, + "grad_norm": 5.846206230438536, + "learning_rate": 4.083232989050575e-06, + "loss": 0.8448, + "step": 8395 + }, + { + "epoch": 0.6065707009590551, + "grad_norm": 5.248915969103336, + "learning_rate": 4.0830066071164396e-06, + "loss": 0.8523, + "step": 8396 + }, + { + "epoch": 0.6066429461592645, + "grad_norm": 6.24142417012229, + "learning_rate": 4.0827802035121724e-06, + "loss": 0.8709, + "step": 8397 + }, + { + "epoch": 0.606715191359474, + "grad_norm": 5.126703862988314, + "learning_rate": 4.082553778240874e-06, + "loss": 0.8064, + "step": 8398 + }, + { + "epoch": 0.6067874365596836, + "grad_norm": 5.370132548935794, + "learning_rate": 4.0823273313056425e-06, + "loss": 0.8613, + "step": 8399 + }, + { + "epoch": 0.6068596817598931, + "grad_norm": 6.538995120340578, + "learning_rate": 4.082100862709578e-06, + "loss": 0.8739, + "step": 8400 + }, + { + "epoch": 0.6069319269601026, + "grad_norm": 6.381819424830397, + "learning_rate": 4.081874372455782e-06, + "loss": 0.7978, + "step": 8401 + }, + { + "epoch": 0.6070041721603121, + "grad_norm": 5.954636749796035, + "learning_rate": 4.081647860547354e-06, + "loss": 0.8925, + "step": 8402 + }, + { + "epoch": 0.6070764173605216, + "grad_norm": 7.464737201211728, + "learning_rate": 4.081421326987394e-06, + "loss": 0.961, + "step": 8403 + }, + { + "epoch": 0.6071486625607311, + "grad_norm": 6.314760615386673, + "learning_rate": 4.081194771779005e-06, + "loss": 0.7963, + "step": 8404 + }, + { + "epoch": 0.6072209077609406, + "grad_norm": 5.647614858404138, + "learning_rate": 4.080968194925287e-06, + "loss": 0.8761, + "step": 8405 + }, + { + "epoch": 0.6072931529611502, + "grad_norm": 8.734405572117783, + "learning_rate": 4.080741596429341e-06, + "loss": 0.9114, + "step": 8406 + }, + { + "epoch": 0.6073653981613597, + "grad_norm": 6.019885489049566, + "learning_rate": 4.0805149762942705e-06, + "loss": 0.8566, + "step": 8407 + }, + { + "epoch": 0.6074376433615691, + "grad_norm": 7.601431521064542, + "learning_rate": 4.0802883345231775e-06, + "loss": 0.9127, + "step": 8408 + }, + { + "epoch": 0.6075098885617787, + "grad_norm": 5.143267755162466, + "learning_rate": 4.0800616711191645e-06, + "loss": 0.9248, + "step": 8409 + }, + { + "epoch": 0.6075821337619882, + "grad_norm": 6.115417458234275, + "learning_rate": 4.0798349860853334e-06, + "loss": 0.8284, + "step": 8410 + }, + { + "epoch": 0.6076543789621976, + "grad_norm": 6.462556832098691, + "learning_rate": 4.079608279424788e-06, + "loss": 0.856, + "step": 8411 + }, + { + "epoch": 0.6077266241624072, + "grad_norm": 6.247267774377483, + "learning_rate": 4.079381551140632e-06, + "loss": 0.7891, + "step": 8412 + }, + { + "epoch": 0.6077988693626167, + "grad_norm": 5.435968643805403, + "learning_rate": 4.079154801235969e-06, + "loss": 0.8024, + "step": 8413 + }, + { + "epoch": 0.6078711145628263, + "grad_norm": 6.612659404830773, + "learning_rate": 4.078928029713902e-06, + "loss": 0.8124, + "step": 8414 + }, + { + "epoch": 0.6079433597630357, + "grad_norm": 6.5530824935473175, + "learning_rate": 4.078701236577538e-06, + "loss": 0.8295, + "step": 8415 + }, + { + "epoch": 0.6080156049632452, + "grad_norm": 7.359450678021191, + "learning_rate": 4.078474421829979e-06, + "loss": 0.8331, + "step": 8416 + }, + { + "epoch": 0.6080878501634548, + "grad_norm": 7.141008295836182, + "learning_rate": 4.0782475854743295e-06, + "loss": 0.912, + "step": 8417 + }, + { + "epoch": 0.6081600953636642, + "grad_norm": 5.530933090872904, + "learning_rate": 4.078020727513698e-06, + "loss": 0.8558, + "step": 8418 + }, + { + "epoch": 0.6082323405638738, + "grad_norm": 6.257688752092613, + "learning_rate": 4.077793847951186e-06, + "loss": 0.8509, + "step": 8419 + }, + { + "epoch": 0.6083045857640833, + "grad_norm": 11.37504158431616, + "learning_rate": 4.077566946789903e-06, + "loss": 0.9231, + "step": 8420 + }, + { + "epoch": 0.6083768309642928, + "grad_norm": 7.435686932290069, + "learning_rate": 4.077340024032952e-06, + "loss": 0.9159, + "step": 8421 + }, + { + "epoch": 0.6084490761645023, + "grad_norm": 7.245707326286587, + "learning_rate": 4.077113079683442e-06, + "loss": 0.8132, + "step": 8422 + }, + { + "epoch": 0.6085213213647118, + "grad_norm": 6.2878789889895135, + "learning_rate": 4.076886113744478e-06, + "loss": 0.9048, + "step": 8423 + }, + { + "epoch": 0.6085935665649214, + "grad_norm": 6.4049356740539976, + "learning_rate": 4.076659126219168e-06, + "loss": 0.8813, + "step": 8424 + }, + { + "epoch": 0.6086658117651309, + "grad_norm": 6.3357406190511405, + "learning_rate": 4.076432117110618e-06, + "loss": 0.871, + "step": 8425 + }, + { + "epoch": 0.6087380569653403, + "grad_norm": 5.946698582368289, + "learning_rate": 4.076205086421937e-06, + "loss": 0.8695, + "step": 8426 + }, + { + "epoch": 0.6088103021655499, + "grad_norm": 6.14937893592576, + "learning_rate": 4.075978034156232e-06, + "loss": 0.8559, + "step": 8427 + }, + { + "epoch": 0.6088825473657594, + "grad_norm": 6.6376673675528055, + "learning_rate": 4.07575096031661e-06, + "loss": 0.8905, + "step": 8428 + }, + { + "epoch": 0.6089547925659688, + "grad_norm": 6.649548519554994, + "learning_rate": 4.0755238649061825e-06, + "loss": 0.7934, + "step": 8429 + }, + { + "epoch": 0.6090270377661784, + "grad_norm": 6.090705619736441, + "learning_rate": 4.075296747928056e-06, + "loss": 0.8146, + "step": 8430 + }, + { + "epoch": 0.6090992829663879, + "grad_norm": 5.952788576246942, + "learning_rate": 4.07506960938534e-06, + "loss": 0.9015, + "step": 8431 + }, + { + "epoch": 0.6091715281665975, + "grad_norm": 7.983696298438046, + "learning_rate": 4.074842449281144e-06, + "loss": 0.9471, + "step": 8432 + }, + { + "epoch": 0.6092437733668069, + "grad_norm": 6.807993667079612, + "learning_rate": 4.074615267618579e-06, + "loss": 0.8687, + "step": 8433 + }, + { + "epoch": 0.6093160185670164, + "grad_norm": 5.454212331714253, + "learning_rate": 4.074388064400753e-06, + "loss": 0.9185, + "step": 8434 + }, + { + "epoch": 0.609388263767226, + "grad_norm": 6.3325607765555265, + "learning_rate": 4.074160839630776e-06, + "loss": 0.9122, + "step": 8435 + }, + { + "epoch": 0.6094605089674354, + "grad_norm": 6.2479473558032375, + "learning_rate": 4.073933593311759e-06, + "loss": 0.9355, + "step": 8436 + }, + { + "epoch": 0.609532754167645, + "grad_norm": 5.703708232164076, + "learning_rate": 4.0737063254468146e-06, + "loss": 0.8317, + "step": 8437 + }, + { + "epoch": 0.6096049993678545, + "grad_norm": 5.330114625195654, + "learning_rate": 4.073479036039052e-06, + "loss": 0.7311, + "step": 8438 + }, + { + "epoch": 0.609677244568064, + "grad_norm": 5.453457272962478, + "learning_rate": 4.073251725091583e-06, + "loss": 0.8745, + "step": 8439 + }, + { + "epoch": 0.6097494897682735, + "grad_norm": 4.972594111699348, + "learning_rate": 4.073024392607519e-06, + "loss": 0.92, + "step": 8440 + }, + { + "epoch": 0.609821734968483, + "grad_norm": 6.738244734955409, + "learning_rate": 4.072797038589973e-06, + "loss": 0.8116, + "step": 8441 + }, + { + "epoch": 0.6098939801686926, + "grad_norm": 6.273458478424606, + "learning_rate": 4.072569663042056e-06, + "loss": 0.9044, + "step": 8442 + }, + { + "epoch": 0.6099662253689021, + "grad_norm": 4.523480930679891, + "learning_rate": 4.0723422659668825e-06, + "loss": 0.7786, + "step": 8443 + }, + { + "epoch": 0.6100384705691115, + "grad_norm": 6.538402382768603, + "learning_rate": 4.072114847367563e-06, + "loss": 0.8788, + "step": 8444 + }, + { + "epoch": 0.6101107157693211, + "grad_norm": 6.924590600647251, + "learning_rate": 4.071887407247213e-06, + "loss": 0.9087, + "step": 8445 + }, + { + "epoch": 0.6101829609695306, + "grad_norm": 5.9019210532792306, + "learning_rate": 4.071659945608945e-06, + "loss": 0.8202, + "step": 8446 + }, + { + "epoch": 0.61025520616974, + "grad_norm": 7.180866663489746, + "learning_rate": 4.071432462455872e-06, + "loss": 0.8614, + "step": 8447 + }, + { + "epoch": 0.6103274513699496, + "grad_norm": 6.31354929446215, + "learning_rate": 4.071204957791109e-06, + "loss": 0.9225, + "step": 8448 + }, + { + "epoch": 0.6103996965701591, + "grad_norm": 5.746500691954776, + "learning_rate": 4.0709774316177696e-06, + "loss": 0.7699, + "step": 8449 + }, + { + "epoch": 0.6104719417703687, + "grad_norm": 6.070630471482271, + "learning_rate": 4.0707498839389695e-06, + "loss": 0.8626, + "step": 8450 + }, + { + "epoch": 0.6105441869705781, + "grad_norm": 8.316623210092684, + "learning_rate": 4.070522314757822e-06, + "loss": 0.8793, + "step": 8451 + }, + { + "epoch": 0.6106164321707876, + "grad_norm": 6.496734972862691, + "learning_rate": 4.070294724077445e-06, + "loss": 0.8733, + "step": 8452 + }, + { + "epoch": 0.6106886773709972, + "grad_norm": 6.198950143341909, + "learning_rate": 4.070067111900952e-06, + "loss": 0.8406, + "step": 8453 + }, + { + "epoch": 0.6107609225712066, + "grad_norm": 6.674720604010445, + "learning_rate": 4.069839478231458e-06, + "loss": 0.8495, + "step": 8454 + }, + { + "epoch": 0.6108331677714162, + "grad_norm": 6.738756210615819, + "learning_rate": 4.069611823072082e-06, + "loss": 0.8845, + "step": 8455 + }, + { + "epoch": 0.6109054129716257, + "grad_norm": 6.904296046234038, + "learning_rate": 4.069384146425938e-06, + "loss": 0.9763, + "step": 8456 + }, + { + "epoch": 0.6109776581718352, + "grad_norm": 11.006068203154134, + "learning_rate": 4.069156448296145e-06, + "loss": 0.8966, + "step": 8457 + }, + { + "epoch": 0.6110499033720447, + "grad_norm": 6.614597142322774, + "learning_rate": 4.068928728685817e-06, + "loss": 0.8571, + "step": 8458 + }, + { + "epoch": 0.6111221485722542, + "grad_norm": 6.233584093296035, + "learning_rate": 4.068700987598074e-06, + "loss": 0.8621, + "step": 8459 + }, + { + "epoch": 0.6111943937724638, + "grad_norm": 6.048905694530074, + "learning_rate": 4.068473225036032e-06, + "loss": 0.9194, + "step": 8460 + }, + { + "epoch": 0.6112666389726733, + "grad_norm": 7.852438442276775, + "learning_rate": 4.068245441002809e-06, + "loss": 0.8382, + "step": 8461 + }, + { + "epoch": 0.6113388841728827, + "grad_norm": 9.925683631702183, + "learning_rate": 4.0680176355015235e-06, + "loss": 0.8685, + "step": 8462 + }, + { + "epoch": 0.6114111293730923, + "grad_norm": 8.13993706606219, + "learning_rate": 4.067789808535295e-06, + "loss": 0.9347, + "step": 8463 + }, + { + "epoch": 0.6114833745733018, + "grad_norm": 5.788873049046638, + "learning_rate": 4.067561960107241e-06, + "loss": 0.8844, + "step": 8464 + }, + { + "epoch": 0.6115556197735112, + "grad_norm": 4.494861423957479, + "learning_rate": 4.067334090220481e-06, + "loss": 0.7907, + "step": 8465 + }, + { + "epoch": 0.6116278649737208, + "grad_norm": 8.501331449459798, + "learning_rate": 4.067106198878134e-06, + "loss": 0.8889, + "step": 8466 + }, + { + "epoch": 0.6117001101739303, + "grad_norm": 9.583682465758775, + "learning_rate": 4.06687828608332e-06, + "loss": 0.8997, + "step": 8467 + }, + { + "epoch": 0.6117723553741399, + "grad_norm": 7.344410448609642, + "learning_rate": 4.0666503518391585e-06, + "loss": 0.8164, + "step": 8468 + }, + { + "epoch": 0.6118446005743493, + "grad_norm": 7.681088635058764, + "learning_rate": 4.06642239614877e-06, + "loss": 0.8006, + "step": 8469 + }, + { + "epoch": 0.6119168457745588, + "grad_norm": 6.13742766483357, + "learning_rate": 4.066194419015276e-06, + "loss": 0.9038, + "step": 8470 + }, + { + "epoch": 0.6119890909747684, + "grad_norm": 5.841289797973239, + "learning_rate": 4.065966420441796e-06, + "loss": 0.9177, + "step": 8471 + }, + { + "epoch": 0.6120613361749778, + "grad_norm": 6.556065774807595, + "learning_rate": 4.065738400431452e-06, + "loss": 0.8785, + "step": 8472 + }, + { + "epoch": 0.6121335813751874, + "grad_norm": 7.233630411871129, + "learning_rate": 4.065510358987364e-06, + "loss": 0.869, + "step": 8473 + }, + { + "epoch": 0.6122058265753969, + "grad_norm": 7.150542135959676, + "learning_rate": 4.065282296112656e-06, + "loss": 0.8293, + "step": 8474 + }, + { + "epoch": 0.6122780717756064, + "grad_norm": 6.5022749221020835, + "learning_rate": 4.065054211810448e-06, + "loss": 0.8507, + "step": 8475 + }, + { + "epoch": 0.6123503169758159, + "grad_norm": 5.845265237942954, + "learning_rate": 4.064826106083862e-06, + "loss": 0.8312, + "step": 8476 + }, + { + "epoch": 0.6124225621760254, + "grad_norm": 5.090085823819443, + "learning_rate": 4.064597978936023e-06, + "loss": 0.8388, + "step": 8477 + }, + { + "epoch": 0.612494807376235, + "grad_norm": 5.911429581313155, + "learning_rate": 4.064369830370053e-06, + "loss": 0.7671, + "step": 8478 + }, + { + "epoch": 0.6125670525764445, + "grad_norm": 7.424873726024713, + "learning_rate": 4.064141660389074e-06, + "loss": 0.9081, + "step": 8479 + }, + { + "epoch": 0.6126392977766539, + "grad_norm": 6.347914658201739, + "learning_rate": 4.063913468996209e-06, + "loss": 0.8699, + "step": 8480 + }, + { + "epoch": 0.6127115429768635, + "grad_norm": 6.747566349569725, + "learning_rate": 4.063685256194585e-06, + "loss": 0.8607, + "step": 8481 + }, + { + "epoch": 0.612783788177073, + "grad_norm": 5.894037447217248, + "learning_rate": 4.063457021987323e-06, + "loss": 0.8837, + "step": 8482 + }, + { + "epoch": 0.6128560333772824, + "grad_norm": 6.089533359332538, + "learning_rate": 4.063228766377548e-06, + "loss": 0.8834, + "step": 8483 + }, + { + "epoch": 0.612928278577492, + "grad_norm": 5.543140626464454, + "learning_rate": 4.063000489368385e-06, + "loss": 0.8008, + "step": 8484 + }, + { + "epoch": 0.6130005237777015, + "grad_norm": 7.0575379350962235, + "learning_rate": 4.062772190962959e-06, + "loss": 0.8666, + "step": 8485 + }, + { + "epoch": 0.6130727689779111, + "grad_norm": 8.41639658680291, + "learning_rate": 4.062543871164396e-06, + "loss": 0.8566, + "step": 8486 + }, + { + "epoch": 0.6131450141781205, + "grad_norm": 10.616400615845881, + "learning_rate": 4.062315529975819e-06, + "loss": 0.8441, + "step": 8487 + }, + { + "epoch": 0.61321725937833, + "grad_norm": 6.86271513165682, + "learning_rate": 4.062087167400356e-06, + "loss": 0.8801, + "step": 8488 + }, + { + "epoch": 0.6132895045785396, + "grad_norm": 7.409488018433696, + "learning_rate": 4.061858783441133e-06, + "loss": 0.875, + "step": 8489 + }, + { + "epoch": 0.613361749778749, + "grad_norm": 8.126833664782309, + "learning_rate": 4.061630378101276e-06, + "loss": 0.8633, + "step": 8490 + }, + { + "epoch": 0.6134339949789586, + "grad_norm": 7.6826255665431935, + "learning_rate": 4.061401951383912e-06, + "loss": 0.8147, + "step": 8491 + }, + { + "epoch": 0.6135062401791681, + "grad_norm": 10.589125082239997, + "learning_rate": 4.061173503292167e-06, + "loss": 0.8744, + "step": 8492 + }, + { + "epoch": 0.6135784853793776, + "grad_norm": 8.18296164026538, + "learning_rate": 4.060945033829169e-06, + "loss": 0.8451, + "step": 8493 + }, + { + "epoch": 0.6136507305795871, + "grad_norm": 7.489822857828507, + "learning_rate": 4.060716542998045e-06, + "loss": 0.8627, + "step": 8494 + }, + { + "epoch": 0.6137229757797966, + "grad_norm": 7.311897970343332, + "learning_rate": 4.0604880308019245e-06, + "loss": 0.9487, + "step": 8495 + }, + { + "epoch": 0.6137952209800062, + "grad_norm": 7.7355820455696955, + "learning_rate": 4.0602594972439335e-06, + "loss": 0.8141, + "step": 8496 + }, + { + "epoch": 0.6138674661802157, + "grad_norm": 5.660865544433837, + "learning_rate": 4.060030942327202e-06, + "loss": 0.8288, + "step": 8497 + }, + { + "epoch": 0.6139397113804251, + "grad_norm": 6.080989368652124, + "learning_rate": 4.059802366054858e-06, + "loss": 0.8264, + "step": 8498 + }, + { + "epoch": 0.6140119565806347, + "grad_norm": 6.29314817792, + "learning_rate": 4.059573768430031e-06, + "loss": 0.904, + "step": 8499 + }, + { + "epoch": 0.6140842017808442, + "grad_norm": 5.136744370777359, + "learning_rate": 4.05934514945585e-06, + "loss": 0.7632, + "step": 8500 + }, + { + "epoch": 0.6141564469810536, + "grad_norm": 7.215114454179785, + "learning_rate": 4.059116509135444e-06, + "loss": 0.8196, + "step": 8501 + }, + { + "epoch": 0.6142286921812632, + "grad_norm": 6.305986121774895, + "learning_rate": 4.058887847471944e-06, + "loss": 0.8258, + "step": 8502 + }, + { + "epoch": 0.6143009373814727, + "grad_norm": 5.197149471116072, + "learning_rate": 4.058659164468479e-06, + "loss": 0.8049, + "step": 8503 + }, + { + "epoch": 0.6143731825816823, + "grad_norm": 5.29763653437472, + "learning_rate": 4.058430460128182e-06, + "loss": 0.8508, + "step": 8504 + }, + { + "epoch": 0.6144454277818917, + "grad_norm": 7.419668668767957, + "learning_rate": 4.05820173445418e-06, + "loss": 0.8794, + "step": 8505 + }, + { + "epoch": 0.6145176729821012, + "grad_norm": 5.747076867163849, + "learning_rate": 4.057972987449608e-06, + "loss": 0.8025, + "step": 8506 + }, + { + "epoch": 0.6145899181823108, + "grad_norm": 5.585258474627074, + "learning_rate": 4.057744219117594e-06, + "loss": 0.9402, + "step": 8507 + }, + { + "epoch": 0.6146621633825202, + "grad_norm": 6.661184281912463, + "learning_rate": 4.057515429461273e-06, + "loss": 0.9672, + "step": 8508 + }, + { + "epoch": 0.6147344085827298, + "grad_norm": 7.636091326527618, + "learning_rate": 4.057286618483773e-06, + "loss": 0.8522, + "step": 8509 + }, + { + "epoch": 0.6148066537829393, + "grad_norm": 6.002780270148133, + "learning_rate": 4.057057786188229e-06, + "loss": 0.8843, + "step": 8510 + }, + { + "epoch": 0.6148788989831488, + "grad_norm": 5.695820852441329, + "learning_rate": 4.056828932577774e-06, + "loss": 0.861, + "step": 8511 + }, + { + "epoch": 0.6149511441833583, + "grad_norm": 5.0215459083161, + "learning_rate": 4.056600057655538e-06, + "loss": 0.8836, + "step": 8512 + }, + { + "epoch": 0.6150233893835678, + "grad_norm": 5.495749825437729, + "learning_rate": 4.056371161424657e-06, + "loss": 0.7904, + "step": 8513 + }, + { + "epoch": 0.6150956345837774, + "grad_norm": 5.2805933798079066, + "learning_rate": 4.056142243888263e-06, + "loss": 0.8533, + "step": 8514 + }, + { + "epoch": 0.6151678797839869, + "grad_norm": 8.9918790842255, + "learning_rate": 4.0559133050494895e-06, + "loss": 0.918, + "step": 8515 + }, + { + "epoch": 0.6152401249841963, + "grad_norm": 8.40241321231748, + "learning_rate": 4.055684344911472e-06, + "loss": 0.8263, + "step": 8516 + }, + { + "epoch": 0.6153123701844059, + "grad_norm": 6.441582727646162, + "learning_rate": 4.055455363477343e-06, + "loss": 0.8342, + "step": 8517 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 5.501959711679586, + "learning_rate": 4.055226360750238e-06, + "loss": 0.8491, + "step": 8518 + }, + { + "epoch": 0.6154568605848248, + "grad_norm": 7.439318009980033, + "learning_rate": 4.054997336733291e-06, + "loss": 0.8544, + "step": 8519 + }, + { + "epoch": 0.6155291057850344, + "grad_norm": 5.571021079088416, + "learning_rate": 4.054768291429638e-06, + "loss": 0.8902, + "step": 8520 + }, + { + "epoch": 0.6156013509852439, + "grad_norm": 5.490616857341358, + "learning_rate": 4.054539224842416e-06, + "loss": 0.8076, + "step": 8521 + }, + { + "epoch": 0.6156735961854535, + "grad_norm": 7.822386566636833, + "learning_rate": 4.054310136974758e-06, + "loss": 0.9139, + "step": 8522 + }, + { + "epoch": 0.6157458413856629, + "grad_norm": 8.001050403300328, + "learning_rate": 4.0540810278298e-06, + "loss": 0.7977, + "step": 8523 + }, + { + "epoch": 0.6158180865858724, + "grad_norm": 7.053609197422333, + "learning_rate": 4.05385189741068e-06, + "loss": 0.8152, + "step": 8524 + }, + { + "epoch": 0.615890331786082, + "grad_norm": 6.615586700833134, + "learning_rate": 4.053622745720534e-06, + "loss": 0.8121, + "step": 8525 + }, + { + "epoch": 0.6159625769862914, + "grad_norm": 5.772740054816128, + "learning_rate": 4.053393572762499e-06, + "loss": 0.819, + "step": 8526 + }, + { + "epoch": 0.616034822186501, + "grad_norm": 5.850211910746034, + "learning_rate": 4.053164378539712e-06, + "loss": 0.9109, + "step": 8527 + }, + { + "epoch": 0.6161070673867105, + "grad_norm": 8.83427908770782, + "learning_rate": 4.052935163055311e-06, + "loss": 0.9018, + "step": 8528 + }, + { + "epoch": 0.61617931258692, + "grad_norm": 8.027876921162164, + "learning_rate": 4.052705926312434e-06, + "loss": 0.8824, + "step": 8529 + }, + { + "epoch": 0.6162515577871295, + "grad_norm": 6.049293527260442, + "learning_rate": 4.052476668314217e-06, + "loss": 0.8597, + "step": 8530 + }, + { + "epoch": 0.616323802987339, + "grad_norm": 7.793346393966934, + "learning_rate": 4.0522473890638e-06, + "loss": 0.8868, + "step": 8531 + }, + { + "epoch": 0.6163960481875486, + "grad_norm": 6.226430779072552, + "learning_rate": 4.052018088564322e-06, + "loss": 0.8883, + "step": 8532 + }, + { + "epoch": 0.6164682933877581, + "grad_norm": 5.961380968308971, + "learning_rate": 4.051788766818922e-06, + "loss": 0.8891, + "step": 8533 + }, + { + "epoch": 0.6165405385879675, + "grad_norm": 6.783635858011658, + "learning_rate": 4.051559423830738e-06, + "loss": 1.0432, + "step": 8534 + }, + { + "epoch": 0.6166127837881771, + "grad_norm": 7.4767446465861, + "learning_rate": 4.05133005960291e-06, + "loss": 0.8496, + "step": 8535 + }, + { + "epoch": 0.6166850289883866, + "grad_norm": 5.369328189930435, + "learning_rate": 4.0511006741385786e-06, + "loss": 0.8143, + "step": 8536 + }, + { + "epoch": 0.616757274188596, + "grad_norm": 5.424328163842296, + "learning_rate": 4.050871267440883e-06, + "loss": 0.793, + "step": 8537 + }, + { + "epoch": 0.6168295193888056, + "grad_norm": 5.770074383604945, + "learning_rate": 4.0506418395129645e-06, + "loss": 0.8562, + "step": 8538 + }, + { + "epoch": 0.6169017645890151, + "grad_norm": 6.11629069296255, + "learning_rate": 4.050412390357962e-06, + "loss": 0.9453, + "step": 8539 + }, + { + "epoch": 0.6169740097892247, + "grad_norm": 8.195143997016523, + "learning_rate": 4.050182919979019e-06, + "loss": 0.9172, + "step": 8540 + }, + { + "epoch": 0.6170462549894341, + "grad_norm": 6.42597715316876, + "learning_rate": 4.0499534283792745e-06, + "loss": 0.8733, + "step": 8541 + }, + { + "epoch": 0.6171185001896436, + "grad_norm": 5.5449002041448665, + "learning_rate": 4.049723915561871e-06, + "loss": 0.7533, + "step": 8542 + }, + { + "epoch": 0.6171907453898532, + "grad_norm": 5.8440254676885335, + "learning_rate": 4.049494381529951e-06, + "loss": 0.871, + "step": 8543 + }, + { + "epoch": 0.6172629905900626, + "grad_norm": 5.378195056979884, + "learning_rate": 4.049264826286656e-06, + "loss": 0.8888, + "step": 8544 + }, + { + "epoch": 0.6173352357902722, + "grad_norm": 7.444433490624918, + "learning_rate": 4.049035249835128e-06, + "loss": 0.7692, + "step": 8545 + }, + { + "epoch": 0.6174074809904817, + "grad_norm": 7.657088751489914, + "learning_rate": 4.04880565217851e-06, + "loss": 0.8205, + "step": 8546 + }, + { + "epoch": 0.6174797261906912, + "grad_norm": 5.368998348367273, + "learning_rate": 4.048576033319946e-06, + "loss": 0.759, + "step": 8547 + }, + { + "epoch": 0.6175519713909007, + "grad_norm": 5.668911246263923, + "learning_rate": 4.048346393262578e-06, + "loss": 0.8594, + "step": 8548 + }, + { + "epoch": 0.6176242165911102, + "grad_norm": 7.012407750357604, + "learning_rate": 4.048116732009551e-06, + "loss": 0.904, + "step": 8549 + }, + { + "epoch": 0.6176964617913198, + "grad_norm": 6.465272420172333, + "learning_rate": 4.047887049564007e-06, + "loss": 0.8207, + "step": 8550 + }, + { + "epoch": 0.6177687069915293, + "grad_norm": 6.549895761839786, + "learning_rate": 4.04765734592909e-06, + "loss": 0.8861, + "step": 8551 + }, + { + "epoch": 0.6178409521917387, + "grad_norm": 7.719138934881021, + "learning_rate": 4.047427621107948e-06, + "loss": 0.949, + "step": 8552 + }, + { + "epoch": 0.6179131973919483, + "grad_norm": 7.6924179597434, + "learning_rate": 4.047197875103722e-06, + "loss": 0.8856, + "step": 8553 + }, + { + "epoch": 0.6179854425921578, + "grad_norm": 6.166801897920296, + "learning_rate": 4.0469681079195584e-06, + "loss": 0.8699, + "step": 8554 + }, + { + "epoch": 0.6180576877923672, + "grad_norm": 7.762186129454819, + "learning_rate": 4.046738319558603e-06, + "loss": 0.8676, + "step": 8555 + }, + { + "epoch": 0.6181299329925768, + "grad_norm": 6.451683640861338, + "learning_rate": 4.0465085100240006e-06, + "loss": 0.8604, + "step": 8556 + }, + { + "epoch": 0.6182021781927863, + "grad_norm": 5.83321373226535, + "learning_rate": 4.046278679318898e-06, + "loss": 0.8955, + "step": 8557 + }, + { + "epoch": 0.6182744233929959, + "grad_norm": 6.964068152973362, + "learning_rate": 4.046048827446441e-06, + "loss": 0.8779, + "step": 8558 + }, + { + "epoch": 0.6183466685932053, + "grad_norm": 6.073175842442932, + "learning_rate": 4.0458189544097744e-06, + "loss": 0.8341, + "step": 8559 + }, + { + "epoch": 0.6184189137934148, + "grad_norm": 6.199735315887328, + "learning_rate": 4.0455890602120475e-06, + "loss": 0.9331, + "step": 8560 + }, + { + "epoch": 0.6184911589936244, + "grad_norm": 7.817837044208884, + "learning_rate": 4.045359144856407e-06, + "loss": 0.9444, + "step": 8561 + }, + { + "epoch": 0.6185634041938338, + "grad_norm": 7.06432757741979, + "learning_rate": 4.045129208345999e-06, + "loss": 0.9046, + "step": 8562 + }, + { + "epoch": 0.6186356493940434, + "grad_norm": 6.5262963888919465, + "learning_rate": 4.044899250683973e-06, + "loss": 0.8503, + "step": 8563 + }, + { + "epoch": 0.6187078945942529, + "grad_norm": 5.628207839110476, + "learning_rate": 4.044669271873475e-06, + "loss": 0.8759, + "step": 8564 + }, + { + "epoch": 0.6187801397944624, + "grad_norm": 8.134232191018544, + "learning_rate": 4.044439271917654e-06, + "loss": 0.8644, + "step": 8565 + }, + { + "epoch": 0.6188523849946719, + "grad_norm": 7.558559683789512, + "learning_rate": 4.044209250819658e-06, + "loss": 0.931, + "step": 8566 + }, + { + "epoch": 0.6189246301948814, + "grad_norm": 6.745835891858237, + "learning_rate": 4.043979208582637e-06, + "loss": 0.9127, + "step": 8567 + }, + { + "epoch": 0.618996875395091, + "grad_norm": 5.7235884079758925, + "learning_rate": 4.0437491452097395e-06, + "loss": 0.8835, + "step": 8568 + }, + { + "epoch": 0.6190691205953005, + "grad_norm": 7.833999226721773, + "learning_rate": 4.043519060704114e-06, + "loss": 0.9022, + "step": 8569 + }, + { + "epoch": 0.6191413657955099, + "grad_norm": 6.609677138761175, + "learning_rate": 4.043288955068913e-06, + "loss": 0.8651, + "step": 8570 + }, + { + "epoch": 0.6192136109957195, + "grad_norm": 6.256508452024492, + "learning_rate": 4.043058828307283e-06, + "loss": 0.9041, + "step": 8571 + }, + { + "epoch": 0.619285856195929, + "grad_norm": 5.467528811755184, + "learning_rate": 4.042828680422376e-06, + "loss": 0.8627, + "step": 8572 + }, + { + "epoch": 0.6193581013961384, + "grad_norm": 6.278543828776166, + "learning_rate": 4.042598511417343e-06, + "loss": 0.8546, + "step": 8573 + }, + { + "epoch": 0.619430346596348, + "grad_norm": 5.506936294444872, + "learning_rate": 4.042368321295333e-06, + "loss": 0.7523, + "step": 8574 + }, + { + "epoch": 0.6195025917965575, + "grad_norm": 6.279927432375967, + "learning_rate": 4.042138110059498e-06, + "loss": 0.792, + "step": 8575 + }, + { + "epoch": 0.6195748369967671, + "grad_norm": 7.915729614482157, + "learning_rate": 4.041907877712991e-06, + "loss": 0.9044, + "step": 8576 + }, + { + "epoch": 0.6196470821969765, + "grad_norm": 6.533001094596721, + "learning_rate": 4.041677624258962e-06, + "loss": 0.8166, + "step": 8577 + }, + { + "epoch": 0.619719327397186, + "grad_norm": 7.546137992933765, + "learning_rate": 4.041447349700562e-06, + "loss": 0.9074, + "step": 8578 + }, + { + "epoch": 0.6197915725973956, + "grad_norm": 6.210840457182129, + "learning_rate": 4.041217054040946e-06, + "loss": 0.8078, + "step": 8579 + }, + { + "epoch": 0.619863817797605, + "grad_norm": 5.564381313429654, + "learning_rate": 4.040986737283265e-06, + "loss": 0.9725, + "step": 8580 + }, + { + "epoch": 0.6199360629978146, + "grad_norm": 5.157645851547048, + "learning_rate": 4.040756399430672e-06, + "loss": 0.9078, + "step": 8581 + }, + { + "epoch": 0.6200083081980241, + "grad_norm": 6.149983116064699, + "learning_rate": 4.04052604048632e-06, + "loss": 0.8702, + "step": 8582 + }, + { + "epoch": 0.6200805533982336, + "grad_norm": 5.842140669735749, + "learning_rate": 4.0402956604533625e-06, + "loss": 0.8525, + "step": 8583 + }, + { + "epoch": 0.6201527985984431, + "grad_norm": 7.266537713589435, + "learning_rate": 4.040065259334954e-06, + "loss": 0.8761, + "step": 8584 + }, + { + "epoch": 0.6202250437986526, + "grad_norm": 6.599337122602821, + "learning_rate": 4.039834837134248e-06, + "loss": 0.8982, + "step": 8585 + }, + { + "epoch": 0.6202972889988622, + "grad_norm": 7.094537498018529, + "learning_rate": 4.039604393854398e-06, + "loss": 0.8933, + "step": 8586 + }, + { + "epoch": 0.6203695341990717, + "grad_norm": 5.991331195945666, + "learning_rate": 4.03937392949856e-06, + "loss": 0.8513, + "step": 8587 + }, + { + "epoch": 0.6204417793992811, + "grad_norm": 7.291875462040355, + "learning_rate": 4.0391434440698875e-06, + "loss": 0.8908, + "step": 8588 + }, + { + "epoch": 0.6205140245994907, + "grad_norm": 6.696658699077742, + "learning_rate": 4.0389129375715366e-06, + "loss": 0.8472, + "step": 8589 + }, + { + "epoch": 0.6205862697997002, + "grad_norm": 6.952151661156894, + "learning_rate": 4.038682410006662e-06, + "loss": 0.8738, + "step": 8590 + }, + { + "epoch": 0.6206585149999096, + "grad_norm": 5.5665450229412174, + "learning_rate": 4.038451861378421e-06, + "loss": 0.7677, + "step": 8591 + }, + { + "epoch": 0.6207307602001192, + "grad_norm": 6.914504145940511, + "learning_rate": 4.038221291689968e-06, + "loss": 0.8007, + "step": 8592 + }, + { + "epoch": 0.6208030054003287, + "grad_norm": 6.040644311454678, + "learning_rate": 4.037990700944459e-06, + "loss": 0.9086, + "step": 8593 + }, + { + "epoch": 0.6208752506005383, + "grad_norm": 6.981428032417592, + "learning_rate": 4.037760089145052e-06, + "loss": 0.8873, + "step": 8594 + }, + { + "epoch": 0.6209474958007477, + "grad_norm": 4.992125413774236, + "learning_rate": 4.037529456294904e-06, + "loss": 0.8494, + "step": 8595 + }, + { + "epoch": 0.6210197410009572, + "grad_norm": 5.103170284670668, + "learning_rate": 4.03729880239717e-06, + "loss": 0.8111, + "step": 8596 + }, + { + "epoch": 0.6210919862011668, + "grad_norm": 6.630788613392682, + "learning_rate": 4.037068127455011e-06, + "loss": 0.8626, + "step": 8597 + }, + { + "epoch": 0.6211642314013762, + "grad_norm": 5.066085111135766, + "learning_rate": 4.036837431471582e-06, + "loss": 0.8458, + "step": 8598 + }, + { + "epoch": 0.6212364766015858, + "grad_norm": 5.966819890503893, + "learning_rate": 4.036606714450042e-06, + "loss": 0.8539, + "step": 8599 + }, + { + "epoch": 0.6213087218017953, + "grad_norm": 5.528932592424352, + "learning_rate": 4.0363759763935486e-06, + "loss": 0.9367, + "step": 8600 + }, + { + "epoch": 0.6213809670020048, + "grad_norm": 8.016190834225608, + "learning_rate": 4.0361452173052616e-06, + "loss": 1.0066, + "step": 8601 + }, + { + "epoch": 0.6214532122022143, + "grad_norm": 5.661418766092856, + "learning_rate": 4.035914437188339e-06, + "loss": 0.8229, + "step": 8602 + }, + { + "epoch": 0.6215254574024238, + "grad_norm": 6.3437911910800935, + "learning_rate": 4.03568363604594e-06, + "loss": 0.8129, + "step": 8603 + }, + { + "epoch": 0.6215977026026334, + "grad_norm": 5.581194036017363, + "learning_rate": 4.0354528138812255e-06, + "loss": 0.8575, + "step": 8604 + }, + { + "epoch": 0.6216699478028429, + "grad_norm": 5.443187326286224, + "learning_rate": 4.035221970697353e-06, + "loss": 0.8738, + "step": 8605 + }, + { + "epoch": 0.6217421930030523, + "grad_norm": 6.071810150978165, + "learning_rate": 4.0349911064974845e-06, + "loss": 0.8846, + "step": 8606 + }, + { + "epoch": 0.6218144382032619, + "grad_norm": 7.05213424085746, + "learning_rate": 4.03476022128478e-06, + "loss": 0.9483, + "step": 8607 + }, + { + "epoch": 0.6218866834034714, + "grad_norm": 6.0723419528211835, + "learning_rate": 4.034529315062399e-06, + "loss": 0.8076, + "step": 8608 + }, + { + "epoch": 0.6219589286036808, + "grad_norm": 5.061651841687166, + "learning_rate": 4.034298387833503e-06, + "loss": 0.8547, + "step": 8609 + }, + { + "epoch": 0.6220311738038904, + "grad_norm": 6.164975871018019, + "learning_rate": 4.034067439601254e-06, + "loss": 0.8349, + "step": 8610 + }, + { + "epoch": 0.6221034190040999, + "grad_norm": 6.098552044225975, + "learning_rate": 4.0338364703688115e-06, + "loss": 0.8819, + "step": 8611 + }, + { + "epoch": 0.6221756642043095, + "grad_norm": 5.359447245208215, + "learning_rate": 4.03360548013934e-06, + "loss": 0.7751, + "step": 8612 + }, + { + "epoch": 0.6222479094045189, + "grad_norm": 7.419079191750604, + "learning_rate": 4.0333744689159996e-06, + "loss": 0.8948, + "step": 8613 + }, + { + "epoch": 0.6223201546047284, + "grad_norm": 7.842023317480895, + "learning_rate": 4.033143436701954e-06, + "loss": 1.0058, + "step": 8614 + }, + { + "epoch": 0.622392399804938, + "grad_norm": 7.231652557061539, + "learning_rate": 4.032912383500365e-06, + "loss": 0.8516, + "step": 8615 + }, + { + "epoch": 0.6224646450051474, + "grad_norm": 5.660025837977444, + "learning_rate": 4.032681309314395e-06, + "loss": 0.8835, + "step": 8616 + }, + { + "epoch": 0.622536890205357, + "grad_norm": 6.419366871165047, + "learning_rate": 4.032450214147208e-06, + "loss": 0.7578, + "step": 8617 + }, + { + "epoch": 0.6226091354055665, + "grad_norm": 6.666124353603455, + "learning_rate": 4.032219098001968e-06, + "loss": 0.8731, + "step": 8618 + }, + { + "epoch": 0.622681380605776, + "grad_norm": 6.073902850507435, + "learning_rate": 4.031987960881838e-06, + "loss": 0.9018, + "step": 8619 + }, + { + "epoch": 0.6227536258059855, + "grad_norm": 6.866496150050774, + "learning_rate": 4.031756802789983e-06, + "loss": 0.874, + "step": 8620 + }, + { + "epoch": 0.622825871006195, + "grad_norm": 8.0088980780937, + "learning_rate": 4.031525623729565e-06, + "loss": 0.8587, + "step": 8621 + }, + { + "epoch": 0.6228981162064046, + "grad_norm": 4.998681848342347, + "learning_rate": 4.031294423703752e-06, + "loss": 0.9626, + "step": 8622 + }, + { + "epoch": 0.622970361406614, + "grad_norm": 7.478993561706343, + "learning_rate": 4.031063202715707e-06, + "loss": 0.9321, + "step": 8623 + }, + { + "epoch": 0.6230426066068235, + "grad_norm": 7.5477678548736815, + "learning_rate": 4.0308319607685955e-06, + "loss": 0.8691, + "step": 8624 + }, + { + "epoch": 0.6231148518070331, + "grad_norm": 7.011652648104156, + "learning_rate": 4.0306006978655824e-06, + "loss": 0.8974, + "step": 8625 + }, + { + "epoch": 0.6231870970072426, + "grad_norm": 7.591483574622917, + "learning_rate": 4.030369414009835e-06, + "loss": 0.9032, + "step": 8626 + }, + { + "epoch": 0.623259342207452, + "grad_norm": 7.059366797129614, + "learning_rate": 4.0301381092045185e-06, + "loss": 0.9249, + "step": 8627 + }, + { + "epoch": 0.6233315874076616, + "grad_norm": 6.321539421433539, + "learning_rate": 4.029906783452798e-06, + "loss": 0.8655, + "step": 8628 + }, + { + "epoch": 0.6234038326078711, + "grad_norm": 5.361149886039929, + "learning_rate": 4.029675436757844e-06, + "loss": 0.8288, + "step": 8629 + }, + { + "epoch": 0.6234760778080807, + "grad_norm": 6.908206438188349, + "learning_rate": 4.029444069122819e-06, + "loss": 0.9234, + "step": 8630 + }, + { + "epoch": 0.6235483230082901, + "grad_norm": 7.208591199316093, + "learning_rate": 4.029212680550893e-06, + "loss": 0.8237, + "step": 8631 + }, + { + "epoch": 0.6236205682084996, + "grad_norm": 5.42602872455782, + "learning_rate": 4.028981271045233e-06, + "loss": 0.9514, + "step": 8632 + }, + { + "epoch": 0.6236928134087092, + "grad_norm": 9.379855106755672, + "learning_rate": 4.028749840609005e-06, + "loss": 0.9254, + "step": 8633 + }, + { + "epoch": 0.6237650586089186, + "grad_norm": 5.883173674379345, + "learning_rate": 4.0285183892453805e-06, + "loss": 0.79, + "step": 8634 + }, + { + "epoch": 0.6238373038091282, + "grad_norm": 7.253313556999473, + "learning_rate": 4.0282869169575254e-06, + "loss": 0.8397, + "step": 8635 + }, + { + "epoch": 0.6239095490093377, + "grad_norm": 6.251244993186857, + "learning_rate": 4.028055423748609e-06, + "loss": 0.8414, + "step": 8636 + }, + { + "epoch": 0.6239817942095472, + "grad_norm": 6.048988938772866, + "learning_rate": 4.027823909621801e-06, + "loss": 0.8863, + "step": 8637 + }, + { + "epoch": 0.6240540394097567, + "grad_norm": 4.965379447459275, + "learning_rate": 4.027592374580269e-06, + "loss": 0.7733, + "step": 8638 + }, + { + "epoch": 0.6241262846099662, + "grad_norm": 7.396320108504021, + "learning_rate": 4.0273608186271836e-06, + "loss": 0.8983, + "step": 8639 + }, + { + "epoch": 0.6241985298101758, + "grad_norm": 5.978561246661116, + "learning_rate": 4.027129241765715e-06, + "loss": 0.8035, + "step": 8640 + }, + { + "epoch": 0.6242707750103852, + "grad_norm": 6.307626788316156, + "learning_rate": 4.0268976439990325e-06, + "loss": 0.8869, + "step": 8641 + }, + { + "epoch": 0.6243430202105947, + "grad_norm": 6.228383639143278, + "learning_rate": 4.026666025330307e-06, + "loss": 0.8435, + "step": 8642 + }, + { + "epoch": 0.6244152654108043, + "grad_norm": 6.151119669894204, + "learning_rate": 4.026434385762709e-06, + "loss": 0.7973, + "step": 8643 + }, + { + "epoch": 0.6244875106110138, + "grad_norm": 6.264931809281819, + "learning_rate": 4.026202725299409e-06, + "loss": 0.8913, + "step": 8644 + }, + { + "epoch": 0.6245597558112232, + "grad_norm": 6.569310605184731, + "learning_rate": 4.02597104394358e-06, + "loss": 0.9117, + "step": 8645 + }, + { + "epoch": 0.6246320010114328, + "grad_norm": 7.48270354590886, + "learning_rate": 4.02573934169839e-06, + "loss": 0.9066, + "step": 8646 + }, + { + "epoch": 0.6247042462116423, + "grad_norm": 6.676724603293343, + "learning_rate": 4.025507618567015e-06, + "loss": 0.9038, + "step": 8647 + }, + { + "epoch": 0.6247764914118519, + "grad_norm": 6.7451518980077285, + "learning_rate": 4.025275874552624e-06, + "loss": 0.8843, + "step": 8648 + }, + { + "epoch": 0.6248487366120613, + "grad_norm": 7.9113461199987904, + "learning_rate": 4.025044109658391e-06, + "loss": 0.8725, + "step": 8649 + }, + { + "epoch": 0.6249209818122708, + "grad_norm": 5.712228527338096, + "learning_rate": 4.024812323887489e-06, + "loss": 0.9144, + "step": 8650 + }, + { + "epoch": 0.6249932270124804, + "grad_norm": 6.269088586884466, + "learning_rate": 4.02458051724309e-06, + "loss": 0.8536, + "step": 8651 + }, + { + "epoch": 0.6250654722126898, + "grad_norm": 9.03788539966947, + "learning_rate": 4.024348689728367e-06, + "loss": 0.9003, + "step": 8652 + }, + { + "epoch": 0.6251377174128994, + "grad_norm": 6.313328452559897, + "learning_rate": 4.024116841346494e-06, + "loss": 0.855, + "step": 8653 + }, + { + "epoch": 0.6252099626131089, + "grad_norm": 8.12857962762839, + "learning_rate": 4.023884972100644e-06, + "loss": 0.9329, + "step": 8654 + }, + { + "epoch": 0.6252822078133184, + "grad_norm": 6.302557943668648, + "learning_rate": 4.023653081993994e-06, + "loss": 0.8442, + "step": 8655 + }, + { + "epoch": 0.6253544530135279, + "grad_norm": 9.851413032916506, + "learning_rate": 4.023421171029714e-06, + "loss": 1.0157, + "step": 8656 + }, + { + "epoch": 0.6254266982137374, + "grad_norm": 7.012001648192748, + "learning_rate": 4.0231892392109835e-06, + "loss": 0.8833, + "step": 8657 + }, + { + "epoch": 0.625498943413947, + "grad_norm": 5.514517264936543, + "learning_rate": 4.022957286540974e-06, + "loss": 0.9733, + "step": 8658 + }, + { + "epoch": 0.6255711886141564, + "grad_norm": 7.81889191642788, + "learning_rate": 4.022725313022861e-06, + "loss": 0.862, + "step": 8659 + }, + { + "epoch": 0.6256434338143659, + "grad_norm": 6.331980042012246, + "learning_rate": 4.0224933186598214e-06, + "loss": 0.8539, + "step": 8660 + }, + { + "epoch": 0.6257156790145755, + "grad_norm": 5.557186203170099, + "learning_rate": 4.022261303455031e-06, + "loss": 0.8457, + "step": 8661 + }, + { + "epoch": 0.625787924214785, + "grad_norm": 7.138510493216885, + "learning_rate": 4.022029267411663e-06, + "loss": 0.9836, + "step": 8662 + }, + { + "epoch": 0.6258601694149944, + "grad_norm": 6.4195258305928204, + "learning_rate": 4.021797210532898e-06, + "loss": 0.8183, + "step": 8663 + }, + { + "epoch": 0.625932414615204, + "grad_norm": 8.166749240172338, + "learning_rate": 4.0215651328219115e-06, + "loss": 0.9248, + "step": 8664 + }, + { + "epoch": 0.6260046598154135, + "grad_norm": 6.548093841644634, + "learning_rate": 4.021333034281878e-06, + "loss": 0.8515, + "step": 8665 + }, + { + "epoch": 0.6260769050156231, + "grad_norm": 8.877241684322614, + "learning_rate": 4.021100914915977e-06, + "loss": 0.9273, + "step": 8666 + }, + { + "epoch": 0.6261491502158325, + "grad_norm": 4.9543251484941875, + "learning_rate": 4.0208687747273865e-06, + "loss": 0.8673, + "step": 8667 + }, + { + "epoch": 0.626221395416042, + "grad_norm": 5.333194412965308, + "learning_rate": 4.020636613719282e-06, + "loss": 0.8442, + "step": 8668 + }, + { + "epoch": 0.6262936406162516, + "grad_norm": 5.915531031551936, + "learning_rate": 4.020404431894844e-06, + "loss": 0.9245, + "step": 8669 + }, + { + "epoch": 0.626365885816461, + "grad_norm": 5.322727008384531, + "learning_rate": 4.02017222925725e-06, + "loss": 0.8416, + "step": 8670 + }, + { + "epoch": 0.6264381310166706, + "grad_norm": 5.911286643695732, + "learning_rate": 4.0199400058096775e-06, + "loss": 0.8927, + "step": 8671 + }, + { + "epoch": 0.6265103762168801, + "grad_norm": 5.654160255797689, + "learning_rate": 4.019707761555307e-06, + "loss": 0.9003, + "step": 8672 + }, + { + "epoch": 0.6265826214170896, + "grad_norm": 6.602492769144209, + "learning_rate": 4.019475496497318e-06, + "loss": 0.8777, + "step": 8673 + }, + { + "epoch": 0.6266548666172991, + "grad_norm": 7.07231093347127, + "learning_rate": 4.0192432106388876e-06, + "loss": 0.8661, + "step": 8674 + }, + { + "epoch": 0.6267271118175086, + "grad_norm": 6.1363510517029844, + "learning_rate": 4.019010903983198e-06, + "loss": 0.8113, + "step": 8675 + }, + { + "epoch": 0.6267993570177182, + "grad_norm": 5.923624308746452, + "learning_rate": 4.018778576533429e-06, + "loss": 0.8343, + "step": 8676 + }, + { + "epoch": 0.6268716022179276, + "grad_norm": 6.171160151936555, + "learning_rate": 4.01854622829276e-06, + "loss": 0.8589, + "step": 8677 + }, + { + "epoch": 0.6269438474181371, + "grad_norm": 5.799771718761532, + "learning_rate": 4.018313859264372e-06, + "loss": 0.8396, + "step": 8678 + }, + { + "epoch": 0.6270160926183467, + "grad_norm": 4.979838157076865, + "learning_rate": 4.018081469451447e-06, + "loss": 0.8034, + "step": 8679 + }, + { + "epoch": 0.6270883378185562, + "grad_norm": 6.90482035895599, + "learning_rate": 4.017849058857165e-06, + "loss": 0.7689, + "step": 8680 + }, + { + "epoch": 0.6271605830187656, + "grad_norm": 6.87410688668004, + "learning_rate": 4.017616627484708e-06, + "loss": 0.9576, + "step": 8681 + }, + { + "epoch": 0.6272328282189752, + "grad_norm": 5.376642730211842, + "learning_rate": 4.017384175337258e-06, + "loss": 0.8116, + "step": 8682 + }, + { + "epoch": 0.6273050734191847, + "grad_norm": 5.770232719078514, + "learning_rate": 4.017151702417996e-06, + "loss": 0.8755, + "step": 8683 + }, + { + "epoch": 0.6273773186193943, + "grad_norm": 6.319816051172241, + "learning_rate": 4.016919208730106e-06, + "loss": 0.8454, + "step": 8684 + }, + { + "epoch": 0.6274495638196037, + "grad_norm": 6.855988450742807, + "learning_rate": 4.016686694276769e-06, + "loss": 0.8998, + "step": 8685 + }, + { + "epoch": 0.6275218090198132, + "grad_norm": 6.482172359803252, + "learning_rate": 4.016454159061169e-06, + "loss": 0.9166, + "step": 8686 + }, + { + "epoch": 0.6275940542200228, + "grad_norm": 6.371929551246992, + "learning_rate": 4.01622160308649e-06, + "loss": 0.8594, + "step": 8687 + }, + { + "epoch": 0.6276662994202322, + "grad_norm": 5.654083680134464, + "learning_rate": 4.015989026355913e-06, + "loss": 0.8354, + "step": 8688 + }, + { + "epoch": 0.6277385446204418, + "grad_norm": 6.525007705391613, + "learning_rate": 4.015756428872625e-06, + "loss": 0.7484, + "step": 8689 + }, + { + "epoch": 0.6278107898206513, + "grad_norm": 7.141005891954283, + "learning_rate": 4.015523810639807e-06, + "loss": 0.9266, + "step": 8690 + }, + { + "epoch": 0.6278830350208608, + "grad_norm": 5.0800467742809055, + "learning_rate": 4.015291171660646e-06, + "loss": 0.8066, + "step": 8691 + }, + { + "epoch": 0.6279552802210703, + "grad_norm": 5.5277183163169505, + "learning_rate": 4.0150585119383235e-06, + "loss": 0.8837, + "step": 8692 + }, + { + "epoch": 0.6280275254212798, + "grad_norm": 5.520334626711422, + "learning_rate": 4.014825831476028e-06, + "loss": 0.8454, + "step": 8693 + }, + { + "epoch": 0.6280997706214894, + "grad_norm": 6.102560862824119, + "learning_rate": 4.0145931302769424e-06, + "loss": 0.8294, + "step": 8694 + }, + { + "epoch": 0.6281720158216988, + "grad_norm": 7.729828949253457, + "learning_rate": 4.014360408344253e-06, + "loss": 0.9084, + "step": 8695 + }, + { + "epoch": 0.6282442610219083, + "grad_norm": 5.175221815285055, + "learning_rate": 4.014127665681146e-06, + "loss": 0.8469, + "step": 8696 + }, + { + "epoch": 0.6283165062221179, + "grad_norm": 7.141068125524967, + "learning_rate": 4.013894902290806e-06, + "loss": 0.8416, + "step": 8697 + }, + { + "epoch": 0.6283887514223274, + "grad_norm": 6.546473524944019, + "learning_rate": 4.013662118176421e-06, + "loss": 0.8202, + "step": 8698 + }, + { + "epoch": 0.6284609966225368, + "grad_norm": 6.526188545573369, + "learning_rate": 4.013429313341176e-06, + "loss": 0.9016, + "step": 8699 + }, + { + "epoch": 0.6285332418227464, + "grad_norm": 7.752114530475121, + "learning_rate": 4.013196487788259e-06, + "loss": 0.8727, + "step": 8700 + }, + { + "epoch": 0.6286054870229559, + "grad_norm": 6.195558482959505, + "learning_rate": 4.012963641520858e-06, + "loss": 0.8982, + "step": 8701 + }, + { + "epoch": 0.6286777322231655, + "grad_norm": 6.776253402025121, + "learning_rate": 4.0127307745421584e-06, + "loss": 0.9, + "step": 8702 + }, + { + "epoch": 0.6287499774233749, + "grad_norm": 4.532645175163786, + "learning_rate": 4.0124978868553486e-06, + "loss": 0.7657, + "step": 8703 + }, + { + "epoch": 0.6288222226235844, + "grad_norm": 6.884463645859575, + "learning_rate": 4.012264978463618e-06, + "loss": 0.8208, + "step": 8704 + }, + { + "epoch": 0.628894467823794, + "grad_norm": 10.89561665442913, + "learning_rate": 4.012032049370155e-06, + "loss": 0.8693, + "step": 8705 + }, + { + "epoch": 0.6289667130240034, + "grad_norm": 7.242828768719998, + "learning_rate": 4.011799099578145e-06, + "loss": 0.8105, + "step": 8706 + }, + { + "epoch": 0.629038958224213, + "grad_norm": 6.154618874966487, + "learning_rate": 4.011566129090782e-06, + "loss": 0.8565, + "step": 8707 + }, + { + "epoch": 0.6291112034244225, + "grad_norm": 6.200941801834325, + "learning_rate": 4.011333137911249e-06, + "loss": 0.9046, + "step": 8708 + }, + { + "epoch": 0.629183448624632, + "grad_norm": 7.146560972959958, + "learning_rate": 4.0111001260427415e-06, + "loss": 0.837, + "step": 8709 + }, + { + "epoch": 0.6292556938248415, + "grad_norm": 5.9819675314126375, + "learning_rate": 4.010867093488445e-06, + "loss": 0.9306, + "step": 8710 + }, + { + "epoch": 0.629327939025051, + "grad_norm": 8.95715262742117, + "learning_rate": 4.010634040251552e-06, + "loss": 0.9856, + "step": 8711 + }, + { + "epoch": 0.6294001842252606, + "grad_norm": 5.374868169542935, + "learning_rate": 4.0104009663352515e-06, + "loss": 0.8119, + "step": 8712 + }, + { + "epoch": 0.62947242942547, + "grad_norm": 7.132157060606434, + "learning_rate": 4.0101678717427354e-06, + "loss": 0.7803, + "step": 8713 + }, + { + "epoch": 0.6295446746256795, + "grad_norm": 5.514531791781453, + "learning_rate": 4.009934756477193e-06, + "loss": 0.8642, + "step": 8714 + }, + { + "epoch": 0.6296169198258891, + "grad_norm": 7.194908768616924, + "learning_rate": 4.009701620541816e-06, + "loss": 0.954, + "step": 8715 + }, + { + "epoch": 0.6296891650260986, + "grad_norm": 7.931883499421097, + "learning_rate": 4.0094684639397966e-06, + "loss": 0.7909, + "step": 8716 + }, + { + "epoch": 0.629761410226308, + "grad_norm": 8.17911432074264, + "learning_rate": 4.009235286674325e-06, + "loss": 0.9322, + "step": 8717 + }, + { + "epoch": 0.6298336554265176, + "grad_norm": 7.5439103584343945, + "learning_rate": 4.009002088748595e-06, + "loss": 0.8273, + "step": 8718 + }, + { + "epoch": 0.6299059006267271, + "grad_norm": 5.162498433480014, + "learning_rate": 4.008768870165797e-06, + "loss": 0.7806, + "step": 8719 + }, + { + "epoch": 0.6299781458269367, + "grad_norm": 6.526413290205449, + "learning_rate": 4.008535630929125e-06, + "loss": 0.8471, + "step": 8720 + }, + { + "epoch": 0.6300503910271461, + "grad_norm": 6.754208136056071, + "learning_rate": 4.008302371041773e-06, + "loss": 0.8687, + "step": 8721 + }, + { + "epoch": 0.6301226362273556, + "grad_norm": 6.3851578511178095, + "learning_rate": 4.0080690905069315e-06, + "loss": 0.7783, + "step": 8722 + }, + { + "epoch": 0.6301948814275652, + "grad_norm": 6.06525665218183, + "learning_rate": 4.007835789327795e-06, + "loss": 0.7899, + "step": 8723 + }, + { + "epoch": 0.6302671266277746, + "grad_norm": 5.877446740548848, + "learning_rate": 4.0076024675075575e-06, + "loss": 0.8259, + "step": 8724 + }, + { + "epoch": 0.6303393718279842, + "grad_norm": 5.92562578333059, + "learning_rate": 4.007369125049413e-06, + "loss": 0.838, + "step": 8725 + }, + { + "epoch": 0.6304116170281937, + "grad_norm": 6.45260359387189, + "learning_rate": 4.007135761956556e-06, + "loss": 0.7733, + "step": 8726 + }, + { + "epoch": 0.6304838622284032, + "grad_norm": 7.419619054757772, + "learning_rate": 4.006902378232179e-06, + "loss": 0.9769, + "step": 8727 + }, + { + "epoch": 0.6305561074286127, + "grad_norm": 6.971521758633728, + "learning_rate": 4.0066689738794795e-06, + "loss": 0.9175, + "step": 8728 + }, + { + "epoch": 0.6306283526288222, + "grad_norm": 6.497753928999326, + "learning_rate": 4.006435548901651e-06, + "loss": 0.8895, + "step": 8729 + }, + { + "epoch": 0.6307005978290318, + "grad_norm": 7.163201965765293, + "learning_rate": 4.006202103301891e-06, + "loss": 0.8643, + "step": 8730 + }, + { + "epoch": 0.6307728430292412, + "grad_norm": 5.806843291850855, + "learning_rate": 4.005968637083393e-06, + "loss": 0.8843, + "step": 8731 + }, + { + "epoch": 0.6308450882294507, + "grad_norm": 5.729549029913405, + "learning_rate": 4.005735150249353e-06, + "loss": 0.8333, + "step": 8732 + }, + { + "epoch": 0.6309173334296603, + "grad_norm": 6.602328103719184, + "learning_rate": 4.0055016428029684e-06, + "loss": 0.9721, + "step": 8733 + }, + { + "epoch": 0.6309895786298698, + "grad_norm": 5.975993768895589, + "learning_rate": 4.005268114747435e-06, + "loss": 0.8151, + "step": 8734 + }, + { + "epoch": 0.6310618238300792, + "grad_norm": 6.165184392716199, + "learning_rate": 4.00503456608595e-06, + "loss": 0.8674, + "step": 8735 + }, + { + "epoch": 0.6311340690302888, + "grad_norm": 7.197833519403382, + "learning_rate": 4.004800996821711e-06, + "loss": 0.89, + "step": 8736 + }, + { + "epoch": 0.6312063142304983, + "grad_norm": 5.972450587122927, + "learning_rate": 4.004567406957914e-06, + "loss": 0.8409, + "step": 8737 + }, + { + "epoch": 0.6312785594307079, + "grad_norm": 6.306304005901059, + "learning_rate": 4.004333796497758e-06, + "loss": 0.8714, + "step": 8738 + }, + { + "epoch": 0.6313508046309173, + "grad_norm": 7.05324765336627, + "learning_rate": 4.00410016544444e-06, + "loss": 0.9145, + "step": 8739 + }, + { + "epoch": 0.6314230498311268, + "grad_norm": 5.4246488395986505, + "learning_rate": 4.0038665138011575e-06, + "loss": 0.8913, + "step": 8740 + }, + { + "epoch": 0.6314952950313364, + "grad_norm": 5.585785208806885, + "learning_rate": 4.003632841571112e-06, + "loss": 0.8412, + "step": 8741 + }, + { + "epoch": 0.6315675402315458, + "grad_norm": 7.177096060052743, + "learning_rate": 4.003399148757499e-06, + "loss": 0.8826, + "step": 8742 + }, + { + "epoch": 0.6316397854317554, + "grad_norm": 6.1103080954033695, + "learning_rate": 4.0031654353635204e-06, + "loss": 0.925, + "step": 8743 + }, + { + "epoch": 0.6317120306319649, + "grad_norm": 6.385566481491739, + "learning_rate": 4.002931701392373e-06, + "loss": 0.9062, + "step": 8744 + }, + { + "epoch": 0.6317842758321744, + "grad_norm": 5.903688874583489, + "learning_rate": 4.002697946847258e-06, + "loss": 0.9745, + "step": 8745 + }, + { + "epoch": 0.6318565210323839, + "grad_norm": 7.6143018372180205, + "learning_rate": 4.002464171731375e-06, + "loss": 0.8942, + "step": 8746 + }, + { + "epoch": 0.6319287662325934, + "grad_norm": 7.118264745930996, + "learning_rate": 4.002230376047923e-06, + "loss": 0.9196, + "step": 8747 + }, + { + "epoch": 0.632001011432803, + "grad_norm": 6.429291463824504, + "learning_rate": 4.001996559800105e-06, + "loss": 0.8561, + "step": 8748 + }, + { + "epoch": 0.6320732566330124, + "grad_norm": 5.262692866109795, + "learning_rate": 4.00176272299112e-06, + "loss": 0.8219, + "step": 8749 + }, + { + "epoch": 0.6321455018332219, + "grad_norm": 6.320552109371867, + "learning_rate": 4.001528865624169e-06, + "loss": 0.8655, + "step": 8750 + }, + { + "epoch": 0.6322177470334315, + "grad_norm": 6.231060169814103, + "learning_rate": 4.001294987702454e-06, + "loss": 0.8487, + "step": 8751 + }, + { + "epoch": 0.632289992233641, + "grad_norm": 7.277911685685576, + "learning_rate": 4.001061089229176e-06, + "loss": 0.9258, + "step": 8752 + }, + { + "epoch": 0.6323622374338504, + "grad_norm": 8.00586008973222, + "learning_rate": 4.000827170207537e-06, + "loss": 0.8404, + "step": 8753 + }, + { + "epoch": 0.63243448263406, + "grad_norm": 5.9865967134893046, + "learning_rate": 4.0005932306407394e-06, + "loss": 0.9005, + "step": 8754 + }, + { + "epoch": 0.6325067278342695, + "grad_norm": 5.854608086973097, + "learning_rate": 4.0003592705319855e-06, + "loss": 0.8083, + "step": 8755 + }, + { + "epoch": 0.6325789730344791, + "grad_norm": 6.8334546272243575, + "learning_rate": 4.000125289884478e-06, + "loss": 0.8501, + "step": 8756 + }, + { + "epoch": 0.6326512182346885, + "grad_norm": 7.335667094324142, + "learning_rate": 3.9998912887014205e-06, + "loss": 0.8875, + "step": 8757 + }, + { + "epoch": 0.632723463434898, + "grad_norm": 6.66441682363289, + "learning_rate": 3.9996572669860164e-06, + "loss": 0.8904, + "step": 8758 + }, + { + "epoch": 0.6327957086351076, + "grad_norm": 6.64608289036817, + "learning_rate": 3.999423224741469e-06, + "loss": 0.8423, + "step": 8759 + }, + { + "epoch": 0.632867953835317, + "grad_norm": 6.051777589552363, + "learning_rate": 3.99918916197098e-06, + "loss": 0.8338, + "step": 8760 + }, + { + "epoch": 0.6329401990355266, + "grad_norm": 6.01621534745338, + "learning_rate": 3.998955078677757e-06, + "loss": 0.8669, + "step": 8761 + }, + { + "epoch": 0.6330124442357361, + "grad_norm": 7.032079486662411, + "learning_rate": 3.9987209748650015e-06, + "loss": 0.8469, + "step": 8762 + }, + { + "epoch": 0.6330846894359456, + "grad_norm": 5.4877899927490486, + "learning_rate": 3.9984868505359206e-06, + "loss": 0.7907, + "step": 8763 + }, + { + "epoch": 0.6331569346361551, + "grad_norm": 6.270146971323095, + "learning_rate": 3.9982527056937185e-06, + "loss": 0.9133, + "step": 8764 + }, + { + "epoch": 0.6332291798363646, + "grad_norm": 6.307197685602035, + "learning_rate": 3.998018540341599e-06, + "loss": 0.8653, + "step": 8765 + }, + { + "epoch": 0.6333014250365742, + "grad_norm": 6.060615797324018, + "learning_rate": 3.99778435448277e-06, + "loss": 0.7956, + "step": 8766 + }, + { + "epoch": 0.6333736702367836, + "grad_norm": 6.037056297283658, + "learning_rate": 3.997550148120435e-06, + "loss": 0.8323, + "step": 8767 + }, + { + "epoch": 0.6334459154369931, + "grad_norm": 6.75608607824676, + "learning_rate": 3.9973159212578016e-06, + "loss": 0.9111, + "step": 8768 + }, + { + "epoch": 0.6335181606372027, + "grad_norm": 7.788224002008978, + "learning_rate": 3.997081673898077e-06, + "loss": 0.7565, + "step": 8769 + }, + { + "epoch": 0.6335904058374122, + "grad_norm": 5.331405748281369, + "learning_rate": 3.996847406044465e-06, + "loss": 0.927, + "step": 8770 + }, + { + "epoch": 0.6336626510376216, + "grad_norm": 5.687466464100992, + "learning_rate": 3.996613117700175e-06, + "loss": 0.8967, + "step": 8771 + }, + { + "epoch": 0.6337348962378312, + "grad_norm": 7.631960334178924, + "learning_rate": 3.996378808868414e-06, + "loss": 0.8284, + "step": 8772 + }, + { + "epoch": 0.6338071414380407, + "grad_norm": 7.899833397376101, + "learning_rate": 3.996144479552388e-06, + "loss": 0.8885, + "step": 8773 + }, + { + "epoch": 0.6338793866382503, + "grad_norm": 6.783996869500355, + "learning_rate": 3.995910129755306e-06, + "loss": 0.8585, + "step": 8774 + }, + { + "epoch": 0.6339516318384597, + "grad_norm": 5.023790409393554, + "learning_rate": 3.9956757594803765e-06, + "loss": 0.7675, + "step": 8775 + }, + { + "epoch": 0.6340238770386692, + "grad_norm": 5.681562069756013, + "learning_rate": 3.9954413687308065e-06, + "loss": 0.8697, + "step": 8776 + }, + { + "epoch": 0.6340961222388788, + "grad_norm": 6.730749583792046, + "learning_rate": 3.9952069575098054e-06, + "loss": 0.8607, + "step": 8777 + }, + { + "epoch": 0.6341683674390882, + "grad_norm": 10.022879938092542, + "learning_rate": 3.994972525820582e-06, + "loss": 0.9504, + "step": 8778 + }, + { + "epoch": 0.6342406126392978, + "grad_norm": 6.6973126160315575, + "learning_rate": 3.994738073666345e-06, + "loss": 0.8251, + "step": 8779 + }, + { + "epoch": 0.6343128578395073, + "grad_norm": 6.444090497319733, + "learning_rate": 3.994503601050306e-06, + "loss": 0.8272, + "step": 8780 + }, + { + "epoch": 0.6343851030397168, + "grad_norm": 8.075482938042603, + "learning_rate": 3.994269107975671e-06, + "loss": 0.9418, + "step": 8781 + }, + { + "epoch": 0.6344573482399263, + "grad_norm": 5.785381098981226, + "learning_rate": 3.994034594445654e-06, + "loss": 0.824, + "step": 8782 + }, + { + "epoch": 0.6345295934401358, + "grad_norm": 6.707826614987042, + "learning_rate": 3.993800060463462e-06, + "loss": 0.919, + "step": 8783 + }, + { + "epoch": 0.6346018386403454, + "grad_norm": 6.62695082246497, + "learning_rate": 3.993565506032308e-06, + "loss": 0.8774, + "step": 8784 + }, + { + "epoch": 0.6346740838405548, + "grad_norm": 5.706373486222034, + "learning_rate": 3.993330931155402e-06, + "loss": 0.799, + "step": 8785 + }, + { + "epoch": 0.6347463290407643, + "grad_norm": 5.240531921470023, + "learning_rate": 3.9930963358359545e-06, + "loss": 0.8331, + "step": 8786 + }, + { + "epoch": 0.6348185742409739, + "grad_norm": 5.685324787400665, + "learning_rate": 3.992861720077178e-06, + "loss": 0.8823, + "step": 8787 + }, + { + "epoch": 0.6348908194411834, + "grad_norm": 7.0811672114947575, + "learning_rate": 3.992627083882282e-06, + "loss": 0.8922, + "step": 8788 + }, + { + "epoch": 0.6349630646413928, + "grad_norm": 6.435913186653669, + "learning_rate": 3.992392427254482e-06, + "loss": 0.8577, + "step": 8789 + }, + { + "epoch": 0.6350353098416024, + "grad_norm": 6.389715526379609, + "learning_rate": 3.992157750196988e-06, + "loss": 0.8426, + "step": 8790 + }, + { + "epoch": 0.6351075550418119, + "grad_norm": 4.752763446246464, + "learning_rate": 3.991923052713013e-06, + "loss": 0.8055, + "step": 8791 + }, + { + "epoch": 0.6351798002420215, + "grad_norm": 7.2848954795822225, + "learning_rate": 3.9916883348057685e-06, + "loss": 0.8083, + "step": 8792 + }, + { + "epoch": 0.6352520454422309, + "grad_norm": 7.457837420409338, + "learning_rate": 3.991453596478471e-06, + "loss": 0.8054, + "step": 8793 + }, + { + "epoch": 0.6353242906424404, + "grad_norm": 6.1053840890043585, + "learning_rate": 3.991218837734331e-06, + "loss": 0.7809, + "step": 8794 + }, + { + "epoch": 0.63539653584265, + "grad_norm": 6.91721877992984, + "learning_rate": 3.990984058576563e-06, + "loss": 0.874, + "step": 8795 + }, + { + "epoch": 0.6354687810428594, + "grad_norm": 4.857964706613073, + "learning_rate": 3.99074925900838e-06, + "loss": 0.8117, + "step": 8796 + }, + { + "epoch": 0.635541026243069, + "grad_norm": 7.387617793814219, + "learning_rate": 3.990514439032998e-06, + "loss": 0.8549, + "step": 8797 + }, + { + "epoch": 0.6356132714432785, + "grad_norm": 6.068866650828399, + "learning_rate": 3.99027959865363e-06, + "loss": 0.7671, + "step": 8798 + }, + { + "epoch": 0.635685516643488, + "grad_norm": 6.650118157046951, + "learning_rate": 3.990044737873493e-06, + "loss": 0.8958, + "step": 8799 + }, + { + "epoch": 0.6357577618436975, + "grad_norm": 5.660184219112229, + "learning_rate": 3.989809856695799e-06, + "loss": 0.8406, + "step": 8800 + }, + { + "epoch": 0.635830007043907, + "grad_norm": 7.106007207985467, + "learning_rate": 3.989574955123765e-06, + "loss": 0.7636, + "step": 8801 + }, + { + "epoch": 0.6359022522441166, + "grad_norm": 6.16883578255325, + "learning_rate": 3.9893400331606075e-06, + "loss": 0.8132, + "step": 8802 + }, + { + "epoch": 0.635974497444326, + "grad_norm": 6.247298610535535, + "learning_rate": 3.98910509080954e-06, + "loss": 0.8229, + "step": 8803 + }, + { + "epoch": 0.6360467426445355, + "grad_norm": 6.946095133948518, + "learning_rate": 3.988870128073781e-06, + "loss": 0.827, + "step": 8804 + }, + { + "epoch": 0.6361189878447451, + "grad_norm": 6.964928370297081, + "learning_rate": 3.988635144956546e-06, + "loss": 0.8954, + "step": 8805 + }, + { + "epoch": 0.6361912330449546, + "grad_norm": 6.063651378665514, + "learning_rate": 3.98840014146105e-06, + "loss": 0.7878, + "step": 8806 + }, + { + "epoch": 0.636263478245164, + "grad_norm": 7.561696411818694, + "learning_rate": 3.988165117590514e-06, + "loss": 0.9234, + "step": 8807 + }, + { + "epoch": 0.6363357234453736, + "grad_norm": 7.511381541443146, + "learning_rate": 3.987930073348152e-06, + "loss": 0.8393, + "step": 8808 + }, + { + "epoch": 0.6364079686455831, + "grad_norm": 5.813382645727402, + "learning_rate": 3.9876950087371826e-06, + "loss": 0.9414, + "step": 8809 + }, + { + "epoch": 0.6364802138457927, + "grad_norm": 6.083195236241112, + "learning_rate": 3.9874599237608236e-06, + "loss": 0.8275, + "step": 8810 + }, + { + "epoch": 0.6365524590460021, + "grad_norm": 6.173964057543411, + "learning_rate": 3.987224818422293e-06, + "loss": 0.8838, + "step": 8811 + }, + { + "epoch": 0.6366247042462116, + "grad_norm": 6.791756783416064, + "learning_rate": 3.98698969272481e-06, + "loss": 0.8935, + "step": 8812 + }, + { + "epoch": 0.6366969494464212, + "grad_norm": 7.35516120158284, + "learning_rate": 3.986754546671592e-06, + "loss": 0.8739, + "step": 8813 + }, + { + "epoch": 0.6367691946466306, + "grad_norm": 7.2364707201706215, + "learning_rate": 3.986519380265859e-06, + "loss": 0.9234, + "step": 8814 + }, + { + "epoch": 0.6368414398468402, + "grad_norm": 6.4027780940602135, + "learning_rate": 3.9862841935108295e-06, + "loss": 0.8922, + "step": 8815 + }, + { + "epoch": 0.6369136850470497, + "grad_norm": 5.39887205988834, + "learning_rate": 3.986048986409724e-06, + "loss": 0.8013, + "step": 8816 + }, + { + "epoch": 0.6369859302472592, + "grad_norm": 5.5865391120521375, + "learning_rate": 3.985813758965762e-06, + "loss": 0.8422, + "step": 8817 + }, + { + "epoch": 0.6370581754474687, + "grad_norm": 6.199288899650356, + "learning_rate": 3.985578511182163e-06, + "loss": 0.8207, + "step": 8818 + }, + { + "epoch": 0.6371304206476782, + "grad_norm": 5.649436260895238, + "learning_rate": 3.985343243062148e-06, + "loss": 0.7677, + "step": 8819 + }, + { + "epoch": 0.6372026658478878, + "grad_norm": 7.613635990965794, + "learning_rate": 3.985107954608935e-06, + "loss": 0.8364, + "step": 8820 + }, + { + "epoch": 0.6372749110480972, + "grad_norm": 6.341270826754065, + "learning_rate": 3.98487264582575e-06, + "loss": 0.9381, + "step": 8821 + }, + { + "epoch": 0.6373471562483067, + "grad_norm": 7.280920717054753, + "learning_rate": 3.98463731671581e-06, + "loss": 0.9618, + "step": 8822 + }, + { + "epoch": 0.6374194014485163, + "grad_norm": 5.832407669010226, + "learning_rate": 3.9844019672823385e-06, + "loss": 0.8339, + "step": 8823 + }, + { + "epoch": 0.6374916466487258, + "grad_norm": 6.561281799599813, + "learning_rate": 3.984166597528557e-06, + "loss": 0.8482, + "step": 8824 + }, + { + "epoch": 0.6375638918489352, + "grad_norm": 7.483005597500031, + "learning_rate": 3.983931207457686e-06, + "loss": 0.8492, + "step": 8825 + }, + { + "epoch": 0.6376361370491448, + "grad_norm": 6.085506869592974, + "learning_rate": 3.983695797072949e-06, + "loss": 0.855, + "step": 8826 + }, + { + "epoch": 0.6377083822493543, + "grad_norm": 6.418958609663125, + "learning_rate": 3.98346036637757e-06, + "loss": 0.8431, + "step": 8827 + }, + { + "epoch": 0.6377806274495639, + "grad_norm": 6.079484253140203, + "learning_rate": 3.983224915374769e-06, + "loss": 0.8212, + "step": 8828 + }, + { + "epoch": 0.6378528726497733, + "grad_norm": 8.54874668862571, + "learning_rate": 3.982989444067771e-06, + "loss": 0.953, + "step": 8829 + }, + { + "epoch": 0.6379251178499828, + "grad_norm": 6.362175102407305, + "learning_rate": 3.982753952459799e-06, + "loss": 0.8296, + "step": 8830 + }, + { + "epoch": 0.6379973630501924, + "grad_norm": 5.447560791684803, + "learning_rate": 3.982518440554078e-06, + "loss": 0.8431, + "step": 8831 + }, + { + "epoch": 0.6380696082504018, + "grad_norm": 7.7237088108893746, + "learning_rate": 3.982282908353829e-06, + "loss": 0.9218, + "step": 8832 + }, + { + "epoch": 0.6381418534506114, + "grad_norm": 5.8162462200953495, + "learning_rate": 3.982047355862277e-06, + "loss": 0.8876, + "step": 8833 + }, + { + "epoch": 0.6382140986508209, + "grad_norm": 6.7220152971070375, + "learning_rate": 3.98181178308265e-06, + "loss": 0.8545, + "step": 8834 + }, + { + "epoch": 0.6382863438510304, + "grad_norm": 6.422463049034299, + "learning_rate": 3.981576190018168e-06, + "loss": 0.8564, + "step": 8835 + }, + { + "epoch": 0.6383585890512399, + "grad_norm": 6.180291803378085, + "learning_rate": 3.981340576672059e-06, + "loss": 0.866, + "step": 8836 + }, + { + "epoch": 0.6384308342514494, + "grad_norm": 6.830445338373466, + "learning_rate": 3.981104943047549e-06, + "loss": 0.8222, + "step": 8837 + }, + { + "epoch": 0.638503079451659, + "grad_norm": 5.523904697024256, + "learning_rate": 3.980869289147861e-06, + "loss": 0.8527, + "step": 8838 + }, + { + "epoch": 0.6385753246518684, + "grad_norm": 5.934992210428064, + "learning_rate": 3.980633614976222e-06, + "loss": 0.8474, + "step": 8839 + }, + { + "epoch": 0.6386475698520779, + "grad_norm": 8.128728861892014, + "learning_rate": 3.9803979205358586e-06, + "loss": 0.8246, + "step": 8840 + }, + { + "epoch": 0.6387198150522875, + "grad_norm": 5.766874679312403, + "learning_rate": 3.980162205829998e-06, + "loss": 0.9161, + "step": 8841 + }, + { + "epoch": 0.638792060252497, + "grad_norm": 5.542123398088819, + "learning_rate": 3.979926470861866e-06, + "loss": 0.8759, + "step": 8842 + }, + { + "epoch": 0.6388643054527064, + "grad_norm": 5.334220196578954, + "learning_rate": 3.979690715634689e-06, + "loss": 0.8351, + "step": 8843 + }, + { + "epoch": 0.638936550652916, + "grad_norm": 6.418003818186299, + "learning_rate": 3.979454940151695e-06, + "loss": 0.8449, + "step": 8844 + }, + { + "epoch": 0.6390087958531255, + "grad_norm": 6.929042850687255, + "learning_rate": 3.9792191444161125e-06, + "loss": 0.823, + "step": 8845 + }, + { + "epoch": 0.639081041053335, + "grad_norm": 6.008649313876492, + "learning_rate": 3.978983328431168e-06, + "loss": 0.9188, + "step": 8846 + }, + { + "epoch": 0.6391532862535445, + "grad_norm": 7.126297146691908, + "learning_rate": 3.97874749220009e-06, + "loss": 0.8844, + "step": 8847 + }, + { + "epoch": 0.639225531453754, + "grad_norm": 6.658542005621426, + "learning_rate": 3.978511635726106e-06, + "loss": 0.9205, + "step": 8848 + }, + { + "epoch": 0.6392977766539636, + "grad_norm": 7.3119104913955075, + "learning_rate": 3.9782757590124475e-06, + "loss": 0.9773, + "step": 8849 + }, + { + "epoch": 0.639370021854173, + "grad_norm": 6.699996253624623, + "learning_rate": 3.978039862062341e-06, + "loss": 0.8971, + "step": 8850 + }, + { + "epoch": 0.6394422670543826, + "grad_norm": 5.844223900821193, + "learning_rate": 3.977803944879015e-06, + "loss": 0.8584, + "step": 8851 + }, + { + "epoch": 0.6395145122545921, + "grad_norm": 7.2032335649695485, + "learning_rate": 3.9775680074657025e-06, + "loss": 0.8739, + "step": 8852 + }, + { + "epoch": 0.6395867574548016, + "grad_norm": 5.41196029826712, + "learning_rate": 3.977332049825631e-06, + "loss": 0.7687, + "step": 8853 + }, + { + "epoch": 0.6396590026550111, + "grad_norm": 5.859122715922934, + "learning_rate": 3.97709607196203e-06, + "loss": 0.886, + "step": 8854 + }, + { + "epoch": 0.6397312478552206, + "grad_norm": 5.631006551850859, + "learning_rate": 3.97686007387813e-06, + "loss": 0.8265, + "step": 8855 + }, + { + "epoch": 0.6398034930554302, + "grad_norm": 6.1684223803064295, + "learning_rate": 3.976624055577163e-06, + "loss": 0.7754, + "step": 8856 + }, + { + "epoch": 0.6398757382556396, + "grad_norm": 5.972896713296245, + "learning_rate": 3.97638801706236e-06, + "loss": 0.8501, + "step": 8857 + }, + { + "epoch": 0.6399479834558491, + "grad_norm": 6.78660210559367, + "learning_rate": 3.97615195833695e-06, + "loss": 0.8954, + "step": 8858 + }, + { + "epoch": 0.6400202286560587, + "grad_norm": 4.805575653112547, + "learning_rate": 3.975915879404168e-06, + "loss": 0.7679, + "step": 8859 + }, + { + "epoch": 0.6400924738562682, + "grad_norm": 7.088799316148603, + "learning_rate": 3.975679780267241e-06, + "loss": 0.8567, + "step": 8860 + }, + { + "epoch": 0.6401647190564776, + "grad_norm": 5.4877395959915125, + "learning_rate": 3.975443660929406e-06, + "loss": 0.8429, + "step": 8861 + }, + { + "epoch": 0.6402369642566872, + "grad_norm": 5.6914022284646055, + "learning_rate": 3.975207521393891e-06, + "loss": 0.966, + "step": 8862 + }, + { + "epoch": 0.6403092094568967, + "grad_norm": 5.858441657434548, + "learning_rate": 3.9749713616639305e-06, + "loss": 0.943, + "step": 8863 + }, + { + "epoch": 0.6403814546571062, + "grad_norm": 6.408214137734365, + "learning_rate": 3.974735181742758e-06, + "loss": 0.8695, + "step": 8864 + }, + { + "epoch": 0.6404536998573157, + "grad_norm": 7.655068318007955, + "learning_rate": 3.974498981633606e-06, + "loss": 0.8906, + "step": 8865 + }, + { + "epoch": 0.6405259450575252, + "grad_norm": 6.035436886679037, + "learning_rate": 3.974262761339708e-06, + "loss": 0.8673, + "step": 8866 + }, + { + "epoch": 0.6405981902577348, + "grad_norm": 6.646670903568327, + "learning_rate": 3.974026520864297e-06, + "loss": 0.9299, + "step": 8867 + }, + { + "epoch": 0.6406704354579442, + "grad_norm": 5.861350903821116, + "learning_rate": 3.973790260210609e-06, + "loss": 0.9411, + "step": 8868 + }, + { + "epoch": 0.6407426806581538, + "grad_norm": 6.610178941390848, + "learning_rate": 3.973553979381874e-06, + "loss": 0.8134, + "step": 8869 + }, + { + "epoch": 0.6408149258583633, + "grad_norm": 5.38936823693521, + "learning_rate": 3.9733176783813305e-06, + "loss": 0.7965, + "step": 8870 + }, + { + "epoch": 0.6408871710585728, + "grad_norm": 6.479589254288953, + "learning_rate": 3.9730813572122116e-06, + "loss": 0.821, + "step": 8871 + }, + { + "epoch": 0.6409594162587823, + "grad_norm": 6.605745947022301, + "learning_rate": 3.972845015877753e-06, + "loss": 0.8963, + "step": 8872 + }, + { + "epoch": 0.6410316614589918, + "grad_norm": 6.237485028633085, + "learning_rate": 3.97260865438119e-06, + "loss": 0.8351, + "step": 8873 + }, + { + "epoch": 0.6411039066592014, + "grad_norm": 7.370253247655084, + "learning_rate": 3.972372272725758e-06, + "loss": 0.8388, + "step": 8874 + }, + { + "epoch": 0.6411761518594108, + "grad_norm": 7.020856847827447, + "learning_rate": 3.972135870914692e-06, + "loss": 0.8018, + "step": 8875 + }, + { + "epoch": 0.6412483970596203, + "grad_norm": 5.807658486406252, + "learning_rate": 3.971899448951229e-06, + "loss": 0.9365, + "step": 8876 + }, + { + "epoch": 0.6413206422598299, + "grad_norm": 6.786720425006268, + "learning_rate": 3.971663006838606e-06, + "loss": 0.8461, + "step": 8877 + }, + { + "epoch": 0.6413928874600394, + "grad_norm": 7.981827122218595, + "learning_rate": 3.9714265445800595e-06, + "loss": 0.8928, + "step": 8878 + }, + { + "epoch": 0.6414651326602488, + "grad_norm": 6.462520529974492, + "learning_rate": 3.971190062178825e-06, + "loss": 0.9296, + "step": 8879 + }, + { + "epoch": 0.6415373778604584, + "grad_norm": 6.4984897913185256, + "learning_rate": 3.970953559638141e-06, + "loss": 0.8593, + "step": 8880 + }, + { + "epoch": 0.6416096230606679, + "grad_norm": 6.693546022073007, + "learning_rate": 3.9707170369612455e-06, + "loss": 0.8248, + "step": 8881 + }, + { + "epoch": 0.6416818682608774, + "grad_norm": 7.062651776058571, + "learning_rate": 3.970480494151375e-06, + "loss": 0.8032, + "step": 8882 + }, + { + "epoch": 0.6417541134610869, + "grad_norm": 5.631284128100489, + "learning_rate": 3.970243931211769e-06, + "loss": 0.8549, + "step": 8883 + }, + { + "epoch": 0.6418263586612964, + "grad_norm": 5.439322506005763, + "learning_rate": 3.970007348145664e-06, + "loss": 0.7844, + "step": 8884 + }, + { + "epoch": 0.641898603861506, + "grad_norm": 5.860190698430611, + "learning_rate": 3.969770744956301e-06, + "loss": 0.8047, + "step": 8885 + }, + { + "epoch": 0.6419708490617154, + "grad_norm": 9.620506129284143, + "learning_rate": 3.969534121646918e-06, + "loss": 0.8094, + "step": 8886 + }, + { + "epoch": 0.642043094261925, + "grad_norm": 7.574941694079313, + "learning_rate": 3.9692974782207535e-06, + "loss": 0.7891, + "step": 8887 + }, + { + "epoch": 0.6421153394621345, + "grad_norm": 6.539782923090976, + "learning_rate": 3.969060814681047e-06, + "loss": 0.9308, + "step": 8888 + }, + { + "epoch": 0.642187584662344, + "grad_norm": 9.969661085055947, + "learning_rate": 3.968824131031039e-06, + "loss": 0.8971, + "step": 8889 + }, + { + "epoch": 0.6422598298625535, + "grad_norm": 6.590886221819099, + "learning_rate": 3.968587427273968e-06, + "loss": 0.8792, + "step": 8890 + }, + { + "epoch": 0.642332075062763, + "grad_norm": 6.829369333396998, + "learning_rate": 3.968350703413076e-06, + "loss": 0.8563, + "step": 8891 + }, + { + "epoch": 0.6424043202629726, + "grad_norm": 8.038153742519416, + "learning_rate": 3.968113959451604e-06, + "loss": 0.8369, + "step": 8892 + }, + { + "epoch": 0.642476565463182, + "grad_norm": 6.514698722003694, + "learning_rate": 3.9678771953927905e-06, + "loss": 0.8246, + "step": 8893 + }, + { + "epoch": 0.6425488106633915, + "grad_norm": 6.551341424692398, + "learning_rate": 3.967640411239879e-06, + "loss": 0.8128, + "step": 8894 + }, + { + "epoch": 0.6426210558636011, + "grad_norm": 8.627804065387824, + "learning_rate": 3.967403606996109e-06, + "loss": 0.7666, + "step": 8895 + }, + { + "epoch": 0.6426933010638106, + "grad_norm": 6.406065105468258, + "learning_rate": 3.967166782664723e-06, + "loss": 0.772, + "step": 8896 + }, + { + "epoch": 0.64276554626402, + "grad_norm": 6.578983205597294, + "learning_rate": 3.966929938248963e-06, + "loss": 0.8139, + "step": 8897 + }, + { + "epoch": 0.6428377914642296, + "grad_norm": 8.844239993051268, + "learning_rate": 3.966693073752072e-06, + "loss": 0.7874, + "step": 8898 + }, + { + "epoch": 0.6429100366644391, + "grad_norm": 6.706510529163673, + "learning_rate": 3.966456189177291e-06, + "loss": 0.9465, + "step": 8899 + }, + { + "epoch": 0.6429822818646486, + "grad_norm": 9.12843864134544, + "learning_rate": 3.966219284527864e-06, + "loss": 0.8858, + "step": 8900 + }, + { + "epoch": 0.6430545270648581, + "grad_norm": 6.775447208523455, + "learning_rate": 3.965982359807032e-06, + "loss": 0.7972, + "step": 8901 + }, + { + "epoch": 0.6431267722650676, + "grad_norm": 6.248571308875302, + "learning_rate": 3.965745415018041e-06, + "loss": 0.851, + "step": 8902 + }, + { + "epoch": 0.6431990174652772, + "grad_norm": 6.524245599274175, + "learning_rate": 3.9655084501641325e-06, + "loss": 0.8243, + "step": 8903 + }, + { + "epoch": 0.6432712626654866, + "grad_norm": 7.782305419920615, + "learning_rate": 3.965271465248551e-06, + "loss": 0.9182, + "step": 8904 + }, + { + "epoch": 0.6433435078656962, + "grad_norm": 10.024218795665771, + "learning_rate": 3.965034460274542e-06, + "loss": 0.8823, + "step": 8905 + }, + { + "epoch": 0.6434157530659057, + "grad_norm": 7.949484840832087, + "learning_rate": 3.964797435245348e-06, + "loss": 0.8555, + "step": 8906 + }, + { + "epoch": 0.6434879982661152, + "grad_norm": 6.509827154299677, + "learning_rate": 3.964560390164215e-06, + "loss": 0.8113, + "step": 8907 + }, + { + "epoch": 0.6435602434663247, + "grad_norm": 5.018422520120651, + "learning_rate": 3.964323325034387e-06, + "loss": 0.8175, + "step": 8908 + }, + { + "epoch": 0.6436324886665342, + "grad_norm": 6.227361343046909, + "learning_rate": 3.9640862398591096e-06, + "loss": 0.8038, + "step": 8909 + }, + { + "epoch": 0.6437047338667438, + "grad_norm": 6.587828666696489, + "learning_rate": 3.963849134641629e-06, + "loss": 0.8729, + "step": 8910 + }, + { + "epoch": 0.6437769790669532, + "grad_norm": 9.147811594595444, + "learning_rate": 3.963612009385189e-06, + "loss": 0.8939, + "step": 8911 + }, + { + "epoch": 0.6438492242671627, + "grad_norm": 8.913734690784516, + "learning_rate": 3.9633748640930384e-06, + "loss": 0.8978, + "step": 8912 + }, + { + "epoch": 0.6439214694673723, + "grad_norm": 6.921068958341843, + "learning_rate": 3.963137698768422e-06, + "loss": 0.9342, + "step": 8913 + }, + { + "epoch": 0.6439937146675818, + "grad_norm": 9.01671743137738, + "learning_rate": 3.962900513414586e-06, + "loss": 0.8922, + "step": 8914 + }, + { + "epoch": 0.6440659598677912, + "grad_norm": 8.450471528570212, + "learning_rate": 3.962663308034778e-06, + "loss": 0.8034, + "step": 8915 + }, + { + "epoch": 0.6441382050680008, + "grad_norm": 7.594273427497956, + "learning_rate": 3.962426082632246e-06, + "loss": 0.9164, + "step": 8916 + }, + { + "epoch": 0.6442104502682103, + "grad_norm": 8.16266108261926, + "learning_rate": 3.9621888372102344e-06, + "loss": 0.8875, + "step": 8917 + }, + { + "epoch": 0.6442826954684198, + "grad_norm": 9.487395909960949, + "learning_rate": 3.961951571771994e-06, + "loss": 0.7804, + "step": 8918 + }, + { + "epoch": 0.6443549406686293, + "grad_norm": 9.867981166739675, + "learning_rate": 3.961714286320772e-06, + "loss": 0.9331, + "step": 8919 + }, + { + "epoch": 0.6444271858688388, + "grad_norm": 7.355114264310545, + "learning_rate": 3.9614769808598156e-06, + "loss": 0.8371, + "step": 8920 + }, + { + "epoch": 0.6444994310690484, + "grad_norm": 5.959207466223028, + "learning_rate": 3.961239655392375e-06, + "loss": 0.8136, + "step": 8921 + }, + { + "epoch": 0.6445716762692578, + "grad_norm": 5.641899831321453, + "learning_rate": 3.961002309921698e-06, + "loss": 0.8379, + "step": 8922 + }, + { + "epoch": 0.6446439214694674, + "grad_norm": 11.418260746138042, + "learning_rate": 3.960764944451033e-06, + "loss": 0.8194, + "step": 8923 + }, + { + "epoch": 0.6447161666696769, + "grad_norm": 7.17038207628248, + "learning_rate": 3.96052755898363e-06, + "loss": 0.8467, + "step": 8924 + }, + { + "epoch": 0.6447884118698864, + "grad_norm": 8.273749182632343, + "learning_rate": 3.96029015352274e-06, + "loss": 0.8381, + "step": 8925 + }, + { + "epoch": 0.6448606570700959, + "grad_norm": 6.314976574634588, + "learning_rate": 3.960052728071611e-06, + "loss": 0.9022, + "step": 8926 + }, + { + "epoch": 0.6449329022703054, + "grad_norm": 8.654270648163262, + "learning_rate": 3.959815282633493e-06, + "loss": 0.833, + "step": 8927 + }, + { + "epoch": 0.645005147470515, + "grad_norm": 5.752945187155533, + "learning_rate": 3.9595778172116384e-06, + "loss": 0.8023, + "step": 8928 + }, + { + "epoch": 0.6450773926707244, + "grad_norm": 6.789733361920523, + "learning_rate": 3.959340331809296e-06, + "loss": 0.8922, + "step": 8929 + }, + { + "epoch": 0.6451496378709339, + "grad_norm": 6.1303961203006185, + "learning_rate": 3.959102826429718e-06, + "loss": 0.8515, + "step": 8930 + }, + { + "epoch": 0.6452218830711435, + "grad_norm": 8.262230332860625, + "learning_rate": 3.958865301076156e-06, + "loss": 0.8134, + "step": 8931 + }, + { + "epoch": 0.645294128271353, + "grad_norm": 7.730643433789365, + "learning_rate": 3.958627755751859e-06, + "loss": 0.8593, + "step": 8932 + }, + { + "epoch": 0.6453663734715624, + "grad_norm": 7.585416263230028, + "learning_rate": 3.9583901904600815e-06, + "loss": 0.7967, + "step": 8933 + }, + { + "epoch": 0.645438618671772, + "grad_norm": 5.416600720908915, + "learning_rate": 3.958152605204075e-06, + "loss": 0.8163, + "step": 8934 + }, + { + "epoch": 0.6455108638719815, + "grad_norm": 6.610605400397578, + "learning_rate": 3.957914999987091e-06, + "loss": 0.836, + "step": 8935 + }, + { + "epoch": 0.645583109072191, + "grad_norm": 5.882339438254622, + "learning_rate": 3.957677374812383e-06, + "loss": 0.8551, + "step": 8936 + }, + { + "epoch": 0.6456553542724005, + "grad_norm": 5.691767171230227, + "learning_rate": 3.957439729683203e-06, + "loss": 0.9016, + "step": 8937 + }, + { + "epoch": 0.64572759947261, + "grad_norm": 5.177607652400601, + "learning_rate": 3.957202064602805e-06, + "loss": 0.7567, + "step": 8938 + }, + { + "epoch": 0.6457998446728196, + "grad_norm": 9.049909171813074, + "learning_rate": 3.9569643795744425e-06, + "loss": 0.855, + "step": 8939 + }, + { + "epoch": 0.645872089873029, + "grad_norm": 10.24210109135938, + "learning_rate": 3.9567266746013686e-06, + "loss": 0.8897, + "step": 8940 + }, + { + "epoch": 0.6459443350732386, + "grad_norm": 6.559321042257376, + "learning_rate": 3.9564889496868385e-06, + "loss": 0.8537, + "step": 8941 + }, + { + "epoch": 0.6460165802734481, + "grad_norm": 7.026666800530931, + "learning_rate": 3.956251204834104e-06, + "loss": 0.7624, + "step": 8942 + }, + { + "epoch": 0.6460888254736576, + "grad_norm": 6.629084929180725, + "learning_rate": 3.956013440046422e-06, + "loss": 0.8064, + "step": 8943 + }, + { + "epoch": 0.6461610706738671, + "grad_norm": 8.734802126463803, + "learning_rate": 3.955775655327047e-06, + "loss": 0.8418, + "step": 8944 + }, + { + "epoch": 0.6462333158740766, + "grad_norm": 8.256880232084766, + "learning_rate": 3.955537850679233e-06, + "loss": 0.9639, + "step": 8945 + }, + { + "epoch": 0.6463055610742862, + "grad_norm": 6.885082827355876, + "learning_rate": 3.9553000261062365e-06, + "loss": 0.8467, + "step": 8946 + }, + { + "epoch": 0.6463778062744956, + "grad_norm": 7.869760617174506, + "learning_rate": 3.9550621816113125e-06, + "loss": 0.8768, + "step": 8947 + }, + { + "epoch": 0.6464500514747051, + "grad_norm": 6.031479589015853, + "learning_rate": 3.954824317197716e-06, + "loss": 0.838, + "step": 8948 + }, + { + "epoch": 0.6465222966749147, + "grad_norm": 6.549339249458084, + "learning_rate": 3.954586432868705e-06, + "loss": 0.8463, + "step": 8949 + }, + { + "epoch": 0.6465945418751242, + "grad_norm": 6.793789428484223, + "learning_rate": 3.954348528627535e-06, + "loss": 0.8532, + "step": 8950 + }, + { + "epoch": 0.6466667870753336, + "grad_norm": 7.033571394135165, + "learning_rate": 3.954110604477463e-06, + "loss": 0.8166, + "step": 8951 + }, + { + "epoch": 0.6467390322755432, + "grad_norm": 6.195341439678625, + "learning_rate": 3.953872660421746e-06, + "loss": 0.8556, + "step": 8952 + }, + { + "epoch": 0.6468112774757527, + "grad_norm": 7.525792340611313, + "learning_rate": 3.953634696463641e-06, + "loss": 0.8769, + "step": 8953 + }, + { + "epoch": 0.6468835226759622, + "grad_norm": 6.794226821127172, + "learning_rate": 3.953396712606405e-06, + "loss": 0.9084, + "step": 8954 + }, + { + "epoch": 0.6469557678761717, + "grad_norm": 6.760025209007435, + "learning_rate": 3.953158708853298e-06, + "loss": 0.8786, + "step": 8955 + }, + { + "epoch": 0.6470280130763812, + "grad_norm": 6.284607251018996, + "learning_rate": 3.952920685207575e-06, + "loss": 0.8292, + "step": 8956 + }, + { + "epoch": 0.6471002582765908, + "grad_norm": 7.1738095136567726, + "learning_rate": 3.952682641672497e-06, + "loss": 0.9663, + "step": 8957 + }, + { + "epoch": 0.6471725034768002, + "grad_norm": 5.610218547841164, + "learning_rate": 3.952444578251321e-06, + "loss": 0.8407, + "step": 8958 + }, + { + "epoch": 0.6472447486770098, + "grad_norm": 5.9434225960940825, + "learning_rate": 3.9522064949473065e-06, + "loss": 0.8115, + "step": 8959 + }, + { + "epoch": 0.6473169938772193, + "grad_norm": 7.027489500034539, + "learning_rate": 3.951968391763713e-06, + "loss": 0.9205, + "step": 8960 + }, + { + "epoch": 0.6473892390774288, + "grad_norm": 5.228152091804899, + "learning_rate": 3.9517302687037996e-06, + "loss": 0.7813, + "step": 8961 + }, + { + "epoch": 0.6474614842776383, + "grad_norm": 6.521524443479034, + "learning_rate": 3.951492125770826e-06, + "loss": 0.9276, + "step": 8962 + }, + { + "epoch": 0.6475337294778478, + "grad_norm": 4.867137731110916, + "learning_rate": 3.951253962968052e-06, + "loss": 0.7958, + "step": 8963 + }, + { + "epoch": 0.6476059746780574, + "grad_norm": 5.270407115404784, + "learning_rate": 3.951015780298738e-06, + "loss": 0.8618, + "step": 8964 + }, + { + "epoch": 0.6476782198782668, + "grad_norm": 6.265294092434709, + "learning_rate": 3.9507775777661445e-06, + "loss": 0.8659, + "step": 8965 + }, + { + "epoch": 0.6477504650784763, + "grad_norm": 6.572940913023161, + "learning_rate": 3.9505393553735325e-06, + "loss": 0.84, + "step": 8966 + }, + { + "epoch": 0.6478227102786859, + "grad_norm": 6.178259837753891, + "learning_rate": 3.950301113124163e-06, + "loss": 0.8579, + "step": 8967 + }, + { + "epoch": 0.6478949554788954, + "grad_norm": 6.030748099360431, + "learning_rate": 3.950062851021298e-06, + "loss": 0.8004, + "step": 8968 + }, + { + "epoch": 0.6479672006791048, + "grad_norm": 5.98668910769283, + "learning_rate": 3.949824569068198e-06, + "loss": 0.8068, + "step": 8969 + }, + { + "epoch": 0.6480394458793144, + "grad_norm": 6.493394576507784, + "learning_rate": 3.949586267268125e-06, + "loss": 0.8214, + "step": 8970 + }, + { + "epoch": 0.6481116910795239, + "grad_norm": 6.568553348867909, + "learning_rate": 3.949347945624342e-06, + "loss": 0.8644, + "step": 8971 + }, + { + "epoch": 0.6481839362797334, + "grad_norm": 6.2483698435595425, + "learning_rate": 3.949109604140111e-06, + "loss": 0.7739, + "step": 8972 + }, + { + "epoch": 0.6482561814799429, + "grad_norm": 5.769707782294213, + "learning_rate": 3.948871242818695e-06, + "loss": 0.8548, + "step": 8973 + }, + { + "epoch": 0.6483284266801524, + "grad_norm": 5.097940605095938, + "learning_rate": 3.948632861663355e-06, + "loss": 0.8821, + "step": 8974 + }, + { + "epoch": 0.648400671880362, + "grad_norm": 7.770040381288599, + "learning_rate": 3.948394460677358e-06, + "loss": 0.9094, + "step": 8975 + }, + { + "epoch": 0.6484729170805714, + "grad_norm": 7.297572361952274, + "learning_rate": 3.948156039863964e-06, + "loss": 0.8103, + "step": 8976 + }, + { + "epoch": 0.648545162280781, + "grad_norm": 6.954065289300321, + "learning_rate": 3.947917599226439e-06, + "loss": 0.8722, + "step": 8977 + }, + { + "epoch": 0.6486174074809905, + "grad_norm": 6.52957496883845, + "learning_rate": 3.947679138768046e-06, + "loss": 0.9429, + "step": 8978 + }, + { + "epoch": 0.6486896526812, + "grad_norm": 6.383275663083803, + "learning_rate": 3.94744065849205e-06, + "loss": 0.8463, + "step": 8979 + }, + { + "epoch": 0.6487618978814095, + "grad_norm": 6.751881090066274, + "learning_rate": 3.947202158401715e-06, + "loss": 0.8821, + "step": 8980 + }, + { + "epoch": 0.648834143081619, + "grad_norm": 6.076507431569385, + "learning_rate": 3.946963638500306e-06, + "loss": 0.9204, + "step": 8981 + }, + { + "epoch": 0.6489063882818286, + "grad_norm": 6.014049611086475, + "learning_rate": 3.946725098791089e-06, + "loss": 0.8997, + "step": 8982 + }, + { + "epoch": 0.648978633482038, + "grad_norm": 8.041543383139219, + "learning_rate": 3.946486539277328e-06, + "loss": 0.8285, + "step": 8983 + }, + { + "epoch": 0.6490508786822475, + "grad_norm": 6.995165381504544, + "learning_rate": 3.946247959962289e-06, + "loss": 0.8142, + "step": 8984 + }, + { + "epoch": 0.6491231238824571, + "grad_norm": 6.419804222948417, + "learning_rate": 3.946009360849239e-06, + "loss": 0.9177, + "step": 8985 + }, + { + "epoch": 0.6491953690826666, + "grad_norm": 6.778148592340223, + "learning_rate": 3.945770741941443e-06, + "loss": 0.8996, + "step": 8986 + }, + { + "epoch": 0.649267614282876, + "grad_norm": 6.569554488174651, + "learning_rate": 3.945532103242169e-06, + "loss": 0.9115, + "step": 8987 + }, + { + "epoch": 0.6493398594830856, + "grad_norm": 5.516998725109634, + "learning_rate": 3.945293444754682e-06, + "loss": 0.8628, + "step": 8988 + }, + { + "epoch": 0.6494121046832951, + "grad_norm": 5.823434696345247, + "learning_rate": 3.945054766482251e-06, + "loss": 0.896, + "step": 8989 + }, + { + "epoch": 0.6494843498835046, + "grad_norm": 7.496003675689936, + "learning_rate": 3.9448160684281414e-06, + "loss": 0.8396, + "step": 8990 + }, + { + "epoch": 0.6495565950837141, + "grad_norm": 7.00186541088714, + "learning_rate": 3.944577350595622e-06, + "loss": 0.8703, + "step": 8991 + }, + { + "epoch": 0.6496288402839236, + "grad_norm": 5.478532336204459, + "learning_rate": 3.94433861298796e-06, + "loss": 0.8916, + "step": 8992 + }, + { + "epoch": 0.6497010854841332, + "grad_norm": 5.9273250733897465, + "learning_rate": 3.944099855608424e-06, + "loss": 0.8986, + "step": 8993 + }, + { + "epoch": 0.6497733306843426, + "grad_norm": 6.243847985400904, + "learning_rate": 3.943861078460283e-06, + "loss": 0.8633, + "step": 8994 + }, + { + "epoch": 0.6498455758845522, + "grad_norm": 6.933880110723907, + "learning_rate": 3.9436222815468035e-06, + "loss": 0.8276, + "step": 8995 + }, + { + "epoch": 0.6499178210847617, + "grad_norm": 6.101496541036732, + "learning_rate": 3.943383464871257e-06, + "loss": 0.8273, + "step": 8996 + }, + { + "epoch": 0.6499900662849712, + "grad_norm": 6.098006576259291, + "learning_rate": 3.943144628436911e-06, + "loss": 0.7822, + "step": 8997 + }, + { + "epoch": 0.6500623114851807, + "grad_norm": 6.465779530083022, + "learning_rate": 3.942905772247036e-06, + "loss": 0.8337, + "step": 8998 + }, + { + "epoch": 0.6501345566853902, + "grad_norm": 6.1842488696713405, + "learning_rate": 3.942666896304901e-06, + "loss": 0.872, + "step": 8999 + }, + { + "epoch": 0.6502068018855998, + "grad_norm": 6.2136817099901736, + "learning_rate": 3.942428000613776e-06, + "loss": 0.8215, + "step": 9000 + }, + { + "epoch": 0.6502790470858092, + "grad_norm": 5.993137249537748, + "learning_rate": 3.942189085176933e-06, + "loss": 0.9213, + "step": 9001 + }, + { + "epoch": 0.6503512922860187, + "grad_norm": 6.518884077000833, + "learning_rate": 3.94195014999764e-06, + "loss": 0.8824, + "step": 9002 + }, + { + "epoch": 0.6504235374862283, + "grad_norm": 8.63732295206798, + "learning_rate": 3.941711195079169e-06, + "loss": 0.8762, + "step": 9003 + }, + { + "epoch": 0.6504957826864378, + "grad_norm": 5.191510040876026, + "learning_rate": 3.941472220424791e-06, + "loss": 0.8376, + "step": 9004 + }, + { + "epoch": 0.6505680278866472, + "grad_norm": 6.164607382939405, + "learning_rate": 3.941233226037778e-06, + "loss": 0.9631, + "step": 9005 + }, + { + "epoch": 0.6506402730868568, + "grad_norm": 6.6695892921291335, + "learning_rate": 3.940994211921401e-06, + "loss": 0.8197, + "step": 9006 + }, + { + "epoch": 0.6507125182870663, + "grad_norm": 5.527098913802258, + "learning_rate": 3.940755178078933e-06, + "loss": 0.8823, + "step": 9007 + }, + { + "epoch": 0.6507847634872758, + "grad_norm": 6.815978395866577, + "learning_rate": 3.9405161245136444e-06, + "loss": 0.8753, + "step": 9008 + }, + { + "epoch": 0.6508570086874853, + "grad_norm": 6.795164677952131, + "learning_rate": 3.940277051228808e-06, + "loss": 0.8408, + "step": 9009 + }, + { + "epoch": 0.6509292538876948, + "grad_norm": 6.232038093401502, + "learning_rate": 3.940037958227698e-06, + "loss": 0.8674, + "step": 9010 + }, + { + "epoch": 0.6510014990879044, + "grad_norm": 5.1075279326942455, + "learning_rate": 3.9397988455135865e-06, + "loss": 0.8046, + "step": 9011 + }, + { + "epoch": 0.6510737442881138, + "grad_norm": 4.882536710961498, + "learning_rate": 3.939559713089747e-06, + "loss": 0.854, + "step": 9012 + }, + { + "epoch": 0.6511459894883234, + "grad_norm": 5.830345732970455, + "learning_rate": 3.939320560959452e-06, + "loss": 0.8762, + "step": 9013 + }, + { + "epoch": 0.6512182346885329, + "grad_norm": 5.431941381067951, + "learning_rate": 3.939081389125976e-06, + "loss": 0.9098, + "step": 9014 + }, + { + "epoch": 0.6512904798887424, + "grad_norm": 6.4444036409394405, + "learning_rate": 3.9388421975925935e-06, + "loss": 0.8024, + "step": 9015 + }, + { + "epoch": 0.6513627250889519, + "grad_norm": 10.786593586313014, + "learning_rate": 3.938602986362579e-06, + "loss": 0.8968, + "step": 9016 + }, + { + "epoch": 0.6514349702891614, + "grad_norm": 7.556769603852107, + "learning_rate": 3.938363755439205e-06, + "loss": 0.7988, + "step": 9017 + }, + { + "epoch": 0.651507215489371, + "grad_norm": 6.2704773186116105, + "learning_rate": 3.93812450482575e-06, + "loss": 0.9045, + "step": 9018 + }, + { + "epoch": 0.6515794606895804, + "grad_norm": 6.375035603741724, + "learning_rate": 3.937885234525486e-06, + "loss": 0.8794, + "step": 9019 + }, + { + "epoch": 0.6516517058897899, + "grad_norm": 5.593565016090668, + "learning_rate": 3.93764594454169e-06, + "loss": 0.8634, + "step": 9020 + }, + { + "epoch": 0.6517239510899995, + "grad_norm": 7.004332427862756, + "learning_rate": 3.937406634877637e-06, + "loss": 0.7927, + "step": 9021 + }, + { + "epoch": 0.651796196290209, + "grad_norm": 8.214195933354826, + "learning_rate": 3.9371673055366035e-06, + "loss": 0.8046, + "step": 9022 + }, + { + "epoch": 0.6518684414904184, + "grad_norm": 5.730297998798082, + "learning_rate": 3.936927956521865e-06, + "loss": 0.8312, + "step": 9023 + }, + { + "epoch": 0.651940686690628, + "grad_norm": 6.087711726899051, + "learning_rate": 3.936688587836699e-06, + "loss": 0.7981, + "step": 9024 + }, + { + "epoch": 0.6520129318908375, + "grad_norm": 7.483276796304372, + "learning_rate": 3.936449199484382e-06, + "loss": 0.8626, + "step": 9025 + }, + { + "epoch": 0.652085177091047, + "grad_norm": 6.611424482311111, + "learning_rate": 3.936209791468191e-06, + "loss": 0.8889, + "step": 9026 + }, + { + "epoch": 0.6521574222912565, + "grad_norm": 6.994666929025144, + "learning_rate": 3.935970363791402e-06, + "loss": 0.8728, + "step": 9027 + }, + { + "epoch": 0.652229667491466, + "grad_norm": 5.7121624135240205, + "learning_rate": 3.935730916457295e-06, + "loss": 0.8292, + "step": 9028 + }, + { + "epoch": 0.6523019126916756, + "grad_norm": 6.9897658603294115, + "learning_rate": 3.935491449469144e-06, + "loss": 0.7834, + "step": 9029 + }, + { + "epoch": 0.652374157891885, + "grad_norm": 8.667676084373094, + "learning_rate": 3.935251962830232e-06, + "loss": 0.9806, + "step": 9030 + }, + { + "epoch": 0.6524464030920946, + "grad_norm": 7.900003129016884, + "learning_rate": 3.935012456543834e-06, + "loss": 0.8307, + "step": 9031 + }, + { + "epoch": 0.6525186482923041, + "grad_norm": 6.529527939094975, + "learning_rate": 3.934772930613231e-06, + "loss": 0.8642, + "step": 9032 + }, + { + "epoch": 0.6525908934925136, + "grad_norm": 5.475874962415039, + "learning_rate": 3.934533385041699e-06, + "loss": 0.8472, + "step": 9033 + }, + { + "epoch": 0.6526631386927231, + "grad_norm": 6.113458151893343, + "learning_rate": 3.93429381983252e-06, + "loss": 0.8378, + "step": 9034 + }, + { + "epoch": 0.6527353838929326, + "grad_norm": 5.864279522883418, + "learning_rate": 3.934054234988972e-06, + "loss": 0.8448, + "step": 9035 + }, + { + "epoch": 0.6528076290931422, + "grad_norm": 5.915980805237577, + "learning_rate": 3.933814630514334e-06, + "loss": 0.841, + "step": 9036 + }, + { + "epoch": 0.6528798742933516, + "grad_norm": 7.07027473228863, + "learning_rate": 3.9335750064118885e-06, + "loss": 0.8585, + "step": 9037 + }, + { + "epoch": 0.6529521194935611, + "grad_norm": 5.6474329898095865, + "learning_rate": 3.933335362684913e-06, + "loss": 0.8387, + "step": 9038 + }, + { + "epoch": 0.6530243646937707, + "grad_norm": 6.047994978140709, + "learning_rate": 3.93309569933669e-06, + "loss": 0.8109, + "step": 9039 + }, + { + "epoch": 0.6530966098939802, + "grad_norm": 6.146564435846554, + "learning_rate": 3.932856016370499e-06, + "loss": 0.812, + "step": 9040 + }, + { + "epoch": 0.6531688550941896, + "grad_norm": 6.572787695182121, + "learning_rate": 3.932616313789622e-06, + "loss": 0.8378, + "step": 9041 + }, + { + "epoch": 0.6532411002943992, + "grad_norm": 6.751695208138009, + "learning_rate": 3.93237659159734e-06, + "loss": 0.9239, + "step": 9042 + }, + { + "epoch": 0.6533133454946087, + "grad_norm": 6.482299178279166, + "learning_rate": 3.9321368497969345e-06, + "loss": 0.8845, + "step": 9043 + }, + { + "epoch": 0.6533855906948182, + "grad_norm": 6.593459281117011, + "learning_rate": 3.931897088391688e-06, + "loss": 0.825, + "step": 9044 + }, + { + "epoch": 0.6534578358950277, + "grad_norm": 7.904740381676617, + "learning_rate": 3.931657307384882e-06, + "loss": 0.8724, + "step": 9045 + }, + { + "epoch": 0.6535300810952372, + "grad_norm": 7.796975052980908, + "learning_rate": 3.931417506779799e-06, + "loss": 0.8636, + "step": 9046 + }, + { + "epoch": 0.6536023262954468, + "grad_norm": 5.608992964717202, + "learning_rate": 3.9311776865797215e-06, + "loss": 0.911, + "step": 9047 + }, + { + "epoch": 0.6536745714956562, + "grad_norm": 6.625041457712265, + "learning_rate": 3.930937846787933e-06, + "loss": 0.8276, + "step": 9048 + }, + { + "epoch": 0.6537468166958658, + "grad_norm": 5.854451381868844, + "learning_rate": 3.930697987407716e-06, + "loss": 0.8744, + "step": 9049 + }, + { + "epoch": 0.6538190618960753, + "grad_norm": 6.130589017712863, + "learning_rate": 3.930458108442355e-06, + "loss": 0.8413, + "step": 9050 + }, + { + "epoch": 0.6538913070962847, + "grad_norm": 6.72360409001218, + "learning_rate": 3.930218209895133e-06, + "loss": 0.8186, + "step": 9051 + }, + { + "epoch": 0.6539635522964943, + "grad_norm": 5.1263710607090465, + "learning_rate": 3.929978291769334e-06, + "loss": 0.874, + "step": 9052 + }, + { + "epoch": 0.6540357974967038, + "grad_norm": 6.4913948025007375, + "learning_rate": 3.929738354068244e-06, + "loss": 0.8287, + "step": 9053 + }, + { + "epoch": 0.6541080426969134, + "grad_norm": 5.2878553627305465, + "learning_rate": 3.929498396795144e-06, + "loss": 0.8211, + "step": 9054 + }, + { + "epoch": 0.6541802878971228, + "grad_norm": 4.834066839574382, + "learning_rate": 3.929258419953321e-06, + "loss": 0.7897, + "step": 9055 + }, + { + "epoch": 0.6542525330973323, + "grad_norm": 8.077142711070543, + "learning_rate": 3.9290184235460606e-06, + "loss": 0.8557, + "step": 9056 + }, + { + "epoch": 0.6543247782975419, + "grad_norm": 5.367142012179876, + "learning_rate": 3.928778407576648e-06, + "loss": 0.8041, + "step": 9057 + }, + { + "epoch": 0.6543970234977514, + "grad_norm": 6.415024419248967, + "learning_rate": 3.928538372048367e-06, + "loss": 0.8318, + "step": 9058 + }, + { + "epoch": 0.6544692686979608, + "grad_norm": 7.873146686758776, + "learning_rate": 3.928298316964506e-06, + "loss": 0.895, + "step": 9059 + }, + { + "epoch": 0.6545415138981704, + "grad_norm": 6.9949104334977905, + "learning_rate": 3.928058242328349e-06, + "loss": 0.8477, + "step": 9060 + }, + { + "epoch": 0.6546137590983799, + "grad_norm": 5.5735957887596665, + "learning_rate": 3.927818148143185e-06, + "loss": 0.8008, + "step": 9061 + }, + { + "epoch": 0.6546860042985894, + "grad_norm": 6.732035715190872, + "learning_rate": 3.927578034412298e-06, + "loss": 0.7843, + "step": 9062 + }, + { + "epoch": 0.6547582494987989, + "grad_norm": 5.4471572531461705, + "learning_rate": 3.927337901138977e-06, + "loss": 0.8104, + "step": 9063 + }, + { + "epoch": 0.6548304946990084, + "grad_norm": 5.935238698708295, + "learning_rate": 3.927097748326508e-06, + "loss": 0.7887, + "step": 9064 + }, + { + "epoch": 0.654902739899218, + "grad_norm": 6.343091808431086, + "learning_rate": 3.9268575759781795e-06, + "loss": 0.968, + "step": 9065 + }, + { + "epoch": 0.6549749850994274, + "grad_norm": 6.432717037164359, + "learning_rate": 3.926617384097278e-06, + "loss": 0.8048, + "step": 9066 + }, + { + "epoch": 0.655047230299637, + "grad_norm": 5.927812563837708, + "learning_rate": 3.926377172687092e-06, + "loss": 0.8493, + "step": 9067 + }, + { + "epoch": 0.6551194754998465, + "grad_norm": 5.283829098775656, + "learning_rate": 3.92613694175091e-06, + "loss": 0.8566, + "step": 9068 + }, + { + "epoch": 0.6551917207000559, + "grad_norm": 5.778035357838817, + "learning_rate": 3.925896691292021e-06, + "loss": 0.8718, + "step": 9069 + }, + { + "epoch": 0.6552639659002655, + "grad_norm": 6.056942939286731, + "learning_rate": 3.925656421313713e-06, + "loss": 0.8385, + "step": 9070 + }, + { + "epoch": 0.655336211100475, + "grad_norm": 6.435702174719512, + "learning_rate": 3.925416131819276e-06, + "loss": 0.9599, + "step": 9071 + }, + { + "epoch": 0.6554084563006846, + "grad_norm": 7.970446836038321, + "learning_rate": 3.925175822811999e-06, + "loss": 0.8988, + "step": 9072 + }, + { + "epoch": 0.655480701500894, + "grad_norm": 5.57724445135463, + "learning_rate": 3.924935494295171e-06, + "loss": 0.818, + "step": 9073 + }, + { + "epoch": 0.6555529467011035, + "grad_norm": 5.516950842384771, + "learning_rate": 3.924695146272083e-06, + "loss": 0.814, + "step": 9074 + }, + { + "epoch": 0.6556251919013131, + "grad_norm": 6.73378586460191, + "learning_rate": 3.924454778746024e-06, + "loss": 0.9293, + "step": 9075 + }, + { + "epoch": 0.6556974371015226, + "grad_norm": 5.844955891099306, + "learning_rate": 3.924214391720285e-06, + "loss": 0.885, + "step": 9076 + }, + { + "epoch": 0.655769682301732, + "grad_norm": 6.126026301134162, + "learning_rate": 3.923973985198158e-06, + "loss": 0.8512, + "step": 9077 + }, + { + "epoch": 0.6558419275019416, + "grad_norm": 6.016312042239798, + "learning_rate": 3.9237335591829325e-06, + "loss": 0.8249, + "step": 9078 + }, + { + "epoch": 0.6559141727021511, + "grad_norm": 5.430906836727534, + "learning_rate": 3.923493113677899e-06, + "loss": 0.7953, + "step": 9079 + }, + { + "epoch": 0.6559864179023606, + "grad_norm": 5.662229633402969, + "learning_rate": 3.923252648686351e-06, + "loss": 0.7528, + "step": 9080 + }, + { + "epoch": 0.6560586631025701, + "grad_norm": 5.406848499487584, + "learning_rate": 3.923012164211579e-06, + "loss": 0.8134, + "step": 9081 + }, + { + "epoch": 0.6561309083027796, + "grad_norm": 5.43883138558462, + "learning_rate": 3.9227716602568755e-06, + "loss": 0.8055, + "step": 9082 + }, + { + "epoch": 0.6562031535029892, + "grad_norm": 6.101234261137441, + "learning_rate": 3.922531136825532e-06, + "loss": 0.8775, + "step": 9083 + }, + { + "epoch": 0.6562753987031986, + "grad_norm": 6.158293254276225, + "learning_rate": 3.922290593920843e-06, + "loss": 0.7598, + "step": 9084 + }, + { + "epoch": 0.6563476439034082, + "grad_norm": 5.718479838008948, + "learning_rate": 3.9220500315461e-06, + "loss": 0.8491, + "step": 9085 + }, + { + "epoch": 0.6564198891036177, + "grad_norm": 5.491460065568482, + "learning_rate": 3.921809449704595e-06, + "loss": 0.8688, + "step": 9086 + }, + { + "epoch": 0.6564921343038271, + "grad_norm": 8.096432274537463, + "learning_rate": 3.921568848399623e-06, + "loss": 0.869, + "step": 9087 + }, + { + "epoch": 0.6565643795040367, + "grad_norm": 5.386702639173243, + "learning_rate": 3.921328227634478e-06, + "loss": 0.7914, + "step": 9088 + }, + { + "epoch": 0.6566366247042462, + "grad_norm": 5.987644507589573, + "learning_rate": 3.921087587412453e-06, + "loss": 0.861, + "step": 9089 + }, + { + "epoch": 0.6567088699044558, + "grad_norm": 6.952253445639918, + "learning_rate": 3.920846927736841e-06, + "loss": 0.8787, + "step": 9090 + }, + { + "epoch": 0.6567811151046652, + "grad_norm": 8.826532973153835, + "learning_rate": 3.9206062486109395e-06, + "loss": 0.8904, + "step": 9091 + }, + { + "epoch": 0.6568533603048747, + "grad_norm": 6.760662276200594, + "learning_rate": 3.920365550038041e-06, + "loss": 0.8143, + "step": 9092 + }, + { + "epoch": 0.6569256055050843, + "grad_norm": 7.215382240411394, + "learning_rate": 3.92012483202144e-06, + "loss": 0.9014, + "step": 9093 + }, + { + "epoch": 0.6569978507052938, + "grad_norm": 5.822558160526595, + "learning_rate": 3.919884094564434e-06, + "loss": 0.8239, + "step": 9094 + }, + { + "epoch": 0.6570700959055032, + "grad_norm": 5.893539395822125, + "learning_rate": 3.919643337670316e-06, + "loss": 0.8548, + "step": 9095 + }, + { + "epoch": 0.6571423411057128, + "grad_norm": 5.413959804953609, + "learning_rate": 3.919402561342384e-06, + "loss": 0.8549, + "step": 9096 + }, + { + "epoch": 0.6572145863059223, + "grad_norm": 8.487215637926726, + "learning_rate": 3.9191617655839324e-06, + "loss": 0.9271, + "step": 9097 + }, + { + "epoch": 0.6572868315061318, + "grad_norm": 5.713722228469071, + "learning_rate": 3.918920950398259e-06, + "loss": 0.8179, + "step": 9098 + }, + { + "epoch": 0.6573590767063413, + "grad_norm": 6.934789729598872, + "learning_rate": 3.918680115788658e-06, + "loss": 0.9247, + "step": 9099 + }, + { + "epoch": 0.6574313219065508, + "grad_norm": 6.52916132917188, + "learning_rate": 3.918439261758429e-06, + "loss": 0.8084, + "step": 9100 + }, + { + "epoch": 0.6575035671067604, + "grad_norm": 6.368096297297973, + "learning_rate": 3.9181983883108676e-06, + "loss": 0.7446, + "step": 9101 + }, + { + "epoch": 0.6575758123069698, + "grad_norm": 6.509227658351792, + "learning_rate": 3.9179574954492714e-06, + "loss": 0.9312, + "step": 9102 + }, + { + "epoch": 0.6576480575071794, + "grad_norm": 5.999998410542595, + "learning_rate": 3.917716583176938e-06, + "loss": 0.8898, + "step": 9103 + }, + { + "epoch": 0.6577203027073889, + "grad_norm": 6.670721378693477, + "learning_rate": 3.917475651497164e-06, + "loss": 0.8037, + "step": 9104 + }, + { + "epoch": 0.6577925479075983, + "grad_norm": 7.771716547486946, + "learning_rate": 3.917234700413252e-06, + "loss": 0.9096, + "step": 9105 + }, + { + "epoch": 0.6578647931078079, + "grad_norm": 6.916353731851184, + "learning_rate": 3.916993729928495e-06, + "loss": 0.9388, + "step": 9106 + }, + { + "epoch": 0.6579370383080174, + "grad_norm": 6.175591926157988, + "learning_rate": 3.916752740046195e-06, + "loss": 0.7842, + "step": 9107 + }, + { + "epoch": 0.658009283508227, + "grad_norm": 6.454933042526165, + "learning_rate": 3.916511730769649e-06, + "loss": 0.8383, + "step": 9108 + }, + { + "epoch": 0.6580815287084364, + "grad_norm": 6.6759383895709865, + "learning_rate": 3.916270702102158e-06, + "loss": 0.8229, + "step": 9109 + }, + { + "epoch": 0.6581537739086459, + "grad_norm": 8.337541763681475, + "learning_rate": 3.916029654047021e-06, + "loss": 0.8542, + "step": 9110 + }, + { + "epoch": 0.6582260191088555, + "grad_norm": 7.482953599613899, + "learning_rate": 3.9157885866075375e-06, + "loss": 0.8864, + "step": 9111 + }, + { + "epoch": 0.658298264309065, + "grad_norm": 5.598429677682576, + "learning_rate": 3.915547499787008e-06, + "loss": 0.8046, + "step": 9112 + }, + { + "epoch": 0.6583705095092744, + "grad_norm": 7.017686298915779, + "learning_rate": 3.9153063935887305e-06, + "loss": 0.8652, + "step": 9113 + }, + { + "epoch": 0.658442754709484, + "grad_norm": 5.882680539377363, + "learning_rate": 3.915065268016009e-06, + "loss": 0.8394, + "step": 9114 + }, + { + "epoch": 0.6585149999096935, + "grad_norm": 7.6483020867896565, + "learning_rate": 3.9148241230721415e-06, + "loss": 0.899, + "step": 9115 + }, + { + "epoch": 0.658587245109903, + "grad_norm": 6.886908188333227, + "learning_rate": 3.9145829587604314e-06, + "loss": 0.8446, + "step": 9116 + }, + { + "epoch": 0.6586594903101125, + "grad_norm": 6.814361317990054, + "learning_rate": 3.9143417750841785e-06, + "loss": 0.8781, + "step": 9117 + }, + { + "epoch": 0.658731735510322, + "grad_norm": 6.307984200428181, + "learning_rate": 3.914100572046685e-06, + "loss": 0.7993, + "step": 9118 + }, + { + "epoch": 0.6588039807105316, + "grad_norm": 6.7391190602338975, + "learning_rate": 3.913859349651253e-06, + "loss": 0.8496, + "step": 9119 + }, + { + "epoch": 0.658876225910741, + "grad_norm": 5.687649903051148, + "learning_rate": 3.913618107901184e-06, + "loss": 0.8927, + "step": 9120 + }, + { + "epoch": 0.6589484711109506, + "grad_norm": 5.782936963543387, + "learning_rate": 3.91337684679978e-06, + "loss": 0.8591, + "step": 9121 + }, + { + "epoch": 0.6590207163111601, + "grad_norm": 8.41576519157739, + "learning_rate": 3.913135566350345e-06, + "loss": 0.8868, + "step": 9122 + }, + { + "epoch": 0.6590929615113695, + "grad_norm": 6.861911869382602, + "learning_rate": 3.912894266556182e-06, + "loss": 0.8705, + "step": 9123 + }, + { + "epoch": 0.6591652067115791, + "grad_norm": 5.268914980584887, + "learning_rate": 3.912652947420592e-06, + "loss": 0.7518, + "step": 9124 + }, + { + "epoch": 0.6592374519117886, + "grad_norm": 6.5174086802779, + "learning_rate": 3.912411608946881e-06, + "loss": 0.9465, + "step": 9125 + }, + { + "epoch": 0.6593096971119982, + "grad_norm": 8.776819163368062, + "learning_rate": 3.912170251138352e-06, + "loss": 0.8435, + "step": 9126 + }, + { + "epoch": 0.6593819423122076, + "grad_norm": 5.955371502899616, + "learning_rate": 3.911928873998308e-06, + "loss": 0.872, + "step": 9127 + }, + { + "epoch": 0.6594541875124171, + "grad_norm": 5.722164614653904, + "learning_rate": 3.911687477530054e-06, + "loss": 0.8693, + "step": 9128 + }, + { + "epoch": 0.6595264327126267, + "grad_norm": 8.066789299319115, + "learning_rate": 3.911446061736895e-06, + "loss": 0.8216, + "step": 9129 + }, + { + "epoch": 0.6595986779128362, + "grad_norm": 5.438676728702136, + "learning_rate": 3.911204626622135e-06, + "loss": 0.8652, + "step": 9130 + }, + { + "epoch": 0.6596709231130456, + "grad_norm": 5.348916216730593, + "learning_rate": 3.91096317218908e-06, + "loss": 0.7859, + "step": 9131 + }, + { + "epoch": 0.6597431683132552, + "grad_norm": 6.342586617365134, + "learning_rate": 3.910721698441034e-06, + "loss": 0.874, + "step": 9132 + }, + { + "epoch": 0.6598154135134647, + "grad_norm": 6.383105940125943, + "learning_rate": 3.910480205381304e-06, + "loss": 0.8443, + "step": 9133 + }, + { + "epoch": 0.6598876587136742, + "grad_norm": 6.0662257750440345, + "learning_rate": 3.910238693013194e-06, + "loss": 0.8091, + "step": 9134 + }, + { + "epoch": 0.6599599039138837, + "grad_norm": 7.450819382635459, + "learning_rate": 3.909997161340013e-06, + "loss": 0.8529, + "step": 9135 + }, + { + "epoch": 0.6600321491140932, + "grad_norm": 7.179934027323562, + "learning_rate": 3.9097556103650635e-06, + "loss": 0.936, + "step": 9136 + }, + { + "epoch": 0.6601043943143028, + "grad_norm": 7.526326069633531, + "learning_rate": 3.909514040091656e-06, + "loss": 0.9578, + "step": 9137 + }, + { + "epoch": 0.6601766395145122, + "grad_norm": 5.772634654302209, + "learning_rate": 3.909272450523095e-06, + "loss": 0.8597, + "step": 9138 + }, + { + "epoch": 0.6602488847147218, + "grad_norm": 10.612475744069195, + "learning_rate": 3.909030841662689e-06, + "loss": 0.8264, + "step": 9139 + }, + { + "epoch": 0.6603211299149313, + "grad_norm": 6.970261208936203, + "learning_rate": 3.908789213513744e-06, + "loss": 0.9623, + "step": 9140 + }, + { + "epoch": 0.6603933751151407, + "grad_norm": 6.242225389043333, + "learning_rate": 3.908547566079569e-06, + "loss": 0.9068, + "step": 9141 + }, + { + "epoch": 0.6604656203153503, + "grad_norm": 6.915258271861777, + "learning_rate": 3.908305899363472e-06, + "loss": 0.7967, + "step": 9142 + }, + { + "epoch": 0.6605378655155598, + "grad_norm": 7.355076143764006, + "learning_rate": 3.90806421336876e-06, + "loss": 0.9146, + "step": 9143 + }, + { + "epoch": 0.6606101107157694, + "grad_norm": 7.910949516550966, + "learning_rate": 3.907822508098742e-06, + "loss": 0.9376, + "step": 9144 + }, + { + "epoch": 0.6606823559159788, + "grad_norm": 6.111276316095511, + "learning_rate": 3.907580783556727e-06, + "loss": 0.9047, + "step": 9145 + }, + { + "epoch": 0.6607546011161883, + "grad_norm": 7.22810266526759, + "learning_rate": 3.907339039746024e-06, + "loss": 0.8218, + "step": 9146 + }, + { + "epoch": 0.6608268463163979, + "grad_norm": 9.372297380627547, + "learning_rate": 3.907097276669942e-06, + "loss": 0.8534, + "step": 9147 + }, + { + "epoch": 0.6608990915166074, + "grad_norm": 5.392517111604765, + "learning_rate": 3.9068554943317925e-06, + "loss": 0.7965, + "step": 9148 + }, + { + "epoch": 0.6609713367168168, + "grad_norm": 6.52010083781706, + "learning_rate": 3.906613692734882e-06, + "loss": 0.8173, + "step": 9149 + }, + { + "epoch": 0.6610435819170264, + "grad_norm": 7.302747172946128, + "learning_rate": 3.9063718718825225e-06, + "loss": 0.8228, + "step": 9150 + }, + { + "epoch": 0.6611158271172359, + "grad_norm": 6.935728379594862, + "learning_rate": 3.906130031778025e-06, + "loss": 0.8565, + "step": 9151 + }, + { + "epoch": 0.6611880723174454, + "grad_norm": 5.686800064782183, + "learning_rate": 3.905888172424698e-06, + "loss": 0.7741, + "step": 9152 + }, + { + "epoch": 0.6612603175176549, + "grad_norm": 7.185593592706243, + "learning_rate": 3.905646293825854e-06, + "loss": 0.9534, + "step": 9153 + }, + { + "epoch": 0.6613325627178644, + "grad_norm": 7.633477173273188, + "learning_rate": 3.905404395984803e-06, + "loss": 0.8537, + "step": 9154 + }, + { + "epoch": 0.661404807918074, + "grad_norm": 6.19805624241036, + "learning_rate": 3.905162478904858e-06, + "loss": 0.791, + "step": 9155 + }, + { + "epoch": 0.6614770531182834, + "grad_norm": 5.3521902788148, + "learning_rate": 3.90492054258933e-06, + "loss": 0.9297, + "step": 9156 + }, + { + "epoch": 0.661549298318493, + "grad_norm": 7.33133274301599, + "learning_rate": 3.90467858704153e-06, + "loss": 0.883, + "step": 9157 + }, + { + "epoch": 0.6616215435187025, + "grad_norm": 5.771793363661982, + "learning_rate": 3.90443661226477e-06, + "loss": 0.854, + "step": 9158 + }, + { + "epoch": 0.6616937887189119, + "grad_norm": 5.394910618109681, + "learning_rate": 3.904194618262364e-06, + "loss": 0.8332, + "step": 9159 + }, + { + "epoch": 0.6617660339191215, + "grad_norm": 5.996630676285929, + "learning_rate": 3.9039526050376245e-06, + "loss": 0.934, + "step": 9160 + }, + { + "epoch": 0.661838279119331, + "grad_norm": 6.948042274113702, + "learning_rate": 3.903710572593863e-06, + "loss": 0.8508, + "step": 9161 + }, + { + "epoch": 0.6619105243195406, + "grad_norm": 5.383000651532963, + "learning_rate": 3.903468520934394e-06, + "loss": 0.9314, + "step": 9162 + }, + { + "epoch": 0.66198276951975, + "grad_norm": 6.242306360824705, + "learning_rate": 3.903226450062531e-06, + "loss": 0.7335, + "step": 9163 + }, + { + "epoch": 0.6620550147199595, + "grad_norm": 7.255225304562961, + "learning_rate": 3.902984359981587e-06, + "loss": 0.8842, + "step": 9164 + }, + { + "epoch": 0.6621272599201691, + "grad_norm": 7.8176457662872165, + "learning_rate": 3.902742250694877e-06, + "loss": 0.852, + "step": 9165 + }, + { + "epoch": 0.6621995051203786, + "grad_norm": 6.127507844668549, + "learning_rate": 3.902500122205714e-06, + "loss": 0.8371, + "step": 9166 + }, + { + "epoch": 0.662271750320588, + "grad_norm": 6.137894739326725, + "learning_rate": 3.902257974517414e-06, + "loss": 0.9059, + "step": 9167 + }, + { + "epoch": 0.6623439955207976, + "grad_norm": 5.4620634345355805, + "learning_rate": 3.9020158076332905e-06, + "loss": 0.7654, + "step": 9168 + }, + { + "epoch": 0.6624162407210071, + "grad_norm": 5.749962184615802, + "learning_rate": 3.9017736215566594e-06, + "loss": 0.8728, + "step": 9169 + }, + { + "epoch": 0.6624884859212166, + "grad_norm": 6.918229290424868, + "learning_rate": 3.901531416290836e-06, + "loss": 0.7879, + "step": 9170 + }, + { + "epoch": 0.6625607311214261, + "grad_norm": 7.459804141257847, + "learning_rate": 3.901289191839136e-06, + "loss": 0.8929, + "step": 9171 + }, + { + "epoch": 0.6626329763216356, + "grad_norm": 6.457713862417367, + "learning_rate": 3.9010469482048745e-06, + "loss": 0.9069, + "step": 9172 + }, + { + "epoch": 0.6627052215218452, + "grad_norm": 5.153895539924164, + "learning_rate": 3.900804685391368e-06, + "loss": 0.8305, + "step": 9173 + }, + { + "epoch": 0.6627774667220546, + "grad_norm": 6.453570408262723, + "learning_rate": 3.900562403401933e-06, + "loss": 0.865, + "step": 9174 + }, + { + "epoch": 0.6628497119222642, + "grad_norm": 5.839895354078019, + "learning_rate": 3.9003201022398865e-06, + "loss": 0.793, + "step": 9175 + }, + { + "epoch": 0.6629219571224737, + "grad_norm": 6.984392750694026, + "learning_rate": 3.900077781908545e-06, + "loss": 0.8926, + "step": 9176 + }, + { + "epoch": 0.6629942023226831, + "grad_norm": 7.354957631687352, + "learning_rate": 3.899835442411226e-06, + "loss": 0.8913, + "step": 9177 + }, + { + "epoch": 0.6630664475228927, + "grad_norm": 6.176757735301383, + "learning_rate": 3.8995930837512466e-06, + "loss": 0.8227, + "step": 9178 + }, + { + "epoch": 0.6631386927231022, + "grad_norm": 5.8434712552587404, + "learning_rate": 3.899350705931925e-06, + "loss": 0.798, + "step": 9179 + }, + { + "epoch": 0.6632109379233118, + "grad_norm": 5.885472156184303, + "learning_rate": 3.899108308956578e-06, + "loss": 0.8001, + "step": 9180 + }, + { + "epoch": 0.6632831831235212, + "grad_norm": 6.346216182802725, + "learning_rate": 3.898865892828524e-06, + "loss": 0.9273, + "step": 9181 + }, + { + "epoch": 0.6633554283237307, + "grad_norm": 5.863101028834179, + "learning_rate": 3.898623457551083e-06, + "loss": 0.8633, + "step": 9182 + }, + { + "epoch": 0.6634276735239403, + "grad_norm": 6.1849847170148315, + "learning_rate": 3.898381003127573e-06, + "loss": 0.8665, + "step": 9183 + }, + { + "epoch": 0.6634999187241498, + "grad_norm": 5.860366452577526, + "learning_rate": 3.898138529561313e-06, + "loss": 0.8627, + "step": 9184 + }, + { + "epoch": 0.6635721639243592, + "grad_norm": 5.8168210611192155, + "learning_rate": 3.897896036855622e-06, + "loss": 0.8406, + "step": 9185 + }, + { + "epoch": 0.6636444091245688, + "grad_norm": 6.669901285656362, + "learning_rate": 3.897653525013818e-06, + "loss": 0.8225, + "step": 9186 + }, + { + "epoch": 0.6637166543247783, + "grad_norm": 5.446620614375795, + "learning_rate": 3.897410994039224e-06, + "loss": 0.8642, + "step": 9187 + }, + { + "epoch": 0.6637888995249878, + "grad_norm": 6.270706057180994, + "learning_rate": 3.897168443935159e-06, + "loss": 0.8431, + "step": 9188 + }, + { + "epoch": 0.6638611447251973, + "grad_norm": 5.427408785833258, + "learning_rate": 3.896925874704942e-06, + "loss": 0.7699, + "step": 9189 + }, + { + "epoch": 0.6639333899254068, + "grad_norm": 6.301647866917276, + "learning_rate": 3.896683286351895e-06, + "loss": 0.835, + "step": 9190 + }, + { + "epoch": 0.6640056351256164, + "grad_norm": 5.408745514965027, + "learning_rate": 3.896440678879337e-06, + "loss": 0.784, + "step": 9191 + }, + { + "epoch": 0.6640778803258258, + "grad_norm": 5.73353874075158, + "learning_rate": 3.896198052290592e-06, + "loss": 0.8518, + "step": 9192 + }, + { + "epoch": 0.6641501255260354, + "grad_norm": 5.754450195457595, + "learning_rate": 3.895955406588978e-06, + "loss": 0.8379, + "step": 9193 + }, + { + "epoch": 0.6642223707262449, + "grad_norm": 5.936255394842187, + "learning_rate": 3.8957127417778195e-06, + "loss": 0.7491, + "step": 9194 + }, + { + "epoch": 0.6642946159264543, + "grad_norm": 7.162735977413163, + "learning_rate": 3.895470057860437e-06, + "loss": 0.8869, + "step": 9195 + }, + { + "epoch": 0.6643668611266639, + "grad_norm": 5.446396838772458, + "learning_rate": 3.895227354840153e-06, + "loss": 0.8605, + "step": 9196 + }, + { + "epoch": 0.6644391063268734, + "grad_norm": 6.79407831291856, + "learning_rate": 3.89498463272029e-06, + "loss": 0.8745, + "step": 9197 + }, + { + "epoch": 0.664511351527083, + "grad_norm": 5.676157613984047, + "learning_rate": 3.89474189150417e-06, + "loss": 0.8107, + "step": 9198 + }, + { + "epoch": 0.6645835967272924, + "grad_norm": 5.494460003381566, + "learning_rate": 3.8944991311951175e-06, + "loss": 0.815, + "step": 9199 + }, + { + "epoch": 0.6646558419275019, + "grad_norm": 6.706283003271686, + "learning_rate": 3.894256351796453e-06, + "loss": 0.7951, + "step": 9200 + }, + { + "epoch": 0.6647280871277115, + "grad_norm": 5.567105047968136, + "learning_rate": 3.894013553311503e-06, + "loss": 0.8999, + "step": 9201 + }, + { + "epoch": 0.664800332327921, + "grad_norm": 7.058514305896317, + "learning_rate": 3.893770735743589e-06, + "loss": 0.7517, + "step": 9202 + }, + { + "epoch": 0.6648725775281304, + "grad_norm": 5.466427904047704, + "learning_rate": 3.893527899096037e-06, + "loss": 0.8715, + "step": 9203 + }, + { + "epoch": 0.66494482272834, + "grad_norm": 6.071710884518922, + "learning_rate": 3.893285043372169e-06, + "loss": 0.8973, + "step": 9204 + }, + { + "epoch": 0.6650170679285495, + "grad_norm": 6.017695717063472, + "learning_rate": 3.89304216857531e-06, + "loss": 0.7992, + "step": 9205 + }, + { + "epoch": 0.665089313128759, + "grad_norm": 6.01453546342694, + "learning_rate": 3.892799274708786e-06, + "loss": 1.0126, + "step": 9206 + }, + { + "epoch": 0.6651615583289685, + "grad_norm": 6.409819115810688, + "learning_rate": 3.892556361775922e-06, + "loss": 0.8502, + "step": 9207 + }, + { + "epoch": 0.665233803529178, + "grad_norm": 5.533863893986549, + "learning_rate": 3.892313429780042e-06, + "loss": 0.9596, + "step": 9208 + }, + { + "epoch": 0.6653060487293876, + "grad_norm": 6.286959506213132, + "learning_rate": 3.892070478724473e-06, + "loss": 0.8748, + "step": 9209 + }, + { + "epoch": 0.665378293929597, + "grad_norm": 6.966230407391589, + "learning_rate": 3.891827508612539e-06, + "loss": 0.8652, + "step": 9210 + }, + { + "epoch": 0.6654505391298066, + "grad_norm": 6.742164019458678, + "learning_rate": 3.8915845194475675e-06, + "loss": 0.8398, + "step": 9211 + }, + { + "epoch": 0.6655227843300161, + "grad_norm": 6.404905299038265, + "learning_rate": 3.891341511232885e-06, + "loss": 0.9322, + "step": 9212 + }, + { + "epoch": 0.6655950295302255, + "grad_norm": 7.803509964191144, + "learning_rate": 3.8910984839718166e-06, + "loss": 0.8376, + "step": 9213 + }, + { + "epoch": 0.6656672747304351, + "grad_norm": 7.687488835024285, + "learning_rate": 3.890855437667691e-06, + "loss": 0.9547, + "step": 9214 + }, + { + "epoch": 0.6657395199306446, + "grad_norm": 6.989138760098426, + "learning_rate": 3.8906123723238335e-06, + "loss": 0.854, + "step": 9215 + }, + { + "epoch": 0.6658117651308542, + "grad_norm": 6.698817863369743, + "learning_rate": 3.890369287943573e-06, + "loss": 0.877, + "step": 9216 + }, + { + "epoch": 0.6658840103310636, + "grad_norm": 7.128480027926354, + "learning_rate": 3.890126184530236e-06, + "loss": 0.9161, + "step": 9217 + }, + { + "epoch": 0.6659562555312731, + "grad_norm": 6.4486248590875705, + "learning_rate": 3.889883062087151e-06, + "loss": 0.8485, + "step": 9218 + }, + { + "epoch": 0.6660285007314827, + "grad_norm": 5.614050316737668, + "learning_rate": 3.889639920617646e-06, + "loss": 0.794, + "step": 9219 + }, + { + "epoch": 0.6661007459316922, + "grad_norm": 7.4107989413076325, + "learning_rate": 3.88939676012505e-06, + "loss": 0.8515, + "step": 9220 + }, + { + "epoch": 0.6661729911319016, + "grad_norm": 6.011815517110249, + "learning_rate": 3.889153580612691e-06, + "loss": 0.8533, + "step": 9221 + }, + { + "epoch": 0.6662452363321112, + "grad_norm": 8.59510553939848, + "learning_rate": 3.888910382083897e-06, + "loss": 0.9223, + "step": 9222 + }, + { + "epoch": 0.6663174815323207, + "grad_norm": 5.311251504834564, + "learning_rate": 3.888667164541999e-06, + "loss": 0.9095, + "step": 9223 + }, + { + "epoch": 0.6663897267325302, + "grad_norm": 7.675407513165934, + "learning_rate": 3.888423927990326e-06, + "loss": 0.8976, + "step": 9224 + }, + { + "epoch": 0.6664619719327397, + "grad_norm": 7.8415932900877, + "learning_rate": 3.888180672432208e-06, + "loss": 0.902, + "step": 9225 + }, + { + "epoch": 0.6665342171329492, + "grad_norm": 5.0118658411408745, + "learning_rate": 3.887937397870973e-06, + "loss": 0.8291, + "step": 9226 + }, + { + "epoch": 0.6666064623331588, + "grad_norm": 6.623907898912203, + "learning_rate": 3.887694104309954e-06, + "loss": 0.8667, + "step": 9227 + }, + { + "epoch": 0.6666787075333682, + "grad_norm": 6.688482479761764, + "learning_rate": 3.88745079175248e-06, + "loss": 0.8659, + "step": 9228 + }, + { + "epoch": 0.6667509527335778, + "grad_norm": 8.1585030787119, + "learning_rate": 3.887207460201881e-06, + "loss": 0.8621, + "step": 9229 + }, + { + "epoch": 0.6668231979337873, + "grad_norm": 6.215103383758051, + "learning_rate": 3.88696410966149e-06, + "loss": 0.7751, + "step": 9230 + }, + { + "epoch": 0.6668954431339967, + "grad_norm": 6.453899346322419, + "learning_rate": 3.8867207401346366e-06, + "loss": 0.8352, + "step": 9231 + }, + { + "epoch": 0.6669676883342063, + "grad_norm": 6.911906553873009, + "learning_rate": 3.8864773516246534e-06, + "loss": 0.875, + "step": 9232 + }, + { + "epoch": 0.6670399335344158, + "grad_norm": 6.228079540433222, + "learning_rate": 3.886233944134872e-06, + "loss": 0.8901, + "step": 9233 + }, + { + "epoch": 0.6671121787346254, + "grad_norm": 6.325557385639124, + "learning_rate": 3.885990517668623e-06, + "loss": 0.8193, + "step": 9234 + }, + { + "epoch": 0.6671844239348348, + "grad_norm": 6.842303118760096, + "learning_rate": 3.885747072229241e-06, + "loss": 0.775, + "step": 9235 + }, + { + "epoch": 0.6672566691350443, + "grad_norm": 6.042364286288348, + "learning_rate": 3.885503607820058e-06, + "loss": 0.8892, + "step": 9236 + }, + { + "epoch": 0.6673289143352539, + "grad_norm": 7.455228304830674, + "learning_rate": 3.885260124444406e-06, + "loss": 0.8815, + "step": 9237 + }, + { + "epoch": 0.6674011595354634, + "grad_norm": 5.466261989842928, + "learning_rate": 3.885016622105617e-06, + "loss": 0.8718, + "step": 9238 + }, + { + "epoch": 0.6674734047356728, + "grad_norm": 7.583923764002584, + "learning_rate": 3.8847731008070275e-06, + "loss": 0.942, + "step": 9239 + }, + { + "epoch": 0.6675456499358824, + "grad_norm": 7.252456248959135, + "learning_rate": 3.884529560551969e-06, + "loss": 0.7951, + "step": 9240 + }, + { + "epoch": 0.6676178951360919, + "grad_norm": 7.428453088174087, + "learning_rate": 3.884286001343776e-06, + "loss": 0.8736, + "step": 9241 + }, + { + "epoch": 0.6676901403363014, + "grad_norm": 6.398895955225462, + "learning_rate": 3.884042423185783e-06, + "loss": 0.8306, + "step": 9242 + }, + { + "epoch": 0.6677623855365109, + "grad_norm": 5.89780561081401, + "learning_rate": 3.8837988260813225e-06, + "loss": 0.8401, + "step": 9243 + }, + { + "epoch": 0.6678346307367204, + "grad_norm": 6.050618592471558, + "learning_rate": 3.883555210033732e-06, + "loss": 0.7332, + "step": 9244 + }, + { + "epoch": 0.66790687593693, + "grad_norm": 5.7128408785227265, + "learning_rate": 3.883311575046344e-06, + "loss": 0.8404, + "step": 9245 + }, + { + "epoch": 0.6679791211371394, + "grad_norm": 8.808637489497835, + "learning_rate": 3.883067921122494e-06, + "loss": 0.9064, + "step": 9246 + }, + { + "epoch": 0.668051366337349, + "grad_norm": 8.02163821719105, + "learning_rate": 3.882824248265519e-06, + "loss": 0.8699, + "step": 9247 + }, + { + "epoch": 0.6681236115375585, + "grad_norm": 6.818844422019686, + "learning_rate": 3.882580556478753e-06, + "loss": 0.7987, + "step": 9248 + }, + { + "epoch": 0.6681958567377679, + "grad_norm": 5.621669206992845, + "learning_rate": 3.882336845765534e-06, + "loss": 0.8582, + "step": 9249 + }, + { + "epoch": 0.6682681019379775, + "grad_norm": 6.833200065739897, + "learning_rate": 3.882093116129196e-06, + "loss": 0.845, + "step": 9250 + }, + { + "epoch": 0.668340347138187, + "grad_norm": 6.619833028940737, + "learning_rate": 3.881849367573076e-06, + "loss": 0.8665, + "step": 9251 + }, + { + "epoch": 0.6684125923383966, + "grad_norm": 6.525553724390912, + "learning_rate": 3.881605600100512e-06, + "loss": 0.9643, + "step": 9252 + }, + { + "epoch": 0.668484837538606, + "grad_norm": 5.193797322228665, + "learning_rate": 3.881361813714839e-06, + "loss": 0.7913, + "step": 9253 + }, + { + "epoch": 0.6685570827388155, + "grad_norm": 5.977472931574436, + "learning_rate": 3.881118008419397e-06, + "loss": 0.862, + "step": 9254 + }, + { + "epoch": 0.6686293279390251, + "grad_norm": 7.083984913495982, + "learning_rate": 3.8808741842175205e-06, + "loss": 0.8828, + "step": 9255 + }, + { + "epoch": 0.6687015731392346, + "grad_norm": 5.453819252569456, + "learning_rate": 3.880630341112549e-06, + "loss": 0.8446, + "step": 9256 + }, + { + "epoch": 0.668773818339444, + "grad_norm": 6.525583830105202, + "learning_rate": 3.88038647910782e-06, + "loss": 0.8472, + "step": 9257 + }, + { + "epoch": 0.6688460635396536, + "grad_norm": 7.031385361110252, + "learning_rate": 3.880142598206672e-06, + "loss": 0.8084, + "step": 9258 + }, + { + "epoch": 0.6689183087398631, + "grad_norm": 6.220001248178633, + "learning_rate": 3.879898698412443e-06, + "loss": 0.8626, + "step": 9259 + }, + { + "epoch": 0.6689905539400726, + "grad_norm": 6.840810484133732, + "learning_rate": 3.879654779728474e-06, + "loss": 0.8912, + "step": 9260 + }, + { + "epoch": 0.6690627991402821, + "grad_norm": 7.19940446933875, + "learning_rate": 3.8794108421581e-06, + "loss": 0.8596, + "step": 9261 + }, + { + "epoch": 0.6691350443404916, + "grad_norm": 5.391211790130308, + "learning_rate": 3.879166885704664e-06, + "loss": 0.8908, + "step": 9262 + }, + { + "epoch": 0.6692072895407012, + "grad_norm": 9.219614382807764, + "learning_rate": 3.878922910371503e-06, + "loss": 0.9053, + "step": 9263 + }, + { + "epoch": 0.6692795347409106, + "grad_norm": 5.95446922364635, + "learning_rate": 3.878678916161959e-06, + "loss": 0.7407, + "step": 9264 + }, + { + "epoch": 0.6693517799411202, + "grad_norm": 7.07416940942553, + "learning_rate": 3.878434903079371e-06, + "loss": 0.8717, + "step": 9265 + }, + { + "epoch": 0.6694240251413297, + "grad_norm": 7.296482116447265, + "learning_rate": 3.878190871127079e-06, + "loss": 0.7886, + "step": 9266 + }, + { + "epoch": 0.6694962703415391, + "grad_norm": 7.544244343284597, + "learning_rate": 3.877946820308425e-06, + "loss": 0.8728, + "step": 9267 + }, + { + "epoch": 0.6695685155417487, + "grad_norm": 6.448576943193371, + "learning_rate": 3.877702750626748e-06, + "loss": 0.8744, + "step": 9268 + }, + { + "epoch": 0.6696407607419582, + "grad_norm": 6.216770182847976, + "learning_rate": 3.87745866208539e-06, + "loss": 0.7908, + "step": 9269 + }, + { + "epoch": 0.6697130059421678, + "grad_norm": 8.771635500604704, + "learning_rate": 3.8772145546876925e-06, + "loss": 0.9132, + "step": 9270 + }, + { + "epoch": 0.6697852511423772, + "grad_norm": 7.178722827347826, + "learning_rate": 3.876970428436998e-06, + "loss": 0.932, + "step": 9271 + }, + { + "epoch": 0.6698574963425867, + "grad_norm": 6.887330805367098, + "learning_rate": 3.876726283336647e-06, + "loss": 0.7897, + "step": 9272 + }, + { + "epoch": 0.6699297415427963, + "grad_norm": 6.458246374057539, + "learning_rate": 3.876482119389982e-06, + "loss": 0.8339, + "step": 9273 + }, + { + "epoch": 0.6700019867430057, + "grad_norm": 6.285349860092875, + "learning_rate": 3.876237936600345e-06, + "loss": 0.8477, + "step": 9274 + }, + { + "epoch": 0.6700742319432152, + "grad_norm": 6.1435744795073415, + "learning_rate": 3.87599373497108e-06, + "loss": 0.8056, + "step": 9275 + }, + { + "epoch": 0.6701464771434248, + "grad_norm": 5.9794968924783465, + "learning_rate": 3.8757495145055294e-06, + "loss": 0.8346, + "step": 9276 + }, + { + "epoch": 0.6702187223436343, + "grad_norm": 7.092589161540678, + "learning_rate": 3.875505275207035e-06, + "loss": 0.829, + "step": 9277 + }, + { + "epoch": 0.6702909675438438, + "grad_norm": 7.574405347189005, + "learning_rate": 3.875261017078943e-06, + "loss": 0.8429, + "step": 9278 + }, + { + "epoch": 0.6703632127440533, + "grad_norm": 7.550996969726342, + "learning_rate": 3.875016740124594e-06, + "loss": 0.9167, + "step": 9279 + }, + { + "epoch": 0.6704354579442628, + "grad_norm": 6.516785003810443, + "learning_rate": 3.8747724443473345e-06, + "loss": 0.767, + "step": 9280 + }, + { + "epoch": 0.6705077031444724, + "grad_norm": 6.192075328595414, + "learning_rate": 3.874528129750507e-06, + "loss": 0.8914, + "step": 9281 + }, + { + "epoch": 0.6705799483446818, + "grad_norm": 6.458717121493149, + "learning_rate": 3.874283796337457e-06, + "loss": 0.8379, + "step": 9282 + }, + { + "epoch": 0.6706521935448914, + "grad_norm": 6.348103650098396, + "learning_rate": 3.874039444111529e-06, + "loss": 0.7417, + "step": 9283 + }, + { + "epoch": 0.6707244387451009, + "grad_norm": 7.018779631609034, + "learning_rate": 3.873795073076067e-06, + "loss": 0.9506, + "step": 9284 + }, + { + "epoch": 0.6707966839453103, + "grad_norm": 6.260869454646455, + "learning_rate": 3.8735506832344185e-06, + "loss": 0.8211, + "step": 9285 + }, + { + "epoch": 0.6708689291455199, + "grad_norm": 6.169387663838974, + "learning_rate": 3.8733062745899275e-06, + "loss": 0.8415, + "step": 9286 + }, + { + "epoch": 0.6709411743457294, + "grad_norm": 5.093923554067397, + "learning_rate": 3.873061847145939e-06, + "loss": 0.8092, + "step": 9287 + }, + { + "epoch": 0.671013419545939, + "grad_norm": 5.653416719601409, + "learning_rate": 3.8728174009058e-06, + "loss": 0.825, + "step": 9288 + }, + { + "epoch": 0.6710856647461484, + "grad_norm": 8.269723146687049, + "learning_rate": 3.872572935872857e-06, + "loss": 0.8612, + "step": 9289 + }, + { + "epoch": 0.6711579099463579, + "grad_norm": 6.236624175755594, + "learning_rate": 3.8723284520504565e-06, + "loss": 0.8077, + "step": 9290 + }, + { + "epoch": 0.6712301551465675, + "grad_norm": 6.796181268476023, + "learning_rate": 3.872083949441945e-06, + "loss": 0.8011, + "step": 9291 + }, + { + "epoch": 0.6713024003467769, + "grad_norm": 7.365800194969114, + "learning_rate": 3.87183942805067e-06, + "loss": 0.8759, + "step": 9292 + }, + { + "epoch": 0.6713746455469864, + "grad_norm": 7.21342291521914, + "learning_rate": 3.871594887879977e-06, + "loss": 0.8385, + "step": 9293 + }, + { + "epoch": 0.671446890747196, + "grad_norm": 6.209220293764277, + "learning_rate": 3.871350328933215e-06, + "loss": 0.7953, + "step": 9294 + }, + { + "epoch": 0.6715191359474055, + "grad_norm": 6.810895538375843, + "learning_rate": 3.871105751213733e-06, + "loss": 0.8221, + "step": 9295 + }, + { + "epoch": 0.671591381147615, + "grad_norm": 6.633877258747773, + "learning_rate": 3.870861154724877e-06, + "loss": 0.7691, + "step": 9296 + }, + { + "epoch": 0.6716636263478245, + "grad_norm": 6.93693333966354, + "learning_rate": 3.870616539469997e-06, + "loss": 0.8906, + "step": 9297 + }, + { + "epoch": 0.671735871548034, + "grad_norm": 8.289644643348005, + "learning_rate": 3.87037190545244e-06, + "loss": 0.8567, + "step": 9298 + }, + { + "epoch": 0.6718081167482436, + "grad_norm": 5.358052527078712, + "learning_rate": 3.870127252675556e-06, + "loss": 0.8296, + "step": 9299 + }, + { + "epoch": 0.671880361948453, + "grad_norm": 5.985604501023751, + "learning_rate": 3.869882581142694e-06, + "loss": 0.7974, + "step": 9300 + }, + { + "epoch": 0.6719526071486626, + "grad_norm": 5.599417942315488, + "learning_rate": 3.869637890857203e-06, + "loss": 0.8523, + "step": 9301 + }, + { + "epoch": 0.6720248523488721, + "grad_norm": 5.817539450557174, + "learning_rate": 3.869393181822433e-06, + "loss": 0.8829, + "step": 9302 + }, + { + "epoch": 0.6720970975490815, + "grad_norm": 6.819453619048487, + "learning_rate": 3.869148454041733e-06, + "loss": 0.8847, + "step": 9303 + }, + { + "epoch": 0.6721693427492911, + "grad_norm": 8.390589083518334, + "learning_rate": 3.868903707518453e-06, + "loss": 0.84, + "step": 9304 + }, + { + "epoch": 0.6722415879495006, + "grad_norm": 6.197890064171725, + "learning_rate": 3.868658942255946e-06, + "loss": 0.842, + "step": 9305 + }, + { + "epoch": 0.6723138331497102, + "grad_norm": 6.231964333624384, + "learning_rate": 3.86841415825756e-06, + "loss": 0.8314, + "step": 9306 + }, + { + "epoch": 0.6723860783499196, + "grad_norm": 8.41918857171252, + "learning_rate": 3.8681693555266465e-06, + "loss": 0.9258, + "step": 9307 + }, + { + "epoch": 0.6724583235501291, + "grad_norm": 6.348929259369529, + "learning_rate": 3.867924534066557e-06, + "loss": 0.8644, + "step": 9308 + }, + { + "epoch": 0.6725305687503387, + "grad_norm": 6.435865472481756, + "learning_rate": 3.867679693880642e-06, + "loss": 0.7979, + "step": 9309 + }, + { + "epoch": 0.6726028139505481, + "grad_norm": 9.114649506828236, + "learning_rate": 3.8674348349722544e-06, + "loss": 0.8699, + "step": 9310 + }, + { + "epoch": 0.6726750591507576, + "grad_norm": 9.626398554667306, + "learning_rate": 3.867189957344746e-06, + "loss": 0.9857, + "step": 9311 + }, + { + "epoch": 0.6727473043509672, + "grad_norm": 7.562898483496898, + "learning_rate": 3.866945061001468e-06, + "loss": 0.8341, + "step": 9312 + }, + { + "epoch": 0.6728195495511767, + "grad_norm": 6.760091796342022, + "learning_rate": 3.866700145945774e-06, + "loss": 0.8446, + "step": 9313 + }, + { + "epoch": 0.6728917947513862, + "grad_norm": 6.748139866852024, + "learning_rate": 3.866455212181016e-06, + "loss": 0.9242, + "step": 9314 + }, + { + "epoch": 0.6729640399515957, + "grad_norm": 9.265253635317158, + "learning_rate": 3.8662102597105475e-06, + "loss": 0.8509, + "step": 9315 + }, + { + "epoch": 0.6730362851518052, + "grad_norm": 7.278976150869822, + "learning_rate": 3.8659652885377204e-06, + "loss": 0.912, + "step": 9316 + }, + { + "epoch": 0.6731085303520148, + "grad_norm": 10.188642753679677, + "learning_rate": 3.8657202986658905e-06, + "loss": 0.863, + "step": 9317 + }, + { + "epoch": 0.6731807755522242, + "grad_norm": 7.165729224823272, + "learning_rate": 3.86547529009841e-06, + "loss": 0.8152, + "step": 9318 + }, + { + "epoch": 0.6732530207524338, + "grad_norm": 6.000473957415618, + "learning_rate": 3.865230262838632e-06, + "loss": 0.8989, + "step": 9319 + }, + { + "epoch": 0.6733252659526433, + "grad_norm": 6.780484345276742, + "learning_rate": 3.8649852168899114e-06, + "loss": 0.7896, + "step": 9320 + }, + { + "epoch": 0.6733975111528527, + "grad_norm": 6.313835682579278, + "learning_rate": 3.864740152255604e-06, + "loss": 0.826, + "step": 9321 + }, + { + "epoch": 0.6734697563530623, + "grad_norm": 9.673579101314804, + "learning_rate": 3.8644950689390626e-06, + "loss": 0.8939, + "step": 9322 + }, + { + "epoch": 0.6735420015532718, + "grad_norm": 7.073855562961034, + "learning_rate": 3.864249966943644e-06, + "loss": 0.7918, + "step": 9323 + }, + { + "epoch": 0.6736142467534814, + "grad_norm": 8.289146717733102, + "learning_rate": 3.864004846272703e-06, + "loss": 0.9242, + "step": 9324 + }, + { + "epoch": 0.6736864919536908, + "grad_norm": 6.854440917429177, + "learning_rate": 3.8637597069295944e-06, + "loss": 0.7882, + "step": 9325 + }, + { + "epoch": 0.6737587371539003, + "grad_norm": 6.1763204668538165, + "learning_rate": 3.863514548917674e-06, + "loss": 0.8724, + "step": 9326 + }, + { + "epoch": 0.6738309823541099, + "grad_norm": 5.623148804271107, + "learning_rate": 3.863269372240298e-06, + "loss": 0.7811, + "step": 9327 + }, + { + "epoch": 0.6739032275543193, + "grad_norm": 6.012522980136535, + "learning_rate": 3.8630241769008235e-06, + "loss": 0.8624, + "step": 9328 + }, + { + "epoch": 0.6739754727545288, + "grad_norm": 5.270445295482242, + "learning_rate": 3.862778962902606e-06, + "loss": 0.8327, + "step": 9329 + }, + { + "epoch": 0.6740477179547384, + "grad_norm": 8.872130883891336, + "learning_rate": 3.8625337302490015e-06, + "loss": 0.81, + "step": 9330 + }, + { + "epoch": 0.6741199631549479, + "grad_norm": 6.5630600463222075, + "learning_rate": 3.8622884789433704e-06, + "loss": 0.9192, + "step": 9331 + }, + { + "epoch": 0.6741922083551574, + "grad_norm": 6.739361609497182, + "learning_rate": 3.862043208989066e-06, + "loss": 0.9527, + "step": 9332 + }, + { + "epoch": 0.6742644535553669, + "grad_norm": 7.58362472589159, + "learning_rate": 3.861797920389448e-06, + "loss": 0.8417, + "step": 9333 + }, + { + "epoch": 0.6743366987555764, + "grad_norm": 7.026947468329064, + "learning_rate": 3.8615526131478745e-06, + "loss": 0.8627, + "step": 9334 + }, + { + "epoch": 0.674408943955786, + "grad_norm": 5.750202672952432, + "learning_rate": 3.861307287267703e-06, + "loss": 0.7602, + "step": 9335 + }, + { + "epoch": 0.6744811891559954, + "grad_norm": 6.113140536020317, + "learning_rate": 3.86106194275229e-06, + "loss": 0.8023, + "step": 9336 + }, + { + "epoch": 0.674553434356205, + "grad_norm": 8.213671141196405, + "learning_rate": 3.860816579604997e-06, + "loss": 0.7853, + "step": 9337 + }, + { + "epoch": 0.6746256795564145, + "grad_norm": 6.365262841778168, + "learning_rate": 3.860571197829181e-06, + "loss": 0.7788, + "step": 9338 + }, + { + "epoch": 0.6746979247566239, + "grad_norm": 6.219305732470893, + "learning_rate": 3.8603257974282035e-06, + "loss": 0.7938, + "step": 9339 + }, + { + "epoch": 0.6747701699568335, + "grad_norm": 6.246132836807032, + "learning_rate": 3.860080378405421e-06, + "loss": 0.8951, + "step": 9340 + }, + { + "epoch": 0.674842415157043, + "grad_norm": 5.163050565387392, + "learning_rate": 3.859834940764193e-06, + "loss": 0.7617, + "step": 9341 + }, + { + "epoch": 0.6749146603572526, + "grad_norm": 6.234438941264363, + "learning_rate": 3.859589484507882e-06, + "loss": 0.8371, + "step": 9342 + }, + { + "epoch": 0.674986905557462, + "grad_norm": 5.872024107090996, + "learning_rate": 3.859344009639846e-06, + "loss": 0.791, + "step": 9343 + }, + { + "epoch": 0.6750591507576715, + "grad_norm": 6.443420650379298, + "learning_rate": 3.859098516163446e-06, + "loss": 0.8776, + "step": 9344 + }, + { + "epoch": 0.6751313959578811, + "grad_norm": 5.844399808770432, + "learning_rate": 3.8588530040820426e-06, + "loss": 0.7773, + "step": 9345 + }, + { + "epoch": 0.6752036411580905, + "grad_norm": 7.910886106375006, + "learning_rate": 3.858607473398997e-06, + "loss": 0.8418, + "step": 9346 + }, + { + "epoch": 0.6752758863583, + "grad_norm": 5.345382708701991, + "learning_rate": 3.8583619241176695e-06, + "loss": 0.8363, + "step": 9347 + }, + { + "epoch": 0.6753481315585096, + "grad_norm": 6.936126573034708, + "learning_rate": 3.858116356241422e-06, + "loss": 0.8511, + "step": 9348 + }, + { + "epoch": 0.6754203767587191, + "grad_norm": 6.412654001866641, + "learning_rate": 3.857870769773617e-06, + "loss": 0.8418, + "step": 9349 + }, + { + "epoch": 0.6754926219589286, + "grad_norm": 6.672075112265369, + "learning_rate": 3.857625164717614e-06, + "loss": 0.8306, + "step": 9350 + }, + { + "epoch": 0.6755648671591381, + "grad_norm": 7.342307350818728, + "learning_rate": 3.8573795410767775e-06, + "loss": 0.8535, + "step": 9351 + }, + { + "epoch": 0.6756371123593476, + "grad_norm": 6.85233914052547, + "learning_rate": 3.85713389885447e-06, + "loss": 0.8772, + "step": 9352 + }, + { + "epoch": 0.6757093575595572, + "grad_norm": 8.856259614001045, + "learning_rate": 3.856888238054052e-06, + "loss": 0.8641, + "step": 9353 + }, + { + "epoch": 0.6757816027597666, + "grad_norm": 5.577525216134939, + "learning_rate": 3.856642558678887e-06, + "loss": 0.9051, + "step": 9354 + }, + { + "epoch": 0.6758538479599762, + "grad_norm": 7.278949685257332, + "learning_rate": 3.85639686073234e-06, + "loss": 0.8311, + "step": 9355 + }, + { + "epoch": 0.6759260931601857, + "grad_norm": 6.247405467331394, + "learning_rate": 3.8561511442177724e-06, + "loss": 0.7671, + "step": 9356 + }, + { + "epoch": 0.6759983383603951, + "grad_norm": 6.909939570072994, + "learning_rate": 3.855905409138549e-06, + "loss": 0.8451, + "step": 9357 + }, + { + "epoch": 0.6760705835606047, + "grad_norm": 7.085755267002765, + "learning_rate": 3.8556596554980326e-06, + "loss": 0.8484, + "step": 9358 + }, + { + "epoch": 0.6761428287608142, + "grad_norm": 6.612349614126101, + "learning_rate": 3.855413883299588e-06, + "loss": 0.9431, + "step": 9359 + }, + { + "epoch": 0.6762150739610238, + "grad_norm": 5.673528256920915, + "learning_rate": 3.855168092546581e-06, + "loss": 0.8184, + "step": 9360 + }, + { + "epoch": 0.6762873191612332, + "grad_norm": 5.907010145569693, + "learning_rate": 3.854922283242374e-06, + "loss": 0.8058, + "step": 9361 + }, + { + "epoch": 0.6763595643614427, + "grad_norm": 5.798479459667806, + "learning_rate": 3.8546764553903335e-06, + "loss": 0.7776, + "step": 9362 + }, + { + "epoch": 0.6764318095616523, + "grad_norm": 6.371040161581036, + "learning_rate": 3.854430608993824e-06, + "loss": 0.8609, + "step": 9363 + }, + { + "epoch": 0.6765040547618617, + "grad_norm": 6.598772929014998, + "learning_rate": 3.854184744056211e-06, + "loss": 0.878, + "step": 9364 + }, + { + "epoch": 0.6765762999620712, + "grad_norm": 7.040832500219174, + "learning_rate": 3.85393886058086e-06, + "loss": 0.8371, + "step": 9365 + }, + { + "epoch": 0.6766485451622808, + "grad_norm": 7.65041281390387, + "learning_rate": 3.853692958571138e-06, + "loss": 0.8808, + "step": 9366 + }, + { + "epoch": 0.6767207903624903, + "grad_norm": 7.5953859678418105, + "learning_rate": 3.85344703803041e-06, + "loss": 0.7673, + "step": 9367 + }, + { + "epoch": 0.6767930355626998, + "grad_norm": 6.2215051729811846, + "learning_rate": 3.853201098962044e-06, + "loss": 0.8839, + "step": 9368 + }, + { + "epoch": 0.6768652807629093, + "grad_norm": 6.446882207055624, + "learning_rate": 3.852955141369405e-06, + "loss": 0.8966, + "step": 9369 + }, + { + "epoch": 0.6769375259631188, + "grad_norm": 5.415674363234347, + "learning_rate": 3.8527091652558595e-06, + "loss": 0.8636, + "step": 9370 + }, + { + "epoch": 0.6770097711633284, + "grad_norm": 7.131122033645212, + "learning_rate": 3.852463170624777e-06, + "loss": 0.7553, + "step": 9371 + }, + { + "epoch": 0.6770820163635378, + "grad_norm": 7.379278461364062, + "learning_rate": 3.852217157479524e-06, + "loss": 0.8612, + "step": 9372 + }, + { + "epoch": 0.6771542615637474, + "grad_norm": 5.562085425540124, + "learning_rate": 3.851971125823467e-06, + "loss": 0.8169, + "step": 9373 + }, + { + "epoch": 0.6772265067639569, + "grad_norm": 7.398776762614737, + "learning_rate": 3.851725075659975e-06, + "loss": 0.8719, + "step": 9374 + }, + { + "epoch": 0.6772987519641663, + "grad_norm": 6.271433889161477, + "learning_rate": 3.8514790069924174e-06, + "loss": 0.757, + "step": 9375 + }, + { + "epoch": 0.6773709971643759, + "grad_norm": 5.877550586010577, + "learning_rate": 3.851232919824161e-06, + "loss": 0.8494, + "step": 9376 + }, + { + "epoch": 0.6774432423645854, + "grad_norm": 8.46410509943475, + "learning_rate": 3.850986814158575e-06, + "loss": 0.9782, + "step": 9377 + }, + { + "epoch": 0.677515487564795, + "grad_norm": 6.235908185927717, + "learning_rate": 3.850740689999029e-06, + "loss": 0.8256, + "step": 9378 + }, + { + "epoch": 0.6775877327650044, + "grad_norm": 7.99704854880495, + "learning_rate": 3.850494547348891e-06, + "loss": 0.8624, + "step": 9379 + }, + { + "epoch": 0.6776599779652139, + "grad_norm": 6.826355170107051, + "learning_rate": 3.850248386211531e-06, + "loss": 0.8516, + "step": 9380 + }, + { + "epoch": 0.6777322231654235, + "grad_norm": 6.4527401566919735, + "learning_rate": 3.85000220659032e-06, + "loss": 0.888, + "step": 9381 + }, + { + "epoch": 0.6778044683656329, + "grad_norm": 6.071225523558662, + "learning_rate": 3.849756008488627e-06, + "loss": 0.8259, + "step": 9382 + }, + { + "epoch": 0.6778767135658424, + "grad_norm": 6.719048072058192, + "learning_rate": 3.849509791909822e-06, + "loss": 0.8795, + "step": 9383 + }, + { + "epoch": 0.677948958766052, + "grad_norm": 6.34581974635065, + "learning_rate": 3.849263556857275e-06, + "loss": 0.9088, + "step": 9384 + }, + { + "epoch": 0.6780212039662615, + "grad_norm": 6.840020544010118, + "learning_rate": 3.849017303334358e-06, + "loss": 0.8694, + "step": 9385 + }, + { + "epoch": 0.678093449166471, + "grad_norm": 6.394896630233905, + "learning_rate": 3.848771031344442e-06, + "loss": 0.8456, + "step": 9386 + }, + { + "epoch": 0.6781656943666805, + "grad_norm": 5.515177333052369, + "learning_rate": 3.8485247408908974e-06, + "loss": 0.8063, + "step": 9387 + }, + { + "epoch": 0.67823793956689, + "grad_norm": 8.606915075773783, + "learning_rate": 3.848278431977096e-06, + "loss": 0.843, + "step": 9388 + }, + { + "epoch": 0.6783101847670996, + "grad_norm": 6.4393761132413445, + "learning_rate": 3.848032104606411e-06, + "loss": 0.7785, + "step": 9389 + }, + { + "epoch": 0.678382429967309, + "grad_norm": 5.624106103018311, + "learning_rate": 3.847785758782212e-06, + "loss": 0.7467, + "step": 9390 + }, + { + "epoch": 0.6784546751675186, + "grad_norm": 6.356477944516984, + "learning_rate": 3.847539394507872e-06, + "loss": 0.8268, + "step": 9391 + }, + { + "epoch": 0.6785269203677281, + "grad_norm": 7.231263779055855, + "learning_rate": 3.8472930117867654e-06, + "loss": 0.8801, + "step": 9392 + }, + { + "epoch": 0.6785991655679375, + "grad_norm": 8.498824992027851, + "learning_rate": 3.847046610622263e-06, + "loss": 0.8354, + "step": 9393 + }, + { + "epoch": 0.6786714107681471, + "grad_norm": 5.994300678561261, + "learning_rate": 3.846800191017737e-06, + "loss": 0.8498, + "step": 9394 + }, + { + "epoch": 0.6787436559683566, + "grad_norm": 6.099868285211069, + "learning_rate": 3.846553752976564e-06, + "loss": 0.8503, + "step": 9395 + }, + { + "epoch": 0.6788159011685662, + "grad_norm": 5.500722144142136, + "learning_rate": 3.846307296502115e-06, + "loss": 0.8344, + "step": 9396 + }, + { + "epoch": 0.6788881463687756, + "grad_norm": 6.6093184376273575, + "learning_rate": 3.846060821597764e-06, + "loss": 0.7913, + "step": 9397 + }, + { + "epoch": 0.6789603915689851, + "grad_norm": 6.304786426094235, + "learning_rate": 3.8458143282668865e-06, + "loss": 0.8363, + "step": 9398 + }, + { + "epoch": 0.6790326367691947, + "grad_norm": 9.248071830417755, + "learning_rate": 3.845567816512855e-06, + "loss": 0.9, + "step": 9399 + }, + { + "epoch": 0.6791048819694041, + "grad_norm": 5.933272613614741, + "learning_rate": 3.845321286339045e-06, + "loss": 0.9487, + "step": 9400 + }, + { + "epoch": 0.6791771271696136, + "grad_norm": 6.937710870510728, + "learning_rate": 3.845074737748832e-06, + "loss": 0.8416, + "step": 9401 + }, + { + "epoch": 0.6792493723698232, + "grad_norm": 6.213666668956454, + "learning_rate": 3.844828170745588e-06, + "loss": 0.8424, + "step": 9402 + }, + { + "epoch": 0.6793216175700327, + "grad_norm": 7.74514692390033, + "learning_rate": 3.8445815853326925e-06, + "loss": 0.9048, + "step": 9403 + }, + { + "epoch": 0.6793938627702422, + "grad_norm": 5.87461559580178, + "learning_rate": 3.844334981513519e-06, + "loss": 0.895, + "step": 9404 + }, + { + "epoch": 0.6794661079704517, + "grad_norm": 6.145747642135214, + "learning_rate": 3.844088359291443e-06, + "loss": 0.8781, + "step": 9405 + }, + { + "epoch": 0.6795383531706612, + "grad_norm": 8.221843412912053, + "learning_rate": 3.8438417186698416e-06, + "loss": 0.8, + "step": 9406 + }, + { + "epoch": 0.6796105983708708, + "grad_norm": 5.900078672757386, + "learning_rate": 3.843595059652089e-06, + "loss": 0.8812, + "step": 9407 + }, + { + "epoch": 0.6796828435710802, + "grad_norm": 6.209386475950329, + "learning_rate": 3.843348382241564e-06, + "loss": 0.7575, + "step": 9408 + }, + { + "epoch": 0.6797550887712898, + "grad_norm": 7.101504486149176, + "learning_rate": 3.843101686441643e-06, + "loss": 0.9085, + "step": 9409 + }, + { + "epoch": 0.6798273339714993, + "grad_norm": 7.303633310879929, + "learning_rate": 3.842854972255703e-06, + "loss": 0.8883, + "step": 9410 + }, + { + "epoch": 0.6798995791717087, + "grad_norm": 7.018940233623152, + "learning_rate": 3.842608239687121e-06, + "loss": 0.8744, + "step": 9411 + }, + { + "epoch": 0.6799718243719183, + "grad_norm": 7.165136691464654, + "learning_rate": 3.842361488739275e-06, + "loss": 0.9556, + "step": 9412 + }, + { + "epoch": 0.6800440695721278, + "grad_norm": 5.863843023033555, + "learning_rate": 3.8421147194155406e-06, + "loss": 0.9436, + "step": 9413 + }, + { + "epoch": 0.6801163147723374, + "grad_norm": 5.1027727779907766, + "learning_rate": 3.841867931719299e-06, + "loss": 0.8194, + "step": 9414 + }, + { + "epoch": 0.6801885599725468, + "grad_norm": 5.719815847993791, + "learning_rate": 3.841621125653928e-06, + "loss": 0.848, + "step": 9415 + }, + { + "epoch": 0.6802608051727563, + "grad_norm": 7.835205530758495, + "learning_rate": 3.8413743012228044e-06, + "loss": 0.8006, + "step": 9416 + }, + { + "epoch": 0.6803330503729659, + "grad_norm": 5.8056741648619, + "learning_rate": 3.841127458429309e-06, + "loss": 0.8901, + "step": 9417 + }, + { + "epoch": 0.6804052955731753, + "grad_norm": 8.71651875921719, + "learning_rate": 3.8408805972768194e-06, + "loss": 0.769, + "step": 9418 + }, + { + "epoch": 0.6804775407733848, + "grad_norm": 6.616653943309834, + "learning_rate": 3.840633717768716e-06, + "loss": 0.7956, + "step": 9419 + }, + { + "epoch": 0.6805497859735944, + "grad_norm": 7.185884509769423, + "learning_rate": 3.840386819908377e-06, + "loss": 0.8632, + "step": 9420 + }, + { + "epoch": 0.6806220311738039, + "grad_norm": 5.656263657021237, + "learning_rate": 3.8401399036991845e-06, + "loss": 0.8445, + "step": 9421 + }, + { + "epoch": 0.6806942763740134, + "grad_norm": 6.240485005725874, + "learning_rate": 3.839892969144516e-06, + "loss": 0.8026, + "step": 9422 + }, + { + "epoch": 0.6807665215742229, + "grad_norm": 6.879408931182361, + "learning_rate": 3.839646016247754e-06, + "loss": 0.9328, + "step": 9423 + }, + { + "epoch": 0.6808387667744324, + "grad_norm": 5.100768887771721, + "learning_rate": 3.8393990450122784e-06, + "loss": 0.7939, + "step": 9424 + }, + { + "epoch": 0.680911011974642, + "grad_norm": 5.597627286975839, + "learning_rate": 3.839152055441469e-06, + "loss": 0.8326, + "step": 9425 + }, + { + "epoch": 0.6809832571748514, + "grad_norm": 6.951317301443073, + "learning_rate": 3.838905047538709e-06, + "loss": 0.8559, + "step": 9426 + }, + { + "epoch": 0.681055502375061, + "grad_norm": 6.047809538471185, + "learning_rate": 3.838658021307377e-06, + "loss": 0.8576, + "step": 9427 + }, + { + "epoch": 0.6811277475752705, + "grad_norm": 6.849456498240504, + "learning_rate": 3.838410976750856e-06, + "loss": 0.8683, + "step": 9428 + }, + { + "epoch": 0.6811999927754799, + "grad_norm": 5.723801346611249, + "learning_rate": 3.838163913872529e-06, + "loss": 0.8762, + "step": 9429 + }, + { + "epoch": 0.6812722379756895, + "grad_norm": 4.812663286089464, + "learning_rate": 3.837916832675777e-06, + "loss": 0.7725, + "step": 9430 + }, + { + "epoch": 0.681344483175899, + "grad_norm": 8.170929658961654, + "learning_rate": 3.837669733163982e-06, + "loss": 0.8607, + "step": 9431 + }, + { + "epoch": 0.6814167283761086, + "grad_norm": 8.075850441381546, + "learning_rate": 3.837422615340527e-06, + "loss": 0.7813, + "step": 9432 + }, + { + "epoch": 0.681488973576318, + "grad_norm": 5.14211578550959, + "learning_rate": 3.8371754792087944e-06, + "loss": 0.8599, + "step": 9433 + }, + { + "epoch": 0.6815612187765275, + "grad_norm": 5.06947655979708, + "learning_rate": 3.836928324772169e-06, + "loss": 0.8051, + "step": 9434 + }, + { + "epoch": 0.6816334639767371, + "grad_norm": 7.382693658104341, + "learning_rate": 3.8366811520340315e-06, + "loss": 0.8895, + "step": 9435 + }, + { + "epoch": 0.6817057091769465, + "grad_norm": 6.5876300487674495, + "learning_rate": 3.836433960997768e-06, + "loss": 0.8728, + "step": 9436 + }, + { + "epoch": 0.681777954377156, + "grad_norm": 5.640605556634165, + "learning_rate": 3.83618675166676e-06, + "loss": 0.8686, + "step": 9437 + }, + { + "epoch": 0.6818501995773656, + "grad_norm": 5.613685418154372, + "learning_rate": 3.8359395240443945e-06, + "loss": 0.7752, + "step": 9438 + }, + { + "epoch": 0.6819224447775751, + "grad_norm": 7.307372814406725, + "learning_rate": 3.835692278134054e-06, + "loss": 0.865, + "step": 9439 + }, + { + "epoch": 0.6819946899777846, + "grad_norm": 5.786766595695444, + "learning_rate": 3.835445013939122e-06, + "loss": 0.8347, + "step": 9440 + }, + { + "epoch": 0.6820669351779941, + "grad_norm": 6.83899736205629, + "learning_rate": 3.835197731462985e-06, + "loss": 0.8756, + "step": 9441 + }, + { + "epoch": 0.6821391803782036, + "grad_norm": 5.706147195280629, + "learning_rate": 3.834950430709028e-06, + "loss": 0.8326, + "step": 9442 + }, + { + "epoch": 0.6822114255784132, + "grad_norm": 5.58011128085617, + "learning_rate": 3.834703111680636e-06, + "loss": 0.8145, + "step": 9443 + }, + { + "epoch": 0.6822836707786226, + "grad_norm": 6.892914614377833, + "learning_rate": 3.834455774381195e-06, + "loss": 0.8488, + "step": 9444 + }, + { + "epoch": 0.6823559159788322, + "grad_norm": 7.120718489409613, + "learning_rate": 3.8342084188140905e-06, + "loss": 0.8799, + "step": 9445 + }, + { + "epoch": 0.6824281611790417, + "grad_norm": 8.466946230300325, + "learning_rate": 3.833961044982709e-06, + "loss": 0.8296, + "step": 9446 + }, + { + "epoch": 0.6825004063792511, + "grad_norm": 6.586042627330442, + "learning_rate": 3.833713652890436e-06, + "loss": 0.8332, + "step": 9447 + }, + { + "epoch": 0.6825726515794607, + "grad_norm": 4.666999237380896, + "learning_rate": 3.8334662425406585e-06, + "loss": 0.7535, + "step": 9448 + }, + { + "epoch": 0.6826448967796702, + "grad_norm": 7.11071576900375, + "learning_rate": 3.833218813936765e-06, + "loss": 0.7809, + "step": 9449 + }, + { + "epoch": 0.6827171419798798, + "grad_norm": 5.917434033701845, + "learning_rate": 3.83297136708214e-06, + "loss": 0.8959, + "step": 9450 + }, + { + "epoch": 0.6827893871800892, + "grad_norm": 5.237508900345771, + "learning_rate": 3.832723901980171e-06, + "loss": 0.7947, + "step": 9451 + }, + { + "epoch": 0.6828616323802987, + "grad_norm": 5.424877379829119, + "learning_rate": 3.832476418634248e-06, + "loss": 0.7909, + "step": 9452 + }, + { + "epoch": 0.6829338775805083, + "grad_norm": 7.486627992217034, + "learning_rate": 3.8322289170477575e-06, + "loss": 0.928, + "step": 9453 + }, + { + "epoch": 0.6830061227807177, + "grad_norm": 5.41681815693609, + "learning_rate": 3.831981397224087e-06, + "loss": 0.858, + "step": 9454 + }, + { + "epoch": 0.6830783679809272, + "grad_norm": 6.650697209801755, + "learning_rate": 3.831733859166625e-06, + "loss": 0.8841, + "step": 9455 + }, + { + "epoch": 0.6831506131811368, + "grad_norm": 5.212489959194841, + "learning_rate": 3.831486302878761e-06, + "loss": 0.8337, + "step": 9456 + }, + { + "epoch": 0.6832228583813463, + "grad_norm": 7.450556730423688, + "learning_rate": 3.831238728363883e-06, + "loss": 0.8683, + "step": 9457 + }, + { + "epoch": 0.6832951035815558, + "grad_norm": 7.351736587587418, + "learning_rate": 3.830991135625381e-06, + "loss": 0.8155, + "step": 9458 + }, + { + "epoch": 0.6833673487817653, + "grad_norm": 6.64456770593688, + "learning_rate": 3.830743524666643e-06, + "loss": 0.8505, + "step": 9459 + }, + { + "epoch": 0.6834395939819748, + "grad_norm": 5.838678617569867, + "learning_rate": 3.830495895491061e-06, + "loss": 0.7815, + "step": 9460 + }, + { + "epoch": 0.6835118391821844, + "grad_norm": 6.10040577023557, + "learning_rate": 3.830248248102022e-06, + "loss": 0.8427, + "step": 9461 + }, + { + "epoch": 0.6835840843823938, + "grad_norm": 5.923136151576898, + "learning_rate": 3.830000582502918e-06, + "loss": 0.8802, + "step": 9462 + }, + { + "epoch": 0.6836563295826034, + "grad_norm": 7.01511739750876, + "learning_rate": 3.829752898697138e-06, + "loss": 0.8896, + "step": 9463 + }, + { + "epoch": 0.6837285747828129, + "grad_norm": 7.2106692841842985, + "learning_rate": 3.829505196688074e-06, + "loss": 0.8276, + "step": 9464 + }, + { + "epoch": 0.6838008199830223, + "grad_norm": 5.90127014454072, + "learning_rate": 3.829257476479114e-06, + "loss": 0.8595, + "step": 9465 + }, + { + "epoch": 0.6838730651832319, + "grad_norm": 6.272073143688249, + "learning_rate": 3.829009738073653e-06, + "loss": 0.8227, + "step": 9466 + }, + { + "epoch": 0.6839453103834414, + "grad_norm": 5.909937116414244, + "learning_rate": 3.828761981475082e-06, + "loss": 0.8096, + "step": 9467 + }, + { + "epoch": 0.684017555583651, + "grad_norm": 6.990884840684539, + "learning_rate": 3.828514206686789e-06, + "loss": 0.8251, + "step": 9468 + }, + { + "epoch": 0.6840898007838604, + "grad_norm": 5.6934779683188745, + "learning_rate": 3.8282664137121695e-06, + "loss": 0.8445, + "step": 9469 + }, + { + "epoch": 0.6841620459840699, + "grad_norm": 7.046764296784969, + "learning_rate": 3.8280186025546126e-06, + "loss": 0.9287, + "step": 9470 + }, + { + "epoch": 0.6842342911842795, + "grad_norm": 5.093490125895765, + "learning_rate": 3.827770773217513e-06, + "loss": 0.8548, + "step": 9471 + }, + { + "epoch": 0.6843065363844889, + "grad_norm": 8.04944209659317, + "learning_rate": 3.827522925704263e-06, + "loss": 0.868, + "step": 9472 + }, + { + "epoch": 0.6843787815846984, + "grad_norm": 6.659615252496145, + "learning_rate": 3.827275060018254e-06, + "loss": 0.8507, + "step": 9473 + }, + { + "epoch": 0.684451026784908, + "grad_norm": 5.684993988275495, + "learning_rate": 3.8270271761628805e-06, + "loss": 0.8457, + "step": 9474 + }, + { + "epoch": 0.6845232719851175, + "grad_norm": 6.757737705059354, + "learning_rate": 3.8267792741415345e-06, + "loss": 0.8216, + "step": 9475 + }, + { + "epoch": 0.684595517185327, + "grad_norm": 6.5248825939621495, + "learning_rate": 3.826531353957612e-06, + "loss": 0.936, + "step": 9476 + }, + { + "epoch": 0.6846677623855365, + "grad_norm": 6.031435316332063, + "learning_rate": 3.8262834156145035e-06, + "loss": 0.8325, + "step": 9477 + }, + { + "epoch": 0.684740007585746, + "grad_norm": 5.046257241059391, + "learning_rate": 3.826035459115606e-06, + "loss": 0.7969, + "step": 9478 + }, + { + "epoch": 0.6848122527859556, + "grad_norm": 5.773062852568559, + "learning_rate": 3.825787484464312e-06, + "loss": 0.8489, + "step": 9479 + }, + { + "epoch": 0.684884497986165, + "grad_norm": 6.880608784998408, + "learning_rate": 3.825539491664017e-06, + "loss": 0.8809, + "step": 9480 + }, + { + "epoch": 0.6849567431863746, + "grad_norm": 8.448195622577861, + "learning_rate": 3.825291480718116e-06, + "loss": 0.9149, + "step": 9481 + }, + { + "epoch": 0.6850289883865841, + "grad_norm": 6.299019800862683, + "learning_rate": 3.825043451630003e-06, + "loss": 0.799, + "step": 9482 + }, + { + "epoch": 0.6851012335867935, + "grad_norm": 6.12317272099543, + "learning_rate": 3.824795404403074e-06, + "loss": 0.8448, + "step": 9483 + }, + { + "epoch": 0.6851734787870031, + "grad_norm": 8.946209698456101, + "learning_rate": 3.824547339040725e-06, + "loss": 0.8016, + "step": 9484 + }, + { + "epoch": 0.6852457239872126, + "grad_norm": 5.750072312936934, + "learning_rate": 3.824299255546352e-06, + "loss": 0.8095, + "step": 9485 + }, + { + "epoch": 0.6853179691874222, + "grad_norm": 6.485696754407865, + "learning_rate": 3.824051153923349e-06, + "loss": 0.8317, + "step": 9486 + }, + { + "epoch": 0.6853902143876316, + "grad_norm": 8.022655356077708, + "learning_rate": 3.823803034175114e-06, + "loss": 0.8537, + "step": 9487 + }, + { + "epoch": 0.6854624595878411, + "grad_norm": 5.730617529063015, + "learning_rate": 3.823554896305044e-06, + "loss": 0.9229, + "step": 9488 + }, + { + "epoch": 0.6855347047880507, + "grad_norm": 6.7502192708887705, + "learning_rate": 3.823306740316534e-06, + "loss": 0.9249, + "step": 9489 + }, + { + "epoch": 0.6856069499882601, + "grad_norm": 11.241312699416145, + "learning_rate": 3.823058566212984e-06, + "loss": 0.8448, + "step": 9490 + }, + { + "epoch": 0.6856791951884696, + "grad_norm": 6.306361471349721, + "learning_rate": 3.822810373997788e-06, + "loss": 0.8783, + "step": 9491 + }, + { + "epoch": 0.6857514403886792, + "grad_norm": 6.53957817933786, + "learning_rate": 3.822562163674346e-06, + "loss": 0.7916, + "step": 9492 + }, + { + "epoch": 0.6858236855888887, + "grad_norm": 5.99106536663513, + "learning_rate": 3.822313935246055e-06, + "loss": 0.8492, + "step": 9493 + }, + { + "epoch": 0.6858959307890982, + "grad_norm": 5.566515384072711, + "learning_rate": 3.822065688716312e-06, + "loss": 0.8059, + "step": 9494 + }, + { + "epoch": 0.6859681759893077, + "grad_norm": 5.386062947298854, + "learning_rate": 3.821817424088517e-06, + "loss": 0.8078, + "step": 9495 + }, + { + "epoch": 0.6860404211895172, + "grad_norm": 5.6489946399243935, + "learning_rate": 3.821569141366068e-06, + "loss": 0.8641, + "step": 9496 + }, + { + "epoch": 0.6861126663897267, + "grad_norm": 7.9120575454208355, + "learning_rate": 3.821320840552362e-06, + "loss": 0.9127, + "step": 9497 + }, + { + "epoch": 0.6861849115899362, + "grad_norm": 7.4587166401772516, + "learning_rate": 3.821072521650802e-06, + "loss": 0.9433, + "step": 9498 + }, + { + "epoch": 0.6862571567901458, + "grad_norm": 6.355516767642822, + "learning_rate": 3.820824184664783e-06, + "loss": 0.9384, + "step": 9499 + }, + { + "epoch": 0.6863294019903553, + "grad_norm": 6.899551664520436, + "learning_rate": 3.820575829597707e-06, + "loss": 0.8514, + "step": 9500 + }, + { + "epoch": 0.6864016471905647, + "grad_norm": 6.59301753744243, + "learning_rate": 3.820327456452974e-06, + "loss": 0.8427, + "step": 9501 + }, + { + "epoch": 0.6864738923907743, + "grad_norm": 7.415972416420082, + "learning_rate": 3.8200790652339825e-06, + "loss": 0.8538, + "step": 9502 + }, + { + "epoch": 0.6865461375909838, + "grad_norm": 9.572995080028303, + "learning_rate": 3.819830655944134e-06, + "loss": 0.8576, + "step": 9503 + }, + { + "epoch": 0.6866183827911934, + "grad_norm": 7.523532062700759, + "learning_rate": 3.819582228586828e-06, + "loss": 0.8269, + "step": 9504 + }, + { + "epoch": 0.6866906279914028, + "grad_norm": 6.477379391900794, + "learning_rate": 3.819333783165466e-06, + "loss": 0.852, + "step": 9505 + }, + { + "epoch": 0.6867628731916123, + "grad_norm": 6.179869291040523, + "learning_rate": 3.819085319683449e-06, + "loss": 0.8797, + "step": 9506 + }, + { + "epoch": 0.6868351183918219, + "grad_norm": 5.864772904236081, + "learning_rate": 3.818836838144178e-06, + "loss": 0.8952, + "step": 9507 + }, + { + "epoch": 0.6869073635920313, + "grad_norm": 6.394229385437603, + "learning_rate": 3.818588338551055e-06, + "loss": 0.8082, + "step": 9508 + }, + { + "epoch": 0.6869796087922408, + "grad_norm": 5.613217538619384, + "learning_rate": 3.818339820907482e-06, + "loss": 0.8404, + "step": 9509 + }, + { + "epoch": 0.6870518539924504, + "grad_norm": 10.138753709516104, + "learning_rate": 3.81809128521686e-06, + "loss": 0.8025, + "step": 9510 + }, + { + "epoch": 0.6871240991926599, + "grad_norm": 5.111974772216442, + "learning_rate": 3.817842731482591e-06, + "loss": 0.7465, + "step": 9511 + }, + { + "epoch": 0.6871963443928694, + "grad_norm": 7.861520523654928, + "learning_rate": 3.81759415970808e-06, + "loss": 0.9108, + "step": 9512 + }, + { + "epoch": 0.6872685895930789, + "grad_norm": 7.780558482622804, + "learning_rate": 3.817345569896726e-06, + "loss": 0.9047, + "step": 9513 + }, + { + "epoch": 0.6873408347932884, + "grad_norm": 7.664763034609027, + "learning_rate": 3.817096962051935e-06, + "loss": 0.8225, + "step": 9514 + }, + { + "epoch": 0.6874130799934979, + "grad_norm": 6.77606255901692, + "learning_rate": 3.81684833617711e-06, + "loss": 0.8387, + "step": 9515 + }, + { + "epoch": 0.6874853251937074, + "grad_norm": 6.63645721692605, + "learning_rate": 3.816599692275652e-06, + "loss": 0.8203, + "step": 9516 + }, + { + "epoch": 0.687557570393917, + "grad_norm": 6.363708968801104, + "learning_rate": 3.816351030350967e-06, + "loss": 0.8439, + "step": 9517 + }, + { + "epoch": 0.6876298155941265, + "grad_norm": 7.72016529281838, + "learning_rate": 3.816102350406459e-06, + "loss": 0.7994, + "step": 9518 + }, + { + "epoch": 0.6877020607943359, + "grad_norm": 6.124706961959327, + "learning_rate": 3.815853652445533e-06, + "loss": 0.788, + "step": 9519 + }, + { + "epoch": 0.6877743059945455, + "grad_norm": 8.168496478348807, + "learning_rate": 3.8156049364715895e-06, + "loss": 0.9449, + "step": 9520 + }, + { + "epoch": 0.687846551194755, + "grad_norm": 7.079963361138838, + "learning_rate": 3.815356202488038e-06, + "loss": 0.8143, + "step": 9521 + }, + { + "epoch": 0.6879187963949646, + "grad_norm": 4.630716657108579, + "learning_rate": 3.81510745049828e-06, + "loss": 0.8449, + "step": 9522 + }, + { + "epoch": 0.687991041595174, + "grad_norm": 6.243017034113588, + "learning_rate": 3.814858680505723e-06, + "loss": 0.7923, + "step": 9523 + }, + { + "epoch": 0.6880632867953835, + "grad_norm": 6.034924904546504, + "learning_rate": 3.8146098925137714e-06, + "loss": 0.8488, + "step": 9524 + }, + { + "epoch": 0.6881355319955931, + "grad_norm": 6.456882962037427, + "learning_rate": 3.8143610865258308e-06, + "loss": 0.8277, + "step": 9525 + }, + { + "epoch": 0.6882077771958025, + "grad_norm": 6.384258055211063, + "learning_rate": 3.8141122625453074e-06, + "loss": 0.8341, + "step": 9526 + }, + { + "epoch": 0.688280022396012, + "grad_norm": 6.403631801106355, + "learning_rate": 3.8138634205756075e-06, + "loss": 0.9938, + "step": 9527 + }, + { + "epoch": 0.6883522675962216, + "grad_norm": 4.5103579099614866, + "learning_rate": 3.813614560620138e-06, + "loss": 0.8147, + "step": 9528 + }, + { + "epoch": 0.6884245127964311, + "grad_norm": 7.777664032361707, + "learning_rate": 3.813365682682305e-06, + "loss": 0.8398, + "step": 9529 + }, + { + "epoch": 0.6884967579966406, + "grad_norm": 6.113042252716408, + "learning_rate": 3.8131167867655154e-06, + "loss": 0.8173, + "step": 9530 + }, + { + "epoch": 0.6885690031968501, + "grad_norm": 5.732317396833248, + "learning_rate": 3.8128678728731765e-06, + "loss": 0.7524, + "step": 9531 + }, + { + "epoch": 0.6886412483970596, + "grad_norm": 5.9699734537374685, + "learning_rate": 3.8126189410086958e-06, + "loss": 0.7783, + "step": 9532 + }, + { + "epoch": 0.6887134935972691, + "grad_norm": 6.153479867818093, + "learning_rate": 3.8123699911754813e-06, + "loss": 0.8316, + "step": 9533 + }, + { + "epoch": 0.6887857387974786, + "grad_norm": 7.440935383314527, + "learning_rate": 3.8121210233769403e-06, + "loss": 0.786, + "step": 9534 + }, + { + "epoch": 0.6888579839976882, + "grad_norm": 7.110362761910044, + "learning_rate": 3.811872037616482e-06, + "loss": 0.8567, + "step": 9535 + }, + { + "epoch": 0.6889302291978977, + "grad_norm": 9.19045537009282, + "learning_rate": 3.811623033897513e-06, + "loss": 0.8704, + "step": 9536 + }, + { + "epoch": 0.6890024743981071, + "grad_norm": 7.025863007992689, + "learning_rate": 3.8113740122234433e-06, + "loss": 0.8248, + "step": 9537 + }, + { + "epoch": 0.6890747195983167, + "grad_norm": 5.531197068128176, + "learning_rate": 3.8111249725976823e-06, + "loss": 0.8204, + "step": 9538 + }, + { + "epoch": 0.6891469647985262, + "grad_norm": 6.0136151015304815, + "learning_rate": 3.8108759150236375e-06, + "loss": 0.8572, + "step": 9539 + }, + { + "epoch": 0.6892192099987358, + "grad_norm": 6.725819545569916, + "learning_rate": 3.8106268395047203e-06, + "loss": 0.908, + "step": 9540 + }, + { + "epoch": 0.6892914551989452, + "grad_norm": 6.371715522330683, + "learning_rate": 3.810377746044338e-06, + "loss": 0.8417, + "step": 9541 + }, + { + "epoch": 0.6893637003991547, + "grad_norm": 8.296831321242136, + "learning_rate": 3.8101286346459033e-06, + "loss": 0.883, + "step": 9542 + }, + { + "epoch": 0.6894359455993643, + "grad_norm": 8.053862920783873, + "learning_rate": 3.8098795053128235e-06, + "loss": 0.8382, + "step": 9543 + }, + { + "epoch": 0.6895081907995737, + "grad_norm": 5.0392623506880945, + "learning_rate": 3.809630358048512e-06, + "loss": 0.8641, + "step": 9544 + }, + { + "epoch": 0.6895804359997832, + "grad_norm": 6.308098797824527, + "learning_rate": 3.809381192856376e-06, + "loss": 0.8795, + "step": 9545 + }, + { + "epoch": 0.6896526811999928, + "grad_norm": 8.04329173480313, + "learning_rate": 3.8091320097398287e-06, + "loss": 0.8315, + "step": 9546 + }, + { + "epoch": 0.6897249264002023, + "grad_norm": 5.756629107952278, + "learning_rate": 3.808882808702281e-06, + "loss": 0.8307, + "step": 9547 + }, + { + "epoch": 0.6897971716004118, + "grad_norm": 10.400951239291958, + "learning_rate": 3.8086335897471432e-06, + "loss": 0.9127, + "step": 9548 + }, + { + "epoch": 0.6898694168006213, + "grad_norm": 7.738449595515498, + "learning_rate": 3.8083843528778288e-06, + "loss": 0.8396, + "step": 9549 + }, + { + "epoch": 0.6899416620008308, + "grad_norm": 5.668959359505637, + "learning_rate": 3.8081350980977472e-06, + "loss": 0.8669, + "step": 9550 + }, + { + "epoch": 0.6900139072010403, + "grad_norm": 9.017330861145942, + "learning_rate": 3.8078858254103122e-06, + "loss": 0.8416, + "step": 9551 + }, + { + "epoch": 0.6900861524012498, + "grad_norm": 8.648877691767206, + "learning_rate": 3.807636534818936e-06, + "loss": 0.9144, + "step": 9552 + }, + { + "epoch": 0.6901583976014594, + "grad_norm": 6.6330304688493325, + "learning_rate": 3.8073872263270316e-06, + "loss": 0.7488, + "step": 9553 + }, + { + "epoch": 0.6902306428016689, + "grad_norm": 5.613467792764148, + "learning_rate": 3.8071378999380105e-06, + "loss": 0.8271, + "step": 9554 + }, + { + "epoch": 0.6903028880018783, + "grad_norm": 5.136885282768654, + "learning_rate": 3.806888555655286e-06, + "loss": 0.8064, + "step": 9555 + }, + { + "epoch": 0.6903751332020879, + "grad_norm": 6.863018901502662, + "learning_rate": 3.8066391934822733e-06, + "loss": 0.8599, + "step": 9556 + }, + { + "epoch": 0.6904473784022974, + "grad_norm": 5.73634674516045, + "learning_rate": 3.806389813422383e-06, + "loss": 0.8398, + "step": 9557 + }, + { + "epoch": 0.690519623602507, + "grad_norm": 7.281285887535621, + "learning_rate": 3.8061404154790315e-06, + "loss": 0.8475, + "step": 9558 + }, + { + "epoch": 0.6905918688027164, + "grad_norm": 7.033855311593607, + "learning_rate": 3.8058909996556314e-06, + "loss": 0.7975, + "step": 9559 + }, + { + "epoch": 0.6906641140029259, + "grad_norm": 6.939915416982619, + "learning_rate": 3.805641565955598e-06, + "loss": 0.844, + "step": 9560 + }, + { + "epoch": 0.6907363592031355, + "grad_norm": 5.457921233405945, + "learning_rate": 3.8053921143823447e-06, + "loss": 0.8584, + "step": 9561 + }, + { + "epoch": 0.6908086044033449, + "grad_norm": 6.174297697462017, + "learning_rate": 3.805142644939287e-06, + "loss": 0.9722, + "step": 9562 + }, + { + "epoch": 0.6908808496035544, + "grad_norm": 6.069864106770283, + "learning_rate": 3.804893157629841e-06, + "loss": 0.9059, + "step": 9563 + }, + { + "epoch": 0.690953094803764, + "grad_norm": 5.760530531610368, + "learning_rate": 3.8046436524574193e-06, + "loss": 0.8507, + "step": 9564 + }, + { + "epoch": 0.6910253400039735, + "grad_norm": 7.102316099832054, + "learning_rate": 3.8043941294254394e-06, + "loss": 0.8929, + "step": 9565 + }, + { + "epoch": 0.691097585204183, + "grad_norm": 5.737559917860131, + "learning_rate": 3.8041445885373176e-06, + "loss": 0.8624, + "step": 9566 + }, + { + "epoch": 0.6911698304043925, + "grad_norm": 6.106518948585486, + "learning_rate": 3.8038950297964682e-06, + "loss": 0.8813, + "step": 9567 + }, + { + "epoch": 0.691242075604602, + "grad_norm": 6.240287864109566, + "learning_rate": 3.8036454532063083e-06, + "loss": 0.8252, + "step": 9568 + }, + { + "epoch": 0.6913143208048115, + "grad_norm": 5.052093359029464, + "learning_rate": 3.8033958587702535e-06, + "loss": 0.9679, + "step": 9569 + }, + { + "epoch": 0.691386566005021, + "grad_norm": 5.2299860987287055, + "learning_rate": 3.803146246491723e-06, + "loss": 0.8235, + "step": 9570 + }, + { + "epoch": 0.6914588112052306, + "grad_norm": 5.5902076458793415, + "learning_rate": 3.802896616374131e-06, + "loss": 0.8262, + "step": 9571 + }, + { + "epoch": 0.6915310564054401, + "grad_norm": 8.221930639005487, + "learning_rate": 3.8026469684208974e-06, + "loss": 0.8901, + "step": 9572 + }, + { + "epoch": 0.6916033016056495, + "grad_norm": 7.319898986536337, + "learning_rate": 3.8023973026354365e-06, + "loss": 0.8739, + "step": 9573 + }, + { + "epoch": 0.6916755468058591, + "grad_norm": 6.3522186128101445, + "learning_rate": 3.802147619021169e-06, + "loss": 0.8563, + "step": 9574 + }, + { + "epoch": 0.6917477920060686, + "grad_norm": 5.594224962923016, + "learning_rate": 3.801897917581511e-06, + "loss": 0.7802, + "step": 9575 + }, + { + "epoch": 0.6918200372062782, + "grad_norm": 8.017116831659651, + "learning_rate": 3.8016481983198814e-06, + "loss": 0.8952, + "step": 9576 + }, + { + "epoch": 0.6918922824064876, + "grad_norm": 6.584011026903933, + "learning_rate": 3.8013984612396993e-06, + "loss": 0.8498, + "step": 9577 + }, + { + "epoch": 0.6919645276066971, + "grad_norm": 6.622726626178065, + "learning_rate": 3.8011487063443826e-06, + "loss": 0.888, + "step": 9578 + }, + { + "epoch": 0.6920367728069067, + "grad_norm": 7.839301439810118, + "learning_rate": 3.8008989336373497e-06, + "loss": 0.8153, + "step": 9579 + }, + { + "epoch": 0.6921090180071161, + "grad_norm": 8.690832582893696, + "learning_rate": 3.8006491431220203e-06, + "loss": 0.835, + "step": 9580 + }, + { + "epoch": 0.6921812632073256, + "grad_norm": 5.4695407840649475, + "learning_rate": 3.8003993348018153e-06, + "loss": 0.8502, + "step": 9581 + }, + { + "epoch": 0.6922535084075352, + "grad_norm": 7.299218007883538, + "learning_rate": 3.800149508680152e-06, + "loss": 0.8942, + "step": 9582 + }, + { + "epoch": 0.6923257536077447, + "grad_norm": 8.491282312182053, + "learning_rate": 3.7998996647604512e-06, + "loss": 0.924, + "step": 9583 + }, + { + "epoch": 0.6923979988079542, + "grad_norm": 8.794201188757274, + "learning_rate": 3.7996498030461344e-06, + "loss": 0.7621, + "step": 9584 + }, + { + "epoch": 0.6924702440081637, + "grad_norm": 6.440848655463649, + "learning_rate": 3.7993999235406207e-06, + "loss": 0.7875, + "step": 9585 + }, + { + "epoch": 0.6925424892083732, + "grad_norm": 6.4631571157093, + "learning_rate": 3.7991500262473304e-06, + "loss": 0.8813, + "step": 9586 + }, + { + "epoch": 0.6926147344085827, + "grad_norm": 7.601273690958214, + "learning_rate": 3.7989001111696855e-06, + "loss": 0.8185, + "step": 9587 + }, + { + "epoch": 0.6926869796087922, + "grad_norm": 5.350019380053063, + "learning_rate": 3.7986501783111064e-06, + "loss": 0.8671, + "step": 9588 + }, + { + "epoch": 0.6927592248090018, + "grad_norm": 9.181942459075712, + "learning_rate": 3.798400227675014e-06, + "loss": 0.8699, + "step": 9589 + }, + { + "epoch": 0.6928314700092113, + "grad_norm": 7.08302282512972, + "learning_rate": 3.7981502592648316e-06, + "loss": 0.8724, + "step": 9590 + }, + { + "epoch": 0.6929037152094207, + "grad_norm": 5.87270618867619, + "learning_rate": 3.7979002730839796e-06, + "loss": 0.8005, + "step": 9591 + }, + { + "epoch": 0.6929759604096303, + "grad_norm": 5.905903478074627, + "learning_rate": 3.7976502691358814e-06, + "loss": 0.8194, + "step": 9592 + }, + { + "epoch": 0.6930482056098398, + "grad_norm": 6.030005450143454, + "learning_rate": 3.7974002474239578e-06, + "loss": 0.9017, + "step": 9593 + }, + { + "epoch": 0.6931204508100494, + "grad_norm": 5.834655539209703, + "learning_rate": 3.797150207951632e-06, + "loss": 0.8845, + "step": 9594 + }, + { + "epoch": 0.6931926960102588, + "grad_norm": 6.4736648538060955, + "learning_rate": 3.796900150722328e-06, + "loss": 0.9645, + "step": 9595 + }, + { + "epoch": 0.6932649412104683, + "grad_norm": 6.378757341916275, + "learning_rate": 3.7966500757394664e-06, + "loss": 0.8301, + "step": 9596 + }, + { + "epoch": 0.6933371864106779, + "grad_norm": 5.430529632029512, + "learning_rate": 3.7963999830064737e-06, + "loss": 0.8, + "step": 9597 + }, + { + "epoch": 0.6934094316108873, + "grad_norm": 5.538699720632715, + "learning_rate": 3.7961498725267702e-06, + "loss": 0.7661, + "step": 9598 + }, + { + "epoch": 0.6934816768110968, + "grad_norm": 6.73863167114188, + "learning_rate": 3.7958997443037827e-06, + "loss": 0.8137, + "step": 9599 + }, + { + "epoch": 0.6935539220113064, + "grad_norm": 5.440849412631507, + "learning_rate": 3.795649598340933e-06, + "loss": 0.8197, + "step": 9600 + }, + { + "epoch": 0.6936261672115159, + "grad_norm": 8.047170333627713, + "learning_rate": 3.795399434641647e-06, + "loss": 0.8223, + "step": 9601 + }, + { + "epoch": 0.6936984124117254, + "grad_norm": 7.230574266518034, + "learning_rate": 3.795149253209348e-06, + "loss": 0.8273, + "step": 9602 + }, + { + "epoch": 0.6937706576119349, + "grad_norm": 6.0719615606164306, + "learning_rate": 3.794899054047462e-06, + "loss": 0.8061, + "step": 9603 + }, + { + "epoch": 0.6938429028121444, + "grad_norm": 8.05539502681811, + "learning_rate": 3.7946488371594125e-06, + "loss": 0.8569, + "step": 9604 + }, + { + "epoch": 0.6939151480123539, + "grad_norm": 7.819526869175888, + "learning_rate": 3.794398602548626e-06, + "loss": 0.9143, + "step": 9605 + }, + { + "epoch": 0.6939873932125634, + "grad_norm": 6.546068237415226, + "learning_rate": 3.7941483502185282e-06, + "loss": 0.8675, + "step": 9606 + }, + { + "epoch": 0.694059638412773, + "grad_norm": 6.136310954782935, + "learning_rate": 3.793898080172544e-06, + "loss": 0.768, + "step": 9607 + }, + { + "epoch": 0.6941318836129825, + "grad_norm": 4.684304330609571, + "learning_rate": 3.7936477924140993e-06, + "loss": 0.7341, + "step": 9608 + }, + { + "epoch": 0.6942041288131919, + "grad_norm": 5.505414724917185, + "learning_rate": 3.7933974869466207e-06, + "loss": 0.7928, + "step": 9609 + }, + { + "epoch": 0.6942763740134015, + "grad_norm": 6.206420945255569, + "learning_rate": 3.7931471637735357e-06, + "loss": 0.9526, + "step": 9610 + }, + { + "epoch": 0.694348619213611, + "grad_norm": 7.019100560225686, + "learning_rate": 3.79289682289827e-06, + "loss": 0.6897, + "step": 9611 + }, + { + "epoch": 0.6944208644138206, + "grad_norm": 4.452476909391233, + "learning_rate": 3.7926464643242496e-06, + "loss": 0.7655, + "step": 9612 + }, + { + "epoch": 0.69449310961403, + "grad_norm": 4.73261613561258, + "learning_rate": 3.792396088054904e-06, + "loss": 0.8012, + "step": 9613 + }, + { + "epoch": 0.6945653548142395, + "grad_norm": 6.70389920545385, + "learning_rate": 3.7921456940936586e-06, + "loss": 0.8471, + "step": 9614 + }, + { + "epoch": 0.6946376000144491, + "grad_norm": 7.6999849492706245, + "learning_rate": 3.791895282443942e-06, + "loss": 0.7722, + "step": 9615 + }, + { + "epoch": 0.6947098452146585, + "grad_norm": 7.435684110649092, + "learning_rate": 3.791644853109182e-06, + "loss": 0.9324, + "step": 9616 + }, + { + "epoch": 0.694782090414868, + "grad_norm": 5.804783446801251, + "learning_rate": 3.791394406092807e-06, + "loss": 0.7545, + "step": 9617 + }, + { + "epoch": 0.6948543356150776, + "grad_norm": 5.77237031852932, + "learning_rate": 3.7911439413982453e-06, + "loss": 0.9211, + "step": 9618 + }, + { + "epoch": 0.6949265808152871, + "grad_norm": 8.171931950387119, + "learning_rate": 3.790893459028926e-06, + "loss": 0.8393, + "step": 9619 + }, + { + "epoch": 0.6949988260154966, + "grad_norm": 6.280260909668153, + "learning_rate": 3.7906429589882778e-06, + "loss": 0.8794, + "step": 9620 + }, + { + "epoch": 0.6950710712157061, + "grad_norm": 5.955642127994116, + "learning_rate": 3.790392441279728e-06, + "loss": 0.8694, + "step": 9621 + }, + { + "epoch": 0.6951433164159156, + "grad_norm": 7.706849596345033, + "learning_rate": 3.790141905906709e-06, + "loss": 0.8852, + "step": 9622 + }, + { + "epoch": 0.6952155616161251, + "grad_norm": 6.6158035074043715, + "learning_rate": 3.7898913528726484e-06, + "loss": 0.9381, + "step": 9623 + }, + { + "epoch": 0.6952878068163346, + "grad_norm": 5.8503778580142125, + "learning_rate": 3.789640782180977e-06, + "loss": 0.7794, + "step": 9624 + }, + { + "epoch": 0.6953600520165442, + "grad_norm": 6.786931764762411, + "learning_rate": 3.7893901938351245e-06, + "loss": 0.7637, + "step": 9625 + }, + { + "epoch": 0.6954322972167537, + "grad_norm": 8.406140181380717, + "learning_rate": 3.789139587838522e-06, + "loss": 0.8727, + "step": 9626 + }, + { + "epoch": 0.6955045424169631, + "grad_norm": 9.05613453372194, + "learning_rate": 3.7888889641945982e-06, + "loss": 0.8504, + "step": 9627 + }, + { + "epoch": 0.6955767876171727, + "grad_norm": 6.0277260086172575, + "learning_rate": 3.7886383229067864e-06, + "loss": 0.8325, + "step": 9628 + }, + { + "epoch": 0.6956490328173822, + "grad_norm": 7.111799359129286, + "learning_rate": 3.7883876639785164e-06, + "loss": 0.8133, + "step": 9629 + }, + { + "epoch": 0.6957212780175918, + "grad_norm": 6.978911923746015, + "learning_rate": 3.788136987413219e-06, + "loss": 0.8597, + "step": 9630 + }, + { + "epoch": 0.6957935232178012, + "grad_norm": 5.893350066999841, + "learning_rate": 3.7878862932143268e-06, + "loss": 0.7851, + "step": 9631 + }, + { + "epoch": 0.6958657684180107, + "grad_norm": 6.462856390624046, + "learning_rate": 3.787635581385271e-06, + "loss": 0.9467, + "step": 9632 + }, + { + "epoch": 0.6959380136182203, + "grad_norm": 7.170986411370513, + "learning_rate": 3.787384851929484e-06, + "loss": 0.8109, + "step": 9633 + }, + { + "epoch": 0.6960102588184297, + "grad_norm": 8.053611409689521, + "learning_rate": 3.7871341048503984e-06, + "loss": 0.9164, + "step": 9634 + }, + { + "epoch": 0.6960825040186392, + "grad_norm": 6.410300263083674, + "learning_rate": 3.786883340151446e-06, + "loss": 0.8962, + "step": 9635 + }, + { + "epoch": 0.6961547492188488, + "grad_norm": 6.699459896807768, + "learning_rate": 3.7866325578360598e-06, + "loss": 0.7915, + "step": 9636 + }, + { + "epoch": 0.6962269944190583, + "grad_norm": 7.1255388558056225, + "learning_rate": 3.7863817579076734e-06, + "loss": 0.8219, + "step": 9637 + }, + { + "epoch": 0.6962992396192678, + "grad_norm": 6.248749874974994, + "learning_rate": 3.786130940369719e-06, + "loss": 0.8549, + "step": 9638 + }, + { + "epoch": 0.6963714848194773, + "grad_norm": 6.565992897180903, + "learning_rate": 3.7858801052256307e-06, + "loss": 0.8555, + "step": 9639 + }, + { + "epoch": 0.6964437300196868, + "grad_norm": 6.746415776097305, + "learning_rate": 3.7856292524788417e-06, + "loss": 0.8237, + "step": 9640 + }, + { + "epoch": 0.6965159752198963, + "grad_norm": 7.170988539220982, + "learning_rate": 3.785378382132787e-06, + "loss": 0.8567, + "step": 9641 + }, + { + "epoch": 0.6965882204201058, + "grad_norm": 6.8715709110718395, + "learning_rate": 3.7851274941909004e-06, + "loss": 0.8407, + "step": 9642 + }, + { + "epoch": 0.6966604656203154, + "grad_norm": 5.895290965372808, + "learning_rate": 3.7848765886566163e-06, + "loss": 0.8479, + "step": 9643 + }, + { + "epoch": 0.6967327108205249, + "grad_norm": 7.614348178751472, + "learning_rate": 3.784625665533369e-06, + "loss": 0.8806, + "step": 9644 + }, + { + "epoch": 0.6968049560207343, + "grad_norm": 5.503721711941394, + "learning_rate": 3.7843747248245937e-06, + "loss": 0.8729, + "step": 9645 + }, + { + "epoch": 0.6968772012209439, + "grad_norm": 5.428230188873574, + "learning_rate": 3.784123766533726e-06, + "loss": 0.8064, + "step": 9646 + }, + { + "epoch": 0.6969494464211534, + "grad_norm": 7.355290083049788, + "learning_rate": 3.7838727906642014e-06, + "loss": 0.8437, + "step": 9647 + }, + { + "epoch": 0.697021691621363, + "grad_norm": 7.073122661064687, + "learning_rate": 3.7836217972194546e-06, + "loss": 0.903, + "step": 9648 + }, + { + "epoch": 0.6970939368215724, + "grad_norm": 6.614967667435037, + "learning_rate": 3.783370786202922e-06, + "loss": 0.7687, + "step": 9649 + }, + { + "epoch": 0.6971661820217819, + "grad_norm": 5.1975182920191845, + "learning_rate": 3.783119757618039e-06, + "loss": 0.8416, + "step": 9650 + }, + { + "epoch": 0.6972384272219915, + "grad_norm": 8.513848187624328, + "learning_rate": 3.7828687114682444e-06, + "loss": 0.8522, + "step": 9651 + }, + { + "epoch": 0.6973106724222009, + "grad_norm": 6.365870502215432, + "learning_rate": 3.782617647756972e-06, + "loss": 0.8569, + "step": 9652 + }, + { + "epoch": 0.6973829176224104, + "grad_norm": 7.311740411937674, + "learning_rate": 3.7823665664876606e-06, + "loss": 0.9081, + "step": 9653 + }, + { + "epoch": 0.69745516282262, + "grad_norm": 7.123629789608282, + "learning_rate": 3.7821154676637465e-06, + "loss": 0.7606, + "step": 9654 + }, + { + "epoch": 0.6975274080228295, + "grad_norm": 6.728215704491207, + "learning_rate": 3.7818643512886673e-06, + "loss": 0.863, + "step": 9655 + }, + { + "epoch": 0.697599653223039, + "grad_norm": 9.065762899271125, + "learning_rate": 3.7816132173658605e-06, + "loss": 0.8504, + "step": 9656 + }, + { + "epoch": 0.6976718984232485, + "grad_norm": 8.723673862355142, + "learning_rate": 3.781362065898763e-06, + "loss": 0.9153, + "step": 9657 + }, + { + "epoch": 0.697744143623458, + "grad_norm": 6.543289112959283, + "learning_rate": 3.781110896890815e-06, + "loss": 0.8191, + "step": 9658 + }, + { + "epoch": 0.6978163888236675, + "grad_norm": 6.326891517515854, + "learning_rate": 3.7808597103454525e-06, + "loss": 0.7784, + "step": 9659 + }, + { + "epoch": 0.697888634023877, + "grad_norm": 6.0554932679561055, + "learning_rate": 3.7806085062661153e-06, + "loss": 0.7813, + "step": 9660 + }, + { + "epoch": 0.6979608792240866, + "grad_norm": 6.855694663300448, + "learning_rate": 3.780357284656242e-06, + "loss": 0.8677, + "step": 9661 + }, + { + "epoch": 0.6980331244242961, + "grad_norm": 6.941718116499369, + "learning_rate": 3.7801060455192717e-06, + "loss": 0.8548, + "step": 9662 + }, + { + "epoch": 0.6981053696245055, + "grad_norm": 8.630574069947809, + "learning_rate": 3.779854788858644e-06, + "loss": 0.8474, + "step": 9663 + }, + { + "epoch": 0.6981776148247151, + "grad_norm": 5.332970169736451, + "learning_rate": 3.7796035146777966e-06, + "loss": 0.8754, + "step": 9664 + }, + { + "epoch": 0.6982498600249246, + "grad_norm": 7.5389448561902705, + "learning_rate": 3.779352222980172e-06, + "loss": 0.8698, + "step": 9665 + }, + { + "epoch": 0.6983221052251342, + "grad_norm": 8.019292932862998, + "learning_rate": 3.779100913769208e-06, + "loss": 0.7918, + "step": 9666 + }, + { + "epoch": 0.6983943504253436, + "grad_norm": 5.650605646941405, + "learning_rate": 3.7788495870483467e-06, + "loss": 0.8365, + "step": 9667 + }, + { + "epoch": 0.6984665956255531, + "grad_norm": 5.295357365782564, + "learning_rate": 3.778598242821027e-06, + "loss": 0.8199, + "step": 9668 + }, + { + "epoch": 0.6985388408257627, + "grad_norm": 5.613157054612937, + "learning_rate": 3.7783468810906897e-06, + "loss": 0.8293, + "step": 9669 + }, + { + "epoch": 0.6986110860259721, + "grad_norm": 7.081884737177249, + "learning_rate": 3.778095501860777e-06, + "loss": 0.9037, + "step": 9670 + }, + { + "epoch": 0.6986833312261816, + "grad_norm": 7.586261589128181, + "learning_rate": 3.777844105134728e-06, + "loss": 0.8255, + "step": 9671 + }, + { + "epoch": 0.6987555764263912, + "grad_norm": 5.505178614463698, + "learning_rate": 3.7775926909159865e-06, + "loss": 0.7853, + "step": 9672 + }, + { + "epoch": 0.6988278216266007, + "grad_norm": 8.871894830491442, + "learning_rate": 3.777341259207993e-06, + "loss": 0.8955, + "step": 9673 + }, + { + "epoch": 0.6989000668268102, + "grad_norm": 5.933577677521023, + "learning_rate": 3.7770898100141885e-06, + "loss": 0.7451, + "step": 9674 + }, + { + "epoch": 0.6989723120270197, + "grad_norm": 5.62740240716557, + "learning_rate": 3.776838343338017e-06, + "loss": 0.8039, + "step": 9675 + }, + { + "epoch": 0.6990445572272292, + "grad_norm": 6.176606115355588, + "learning_rate": 3.7765868591829196e-06, + "loss": 0.8627, + "step": 9676 + }, + { + "epoch": 0.6991168024274387, + "grad_norm": 5.121756480942182, + "learning_rate": 3.77633535755234e-06, + "loss": 0.7731, + "step": 9677 + }, + { + "epoch": 0.6991890476276482, + "grad_norm": 5.351458427363954, + "learning_rate": 3.7760838384497188e-06, + "loss": 0.8901, + "step": 9678 + }, + { + "epoch": 0.6992612928278578, + "grad_norm": 4.861529004738737, + "learning_rate": 3.775832301878502e-06, + "loss": 0.7641, + "step": 9679 + }, + { + "epoch": 0.6993335380280673, + "grad_norm": 8.486919885510078, + "learning_rate": 3.7755807478421312e-06, + "loss": 0.8535, + "step": 9680 + }, + { + "epoch": 0.6994057832282767, + "grad_norm": 6.027846250521149, + "learning_rate": 3.775329176344051e-06, + "loss": 0.8725, + "step": 9681 + }, + { + "epoch": 0.6994780284284863, + "grad_norm": 8.211099639366074, + "learning_rate": 3.7750775873877033e-06, + "loss": 0.8637, + "step": 9682 + }, + { + "epoch": 0.6995502736286958, + "grad_norm": 5.944210396473936, + "learning_rate": 3.774825980976534e-06, + "loss": 0.809, + "step": 9683 + }, + { + "epoch": 0.6996225188289054, + "grad_norm": 6.315960832412775, + "learning_rate": 3.7745743571139872e-06, + "loss": 0.8766, + "step": 9684 + }, + { + "epoch": 0.6996947640291148, + "grad_norm": 4.909990573415622, + "learning_rate": 3.7743227158035072e-06, + "loss": 0.8338, + "step": 9685 + }, + { + "epoch": 0.6997670092293243, + "grad_norm": 5.56159194220869, + "learning_rate": 3.7740710570485383e-06, + "loss": 0.8929, + "step": 9686 + }, + { + "epoch": 0.6998392544295339, + "grad_norm": 5.216572198631975, + "learning_rate": 3.7738193808525257e-06, + "loss": 0.8164, + "step": 9687 + }, + { + "epoch": 0.6999114996297433, + "grad_norm": 6.634548575618848, + "learning_rate": 3.773567687218915e-06, + "loss": 0.8654, + "step": 9688 + }, + { + "epoch": 0.6999837448299528, + "grad_norm": 5.930042281361932, + "learning_rate": 3.7733159761511516e-06, + "loss": 0.9486, + "step": 9689 + }, + { + "epoch": 0.7000559900301624, + "grad_norm": 5.8256536157987915, + "learning_rate": 3.773064247652682e-06, + "loss": 0.8456, + "step": 9690 + }, + { + "epoch": 0.7001282352303719, + "grad_norm": 5.1862598453482525, + "learning_rate": 3.77281250172695e-06, + "loss": 0.8684, + "step": 9691 + }, + { + "epoch": 0.7002004804305814, + "grad_norm": 6.086578687942161, + "learning_rate": 3.772560738377404e-06, + "loss": 0.8835, + "step": 9692 + }, + { + "epoch": 0.7002727256307909, + "grad_norm": 5.640497179775786, + "learning_rate": 3.772308957607489e-06, + "loss": 0.826, + "step": 9693 + }, + { + "epoch": 0.7003449708310004, + "grad_norm": 7.512311875309362, + "learning_rate": 3.7720571594206522e-06, + "loss": 0.8953, + "step": 9694 + }, + { + "epoch": 0.7004172160312099, + "grad_norm": 7.323029070046233, + "learning_rate": 3.7718053438203417e-06, + "loss": 0.7829, + "step": 9695 + }, + { + "epoch": 0.7004894612314194, + "grad_norm": 6.169399412035552, + "learning_rate": 3.771553510810002e-06, + "loss": 0.9676, + "step": 9696 + }, + { + "epoch": 0.700561706431629, + "grad_norm": 5.039813413042317, + "learning_rate": 3.771301660393083e-06, + "loss": 0.8196, + "step": 9697 + }, + { + "epoch": 0.7006339516318385, + "grad_norm": 6.598643724207531, + "learning_rate": 3.7710497925730307e-06, + "loss": 0.879, + "step": 9698 + }, + { + "epoch": 0.7007061968320479, + "grad_norm": 7.512607404858266, + "learning_rate": 3.770797907353294e-06, + "loss": 0.8299, + "step": 9699 + }, + { + "epoch": 0.7007784420322575, + "grad_norm": 6.783446909760484, + "learning_rate": 3.770546004737321e-06, + "loss": 0.8047, + "step": 9700 + }, + { + "epoch": 0.700850687232467, + "grad_norm": 6.538204888877781, + "learning_rate": 3.7702940847285596e-06, + "loss": 0.8632, + "step": 9701 + }, + { + "epoch": 0.7009229324326764, + "grad_norm": 8.053343785681268, + "learning_rate": 3.770042147330458e-06, + "loss": 0.8416, + "step": 9702 + }, + { + "epoch": 0.700995177632886, + "grad_norm": 6.694981178445263, + "learning_rate": 3.7697901925464657e-06, + "loss": 0.8833, + "step": 9703 + }, + { + "epoch": 0.7010674228330955, + "grad_norm": 5.287501861355501, + "learning_rate": 3.7695382203800326e-06, + "loss": 0.8059, + "step": 9704 + }, + { + "epoch": 0.7011396680333051, + "grad_norm": 5.315927275103513, + "learning_rate": 3.7692862308346056e-06, + "loss": 0.8305, + "step": 9705 + }, + { + "epoch": 0.7012119132335145, + "grad_norm": 5.678435091779489, + "learning_rate": 3.769034223913637e-06, + "loss": 0.8489, + "step": 9706 + }, + { + "epoch": 0.701284158433724, + "grad_norm": 5.4623771810800275, + "learning_rate": 3.7687821996205733e-06, + "loss": 0.8514, + "step": 9707 + }, + { + "epoch": 0.7013564036339336, + "grad_norm": 5.430602159920752, + "learning_rate": 3.768530157958867e-06, + "loss": 0.838, + "step": 9708 + }, + { + "epoch": 0.7014286488341431, + "grad_norm": 5.811033330477457, + "learning_rate": 3.768278098931969e-06, + "loss": 0.8014, + "step": 9709 + }, + { + "epoch": 0.7015008940343526, + "grad_norm": 5.62368199483907, + "learning_rate": 3.768026022543328e-06, + "loss": 0.8033, + "step": 9710 + }, + { + "epoch": 0.7015731392345621, + "grad_norm": 5.672284240250771, + "learning_rate": 3.767773928796395e-06, + "loss": 0.817, + "step": 9711 + }, + { + "epoch": 0.7016453844347716, + "grad_norm": 7.284838402080968, + "learning_rate": 3.7675218176946214e-06, + "loss": 0.8927, + "step": 9712 + }, + { + "epoch": 0.7017176296349811, + "grad_norm": 8.070507242931086, + "learning_rate": 3.7672696892414586e-06, + "loss": 0.874, + "step": 9713 + }, + { + "epoch": 0.7017898748351906, + "grad_norm": 7.517852897112126, + "learning_rate": 3.767017543440357e-06, + "loss": 0.8328, + "step": 9714 + }, + { + "epoch": 0.7018621200354002, + "grad_norm": 5.276787853540213, + "learning_rate": 3.7667653802947703e-06, + "loss": 0.7872, + "step": 9715 + }, + { + "epoch": 0.7019343652356097, + "grad_norm": 6.646397135226753, + "learning_rate": 3.7665131998081478e-06, + "loss": 0.906, + "step": 9716 + }, + { + "epoch": 0.7020066104358191, + "grad_norm": 6.051583440711412, + "learning_rate": 3.7662610019839437e-06, + "loss": 0.8083, + "step": 9717 + }, + { + "epoch": 0.7020788556360287, + "grad_norm": 6.24468829462069, + "learning_rate": 3.766008786825609e-06, + "loss": 0.9174, + "step": 9718 + }, + { + "epoch": 0.7021511008362382, + "grad_norm": 9.185150053556429, + "learning_rate": 3.765756554336598e-06, + "loss": 0.814, + "step": 9719 + }, + { + "epoch": 0.7022233460364476, + "grad_norm": 7.553893938171914, + "learning_rate": 3.765504304520362e-06, + "loss": 0.9073, + "step": 9720 + }, + { + "epoch": 0.7022955912366572, + "grad_norm": 6.036740664796187, + "learning_rate": 3.7652520373803544e-06, + "loss": 0.9424, + "step": 9721 + }, + { + "epoch": 0.7023678364368667, + "grad_norm": 9.9162400952656, + "learning_rate": 3.764999752920029e-06, + "loss": 0.78, + "step": 9722 + }, + { + "epoch": 0.7024400816370763, + "grad_norm": 7.9695991811632805, + "learning_rate": 3.7647474511428394e-06, + "loss": 0.9082, + "step": 9723 + }, + { + "epoch": 0.7025123268372857, + "grad_norm": 7.099633016638766, + "learning_rate": 3.7644951320522393e-06, + "loss": 0.9241, + "step": 9724 + }, + { + "epoch": 0.7025845720374952, + "grad_norm": 5.650585900369629, + "learning_rate": 3.7642427956516824e-06, + "loss": 0.8186, + "step": 9725 + }, + { + "epoch": 0.7026568172377048, + "grad_norm": 9.367770650113345, + "learning_rate": 3.763990441944623e-06, + "loss": 0.7936, + "step": 9726 + }, + { + "epoch": 0.7027290624379143, + "grad_norm": 6.249279438443752, + "learning_rate": 3.763738070934516e-06, + "loss": 0.7884, + "step": 9727 + }, + { + "epoch": 0.7028013076381238, + "grad_norm": 5.699489824987559, + "learning_rate": 3.763485682624817e-06, + "loss": 0.8117, + "step": 9728 + }, + { + "epoch": 0.7028735528383333, + "grad_norm": 6.40202676152431, + "learning_rate": 3.7632332770189796e-06, + "loss": 0.7741, + "step": 9729 + }, + { + "epoch": 0.7029457980385428, + "grad_norm": 8.099768797789029, + "learning_rate": 3.7629808541204583e-06, + "loss": 0.9076, + "step": 9730 + }, + { + "epoch": 0.7030180432387523, + "grad_norm": 6.557777085843738, + "learning_rate": 3.7627284139327104e-06, + "loss": 0.8865, + "step": 9731 + }, + { + "epoch": 0.7030902884389618, + "grad_norm": 6.71206801177021, + "learning_rate": 3.762475956459191e-06, + "loss": 0.864, + "step": 9732 + }, + { + "epoch": 0.7031625336391714, + "grad_norm": 6.935639002925848, + "learning_rate": 3.7622234817033564e-06, + "loss": 0.8222, + "step": 9733 + }, + { + "epoch": 0.7032347788393809, + "grad_norm": 5.927527475281312, + "learning_rate": 3.7619709896686616e-06, + "loss": 0.8074, + "step": 9734 + }, + { + "epoch": 0.7033070240395903, + "grad_norm": 5.1826193036610055, + "learning_rate": 3.7617184803585648e-06, + "loss": 0.8115, + "step": 9735 + }, + { + "epoch": 0.7033792692397999, + "grad_norm": 6.322518735762078, + "learning_rate": 3.761465953776522e-06, + "loss": 0.9187, + "step": 9736 + }, + { + "epoch": 0.7034515144400094, + "grad_norm": 6.945498692491587, + "learning_rate": 3.761213409925988e-06, + "loss": 0.8136, + "step": 9737 + }, + { + "epoch": 0.7035237596402188, + "grad_norm": 5.5843430170766775, + "learning_rate": 3.7609608488104233e-06, + "loss": 0.8429, + "step": 9738 + }, + { + "epoch": 0.7035960048404284, + "grad_norm": 7.030911450182864, + "learning_rate": 3.760708270433283e-06, + "loss": 0.9103, + "step": 9739 + }, + { + "epoch": 0.7036682500406379, + "grad_norm": 6.8251225834272455, + "learning_rate": 3.7604556747980246e-06, + "loss": 0.8246, + "step": 9740 + }, + { + "epoch": 0.7037404952408475, + "grad_norm": 5.270340887695005, + "learning_rate": 3.7602030619081074e-06, + "loss": 0.732, + "step": 9741 + }, + { + "epoch": 0.7038127404410569, + "grad_norm": 5.502730298766858, + "learning_rate": 3.7599504317669895e-06, + "loss": 0.8521, + "step": 9742 + }, + { + "epoch": 0.7038849856412664, + "grad_norm": 6.149716390877909, + "learning_rate": 3.759697784378128e-06, + "loss": 0.8095, + "step": 9743 + }, + { + "epoch": 0.703957230841476, + "grad_norm": 7.698813696299077, + "learning_rate": 3.7594451197449814e-06, + "loss": 0.7384, + "step": 9744 + }, + { + "epoch": 0.7040294760416855, + "grad_norm": 6.345096539183051, + "learning_rate": 3.7591924378710094e-06, + "loss": 0.8493, + "step": 9745 + }, + { + "epoch": 0.704101721241895, + "grad_norm": 5.488069251910869, + "learning_rate": 3.7589397387596705e-06, + "loss": 0.8142, + "step": 9746 + }, + { + "epoch": 0.7041739664421045, + "grad_norm": 6.240696811406122, + "learning_rate": 3.7586870224144247e-06, + "loss": 0.841, + "step": 9747 + }, + { + "epoch": 0.704246211642314, + "grad_norm": 7.664623928213145, + "learning_rate": 3.7584342888387297e-06, + "loss": 0.9662, + "step": 9748 + }, + { + "epoch": 0.7043184568425235, + "grad_norm": 6.064445891199389, + "learning_rate": 3.7581815380360464e-06, + "loss": 0.792, + "step": 9749 + }, + { + "epoch": 0.704390702042733, + "grad_norm": 6.824437313149948, + "learning_rate": 3.7579287700098353e-06, + "loss": 0.8507, + "step": 9750 + }, + { + "epoch": 0.7044629472429426, + "grad_norm": 6.8406102890895095, + "learning_rate": 3.7576759847635567e-06, + "loss": 0.8144, + "step": 9751 + }, + { + "epoch": 0.7045351924431521, + "grad_norm": 8.192429318115856, + "learning_rate": 3.7574231823006703e-06, + "loss": 0.8036, + "step": 9752 + }, + { + "epoch": 0.7046074376433615, + "grad_norm": 9.386322508412631, + "learning_rate": 3.757170362624636e-06, + "loss": 0.8868, + "step": 9753 + }, + { + "epoch": 0.7046796828435711, + "grad_norm": 5.319897516453798, + "learning_rate": 3.7569175257389155e-06, + "loss": 0.8692, + "step": 9754 + }, + { + "epoch": 0.7047519280437806, + "grad_norm": 5.855453114555509, + "learning_rate": 3.7566646716469708e-06, + "loss": 0.8183, + "step": 9755 + }, + { + "epoch": 0.70482417324399, + "grad_norm": 6.265960151669851, + "learning_rate": 3.756411800352262e-06, + "loss": 0.8441, + "step": 9756 + }, + { + "epoch": 0.7048964184441996, + "grad_norm": 5.358508337474451, + "learning_rate": 3.7561589118582513e-06, + "loss": 0.8148, + "step": 9757 + }, + { + "epoch": 0.7049686636444091, + "grad_norm": 6.172200718264351, + "learning_rate": 3.7559060061684006e-06, + "loss": 0.7529, + "step": 9758 + }, + { + "epoch": 0.7050409088446187, + "grad_norm": 5.639058157014641, + "learning_rate": 3.7556530832861714e-06, + "loss": 0.909, + "step": 9759 + }, + { + "epoch": 0.7051131540448281, + "grad_norm": 8.382656867439259, + "learning_rate": 3.755400143215027e-06, + "loss": 0.9029, + "step": 9760 + }, + { + "epoch": 0.7051853992450376, + "grad_norm": 5.2800600415486185, + "learning_rate": 3.7551471859584294e-06, + "loss": 0.8187, + "step": 9761 + }, + { + "epoch": 0.7052576444452472, + "grad_norm": 6.108962881720135, + "learning_rate": 3.7548942115198407e-06, + "loss": 0.8424, + "step": 9762 + }, + { + "epoch": 0.7053298896454567, + "grad_norm": 5.529776511617139, + "learning_rate": 3.754641219902725e-06, + "loss": 0.801, + "step": 9763 + }, + { + "epoch": 0.7054021348456662, + "grad_norm": 5.5758493092998425, + "learning_rate": 3.754388211110545e-06, + "loss": 0.8345, + "step": 9764 + }, + { + "epoch": 0.7054743800458757, + "grad_norm": 6.633315715414961, + "learning_rate": 3.7541351851467652e-06, + "loss": 0.8419, + "step": 9765 + }, + { + "epoch": 0.7055466252460852, + "grad_norm": 7.304567909281739, + "learning_rate": 3.7538821420148476e-06, + "loss": 0.7779, + "step": 9766 + }, + { + "epoch": 0.7056188704462947, + "grad_norm": 7.064622678190611, + "learning_rate": 3.7536290817182576e-06, + "loss": 0.8195, + "step": 9767 + }, + { + "epoch": 0.7056911156465042, + "grad_norm": 5.585140486190861, + "learning_rate": 3.7533760042604585e-06, + "loss": 0.8694, + "step": 9768 + }, + { + "epoch": 0.7057633608467138, + "grad_norm": 7.333383328816467, + "learning_rate": 3.7531229096449145e-06, + "loss": 0.7572, + "step": 9769 + }, + { + "epoch": 0.7058356060469233, + "grad_norm": 7.877338122874986, + "learning_rate": 3.7528697978750915e-06, + "loss": 0.8527, + "step": 9770 + }, + { + "epoch": 0.7059078512471327, + "grad_norm": 4.597262953122896, + "learning_rate": 3.7526166689544543e-06, + "loss": 0.84, + "step": 9771 + }, + { + "epoch": 0.7059800964473423, + "grad_norm": 6.5151674398129495, + "learning_rate": 3.752363522886467e-06, + "loss": 0.8182, + "step": 9772 + }, + { + "epoch": 0.7060523416475518, + "grad_norm": 6.1418440290675385, + "learning_rate": 3.7521103596745944e-06, + "loss": 0.7935, + "step": 9773 + }, + { + "epoch": 0.7061245868477612, + "grad_norm": 7.582464426864657, + "learning_rate": 3.7518571793223047e-06, + "loss": 0.8963, + "step": 9774 + }, + { + "epoch": 0.7061968320479708, + "grad_norm": 7.594994964899299, + "learning_rate": 3.7516039818330617e-06, + "loss": 0.9011, + "step": 9775 + }, + { + "epoch": 0.7062690772481803, + "grad_norm": 5.962108492043602, + "learning_rate": 3.7513507672103323e-06, + "loss": 0.7514, + "step": 9776 + }, + { + "epoch": 0.7063413224483899, + "grad_norm": 5.931823908511765, + "learning_rate": 3.7510975354575816e-06, + "loss": 0.7922, + "step": 9777 + }, + { + "epoch": 0.7064135676485993, + "grad_norm": 6.890160508189434, + "learning_rate": 3.750844286578278e-06, + "loss": 0.8198, + "step": 9778 + }, + { + "epoch": 0.7064858128488088, + "grad_norm": 7.338614498415056, + "learning_rate": 3.7505910205758864e-06, + "loss": 0.8703, + "step": 9779 + }, + { + "epoch": 0.7065580580490184, + "grad_norm": 5.095993203342762, + "learning_rate": 3.7503377374538757e-06, + "loss": 0.7942, + "step": 9780 + }, + { + "epoch": 0.7066303032492279, + "grad_norm": 5.267585909975245, + "learning_rate": 3.750084437215712e-06, + "loss": 0.8037, + "step": 9781 + }, + { + "epoch": 0.7067025484494374, + "grad_norm": 7.097653562705875, + "learning_rate": 3.749831119864863e-06, + "loss": 0.8234, + "step": 9782 + }, + { + "epoch": 0.7067747936496469, + "grad_norm": 6.547629811331749, + "learning_rate": 3.7495777854047956e-06, + "loss": 0.8825, + "step": 9783 + }, + { + "epoch": 0.7068470388498564, + "grad_norm": 6.180478514382851, + "learning_rate": 3.7493244338389788e-06, + "loss": 0.9105, + "step": 9784 + }, + { + "epoch": 0.7069192840500659, + "grad_norm": 7.899296654403944, + "learning_rate": 3.749071065170882e-06, + "loss": 0.9183, + "step": 9785 + }, + { + "epoch": 0.7069915292502754, + "grad_norm": 6.178644181444537, + "learning_rate": 3.748817679403971e-06, + "loss": 0.9109, + "step": 9786 + }, + { + "epoch": 0.707063774450485, + "grad_norm": 5.459296027552918, + "learning_rate": 3.7485642765417153e-06, + "loss": 0.859, + "step": 9787 + }, + { + "epoch": 0.7071360196506945, + "grad_norm": 7.256922606740053, + "learning_rate": 3.748310856587585e-06, + "loss": 0.8897, + "step": 9788 + }, + { + "epoch": 0.7072082648509039, + "grad_norm": 5.70058637581049, + "learning_rate": 3.748057419545047e-06, + "loss": 0.8319, + "step": 9789 + }, + { + "epoch": 0.7072805100511135, + "grad_norm": 6.315839733793816, + "learning_rate": 3.747803965417573e-06, + "loss": 0.8278, + "step": 9790 + }, + { + "epoch": 0.707352755251323, + "grad_norm": 7.226919068483617, + "learning_rate": 3.7475504942086315e-06, + "loss": 0.8467, + "step": 9791 + }, + { + "epoch": 0.7074250004515324, + "grad_norm": 5.477892922220146, + "learning_rate": 3.747297005921692e-06, + "loss": 0.9096, + "step": 9792 + }, + { + "epoch": 0.707497245651742, + "grad_norm": 5.8790282379370264, + "learning_rate": 3.7470435005602256e-06, + "loss": 0.8307, + "step": 9793 + }, + { + "epoch": 0.7075694908519515, + "grad_norm": 6.214250786844451, + "learning_rate": 3.7467899781277014e-06, + "loss": 0.8411, + "step": 9794 + }, + { + "epoch": 0.7076417360521611, + "grad_norm": 7.334565088111106, + "learning_rate": 3.7465364386275903e-06, + "loss": 0.8999, + "step": 9795 + }, + { + "epoch": 0.7077139812523705, + "grad_norm": 6.708191706266121, + "learning_rate": 3.746282882063364e-06, + "loss": 0.8489, + "step": 9796 + }, + { + "epoch": 0.70778622645258, + "grad_norm": 5.8057536690151785, + "learning_rate": 3.746029308438492e-06, + "loss": 0.8511, + "step": 9797 + }, + { + "epoch": 0.7078584716527896, + "grad_norm": 7.567115069083462, + "learning_rate": 3.7457757177564463e-06, + "loss": 1.0173, + "step": 9798 + }, + { + "epoch": 0.7079307168529991, + "grad_norm": 6.440595752810686, + "learning_rate": 3.7455221100206984e-06, + "loss": 0.8684, + "step": 9799 + }, + { + "epoch": 0.7080029620532086, + "grad_norm": 5.805650181990345, + "learning_rate": 3.74526848523472e-06, + "loss": 0.8771, + "step": 9800 + }, + { + "epoch": 0.7080752072534181, + "grad_norm": 6.5410751107701195, + "learning_rate": 3.7450148434019835e-06, + "loss": 0.8533, + "step": 9801 + }, + { + "epoch": 0.7081474524536276, + "grad_norm": 6.042972221940664, + "learning_rate": 3.7447611845259595e-06, + "loss": 0.809, + "step": 9802 + }, + { + "epoch": 0.7082196976538371, + "grad_norm": 6.311024106161748, + "learning_rate": 3.7445075086101217e-06, + "loss": 0.8835, + "step": 9803 + }, + { + "epoch": 0.7082919428540466, + "grad_norm": 7.108186928544635, + "learning_rate": 3.7442538156579427e-06, + "loss": 0.7591, + "step": 9804 + }, + { + "epoch": 0.7083641880542562, + "grad_norm": 5.195370689582491, + "learning_rate": 3.7440001056728948e-06, + "loss": 0.8294, + "step": 9805 + }, + { + "epoch": 0.7084364332544657, + "grad_norm": 11.095199963012757, + "learning_rate": 3.743746378658452e-06, + "loss": 0.8644, + "step": 9806 + }, + { + "epoch": 0.7085086784546751, + "grad_norm": 9.220430692927849, + "learning_rate": 3.7434926346180854e-06, + "loss": 0.8529, + "step": 9807 + }, + { + "epoch": 0.7085809236548847, + "grad_norm": 6.2129744350485145, + "learning_rate": 3.7432388735552715e-06, + "loss": 0.8179, + "step": 9808 + }, + { + "epoch": 0.7086531688550942, + "grad_norm": 8.603327404652262, + "learning_rate": 3.7429850954734823e-06, + "loss": 0.9815, + "step": 9809 + }, + { + "epoch": 0.7087254140553036, + "grad_norm": 5.690942749336062, + "learning_rate": 3.742731300376193e-06, + "loss": 0.8186, + "step": 9810 + }, + { + "epoch": 0.7087976592555132, + "grad_norm": 7.870477013291383, + "learning_rate": 3.7424774882668758e-06, + "loss": 0.8753, + "step": 9811 + }, + { + "epoch": 0.7088699044557227, + "grad_norm": 5.2543623829065425, + "learning_rate": 3.742223659149007e-06, + "loss": 0.8327, + "step": 9812 + }, + { + "epoch": 0.7089421496559323, + "grad_norm": 5.96011894875436, + "learning_rate": 3.741969813026062e-06, + "loss": 0.9072, + "step": 9813 + }, + { + "epoch": 0.7090143948561417, + "grad_norm": 7.411382643062276, + "learning_rate": 3.741715949901513e-06, + "loss": 0.7757, + "step": 9814 + }, + { + "epoch": 0.7090866400563512, + "grad_norm": 9.05891379070266, + "learning_rate": 3.7414620697788375e-06, + "loss": 0.8879, + "step": 9815 + }, + { + "epoch": 0.7091588852565608, + "grad_norm": 6.797198548782599, + "learning_rate": 3.7412081726615097e-06, + "loss": 0.8066, + "step": 9816 + }, + { + "epoch": 0.7092311304567703, + "grad_norm": 6.216481777574673, + "learning_rate": 3.7409542585530064e-06, + "loss": 0.833, + "step": 9817 + }, + { + "epoch": 0.7093033756569798, + "grad_norm": 5.530092969668216, + "learning_rate": 3.740700327456803e-06, + "loss": 0.8217, + "step": 9818 + }, + { + "epoch": 0.7093756208571893, + "grad_norm": 6.632802722865291, + "learning_rate": 3.7404463793763744e-06, + "loss": 0.7189, + "step": 9819 + }, + { + "epoch": 0.7094478660573988, + "grad_norm": 8.39639373751673, + "learning_rate": 3.7401924143151983e-06, + "loss": 0.9204, + "step": 9820 + }, + { + "epoch": 0.7095201112576083, + "grad_norm": 7.149433204219868, + "learning_rate": 3.739938432276751e-06, + "loss": 0.8951, + "step": 9821 + }, + { + "epoch": 0.7095923564578178, + "grad_norm": 6.347293559349879, + "learning_rate": 3.7396844332645103e-06, + "loss": 0.7775, + "step": 9822 + }, + { + "epoch": 0.7096646016580274, + "grad_norm": 7.0682007054241325, + "learning_rate": 3.7394304172819517e-06, + "loss": 0.8068, + "step": 9823 + }, + { + "epoch": 0.7097368468582369, + "grad_norm": 7.102903669595328, + "learning_rate": 3.7391763843325528e-06, + "loss": 0.7981, + "step": 9824 + }, + { + "epoch": 0.7098090920584463, + "grad_norm": 6.513042862244917, + "learning_rate": 3.738922334419792e-06, + "loss": 0.7864, + "step": 9825 + }, + { + "epoch": 0.7098813372586559, + "grad_norm": 6.393046845881298, + "learning_rate": 3.7386682675471452e-06, + "loss": 0.9183, + "step": 9826 + }, + { + "epoch": 0.7099535824588654, + "grad_norm": 7.145199702190253, + "learning_rate": 3.7384141837180925e-06, + "loss": 0.8619, + "step": 9827 + }, + { + "epoch": 0.7100258276590748, + "grad_norm": 5.325826288866328, + "learning_rate": 3.738160082936111e-06, + "loss": 0.7852, + "step": 9828 + }, + { + "epoch": 0.7100980728592844, + "grad_norm": 8.948253218183886, + "learning_rate": 3.7379059652046793e-06, + "loss": 0.8121, + "step": 9829 + }, + { + "epoch": 0.7101703180594939, + "grad_norm": 9.527070676949645, + "learning_rate": 3.7376518305272757e-06, + "loss": 0.8391, + "step": 9830 + }, + { + "epoch": 0.7102425632597035, + "grad_norm": 6.259850635635706, + "learning_rate": 3.73739767890738e-06, + "loss": 0.7758, + "step": 9831 + }, + { + "epoch": 0.7103148084599129, + "grad_norm": 5.848803288575131, + "learning_rate": 3.737143510348471e-06, + "loss": 0.8559, + "step": 9832 + }, + { + "epoch": 0.7103870536601224, + "grad_norm": 7.453069722672372, + "learning_rate": 3.7368893248540277e-06, + "loss": 0.8463, + "step": 9833 + }, + { + "epoch": 0.710459298860332, + "grad_norm": 7.11652633326133, + "learning_rate": 3.73663512242753e-06, + "loss": 0.858, + "step": 9834 + }, + { + "epoch": 0.7105315440605415, + "grad_norm": 5.535400040668199, + "learning_rate": 3.7363809030724575e-06, + "loss": 0.8246, + "step": 9835 + }, + { + "epoch": 0.710603789260751, + "grad_norm": 5.236683077593236, + "learning_rate": 3.7361266667922905e-06, + "loss": 0.8301, + "step": 9836 + }, + { + "epoch": 0.7106760344609605, + "grad_norm": 7.590347092724885, + "learning_rate": 3.735872413590509e-06, + "loss": 0.7653, + "step": 9837 + }, + { + "epoch": 0.71074827966117, + "grad_norm": 6.773529580094274, + "learning_rate": 3.7356181434705947e-06, + "loss": 0.8275, + "step": 9838 + }, + { + "epoch": 0.7108205248613795, + "grad_norm": 6.5020701706384445, + "learning_rate": 3.7353638564360263e-06, + "loss": 0.8094, + "step": 9839 + }, + { + "epoch": 0.710892770061589, + "grad_norm": 6.197014785244195, + "learning_rate": 3.735109552490286e-06, + "loss": 0.864, + "step": 9840 + }, + { + "epoch": 0.7109650152617986, + "grad_norm": 5.510733101982808, + "learning_rate": 3.734855231636855e-06, + "loss": 0.8499, + "step": 9841 + }, + { + "epoch": 0.7110372604620081, + "grad_norm": 7.093663152087446, + "learning_rate": 3.7346008938792155e-06, + "loss": 0.8083, + "step": 9842 + }, + { + "epoch": 0.7111095056622175, + "grad_norm": 6.187489210947822, + "learning_rate": 3.7343465392208477e-06, + "loss": 0.7922, + "step": 9843 + }, + { + "epoch": 0.7111817508624271, + "grad_norm": 6.177668303076962, + "learning_rate": 3.7340921676652334e-06, + "loss": 0.7477, + "step": 9844 + }, + { + "epoch": 0.7112539960626366, + "grad_norm": 5.145689906954195, + "learning_rate": 3.733837779215857e-06, + "loss": 0.753, + "step": 9845 + }, + { + "epoch": 0.711326241262846, + "grad_norm": 7.358420923675829, + "learning_rate": 3.733583373876199e-06, + "loss": 0.876, + "step": 9846 + }, + { + "epoch": 0.7113984864630556, + "grad_norm": 6.299078241158179, + "learning_rate": 3.7333289516497424e-06, + "loss": 0.7836, + "step": 9847 + }, + { + "epoch": 0.7114707316632651, + "grad_norm": 8.027120872057983, + "learning_rate": 3.7330745125399697e-06, + "loss": 0.9263, + "step": 9848 + }, + { + "epoch": 0.7115429768634747, + "grad_norm": 9.980691579581961, + "learning_rate": 3.7328200565503643e-06, + "loss": 0.9146, + "step": 9849 + }, + { + "epoch": 0.7116152220636841, + "grad_norm": 5.834963797724194, + "learning_rate": 3.73256558368441e-06, + "loss": 0.8133, + "step": 9850 + }, + { + "epoch": 0.7116874672638936, + "grad_norm": 5.656755287826453, + "learning_rate": 3.7323110939455896e-06, + "loss": 0.8168, + "step": 9851 + }, + { + "epoch": 0.7117597124641032, + "grad_norm": 8.361040010146437, + "learning_rate": 3.7320565873373876e-06, + "loss": 0.8451, + "step": 9852 + }, + { + "epoch": 0.7118319576643127, + "grad_norm": 10.939978792773235, + "learning_rate": 3.7318020638632866e-06, + "loss": 0.9182, + "step": 9853 + }, + { + "epoch": 0.7119042028645222, + "grad_norm": 5.395784866331354, + "learning_rate": 3.7315475235267726e-06, + "loss": 0.767, + "step": 9854 + }, + { + "epoch": 0.7119764480647317, + "grad_norm": 6.048865333272709, + "learning_rate": 3.731292966331329e-06, + "loss": 0.7851, + "step": 9855 + }, + { + "epoch": 0.7120486932649412, + "grad_norm": 5.574293001104104, + "learning_rate": 3.7310383922804406e-06, + "loss": 0.7369, + "step": 9856 + }, + { + "epoch": 0.7121209384651507, + "grad_norm": 6.857187674012638, + "learning_rate": 3.730783801377593e-06, + "loss": 0.9011, + "step": 9857 + }, + { + "epoch": 0.7121931836653602, + "grad_norm": 7.529831368836572, + "learning_rate": 3.73052919362627e-06, + "loss": 0.8732, + "step": 9858 + }, + { + "epoch": 0.7122654288655698, + "grad_norm": 7.337848515982451, + "learning_rate": 3.730274569029958e-06, + "loss": 0.8329, + "step": 9859 + }, + { + "epoch": 0.7123376740657793, + "grad_norm": 7.78494824381868, + "learning_rate": 3.7300199275921428e-06, + "loss": 1.0011, + "step": 9860 + }, + { + "epoch": 0.7124099192659887, + "grad_norm": 5.5348056205657565, + "learning_rate": 3.72976526931631e-06, + "loss": 0.8472, + "step": 9861 + }, + { + "epoch": 0.7124821644661983, + "grad_norm": 6.646305302649389, + "learning_rate": 3.729510594205945e-06, + "loss": 0.8067, + "step": 9862 + }, + { + "epoch": 0.7125544096664078, + "grad_norm": 5.816687275468374, + "learning_rate": 3.7292559022645343e-06, + "loss": 0.8431, + "step": 9863 + }, + { + "epoch": 0.7126266548666172, + "grad_norm": 7.722084715035922, + "learning_rate": 3.729001193495565e-06, + "loss": 0.9323, + "step": 9864 + }, + { + "epoch": 0.7126989000668268, + "grad_norm": 5.273874222367698, + "learning_rate": 3.7287464679025243e-06, + "loss": 0.8203, + "step": 9865 + }, + { + "epoch": 0.7127711452670363, + "grad_norm": 9.454726507547404, + "learning_rate": 3.7284917254888976e-06, + "loss": 0.8792, + "step": 9866 + }, + { + "epoch": 0.7128433904672459, + "grad_norm": 8.621976903110646, + "learning_rate": 3.728236966258174e-06, + "loss": 0.9025, + "step": 9867 + }, + { + "epoch": 0.7129156356674553, + "grad_norm": 6.212071191737176, + "learning_rate": 3.727982190213839e-06, + "loss": 0.8834, + "step": 9868 + }, + { + "epoch": 0.7129878808676648, + "grad_norm": 5.5292673814230175, + "learning_rate": 3.7277273973593818e-06, + "loss": 0.8779, + "step": 9869 + }, + { + "epoch": 0.7130601260678744, + "grad_norm": 8.918025213356076, + "learning_rate": 3.72747258769829e-06, + "loss": 0.9161, + "step": 9870 + }, + { + "epoch": 0.7131323712680839, + "grad_norm": 7.364344771503723, + "learning_rate": 3.727217761234051e-06, + "loss": 0.8414, + "step": 9871 + }, + { + "epoch": 0.7132046164682934, + "grad_norm": 6.23987583330302, + "learning_rate": 3.726962917970154e-06, + "loss": 0.7934, + "step": 9872 + }, + { + "epoch": 0.7132768616685029, + "grad_norm": 7.803406817473018, + "learning_rate": 3.7267080579100867e-06, + "loss": 0.8414, + "step": 9873 + }, + { + "epoch": 0.7133491068687124, + "grad_norm": 6.250646634029623, + "learning_rate": 3.72645318105734e-06, + "loss": 0.8298, + "step": 9874 + }, + { + "epoch": 0.7134213520689219, + "grad_norm": 6.63634254148178, + "learning_rate": 3.7261982874154013e-06, + "loss": 0.8082, + "step": 9875 + }, + { + "epoch": 0.7134935972691314, + "grad_norm": 6.593085522187685, + "learning_rate": 3.725943376987759e-06, + "loss": 0.8462, + "step": 9876 + }, + { + "epoch": 0.713565842469341, + "grad_norm": 7.933183352337613, + "learning_rate": 3.7256884497779046e-06, + "loss": 0.8567, + "step": 9877 + }, + { + "epoch": 0.7136380876695505, + "grad_norm": 7.052135052249497, + "learning_rate": 3.725433505789326e-06, + "loss": 0.9179, + "step": 9878 + }, + { + "epoch": 0.7137103328697599, + "grad_norm": 5.541611616200613, + "learning_rate": 3.725178545025515e-06, + "loss": 0.8187, + "step": 9879 + }, + { + "epoch": 0.7137825780699695, + "grad_norm": 6.378303120886017, + "learning_rate": 3.724923567489961e-06, + "loss": 0.8425, + "step": 9880 + }, + { + "epoch": 0.713854823270179, + "grad_norm": 5.594486806069973, + "learning_rate": 3.724668573186155e-06, + "loss": 0.782, + "step": 9881 + }, + { + "epoch": 0.7139270684703884, + "grad_norm": 5.5006427389335, + "learning_rate": 3.7244135621175857e-06, + "loss": 0.7841, + "step": 9882 + }, + { + "epoch": 0.713999313670598, + "grad_norm": 8.150323461186842, + "learning_rate": 3.7241585342877464e-06, + "loss": 0.906, + "step": 9883 + }, + { + "epoch": 0.7140715588708075, + "grad_norm": 7.71623652380541, + "learning_rate": 3.7239034897001277e-06, + "loss": 0.9127, + "step": 9884 + }, + { + "epoch": 0.7141438040710171, + "grad_norm": 5.836825424524615, + "learning_rate": 3.7236484283582197e-06, + "loss": 0.837, + "step": 9885 + }, + { + "epoch": 0.7142160492712265, + "grad_norm": 6.507012912313577, + "learning_rate": 3.723393350265515e-06, + "loss": 0.9232, + "step": 9886 + }, + { + "epoch": 0.714288294471436, + "grad_norm": 5.664295859955704, + "learning_rate": 3.723138255425505e-06, + "loss": 0.8269, + "step": 9887 + }, + { + "epoch": 0.7143605396716456, + "grad_norm": 7.113317716820308, + "learning_rate": 3.7228831438416826e-06, + "loss": 0.8463, + "step": 9888 + }, + { + "epoch": 0.7144327848718551, + "grad_norm": 6.657574302496497, + "learning_rate": 3.7226280155175386e-06, + "loss": 0.8545, + "step": 9889 + }, + { + "epoch": 0.7145050300720646, + "grad_norm": 6.04257640785579, + "learning_rate": 3.722372870456567e-06, + "loss": 0.7444, + "step": 9890 + }, + { + "epoch": 0.7145772752722741, + "grad_norm": 6.306578625527548, + "learning_rate": 3.7221177086622597e-06, + "loss": 0.8352, + "step": 9891 + }, + { + "epoch": 0.7146495204724836, + "grad_norm": 5.955879435264964, + "learning_rate": 3.72186253013811e-06, + "loss": 0.8157, + "step": 9892 + }, + { + "epoch": 0.7147217656726931, + "grad_norm": 5.971266929853211, + "learning_rate": 3.7216073348876115e-06, + "loss": 0.9453, + "step": 9893 + }, + { + "epoch": 0.7147940108729026, + "grad_norm": 5.314222168872114, + "learning_rate": 3.7213521229142563e-06, + "loss": 0.8058, + "step": 9894 + }, + { + "epoch": 0.7148662560731122, + "grad_norm": 8.22709527966722, + "learning_rate": 3.721096894221539e-06, + "loss": 0.9721, + "step": 9895 + }, + { + "epoch": 0.7149385012733217, + "grad_norm": 7.737876515633434, + "learning_rate": 3.7208416488129537e-06, + "loss": 0.8749, + "step": 9896 + }, + { + "epoch": 0.7150107464735311, + "grad_norm": 6.630548708664956, + "learning_rate": 3.7205863866919933e-06, + "loss": 0.8078, + "step": 9897 + }, + { + "epoch": 0.7150829916737407, + "grad_norm": 9.115111963836826, + "learning_rate": 3.720331107862154e-06, + "loss": 0.8734, + "step": 9898 + }, + { + "epoch": 0.7151552368739502, + "grad_norm": 6.043312778062715, + "learning_rate": 3.7200758123269294e-06, + "loss": 0.768, + "step": 9899 + }, + { + "epoch": 0.7152274820741596, + "grad_norm": 6.616151190343698, + "learning_rate": 3.7198205000898136e-06, + "loss": 0.7809, + "step": 9900 + }, + { + "epoch": 0.7152997272743692, + "grad_norm": 7.026883138305653, + "learning_rate": 3.719565171154302e-06, + "loss": 0.8262, + "step": 9901 + }, + { + "epoch": 0.7153719724745787, + "grad_norm": 5.28751773335254, + "learning_rate": 3.7193098255238912e-06, + "loss": 0.8101, + "step": 9902 + }, + { + "epoch": 0.7154442176747883, + "grad_norm": 7.557087624492508, + "learning_rate": 3.7190544632020747e-06, + "loss": 0.8666, + "step": 9903 + }, + { + "epoch": 0.7155164628749977, + "grad_norm": 6.557321012601464, + "learning_rate": 3.71879908419235e-06, + "loss": 0.8331, + "step": 9904 + }, + { + "epoch": 0.7155887080752072, + "grad_norm": 6.713847798267457, + "learning_rate": 3.7185436884982114e-06, + "loss": 0.8716, + "step": 9905 + }, + { + "epoch": 0.7156609532754168, + "grad_norm": 5.8323903366093575, + "learning_rate": 3.718288276123156e-06, + "loss": 0.7436, + "step": 9906 + }, + { + "epoch": 0.7157331984756263, + "grad_norm": 5.176746298592412, + "learning_rate": 3.71803284707068e-06, + "loss": 0.8471, + "step": 9907 + }, + { + "epoch": 0.7158054436758358, + "grad_norm": 6.960274918486911, + "learning_rate": 3.7177774013442802e-06, + "loss": 0.8047, + "step": 9908 + }, + { + "epoch": 0.7158776888760453, + "grad_norm": 6.770204892459952, + "learning_rate": 3.7175219389474535e-06, + "loss": 0.9332, + "step": 9909 + }, + { + "epoch": 0.7159499340762548, + "grad_norm": 6.675797821256495, + "learning_rate": 3.717266459883697e-06, + "loss": 0.8514, + "step": 9910 + }, + { + "epoch": 0.7160221792764643, + "grad_norm": 5.914045406906441, + "learning_rate": 3.7170109641565072e-06, + "loss": 0.9265, + "step": 9911 + }, + { + "epoch": 0.7160944244766738, + "grad_norm": 8.098803264609387, + "learning_rate": 3.716755451769382e-06, + "loss": 0.8188, + "step": 9912 + }, + { + "epoch": 0.7161666696768834, + "grad_norm": 6.8417224406277874, + "learning_rate": 3.7164999227258203e-06, + "loss": 0.8979, + "step": 9913 + }, + { + "epoch": 0.7162389148770929, + "grad_norm": 7.050105468826834, + "learning_rate": 3.716244377029319e-06, + "loss": 0.8659, + "step": 9914 + }, + { + "epoch": 0.7163111600773023, + "grad_norm": 5.3509587079548915, + "learning_rate": 3.715988814683376e-06, + "loss": 0.8942, + "step": 9915 + }, + { + "epoch": 0.7163834052775119, + "grad_norm": 5.611246715875261, + "learning_rate": 3.715733235691491e-06, + "loss": 0.7954, + "step": 9916 + }, + { + "epoch": 0.7164556504777214, + "grad_norm": 7.867632128851819, + "learning_rate": 3.715477640057161e-06, + "loss": 0.9646, + "step": 9917 + }, + { + "epoch": 0.7165278956779308, + "grad_norm": 8.49358574692291, + "learning_rate": 3.7152220277838875e-06, + "loss": 0.8563, + "step": 9918 + }, + { + "epoch": 0.7166001408781404, + "grad_norm": 5.557271321577342, + "learning_rate": 3.7149663988751666e-06, + "loss": 0.8115, + "step": 9919 + }, + { + "epoch": 0.7166723860783499, + "grad_norm": 7.7529475851797764, + "learning_rate": 3.7147107533344994e-06, + "loss": 0.8222, + "step": 9920 + }, + { + "epoch": 0.7167446312785595, + "grad_norm": 7.208806310876605, + "learning_rate": 3.714455091165385e-06, + "loss": 0.9258, + "step": 9921 + }, + { + "epoch": 0.7168168764787689, + "grad_norm": 6.3721127423156325, + "learning_rate": 3.714199412371324e-06, + "loss": 0.8085, + "step": 9922 + }, + { + "epoch": 0.7168891216789784, + "grad_norm": 6.679918501579502, + "learning_rate": 3.7139437169558147e-06, + "loss": 0.8316, + "step": 9923 + }, + { + "epoch": 0.716961366879188, + "grad_norm": 5.956441761304356, + "learning_rate": 3.7136880049223594e-06, + "loss": 0.8187, + "step": 9924 + }, + { + "epoch": 0.7170336120793974, + "grad_norm": 7.405079680741119, + "learning_rate": 3.7134322762744574e-06, + "loss": 0.7719, + "step": 9925 + }, + { + "epoch": 0.717105857279607, + "grad_norm": 7.506196323428372, + "learning_rate": 3.71317653101561e-06, + "loss": 0.8655, + "step": 9926 + }, + { + "epoch": 0.7171781024798165, + "grad_norm": 6.085205660928069, + "learning_rate": 3.7129207691493174e-06, + "loss": 0.8399, + "step": 9927 + }, + { + "epoch": 0.717250347680026, + "grad_norm": 6.389454629516238, + "learning_rate": 3.7126649906790815e-06, + "loss": 0.8105, + "step": 9928 + }, + { + "epoch": 0.7173225928802355, + "grad_norm": 7.223587595019522, + "learning_rate": 3.712409195608403e-06, + "loss": 0.8448, + "step": 9929 + }, + { + "epoch": 0.717394838080445, + "grad_norm": 6.57836246741433, + "learning_rate": 3.712153383940784e-06, + "loss": 0.8272, + "step": 9930 + }, + { + "epoch": 0.7174670832806546, + "grad_norm": 5.933264576945736, + "learning_rate": 3.711897555679727e-06, + "loss": 0.8175, + "step": 9931 + }, + { + "epoch": 0.7175393284808641, + "grad_norm": 6.935910704428416, + "learning_rate": 3.7116417108287333e-06, + "loss": 0.91, + "step": 9932 + }, + { + "epoch": 0.7176115736810735, + "grad_norm": 5.49046053262853, + "learning_rate": 3.711385849391306e-06, + "loss": 0.8121, + "step": 9933 + }, + { + "epoch": 0.7176838188812831, + "grad_norm": 5.547865916110855, + "learning_rate": 3.7111299713709453e-06, + "loss": 0.7688, + "step": 9934 + }, + { + "epoch": 0.7177560640814926, + "grad_norm": 6.6009201911085675, + "learning_rate": 3.7108740767711565e-06, + "loss": 0.7631, + "step": 9935 + }, + { + "epoch": 0.717828309281702, + "grad_norm": 6.152862391333764, + "learning_rate": 3.710618165595442e-06, + "loss": 0.8987, + "step": 9936 + }, + { + "epoch": 0.7179005544819116, + "grad_norm": 5.701722523938729, + "learning_rate": 3.7103622378473046e-06, + "loss": 0.8139, + "step": 9937 + }, + { + "epoch": 0.7179727996821211, + "grad_norm": 5.64750121232721, + "learning_rate": 3.7101062935302483e-06, + "loss": 0.7432, + "step": 9938 + }, + { + "epoch": 0.7180450448823307, + "grad_norm": 6.225124446743327, + "learning_rate": 3.7098503326477753e-06, + "loss": 0.9261, + "step": 9939 + }, + { + "epoch": 0.7181172900825401, + "grad_norm": 6.324900919624465, + "learning_rate": 3.709594355203392e-06, + "loss": 0.8422, + "step": 9940 + }, + { + "epoch": 0.7181895352827496, + "grad_norm": 5.881041970865872, + "learning_rate": 3.709338361200601e-06, + "loss": 0.8514, + "step": 9941 + }, + { + "epoch": 0.7182617804829592, + "grad_norm": 5.273607853151233, + "learning_rate": 3.7090823506429064e-06, + "loss": 0.8685, + "step": 9942 + }, + { + "epoch": 0.7183340256831686, + "grad_norm": 5.626708385861265, + "learning_rate": 3.7088263235338127e-06, + "loss": 0.8784, + "step": 9943 + }, + { + "epoch": 0.7184062708833782, + "grad_norm": 6.016958113750415, + "learning_rate": 3.708570279876826e-06, + "loss": 0.8284, + "step": 9944 + }, + { + "epoch": 0.7184785160835877, + "grad_norm": 5.804689471525574, + "learning_rate": 3.7083142196754505e-06, + "loss": 0.7409, + "step": 9945 + }, + { + "epoch": 0.7185507612837972, + "grad_norm": 7.017689560414982, + "learning_rate": 3.708058142933191e-06, + "loss": 0.9745, + "step": 9946 + }, + { + "epoch": 0.7186230064840067, + "grad_norm": 7.990957633484643, + "learning_rate": 3.7078020496535545e-06, + "loss": 0.8793, + "step": 9947 + }, + { + "epoch": 0.7186952516842162, + "grad_norm": 7.928289641646911, + "learning_rate": 3.707545939840045e-06, + "loss": 0.8816, + "step": 9948 + }, + { + "epoch": 0.7187674968844258, + "grad_norm": 5.806640296522863, + "learning_rate": 3.707289813496169e-06, + "loss": 0.7848, + "step": 9949 + }, + { + "epoch": 0.7188397420846353, + "grad_norm": 8.001365545076508, + "learning_rate": 3.707033670625434e-06, + "loss": 0.907, + "step": 9950 + }, + { + "epoch": 0.7189119872848447, + "grad_norm": 5.582465526646911, + "learning_rate": 3.7067775112313443e-06, + "loss": 0.8625, + "step": 9951 + }, + { + "epoch": 0.7189842324850543, + "grad_norm": 6.599639142893061, + "learning_rate": 3.7065213353174074e-06, + "loss": 0.875, + "step": 9952 + }, + { + "epoch": 0.7190564776852638, + "grad_norm": 5.580945923180132, + "learning_rate": 3.7062651428871298e-06, + "loss": 0.8875, + "step": 9953 + }, + { + "epoch": 0.7191287228854732, + "grad_norm": 5.3133957051661165, + "learning_rate": 3.7060089339440198e-06, + "loss": 0.8227, + "step": 9954 + }, + { + "epoch": 0.7192009680856828, + "grad_norm": 7.08754859113772, + "learning_rate": 3.7057527084915833e-06, + "loss": 0.938, + "step": 9955 + }, + { + "epoch": 0.7192732132858923, + "grad_norm": 8.79970319420788, + "learning_rate": 3.7054964665333292e-06, + "loss": 0.9003, + "step": 9956 + }, + { + "epoch": 0.7193454584861019, + "grad_norm": 6.509915931264313, + "learning_rate": 3.7052402080727646e-06, + "loss": 0.8932, + "step": 9957 + }, + { + "epoch": 0.7194177036863113, + "grad_norm": 8.539842910215173, + "learning_rate": 3.7049839331133963e-06, + "loss": 0.8308, + "step": 9958 + }, + { + "epoch": 0.7194899488865208, + "grad_norm": 6.683984420657763, + "learning_rate": 3.7047276416587346e-06, + "loss": 0.8647, + "step": 9959 + }, + { + "epoch": 0.7195621940867304, + "grad_norm": 6.043937344098197, + "learning_rate": 3.704471333712286e-06, + "loss": 0.8957, + "step": 9960 + }, + { + "epoch": 0.7196344392869398, + "grad_norm": 5.487484477027714, + "learning_rate": 3.704215009277561e-06, + "loss": 0.7956, + "step": 9961 + }, + { + "epoch": 0.7197066844871494, + "grad_norm": 7.024504413975614, + "learning_rate": 3.703958668358067e-06, + "loss": 0.8474, + "step": 9962 + }, + { + "epoch": 0.7197789296873589, + "grad_norm": 5.988972066011888, + "learning_rate": 3.703702310957313e-06, + "loss": 0.8394, + "step": 9963 + }, + { + "epoch": 0.7198511748875684, + "grad_norm": 6.038508183749118, + "learning_rate": 3.70344593707881e-06, + "loss": 0.779, + "step": 9964 + }, + { + "epoch": 0.7199234200877779, + "grad_norm": 6.684245806244038, + "learning_rate": 3.7031895467260664e-06, + "loss": 0.8111, + "step": 9965 + }, + { + "epoch": 0.7199956652879874, + "grad_norm": 7.146599671965535, + "learning_rate": 3.7029331399025926e-06, + "loss": 0.8568, + "step": 9966 + }, + { + "epoch": 0.720067910488197, + "grad_norm": 6.294767344131794, + "learning_rate": 3.7026767166118966e-06, + "loss": 0.8999, + "step": 9967 + }, + { + "epoch": 0.7201401556884065, + "grad_norm": 6.483574875334016, + "learning_rate": 3.7024202768574915e-06, + "loss": 0.9812, + "step": 9968 + }, + { + "epoch": 0.7202124008886159, + "grad_norm": 8.090942838635458, + "learning_rate": 3.7021638206428857e-06, + "loss": 0.8508, + "step": 9969 + }, + { + "epoch": 0.7202846460888255, + "grad_norm": 4.832487733907274, + "learning_rate": 3.701907347971591e-06, + "loss": 0.7712, + "step": 9970 + }, + { + "epoch": 0.720356891289035, + "grad_norm": 5.403874046071998, + "learning_rate": 3.701650858847118e-06, + "loss": 0.7601, + "step": 9971 + }, + { + "epoch": 0.7204291364892444, + "grad_norm": 7.28099354304596, + "learning_rate": 3.7013943532729767e-06, + "loss": 0.8755, + "step": 9972 + }, + { + "epoch": 0.720501381689454, + "grad_norm": 6.322993555078053, + "learning_rate": 3.7011378312526802e-06, + "loss": 0.8554, + "step": 9973 + }, + { + "epoch": 0.7205736268896635, + "grad_norm": 6.358075583230378, + "learning_rate": 3.7008812927897404e-06, + "loss": 0.8891, + "step": 9974 + }, + { + "epoch": 0.7206458720898731, + "grad_norm": 8.213681823141, + "learning_rate": 3.7006247378876677e-06, + "loss": 0.833, + "step": 9975 + }, + { + "epoch": 0.7207181172900825, + "grad_norm": 7.374164339986923, + "learning_rate": 3.7003681665499735e-06, + "loss": 0.9184, + "step": 9976 + }, + { + "epoch": 0.720790362490292, + "grad_norm": 7.5382414850069175, + "learning_rate": 3.700111578780172e-06, + "loss": 0.878, + "step": 9977 + }, + { + "epoch": 0.7208626076905016, + "grad_norm": 5.689738744761712, + "learning_rate": 3.6998549745817747e-06, + "loss": 0.7638, + "step": 9978 + }, + { + "epoch": 0.720934852890711, + "grad_norm": 6.715255041214346, + "learning_rate": 3.699598353958294e-06, + "loss": 0.913, + "step": 9979 + }, + { + "epoch": 0.7210070980909206, + "grad_norm": 5.940501849620266, + "learning_rate": 3.699341716913244e-06, + "loss": 0.8899, + "step": 9980 + }, + { + "epoch": 0.7210793432911301, + "grad_norm": 5.6939616962694775, + "learning_rate": 3.699085063450137e-06, + "loss": 0.753, + "step": 9981 + }, + { + "epoch": 0.7211515884913396, + "grad_norm": 7.811671342771691, + "learning_rate": 3.6988283935724855e-06, + "loss": 0.7899, + "step": 9982 + }, + { + "epoch": 0.7212238336915491, + "grad_norm": 5.019655313702704, + "learning_rate": 3.698571707283805e-06, + "loss": 0.841, + "step": 9983 + }, + { + "epoch": 0.7212960788917586, + "grad_norm": 5.952191617588452, + "learning_rate": 3.698315004587609e-06, + "loss": 0.8157, + "step": 9984 + }, + { + "epoch": 0.7213683240919682, + "grad_norm": 6.583599648541472, + "learning_rate": 3.69805828548741e-06, + "loss": 0.8183, + "step": 9985 + }, + { + "epoch": 0.7214405692921777, + "grad_norm": 5.5991086386680715, + "learning_rate": 3.6978015499867235e-06, + "loss": 0.7524, + "step": 9986 + }, + { + "epoch": 0.7215128144923871, + "grad_norm": 6.129835438845338, + "learning_rate": 3.6975447980890632e-06, + "loss": 0.8927, + "step": 9987 + }, + { + "epoch": 0.7215850596925967, + "grad_norm": 5.2887925680634265, + "learning_rate": 3.697288029797946e-06, + "loss": 0.8077, + "step": 9988 + }, + { + "epoch": 0.7216573048928062, + "grad_norm": 4.962150749773385, + "learning_rate": 3.6970312451168843e-06, + "loss": 0.7546, + "step": 9989 + }, + { + "epoch": 0.7217295500930156, + "grad_norm": 6.455993458173763, + "learning_rate": 3.6967744440493947e-06, + "loss": 0.8671, + "step": 9990 + }, + { + "epoch": 0.7218017952932252, + "grad_norm": 5.130620526118865, + "learning_rate": 3.6965176265989924e-06, + "loss": 0.7112, + "step": 9991 + }, + { + "epoch": 0.7218740404934347, + "grad_norm": 6.054676159263573, + "learning_rate": 3.6962607927691916e-06, + "loss": 0.8912, + "step": 9992 + }, + { + "epoch": 0.7219462856936443, + "grad_norm": 5.594159841174838, + "learning_rate": 3.6960039425635107e-06, + "loss": 0.8506, + "step": 9993 + }, + { + "epoch": 0.7220185308938537, + "grad_norm": 7.26502538688409, + "learning_rate": 3.695747075985464e-06, + "loss": 0.8038, + "step": 9994 + }, + { + "epoch": 0.7220907760940632, + "grad_norm": 5.289224055878258, + "learning_rate": 3.6954901930385683e-06, + "loss": 0.8237, + "step": 9995 + }, + { + "epoch": 0.7221630212942728, + "grad_norm": 5.426202371841266, + "learning_rate": 3.69523329372634e-06, + "loss": 0.8208, + "step": 9996 + }, + { + "epoch": 0.7222352664944822, + "grad_norm": 5.495108423248076, + "learning_rate": 3.6949763780522957e-06, + "loss": 0.8919, + "step": 9997 + }, + { + "epoch": 0.7223075116946918, + "grad_norm": 5.5909892680529625, + "learning_rate": 3.6947194460199527e-06, + "loss": 0.7933, + "step": 9998 + }, + { + "epoch": 0.7223797568949013, + "grad_norm": 6.91269241810202, + "learning_rate": 3.6944624976328287e-06, + "loss": 0.8731, + "step": 9999 + }, + { + "epoch": 0.7224520020951108, + "grad_norm": 6.434630411793435, + "learning_rate": 3.69420553289444e-06, + "loss": 0.9572, + "step": 10000 + }, + { + "epoch": 0.7225242472953203, + "grad_norm": 7.653735436876252, + "learning_rate": 3.6939485518083053e-06, + "loss": 0.7815, + "step": 10001 + }, + { + "epoch": 0.7225964924955298, + "grad_norm": 7.486412710681813, + "learning_rate": 3.693691554377942e-06, + "loss": 0.9157, + "step": 10002 + }, + { + "epoch": 0.7226687376957394, + "grad_norm": 7.723483097806933, + "learning_rate": 3.6934345406068674e-06, + "loss": 0.8962, + "step": 10003 + }, + { + "epoch": 0.7227409828959489, + "grad_norm": 6.466544102297078, + "learning_rate": 3.6931775104986013e-06, + "loss": 0.8627, + "step": 10004 + }, + { + "epoch": 0.7228132280961583, + "grad_norm": 5.397332217712689, + "learning_rate": 3.6929204640566605e-06, + "loss": 0.8482, + "step": 10005 + }, + { + "epoch": 0.7228854732963679, + "grad_norm": 5.806522372029988, + "learning_rate": 3.6926634012845653e-06, + "loss": 0.8034, + "step": 10006 + }, + { + "epoch": 0.7229577184965774, + "grad_norm": 5.849667740213835, + "learning_rate": 3.692406322185835e-06, + "loss": 0.8996, + "step": 10007 + }, + { + "epoch": 0.7230299636967868, + "grad_norm": 6.4352469343988234, + "learning_rate": 3.6921492267639867e-06, + "loss": 0.8281, + "step": 10008 + }, + { + "epoch": 0.7231022088969964, + "grad_norm": 6.0125017257339, + "learning_rate": 3.691892115022543e-06, + "loss": 0.7375, + "step": 10009 + }, + { + "epoch": 0.7231744540972059, + "grad_norm": 5.397099330460884, + "learning_rate": 3.691634986965019e-06, + "loss": 0.858, + "step": 10010 + }, + { + "epoch": 0.7232466992974155, + "grad_norm": 5.760932480779106, + "learning_rate": 3.6913778425949397e-06, + "loss": 0.817, + "step": 10011 + }, + { + "epoch": 0.7233189444976249, + "grad_norm": 6.042199823817274, + "learning_rate": 3.6911206819158214e-06, + "loss": 0.8591, + "step": 10012 + }, + { + "epoch": 0.7233911896978344, + "grad_norm": 5.490819204388205, + "learning_rate": 3.690863504931187e-06, + "loss": 0.8243, + "step": 10013 + }, + { + "epoch": 0.723463434898044, + "grad_norm": 6.923587630255583, + "learning_rate": 3.6906063116445544e-06, + "loss": 0.8546, + "step": 10014 + }, + { + "epoch": 0.7235356800982534, + "grad_norm": 6.579254270924221, + "learning_rate": 3.6903491020594466e-06, + "loss": 0.8571, + "step": 10015 + }, + { + "epoch": 0.723607925298463, + "grad_norm": 5.582485172509995, + "learning_rate": 3.690091876179384e-06, + "loss": 0.7454, + "step": 10016 + }, + { + "epoch": 0.7236801704986725, + "grad_norm": 6.113493094871779, + "learning_rate": 3.689834634007887e-06, + "loss": 0.7785, + "step": 10017 + }, + { + "epoch": 0.723752415698882, + "grad_norm": 6.350143443011833, + "learning_rate": 3.689577375548479e-06, + "loss": 0.9742, + "step": 10018 + }, + { + "epoch": 0.7238246608990915, + "grad_norm": 6.470744392224279, + "learning_rate": 3.6893201008046792e-06, + "loss": 0.8476, + "step": 10019 + }, + { + "epoch": 0.723896906099301, + "grad_norm": 7.9072773985838065, + "learning_rate": 3.6890628097800107e-06, + "loss": 0.9054, + "step": 10020 + }, + { + "epoch": 0.7239691512995106, + "grad_norm": 7.294799770982784, + "learning_rate": 3.6888055024779955e-06, + "loss": 0.8215, + "step": 10021 + }, + { + "epoch": 0.7240413964997201, + "grad_norm": 5.686699779513969, + "learning_rate": 3.688548178902157e-06, + "loss": 0.8687, + "step": 10022 + }, + { + "epoch": 0.7241136416999295, + "grad_norm": 6.949859606745074, + "learning_rate": 3.6882908390560162e-06, + "loss": 0.9461, + "step": 10023 + }, + { + "epoch": 0.7241858869001391, + "grad_norm": 8.316975930171187, + "learning_rate": 3.6880334829430964e-06, + "loss": 0.8223, + "step": 10024 + }, + { + "epoch": 0.7242581321003486, + "grad_norm": 5.850963689216444, + "learning_rate": 3.687776110566921e-06, + "loss": 0.828, + "step": 10025 + }, + { + "epoch": 0.724330377300558, + "grad_norm": 6.898630763767178, + "learning_rate": 3.687518721931012e-06, + "loss": 0.8386, + "step": 10026 + }, + { + "epoch": 0.7244026225007676, + "grad_norm": 5.831975324685809, + "learning_rate": 3.687261317038895e-06, + "loss": 0.8441, + "step": 10027 + }, + { + "epoch": 0.7244748677009771, + "grad_norm": 6.499285145111033, + "learning_rate": 3.6870038958940914e-06, + "loss": 0.8728, + "step": 10028 + }, + { + "epoch": 0.7245471129011867, + "grad_norm": 5.661195058120148, + "learning_rate": 3.6867464585001268e-06, + "loss": 0.8359, + "step": 10029 + }, + { + "epoch": 0.7246193581013961, + "grad_norm": 6.417181745721185, + "learning_rate": 3.6864890048605238e-06, + "loss": 0.8734, + "step": 10030 + }, + { + "epoch": 0.7246916033016056, + "grad_norm": 8.593872069445535, + "learning_rate": 3.6862315349788086e-06, + "loss": 0.8359, + "step": 10031 + }, + { + "epoch": 0.7247638485018152, + "grad_norm": 5.340098778886184, + "learning_rate": 3.6859740488585046e-06, + "loss": 0.8486, + "step": 10032 + }, + { + "epoch": 0.7248360937020246, + "grad_norm": 6.0481712667042355, + "learning_rate": 3.6857165465031357e-06, + "loss": 0.8081, + "step": 10033 + }, + { + "epoch": 0.7249083389022342, + "grad_norm": 6.096671162320845, + "learning_rate": 3.685459027916228e-06, + "loss": 0.8782, + "step": 10034 + }, + { + "epoch": 0.7249805841024437, + "grad_norm": 6.105163215123801, + "learning_rate": 3.6852014931013074e-06, + "loss": 0.8134, + "step": 10035 + }, + { + "epoch": 0.7250528293026532, + "grad_norm": 7.328306411417248, + "learning_rate": 3.6849439420618995e-06, + "loss": 0.8018, + "step": 10036 + }, + { + "epoch": 0.7251250745028627, + "grad_norm": 7.317764343554182, + "learning_rate": 3.6846863748015273e-06, + "loss": 0.9304, + "step": 10037 + }, + { + "epoch": 0.7251973197030722, + "grad_norm": 6.470685144198531, + "learning_rate": 3.6844287913237192e-06, + "loss": 0.8356, + "step": 10038 + }, + { + "epoch": 0.7252695649032818, + "grad_norm": 5.835143253558629, + "learning_rate": 3.6841711916320005e-06, + "loss": 0.8826, + "step": 10039 + }, + { + "epoch": 0.7253418101034913, + "grad_norm": 7.1389914215416805, + "learning_rate": 3.683913575729898e-06, + "loss": 0.8306, + "step": 10040 + }, + { + "epoch": 0.7254140553037007, + "grad_norm": 7.020160797538117, + "learning_rate": 3.6836559436209378e-06, + "loss": 0.8727, + "step": 10041 + }, + { + "epoch": 0.7254863005039103, + "grad_norm": 7.5000033060702345, + "learning_rate": 3.6833982953086465e-06, + "loss": 0.7901, + "step": 10042 + }, + { + "epoch": 0.7255585457041198, + "grad_norm": 5.933088410426521, + "learning_rate": 3.683140630796551e-06, + "loss": 0.8581, + "step": 10043 + }, + { + "epoch": 0.7256307909043292, + "grad_norm": 6.0558302852982875, + "learning_rate": 3.6828829500881796e-06, + "loss": 0.7984, + "step": 10044 + }, + { + "epoch": 0.7257030361045388, + "grad_norm": 5.822599107775929, + "learning_rate": 3.6826252531870593e-06, + "loss": 0.8489, + "step": 10045 + }, + { + "epoch": 0.7257752813047483, + "grad_norm": 8.88482036728849, + "learning_rate": 3.682367540096717e-06, + "loss": 0.9326, + "step": 10046 + }, + { + "epoch": 0.7258475265049579, + "grad_norm": 6.1382067240448475, + "learning_rate": 3.6821098108206814e-06, + "loss": 0.8622, + "step": 10047 + }, + { + "epoch": 0.7259197717051673, + "grad_norm": 5.899971021160791, + "learning_rate": 3.68185206536248e-06, + "loss": 0.8536, + "step": 10048 + }, + { + "epoch": 0.7259920169053768, + "grad_norm": 7.509030055816156, + "learning_rate": 3.6815943037256415e-06, + "loss": 0.8641, + "step": 10049 + }, + { + "epoch": 0.7260642621055864, + "grad_norm": 7.57100558577485, + "learning_rate": 3.6813365259136945e-06, + "loss": 0.8059, + "step": 10050 + }, + { + "epoch": 0.7261365073057958, + "grad_norm": 5.704436854151165, + "learning_rate": 3.6810787319301678e-06, + "loss": 0.8522, + "step": 10051 + }, + { + "epoch": 0.7262087525060054, + "grad_norm": 6.479236303938558, + "learning_rate": 3.6808209217785905e-06, + "loss": 0.8851, + "step": 10052 + }, + { + "epoch": 0.7262809977062149, + "grad_norm": 5.500254885229432, + "learning_rate": 3.680563095462491e-06, + "loss": 0.856, + "step": 10053 + }, + { + "epoch": 0.7263532429064244, + "grad_norm": 7.011323489468633, + "learning_rate": 3.6803052529854e-06, + "loss": 0.8623, + "step": 10054 + }, + { + "epoch": 0.7264254881066339, + "grad_norm": 6.456467028623257, + "learning_rate": 3.6800473943508462e-06, + "loss": 0.825, + "step": 10055 + }, + { + "epoch": 0.7264977333068434, + "grad_norm": 7.346057545541169, + "learning_rate": 3.67978951956236e-06, + "loss": 0.7887, + "step": 10056 + }, + { + "epoch": 0.726569978507053, + "grad_norm": 7.0287775058197965, + "learning_rate": 3.6795316286234718e-06, + "loss": 0.8373, + "step": 10057 + }, + { + "epoch": 0.7266422237072625, + "grad_norm": 6.630006733656466, + "learning_rate": 3.6792737215377104e-06, + "loss": 0.8598, + "step": 10058 + }, + { + "epoch": 0.7267144689074719, + "grad_norm": 6.676410071777587, + "learning_rate": 3.679015798308608e-06, + "loss": 0.8218, + "step": 10059 + }, + { + "epoch": 0.7267867141076815, + "grad_norm": 7.218596749929029, + "learning_rate": 3.678757858939695e-06, + "loss": 0.8843, + "step": 10060 + }, + { + "epoch": 0.726858959307891, + "grad_norm": 6.5352890448066585, + "learning_rate": 3.678499903434502e-06, + "loss": 0.7775, + "step": 10061 + }, + { + "epoch": 0.7269312045081004, + "grad_norm": 8.634129654183674, + "learning_rate": 3.6782419317965595e-06, + "loss": 0.91, + "step": 10062 + }, + { + "epoch": 0.72700344970831, + "grad_norm": 6.167274170240688, + "learning_rate": 3.677983944029401e-06, + "loss": 0.8301, + "step": 10063 + }, + { + "epoch": 0.7270756949085195, + "grad_norm": 5.7656068052092655, + "learning_rate": 3.677725940136556e-06, + "loss": 0.8616, + "step": 10064 + }, + { + "epoch": 0.7271479401087291, + "grad_norm": 6.283718857544795, + "learning_rate": 3.677467920121558e-06, + "loss": 0.8533, + "step": 10065 + }, + { + "epoch": 0.7272201853089385, + "grad_norm": 7.928404154541073, + "learning_rate": 3.6772098839879382e-06, + "loss": 0.8117, + "step": 10066 + }, + { + "epoch": 0.727292430509148, + "grad_norm": 9.864944940133979, + "learning_rate": 3.6769518317392293e-06, + "loss": 0.8268, + "step": 10067 + }, + { + "epoch": 0.7273646757093576, + "grad_norm": 6.945503910203376, + "learning_rate": 3.6766937633789636e-06, + "loss": 0.9149, + "step": 10068 + }, + { + "epoch": 0.727436920909567, + "grad_norm": 6.411155943527389, + "learning_rate": 3.6764356789106736e-06, + "loss": 0.8938, + "step": 10069 + }, + { + "epoch": 0.7275091661097766, + "grad_norm": 6.070010851578542, + "learning_rate": 3.6761775783378935e-06, + "loss": 0.8685, + "step": 10070 + }, + { + "epoch": 0.7275814113099861, + "grad_norm": 7.176629113457718, + "learning_rate": 3.6759194616641547e-06, + "loss": 0.7963, + "step": 10071 + }, + { + "epoch": 0.7276536565101956, + "grad_norm": 5.830722915034984, + "learning_rate": 3.6756613288929914e-06, + "loss": 0.835, + "step": 10072 + }, + { + "epoch": 0.7277259017104051, + "grad_norm": 8.92253599373913, + "learning_rate": 3.6754031800279378e-06, + "loss": 0.8347, + "step": 10073 + }, + { + "epoch": 0.7277981469106146, + "grad_norm": 6.879751055006091, + "learning_rate": 3.675145015072527e-06, + "loss": 0.8169, + "step": 10074 + }, + { + "epoch": 0.7278703921108242, + "grad_norm": 6.38029200492542, + "learning_rate": 3.674886834030294e-06, + "loss": 0.8315, + "step": 10075 + }, + { + "epoch": 0.7279426373110337, + "grad_norm": 5.4998357921709555, + "learning_rate": 3.6746286369047723e-06, + "loss": 0.7331, + "step": 10076 + }, + { + "epoch": 0.7280148825112431, + "grad_norm": 6.186656855926189, + "learning_rate": 3.6743704236994958e-06, + "loss": 0.7812, + "step": 10077 + }, + { + "epoch": 0.7280871277114527, + "grad_norm": 6.731399905661631, + "learning_rate": 3.6741121944180003e-06, + "loss": 0.8523, + "step": 10078 + }, + { + "epoch": 0.7281593729116622, + "grad_norm": 6.180331306342108, + "learning_rate": 3.6738539490638216e-06, + "loss": 0.8231, + "step": 10079 + }, + { + "epoch": 0.7282316181118716, + "grad_norm": 5.09198510422367, + "learning_rate": 3.673595687640493e-06, + "loss": 0.7867, + "step": 10080 + }, + { + "epoch": 0.7283038633120812, + "grad_norm": 5.428687485192866, + "learning_rate": 3.6733374101515503e-06, + "loss": 0.8958, + "step": 10081 + }, + { + "epoch": 0.7283761085122907, + "grad_norm": 5.745590177521985, + "learning_rate": 3.67307911660053e-06, + "loss": 0.8476, + "step": 10082 + }, + { + "epoch": 0.7284483537125003, + "grad_norm": 5.82968127051731, + "learning_rate": 3.6728208069909672e-06, + "loss": 0.852, + "step": 10083 + }, + { + "epoch": 0.7285205989127097, + "grad_norm": 5.954153057249234, + "learning_rate": 3.672562481326398e-06, + "loss": 0.7996, + "step": 10084 + }, + { + "epoch": 0.7285928441129192, + "grad_norm": 5.961397925677457, + "learning_rate": 3.672304139610359e-06, + "loss": 0.8299, + "step": 10085 + }, + { + "epoch": 0.7286650893131288, + "grad_norm": 6.291164487624812, + "learning_rate": 3.6720457818463868e-06, + "loss": 0.8485, + "step": 10086 + }, + { + "epoch": 0.7287373345133382, + "grad_norm": 8.632079826091939, + "learning_rate": 3.671787408038018e-06, + "loss": 0.7978, + "step": 10087 + }, + { + "epoch": 0.7288095797135478, + "grad_norm": 6.606460830658973, + "learning_rate": 3.671529018188789e-06, + "loss": 0.9085, + "step": 10088 + }, + { + "epoch": 0.7288818249137573, + "grad_norm": 6.313361382964253, + "learning_rate": 3.671270612302238e-06, + "loss": 0.8303, + "step": 10089 + }, + { + "epoch": 0.7289540701139668, + "grad_norm": 7.820747843542547, + "learning_rate": 3.6710121903819014e-06, + "loss": 0.8179, + "step": 10090 + }, + { + "epoch": 0.7290263153141763, + "grad_norm": 7.150182025433071, + "learning_rate": 3.670753752431317e-06, + "loss": 0.8099, + "step": 10091 + }, + { + "epoch": 0.7290985605143858, + "grad_norm": 5.605327708101459, + "learning_rate": 3.670495298454022e-06, + "loss": 0.8893, + "step": 10092 + }, + { + "epoch": 0.7291708057145954, + "grad_norm": 5.594163591662582, + "learning_rate": 3.6702368284535567e-06, + "loss": 0.8331, + "step": 10093 + }, + { + "epoch": 0.7292430509148049, + "grad_norm": 8.020302759101819, + "learning_rate": 3.669978342433457e-06, + "loss": 0.8221, + "step": 10094 + }, + { + "epoch": 0.7293152961150143, + "grad_norm": 6.014110503348303, + "learning_rate": 3.6697198403972624e-06, + "loss": 0.9556, + "step": 10095 + }, + { + "epoch": 0.7293875413152239, + "grad_norm": 7.035135373979992, + "learning_rate": 3.669461322348511e-06, + "loss": 0.8951, + "step": 10096 + }, + { + "epoch": 0.7294597865154334, + "grad_norm": 6.310425067978719, + "learning_rate": 3.669202788290743e-06, + "loss": 0.8728, + "step": 10097 + }, + { + "epoch": 0.7295320317156428, + "grad_norm": 5.199690318056117, + "learning_rate": 3.6689442382274964e-06, + "loss": 0.9039, + "step": 10098 + }, + { + "epoch": 0.7296042769158524, + "grad_norm": 7.975353422076745, + "learning_rate": 3.6686856721623104e-06, + "loss": 0.9308, + "step": 10099 + }, + { + "epoch": 0.7296765221160619, + "grad_norm": 5.7138307185908905, + "learning_rate": 3.668427090098725e-06, + "loss": 0.8346, + "step": 10100 + }, + { + "epoch": 0.7297487673162715, + "grad_norm": 5.372931148790078, + "learning_rate": 3.6681684920402797e-06, + "loss": 0.8194, + "step": 10101 + }, + { + "epoch": 0.7298210125164809, + "grad_norm": 5.405391735240044, + "learning_rate": 3.667909877990516e-06, + "loss": 0.9185, + "step": 10102 + }, + { + "epoch": 0.7298932577166904, + "grad_norm": 7.254209677451679, + "learning_rate": 3.6676512479529717e-06, + "loss": 0.8812, + "step": 10103 + }, + { + "epoch": 0.7299655029169, + "grad_norm": 6.367469678371501, + "learning_rate": 3.667392601931189e-06, + "loss": 0.8617, + "step": 10104 + }, + { + "epoch": 0.7300377481171094, + "grad_norm": 5.109562768314879, + "learning_rate": 3.6671339399287077e-06, + "loss": 0.8591, + "step": 10105 + }, + { + "epoch": 0.730109993317319, + "grad_norm": 6.928115408361228, + "learning_rate": 3.666875261949069e-06, + "loss": 0.8322, + "step": 10106 + }, + { + "epoch": 0.7301822385175285, + "grad_norm": 6.5235149812951025, + "learning_rate": 3.6666165679958145e-06, + "loss": 0.8302, + "step": 10107 + }, + { + "epoch": 0.730254483717738, + "grad_norm": 5.3979845312405, + "learning_rate": 3.6663578580724844e-06, + "loss": 0.8349, + "step": 10108 + }, + { + "epoch": 0.7303267289179475, + "grad_norm": 6.037705203531873, + "learning_rate": 3.6660991321826213e-06, + "loss": 0.8681, + "step": 10109 + }, + { + "epoch": 0.730398974118157, + "grad_norm": 7.721790533402732, + "learning_rate": 3.6658403903297655e-06, + "loss": 0.8072, + "step": 10110 + }, + { + "epoch": 0.7304712193183666, + "grad_norm": 6.800712817361041, + "learning_rate": 3.6655816325174613e-06, + "loss": 0.8917, + "step": 10111 + }, + { + "epoch": 0.7305434645185761, + "grad_norm": 6.5436511421205354, + "learning_rate": 3.665322858749249e-06, + "loss": 0.804, + "step": 10112 + }, + { + "epoch": 0.7306157097187855, + "grad_norm": 6.814101284715349, + "learning_rate": 3.6650640690286715e-06, + "loss": 0.8848, + "step": 10113 + }, + { + "epoch": 0.7306879549189951, + "grad_norm": 5.276555068056285, + "learning_rate": 3.6648052633592713e-06, + "loss": 0.8767, + "step": 10114 + }, + { + "epoch": 0.7307602001192046, + "grad_norm": 6.931129623757452, + "learning_rate": 3.664546441744592e-06, + "loss": 0.8414, + "step": 10115 + }, + { + "epoch": 0.730832445319414, + "grad_norm": 5.452437480443899, + "learning_rate": 3.6642876041881757e-06, + "loss": 0.8246, + "step": 10116 + }, + { + "epoch": 0.7309046905196236, + "grad_norm": 7.936966284883226, + "learning_rate": 3.664028750693566e-06, + "loss": 0.7765, + "step": 10117 + }, + { + "epoch": 0.7309769357198331, + "grad_norm": 6.26437525283644, + "learning_rate": 3.6637698812643076e-06, + "loss": 0.773, + "step": 10118 + }, + { + "epoch": 0.7310491809200427, + "grad_norm": 6.8804532704634545, + "learning_rate": 3.6635109959039416e-06, + "loss": 0.7572, + "step": 10119 + }, + { + "epoch": 0.7311214261202521, + "grad_norm": 6.239695179169819, + "learning_rate": 3.6632520946160142e-06, + "loss": 0.7619, + "step": 10120 + }, + { + "epoch": 0.7311936713204616, + "grad_norm": 5.425575774265788, + "learning_rate": 3.662993177404069e-06, + "loss": 0.7883, + "step": 10121 + }, + { + "epoch": 0.7312659165206712, + "grad_norm": 6.123090485291268, + "learning_rate": 3.66273424427165e-06, + "loss": 0.8307, + "step": 10122 + }, + { + "epoch": 0.7313381617208806, + "grad_norm": 6.4100387162632755, + "learning_rate": 3.662475295222302e-06, + "loss": 0.8187, + "step": 10123 + }, + { + "epoch": 0.7314104069210902, + "grad_norm": 6.552200809592053, + "learning_rate": 3.66221633025957e-06, + "loss": 0.8464, + "step": 10124 + }, + { + "epoch": 0.7314826521212997, + "grad_norm": 8.1115056584984, + "learning_rate": 3.6619573493869988e-06, + "loss": 0.8258, + "step": 10125 + }, + { + "epoch": 0.7315548973215092, + "grad_norm": 6.398230916136208, + "learning_rate": 3.6616983526081336e-06, + "loss": 0.815, + "step": 10126 + }, + { + "epoch": 0.7316271425217187, + "grad_norm": 6.357464777601013, + "learning_rate": 3.66143933992652e-06, + "loss": 0.7698, + "step": 10127 + }, + { + "epoch": 0.7316993877219282, + "grad_norm": 7.30854781853552, + "learning_rate": 3.6611803113457034e-06, + "loss": 0.8168, + "step": 10128 + }, + { + "epoch": 0.7317716329221378, + "grad_norm": 5.873997704250574, + "learning_rate": 3.6609212668692297e-06, + "loss": 0.782, + "step": 10129 + }, + { + "epoch": 0.7318438781223472, + "grad_norm": 7.580554942246766, + "learning_rate": 3.660662206500646e-06, + "loss": 0.8445, + "step": 10130 + }, + { + "epoch": 0.7319161233225567, + "grad_norm": 6.593155531277807, + "learning_rate": 3.6604031302434973e-06, + "loss": 0.856, + "step": 10131 + }, + { + "epoch": 0.7319883685227663, + "grad_norm": 6.236349839944186, + "learning_rate": 3.6601440381013316e-06, + "loss": 0.8537, + "step": 10132 + }, + { + "epoch": 0.7320606137229758, + "grad_norm": 6.4182038220606294, + "learning_rate": 3.6598849300776933e-06, + "loss": 0.8477, + "step": 10133 + }, + { + "epoch": 0.7321328589231852, + "grad_norm": 6.522907678709008, + "learning_rate": 3.659625806176132e-06, + "loss": 0.8503, + "step": 10134 + }, + { + "epoch": 0.7322051041233948, + "grad_norm": 5.574960532283923, + "learning_rate": 3.6593666664001935e-06, + "loss": 0.8399, + "step": 10135 + }, + { + "epoch": 0.7322773493236043, + "grad_norm": 7.066044550928272, + "learning_rate": 3.659107510753426e-06, + "loss": 0.8593, + "step": 10136 + }, + { + "epoch": 0.7323495945238139, + "grad_norm": 5.305558819905731, + "learning_rate": 3.658848339239376e-06, + "loss": 0.7879, + "step": 10137 + }, + { + "epoch": 0.7324218397240233, + "grad_norm": 7.196473731028062, + "learning_rate": 3.6585891518615923e-06, + "loss": 0.8902, + "step": 10138 + }, + { + "epoch": 0.7324940849242328, + "grad_norm": 6.231115268180453, + "learning_rate": 3.658329948623623e-06, + "loss": 0.9211, + "step": 10139 + }, + { + "epoch": 0.7325663301244424, + "grad_norm": 7.636860112981899, + "learning_rate": 3.658070729529016e-06, + "loss": 0.8342, + "step": 10140 + }, + { + "epoch": 0.7326385753246518, + "grad_norm": 5.938094902351758, + "learning_rate": 3.65781149458132e-06, + "loss": 0.8609, + "step": 10141 + }, + { + "epoch": 0.7327108205248614, + "grad_norm": 7.528093238342348, + "learning_rate": 3.6575522437840827e-06, + "loss": 0.8249, + "step": 10142 + }, + { + "epoch": 0.7327830657250709, + "grad_norm": 5.016087971172944, + "learning_rate": 3.6572929771408543e-06, + "loss": 0.8568, + "step": 10143 + }, + { + "epoch": 0.7328553109252804, + "grad_norm": 5.979816503356801, + "learning_rate": 3.6570336946551837e-06, + "loss": 0.8174, + "step": 10144 + }, + { + "epoch": 0.7329275561254899, + "grad_norm": 5.8095410106264325, + "learning_rate": 3.656774396330621e-06, + "loss": 0.8512, + "step": 10145 + }, + { + "epoch": 0.7329998013256994, + "grad_norm": 6.889016030618392, + "learning_rate": 3.656515082170714e-06, + "loss": 0.9389, + "step": 10146 + }, + { + "epoch": 0.733072046525909, + "grad_norm": 6.5569623561963635, + "learning_rate": 3.6562557521790137e-06, + "loss": 0.8607, + "step": 10147 + }, + { + "epoch": 0.7331442917261184, + "grad_norm": 7.401364546051943, + "learning_rate": 3.65599640635907e-06, + "loss": 0.8574, + "step": 10148 + }, + { + "epoch": 0.7332165369263279, + "grad_norm": 6.481435822018251, + "learning_rate": 3.6557370447144334e-06, + "loss": 0.8511, + "step": 10149 + }, + { + "epoch": 0.7332887821265375, + "grad_norm": 7.637149074752321, + "learning_rate": 3.655477667248654e-06, + "loss": 0.8045, + "step": 10150 + }, + { + "epoch": 0.733361027326747, + "grad_norm": 6.6369690648608515, + "learning_rate": 3.6552182739652824e-06, + "loss": 0.7891, + "step": 10151 + }, + { + "epoch": 0.7334332725269564, + "grad_norm": 6.5826927871376455, + "learning_rate": 3.654958864867869e-06, + "loss": 0.885, + "step": 10152 + }, + { + "epoch": 0.733505517727166, + "grad_norm": 6.992228690084683, + "learning_rate": 3.6546994399599663e-06, + "loss": 0.81, + "step": 10153 + }, + { + "epoch": 0.7335777629273755, + "grad_norm": 6.301579462090517, + "learning_rate": 3.654439999245125e-06, + "loss": 0.7769, + "step": 10154 + }, + { + "epoch": 0.7336500081275851, + "grad_norm": 5.564851928002291, + "learning_rate": 3.6541805427268958e-06, + "loss": 0.8448, + "step": 10155 + }, + { + "epoch": 0.7337222533277945, + "grad_norm": 5.364028022937555, + "learning_rate": 3.6539210704088313e-06, + "loss": 0.8265, + "step": 10156 + }, + { + "epoch": 0.733794498528004, + "grad_norm": 6.2462137579348385, + "learning_rate": 3.6536615822944833e-06, + "loss": 0.7927, + "step": 10157 + }, + { + "epoch": 0.7338667437282136, + "grad_norm": 7.519902588233669, + "learning_rate": 3.6534020783874042e-06, + "loss": 0.7935, + "step": 10158 + }, + { + "epoch": 0.733938988928423, + "grad_norm": 10.593307440861947, + "learning_rate": 3.653142558691146e-06, + "loss": 0.828, + "step": 10159 + }, + { + "epoch": 0.7340112341286326, + "grad_norm": 6.416913048443859, + "learning_rate": 3.6528830232092618e-06, + "loss": 0.7939, + "step": 10160 + }, + { + "epoch": 0.7340834793288421, + "grad_norm": 5.602865283093389, + "learning_rate": 3.6526234719453037e-06, + "loss": 0.8268, + "step": 10161 + }, + { + "epoch": 0.7341557245290516, + "grad_norm": 5.2146725744378, + "learning_rate": 3.652363904902825e-06, + "loss": 0.8833, + "step": 10162 + }, + { + "epoch": 0.7342279697292611, + "grad_norm": 6.553783040622774, + "learning_rate": 3.6521043220853804e-06, + "loss": 0.8223, + "step": 10163 + }, + { + "epoch": 0.7343002149294706, + "grad_norm": 6.703129837283365, + "learning_rate": 3.6518447234965214e-06, + "loss": 0.8505, + "step": 10164 + }, + { + "epoch": 0.7343724601296802, + "grad_norm": 6.78864921830761, + "learning_rate": 3.6515851091398024e-06, + "loss": 0.9144, + "step": 10165 + }, + { + "epoch": 0.7344447053298896, + "grad_norm": 7.061537685493335, + "learning_rate": 3.651325479018778e-06, + "loss": 0.8669, + "step": 10166 + }, + { + "epoch": 0.7345169505300991, + "grad_norm": 6.080738437381405, + "learning_rate": 3.651065833137001e-06, + "loss": 0.8068, + "step": 10167 + }, + { + "epoch": 0.7345891957303087, + "grad_norm": 5.260990627690146, + "learning_rate": 3.650806171498027e-06, + "loss": 0.7128, + "step": 10168 + }, + { + "epoch": 0.7346614409305182, + "grad_norm": 6.391837828067332, + "learning_rate": 3.6505464941054098e-06, + "loss": 0.8503, + "step": 10169 + }, + { + "epoch": 0.7347336861307276, + "grad_norm": 6.7156177405331, + "learning_rate": 3.6502868009627046e-06, + "loss": 0.8799, + "step": 10170 + }, + { + "epoch": 0.7348059313309372, + "grad_norm": 5.751454376721576, + "learning_rate": 3.650027092073466e-06, + "loss": 0.7673, + "step": 10171 + }, + { + "epoch": 0.7348781765311467, + "grad_norm": 6.380022052977903, + "learning_rate": 3.649767367441249e-06, + "loss": 0.8085, + "step": 10172 + }, + { + "epoch": 0.7349504217313563, + "grad_norm": 6.217763544562767, + "learning_rate": 3.6495076270696106e-06, + "loss": 0.7188, + "step": 10173 + }, + { + "epoch": 0.7350226669315657, + "grad_norm": 7.360927280393829, + "learning_rate": 3.649247870962105e-06, + "loss": 0.8507, + "step": 10174 + }, + { + "epoch": 0.7350949121317752, + "grad_norm": 7.4542163873517175, + "learning_rate": 3.648988099122288e-06, + "loss": 0.8665, + "step": 10175 + }, + { + "epoch": 0.7351671573319848, + "grad_norm": 7.098331534761713, + "learning_rate": 3.648728311553716e-06, + "loss": 0.8065, + "step": 10176 + }, + { + "epoch": 0.7352394025321942, + "grad_norm": 6.341207361151277, + "learning_rate": 3.648468508259946e-06, + "loss": 0.8331, + "step": 10177 + }, + { + "epoch": 0.7353116477324038, + "grad_norm": 6.062678698236191, + "learning_rate": 3.648208689244533e-06, + "loss": 0.8682, + "step": 10178 + }, + { + "epoch": 0.7353838929326133, + "grad_norm": 6.829277726907892, + "learning_rate": 3.647948854511035e-06, + "loss": 0.767, + "step": 10179 + }, + { + "epoch": 0.7354561381328228, + "grad_norm": 8.35504642969057, + "learning_rate": 3.647689004063009e-06, + "loss": 0.8313, + "step": 10180 + }, + { + "epoch": 0.7355283833330323, + "grad_norm": 7.0480192824412775, + "learning_rate": 3.6474291379040105e-06, + "loss": 0.8574, + "step": 10181 + }, + { + "epoch": 0.7356006285332418, + "grad_norm": 7.133863323642398, + "learning_rate": 3.647169256037599e-06, + "loss": 0.9374, + "step": 10182 + }, + { + "epoch": 0.7356728737334514, + "grad_norm": 8.436143554160648, + "learning_rate": 3.6469093584673306e-06, + "loss": 0.8555, + "step": 10183 + }, + { + "epoch": 0.7357451189336608, + "grad_norm": 5.512467039533069, + "learning_rate": 3.6466494451967637e-06, + "loss": 0.8146, + "step": 10184 + }, + { + "epoch": 0.7358173641338703, + "grad_norm": 6.631381434253537, + "learning_rate": 3.6463895162294566e-06, + "loss": 0.8491, + "step": 10185 + }, + { + "epoch": 0.7358896093340799, + "grad_norm": 5.645194867996698, + "learning_rate": 3.6461295715689664e-06, + "loss": 0.8917, + "step": 10186 + }, + { + "epoch": 0.7359618545342894, + "grad_norm": 7.114721011109148, + "learning_rate": 3.6458696112188522e-06, + "loss": 0.8208, + "step": 10187 + }, + { + "epoch": 0.7360340997344988, + "grad_norm": 7.5233784293988135, + "learning_rate": 3.6456096351826743e-06, + "loss": 0.8427, + "step": 10188 + }, + { + "epoch": 0.7361063449347084, + "grad_norm": 6.528942814024245, + "learning_rate": 3.645349643463989e-06, + "loss": 0.8443, + "step": 10189 + }, + { + "epoch": 0.7361785901349179, + "grad_norm": 5.211023881873977, + "learning_rate": 3.645089636066356e-06, + "loss": 0.8304, + "step": 10190 + }, + { + "epoch": 0.7362508353351275, + "grad_norm": 6.218961012437588, + "learning_rate": 3.644829612993335e-06, + "loss": 0.8596, + "step": 10191 + }, + { + "epoch": 0.7363230805353369, + "grad_norm": 5.537598151376896, + "learning_rate": 3.6445695742484853e-06, + "loss": 0.7752, + "step": 10192 + }, + { + "epoch": 0.7363953257355464, + "grad_norm": 6.545213292713931, + "learning_rate": 3.644309519835368e-06, + "loss": 0.7764, + "step": 10193 + }, + { + "epoch": 0.736467570935756, + "grad_norm": 5.875426581296632, + "learning_rate": 3.644049449757541e-06, + "loss": 0.8452, + "step": 10194 + }, + { + "epoch": 0.7365398161359654, + "grad_norm": 6.7852979259820225, + "learning_rate": 3.643789364018565e-06, + "loss": 0.8134, + "step": 10195 + }, + { + "epoch": 0.736612061336175, + "grad_norm": 5.652978277542916, + "learning_rate": 3.6435292626220013e-06, + "loss": 0.8294, + "step": 10196 + }, + { + "epoch": 0.7366843065363845, + "grad_norm": 7.268952950869411, + "learning_rate": 3.6432691455714102e-06, + "loss": 0.8459, + "step": 10197 + }, + { + "epoch": 0.736756551736594, + "grad_norm": 7.617685029825206, + "learning_rate": 3.643009012870352e-06, + "loss": 0.799, + "step": 10198 + }, + { + "epoch": 0.7368287969368035, + "grad_norm": 5.7660725737737115, + "learning_rate": 3.6427488645223878e-06, + "loss": 0.8831, + "step": 10199 + }, + { + "epoch": 0.736901042137013, + "grad_norm": 6.112275900760021, + "learning_rate": 3.642488700531079e-06, + "loss": 0.7668, + "step": 10200 + }, + { + "epoch": 0.7369732873372226, + "grad_norm": 6.879242853597301, + "learning_rate": 3.6422285208999862e-06, + "loss": 0.8775, + "step": 10201 + }, + { + "epoch": 0.737045532537432, + "grad_norm": 5.6493614781206585, + "learning_rate": 3.641968325632673e-06, + "loss": 0.83, + "step": 10202 + }, + { + "epoch": 0.7371177777376415, + "grad_norm": 6.046480387741757, + "learning_rate": 3.6417081147326994e-06, + "loss": 0.8756, + "step": 10203 + }, + { + "epoch": 0.7371900229378511, + "grad_norm": 6.330026298576794, + "learning_rate": 3.641447888203628e-06, + "loss": 0.8385, + "step": 10204 + }, + { + "epoch": 0.7372622681380606, + "grad_norm": 5.640602851464912, + "learning_rate": 3.641187646049022e-06, + "loss": 0.9488, + "step": 10205 + }, + { + "epoch": 0.73733451333827, + "grad_norm": 7.144339298696036, + "learning_rate": 3.6409273882724426e-06, + "loss": 0.9385, + "step": 10206 + }, + { + "epoch": 0.7374067585384796, + "grad_norm": 5.981218825431428, + "learning_rate": 3.6406671148774538e-06, + "loss": 0.8876, + "step": 10207 + }, + { + "epoch": 0.7374790037386891, + "grad_norm": 6.22802380270504, + "learning_rate": 3.640406825867617e-06, + "loss": 0.7354, + "step": 10208 + }, + { + "epoch": 0.7375512489388987, + "grad_norm": 5.226030034653045, + "learning_rate": 3.6401465212464965e-06, + "loss": 0.8569, + "step": 10209 + }, + { + "epoch": 0.7376234941391081, + "grad_norm": 6.256885245557426, + "learning_rate": 3.6398862010176554e-06, + "loss": 0.8759, + "step": 10210 + }, + { + "epoch": 0.7376957393393176, + "grad_norm": 5.269726341713841, + "learning_rate": 3.639625865184658e-06, + "loss": 0.8067, + "step": 10211 + }, + { + "epoch": 0.7377679845395272, + "grad_norm": 5.546471465570141, + "learning_rate": 3.6393655137510662e-06, + "loss": 0.8108, + "step": 10212 + }, + { + "epoch": 0.7378402297397366, + "grad_norm": 5.923618512921059, + "learning_rate": 3.6391051467204463e-06, + "loss": 0.9281, + "step": 10213 + }, + { + "epoch": 0.7379124749399462, + "grad_norm": 8.426089561031697, + "learning_rate": 3.638844764096361e-06, + "loss": 0.8597, + "step": 10214 + }, + { + "epoch": 0.7379847201401557, + "grad_norm": 6.603714920461998, + "learning_rate": 3.6385843658823743e-06, + "loss": 0.8511, + "step": 10215 + }, + { + "epoch": 0.7380569653403652, + "grad_norm": 5.52947607528184, + "learning_rate": 3.638323952082053e-06, + "loss": 0.7751, + "step": 10216 + }, + { + "epoch": 0.7381292105405747, + "grad_norm": 5.893320291657208, + "learning_rate": 3.6380635226989597e-06, + "loss": 0.7567, + "step": 10217 + }, + { + "epoch": 0.7382014557407842, + "grad_norm": 8.599504909015156, + "learning_rate": 3.63780307773666e-06, + "loss": 0.8363, + "step": 10218 + }, + { + "epoch": 0.7382737009409938, + "grad_norm": 7.4765099470788865, + "learning_rate": 3.6375426171987205e-06, + "loss": 0.8288, + "step": 10219 + }, + { + "epoch": 0.7383459461412032, + "grad_norm": 7.790969599857787, + "learning_rate": 3.637282141088706e-06, + "loss": 0.8106, + "step": 10220 + }, + { + "epoch": 0.7384181913414127, + "grad_norm": 7.927835422951728, + "learning_rate": 3.6370216494101818e-06, + "loss": 0.8518, + "step": 10221 + }, + { + "epoch": 0.7384904365416223, + "grad_norm": 6.422457109414249, + "learning_rate": 3.6367611421667142e-06, + "loss": 0.8751, + "step": 10222 + }, + { + "epoch": 0.7385626817418318, + "grad_norm": 7.868416107826176, + "learning_rate": 3.636500619361869e-06, + "loss": 0.8802, + "step": 10223 + }, + { + "epoch": 0.7386349269420412, + "grad_norm": 7.148796897983817, + "learning_rate": 3.6362400809992127e-06, + "loss": 0.9282, + "step": 10224 + }, + { + "epoch": 0.7387071721422508, + "grad_norm": 8.734177152136448, + "learning_rate": 3.6359795270823117e-06, + "loss": 0.9333, + "step": 10225 + }, + { + "epoch": 0.7387794173424603, + "grad_norm": 6.727425018198886, + "learning_rate": 3.6357189576147336e-06, + "loss": 0.862, + "step": 10226 + }, + { + "epoch": 0.7388516625426699, + "grad_norm": 4.759318396869787, + "learning_rate": 3.635458372600045e-06, + "loss": 0.8036, + "step": 10227 + }, + { + "epoch": 0.7389239077428793, + "grad_norm": 5.828908095533986, + "learning_rate": 3.635197772041811e-06, + "loss": 0.8469, + "step": 10228 + }, + { + "epoch": 0.7389961529430888, + "grad_norm": 5.981104024064427, + "learning_rate": 3.634937155943603e-06, + "loss": 0.8184, + "step": 10229 + }, + { + "epoch": 0.7390683981432984, + "grad_norm": 5.3366943537517875, + "learning_rate": 3.634676524308986e-06, + "loss": 0.7728, + "step": 10230 + }, + { + "epoch": 0.7391406433435078, + "grad_norm": 8.199679335860488, + "learning_rate": 3.6344158771415284e-06, + "loss": 0.8582, + "step": 10231 + }, + { + "epoch": 0.7392128885437174, + "grad_norm": 6.1037809317338425, + "learning_rate": 3.634155214444798e-06, + "loss": 0.8464, + "step": 10232 + }, + { + "epoch": 0.7392851337439269, + "grad_norm": 5.94610197462396, + "learning_rate": 3.633894536222363e-06, + "loss": 0.9242, + "step": 10233 + }, + { + "epoch": 0.7393573789441364, + "grad_norm": 6.087165600876178, + "learning_rate": 3.6336338424777926e-06, + "loss": 0.8536, + "step": 10234 + }, + { + "epoch": 0.7394296241443459, + "grad_norm": 6.500627634117429, + "learning_rate": 3.6333731332146547e-06, + "loss": 0.8626, + "step": 10235 + }, + { + "epoch": 0.7395018693445554, + "grad_norm": 5.061731350790767, + "learning_rate": 3.63311240843652e-06, + "loss": 0.7589, + "step": 10236 + }, + { + "epoch": 0.739574114544765, + "grad_norm": 6.275076342114511, + "learning_rate": 3.632851668146955e-06, + "loss": 0.8536, + "step": 10237 + }, + { + "epoch": 0.7396463597449744, + "grad_norm": 6.2194765519553386, + "learning_rate": 3.63259091234953e-06, + "loss": 0.799, + "step": 10238 + }, + { + "epoch": 0.7397186049451839, + "grad_norm": 6.86543718583788, + "learning_rate": 3.632330141047816e-06, + "loss": 0.8603, + "step": 10239 + }, + { + "epoch": 0.7397908501453935, + "grad_norm": 7.147414437549417, + "learning_rate": 3.6320693542453807e-06, + "loss": 0.8221, + "step": 10240 + }, + { + "epoch": 0.739863095345603, + "grad_norm": 6.371638589900757, + "learning_rate": 3.6318085519457956e-06, + "loss": 0.7588, + "step": 10241 + }, + { + "epoch": 0.7399353405458124, + "grad_norm": 6.566914263606359, + "learning_rate": 3.631547734152629e-06, + "loss": 0.7961, + "step": 10242 + }, + { + "epoch": 0.740007585746022, + "grad_norm": 6.2638777201520766, + "learning_rate": 3.6312869008694533e-06, + "loss": 0.7899, + "step": 10243 + }, + { + "epoch": 0.7400798309462315, + "grad_norm": 7.229515338444798, + "learning_rate": 3.6310260520998386e-06, + "loss": 0.8236, + "step": 10244 + }, + { + "epoch": 0.7401520761464411, + "grad_norm": 7.327824112626199, + "learning_rate": 3.6307651878473553e-06, + "loss": 0.8084, + "step": 10245 + }, + { + "epoch": 0.7402243213466505, + "grad_norm": 7.18337602477284, + "learning_rate": 3.6305043081155744e-06, + "loss": 0.8375, + "step": 10246 + }, + { + "epoch": 0.74029656654686, + "grad_norm": 7.821152191355299, + "learning_rate": 3.630243412908067e-06, + "loss": 0.9676, + "step": 10247 + }, + { + "epoch": 0.7403688117470696, + "grad_norm": 5.665482378604261, + "learning_rate": 3.6299825022284052e-06, + "loss": 0.7689, + "step": 10248 + }, + { + "epoch": 0.740441056947279, + "grad_norm": 8.091363857062838, + "learning_rate": 3.6297215760801603e-06, + "loss": 0.8856, + "step": 10249 + }, + { + "epoch": 0.7405133021474886, + "grad_norm": 8.307012367137384, + "learning_rate": 3.6294606344669048e-06, + "loss": 0.7867, + "step": 10250 + }, + { + "epoch": 0.7405855473476981, + "grad_norm": 5.469114978055268, + "learning_rate": 3.6291996773922088e-06, + "loss": 0.854, + "step": 10251 + }, + { + "epoch": 0.7406577925479076, + "grad_norm": 6.139644631902948, + "learning_rate": 3.628938704859647e-06, + "loss": 0.8101, + "step": 10252 + }, + { + "epoch": 0.7407300377481171, + "grad_norm": 8.49059268747302, + "learning_rate": 3.6286777168727905e-06, + "loss": 0.7733, + "step": 10253 + }, + { + "epoch": 0.7408022829483266, + "grad_norm": 8.171168688853463, + "learning_rate": 3.6284167134352125e-06, + "loss": 0.9165, + "step": 10254 + }, + { + "epoch": 0.7408745281485362, + "grad_norm": 6.384486003551188, + "learning_rate": 3.6281556945504866e-06, + "loss": 0.8455, + "step": 10255 + }, + { + "epoch": 0.7409467733487456, + "grad_norm": 6.298464742425114, + "learning_rate": 3.6278946602221837e-06, + "loss": 0.8131, + "step": 10256 + }, + { + "epoch": 0.7410190185489551, + "grad_norm": 6.443667818094844, + "learning_rate": 3.62763361045388e-06, + "loss": 0.9077, + "step": 10257 + }, + { + "epoch": 0.7410912637491647, + "grad_norm": 7.337945210403841, + "learning_rate": 3.627372545249147e-06, + "loss": 0.8185, + "step": 10258 + }, + { + "epoch": 0.7411635089493742, + "grad_norm": 6.794496317336468, + "learning_rate": 3.6271114646115595e-06, + "loss": 0.7487, + "step": 10259 + }, + { + "epoch": 0.7412357541495836, + "grad_norm": 5.165211421335975, + "learning_rate": 3.626850368544691e-06, + "loss": 0.7669, + "step": 10260 + }, + { + "epoch": 0.7413079993497932, + "grad_norm": 8.730895630643147, + "learning_rate": 3.6265892570521154e-06, + "loss": 0.8391, + "step": 10261 + }, + { + "epoch": 0.7413802445500027, + "grad_norm": 6.219043207205264, + "learning_rate": 3.6263281301374083e-06, + "loss": 0.7662, + "step": 10262 + }, + { + "epoch": 0.7414524897502123, + "grad_norm": 6.003664169444578, + "learning_rate": 3.626066987804144e-06, + "loss": 0.7949, + "step": 10263 + }, + { + "epoch": 0.7415247349504217, + "grad_norm": 6.473860486478579, + "learning_rate": 3.625805830055897e-06, + "loss": 0.8578, + "step": 10264 + }, + { + "epoch": 0.7415969801506312, + "grad_norm": 6.521417106196938, + "learning_rate": 3.6255446568962414e-06, + "loss": 0.8583, + "step": 10265 + }, + { + "epoch": 0.7416692253508408, + "grad_norm": 6.150351239850872, + "learning_rate": 3.6252834683287534e-06, + "loss": 0.8709, + "step": 10266 + }, + { + "epoch": 0.7417414705510502, + "grad_norm": 6.117943260306369, + "learning_rate": 3.6250222643570086e-06, + "loss": 0.8749, + "step": 10267 + }, + { + "epoch": 0.7418137157512598, + "grad_norm": 5.645849962198278, + "learning_rate": 3.624761044984583e-06, + "loss": 0.7926, + "step": 10268 + }, + { + "epoch": 0.7418859609514693, + "grad_norm": 7.177928887670529, + "learning_rate": 3.624499810215052e-06, + "loss": 0.6999, + "step": 10269 + }, + { + "epoch": 0.7419582061516788, + "grad_norm": 7.175211344407499, + "learning_rate": 3.6242385600519914e-06, + "loss": 0.8763, + "step": 10270 + }, + { + "epoch": 0.7420304513518883, + "grad_norm": 5.9194411457318505, + "learning_rate": 3.623977294498978e-06, + "loss": 0.8311, + "step": 10271 + }, + { + "epoch": 0.7421026965520978, + "grad_norm": 5.572225576551667, + "learning_rate": 3.6237160135595878e-06, + "loss": 0.8195, + "step": 10272 + }, + { + "epoch": 0.7421749417523074, + "grad_norm": 6.5756683474068165, + "learning_rate": 3.6234547172373984e-06, + "loss": 0.8793, + "step": 10273 + }, + { + "epoch": 0.7422471869525168, + "grad_norm": 5.508301452144328, + "learning_rate": 3.6231934055359864e-06, + "loss": 0.8248, + "step": 10274 + }, + { + "epoch": 0.7423194321527263, + "grad_norm": 4.6332585600896, + "learning_rate": 3.6229320784589277e-06, + "loss": 0.7755, + "step": 10275 + }, + { + "epoch": 0.7423916773529359, + "grad_norm": 7.271926083649672, + "learning_rate": 3.6226707360098012e-06, + "loss": 0.885, + "step": 10276 + }, + { + "epoch": 0.7424639225531454, + "grad_norm": 5.508542795666514, + "learning_rate": 3.6224093781921848e-06, + "loss": 0.8753, + "step": 10277 + }, + { + "epoch": 0.7425361677533548, + "grad_norm": 5.31034412396841, + "learning_rate": 3.622148005009655e-06, + "loss": 0.8475, + "step": 10278 + }, + { + "epoch": 0.7426084129535644, + "grad_norm": 6.241599579243864, + "learning_rate": 3.62188661646579e-06, + "loss": 0.8122, + "step": 10279 + }, + { + "epoch": 0.7426806581537739, + "grad_norm": 5.727397779720598, + "learning_rate": 3.6216252125641686e-06, + "loss": 0.8242, + "step": 10280 + }, + { + "epoch": 0.7427529033539835, + "grad_norm": 6.531520710130622, + "learning_rate": 3.621363793308369e-06, + "loss": 0.8329, + "step": 10281 + }, + { + "epoch": 0.7428251485541929, + "grad_norm": 7.409077937406811, + "learning_rate": 3.6211023587019695e-06, + "loss": 0.8496, + "step": 10282 + }, + { + "epoch": 0.7428973937544024, + "grad_norm": 7.10388078608449, + "learning_rate": 3.620840908748549e-06, + "loss": 0.8306, + "step": 10283 + }, + { + "epoch": 0.742969638954612, + "grad_norm": 7.274419765339589, + "learning_rate": 3.6205794434516877e-06, + "loss": 0.8508, + "step": 10284 + }, + { + "epoch": 0.7430418841548214, + "grad_norm": 4.813414449633486, + "learning_rate": 3.6203179628149626e-06, + "loss": 0.7913, + "step": 10285 + }, + { + "epoch": 0.743114129355031, + "grad_norm": 5.1832250422279476, + "learning_rate": 3.620056466841955e-06, + "loss": 0.7614, + "step": 10286 + }, + { + "epoch": 0.7431863745552405, + "grad_norm": 6.700312809260725, + "learning_rate": 3.6197949555362448e-06, + "loss": 0.8246, + "step": 10287 + }, + { + "epoch": 0.74325861975545, + "grad_norm": 7.1686536414963085, + "learning_rate": 3.6195334289014107e-06, + "loss": 0.8786, + "step": 10288 + }, + { + "epoch": 0.7433308649556595, + "grad_norm": 5.409191588185365, + "learning_rate": 3.6192718869410325e-06, + "loss": 0.8378, + "step": 10289 + }, + { + "epoch": 0.743403110155869, + "grad_norm": 5.98241709261562, + "learning_rate": 3.6190103296586914e-06, + "loss": 0.9293, + "step": 10290 + }, + { + "epoch": 0.7434753553560786, + "grad_norm": 7.704633027384881, + "learning_rate": 3.6187487570579683e-06, + "loss": 0.8558, + "step": 10291 + }, + { + "epoch": 0.743547600556288, + "grad_norm": 5.031395572428191, + "learning_rate": 3.6184871691424427e-06, + "loss": 0.8458, + "step": 10292 + }, + { + "epoch": 0.7436198457564975, + "grad_norm": 7.484509811302175, + "learning_rate": 3.6182255659156975e-06, + "loss": 0.8037, + "step": 10293 + }, + { + "epoch": 0.7436920909567071, + "grad_norm": 5.517900467630665, + "learning_rate": 3.617963947381311e-06, + "loss": 0.831, + "step": 10294 + }, + { + "epoch": 0.7437643361569166, + "grad_norm": 8.379385184447361, + "learning_rate": 3.6177023135428667e-06, + "loss": 0.9099, + "step": 10295 + }, + { + "epoch": 0.743836581357126, + "grad_norm": 7.394912991995219, + "learning_rate": 3.617440664403946e-06, + "loss": 0.8394, + "step": 10296 + }, + { + "epoch": 0.7439088265573356, + "grad_norm": 6.135291348155995, + "learning_rate": 3.61717899996813e-06, + "loss": 0.7743, + "step": 10297 + }, + { + "epoch": 0.7439810717575451, + "grad_norm": 6.137388818048458, + "learning_rate": 3.616917320239001e-06, + "loss": 0.7656, + "step": 10298 + }, + { + "epoch": 0.7440533169577547, + "grad_norm": 6.20942671533022, + "learning_rate": 3.61665562522014e-06, + "loss": 0.8837, + "step": 10299 + }, + { + "epoch": 0.7441255621579641, + "grad_norm": 7.549064618708178, + "learning_rate": 3.616393914915132e-06, + "loss": 0.8106, + "step": 10300 + }, + { + "epoch": 0.7441978073581736, + "grad_norm": 6.423980442003095, + "learning_rate": 3.6161321893275576e-06, + "loss": 0.7821, + "step": 10301 + }, + { + "epoch": 0.7442700525583832, + "grad_norm": 7.099662568576515, + "learning_rate": 3.615870448461e-06, + "loss": 0.8867, + "step": 10302 + }, + { + "epoch": 0.7443422977585926, + "grad_norm": 6.485702342027791, + "learning_rate": 3.6156086923190425e-06, + "loss": 0.7988, + "step": 10303 + }, + { + "epoch": 0.7444145429588022, + "grad_norm": 7.473225932283925, + "learning_rate": 3.6153469209052685e-06, + "loss": 0.8699, + "step": 10304 + }, + { + "epoch": 0.7444867881590117, + "grad_norm": 7.27345087396394, + "learning_rate": 3.6150851342232605e-06, + "loss": 0.8922, + "step": 10305 + }, + { + "epoch": 0.7445590333592212, + "grad_norm": 7.997558936581324, + "learning_rate": 3.6148233322766037e-06, + "loss": 0.9018, + "step": 10306 + }, + { + "epoch": 0.7446312785594307, + "grad_norm": 5.714507984198188, + "learning_rate": 3.6145615150688806e-06, + "loss": 0.8514, + "step": 10307 + }, + { + "epoch": 0.7447035237596402, + "grad_norm": 6.378348873382981, + "learning_rate": 3.614299682603676e-06, + "loss": 0.8393, + "step": 10308 + }, + { + "epoch": 0.7447757689598498, + "grad_norm": 7.134689435074307, + "learning_rate": 3.614037834884573e-06, + "loss": 0.7894, + "step": 10309 + }, + { + "epoch": 0.7448480141600592, + "grad_norm": 7.336437724225928, + "learning_rate": 3.613775971915158e-06, + "loss": 0.8902, + "step": 10310 + }, + { + "epoch": 0.7449202593602687, + "grad_norm": 5.737044625089946, + "learning_rate": 3.6135140936990144e-06, + "loss": 0.8418, + "step": 10311 + }, + { + "epoch": 0.7449925045604783, + "grad_norm": 6.274780281749195, + "learning_rate": 3.613252200239728e-06, + "loss": 0.8243, + "step": 10312 + }, + { + "epoch": 0.7450647497606878, + "grad_norm": 5.413230666955004, + "learning_rate": 3.612990291540882e-06, + "loss": 0.8229, + "step": 10313 + }, + { + "epoch": 0.7451369949608972, + "grad_norm": 6.748941126528346, + "learning_rate": 3.612728367606064e-06, + "loss": 0.9578, + "step": 10314 + }, + { + "epoch": 0.7452092401611068, + "grad_norm": 5.178011201518568, + "learning_rate": 3.6124664284388587e-06, + "loss": 0.8085, + "step": 10315 + }, + { + "epoch": 0.7452814853613163, + "grad_norm": 5.467482763332134, + "learning_rate": 3.612204474042852e-06, + "loss": 0.7916, + "step": 10316 + }, + { + "epoch": 0.7453537305615259, + "grad_norm": 5.209259357782124, + "learning_rate": 3.611942504421629e-06, + "loss": 0.8854, + "step": 10317 + }, + { + "epoch": 0.7454259757617353, + "grad_norm": 6.075016589985867, + "learning_rate": 3.611680519578776e-06, + "loss": 0.9329, + "step": 10318 + }, + { + "epoch": 0.7454982209619448, + "grad_norm": 5.883092622812746, + "learning_rate": 3.611418519517881e-06, + "loss": 0.7863, + "step": 10319 + }, + { + "epoch": 0.7455704661621544, + "grad_norm": 6.03677415611302, + "learning_rate": 3.6111565042425297e-06, + "loss": 0.8525, + "step": 10320 + }, + { + "epoch": 0.7456427113623638, + "grad_norm": 6.118753790537641, + "learning_rate": 3.610894473756308e-06, + "loss": 0.8343, + "step": 10321 + }, + { + "epoch": 0.7457149565625734, + "grad_norm": 6.201993363402407, + "learning_rate": 3.6106324280628034e-06, + "loss": 0.8955, + "step": 10322 + }, + { + "epoch": 0.7457872017627829, + "grad_norm": 6.861463780504976, + "learning_rate": 3.6103703671656034e-06, + "loss": 0.8709, + "step": 10323 + }, + { + "epoch": 0.7458594469629924, + "grad_norm": 5.527841670925313, + "learning_rate": 3.6101082910682942e-06, + "loss": 0.7704, + "step": 10324 + }, + { + "epoch": 0.7459316921632019, + "grad_norm": 7.527381758304404, + "learning_rate": 3.609846199774466e-06, + "loss": 0.926, + "step": 10325 + }, + { + "epoch": 0.7460039373634114, + "grad_norm": 5.85160128856887, + "learning_rate": 3.609584093287704e-06, + "loss": 0.8408, + "step": 10326 + }, + { + "epoch": 0.746076182563621, + "grad_norm": 7.880017574310069, + "learning_rate": 3.609321971611598e-06, + "loss": 0.9078, + "step": 10327 + }, + { + "epoch": 0.7461484277638304, + "grad_norm": 6.226379927922479, + "learning_rate": 3.6090598347497348e-06, + "loss": 0.8652, + "step": 10328 + }, + { + "epoch": 0.7462206729640399, + "grad_norm": 6.644382553634534, + "learning_rate": 3.608797682705704e-06, + "loss": 0.898, + "step": 10329 + }, + { + "epoch": 0.7462929181642495, + "grad_norm": 6.997126398174557, + "learning_rate": 3.6085355154830947e-06, + "loss": 0.8088, + "step": 10330 + }, + { + "epoch": 0.746365163364459, + "grad_norm": 7.16514334642404, + "learning_rate": 3.6082733330854935e-06, + "loss": 0.8928, + "step": 10331 + }, + { + "epoch": 0.7464374085646684, + "grad_norm": 7.793826315597193, + "learning_rate": 3.6080111355164913e-06, + "loss": 0.9048, + "step": 10332 + }, + { + "epoch": 0.746509653764878, + "grad_norm": 5.915074774558902, + "learning_rate": 3.607748922779677e-06, + "loss": 0.8183, + "step": 10333 + }, + { + "epoch": 0.7465818989650875, + "grad_norm": 5.371740905614833, + "learning_rate": 3.607486694878641e-06, + "loss": 0.8565, + "step": 10334 + }, + { + "epoch": 0.7466541441652971, + "grad_norm": 6.140682152246758, + "learning_rate": 3.6072244518169707e-06, + "loss": 0.8, + "step": 10335 + }, + { + "epoch": 0.7467263893655065, + "grad_norm": 6.480130270131234, + "learning_rate": 3.606962193598258e-06, + "loss": 0.913, + "step": 10336 + }, + { + "epoch": 0.746798634565716, + "grad_norm": 5.652045273034587, + "learning_rate": 3.606699920226092e-06, + "loss": 0.827, + "step": 10337 + }, + { + "epoch": 0.7468708797659256, + "grad_norm": 7.237221341212331, + "learning_rate": 3.606437631704064e-06, + "loss": 0.7938, + "step": 10338 + }, + { + "epoch": 0.746943124966135, + "grad_norm": 5.116219418805005, + "learning_rate": 3.6061753280357636e-06, + "loss": 0.8058, + "step": 10339 + }, + { + "epoch": 0.7470153701663446, + "grad_norm": 7.7014698161520805, + "learning_rate": 3.605913009224782e-06, + "loss": 0.7614, + "step": 10340 + }, + { + "epoch": 0.7470876153665541, + "grad_norm": 7.3843340239354545, + "learning_rate": 3.6056506752747093e-06, + "loss": 0.9002, + "step": 10341 + }, + { + "epoch": 0.7471598605667636, + "grad_norm": 6.342367087277068, + "learning_rate": 3.6053883261891374e-06, + "loss": 0.8039, + "step": 10342 + }, + { + "epoch": 0.7472321057669731, + "grad_norm": 6.369429604280647, + "learning_rate": 3.605125961971659e-06, + "loss": 0.8101, + "step": 10343 + }, + { + "epoch": 0.7473043509671826, + "grad_norm": 5.642692039240404, + "learning_rate": 3.604863582625863e-06, + "loss": 0.7834, + "step": 10344 + }, + { + "epoch": 0.7473765961673922, + "grad_norm": 6.1857214018637, + "learning_rate": 3.604601188155343e-06, + "loss": 0.8676, + "step": 10345 + }, + { + "epoch": 0.7474488413676016, + "grad_norm": 5.534441872897611, + "learning_rate": 3.60433877856369e-06, + "loss": 0.9013, + "step": 10346 + }, + { + "epoch": 0.7475210865678111, + "grad_norm": 6.848201888825082, + "learning_rate": 3.6040763538544966e-06, + "loss": 0.8206, + "step": 10347 + }, + { + "epoch": 0.7475933317680207, + "grad_norm": 6.271512050799258, + "learning_rate": 3.6038139140313555e-06, + "loss": 0.7998, + "step": 10348 + }, + { + "epoch": 0.7476655769682302, + "grad_norm": 6.731344085290847, + "learning_rate": 3.603551459097859e-06, + "loss": 0.8587, + "step": 10349 + }, + { + "epoch": 0.7477378221684396, + "grad_norm": 6.535540034214339, + "learning_rate": 3.6032889890575996e-06, + "loss": 0.8724, + "step": 10350 + }, + { + "epoch": 0.7478100673686492, + "grad_norm": 6.174820363756196, + "learning_rate": 3.60302650391417e-06, + "loss": 0.7308, + "step": 10351 + }, + { + "epoch": 0.7478823125688587, + "grad_norm": 6.587467038616837, + "learning_rate": 3.602764003671165e-06, + "loss": 0.7887, + "step": 10352 + }, + { + "epoch": 0.7479545577690682, + "grad_norm": 5.966091023102754, + "learning_rate": 3.6025014883321772e-06, + "loss": 0.8199, + "step": 10353 + }, + { + "epoch": 0.7480268029692777, + "grad_norm": 6.744367156545168, + "learning_rate": 3.6022389579007994e-06, + "loss": 0.8562, + "step": 10354 + }, + { + "epoch": 0.7480990481694872, + "grad_norm": 6.182812080450769, + "learning_rate": 3.601976412380626e-06, + "loss": 0.841, + "step": 10355 + }, + { + "epoch": 0.7481712933696968, + "grad_norm": 5.006940792586215, + "learning_rate": 3.6017138517752513e-06, + "loss": 0.7711, + "step": 10356 + }, + { + "epoch": 0.7482435385699062, + "grad_norm": 6.420349682467927, + "learning_rate": 3.6014512760882697e-06, + "loss": 0.7515, + "step": 10357 + }, + { + "epoch": 0.7483157837701158, + "grad_norm": 6.743604668013764, + "learning_rate": 3.6011886853232746e-06, + "loss": 0.859, + "step": 10358 + }, + { + "epoch": 0.7483880289703253, + "grad_norm": 7.36118250718271, + "learning_rate": 3.6009260794838624e-06, + "loss": 0.8064, + "step": 10359 + }, + { + "epoch": 0.7484602741705348, + "grad_norm": 6.367875849486441, + "learning_rate": 3.6006634585736267e-06, + "loss": 0.8877, + "step": 10360 + }, + { + "epoch": 0.7485325193707443, + "grad_norm": 10.905586812751928, + "learning_rate": 3.6004008225961622e-06, + "loss": 0.8275, + "step": 10361 + }, + { + "epoch": 0.7486047645709538, + "grad_norm": 6.64522244347337, + "learning_rate": 3.600138171555066e-06, + "loss": 0.8564, + "step": 10362 + }, + { + "epoch": 0.7486770097711634, + "grad_norm": 5.768688845491127, + "learning_rate": 3.5998755054539313e-06, + "loss": 0.8387, + "step": 10363 + }, + { + "epoch": 0.7487492549713728, + "grad_norm": 6.068710449592738, + "learning_rate": 3.599612824296356e-06, + "loss": 0.8749, + "step": 10364 + }, + { + "epoch": 0.7488215001715823, + "grad_norm": 9.23819412287139, + "learning_rate": 3.5993501280859338e-06, + "loss": 0.8363, + "step": 10365 + }, + { + "epoch": 0.7488937453717919, + "grad_norm": 4.938933345526414, + "learning_rate": 3.5990874168262625e-06, + "loss": 0.7156, + "step": 10366 + }, + { + "epoch": 0.7489659905720014, + "grad_norm": 8.451826126238286, + "learning_rate": 3.5988246905209377e-06, + "loss": 0.9045, + "step": 10367 + }, + { + "epoch": 0.7490382357722108, + "grad_norm": 5.150263229605698, + "learning_rate": 3.5985619491735563e-06, + "loss": 0.8206, + "step": 10368 + }, + { + "epoch": 0.7491104809724204, + "grad_norm": 7.257059803542169, + "learning_rate": 3.5982991927877148e-06, + "loss": 0.8784, + "step": 10369 + }, + { + "epoch": 0.7491827261726299, + "grad_norm": 7.234409010871052, + "learning_rate": 3.598036421367009e-06, + "loss": 1.0376, + "step": 10370 + }, + { + "epoch": 0.7492549713728394, + "grad_norm": 6.31415498397221, + "learning_rate": 3.5977736349150395e-06, + "loss": 0.8008, + "step": 10371 + }, + { + "epoch": 0.7493272165730489, + "grad_norm": 6.57250968854037, + "learning_rate": 3.597510833435399e-06, + "loss": 0.8568, + "step": 10372 + }, + { + "epoch": 0.7493994617732584, + "grad_norm": 6.479243957778122, + "learning_rate": 3.5972480169316894e-06, + "loss": 0.8337, + "step": 10373 + }, + { + "epoch": 0.749471706973468, + "grad_norm": 5.966561280639377, + "learning_rate": 3.596985185407505e-06, + "loss": 0.7875, + "step": 10374 + }, + { + "epoch": 0.7495439521736774, + "grad_norm": 5.953836874062743, + "learning_rate": 3.596722338866445e-06, + "loss": 0.7788, + "step": 10375 + }, + { + "epoch": 0.749616197373887, + "grad_norm": 6.198913220554616, + "learning_rate": 3.596459477312108e-06, + "loss": 0.8878, + "step": 10376 + }, + { + "epoch": 0.7496884425740965, + "grad_norm": 5.761958747649895, + "learning_rate": 3.596196600748093e-06, + "loss": 0.7645, + "step": 10377 + }, + { + "epoch": 0.749760687774306, + "grad_norm": 7.435754138331424, + "learning_rate": 3.595933709177997e-06, + "loss": 0.8017, + "step": 10378 + }, + { + "epoch": 0.7498329329745155, + "grad_norm": 6.3123510362026005, + "learning_rate": 3.5956708026054192e-06, + "loss": 0.7727, + "step": 10379 + }, + { + "epoch": 0.749905178174725, + "grad_norm": 5.261019993796077, + "learning_rate": 3.5954078810339593e-06, + "loss": 0.8077, + "step": 10380 + }, + { + "epoch": 0.7499774233749346, + "grad_norm": 6.328478541918079, + "learning_rate": 3.595144944467216e-06, + "loss": 0.8477, + "step": 10381 + }, + { + "epoch": 0.750049668575144, + "grad_norm": 8.725571892913292, + "learning_rate": 3.5948819929087884e-06, + "loss": 0.7962, + "step": 10382 + }, + { + "epoch": 0.7501219137753535, + "grad_norm": 6.25471105406741, + "learning_rate": 3.5946190263622765e-06, + "loss": 0.8302, + "step": 10383 + }, + { + "epoch": 0.7501941589755631, + "grad_norm": 7.093594587194878, + "learning_rate": 3.59435604483128e-06, + "loss": 0.9058, + "step": 10384 + }, + { + "epoch": 0.7502664041757726, + "grad_norm": 6.4003626124654005, + "learning_rate": 3.5940930483193994e-06, + "loss": 0.7755, + "step": 10385 + }, + { + "epoch": 0.750338649375982, + "grad_norm": 5.512618242236946, + "learning_rate": 3.593830036830234e-06, + "loss": 0.7454, + "step": 10386 + }, + { + "epoch": 0.7504108945761916, + "grad_norm": 7.1702230042662, + "learning_rate": 3.5935670103673855e-06, + "loss": 0.8348, + "step": 10387 + }, + { + "epoch": 0.7504831397764011, + "grad_norm": 7.031861138805249, + "learning_rate": 3.593303968934453e-06, + "loss": 0.8441, + "step": 10388 + }, + { + "epoch": 0.7505553849766106, + "grad_norm": 6.918888179567395, + "learning_rate": 3.593040912535038e-06, + "loss": 0.8794, + "step": 10389 + }, + { + "epoch": 0.7506276301768201, + "grad_norm": 6.297090058168423, + "learning_rate": 3.5927778411727415e-06, + "loss": 0.8438, + "step": 10390 + }, + { + "epoch": 0.7506998753770296, + "grad_norm": 9.031797399068056, + "learning_rate": 3.592514754851165e-06, + "loss": 0.9037, + "step": 10391 + }, + { + "epoch": 0.7507721205772392, + "grad_norm": 6.861514928605673, + "learning_rate": 3.5922516535739103e-06, + "loss": 0.8747, + "step": 10392 + }, + { + "epoch": 0.7508443657774486, + "grad_norm": 5.267132008593221, + "learning_rate": 3.5919885373445784e-06, + "loss": 0.8582, + "step": 10393 + }, + { + "epoch": 0.7509166109776582, + "grad_norm": 5.628718185724583, + "learning_rate": 3.5917254061667705e-06, + "loss": 0.8659, + "step": 10394 + }, + { + "epoch": 0.7509888561778677, + "grad_norm": 5.6801388231962475, + "learning_rate": 3.5914622600440903e-06, + "loss": 0.7812, + "step": 10395 + }, + { + "epoch": 0.7510611013780772, + "grad_norm": 5.247294864361511, + "learning_rate": 3.5911990989801394e-06, + "loss": 0.7765, + "step": 10396 + }, + { + "epoch": 0.7511333465782867, + "grad_norm": 5.451910282586289, + "learning_rate": 3.590935922978519e-06, + "loss": 0.829, + "step": 10397 + }, + { + "epoch": 0.7512055917784962, + "grad_norm": 5.893644575873129, + "learning_rate": 3.5906727320428336e-06, + "loss": 0.8306, + "step": 10398 + }, + { + "epoch": 0.7512778369787058, + "grad_norm": 6.37621520184921, + "learning_rate": 3.590409526176685e-06, + "loss": 0.7894, + "step": 10399 + }, + { + "epoch": 0.7513500821789152, + "grad_norm": 6.469447886097216, + "learning_rate": 3.590146305383677e-06, + "loss": 0.8033, + "step": 10400 + }, + { + "epoch": 0.7514223273791247, + "grad_norm": 6.706959301357685, + "learning_rate": 3.5898830696674124e-06, + "loss": 0.8954, + "step": 10401 + }, + { + "epoch": 0.7514945725793343, + "grad_norm": 6.375008676560425, + "learning_rate": 3.589619819031495e-06, + "loss": 0.8889, + "step": 10402 + }, + { + "epoch": 0.7515668177795438, + "grad_norm": 7.427649634673069, + "learning_rate": 3.5893565534795284e-06, + "loss": 0.8624, + "step": 10403 + }, + { + "epoch": 0.7516390629797532, + "grad_norm": 7.57859212477819, + "learning_rate": 3.589093273015116e-06, + "loss": 0.9158, + "step": 10404 + }, + { + "epoch": 0.7517113081799628, + "grad_norm": 6.306470049373308, + "learning_rate": 3.588829977641863e-06, + "loss": 0.9048, + "step": 10405 + }, + { + "epoch": 0.7517835533801723, + "grad_norm": 5.520446571862027, + "learning_rate": 3.588566667363372e-06, + "loss": 0.8396, + "step": 10406 + }, + { + "epoch": 0.7518557985803818, + "grad_norm": 7.009859770543598, + "learning_rate": 3.5883033421832493e-06, + "loss": 0.8404, + "step": 10407 + }, + { + "epoch": 0.7519280437805913, + "grad_norm": 5.724051930915066, + "learning_rate": 3.588040002105098e-06, + "loss": 0.8918, + "step": 10408 + }, + { + "epoch": 0.7520002889808008, + "grad_norm": 6.22939137677581, + "learning_rate": 3.587776647132525e-06, + "loss": 0.8722, + "step": 10409 + }, + { + "epoch": 0.7520725341810104, + "grad_norm": 8.852192976636099, + "learning_rate": 3.5875132772691334e-06, + "loss": 0.7867, + "step": 10410 + }, + { + "epoch": 0.7521447793812198, + "grad_norm": 6.489036704401834, + "learning_rate": 3.58724989251853e-06, + "loss": 0.8779, + "step": 10411 + }, + { + "epoch": 0.7522170245814294, + "grad_norm": 7.469367731956002, + "learning_rate": 3.5869864928843196e-06, + "loss": 0.868, + "step": 10412 + }, + { + "epoch": 0.7522892697816389, + "grad_norm": 6.63124423588566, + "learning_rate": 3.586723078370108e-06, + "loss": 0.9208, + "step": 10413 + }, + { + "epoch": 0.7523615149818484, + "grad_norm": 5.884674871728026, + "learning_rate": 3.5864596489795017e-06, + "loss": 0.8712, + "step": 10414 + }, + { + "epoch": 0.7524337601820579, + "grad_norm": 6.749771962022616, + "learning_rate": 3.5861962047161056e-06, + "loss": 0.8116, + "step": 10415 + }, + { + "epoch": 0.7525060053822674, + "grad_norm": 7.368403701474978, + "learning_rate": 3.585932745583528e-06, + "loss": 0.9096, + "step": 10416 + }, + { + "epoch": 0.752578250582477, + "grad_norm": 7.6608842798477745, + "learning_rate": 3.585669271585373e-06, + "loss": 0.7975, + "step": 10417 + }, + { + "epoch": 0.7526504957826864, + "grad_norm": 6.06744937808025, + "learning_rate": 3.585405782725249e-06, + "loss": 0.9163, + "step": 10418 + }, + { + "epoch": 0.7527227409828959, + "grad_norm": 6.209242717867818, + "learning_rate": 3.5851422790067635e-06, + "loss": 0.8443, + "step": 10419 + }, + { + "epoch": 0.7527949861831055, + "grad_norm": 6.036219945703615, + "learning_rate": 3.584878760433522e-06, + "loss": 0.7442, + "step": 10420 + }, + { + "epoch": 0.752867231383315, + "grad_norm": 6.900964155421622, + "learning_rate": 3.584615227009133e-06, + "loss": 0.8348, + "step": 10421 + }, + { + "epoch": 0.7529394765835244, + "grad_norm": 8.87765377288563, + "learning_rate": 3.584351678737202e-06, + "loss": 0.9392, + "step": 10422 + }, + { + "epoch": 0.753011721783734, + "grad_norm": 6.234218050479766, + "learning_rate": 3.5840881156213405e-06, + "loss": 0.8032, + "step": 10423 + }, + { + "epoch": 0.7530839669839435, + "grad_norm": 7.629082009224097, + "learning_rate": 3.5838245376651537e-06, + "loss": 0.8018, + "step": 10424 + }, + { + "epoch": 0.753156212184153, + "grad_norm": 5.800877590214293, + "learning_rate": 3.583560944872251e-06, + "loss": 0.8363, + "step": 10425 + }, + { + "epoch": 0.7532284573843625, + "grad_norm": 6.000661495619442, + "learning_rate": 3.5832973372462393e-06, + "loss": 0.8739, + "step": 10426 + }, + { + "epoch": 0.753300702584572, + "grad_norm": 5.119638383135303, + "learning_rate": 3.583033714790729e-06, + "loss": 0.7968, + "step": 10427 + }, + { + "epoch": 0.7533729477847816, + "grad_norm": 7.336074519071166, + "learning_rate": 3.5827700775093277e-06, + "loss": 0.8153, + "step": 10428 + }, + { + "epoch": 0.753445192984991, + "grad_norm": 6.630647087912563, + "learning_rate": 3.5825064254056453e-06, + "loss": 0.7896, + "step": 10429 + }, + { + "epoch": 0.7535174381852006, + "grad_norm": 7.575405490207403, + "learning_rate": 3.5822427584832896e-06, + "loss": 0.7714, + "step": 10430 + }, + { + "epoch": 0.7535896833854101, + "grad_norm": 7.85572903285846, + "learning_rate": 3.581979076745871e-06, + "loss": 0.789, + "step": 10431 + }, + { + "epoch": 0.7536619285856196, + "grad_norm": 7.237308574773784, + "learning_rate": 3.5817153801969994e-06, + "loss": 0.8421, + "step": 10432 + }, + { + "epoch": 0.7537341737858291, + "grad_norm": 6.5362731015732205, + "learning_rate": 3.581451668840284e-06, + "loss": 0.8304, + "step": 10433 + }, + { + "epoch": 0.7538064189860386, + "grad_norm": 6.036935922876278, + "learning_rate": 3.581187942679335e-06, + "loss": 0.8493, + "step": 10434 + }, + { + "epoch": 0.7538786641862482, + "grad_norm": 5.335537137249515, + "learning_rate": 3.5809242017177625e-06, + "loss": 0.8401, + "step": 10435 + }, + { + "epoch": 0.7539509093864576, + "grad_norm": 7.790183458236929, + "learning_rate": 3.5806604459591766e-06, + "loss": 0.7791, + "step": 10436 + }, + { + "epoch": 0.7540231545866671, + "grad_norm": 7.927807755188231, + "learning_rate": 3.580396675407189e-06, + "loss": 0.7906, + "step": 10437 + }, + { + "epoch": 0.7540953997868767, + "grad_norm": 8.141421112553607, + "learning_rate": 3.580132890065409e-06, + "loss": 0.8852, + "step": 10438 + }, + { + "epoch": 0.7541676449870862, + "grad_norm": 7.030879710301722, + "learning_rate": 3.579869089937449e-06, + "loss": 0.8231, + "step": 10439 + }, + { + "epoch": 0.7542398901872956, + "grad_norm": 6.5911663471587465, + "learning_rate": 3.5796052750269193e-06, + "loss": 0.8244, + "step": 10440 + }, + { + "epoch": 0.7543121353875052, + "grad_norm": 5.943336589567727, + "learning_rate": 3.5793414453374313e-06, + "loss": 0.7886, + "step": 10441 + }, + { + "epoch": 0.7543843805877147, + "grad_norm": 7.9323040633794575, + "learning_rate": 3.5790776008725975e-06, + "loss": 0.7652, + "step": 10442 + }, + { + "epoch": 0.7544566257879242, + "grad_norm": 7.124078523711801, + "learning_rate": 3.578813741636029e-06, + "loss": 0.8355, + "step": 10443 + }, + { + "epoch": 0.7545288709881337, + "grad_norm": 11.843865950125037, + "learning_rate": 3.5785498676313393e-06, + "loss": 0.8499, + "step": 10444 + }, + { + "epoch": 0.7546011161883432, + "grad_norm": 6.227289671896984, + "learning_rate": 3.5782859788621375e-06, + "loss": 0.7846, + "step": 10445 + }, + { + "epoch": 0.7546733613885528, + "grad_norm": 9.140208073424164, + "learning_rate": 3.578022075332038e-06, + "loss": 0.8407, + "step": 10446 + }, + { + "epoch": 0.7547456065887622, + "grad_norm": 5.642844146658364, + "learning_rate": 3.577758157044654e-06, + "loss": 0.8171, + "step": 10447 + }, + { + "epoch": 0.7548178517889718, + "grad_norm": 6.5411757104738415, + "learning_rate": 3.577494224003598e-06, + "loss": 0.8808, + "step": 10448 + }, + { + "epoch": 0.7548900969891813, + "grad_norm": 8.826589589238614, + "learning_rate": 3.5772302762124824e-06, + "loss": 0.7447, + "step": 10449 + }, + { + "epoch": 0.7549623421893908, + "grad_norm": 8.845502376489447, + "learning_rate": 3.57696631367492e-06, + "loss": 0.9079, + "step": 10450 + }, + { + "epoch": 0.7550345873896003, + "grad_norm": 5.967721261252234, + "learning_rate": 3.576702336394525e-06, + "loss": 0.8447, + "step": 10451 + }, + { + "epoch": 0.7551068325898098, + "grad_norm": 5.996736274613519, + "learning_rate": 3.5764383443749124e-06, + "loss": 0.7914, + "step": 10452 + }, + { + "epoch": 0.7551790777900194, + "grad_norm": 5.809574826781109, + "learning_rate": 3.576174337619694e-06, + "loss": 0.9108, + "step": 10453 + }, + { + "epoch": 0.7552513229902288, + "grad_norm": 5.643254647214288, + "learning_rate": 3.575910316132484e-06, + "loss": 0.8117, + "step": 10454 + }, + { + "epoch": 0.7553235681904383, + "grad_norm": 6.233551047427836, + "learning_rate": 3.575646279916898e-06, + "loss": 0.91, + "step": 10455 + }, + { + "epoch": 0.7553958133906479, + "grad_norm": 8.14848150605327, + "learning_rate": 3.575382228976548e-06, + "loss": 0.8034, + "step": 10456 + }, + { + "epoch": 0.7554680585908574, + "grad_norm": 9.9298246480743, + "learning_rate": 3.5751181633150524e-06, + "loss": 0.8659, + "step": 10457 + }, + { + "epoch": 0.7555403037910668, + "grad_norm": 5.597332707469724, + "learning_rate": 3.574854082936022e-06, + "loss": 0.8045, + "step": 10458 + }, + { + "epoch": 0.7556125489912764, + "grad_norm": 5.863763981210682, + "learning_rate": 3.5745899878430754e-06, + "loss": 0.8686, + "step": 10459 + }, + { + "epoch": 0.7556847941914859, + "grad_norm": 6.708715992630437, + "learning_rate": 3.5743258780398252e-06, + "loss": 0.7978, + "step": 10460 + }, + { + "epoch": 0.7557570393916954, + "grad_norm": 6.504270177975059, + "learning_rate": 3.5740617535298884e-06, + "loss": 0.8092, + "step": 10461 + }, + { + "epoch": 0.7558292845919049, + "grad_norm": 5.21717947896162, + "learning_rate": 3.57379761431688e-06, + "loss": 0.8351, + "step": 10462 + }, + { + "epoch": 0.7559015297921144, + "grad_norm": 5.427610326600637, + "learning_rate": 3.573533460404416e-06, + "loss": 0.8526, + "step": 10463 + }, + { + "epoch": 0.755973774992324, + "grad_norm": 5.2907239820185525, + "learning_rate": 3.5732692917961125e-06, + "loss": 0.7997, + "step": 10464 + }, + { + "epoch": 0.7560460201925334, + "grad_norm": 6.418761006525069, + "learning_rate": 3.5730051084955852e-06, + "loss": 0.856, + "step": 10465 + }, + { + "epoch": 0.756118265392743, + "grad_norm": 7.79960605898099, + "learning_rate": 3.572740910506452e-06, + "loss": 0.8134, + "step": 10466 + }, + { + "epoch": 0.7561905105929525, + "grad_norm": 4.82136511786148, + "learning_rate": 3.5724766978323278e-06, + "loss": 0.8138, + "step": 10467 + }, + { + "epoch": 0.756262755793162, + "grad_norm": 6.498766855495729, + "learning_rate": 3.5722124704768313e-06, + "loss": 0.8544, + "step": 10468 + }, + { + "epoch": 0.7563350009933715, + "grad_norm": 6.321327307184824, + "learning_rate": 3.571948228443578e-06, + "loss": 0.8388, + "step": 10469 + }, + { + "epoch": 0.756407246193581, + "grad_norm": 6.042671418226573, + "learning_rate": 3.5716839717361856e-06, + "loss": 0.8865, + "step": 10470 + }, + { + "epoch": 0.7564794913937906, + "grad_norm": 5.843708874562843, + "learning_rate": 3.5714197003582718e-06, + "loss": 0.8275, + "step": 10471 + }, + { + "epoch": 0.756551736594, + "grad_norm": 5.65725277605376, + "learning_rate": 3.5711554143134548e-06, + "loss": 0.8668, + "step": 10472 + }, + { + "epoch": 0.7566239817942095, + "grad_norm": 5.64783927313802, + "learning_rate": 3.5708911136053514e-06, + "loss": 0.8162, + "step": 10473 + }, + { + "epoch": 0.7566962269944191, + "grad_norm": 7.916722053200235, + "learning_rate": 3.5706267982375802e-06, + "loss": 0.8037, + "step": 10474 + }, + { + "epoch": 0.7567684721946286, + "grad_norm": 6.431632321396098, + "learning_rate": 3.5703624682137593e-06, + "loss": 0.7871, + "step": 10475 + }, + { + "epoch": 0.756840717394838, + "grad_norm": 5.398105550406105, + "learning_rate": 3.570098123537507e-06, + "loss": 0.8909, + "step": 10476 + }, + { + "epoch": 0.7569129625950476, + "grad_norm": 5.375918287746145, + "learning_rate": 3.5698337642124433e-06, + "loss": 0.8707, + "step": 10477 + }, + { + "epoch": 0.7569852077952571, + "grad_norm": 6.707647189794191, + "learning_rate": 3.5695693902421856e-06, + "loss": 0.8757, + "step": 10478 + }, + { + "epoch": 0.7570574529954666, + "grad_norm": 5.603026811853188, + "learning_rate": 3.5693050016303523e-06, + "loss": 0.8601, + "step": 10479 + }, + { + "epoch": 0.7571296981956761, + "grad_norm": 6.626453743986402, + "learning_rate": 3.5690405983805653e-06, + "loss": 0.9214, + "step": 10480 + }, + { + "epoch": 0.7572019433958856, + "grad_norm": 7.36744250991909, + "learning_rate": 3.568776180496442e-06, + "loss": 0.8406, + "step": 10481 + }, + { + "epoch": 0.7572741885960952, + "grad_norm": 5.757868814387091, + "learning_rate": 3.568511747981602e-06, + "loss": 0.8879, + "step": 10482 + }, + { + "epoch": 0.7573464337963046, + "grad_norm": 6.3748130022934, + "learning_rate": 3.5682473008396668e-06, + "loss": 0.8347, + "step": 10483 + }, + { + "epoch": 0.7574186789965142, + "grad_norm": 9.00254361977639, + "learning_rate": 3.567982839074255e-06, + "loss": 0.8419, + "step": 10484 + }, + { + "epoch": 0.7574909241967237, + "grad_norm": 8.365496809229825, + "learning_rate": 3.5677183626889877e-06, + "loss": 0.8856, + "step": 10485 + }, + { + "epoch": 0.7575631693969332, + "grad_norm": 6.5301737651517495, + "learning_rate": 3.5674538716874848e-06, + "loss": 0.9059, + "step": 10486 + }, + { + "epoch": 0.7576354145971427, + "grad_norm": 6.751426546034115, + "learning_rate": 3.5671893660733675e-06, + "loss": 0.8193, + "step": 10487 + }, + { + "epoch": 0.7577076597973522, + "grad_norm": 7.585198504893763, + "learning_rate": 3.566924845850256e-06, + "loss": 0.848, + "step": 10488 + }, + { + "epoch": 0.7577799049975618, + "grad_norm": 6.57220177810741, + "learning_rate": 3.566660311021772e-06, + "loss": 0.8532, + "step": 10489 + }, + { + "epoch": 0.7578521501977712, + "grad_norm": 5.499737993415306, + "learning_rate": 3.566395761591536e-06, + "loss": 0.8488, + "step": 10490 + }, + { + "epoch": 0.7579243953979807, + "grad_norm": 6.118633464759449, + "learning_rate": 3.5661311975631706e-06, + "loss": 0.8521, + "step": 10491 + }, + { + "epoch": 0.7579966405981903, + "grad_norm": 6.636509811767738, + "learning_rate": 3.565866618940297e-06, + "loss": 0.8563, + "step": 10492 + }, + { + "epoch": 0.7580688857983998, + "grad_norm": 6.757220609465865, + "learning_rate": 3.565602025726537e-06, + "loss": 0.7759, + "step": 10493 + }, + { + "epoch": 0.7581411309986092, + "grad_norm": 6.530931382872064, + "learning_rate": 3.5653374179255123e-06, + "loss": 0.8153, + "step": 10494 + }, + { + "epoch": 0.7582133761988188, + "grad_norm": 6.733837699255314, + "learning_rate": 3.5650727955408454e-06, + "loss": 0.7907, + "step": 10495 + }, + { + "epoch": 0.7582856213990283, + "grad_norm": 5.503701265090289, + "learning_rate": 3.5648081585761597e-06, + "loss": 0.8149, + "step": 10496 + }, + { + "epoch": 0.7583578665992378, + "grad_norm": 5.972637089161547, + "learning_rate": 3.5645435070350764e-06, + "loss": 0.8546, + "step": 10497 + }, + { + "epoch": 0.7584301117994473, + "grad_norm": 5.430567388800791, + "learning_rate": 3.5642788409212193e-06, + "loss": 0.7948, + "step": 10498 + }, + { + "epoch": 0.7585023569996568, + "grad_norm": 5.949314930690025, + "learning_rate": 3.5640141602382104e-06, + "loss": 0.8826, + "step": 10499 + }, + { + "epoch": 0.7585746021998664, + "grad_norm": 7.328943788379241, + "learning_rate": 3.563749464989675e-06, + "loss": 0.9075, + "step": 10500 + }, + { + "epoch": 0.7586468474000758, + "grad_norm": 8.210538874422216, + "learning_rate": 3.5634847551792353e-06, + "loss": 0.8049, + "step": 10501 + }, + { + "epoch": 0.7587190926002854, + "grad_norm": 6.00585397611546, + "learning_rate": 3.563220030810515e-06, + "loss": 0.8557, + "step": 10502 + }, + { + "epoch": 0.7587913378004949, + "grad_norm": 5.155006397439444, + "learning_rate": 3.5629552918871367e-06, + "loss": 0.7559, + "step": 10503 + }, + { + "epoch": 0.7588635830007044, + "grad_norm": 5.263951791316455, + "learning_rate": 3.562690538412727e-06, + "loss": 0.8152, + "step": 10504 + }, + { + "epoch": 0.7589358282009139, + "grad_norm": 5.443470625601118, + "learning_rate": 3.5624257703909087e-06, + "loss": 0.8577, + "step": 10505 + }, + { + "epoch": 0.7590080734011234, + "grad_norm": 5.7361000231291115, + "learning_rate": 3.5621609878253066e-06, + "loss": 0.8514, + "step": 10506 + }, + { + "epoch": 0.759080318601333, + "grad_norm": 6.239262627292124, + "learning_rate": 3.561896190719545e-06, + "loss": 0.9443, + "step": 10507 + }, + { + "epoch": 0.7591525638015424, + "grad_norm": 5.624222426182123, + "learning_rate": 3.5616313790772493e-06, + "loss": 0.7847, + "step": 10508 + }, + { + "epoch": 0.7592248090017519, + "grad_norm": 6.050694878135963, + "learning_rate": 3.561366552902045e-06, + "loss": 0.7872, + "step": 10509 + }, + { + "epoch": 0.7592970542019615, + "grad_norm": 6.650336705912111, + "learning_rate": 3.5611017121975556e-06, + "loss": 0.7142, + "step": 10510 + }, + { + "epoch": 0.759369299402171, + "grad_norm": 6.592484628473559, + "learning_rate": 3.560836856967408e-06, + "loss": 0.8429, + "step": 10511 + }, + { + "epoch": 0.7594415446023804, + "grad_norm": 5.012269415748212, + "learning_rate": 3.5605719872152272e-06, + "loss": 0.825, + "step": 10512 + }, + { + "epoch": 0.75951378980259, + "grad_norm": 6.397917599451794, + "learning_rate": 3.56030710294464e-06, + "loss": 0.8652, + "step": 10513 + }, + { + "epoch": 0.7595860350027995, + "grad_norm": 6.129196286556715, + "learning_rate": 3.560042204159272e-06, + "loss": 0.8195, + "step": 10514 + }, + { + "epoch": 0.759658280203009, + "grad_norm": 5.195660159302839, + "learning_rate": 3.559777290862748e-06, + "loss": 0.7837, + "step": 10515 + }, + { + "epoch": 0.7597305254032185, + "grad_norm": 6.647856528945503, + "learning_rate": 3.559512363058697e-06, + "loss": 0.7927, + "step": 10516 + }, + { + "epoch": 0.759802770603428, + "grad_norm": 6.28217942682554, + "learning_rate": 3.5592474207507437e-06, + "loss": 0.76, + "step": 10517 + }, + { + "epoch": 0.7598750158036376, + "grad_norm": 5.203018141557078, + "learning_rate": 3.558982463942516e-06, + "loss": 0.8928, + "step": 10518 + }, + { + "epoch": 0.759947261003847, + "grad_norm": 7.65848430807119, + "learning_rate": 3.558717492637641e-06, + "loss": 0.9122, + "step": 10519 + }, + { + "epoch": 0.7600195062040566, + "grad_norm": 5.435662320796119, + "learning_rate": 3.5584525068397453e-06, + "loss": 0.8218, + "step": 10520 + }, + { + "epoch": 0.7600917514042661, + "grad_norm": 5.983650502305448, + "learning_rate": 3.5581875065524564e-06, + "loss": 0.8329, + "step": 10521 + }, + { + "epoch": 0.7601639966044756, + "grad_norm": 6.616675274872965, + "learning_rate": 3.557922491779402e-06, + "loss": 0.8653, + "step": 10522 + }, + { + "epoch": 0.7602362418046851, + "grad_norm": 6.395290919196882, + "learning_rate": 3.5576574625242104e-06, + "loss": 0.816, + "step": 10523 + }, + { + "epoch": 0.7603084870048946, + "grad_norm": 6.026196828661792, + "learning_rate": 3.5573924187905094e-06, + "loss": 0.7575, + "step": 10524 + }, + { + "epoch": 0.7603807322051042, + "grad_norm": 6.22370628793616, + "learning_rate": 3.5571273605819272e-06, + "loss": 0.8164, + "step": 10525 + }, + { + "epoch": 0.7604529774053136, + "grad_norm": 6.244129322890147, + "learning_rate": 3.556862287902092e-06, + "loss": 0.894, + "step": 10526 + }, + { + "epoch": 0.7605252226055231, + "grad_norm": 6.121209353467222, + "learning_rate": 3.556597200754633e-06, + "loss": 0.7327, + "step": 10527 + }, + { + "epoch": 0.7605974678057327, + "grad_norm": 6.363606462779951, + "learning_rate": 3.556332099143179e-06, + "loss": 0.8706, + "step": 10528 + }, + { + "epoch": 0.7606697130059422, + "grad_norm": 6.691164527887341, + "learning_rate": 3.5560669830713578e-06, + "loss": 0.8136, + "step": 10529 + }, + { + "epoch": 0.7607419582061516, + "grad_norm": 4.956335525448243, + "learning_rate": 3.5558018525428006e-06, + "loss": 0.8064, + "step": 10530 + }, + { + "epoch": 0.7608142034063612, + "grad_norm": 5.589741384020765, + "learning_rate": 3.5555367075611347e-06, + "loss": 0.8641, + "step": 10531 + }, + { + "epoch": 0.7608864486065707, + "grad_norm": 6.709837213807129, + "learning_rate": 3.5552715481299914e-06, + "loss": 0.7754, + "step": 10532 + }, + { + "epoch": 0.7609586938067802, + "grad_norm": 6.039711506503196, + "learning_rate": 3.555006374253e-06, + "loss": 0.8692, + "step": 10533 + }, + { + "epoch": 0.7610309390069897, + "grad_norm": 5.444352841201337, + "learning_rate": 3.554741185933791e-06, + "loss": 0.9027, + "step": 10534 + }, + { + "epoch": 0.7611031842071992, + "grad_norm": 5.50689351957725, + "learning_rate": 3.5544759831759934e-06, + "loss": 0.7378, + "step": 10535 + }, + { + "epoch": 0.7611754294074088, + "grad_norm": 6.208633552136638, + "learning_rate": 3.554210765983238e-06, + "loss": 0.8857, + "step": 10536 + }, + { + "epoch": 0.7612476746076182, + "grad_norm": 6.691144574020892, + "learning_rate": 3.5539455343591566e-06, + "loss": 0.8971, + "step": 10537 + }, + { + "epoch": 0.7613199198078278, + "grad_norm": 6.657865373051906, + "learning_rate": 3.553680288307379e-06, + "loss": 0.8908, + "step": 10538 + }, + { + "epoch": 0.7613921650080373, + "grad_norm": 6.1894374087643635, + "learning_rate": 3.5534150278315366e-06, + "loss": 0.756, + "step": 10539 + }, + { + "epoch": 0.7614644102082468, + "grad_norm": 6.597001995850233, + "learning_rate": 3.55314975293526e-06, + "loss": 0.8363, + "step": 10540 + }, + { + "epoch": 0.7615366554084563, + "grad_norm": 5.363451416694586, + "learning_rate": 3.552884463622181e-06, + "loss": 0.8343, + "step": 10541 + }, + { + "epoch": 0.7616089006086658, + "grad_norm": 7.338509495711806, + "learning_rate": 3.5526191598959307e-06, + "loss": 0.8819, + "step": 10542 + }, + { + "epoch": 0.7616811458088754, + "grad_norm": 6.359780188546681, + "learning_rate": 3.552353841760143e-06, + "loss": 0.8982, + "step": 10543 + }, + { + "epoch": 0.7617533910090848, + "grad_norm": 6.60597433707842, + "learning_rate": 3.5520885092184472e-06, + "loss": 0.8839, + "step": 10544 + }, + { + "epoch": 0.7618256362092943, + "grad_norm": 5.646738052607508, + "learning_rate": 3.551823162274476e-06, + "loss": 0.7699, + "step": 10545 + }, + { + "epoch": 0.7618978814095039, + "grad_norm": 5.459058097109536, + "learning_rate": 3.5515578009318635e-06, + "loss": 0.805, + "step": 10546 + }, + { + "epoch": 0.7619701266097134, + "grad_norm": 7.4426174990265626, + "learning_rate": 3.5512924251942405e-06, + "loss": 0.9882, + "step": 10547 + }, + { + "epoch": 0.7620423718099228, + "grad_norm": 8.273687400321709, + "learning_rate": 3.551027035065241e-06, + "loss": 0.9147, + "step": 10548 + }, + { + "epoch": 0.7621146170101324, + "grad_norm": 5.982226750613182, + "learning_rate": 3.550761630548497e-06, + "loss": 0.9244, + "step": 10549 + }, + { + "epoch": 0.7621868622103419, + "grad_norm": 6.365251455082178, + "learning_rate": 3.5504962116476427e-06, + "loss": 0.8166, + "step": 10550 + }, + { + "epoch": 0.7622591074105514, + "grad_norm": 5.427612786511464, + "learning_rate": 3.5502307783663104e-06, + "loss": 0.7612, + "step": 10551 + }, + { + "epoch": 0.7623313526107609, + "grad_norm": 5.072101671156527, + "learning_rate": 3.5499653307081345e-06, + "loss": 0.9005, + "step": 10552 + }, + { + "epoch": 0.7624035978109704, + "grad_norm": 5.53552718659393, + "learning_rate": 3.549699868676749e-06, + "loss": 0.7242, + "step": 10553 + }, + { + "epoch": 0.76247584301118, + "grad_norm": 5.396789034107421, + "learning_rate": 3.5494343922757864e-06, + "loss": 0.779, + "step": 10554 + }, + { + "epoch": 0.7625480882113894, + "grad_norm": 6.138810139530235, + "learning_rate": 3.5491689015088813e-06, + "loss": 0.8245, + "step": 10555 + }, + { + "epoch": 0.762620333411599, + "grad_norm": 6.996164088318941, + "learning_rate": 3.5489033963796694e-06, + "loss": 0.8745, + "step": 10556 + }, + { + "epoch": 0.7626925786118085, + "grad_norm": 7.465661850938508, + "learning_rate": 3.548637876891785e-06, + "loss": 0.8221, + "step": 10557 + }, + { + "epoch": 0.7627648238120179, + "grad_norm": 6.3342044967340065, + "learning_rate": 3.5483723430488614e-06, + "loss": 0.9252, + "step": 10558 + }, + { + "epoch": 0.7628370690122275, + "grad_norm": 6.820006268629846, + "learning_rate": 3.548106794854535e-06, + "loss": 0.8288, + "step": 10559 + }, + { + "epoch": 0.762909314212437, + "grad_norm": 5.694418252197237, + "learning_rate": 3.54784123231244e-06, + "loss": 0.8387, + "step": 10560 + }, + { + "epoch": 0.7629815594126466, + "grad_norm": 6.641500689045202, + "learning_rate": 3.5475756554262118e-06, + "loss": 0.8814, + "step": 10561 + }, + { + "epoch": 0.763053804612856, + "grad_norm": 6.4993772942030645, + "learning_rate": 3.547310064199487e-06, + "loss": 0.8843, + "step": 10562 + }, + { + "epoch": 0.7631260498130655, + "grad_norm": 7.053414771770263, + "learning_rate": 3.5470444586359e-06, + "loss": 0.8171, + "step": 10563 + }, + { + "epoch": 0.7631982950132751, + "grad_norm": 5.552061766772821, + "learning_rate": 3.5467788387390877e-06, + "loss": 0.792, + "step": 10564 + }, + { + "epoch": 0.7632705402134846, + "grad_norm": 5.451552899506092, + "learning_rate": 3.5465132045126856e-06, + "loss": 0.8616, + "step": 10565 + }, + { + "epoch": 0.763342785413694, + "grad_norm": 6.012735202812496, + "learning_rate": 3.5462475559603302e-06, + "loss": 0.8862, + "step": 10566 + }, + { + "epoch": 0.7634150306139036, + "grad_norm": 5.927288389318212, + "learning_rate": 3.545981893085658e-06, + "loss": 0.8018, + "step": 10567 + }, + { + "epoch": 0.7634872758141131, + "grad_norm": 5.800008497560787, + "learning_rate": 3.545716215892307e-06, + "loss": 0.9079, + "step": 10568 + }, + { + "epoch": 0.7635595210143226, + "grad_norm": 5.303506756306112, + "learning_rate": 3.545450524383912e-06, + "loss": 0.7976, + "step": 10569 + }, + { + "epoch": 0.7636317662145321, + "grad_norm": 6.196592489738753, + "learning_rate": 3.5451848185641114e-06, + "loss": 0.747, + "step": 10570 + }, + { + "epoch": 0.7637040114147416, + "grad_norm": 8.325207168704901, + "learning_rate": 3.5449190984365423e-06, + "loss": 0.8978, + "step": 10571 + }, + { + "epoch": 0.7637762566149512, + "grad_norm": 6.439358341166243, + "learning_rate": 3.5446533640048416e-06, + "loss": 0.8297, + "step": 10572 + }, + { + "epoch": 0.7638485018151606, + "grad_norm": 5.504948210788799, + "learning_rate": 3.5443876152726476e-06, + "loss": 0.8406, + "step": 10573 + }, + { + "epoch": 0.7639207470153702, + "grad_norm": 4.908988431381112, + "learning_rate": 3.544121852243598e-06, + "loss": 0.7458, + "step": 10574 + }, + { + "epoch": 0.7639929922155797, + "grad_norm": 7.2629384313928425, + "learning_rate": 3.5438560749213306e-06, + "loss": 0.8144, + "step": 10575 + }, + { + "epoch": 0.7640652374157891, + "grad_norm": 5.697044329028349, + "learning_rate": 3.543590283309485e-06, + "loss": 0.8543, + "step": 10576 + }, + { + "epoch": 0.7641374826159987, + "grad_norm": 7.312746483976866, + "learning_rate": 3.543324477411698e-06, + "loss": 0.8752, + "step": 10577 + }, + { + "epoch": 0.7642097278162082, + "grad_norm": 6.54128651415407, + "learning_rate": 3.5430586572316096e-06, + "loss": 0.7252, + "step": 10578 + }, + { + "epoch": 0.7642819730164178, + "grad_norm": 8.734695128441313, + "learning_rate": 3.5427928227728568e-06, + "loss": 0.8541, + "step": 10579 + }, + { + "epoch": 0.7643542182166272, + "grad_norm": 6.40183519039124, + "learning_rate": 3.542526974039081e-06, + "loss": 0.8018, + "step": 10580 + }, + { + "epoch": 0.7644264634168367, + "grad_norm": 5.862947803955302, + "learning_rate": 3.5422611110339207e-06, + "loss": 0.8342, + "step": 10581 + }, + { + "epoch": 0.7644987086170463, + "grad_norm": 6.536021848494993, + "learning_rate": 3.5419952337610147e-06, + "loss": 0.8159, + "step": 10582 + }, + { + "epoch": 0.7645709538172558, + "grad_norm": 8.345973943578697, + "learning_rate": 3.5417293422240028e-06, + "loss": 0.8684, + "step": 10583 + }, + { + "epoch": 0.7646431990174652, + "grad_norm": 7.866895350133442, + "learning_rate": 3.5414634364265245e-06, + "loss": 0.7834, + "step": 10584 + }, + { + "epoch": 0.7647154442176748, + "grad_norm": 7.393656522235721, + "learning_rate": 3.5411975163722214e-06, + "loss": 0.8873, + "step": 10585 + }, + { + "epoch": 0.7647876894178843, + "grad_norm": 5.947575102272183, + "learning_rate": 3.5409315820647322e-06, + "loss": 0.8042, + "step": 10586 + }, + { + "epoch": 0.7648599346180938, + "grad_norm": 6.775307860189217, + "learning_rate": 3.5406656335076984e-06, + "loss": 0.8111, + "step": 10587 + }, + { + "epoch": 0.7649321798183033, + "grad_norm": 7.032340139015642, + "learning_rate": 3.540399670704759e-06, + "loss": 0.7652, + "step": 10588 + }, + { + "epoch": 0.7650044250185128, + "grad_norm": 8.419301844849578, + "learning_rate": 3.540133693659557e-06, + "loss": 0.8302, + "step": 10589 + }, + { + "epoch": 0.7650766702187224, + "grad_norm": 7.140903058470863, + "learning_rate": 3.539867702375732e-06, + "loss": 0.856, + "step": 10590 + }, + { + "epoch": 0.7651489154189318, + "grad_norm": 10.425840890027056, + "learning_rate": 3.5396016968569256e-06, + "loss": 0.8022, + "step": 10591 + }, + { + "epoch": 0.7652211606191414, + "grad_norm": 5.418930969384289, + "learning_rate": 3.539335677106779e-06, + "loss": 0.8049, + "step": 10592 + }, + { + "epoch": 0.7652934058193509, + "grad_norm": 6.8877090893143444, + "learning_rate": 3.539069643128934e-06, + "loss": 0.8738, + "step": 10593 + }, + { + "epoch": 0.7653656510195603, + "grad_norm": 7.643463294078783, + "learning_rate": 3.5388035949270327e-06, + "loss": 0.7744, + "step": 10594 + }, + { + "epoch": 0.7654378962197699, + "grad_norm": 6.679774876073259, + "learning_rate": 3.5385375325047167e-06, + "loss": 0.784, + "step": 10595 + }, + { + "epoch": 0.7655101414199794, + "grad_norm": 5.7324195458896465, + "learning_rate": 3.5382714558656283e-06, + "loss": 0.8695, + "step": 10596 + }, + { + "epoch": 0.765582386620189, + "grad_norm": 5.557466436877643, + "learning_rate": 3.5380053650134095e-06, + "loss": 0.8842, + "step": 10597 + }, + { + "epoch": 0.7656546318203984, + "grad_norm": 5.90507762111776, + "learning_rate": 3.537739259951703e-06, + "loss": 0.7624, + "step": 10598 + }, + { + "epoch": 0.7657268770206079, + "grad_norm": 5.728745360633725, + "learning_rate": 3.537473140684152e-06, + "loss": 0.8139, + "step": 10599 + }, + { + "epoch": 0.7657991222208175, + "grad_norm": 5.424982153515972, + "learning_rate": 3.537207007214399e-06, + "loss": 0.9093, + "step": 10600 + }, + { + "epoch": 0.765871367421027, + "grad_norm": 6.143867548985138, + "learning_rate": 3.5369408595460886e-06, + "loss": 0.7784, + "step": 10601 + }, + { + "epoch": 0.7659436126212364, + "grad_norm": 6.42538734782426, + "learning_rate": 3.5366746976828614e-06, + "loss": 0.8832, + "step": 10602 + }, + { + "epoch": 0.766015857821446, + "grad_norm": 6.510030196862569, + "learning_rate": 3.536408521628364e-06, + "loss": 0.7811, + "step": 10603 + }, + { + "epoch": 0.7660881030216555, + "grad_norm": 6.060112866650503, + "learning_rate": 3.5361423313862375e-06, + "loss": 0.8757, + "step": 10604 + }, + { + "epoch": 0.766160348221865, + "grad_norm": 6.299575718716288, + "learning_rate": 3.535876126960128e-06, + "loss": 0.7758, + "step": 10605 + }, + { + "epoch": 0.7662325934220745, + "grad_norm": 7.5679480735013565, + "learning_rate": 3.5356099083536778e-06, + "loss": 0.858, + "step": 10606 + }, + { + "epoch": 0.766304838622284, + "grad_norm": 5.687315214739432, + "learning_rate": 3.5353436755705317e-06, + "loss": 0.7494, + "step": 10607 + }, + { + "epoch": 0.7663770838224936, + "grad_norm": 6.171946078506851, + "learning_rate": 3.5350774286143353e-06, + "loss": 0.8186, + "step": 10608 + }, + { + "epoch": 0.766449329022703, + "grad_norm": 7.618527025647894, + "learning_rate": 3.5348111674887325e-06, + "loss": 0.8172, + "step": 10609 + }, + { + "epoch": 0.7665215742229126, + "grad_norm": 6.160830579967168, + "learning_rate": 3.5345448921973684e-06, + "loss": 0.8811, + "step": 10610 + }, + { + "epoch": 0.7665938194231221, + "grad_norm": 5.902542807695653, + "learning_rate": 3.534278602743888e-06, + "loss": 0.9042, + "step": 10611 + }, + { + "epoch": 0.7666660646233315, + "grad_norm": 6.131197227072039, + "learning_rate": 3.5340122991319358e-06, + "loss": 0.8464, + "step": 10612 + }, + { + "epoch": 0.7667383098235411, + "grad_norm": 7.531193523274366, + "learning_rate": 3.533745981365159e-06, + "loss": 0.7866, + "step": 10613 + }, + { + "epoch": 0.7668105550237506, + "grad_norm": 6.838688899419033, + "learning_rate": 3.5334796494472024e-06, + "loss": 0.8498, + "step": 10614 + }, + { + "epoch": 0.7668828002239602, + "grad_norm": 6.23568642956032, + "learning_rate": 3.533213303381711e-06, + "loss": 0.8847, + "step": 10615 + }, + { + "epoch": 0.7669550454241696, + "grad_norm": 6.5122254648753595, + "learning_rate": 3.5329469431723325e-06, + "loss": 0.8592, + "step": 10616 + }, + { + "epoch": 0.7670272906243791, + "grad_norm": 6.341869958868229, + "learning_rate": 3.5326805688227124e-06, + "loss": 0.8843, + "step": 10617 + }, + { + "epoch": 0.7670995358245887, + "grad_norm": 6.4453204900518655, + "learning_rate": 3.5324141803364966e-06, + "loss": 0.8272, + "step": 10618 + }, + { + "epoch": 0.7671717810247982, + "grad_norm": 6.898292064753749, + "learning_rate": 3.532147777717333e-06, + "loss": 0.7771, + "step": 10619 + }, + { + "epoch": 0.7672440262250076, + "grad_norm": 6.592485496438498, + "learning_rate": 3.531881360968867e-06, + "loss": 0.8498, + "step": 10620 + }, + { + "epoch": 0.7673162714252172, + "grad_norm": 6.6331290988375695, + "learning_rate": 3.5316149300947465e-06, + "loss": 0.771, + "step": 10621 + }, + { + "epoch": 0.7673885166254267, + "grad_norm": 6.09627445456172, + "learning_rate": 3.5313484850986183e-06, + "loss": 0.789, + "step": 10622 + }, + { + "epoch": 0.7674607618256362, + "grad_norm": 8.21275104853156, + "learning_rate": 3.531082025984131e-06, + "loss": 0.7957, + "step": 10623 + }, + { + "epoch": 0.7675330070258457, + "grad_norm": 8.401258610484168, + "learning_rate": 3.530815552754931e-06, + "loss": 0.8273, + "step": 10624 + }, + { + "epoch": 0.7676052522260552, + "grad_norm": 5.994980938064266, + "learning_rate": 3.530549065414667e-06, + "loss": 0.8729, + "step": 10625 + }, + { + "epoch": 0.7676774974262648, + "grad_norm": 5.885882423898645, + "learning_rate": 3.5302825639669854e-06, + "loss": 0.8322, + "step": 10626 + }, + { + "epoch": 0.7677497426264742, + "grad_norm": 7.9570815881139705, + "learning_rate": 3.5300160484155356e-06, + "loss": 0.8047, + "step": 10627 + }, + { + "epoch": 0.7678219878266838, + "grad_norm": 6.973921836736637, + "learning_rate": 3.5297495187639662e-06, + "loss": 0.755, + "step": 10628 + }, + { + "epoch": 0.7678942330268933, + "grad_norm": 5.883103970099292, + "learning_rate": 3.5294829750159255e-06, + "loss": 0.7767, + "step": 10629 + }, + { + "epoch": 0.7679664782271027, + "grad_norm": 7.362582340006726, + "learning_rate": 3.529216417175062e-06, + "loss": 0.9031, + "step": 10630 + }, + { + "epoch": 0.7680387234273123, + "grad_norm": 5.5577181718278785, + "learning_rate": 3.528949845245025e-06, + "loss": 0.8295, + "step": 10631 + }, + { + "epoch": 0.7681109686275218, + "grad_norm": 5.7538777419948905, + "learning_rate": 3.528683259229464e-06, + "loss": 0.9054, + "step": 10632 + }, + { + "epoch": 0.7681832138277314, + "grad_norm": 5.861122135354874, + "learning_rate": 3.528416659132027e-06, + "loss": 0.7916, + "step": 10633 + }, + { + "epoch": 0.7682554590279408, + "grad_norm": 6.562172436486383, + "learning_rate": 3.5281500449563654e-06, + "loss": 0.9045, + "step": 10634 + }, + { + "epoch": 0.7683277042281503, + "grad_norm": 5.836603864545791, + "learning_rate": 3.527883416706127e-06, + "loss": 0.8442, + "step": 10635 + }, + { + "epoch": 0.7683999494283599, + "grad_norm": 6.293012395100045, + "learning_rate": 3.5276167743849633e-06, + "loss": 0.8578, + "step": 10636 + }, + { + "epoch": 0.7684721946285694, + "grad_norm": 5.924554465238063, + "learning_rate": 3.527350117996524e-06, + "loss": 0.8434, + "step": 10637 + }, + { + "epoch": 0.7685444398287788, + "grad_norm": 6.566946503280891, + "learning_rate": 3.5270834475444587e-06, + "loss": 0.8394, + "step": 10638 + }, + { + "epoch": 0.7686166850289884, + "grad_norm": 5.863196345050644, + "learning_rate": 3.526816763032419e-06, + "loss": 0.7872, + "step": 10639 + }, + { + "epoch": 0.7686889302291979, + "grad_norm": 5.583012168339432, + "learning_rate": 3.526550064464055e-06, + "loss": 0.8849, + "step": 10640 + }, + { + "epoch": 0.7687611754294074, + "grad_norm": 6.868146219695471, + "learning_rate": 3.5262833518430175e-06, + "loss": 0.8929, + "step": 10641 + }, + { + "epoch": 0.7688334206296169, + "grad_norm": 6.481917244123683, + "learning_rate": 3.5260166251729585e-06, + "loss": 0.7789, + "step": 10642 + }, + { + "epoch": 0.7689056658298264, + "grad_norm": 5.920488582760189, + "learning_rate": 3.525749884457528e-06, + "loss": 0.8758, + "step": 10643 + }, + { + "epoch": 0.768977911030036, + "grad_norm": 5.958645721284922, + "learning_rate": 3.5254831297003773e-06, + "loss": 0.8314, + "step": 10644 + }, + { + "epoch": 0.7690501562302454, + "grad_norm": 5.706615143005603, + "learning_rate": 3.525216360905159e-06, + "loss": 0.8552, + "step": 10645 + }, + { + "epoch": 0.769122401430455, + "grad_norm": 5.307007766732325, + "learning_rate": 3.5249495780755257e-06, + "loss": 0.8474, + "step": 10646 + }, + { + "epoch": 0.7691946466306645, + "grad_norm": 6.0440768290596845, + "learning_rate": 3.524682781215128e-06, + "loss": 0.9122, + "step": 10647 + }, + { + "epoch": 0.7692668918308739, + "grad_norm": 7.33157000871741, + "learning_rate": 3.5244159703276186e-06, + "loss": 0.8607, + "step": 10648 + }, + { + "epoch": 0.7693391370310835, + "grad_norm": 5.869195546228059, + "learning_rate": 3.5241491454166497e-06, + "loss": 0.8287, + "step": 10649 + }, + { + "epoch": 0.769411382231293, + "grad_norm": 7.444158058288213, + "learning_rate": 3.5238823064858744e-06, + "loss": 0.8436, + "step": 10650 + }, + { + "epoch": 0.7694836274315026, + "grad_norm": 6.265464571375854, + "learning_rate": 3.523615453538946e-06, + "loss": 0.7694, + "step": 10651 + }, + { + "epoch": 0.769555872631712, + "grad_norm": 8.030449615850587, + "learning_rate": 3.523348586579516e-06, + "loss": 0.8184, + "step": 10652 + }, + { + "epoch": 0.7696281178319215, + "grad_norm": 7.796132757599827, + "learning_rate": 3.5230817056112387e-06, + "loss": 0.908, + "step": 10653 + }, + { + "epoch": 0.7697003630321311, + "grad_norm": 4.960979022956148, + "learning_rate": 3.5228148106377672e-06, + "loss": 0.8452, + "step": 10654 + }, + { + "epoch": 0.7697726082323406, + "grad_norm": 6.585248483475117, + "learning_rate": 3.522547901662755e-06, + "loss": 0.862, + "step": 10655 + }, + { + "epoch": 0.76984485343255, + "grad_norm": 5.437921617319735, + "learning_rate": 3.5222809786898558e-06, + "loss": 0.8518, + "step": 10656 + }, + { + "epoch": 0.7699170986327596, + "grad_norm": 6.612718246117784, + "learning_rate": 3.5220140417227244e-06, + "loss": 0.8626, + "step": 10657 + }, + { + "epoch": 0.7699893438329691, + "grad_norm": 7.023477420544918, + "learning_rate": 3.5217470907650143e-06, + "loss": 0.8247, + "step": 10658 + }, + { + "epoch": 0.7700615890331786, + "grad_norm": 6.072633120513491, + "learning_rate": 3.521480125820379e-06, + "loss": 0.7633, + "step": 10659 + }, + { + "epoch": 0.7701338342333881, + "grad_norm": 7.272303245927828, + "learning_rate": 3.521213146892475e-06, + "loss": 0.8663, + "step": 10660 + }, + { + "epoch": 0.7702060794335976, + "grad_norm": 6.364304789822976, + "learning_rate": 3.520946153984955e-06, + "loss": 0.7622, + "step": 10661 + }, + { + "epoch": 0.7702783246338072, + "grad_norm": 5.174047284724544, + "learning_rate": 3.5206791471014757e-06, + "loss": 0.855, + "step": 10662 + }, + { + "epoch": 0.7703505698340166, + "grad_norm": 6.290803694330754, + "learning_rate": 3.5204121262456903e-06, + "loss": 0.7619, + "step": 10663 + }, + { + "epoch": 0.7704228150342262, + "grad_norm": 5.681697694367903, + "learning_rate": 3.5201450914212555e-06, + "loss": 0.7897, + "step": 10664 + }, + { + "epoch": 0.7704950602344357, + "grad_norm": 6.777202189173601, + "learning_rate": 3.519878042631827e-06, + "loss": 0.8137, + "step": 10665 + }, + { + "epoch": 0.7705673054346451, + "grad_norm": 6.313074973330863, + "learning_rate": 3.51961097988106e-06, + "loss": 0.818, + "step": 10666 + }, + { + "epoch": 0.7706395506348547, + "grad_norm": 6.508041103052703, + "learning_rate": 3.5193439031726105e-06, + "loss": 0.7999, + "step": 10667 + }, + { + "epoch": 0.7707117958350642, + "grad_norm": 5.375243425401702, + "learning_rate": 3.5190768125101337e-06, + "loss": 0.8418, + "step": 10668 + }, + { + "epoch": 0.7707840410352738, + "grad_norm": 8.758881775017315, + "learning_rate": 3.5188097078972864e-06, + "loss": 0.79, + "step": 10669 + }, + { + "epoch": 0.7708562862354832, + "grad_norm": 6.427330799583555, + "learning_rate": 3.518542589337726e-06, + "loss": 0.8901, + "step": 10670 + }, + { + "epoch": 0.7709285314356927, + "grad_norm": 6.417096738728443, + "learning_rate": 3.5182754568351086e-06, + "loss": 0.8532, + "step": 10671 + }, + { + "epoch": 0.7710007766359023, + "grad_norm": 5.579537178178909, + "learning_rate": 3.51800831039309e-06, + "loss": 0.7419, + "step": 10672 + }, + { + "epoch": 0.7710730218361118, + "grad_norm": 5.9462524152313385, + "learning_rate": 3.5177411500153285e-06, + "loss": 0.9244, + "step": 10673 + }, + { + "epoch": 0.7711452670363212, + "grad_norm": 7.309651920574001, + "learning_rate": 3.517473975705481e-06, + "loss": 0.8429, + "step": 10674 + }, + { + "epoch": 0.7712175122365308, + "grad_norm": 7.550506665742626, + "learning_rate": 3.517206787467205e-06, + "loss": 0.7823, + "step": 10675 + }, + { + "epoch": 0.7712897574367403, + "grad_norm": 6.105115727757862, + "learning_rate": 3.5169395853041577e-06, + "loss": 0.885, + "step": 10676 + }, + { + "epoch": 0.7713620026369498, + "grad_norm": 6.427144730911997, + "learning_rate": 3.5166723692199967e-06, + "loss": 0.7526, + "step": 10677 + }, + { + "epoch": 0.7714342478371593, + "grad_norm": 7.19284681388989, + "learning_rate": 3.5164051392183808e-06, + "loss": 0.808, + "step": 10678 + }, + { + "epoch": 0.7715064930373688, + "grad_norm": 6.859631648725839, + "learning_rate": 3.5161378953029668e-06, + "loss": 0.8124, + "step": 10679 + }, + { + "epoch": 0.7715787382375784, + "grad_norm": 5.918095089410501, + "learning_rate": 3.5158706374774155e-06, + "loss": 0.8274, + "step": 10680 + }, + { + "epoch": 0.7716509834377878, + "grad_norm": 6.869933602402283, + "learning_rate": 3.5156033657453827e-06, + "loss": 0.8375, + "step": 10681 + }, + { + "epoch": 0.7717232286379974, + "grad_norm": 6.454438969396795, + "learning_rate": 3.51533608011053e-06, + "loss": 0.7488, + "step": 10682 + }, + { + "epoch": 0.7717954738382069, + "grad_norm": 7.355527872505353, + "learning_rate": 3.5150687805765125e-06, + "loss": 0.8136, + "step": 10683 + }, + { + "epoch": 0.7718677190384163, + "grad_norm": 5.8333111717166455, + "learning_rate": 3.5148014671469925e-06, + "loss": 0.7554, + "step": 10684 + }, + { + "epoch": 0.7719399642386259, + "grad_norm": 6.256584361284761, + "learning_rate": 3.5145341398256287e-06, + "loss": 0.8494, + "step": 10685 + }, + { + "epoch": 0.7720122094388354, + "grad_norm": 6.310665657250244, + "learning_rate": 3.51426679861608e-06, + "loss": 0.8092, + "step": 10686 + }, + { + "epoch": 0.772084454639045, + "grad_norm": 6.214581343159968, + "learning_rate": 3.5139994435220064e-06, + "loss": 0.8539, + "step": 10687 + }, + { + "epoch": 0.7721566998392544, + "grad_norm": 7.842974742252916, + "learning_rate": 3.5137320745470677e-06, + "loss": 0.8685, + "step": 10688 + }, + { + "epoch": 0.7722289450394639, + "grad_norm": 6.1772406709734815, + "learning_rate": 3.5134646916949243e-06, + "loss": 0.79, + "step": 10689 + }, + { + "epoch": 0.7723011902396735, + "grad_norm": 7.8637576311856945, + "learning_rate": 3.5131972949692355e-06, + "loss": 0.8096, + "step": 10690 + }, + { + "epoch": 0.772373435439883, + "grad_norm": 6.002363693043304, + "learning_rate": 3.5129298843736638e-06, + "loss": 0.8608, + "step": 10691 + }, + { + "epoch": 0.7724456806400924, + "grad_norm": 6.9108462681916265, + "learning_rate": 3.512662459911867e-06, + "loss": 0.8836, + "step": 10692 + }, + { + "epoch": 0.772517925840302, + "grad_norm": 6.038372992352774, + "learning_rate": 3.512395021587508e-06, + "loss": 0.8082, + "step": 10693 + }, + { + "epoch": 0.7725901710405115, + "grad_norm": 5.520585290866843, + "learning_rate": 3.512127569404247e-06, + "loss": 0.8239, + "step": 10694 + }, + { + "epoch": 0.772662416240721, + "grad_norm": 5.267257301594557, + "learning_rate": 3.511860103365746e-06, + "loss": 0.6621, + "step": 10695 + }, + { + "epoch": 0.7727346614409305, + "grad_norm": 4.896012041874956, + "learning_rate": 3.5115926234756653e-06, + "loss": 0.7684, + "step": 10696 + }, + { + "epoch": 0.77280690664114, + "grad_norm": 5.796199461038126, + "learning_rate": 3.5113251297376667e-06, + "loss": 0.7914, + "step": 10697 + }, + { + "epoch": 0.7728791518413496, + "grad_norm": 6.499596216424718, + "learning_rate": 3.511057622155413e-06, + "loss": 0.8679, + "step": 10698 + }, + { + "epoch": 0.772951397041559, + "grad_norm": 8.258390841266195, + "learning_rate": 3.510790100732565e-06, + "loss": 0.8002, + "step": 10699 + }, + { + "epoch": 0.7730236422417686, + "grad_norm": 6.449840383994313, + "learning_rate": 3.510522565472786e-06, + "loss": 0.7941, + "step": 10700 + }, + { + "epoch": 0.7730958874419781, + "grad_norm": 7.2400360039084495, + "learning_rate": 3.510255016379737e-06, + "loss": 0.9143, + "step": 10701 + }, + { + "epoch": 0.7731681326421875, + "grad_norm": 5.912850057872664, + "learning_rate": 3.5099874534570814e-06, + "loss": 0.8202, + "step": 10702 + }, + { + "epoch": 0.7732403778423971, + "grad_norm": 5.370829361390548, + "learning_rate": 3.5097198767084826e-06, + "loss": 0.8047, + "step": 10703 + }, + { + "epoch": 0.7733126230426066, + "grad_norm": 6.05189010477983, + "learning_rate": 3.5094522861376017e-06, + "loss": 0.7982, + "step": 10704 + }, + { + "epoch": 0.7733848682428162, + "grad_norm": 6.391135346943391, + "learning_rate": 3.5091846817481033e-06, + "loss": 0.7919, + "step": 10705 + }, + { + "epoch": 0.7734571134430256, + "grad_norm": 6.033654558822959, + "learning_rate": 3.5089170635436496e-06, + "loss": 0.7923, + "step": 10706 + }, + { + "epoch": 0.7735293586432351, + "grad_norm": 5.908902334658875, + "learning_rate": 3.5086494315279053e-06, + "loss": 0.82, + "step": 10707 + }, + { + "epoch": 0.7736016038434447, + "grad_norm": 6.812141339021287, + "learning_rate": 3.5083817857045337e-06, + "loss": 0.8198, + "step": 10708 + }, + { + "epoch": 0.7736738490436542, + "grad_norm": 7.039134034736402, + "learning_rate": 3.508114126077198e-06, + "loss": 0.7494, + "step": 10709 + }, + { + "epoch": 0.7737460942438636, + "grad_norm": 5.485244030040213, + "learning_rate": 3.5078464526495625e-06, + "loss": 0.7941, + "step": 10710 + }, + { + "epoch": 0.7738183394440732, + "grad_norm": 5.675056342966967, + "learning_rate": 3.5075787654252915e-06, + "loss": 0.8092, + "step": 10711 + }, + { + "epoch": 0.7738905846442827, + "grad_norm": 7.134118920441129, + "learning_rate": 3.50731106440805e-06, + "loss": 0.8541, + "step": 10712 + }, + { + "epoch": 0.7739628298444922, + "grad_norm": 6.7151033663090285, + "learning_rate": 3.5070433496015015e-06, + "loss": 0.9155, + "step": 10713 + }, + { + "epoch": 0.7740350750447017, + "grad_norm": 7.909955871304613, + "learning_rate": 3.5067756210093123e-06, + "loss": 0.8793, + "step": 10714 + }, + { + "epoch": 0.7741073202449112, + "grad_norm": 6.360802792439539, + "learning_rate": 3.5065078786351458e-06, + "loss": 0.8361, + "step": 10715 + }, + { + "epoch": 0.7741795654451208, + "grad_norm": 7.38176843435947, + "learning_rate": 3.5062401224826682e-06, + "loss": 0.8488, + "step": 10716 + }, + { + "epoch": 0.7742518106453302, + "grad_norm": 6.340419552756779, + "learning_rate": 3.505972352555545e-06, + "loss": 0.8348, + "step": 10717 + }, + { + "epoch": 0.7743240558455398, + "grad_norm": 8.009572024681137, + "learning_rate": 3.505704568857441e-06, + "loss": 0.9695, + "step": 10718 + }, + { + "epoch": 0.7743963010457493, + "grad_norm": 5.870157396848021, + "learning_rate": 3.505436771392022e-06, + "loss": 0.9116, + "step": 10719 + }, + { + "epoch": 0.7744685462459587, + "grad_norm": 6.599077576420749, + "learning_rate": 3.505168960162955e-06, + "loss": 0.8197, + "step": 10720 + }, + { + "epoch": 0.7745407914461683, + "grad_norm": 6.955814964215642, + "learning_rate": 3.504901135173905e-06, + "loss": 0.8349, + "step": 10721 + }, + { + "epoch": 0.7746130366463778, + "grad_norm": 6.368352079080192, + "learning_rate": 3.5046332964285385e-06, + "loss": 0.7805, + "step": 10722 + }, + { + "epoch": 0.7746852818465874, + "grad_norm": 7.132955560158311, + "learning_rate": 3.504365443930523e-06, + "loss": 0.8641, + "step": 10723 + }, + { + "epoch": 0.7747575270467968, + "grad_norm": 6.578950445074065, + "learning_rate": 3.504097577683524e-06, + "loss": 0.8808, + "step": 10724 + }, + { + "epoch": 0.7748297722470063, + "grad_norm": 7.394785832830301, + "learning_rate": 3.503829697691208e-06, + "loss": 0.8171, + "step": 10725 + }, + { + "epoch": 0.7749020174472159, + "grad_norm": 5.513068884734892, + "learning_rate": 3.503561803957244e-06, + "loss": 0.8009, + "step": 10726 + }, + { + "epoch": 0.7749742626474254, + "grad_norm": 7.3606329164788775, + "learning_rate": 3.5032938964852977e-06, + "loss": 0.8237, + "step": 10727 + }, + { + "epoch": 0.7750465078476348, + "grad_norm": 6.574916754946229, + "learning_rate": 3.5030259752790373e-06, + "loss": 0.882, + "step": 10728 + }, + { + "epoch": 0.7751187530478444, + "grad_norm": 7.3587363709105995, + "learning_rate": 3.5027580403421297e-06, + "loss": 0.7892, + "step": 10729 + }, + { + "epoch": 0.7751909982480539, + "grad_norm": 5.751569119185101, + "learning_rate": 3.5024900916782427e-06, + "loss": 0.7979, + "step": 10730 + }, + { + "epoch": 0.7752632434482634, + "grad_norm": 6.314394827000447, + "learning_rate": 3.5022221292910454e-06, + "loss": 0.7975, + "step": 10731 + }, + { + "epoch": 0.7753354886484729, + "grad_norm": 7.111092630335847, + "learning_rate": 3.501954153184205e-06, + "loss": 0.7689, + "step": 10732 + }, + { + "epoch": 0.7754077338486824, + "grad_norm": 5.866254750153727, + "learning_rate": 3.5016861633613906e-06, + "loss": 0.8651, + "step": 10733 + }, + { + "epoch": 0.775479979048892, + "grad_norm": 5.993592655793684, + "learning_rate": 3.5014181598262696e-06, + "loss": 0.826, + "step": 10734 + }, + { + "epoch": 0.7755522242491014, + "grad_norm": 6.239717799432338, + "learning_rate": 3.5011501425825124e-06, + "loss": 0.7401, + "step": 10735 + }, + { + "epoch": 0.775624469449311, + "grad_norm": 5.520351902345757, + "learning_rate": 3.5008821116337867e-06, + "loss": 0.7204, + "step": 10736 + }, + { + "epoch": 0.7756967146495205, + "grad_norm": 6.93324465450405, + "learning_rate": 3.5006140669837623e-06, + "loss": 0.8439, + "step": 10737 + }, + { + "epoch": 0.7757689598497299, + "grad_norm": 5.742861753568988, + "learning_rate": 3.500346008636108e-06, + "loss": 0.8466, + "step": 10738 + }, + { + "epoch": 0.7758412050499395, + "grad_norm": 5.6687699322839, + "learning_rate": 3.5000779365944932e-06, + "loss": 0.8153, + "step": 10739 + }, + { + "epoch": 0.775913450250149, + "grad_norm": 6.831064390294217, + "learning_rate": 3.499809850862588e-06, + "loss": 0.8649, + "step": 10740 + }, + { + "epoch": 0.7759856954503586, + "grad_norm": 7.0480328135251575, + "learning_rate": 3.4995417514440634e-06, + "loss": 0.92, + "step": 10741 + }, + { + "epoch": 0.776057940650568, + "grad_norm": 6.221424543627511, + "learning_rate": 3.4992736383425875e-06, + "loss": 0.827, + "step": 10742 + }, + { + "epoch": 0.7761301858507775, + "grad_norm": 6.5107186778908455, + "learning_rate": 3.4990055115618315e-06, + "loss": 0.7781, + "step": 10743 + }, + { + "epoch": 0.7762024310509871, + "grad_norm": 7.661703355209118, + "learning_rate": 3.498737371105465e-06, + "loss": 0.744, + "step": 10744 + }, + { + "epoch": 0.7762746762511966, + "grad_norm": 5.063934311366891, + "learning_rate": 3.49846921697716e-06, + "loss": 0.826, + "step": 10745 + }, + { + "epoch": 0.776346921451406, + "grad_norm": 9.164579489093542, + "learning_rate": 3.4982010491805867e-06, + "loss": 0.9183, + "step": 10746 + }, + { + "epoch": 0.7764191666516156, + "grad_norm": 6.274496974768332, + "learning_rate": 3.497932867719416e-06, + "loss": 0.8265, + "step": 10747 + }, + { + "epoch": 0.7764914118518251, + "grad_norm": 5.328129474716321, + "learning_rate": 3.4976646725973197e-06, + "loss": 0.7872, + "step": 10748 + }, + { + "epoch": 0.7765636570520346, + "grad_norm": 6.025377962912141, + "learning_rate": 3.497396463817968e-06, + "loss": 0.8283, + "step": 10749 + }, + { + "epoch": 0.7766359022522441, + "grad_norm": 7.837093857458594, + "learning_rate": 3.4971282413850334e-06, + "loss": 0.8334, + "step": 10750 + }, + { + "epoch": 0.7767081474524536, + "grad_norm": 5.874074660130987, + "learning_rate": 3.4968600053021875e-06, + "loss": 0.827, + "step": 10751 + }, + { + "epoch": 0.7767803926526632, + "grad_norm": 7.28049789277151, + "learning_rate": 3.4965917555731016e-06, + "loss": 0.8141, + "step": 10752 + }, + { + "epoch": 0.7768526378528726, + "grad_norm": 7.610131447417492, + "learning_rate": 3.496323492201449e-06, + "loss": 0.7595, + "step": 10753 + }, + { + "epoch": 0.7769248830530822, + "grad_norm": 7.096131046998895, + "learning_rate": 3.4960552151909006e-06, + "loss": 0.8712, + "step": 10754 + }, + { + "epoch": 0.7769971282532917, + "grad_norm": 7.568057957707558, + "learning_rate": 3.4957869245451306e-06, + "loss": 0.8321, + "step": 10755 + }, + { + "epoch": 0.7770693734535011, + "grad_norm": 7.574572298672216, + "learning_rate": 3.4955186202678102e-06, + "loss": 0.8597, + "step": 10756 + }, + { + "epoch": 0.7771416186537107, + "grad_norm": 6.166614772818848, + "learning_rate": 3.4952503023626133e-06, + "loss": 0.8095, + "step": 10757 + }, + { + "epoch": 0.7772138638539202, + "grad_norm": 9.39192238983893, + "learning_rate": 3.4949819708332124e-06, + "loss": 0.8174, + "step": 10758 + }, + { + "epoch": 0.7772861090541298, + "grad_norm": 5.856974117493459, + "learning_rate": 3.4947136256832803e-06, + "loss": 0.7703, + "step": 10759 + }, + { + "epoch": 0.7773583542543392, + "grad_norm": 7.386626825512103, + "learning_rate": 3.494445266916492e-06, + "loss": 0.8338, + "step": 10760 + }, + { + "epoch": 0.7774305994545487, + "grad_norm": 5.6229303579127246, + "learning_rate": 3.494176894536519e-06, + "loss": 0.822, + "step": 10761 + }, + { + "epoch": 0.7775028446547583, + "grad_norm": 5.279762555187511, + "learning_rate": 3.493908508547037e-06, + "loss": 0.8991, + "step": 10762 + }, + { + "epoch": 0.7775750898549678, + "grad_norm": 5.269609070380712, + "learning_rate": 3.493640108951719e-06, + "loss": 0.7853, + "step": 10763 + }, + { + "epoch": 0.7776473350551772, + "grad_norm": 5.860188420101507, + "learning_rate": 3.4933716957542394e-06, + "loss": 0.7821, + "step": 10764 + }, + { + "epoch": 0.7777195802553868, + "grad_norm": 5.932871409805138, + "learning_rate": 3.493103268958273e-06, + "loss": 0.7959, + "step": 10765 + }, + { + "epoch": 0.7777918254555963, + "grad_norm": 6.288011849350964, + "learning_rate": 3.4928348285674934e-06, + "loss": 0.8926, + "step": 10766 + }, + { + "epoch": 0.7778640706558058, + "grad_norm": 6.358876801642472, + "learning_rate": 3.4925663745855763e-06, + "loss": 0.8274, + "step": 10767 + }, + { + "epoch": 0.7779363158560153, + "grad_norm": 6.448111075663306, + "learning_rate": 3.4922979070161954e-06, + "loss": 0.763, + "step": 10768 + }, + { + "epoch": 0.7780085610562248, + "grad_norm": 5.254780182600986, + "learning_rate": 3.4920294258630276e-06, + "loss": 0.8259, + "step": 10769 + }, + { + "epoch": 0.7780808062564344, + "grad_norm": 6.793587005540996, + "learning_rate": 3.491760931129747e-06, + "loss": 0.8494, + "step": 10770 + }, + { + "epoch": 0.7781530514566438, + "grad_norm": 5.067716948085989, + "learning_rate": 3.4914924228200293e-06, + "loss": 0.8714, + "step": 10771 + }, + { + "epoch": 0.7782252966568534, + "grad_norm": 6.183252591465376, + "learning_rate": 3.49122390093755e-06, + "loss": 0.7658, + "step": 10772 + }, + { + "epoch": 0.7782975418570629, + "grad_norm": 5.828265068273293, + "learning_rate": 3.490955365485985e-06, + "loss": 0.8373, + "step": 10773 + }, + { + "epoch": 0.7783697870572723, + "grad_norm": 4.823560019550605, + "learning_rate": 3.490686816469011e-06, + "loss": 0.7522, + "step": 10774 + }, + { + "epoch": 0.7784420322574819, + "grad_norm": 5.754058400801471, + "learning_rate": 3.490418253890303e-06, + "loss": 0.7513, + "step": 10775 + }, + { + "epoch": 0.7785142774576914, + "grad_norm": 7.053708706670179, + "learning_rate": 3.4901496777535387e-06, + "loss": 0.924, + "step": 10776 + }, + { + "epoch": 0.778586522657901, + "grad_norm": 5.800911785673526, + "learning_rate": 3.489881088062394e-06, + "loss": 0.879, + "step": 10777 + }, + { + "epoch": 0.7786587678581104, + "grad_norm": 8.230773717356596, + "learning_rate": 3.4896124848205454e-06, + "loss": 0.8347, + "step": 10778 + }, + { + "epoch": 0.7787310130583199, + "grad_norm": 7.823170691241923, + "learning_rate": 3.4893438680316707e-06, + "loss": 0.8933, + "step": 10779 + }, + { + "epoch": 0.7788032582585295, + "grad_norm": 5.569121973220941, + "learning_rate": 3.4890752376994464e-06, + "loss": 0.7761, + "step": 10780 + }, + { + "epoch": 0.7788755034587389, + "grad_norm": 7.872094723495987, + "learning_rate": 3.48880659382755e-06, + "loss": 0.8722, + "step": 10781 + }, + { + "epoch": 0.7789477486589484, + "grad_norm": 6.640452088741481, + "learning_rate": 3.488537936419659e-06, + "loss": 0.7815, + "step": 10782 + }, + { + "epoch": 0.779019993859158, + "grad_norm": 6.161800457787626, + "learning_rate": 3.4882692654794515e-06, + "loss": 0.8782, + "step": 10783 + }, + { + "epoch": 0.7790922390593675, + "grad_norm": 5.778921947326423, + "learning_rate": 3.488000581010605e-06, + "loss": 0.8721, + "step": 10784 + }, + { + "epoch": 0.779164484259577, + "grad_norm": 5.796168857555219, + "learning_rate": 3.487731883016798e-06, + "loss": 0.8364, + "step": 10785 + }, + { + "epoch": 0.7792367294597865, + "grad_norm": 6.182134903224769, + "learning_rate": 3.487463171501708e-06, + "loss": 0.8099, + "step": 10786 + }, + { + "epoch": 0.779308974659996, + "grad_norm": 7.4905055667170775, + "learning_rate": 3.487194446469014e-06, + "loss": 0.8094, + "step": 10787 + }, + { + "epoch": 0.7793812198602056, + "grad_norm": 7.488279212310625, + "learning_rate": 3.486925707922394e-06, + "loss": 0.8606, + "step": 10788 + }, + { + "epoch": 0.779453465060415, + "grad_norm": 6.307854784506526, + "learning_rate": 3.486656955865528e-06, + "loss": 0.8116, + "step": 10789 + }, + { + "epoch": 0.7795257102606246, + "grad_norm": 5.636201175182961, + "learning_rate": 3.486388190302095e-06, + "loss": 0.8255, + "step": 10790 + }, + { + "epoch": 0.7795979554608341, + "grad_norm": 7.147666881276933, + "learning_rate": 3.4861194112357726e-06, + "loss": 0.8888, + "step": 10791 + }, + { + "epoch": 0.7796702006610435, + "grad_norm": 9.05478355481838, + "learning_rate": 3.4858506186702413e-06, + "loss": 0.8611, + "step": 10792 + }, + { + "epoch": 0.7797424458612531, + "grad_norm": 7.812755611052807, + "learning_rate": 3.4855818126091804e-06, + "loss": 0.8666, + "step": 10793 + }, + { + "epoch": 0.7798146910614626, + "grad_norm": 7.08285694390286, + "learning_rate": 3.48531299305627e-06, + "loss": 0.8833, + "step": 10794 + }, + { + "epoch": 0.7798869362616722, + "grad_norm": 5.983306869122449, + "learning_rate": 3.4850441600151896e-06, + "loss": 0.8114, + "step": 10795 + }, + { + "epoch": 0.7799591814618816, + "grad_norm": 6.980645261303582, + "learning_rate": 3.4847753134896196e-06, + "loss": 0.88, + "step": 10796 + }, + { + "epoch": 0.7800314266620911, + "grad_norm": 6.9798290640550595, + "learning_rate": 3.48450645348324e-06, + "loss": 0.8571, + "step": 10797 + }, + { + "epoch": 0.7801036718623007, + "grad_norm": 5.682157921083919, + "learning_rate": 3.484237579999732e-06, + "loss": 0.7503, + "step": 10798 + }, + { + "epoch": 0.7801759170625101, + "grad_norm": 6.419784316958751, + "learning_rate": 3.483968693042776e-06, + "loss": 0.7537, + "step": 10799 + }, + { + "epoch": 0.7802481622627196, + "grad_norm": 6.763106725414416, + "learning_rate": 3.4836997926160516e-06, + "loss": 0.8262, + "step": 10800 + }, + { + "epoch": 0.7803204074629292, + "grad_norm": 6.001416674895427, + "learning_rate": 3.4834308787232407e-06, + "loss": 0.8825, + "step": 10801 + }, + { + "epoch": 0.7803926526631387, + "grad_norm": 5.852827722237466, + "learning_rate": 3.483161951368025e-06, + "loss": 0.8146, + "step": 10802 + }, + { + "epoch": 0.7804648978633482, + "grad_norm": 6.83021551817058, + "learning_rate": 3.4828930105540857e-06, + "loss": 0.9095, + "step": 10803 + }, + { + "epoch": 0.7805371430635577, + "grad_norm": 6.06524658909458, + "learning_rate": 3.4826240562851044e-06, + "loss": 0.8603, + "step": 10804 + }, + { + "epoch": 0.7806093882637672, + "grad_norm": 6.8946272457021, + "learning_rate": 3.4823550885647626e-06, + "loss": 0.7434, + "step": 10805 + }, + { + "epoch": 0.7806816334639768, + "grad_norm": 7.187324322751142, + "learning_rate": 3.482086107396742e-06, + "loss": 0.9029, + "step": 10806 + }, + { + "epoch": 0.7807538786641862, + "grad_norm": 6.393371439565968, + "learning_rate": 3.481817112784726e-06, + "loss": 0.8569, + "step": 10807 + }, + { + "epoch": 0.7808261238643958, + "grad_norm": 6.138531743051364, + "learning_rate": 3.4815481047323964e-06, + "loss": 0.7915, + "step": 10808 + }, + { + "epoch": 0.7808983690646053, + "grad_norm": 7.183879703474588, + "learning_rate": 3.481279083243434e-06, + "loss": 0.7465, + "step": 10809 + }, + { + "epoch": 0.7809706142648147, + "grad_norm": 6.5232304888561305, + "learning_rate": 3.481010048321523e-06, + "loss": 0.8465, + "step": 10810 + }, + { + "epoch": 0.7810428594650243, + "grad_norm": 6.374658014436962, + "learning_rate": 3.4807409999703467e-06, + "loss": 0.8044, + "step": 10811 + }, + { + "epoch": 0.7811151046652338, + "grad_norm": 7.971864577839497, + "learning_rate": 3.4804719381935877e-06, + "loss": 0.8386, + "step": 10812 + }, + { + "epoch": 0.7811873498654434, + "grad_norm": 7.126556494170211, + "learning_rate": 3.4802028629949284e-06, + "loss": 0.8035, + "step": 10813 + }, + { + "epoch": 0.7812595950656528, + "grad_norm": 7.166292698292309, + "learning_rate": 3.4799337743780535e-06, + "loss": 0.844, + "step": 10814 + }, + { + "epoch": 0.7813318402658623, + "grad_norm": 7.7883768191172225, + "learning_rate": 3.4796646723466466e-06, + "loss": 0.8647, + "step": 10815 + }, + { + "epoch": 0.7814040854660719, + "grad_norm": 6.461658663557382, + "learning_rate": 3.4793955569043903e-06, + "loss": 0.8369, + "step": 10816 + }, + { + "epoch": 0.7814763306662813, + "grad_norm": 7.5561610365711305, + "learning_rate": 3.4791264280549695e-06, + "loss": 0.9064, + "step": 10817 + }, + { + "epoch": 0.7815485758664908, + "grad_norm": 6.548652207412039, + "learning_rate": 3.478857285802068e-06, + "loss": 0.8226, + "step": 10818 + }, + { + "epoch": 0.7816208210667004, + "grad_norm": 6.086189783863553, + "learning_rate": 3.478588130149371e-06, + "loss": 0.7664, + "step": 10819 + }, + { + "epoch": 0.7816930662669099, + "grad_norm": 7.688924765947219, + "learning_rate": 3.478318961100561e-06, + "loss": 0.7886, + "step": 10820 + }, + { + "epoch": 0.7817653114671194, + "grad_norm": 6.439156032254138, + "learning_rate": 3.478049778659325e-06, + "loss": 0.7908, + "step": 10821 + }, + { + "epoch": 0.7818375566673289, + "grad_norm": 7.561586435167154, + "learning_rate": 3.4777805828293465e-06, + "loss": 0.8983, + "step": 10822 + }, + { + "epoch": 0.7819098018675384, + "grad_norm": 6.601640220049158, + "learning_rate": 3.4775113736143113e-06, + "loss": 0.831, + "step": 10823 + }, + { + "epoch": 0.781982047067748, + "grad_norm": 4.7712071479150255, + "learning_rate": 3.477242151017904e-06, + "loss": 0.766, + "step": 10824 + }, + { + "epoch": 0.7820542922679574, + "grad_norm": 5.495733166587519, + "learning_rate": 3.4769729150438107e-06, + "loss": 0.803, + "step": 10825 + }, + { + "epoch": 0.782126537468167, + "grad_norm": 5.747199662016113, + "learning_rate": 3.476703665695717e-06, + "loss": 0.8342, + "step": 10826 + }, + { + "epoch": 0.7821987826683765, + "grad_norm": 5.733751642506782, + "learning_rate": 3.4764344029773082e-06, + "loss": 0.8246, + "step": 10827 + }, + { + "epoch": 0.7822710278685859, + "grad_norm": 5.925455827522134, + "learning_rate": 3.4761651268922715e-06, + "loss": 0.8875, + "step": 10828 + }, + { + "epoch": 0.7823432730687955, + "grad_norm": 6.038467752994553, + "learning_rate": 3.475895837444291e-06, + "loss": 0.732, + "step": 10829 + }, + { + "epoch": 0.782415518269005, + "grad_norm": 8.566878709754022, + "learning_rate": 3.4756265346370545e-06, + "loss": 0.8676, + "step": 10830 + }, + { + "epoch": 0.7824877634692146, + "grad_norm": 5.133097694405165, + "learning_rate": 3.475357218474248e-06, + "loss": 0.7598, + "step": 10831 + }, + { + "epoch": 0.782560008669424, + "grad_norm": 7.4592275527425596, + "learning_rate": 3.4750878889595584e-06, + "loss": 0.7929, + "step": 10832 + }, + { + "epoch": 0.7826322538696335, + "grad_norm": 6.56947406585589, + "learning_rate": 3.474818546096674e-06, + "loss": 0.8469, + "step": 10833 + }, + { + "epoch": 0.7827044990698431, + "grad_norm": 6.392968678395759, + "learning_rate": 3.4745491898892787e-06, + "loss": 0.8819, + "step": 10834 + }, + { + "epoch": 0.7827767442700525, + "grad_norm": 5.5052281586558625, + "learning_rate": 3.4742798203410633e-06, + "loss": 0.8737, + "step": 10835 + }, + { + "epoch": 0.782848989470262, + "grad_norm": 6.1475790701765085, + "learning_rate": 3.474010437455713e-06, + "loss": 0.7917, + "step": 10836 + }, + { + "epoch": 0.7829212346704716, + "grad_norm": 6.079957974941174, + "learning_rate": 3.473741041236916e-06, + "loss": 0.7464, + "step": 10837 + }, + { + "epoch": 0.7829934798706811, + "grad_norm": 7.242667074281197, + "learning_rate": 3.47347163168836e-06, + "loss": 0.9053, + "step": 10838 + }, + { + "epoch": 0.7830657250708906, + "grad_norm": 6.057129673802097, + "learning_rate": 3.4732022088137336e-06, + "loss": 0.7316, + "step": 10839 + }, + { + "epoch": 0.7831379702711001, + "grad_norm": 5.896699480073644, + "learning_rate": 3.472932772616725e-06, + "loss": 0.7898, + "step": 10840 + }, + { + "epoch": 0.7832102154713096, + "grad_norm": 6.212725457565623, + "learning_rate": 3.472663323101021e-06, + "loss": 0.8257, + "step": 10841 + }, + { + "epoch": 0.7832824606715192, + "grad_norm": 7.598723866586729, + "learning_rate": 3.472393860270313e-06, + "loss": 0.8208, + "step": 10842 + }, + { + "epoch": 0.7833547058717286, + "grad_norm": 5.596275569268581, + "learning_rate": 3.4721243841282866e-06, + "loss": 0.7521, + "step": 10843 + }, + { + "epoch": 0.7834269510719382, + "grad_norm": 5.504378569935221, + "learning_rate": 3.4718548946786324e-06, + "loss": 0.9345, + "step": 10844 + }, + { + "epoch": 0.7834991962721477, + "grad_norm": 10.030492070641362, + "learning_rate": 3.471585391925039e-06, + "loss": 0.9532, + "step": 10845 + }, + { + "epoch": 0.7835714414723571, + "grad_norm": 6.067831939180474, + "learning_rate": 3.4713158758711973e-06, + "loss": 0.7707, + "step": 10846 + }, + { + "epoch": 0.7836436866725667, + "grad_norm": 6.654371972719766, + "learning_rate": 3.4710463465207945e-06, + "loss": 0.7826, + "step": 10847 + }, + { + "epoch": 0.7837159318727762, + "grad_norm": 6.354843587203565, + "learning_rate": 3.470776803877521e-06, + "loss": 0.7858, + "step": 10848 + }, + { + "epoch": 0.7837881770729858, + "grad_norm": 6.955394314544703, + "learning_rate": 3.4705072479450675e-06, + "loss": 0.8171, + "step": 10849 + }, + { + "epoch": 0.7838604222731952, + "grad_norm": 5.672886445628892, + "learning_rate": 3.470237678727123e-06, + "loss": 0.8019, + "step": 10850 + }, + { + "epoch": 0.7839326674734047, + "grad_norm": 6.5783639171280175, + "learning_rate": 3.4699680962273785e-06, + "loss": 0.8594, + "step": 10851 + }, + { + "epoch": 0.7840049126736143, + "grad_norm": 6.135941366957497, + "learning_rate": 3.469698500449523e-06, + "loss": 0.8289, + "step": 10852 + }, + { + "epoch": 0.7840771578738237, + "grad_norm": 7.899016799586293, + "learning_rate": 3.4694288913972484e-06, + "loss": 0.8463, + "step": 10853 + }, + { + "epoch": 0.7841494030740332, + "grad_norm": 6.822275969341571, + "learning_rate": 3.4691592690742447e-06, + "loss": 0.871, + "step": 10854 + }, + { + "epoch": 0.7842216482742428, + "grad_norm": 6.730399035658972, + "learning_rate": 3.4688896334842038e-06, + "loss": 0.8032, + "step": 10855 + }, + { + "epoch": 0.7842938934744523, + "grad_norm": 5.917063346065971, + "learning_rate": 3.4686199846308157e-06, + "loss": 0.8344, + "step": 10856 + }, + { + "epoch": 0.7843661386746618, + "grad_norm": 7.449510381598049, + "learning_rate": 3.468350322517772e-06, + "loss": 0.8456, + "step": 10857 + }, + { + "epoch": 0.7844383838748713, + "grad_norm": 7.612386557950536, + "learning_rate": 3.4680806471487636e-06, + "loss": 0.8168, + "step": 10858 + }, + { + "epoch": 0.7845106290750808, + "grad_norm": 6.778018304416842, + "learning_rate": 3.4678109585274836e-06, + "loss": 0.8801, + "step": 10859 + }, + { + "epoch": 0.7845828742752904, + "grad_norm": 5.262591566435418, + "learning_rate": 3.4675412566576237e-06, + "loss": 0.8597, + "step": 10860 + }, + { + "epoch": 0.7846551194754998, + "grad_norm": 4.843545232566934, + "learning_rate": 3.4672715415428742e-06, + "loss": 0.8101, + "step": 10861 + }, + { + "epoch": 0.7847273646757094, + "grad_norm": 6.255557526670378, + "learning_rate": 3.467001813186929e-06, + "loss": 0.7853, + "step": 10862 + }, + { + "epoch": 0.7847996098759189, + "grad_norm": 5.171790730230662, + "learning_rate": 3.466732071593479e-06, + "loss": 0.8247, + "step": 10863 + }, + { + "epoch": 0.7848718550761283, + "grad_norm": 6.749917489006695, + "learning_rate": 3.4664623167662182e-06, + "loss": 0.8203, + "step": 10864 + }, + { + "epoch": 0.7849441002763379, + "grad_norm": 4.958629161226854, + "learning_rate": 3.4661925487088387e-06, + "loss": 0.8384, + "step": 10865 + }, + { + "epoch": 0.7850163454765474, + "grad_norm": 5.53873760093725, + "learning_rate": 3.465922767425033e-06, + "loss": 0.82, + "step": 10866 + }, + { + "epoch": 0.785088590676757, + "grad_norm": 5.264741999000525, + "learning_rate": 3.4656529729184944e-06, + "loss": 0.7563, + "step": 10867 + }, + { + "epoch": 0.7851608358769664, + "grad_norm": 5.344786604786762, + "learning_rate": 3.465383165192917e-06, + "loss": 0.8251, + "step": 10868 + }, + { + "epoch": 0.7852330810771759, + "grad_norm": 5.331583371757399, + "learning_rate": 3.4651133442519934e-06, + "loss": 0.8202, + "step": 10869 + }, + { + "epoch": 0.7853053262773855, + "grad_norm": 5.175992404301302, + "learning_rate": 3.464843510099418e-06, + "loss": 0.7863, + "step": 10870 + }, + { + "epoch": 0.7853775714775949, + "grad_norm": 6.811140852270833, + "learning_rate": 3.4645736627388836e-06, + "loss": 0.852, + "step": 10871 + }, + { + "epoch": 0.7854498166778044, + "grad_norm": 6.076594692075308, + "learning_rate": 3.4643038021740844e-06, + "loss": 0.842, + "step": 10872 + }, + { + "epoch": 0.785522061878014, + "grad_norm": 6.736520943076174, + "learning_rate": 3.4640339284087155e-06, + "loss": 0.8668, + "step": 10873 + }, + { + "epoch": 0.7855943070782235, + "grad_norm": 6.954098751162365, + "learning_rate": 3.4637640414464703e-06, + "loss": 0.7651, + "step": 10874 + }, + { + "epoch": 0.785666552278433, + "grad_norm": 5.798988635577469, + "learning_rate": 3.4634941412910437e-06, + "loss": 0.822, + "step": 10875 + }, + { + "epoch": 0.7857387974786425, + "grad_norm": 7.29907951299755, + "learning_rate": 3.4632242279461303e-06, + "loss": 0.8651, + "step": 10876 + }, + { + "epoch": 0.785811042678852, + "grad_norm": 6.634000027122647, + "learning_rate": 3.462954301415425e-06, + "loss": 0.8315, + "step": 10877 + }, + { + "epoch": 0.7858832878790616, + "grad_norm": 6.2470244381669175, + "learning_rate": 3.4626843617026234e-06, + "loss": 0.8274, + "step": 10878 + }, + { + "epoch": 0.785955533079271, + "grad_norm": 6.837798295845525, + "learning_rate": 3.46241440881142e-06, + "loss": 0.7843, + "step": 10879 + }, + { + "epoch": 0.7860277782794806, + "grad_norm": 5.410330054180245, + "learning_rate": 3.4621444427455113e-06, + "loss": 0.7867, + "step": 10880 + }, + { + "epoch": 0.7861000234796901, + "grad_norm": 8.854850138295943, + "learning_rate": 3.4618744635085917e-06, + "loss": 0.9524, + "step": 10881 + }, + { + "epoch": 0.7861722686798995, + "grad_norm": 5.999468144050321, + "learning_rate": 3.4616044711043573e-06, + "loss": 0.8041, + "step": 10882 + }, + { + "epoch": 0.7862445138801091, + "grad_norm": 6.894142273906965, + "learning_rate": 3.461334465536505e-06, + "loss": 0.8613, + "step": 10883 + }, + { + "epoch": 0.7863167590803186, + "grad_norm": 7.452313712887627, + "learning_rate": 3.4610644468087295e-06, + "loss": 0.8075, + "step": 10884 + }, + { + "epoch": 0.7863890042805282, + "grad_norm": 6.144103484593544, + "learning_rate": 3.460794414924729e-06, + "loss": 0.8724, + "step": 10885 + }, + { + "epoch": 0.7864612494807376, + "grad_norm": 5.832977647382449, + "learning_rate": 3.460524369888198e-06, + "loss": 0.8495, + "step": 10886 + }, + { + "epoch": 0.7865334946809471, + "grad_norm": 5.018841529097074, + "learning_rate": 3.460254311702834e-06, + "loss": 0.8431, + "step": 10887 + }, + { + "epoch": 0.7866057398811567, + "grad_norm": 7.1368374117803395, + "learning_rate": 3.459984240372335e-06, + "loss": 0.9079, + "step": 10888 + }, + { + "epoch": 0.7866779850813661, + "grad_norm": 5.911969034818463, + "learning_rate": 3.4597141559003968e-06, + "loss": 0.8568, + "step": 10889 + }, + { + "epoch": 0.7867502302815756, + "grad_norm": 6.120303163187766, + "learning_rate": 3.459444058290717e-06, + "loss": 0.9451, + "step": 10890 + }, + { + "epoch": 0.7868224754817852, + "grad_norm": 5.359326420777918, + "learning_rate": 3.4591739475469926e-06, + "loss": 0.7852, + "step": 10891 + }, + { + "epoch": 0.7868947206819947, + "grad_norm": 7.790510557629401, + "learning_rate": 3.4589038236729224e-06, + "loss": 0.7859, + "step": 10892 + }, + { + "epoch": 0.7869669658822042, + "grad_norm": 6.247214345030423, + "learning_rate": 3.4586336866722026e-06, + "loss": 0.845, + "step": 10893 + }, + { + "epoch": 0.7870392110824137, + "grad_norm": 7.40476775326116, + "learning_rate": 3.4583635365485323e-06, + "loss": 0.9222, + "step": 10894 + }, + { + "epoch": 0.7871114562826232, + "grad_norm": 6.049904234577486, + "learning_rate": 3.4580933733056095e-06, + "loss": 0.8278, + "step": 10895 + }, + { + "epoch": 0.7871837014828328, + "grad_norm": 7.118010187598899, + "learning_rate": 3.4578231969471315e-06, + "loss": 0.8716, + "step": 10896 + }, + { + "epoch": 0.7872559466830422, + "grad_norm": 4.93029491231136, + "learning_rate": 3.4575530074767983e-06, + "loss": 0.7258, + "step": 10897 + }, + { + "epoch": 0.7873281918832518, + "grad_norm": 5.085627158170263, + "learning_rate": 3.4572828048983083e-06, + "loss": 0.7785, + "step": 10898 + }, + { + "epoch": 0.7874004370834613, + "grad_norm": 5.884160144548196, + "learning_rate": 3.4570125892153593e-06, + "loss": 0.7981, + "step": 10899 + }, + { + "epoch": 0.7874726822836707, + "grad_norm": 5.563614122920878, + "learning_rate": 3.456742360431651e-06, + "loss": 0.7205, + "step": 10900 + }, + { + "epoch": 0.7875449274838803, + "grad_norm": 5.908563070130583, + "learning_rate": 3.4564721185508833e-06, + "loss": 0.7953, + "step": 10901 + }, + { + "epoch": 0.7876171726840898, + "grad_norm": 6.533974695189967, + "learning_rate": 3.456201863576754e-06, + "loss": 0.879, + "step": 10902 + }, + { + "epoch": 0.7876894178842994, + "grad_norm": 5.764550747335261, + "learning_rate": 3.4559315955129646e-06, + "loss": 0.8742, + "step": 10903 + }, + { + "epoch": 0.7877616630845088, + "grad_norm": 7.437706183132106, + "learning_rate": 3.4556613143632135e-06, + "loss": 0.9027, + "step": 10904 + }, + { + "epoch": 0.7878339082847183, + "grad_norm": 6.770653950197874, + "learning_rate": 3.4553910201312008e-06, + "loss": 0.814, + "step": 10905 + }, + { + "epoch": 0.7879061534849279, + "grad_norm": 6.083347930193942, + "learning_rate": 3.455120712820627e-06, + "loss": 0.7928, + "step": 10906 + }, + { + "epoch": 0.7879783986851373, + "grad_norm": 6.158474128294053, + "learning_rate": 3.454850392435192e-06, + "loss": 0.8639, + "step": 10907 + }, + { + "epoch": 0.7880506438853468, + "grad_norm": 5.836305496889907, + "learning_rate": 3.4545800589785977e-06, + "loss": 0.8495, + "step": 10908 + }, + { + "epoch": 0.7881228890855564, + "grad_norm": 5.299089447113142, + "learning_rate": 3.4543097124545422e-06, + "loss": 0.8523, + "step": 10909 + }, + { + "epoch": 0.7881951342857659, + "grad_norm": 5.536992944102161, + "learning_rate": 3.454039352866728e-06, + "loss": 0.8749, + "step": 10910 + }, + { + "epoch": 0.7882673794859754, + "grad_norm": 5.3316483021251235, + "learning_rate": 3.4537689802188555e-06, + "loss": 0.828, + "step": 10911 + }, + { + "epoch": 0.7883396246861849, + "grad_norm": 7.248101873865031, + "learning_rate": 3.453498594514628e-06, + "loss": 0.8331, + "step": 10912 + }, + { + "epoch": 0.7884118698863944, + "grad_norm": 5.7776332780097786, + "learning_rate": 3.453228195757743e-06, + "loss": 0.828, + "step": 10913 + }, + { + "epoch": 0.788484115086604, + "grad_norm": 6.297809086658009, + "learning_rate": 3.452957783951906e-06, + "loss": 0.709, + "step": 10914 + }, + { + "epoch": 0.7885563602868134, + "grad_norm": 5.676608209822329, + "learning_rate": 3.452687359100815e-06, + "loss": 0.8414, + "step": 10915 + }, + { + "epoch": 0.788628605487023, + "grad_norm": 7.513596734905021, + "learning_rate": 3.452416921208175e-06, + "loss": 0.8935, + "step": 10916 + }, + { + "epoch": 0.7887008506872325, + "grad_norm": 6.462423723313001, + "learning_rate": 3.4521464702776864e-06, + "loss": 0.8358, + "step": 10917 + }, + { + "epoch": 0.7887730958874419, + "grad_norm": 7.803410972698919, + "learning_rate": 3.451876006313052e-06, + "loss": 0.8898, + "step": 10918 + }, + { + "epoch": 0.7888453410876515, + "grad_norm": 5.78712321758385, + "learning_rate": 3.4516055293179734e-06, + "loss": 0.8194, + "step": 10919 + }, + { + "epoch": 0.788917586287861, + "grad_norm": 6.150222538589146, + "learning_rate": 3.451335039296155e-06, + "loss": 0.8302, + "step": 10920 + }, + { + "epoch": 0.7889898314880706, + "grad_norm": 6.0249525012917955, + "learning_rate": 3.4510645362512986e-06, + "loss": 0.8328, + "step": 10921 + }, + { + "epoch": 0.78906207668828, + "grad_norm": 5.802672579400005, + "learning_rate": 3.450794020187107e-06, + "loss": 0.8418, + "step": 10922 + }, + { + "epoch": 0.7891343218884895, + "grad_norm": 6.37553672775042, + "learning_rate": 3.4505234911072827e-06, + "loss": 0.8932, + "step": 10923 + }, + { + "epoch": 0.7892065670886991, + "grad_norm": 6.469604730574585, + "learning_rate": 3.4502529490155305e-06, + "loss": 0.8821, + "step": 10924 + }, + { + "epoch": 0.7892788122889085, + "grad_norm": 6.466674766666544, + "learning_rate": 3.449982393915553e-06, + "loss": 0.7714, + "step": 10925 + }, + { + "epoch": 0.789351057489118, + "grad_norm": 5.472298130513691, + "learning_rate": 3.449711825811055e-06, + "loss": 0.8308, + "step": 10926 + }, + { + "epoch": 0.7894233026893276, + "grad_norm": 5.744008135053989, + "learning_rate": 3.4494412447057386e-06, + "loss": 0.8138, + "step": 10927 + }, + { + "epoch": 0.7894955478895371, + "grad_norm": 9.540259733682978, + "learning_rate": 3.4491706506033094e-06, + "loss": 0.8271, + "step": 10928 + }, + { + "epoch": 0.7895677930897466, + "grad_norm": 7.201449979373593, + "learning_rate": 3.4489000435074697e-06, + "loss": 0.7663, + "step": 10929 + }, + { + "epoch": 0.7896400382899561, + "grad_norm": 7.061071740951183, + "learning_rate": 3.4486294234219266e-06, + "loss": 0.8867, + "step": 10930 + }, + { + "epoch": 0.7897122834901656, + "grad_norm": 6.79155008726952, + "learning_rate": 3.448358790350383e-06, + "loss": 0.8199, + "step": 10931 + }, + { + "epoch": 0.7897845286903752, + "grad_norm": 6.4623936185068125, + "learning_rate": 3.448088144296543e-06, + "loss": 0.8448, + "step": 10932 + }, + { + "epoch": 0.7898567738905846, + "grad_norm": 6.167161904459015, + "learning_rate": 3.4478174852641134e-06, + "loss": 0.7884, + "step": 10933 + }, + { + "epoch": 0.7899290190907942, + "grad_norm": 7.168146764223461, + "learning_rate": 3.447546813256798e-06, + "loss": 0.8145, + "step": 10934 + }, + { + "epoch": 0.7900012642910037, + "grad_norm": 5.156547399093851, + "learning_rate": 3.447276128278303e-06, + "loss": 0.7998, + "step": 10935 + }, + { + "epoch": 0.7900735094912131, + "grad_norm": 7.466981558744659, + "learning_rate": 3.447005430332332e-06, + "loss": 0.785, + "step": 10936 + }, + { + "epoch": 0.7901457546914227, + "grad_norm": 5.8763237942993936, + "learning_rate": 3.446734719422593e-06, + "loss": 0.7942, + "step": 10937 + }, + { + "epoch": 0.7902179998916322, + "grad_norm": 7.200241074234918, + "learning_rate": 3.4464639955527905e-06, + "loss": 0.853, + "step": 10938 + }, + { + "epoch": 0.7902902450918418, + "grad_norm": 5.257359206000942, + "learning_rate": 3.446193258726631e-06, + "loss": 0.7918, + "step": 10939 + }, + { + "epoch": 0.7903624902920512, + "grad_norm": 5.485962032218716, + "learning_rate": 3.4459225089478204e-06, + "loss": 0.8618, + "step": 10940 + }, + { + "epoch": 0.7904347354922607, + "grad_norm": 6.655846525330193, + "learning_rate": 3.445651746220065e-06, + "loss": 0.8561, + "step": 10941 + }, + { + "epoch": 0.7905069806924703, + "grad_norm": 5.511958386722274, + "learning_rate": 3.445380970547072e-06, + "loss": 0.7722, + "step": 10942 + }, + { + "epoch": 0.7905792258926797, + "grad_norm": 8.206922484107928, + "learning_rate": 3.4451101819325467e-06, + "loss": 0.8604, + "step": 10943 + }, + { + "epoch": 0.7906514710928892, + "grad_norm": 7.9459740767460545, + "learning_rate": 3.4448393803801973e-06, + "loss": 0.8923, + "step": 10944 + }, + { + "epoch": 0.7907237162930988, + "grad_norm": 5.631297845668949, + "learning_rate": 3.4445685658937293e-06, + "loss": 0.8876, + "step": 10945 + }, + { + "epoch": 0.7907959614933083, + "grad_norm": 5.370815688825123, + "learning_rate": 3.4442977384768527e-06, + "loss": 0.8629, + "step": 10946 + }, + { + "epoch": 0.7908682066935178, + "grad_norm": 5.96497261392809, + "learning_rate": 3.444026898133273e-06, + "loss": 0.7976, + "step": 10947 + }, + { + "epoch": 0.7909404518937273, + "grad_norm": 6.598725525307869, + "learning_rate": 3.443756044866697e-06, + "loss": 0.8072, + "step": 10948 + }, + { + "epoch": 0.7910126970939368, + "grad_norm": 6.203367905941485, + "learning_rate": 3.4434851786808345e-06, + "loss": 0.8744, + "step": 10949 + }, + { + "epoch": 0.7910849422941464, + "grad_norm": 6.537357085144684, + "learning_rate": 3.4432142995793915e-06, + "loss": 0.8891, + "step": 10950 + }, + { + "epoch": 0.7911571874943558, + "grad_norm": 7.138920085886645, + "learning_rate": 3.442943407566078e-06, + "loss": 0.8565, + "step": 10951 + }, + { + "epoch": 0.7912294326945654, + "grad_norm": 7.363644926363744, + "learning_rate": 3.442672502644601e-06, + "loss": 0.738, + "step": 10952 + }, + { + "epoch": 0.7913016778947749, + "grad_norm": 6.735276730522567, + "learning_rate": 3.4424015848186693e-06, + "loss": 0.9392, + "step": 10953 + }, + { + "epoch": 0.7913739230949843, + "grad_norm": 6.046718861537927, + "learning_rate": 3.442130654091992e-06, + "loss": 0.7512, + "step": 10954 + }, + { + "epoch": 0.7914461682951939, + "grad_norm": 5.592121761049671, + "learning_rate": 3.4418597104682777e-06, + "loss": 0.8264, + "step": 10955 + }, + { + "epoch": 0.7915184134954034, + "grad_norm": 7.335132233741571, + "learning_rate": 3.441588753951235e-06, + "loss": 0.8121, + "step": 10956 + }, + { + "epoch": 0.791590658695613, + "grad_norm": 6.3081943444579425, + "learning_rate": 3.4413177845445733e-06, + "loss": 0.8675, + "step": 10957 + }, + { + "epoch": 0.7916629038958224, + "grad_norm": 5.871491378750563, + "learning_rate": 3.4410468022520015e-06, + "loss": 0.8772, + "step": 10958 + }, + { + "epoch": 0.7917351490960319, + "grad_norm": 5.064606216653052, + "learning_rate": 3.44077580707723e-06, + "loss": 0.7705, + "step": 10959 + }, + { + "epoch": 0.7918073942962415, + "grad_norm": 5.028751108130082, + "learning_rate": 3.4405047990239694e-06, + "loss": 0.7912, + "step": 10960 + }, + { + "epoch": 0.7918796394964509, + "grad_norm": 5.554273372616251, + "learning_rate": 3.4402337780959265e-06, + "loss": 0.8812, + "step": 10961 + }, + { + "epoch": 0.7919518846966604, + "grad_norm": 6.728728225959075, + "learning_rate": 3.4399627442968144e-06, + "loss": 0.7472, + "step": 10962 + }, + { + "epoch": 0.79202412989687, + "grad_norm": 5.820369520484113, + "learning_rate": 3.4396916976303417e-06, + "loss": 0.7688, + "step": 10963 + }, + { + "epoch": 0.7920963750970795, + "grad_norm": 5.569687218367762, + "learning_rate": 3.4394206381002194e-06, + "loss": 0.8699, + "step": 10964 + }, + { + "epoch": 0.792168620297289, + "grad_norm": 7.906957059462607, + "learning_rate": 3.4391495657101583e-06, + "loss": 0.8608, + "step": 10965 + }, + { + "epoch": 0.7922408654974985, + "grad_norm": 5.8269372297329625, + "learning_rate": 3.438878480463868e-06, + "loss": 0.7832, + "step": 10966 + }, + { + "epoch": 0.792313110697708, + "grad_norm": 6.093128661732635, + "learning_rate": 3.4386073823650605e-06, + "loss": 0.8184, + "step": 10967 + }, + { + "epoch": 0.7923853558979176, + "grad_norm": 6.568189788362336, + "learning_rate": 3.438336271417447e-06, + "loss": 0.9302, + "step": 10968 + }, + { + "epoch": 0.792457601098127, + "grad_norm": 5.709210553486617, + "learning_rate": 3.438065147624739e-06, + "loss": 0.7511, + "step": 10969 + }, + { + "epoch": 0.7925298462983366, + "grad_norm": 7.2091796319398345, + "learning_rate": 3.4377940109906466e-06, + "loss": 0.9033, + "step": 10970 + }, + { + "epoch": 0.7926020914985461, + "grad_norm": 5.750706836543926, + "learning_rate": 3.4375228615188834e-06, + "loss": 0.8756, + "step": 10971 + }, + { + "epoch": 0.7926743366987555, + "grad_norm": 6.825519963952952, + "learning_rate": 3.4372516992131587e-06, + "loss": 0.774, + "step": 10972 + }, + { + "epoch": 0.7927465818989651, + "grad_norm": 6.6506186290761, + "learning_rate": 3.436980524077187e-06, + "loss": 0.8718, + "step": 10973 + }, + { + "epoch": 0.7928188270991746, + "grad_norm": 6.844715346442694, + "learning_rate": 3.43670933611468e-06, + "loss": 0.8387, + "step": 10974 + }, + { + "epoch": 0.7928910722993842, + "grad_norm": 5.825485327153009, + "learning_rate": 3.436438135329348e-06, + "loss": 0.8411, + "step": 10975 + }, + { + "epoch": 0.7929633174995936, + "grad_norm": 5.1576447421171485, + "learning_rate": 3.4361669217249056e-06, + "loss": 0.8186, + "step": 10976 + }, + { + "epoch": 0.7930355626998031, + "grad_norm": 6.342790202176651, + "learning_rate": 3.435895695305065e-06, + "loss": 0.8597, + "step": 10977 + }, + { + "epoch": 0.7931078079000127, + "grad_norm": 5.471711844425184, + "learning_rate": 3.43562445607354e-06, + "loss": 0.7698, + "step": 10978 + }, + { + "epoch": 0.7931800531002221, + "grad_norm": 5.88043253910151, + "learning_rate": 3.435353204034041e-06, + "loss": 0.8072, + "step": 10979 + }, + { + "epoch": 0.7932522983004316, + "grad_norm": 6.775685079361181, + "learning_rate": 3.435081939190284e-06, + "loss": 0.8412, + "step": 10980 + }, + { + "epoch": 0.7933245435006412, + "grad_norm": 7.152191743844865, + "learning_rate": 3.4348106615459807e-06, + "loss": 0.8635, + "step": 10981 + }, + { + "epoch": 0.7933967887008507, + "grad_norm": 5.531898589820374, + "learning_rate": 3.4345393711048454e-06, + "loss": 0.768, + "step": 10982 + }, + { + "epoch": 0.7934690339010602, + "grad_norm": 6.708984943595318, + "learning_rate": 3.434268067870592e-06, + "loss": 0.9403, + "step": 10983 + }, + { + "epoch": 0.7935412791012697, + "grad_norm": 7.059595101392534, + "learning_rate": 3.433996751846934e-06, + "loss": 0.8295, + "step": 10984 + }, + { + "epoch": 0.7936135243014792, + "grad_norm": 5.850490008315743, + "learning_rate": 3.433725423037586e-06, + "loss": 0.8324, + "step": 10985 + }, + { + "epoch": 0.7936857695016888, + "grad_norm": 5.510939555847806, + "learning_rate": 3.4334540814462606e-06, + "loss": 0.8045, + "step": 10986 + }, + { + "epoch": 0.7937580147018982, + "grad_norm": 5.280684943132655, + "learning_rate": 3.433182727076675e-06, + "loss": 0.7555, + "step": 10987 + }, + { + "epoch": 0.7938302599021078, + "grad_norm": 6.4600822131709545, + "learning_rate": 3.4329113599325426e-06, + "loss": 0.7502, + "step": 10988 + }, + { + "epoch": 0.7939025051023173, + "grad_norm": 5.68362059701563, + "learning_rate": 3.4326399800175774e-06, + "loss": 0.8823, + "step": 10989 + }, + { + "epoch": 0.7939747503025267, + "grad_norm": 5.876618831884851, + "learning_rate": 3.432368587335495e-06, + "loss": 0.7787, + "step": 10990 + }, + { + "epoch": 0.7940469955027363, + "grad_norm": 6.277528789762642, + "learning_rate": 3.4320971818900105e-06, + "loss": 0.8696, + "step": 10991 + }, + { + "epoch": 0.7941192407029458, + "grad_norm": 6.683774106274908, + "learning_rate": 3.4318257636848396e-06, + "loss": 0.8076, + "step": 10992 + }, + { + "epoch": 0.7941914859031554, + "grad_norm": 4.99390650421607, + "learning_rate": 3.431554332723697e-06, + "loss": 0.8099, + "step": 10993 + }, + { + "epoch": 0.7942637311033648, + "grad_norm": 6.005330261245493, + "learning_rate": 3.4312828890103e-06, + "loss": 0.7473, + "step": 10994 + }, + { + "epoch": 0.7943359763035743, + "grad_norm": 6.133800801140057, + "learning_rate": 3.431011432548362e-06, + "loss": 0.8251, + "step": 10995 + }, + { + "epoch": 0.7944082215037839, + "grad_norm": 6.935440995126049, + "learning_rate": 3.430739963341601e-06, + "loss": 0.8025, + "step": 10996 + }, + { + "epoch": 0.7944804667039933, + "grad_norm": 6.179282540460806, + "learning_rate": 3.4304684813937335e-06, + "loss": 0.8754, + "step": 10997 + }, + { + "epoch": 0.7945527119042028, + "grad_norm": 9.181470903890311, + "learning_rate": 3.430196986708474e-06, + "loss": 0.9057, + "step": 10998 + }, + { + "epoch": 0.7946249571044124, + "grad_norm": 7.254731574531608, + "learning_rate": 3.4299254792895398e-06, + "loss": 0.8063, + "step": 10999 + }, + { + "epoch": 0.7946972023046219, + "grad_norm": 6.629105933019279, + "learning_rate": 3.4296539591406476e-06, + "loss": 0.8914, + "step": 11000 + }, + { + "epoch": 0.7947694475048314, + "grad_norm": 6.878467864278506, + "learning_rate": 3.429382426265515e-06, + "loss": 0.7659, + "step": 11001 + }, + { + "epoch": 0.7948416927050409, + "grad_norm": 7.281387262319399, + "learning_rate": 3.4291108806678584e-06, + "loss": 0.8016, + "step": 11002 + }, + { + "epoch": 0.7949139379052504, + "grad_norm": 7.20614567306447, + "learning_rate": 3.428839322351396e-06, + "loss": 0.8205, + "step": 11003 + }, + { + "epoch": 0.7949861831054599, + "grad_norm": 5.411919239769574, + "learning_rate": 3.428567751319844e-06, + "loss": 0.8017, + "step": 11004 + }, + { + "epoch": 0.7950584283056694, + "grad_norm": 6.154654823869212, + "learning_rate": 3.42829616757692e-06, + "loss": 0.8229, + "step": 11005 + }, + { + "epoch": 0.795130673505879, + "grad_norm": 4.986544815417484, + "learning_rate": 3.4280245711263434e-06, + "loss": 0.8604, + "step": 11006 + }, + { + "epoch": 0.7952029187060885, + "grad_norm": 8.773806203294482, + "learning_rate": 3.4277529619718297e-06, + "loss": 0.7431, + "step": 11007 + }, + { + "epoch": 0.7952751639062979, + "grad_norm": 5.589537670144714, + "learning_rate": 3.427481340117099e-06, + "loss": 0.8321, + "step": 11008 + }, + { + "epoch": 0.7953474091065075, + "grad_norm": 8.433766082798092, + "learning_rate": 3.4272097055658688e-06, + "loss": 0.8738, + "step": 11009 + }, + { + "epoch": 0.795419654306717, + "grad_norm": 6.846096524894594, + "learning_rate": 3.4269380583218576e-06, + "loss": 0.8456, + "step": 11010 + }, + { + "epoch": 0.7954918995069266, + "grad_norm": 5.231477852329032, + "learning_rate": 3.426666398388784e-06, + "loss": 0.8211, + "step": 11011 + }, + { + "epoch": 0.795564144707136, + "grad_norm": 6.481322523721076, + "learning_rate": 3.4263947257703673e-06, + "loss": 0.8999, + "step": 11012 + }, + { + "epoch": 0.7956363899073455, + "grad_norm": 6.395828329906515, + "learning_rate": 3.426123040470326e-06, + "loss": 0.8722, + "step": 11013 + }, + { + "epoch": 0.7957086351075551, + "grad_norm": 7.16234132813993, + "learning_rate": 3.4258513424923785e-06, + "loss": 0.9548, + "step": 11014 + }, + { + "epoch": 0.7957808803077645, + "grad_norm": 6.25311201819617, + "learning_rate": 3.4255796318402463e-06, + "loss": 0.8477, + "step": 11015 + }, + { + "epoch": 0.795853125507974, + "grad_norm": 7.191172316774977, + "learning_rate": 3.4253079085176468e-06, + "loss": 0.7575, + "step": 11016 + }, + { + "epoch": 0.7959253707081836, + "grad_norm": 6.408044479996779, + "learning_rate": 3.425036172528301e-06, + "loss": 0.7991, + "step": 11017 + }, + { + "epoch": 0.7959976159083931, + "grad_norm": 7.110175789807372, + "learning_rate": 3.4247644238759274e-06, + "loss": 0.8475, + "step": 11018 + }, + { + "epoch": 0.7960698611086026, + "grad_norm": 7.063974412260446, + "learning_rate": 3.424492662564247e-06, + "loss": 0.847, + "step": 11019 + }, + { + "epoch": 0.7961421063088121, + "grad_norm": 5.948526523766628, + "learning_rate": 3.42422088859698e-06, + "loss": 0.8495, + "step": 11020 + }, + { + "epoch": 0.7962143515090216, + "grad_norm": 5.262732370642163, + "learning_rate": 3.4239491019778475e-06, + "loss": 0.8312, + "step": 11021 + }, + { + "epoch": 0.7962865967092311, + "grad_norm": 4.96020598537585, + "learning_rate": 3.423677302710569e-06, + "loss": 0.8214, + "step": 11022 + }, + { + "epoch": 0.7963588419094406, + "grad_norm": 5.5272519592643325, + "learning_rate": 3.423405490798865e-06, + "loss": 0.721, + "step": 11023 + }, + { + "epoch": 0.7964310871096502, + "grad_norm": 5.46811327906085, + "learning_rate": 3.4231336662464563e-06, + "loss": 0.7903, + "step": 11024 + }, + { + "epoch": 0.7965033323098597, + "grad_norm": 7.543078239511544, + "learning_rate": 3.422861829057065e-06, + "loss": 0.8251, + "step": 11025 + }, + { + "epoch": 0.7965755775100691, + "grad_norm": 8.668080972544155, + "learning_rate": 3.422589979234412e-06, + "loss": 0.8406, + "step": 11026 + }, + { + "epoch": 0.7966478227102787, + "grad_norm": 5.493337931276144, + "learning_rate": 3.4223181167822183e-06, + "loss": 0.812, + "step": 11027 + }, + { + "epoch": 0.7967200679104882, + "grad_norm": 7.222269892739017, + "learning_rate": 3.4220462417042053e-06, + "loss": 0.8902, + "step": 11028 + }, + { + "epoch": 0.7967923131106978, + "grad_norm": 6.2380938230101775, + "learning_rate": 3.4217743540040966e-06, + "loss": 0.8558, + "step": 11029 + }, + { + "epoch": 0.7968645583109072, + "grad_norm": 6.191333854564914, + "learning_rate": 3.421502453685611e-06, + "loss": 0.817, + "step": 11030 + }, + { + "epoch": 0.7969368035111167, + "grad_norm": 5.831477533053339, + "learning_rate": 3.4212305407524738e-06, + "loss": 0.7897, + "step": 11031 + }, + { + "epoch": 0.7970090487113263, + "grad_norm": 8.898409634915765, + "learning_rate": 3.420958615208405e-06, + "loss": 0.787, + "step": 11032 + }, + { + "epoch": 0.7970812939115357, + "grad_norm": 7.828865671047914, + "learning_rate": 3.4206866770571273e-06, + "loss": 0.8215, + "step": 11033 + }, + { + "epoch": 0.7971535391117452, + "grad_norm": 5.723499431269833, + "learning_rate": 3.4204147263023647e-06, + "loss": 0.7277, + "step": 11034 + }, + { + "epoch": 0.7972257843119548, + "grad_norm": 6.04468490801688, + "learning_rate": 3.420142762947839e-06, + "loss": 0.7764, + "step": 11035 + }, + { + "epoch": 0.7972980295121643, + "grad_norm": 5.450753732059319, + "learning_rate": 3.419870786997273e-06, + "loss": 0.7975, + "step": 11036 + }, + { + "epoch": 0.7973702747123738, + "grad_norm": 6.512093371328785, + "learning_rate": 3.419598798454391e-06, + "loss": 0.8568, + "step": 11037 + }, + { + "epoch": 0.7974425199125833, + "grad_norm": 6.595956442240917, + "learning_rate": 3.4193267973229145e-06, + "loss": 0.8569, + "step": 11038 + }, + { + "epoch": 0.7975147651127928, + "grad_norm": 8.716588343699364, + "learning_rate": 3.419054783606568e-06, + "loss": 0.8576, + "step": 11039 + }, + { + "epoch": 0.7975870103130023, + "grad_norm": 6.947396582526604, + "learning_rate": 3.4187827573090758e-06, + "loss": 0.8458, + "step": 11040 + }, + { + "epoch": 0.7976592555132118, + "grad_norm": 7.248964696165912, + "learning_rate": 3.4185107184341604e-06, + "loss": 0.8081, + "step": 11041 + }, + { + "epoch": 0.7977315007134214, + "grad_norm": 5.000423604187297, + "learning_rate": 3.418238666985547e-06, + "loss": 0.8, + "step": 11042 + }, + { + "epoch": 0.7978037459136309, + "grad_norm": 6.646989137918714, + "learning_rate": 3.4179666029669583e-06, + "loss": 0.7919, + "step": 11043 + }, + { + "epoch": 0.7978759911138403, + "grad_norm": 6.575298218436709, + "learning_rate": 3.4176945263821204e-06, + "loss": 0.8284, + "step": 11044 + }, + { + "epoch": 0.7979482363140499, + "grad_norm": 5.289204763192434, + "learning_rate": 3.4174224372347565e-06, + "loss": 0.7927, + "step": 11045 + }, + { + "epoch": 0.7980204815142594, + "grad_norm": 5.766980846571003, + "learning_rate": 3.417150335528592e-06, + "loss": 0.7748, + "step": 11046 + }, + { + "epoch": 0.798092726714469, + "grad_norm": 6.817511998422358, + "learning_rate": 3.416878221267352e-06, + "loss": 0.8119, + "step": 11047 + }, + { + "epoch": 0.7981649719146784, + "grad_norm": 8.753795563501477, + "learning_rate": 3.41660609445476e-06, + "loss": 0.8469, + "step": 11048 + }, + { + "epoch": 0.7982372171148879, + "grad_norm": 7.110844521028856, + "learning_rate": 3.4163339550945435e-06, + "loss": 0.8742, + "step": 11049 + }, + { + "epoch": 0.7983094623150975, + "grad_norm": 7.432395914723807, + "learning_rate": 3.416061803190426e-06, + "loss": 0.889, + "step": 11050 + }, + { + "epoch": 0.7983817075153069, + "grad_norm": 7.236171557010671, + "learning_rate": 3.415789638746134e-06, + "loss": 0.834, + "step": 11051 + }, + { + "epoch": 0.7984539527155164, + "grad_norm": 7.945506705852317, + "learning_rate": 3.415517461765392e-06, + "loss": 0.8368, + "step": 11052 + }, + { + "epoch": 0.798526197915726, + "grad_norm": 5.938127344016779, + "learning_rate": 3.4152452722519273e-06, + "loss": 0.8134, + "step": 11053 + }, + { + "epoch": 0.7985984431159355, + "grad_norm": 5.447548887288166, + "learning_rate": 3.4149730702094664e-06, + "loss": 0.8534, + "step": 11054 + }, + { + "epoch": 0.798670688316145, + "grad_norm": 6.646974503490203, + "learning_rate": 3.4147008556417333e-06, + "loss": 0.8802, + "step": 11055 + }, + { + "epoch": 0.7987429335163545, + "grad_norm": 6.1382269217121355, + "learning_rate": 3.4144286285524564e-06, + "loss": 0.8238, + "step": 11056 + }, + { + "epoch": 0.798815178716564, + "grad_norm": 10.186618134880629, + "learning_rate": 3.4141563889453612e-06, + "loss": 0.9405, + "step": 11057 + }, + { + "epoch": 0.7988874239167735, + "grad_norm": 5.719341977023798, + "learning_rate": 3.413884136824175e-06, + "loss": 0.8713, + "step": 11058 + }, + { + "epoch": 0.798959669116983, + "grad_norm": 6.566240389134439, + "learning_rate": 3.4136118721926243e-06, + "loss": 0.8032, + "step": 11059 + }, + { + "epoch": 0.7990319143171926, + "grad_norm": 7.1508318117101055, + "learning_rate": 3.413339595054437e-06, + "loss": 0.8984, + "step": 11060 + }, + { + "epoch": 0.7991041595174021, + "grad_norm": 7.223402233396632, + "learning_rate": 3.413067305413339e-06, + "loss": 0.8766, + "step": 11061 + }, + { + "epoch": 0.7991764047176115, + "grad_norm": 5.637646509537815, + "learning_rate": 3.4127950032730596e-06, + "loss": 0.7118, + "step": 11062 + }, + { + "epoch": 0.7992486499178211, + "grad_norm": 6.316497141526782, + "learning_rate": 3.4125226886373246e-06, + "loss": 0.9031, + "step": 11063 + }, + { + "epoch": 0.7993208951180306, + "grad_norm": 6.157539658598782, + "learning_rate": 3.4122503615098623e-06, + "loss": 0.8319, + "step": 11064 + }, + { + "epoch": 0.7993931403182402, + "grad_norm": 5.3804417941416744, + "learning_rate": 3.411978021894402e-06, + "loss": 0.8245, + "step": 11065 + }, + { + "epoch": 0.7994653855184496, + "grad_norm": 6.62728871092538, + "learning_rate": 3.4117056697946694e-06, + "loss": 0.8634, + "step": 11066 + }, + { + "epoch": 0.7995376307186591, + "grad_norm": 5.324937689331554, + "learning_rate": 3.4114333052143943e-06, + "loss": 0.862, + "step": 11067 + }, + { + "epoch": 0.7996098759188687, + "grad_norm": 5.699234479394846, + "learning_rate": 3.4111609281573057e-06, + "loss": 0.8047, + "step": 11068 + }, + { + "epoch": 0.7996821211190781, + "grad_norm": 6.628782811811088, + "learning_rate": 3.4108885386271313e-06, + "loss": 0.9097, + "step": 11069 + }, + { + "epoch": 0.7997543663192876, + "grad_norm": 5.461052409042294, + "learning_rate": 3.410616136627599e-06, + "loss": 0.7523, + "step": 11070 + }, + { + "epoch": 0.7998266115194972, + "grad_norm": 6.072172962363909, + "learning_rate": 3.4103437221624404e-06, + "loss": 0.789, + "step": 11071 + }, + { + "epoch": 0.7998988567197067, + "grad_norm": 5.44678852758965, + "learning_rate": 3.410071295235382e-06, + "loss": 0.7837, + "step": 11072 + }, + { + "epoch": 0.7999711019199162, + "grad_norm": 6.08301745426371, + "learning_rate": 3.4097988558501544e-06, + "loss": 0.8344, + "step": 11073 + }, + { + "epoch": 0.8000433471201257, + "grad_norm": 6.007486758683844, + "learning_rate": 3.4095264040104873e-06, + "loss": 0.8723, + "step": 11074 + }, + { + "epoch": 0.8001155923203352, + "grad_norm": 5.0587888702113295, + "learning_rate": 3.4092539397201097e-06, + "loss": 0.8579, + "step": 11075 + }, + { + "epoch": 0.8001878375205447, + "grad_norm": 5.247878599716583, + "learning_rate": 3.408981462982751e-06, + "loss": 0.7996, + "step": 11076 + }, + { + "epoch": 0.8002600827207542, + "grad_norm": 7.165620890174083, + "learning_rate": 3.4087089738021422e-06, + "loss": 0.8701, + "step": 11077 + }, + { + "epoch": 0.8003323279209638, + "grad_norm": 5.687126398915897, + "learning_rate": 3.408436472182014e-06, + "loss": 0.8512, + "step": 11078 + }, + { + "epoch": 0.8004045731211733, + "grad_norm": 6.227229639016919, + "learning_rate": 3.4081639581260954e-06, + "loss": 0.8512, + "step": 11079 + }, + { + "epoch": 0.8004768183213827, + "grad_norm": 5.968822219172056, + "learning_rate": 3.4078914316381173e-06, + "loss": 0.8815, + "step": 11080 + }, + { + "epoch": 0.8005490635215923, + "grad_norm": 6.064829585235616, + "learning_rate": 3.4076188927218103e-06, + "loss": 0.862, + "step": 11081 + }, + { + "epoch": 0.8006213087218018, + "grad_norm": 6.220746050676072, + "learning_rate": 3.407346341380905e-06, + "loss": 0.748, + "step": 11082 + }, + { + "epoch": 0.8006935539220114, + "grad_norm": 6.988936536930088, + "learning_rate": 3.4070737776191332e-06, + "loss": 0.8137, + "step": 11083 + }, + { + "epoch": 0.8007657991222208, + "grad_norm": 8.137294446748621, + "learning_rate": 3.406801201440225e-06, + "loss": 0.8297, + "step": 11084 + }, + { + "epoch": 0.8008380443224303, + "grad_norm": 7.860635402321027, + "learning_rate": 3.4065286128479134e-06, + "loss": 0.8754, + "step": 11085 + }, + { + "epoch": 0.8009102895226399, + "grad_norm": 6.794114808620387, + "learning_rate": 3.406256011845928e-06, + "loss": 0.8727, + "step": 11086 + }, + { + "epoch": 0.8009825347228493, + "grad_norm": 7.063988452802575, + "learning_rate": 3.405983398438003e-06, + "loss": 0.8457, + "step": 11087 + }, + { + "epoch": 0.8010547799230588, + "grad_norm": 6.348941576598922, + "learning_rate": 3.4057107726278682e-06, + "loss": 0.8392, + "step": 11088 + }, + { + "epoch": 0.8011270251232684, + "grad_norm": 6.177898934241254, + "learning_rate": 3.4054381344192556e-06, + "loss": 0.8234, + "step": 11089 + }, + { + "epoch": 0.8011992703234779, + "grad_norm": 6.8167895889702335, + "learning_rate": 3.4051654838158976e-06, + "loss": 0.9577, + "step": 11090 + }, + { + "epoch": 0.8012715155236874, + "grad_norm": 5.018530268627351, + "learning_rate": 3.404892820821527e-06, + "loss": 0.8784, + "step": 11091 + }, + { + "epoch": 0.8013437607238969, + "grad_norm": 7.29975680638561, + "learning_rate": 3.4046201454398774e-06, + "loss": 0.8411, + "step": 11092 + }, + { + "epoch": 0.8014160059241064, + "grad_norm": 6.331514030290976, + "learning_rate": 3.4043474576746794e-06, + "loss": 0.8674, + "step": 11093 + }, + { + "epoch": 0.8014882511243159, + "grad_norm": 6.374957813796539, + "learning_rate": 3.404074757529668e-06, + "loss": 0.8196, + "step": 11094 + }, + { + "epoch": 0.8015604963245254, + "grad_norm": 5.683023220935095, + "learning_rate": 3.403802045008574e-06, + "loss": 0.789, + "step": 11095 + }, + { + "epoch": 0.801632741524735, + "grad_norm": 6.144128319393711, + "learning_rate": 3.403529320115132e-06, + "loss": 0.8844, + "step": 11096 + }, + { + "epoch": 0.8017049867249445, + "grad_norm": 8.634471613053174, + "learning_rate": 3.4032565828530756e-06, + "loss": 0.8484, + "step": 11097 + }, + { + "epoch": 0.8017772319251539, + "grad_norm": 6.4933302478404356, + "learning_rate": 3.402983833226138e-06, + "loss": 0.8658, + "step": 11098 + }, + { + "epoch": 0.8018494771253635, + "grad_norm": 6.59426950152851, + "learning_rate": 3.402711071238052e-06, + "loss": 0.809, + "step": 11099 + }, + { + "epoch": 0.801921722325573, + "grad_norm": 6.349349834168093, + "learning_rate": 3.4024382968925523e-06, + "loss": 0.7569, + "step": 11100 + }, + { + "epoch": 0.8019939675257826, + "grad_norm": 5.061727394206136, + "learning_rate": 3.4021655101933736e-06, + "loss": 0.8199, + "step": 11101 + }, + { + "epoch": 0.802066212725992, + "grad_norm": 6.123557095967046, + "learning_rate": 3.4018927111442496e-06, + "loss": 0.7759, + "step": 11102 + }, + { + "epoch": 0.8021384579262015, + "grad_norm": 5.836274776900448, + "learning_rate": 3.401619899748915e-06, + "loss": 0.8873, + "step": 11103 + }, + { + "epoch": 0.8022107031264111, + "grad_norm": 7.676408659552152, + "learning_rate": 3.4013470760111034e-06, + "loss": 0.7621, + "step": 11104 + }, + { + "epoch": 0.8022829483266205, + "grad_norm": 6.562742101654023, + "learning_rate": 3.4010742399345503e-06, + "loss": 0.8713, + "step": 11105 + }, + { + "epoch": 0.80235519352683, + "grad_norm": 6.212985793832562, + "learning_rate": 3.400801391522991e-06, + "loss": 0.851, + "step": 11106 + }, + { + "epoch": 0.8024274387270396, + "grad_norm": 6.890278433390659, + "learning_rate": 3.4005285307801595e-06, + "loss": 0.8213, + "step": 11107 + }, + { + "epoch": 0.8024996839272491, + "grad_norm": 8.06090772415876, + "learning_rate": 3.400255657709792e-06, + "loss": 0.8911, + "step": 11108 + }, + { + "epoch": 0.8025719291274586, + "grad_norm": 6.2211912337937045, + "learning_rate": 3.3999827723156225e-06, + "loss": 0.8418, + "step": 11109 + }, + { + "epoch": 0.8026441743276681, + "grad_norm": 6.683804355426712, + "learning_rate": 3.399709874601389e-06, + "loss": 0.8981, + "step": 11110 + }, + { + "epoch": 0.8027164195278776, + "grad_norm": 6.472689548514982, + "learning_rate": 3.3994369645708247e-06, + "loss": 0.7351, + "step": 11111 + }, + { + "epoch": 0.8027886647280871, + "grad_norm": 7.351972675941009, + "learning_rate": 3.399164042227668e-06, + "loss": 0.7798, + "step": 11112 + }, + { + "epoch": 0.8028609099282966, + "grad_norm": 6.093334633780465, + "learning_rate": 3.3988911075756527e-06, + "loss": 0.8413, + "step": 11113 + }, + { + "epoch": 0.8029331551285062, + "grad_norm": 6.572785663860997, + "learning_rate": 3.3986181606185163e-06, + "loss": 0.8555, + "step": 11114 + }, + { + "epoch": 0.8030054003287157, + "grad_norm": 8.793819893001384, + "learning_rate": 3.3983452013599955e-06, + "loss": 0.8485, + "step": 11115 + }, + { + "epoch": 0.8030776455289251, + "grad_norm": 6.286437667985486, + "learning_rate": 3.398072229803826e-06, + "loss": 0.8626, + "step": 11116 + }, + { + "epoch": 0.8031498907291347, + "grad_norm": 6.442965659386763, + "learning_rate": 3.3977992459537456e-06, + "loss": 0.8564, + "step": 11117 + }, + { + "epoch": 0.8032221359293442, + "grad_norm": 5.817845337522981, + "learning_rate": 3.39752624981349e-06, + "loss": 0.7627, + "step": 11118 + }, + { + "epoch": 0.8032943811295538, + "grad_norm": 6.9655480662530245, + "learning_rate": 3.3972532413867965e-06, + "loss": 0.8189, + "step": 11119 + }, + { + "epoch": 0.8033666263297632, + "grad_norm": 5.529718909129944, + "learning_rate": 3.3969802206774037e-06, + "loss": 0.7838, + "step": 11120 + }, + { + "epoch": 0.8034388715299727, + "grad_norm": 6.181882523900518, + "learning_rate": 3.3967071876890477e-06, + "loss": 0.8686, + "step": 11121 + }, + { + "epoch": 0.8035111167301823, + "grad_norm": 6.8200001158952, + "learning_rate": 3.3964341424254665e-06, + "loss": 0.8728, + "step": 11122 + }, + { + "epoch": 0.8035833619303917, + "grad_norm": 6.814066295627096, + "learning_rate": 3.3961610848903977e-06, + "loss": 0.8219, + "step": 11123 + }, + { + "epoch": 0.8036556071306012, + "grad_norm": 6.053581363173096, + "learning_rate": 3.39588801508758e-06, + "loss": 0.8499, + "step": 11124 + }, + { + "epoch": 0.8037278523308108, + "grad_norm": 7.102978857830365, + "learning_rate": 3.3956149330207506e-06, + "loss": 0.8221, + "step": 11125 + }, + { + "epoch": 0.8038000975310203, + "grad_norm": 7.120530985631197, + "learning_rate": 3.3953418386936487e-06, + "loss": 0.8437, + "step": 11126 + }, + { + "epoch": 0.8038723427312298, + "grad_norm": 4.778419604303804, + "learning_rate": 3.3950687321100118e-06, + "loss": 0.748, + "step": 11127 + }, + { + "epoch": 0.8039445879314393, + "grad_norm": 5.2859164055728405, + "learning_rate": 3.3947956132735787e-06, + "loss": 0.8963, + "step": 11128 + }, + { + "epoch": 0.8040168331316488, + "grad_norm": 5.070782514749538, + "learning_rate": 3.394522482188089e-06, + "loss": 0.7922, + "step": 11129 + }, + { + "epoch": 0.8040890783318583, + "grad_norm": 6.77849498282456, + "learning_rate": 3.3942493388572804e-06, + "loss": 0.8081, + "step": 11130 + }, + { + "epoch": 0.8041613235320678, + "grad_norm": 6.740017326174237, + "learning_rate": 3.3939761832848937e-06, + "loss": 0.856, + "step": 11131 + }, + { + "epoch": 0.8042335687322774, + "grad_norm": 6.288606956521713, + "learning_rate": 3.3937030154746665e-06, + "loss": 0.7607, + "step": 11132 + }, + { + "epoch": 0.8043058139324869, + "grad_norm": 6.84823753905758, + "learning_rate": 3.3934298354303384e-06, + "loss": 0.9005, + "step": 11133 + }, + { + "epoch": 0.8043780591326963, + "grad_norm": 6.554790800642348, + "learning_rate": 3.3931566431556503e-06, + "loss": 0.8444, + "step": 11134 + }, + { + "epoch": 0.8044503043329059, + "grad_norm": 5.8890142287504945, + "learning_rate": 3.3928834386543414e-06, + "loss": 0.8533, + "step": 11135 + }, + { + "epoch": 0.8045225495331154, + "grad_norm": 5.46801002945779, + "learning_rate": 3.392610221930151e-06, + "loss": 0.8047, + "step": 11136 + }, + { + "epoch": 0.804594794733325, + "grad_norm": 5.526023334996495, + "learning_rate": 3.39233699298682e-06, + "loss": 0.7787, + "step": 11137 + }, + { + "epoch": 0.8046670399335344, + "grad_norm": 6.384631790620869, + "learning_rate": 3.3920637518280886e-06, + "loss": 0.8735, + "step": 11138 + }, + { + "epoch": 0.8047392851337439, + "grad_norm": 7.723033133665258, + "learning_rate": 3.3917904984576967e-06, + "loss": 0.8438, + "step": 11139 + }, + { + "epoch": 0.8048115303339535, + "grad_norm": 7.923095609088768, + "learning_rate": 3.3915172328793857e-06, + "loss": 0.8689, + "step": 11140 + }, + { + "epoch": 0.8048837755341629, + "grad_norm": 6.976410218015372, + "learning_rate": 3.3912439550968956e-06, + "loss": 0.8692, + "step": 11141 + }, + { + "epoch": 0.8049560207343724, + "grad_norm": 6.3215475679270545, + "learning_rate": 3.3909706651139674e-06, + "loss": 0.8317, + "step": 11142 + }, + { + "epoch": 0.805028265934582, + "grad_norm": 5.9722972934154495, + "learning_rate": 3.390697362934343e-06, + "loss": 0.8343, + "step": 11143 + }, + { + "epoch": 0.8051005111347915, + "grad_norm": 5.114060239852559, + "learning_rate": 3.3904240485617634e-06, + "loss": 0.7569, + "step": 11144 + }, + { + "epoch": 0.805172756335001, + "grad_norm": 6.608281182576017, + "learning_rate": 3.3901507219999707e-06, + "loss": 0.8612, + "step": 11145 + }, + { + "epoch": 0.8052450015352105, + "grad_norm": 6.321434119516914, + "learning_rate": 3.389877383252705e-06, + "loss": 0.8824, + "step": 11146 + }, + { + "epoch": 0.80531724673542, + "grad_norm": 5.848570768084873, + "learning_rate": 3.3896040323237084e-06, + "loss": 0.833, + "step": 11147 + }, + { + "epoch": 0.8053894919356295, + "grad_norm": 6.072638145939018, + "learning_rate": 3.3893306692167236e-06, + "loss": 0.8166, + "step": 11148 + }, + { + "epoch": 0.805461737135839, + "grad_norm": 5.1540731747946396, + "learning_rate": 3.3890572939354927e-06, + "loss": 0.9002, + "step": 11149 + }, + { + "epoch": 0.8055339823360486, + "grad_norm": 6.928097238171007, + "learning_rate": 3.388783906483758e-06, + "loss": 0.8295, + "step": 11150 + }, + { + "epoch": 0.8056062275362581, + "grad_norm": 8.467252141378783, + "learning_rate": 3.388510506865261e-06, + "loss": 0.8545, + "step": 11151 + }, + { + "epoch": 0.8056784727364675, + "grad_norm": 5.804283323117466, + "learning_rate": 3.3882370950837453e-06, + "loss": 0.8529, + "step": 11152 + }, + { + "epoch": 0.8057507179366771, + "grad_norm": 7.480601336945091, + "learning_rate": 3.3879636711429542e-06, + "loss": 0.8175, + "step": 11153 + }, + { + "epoch": 0.8058229631368866, + "grad_norm": 5.398930175205377, + "learning_rate": 3.3876902350466296e-06, + "loss": 0.8308, + "step": 11154 + }, + { + "epoch": 0.8058952083370962, + "grad_norm": 5.892075094538233, + "learning_rate": 3.387416786798514e-06, + "loss": 0.9056, + "step": 11155 + }, + { + "epoch": 0.8059674535373056, + "grad_norm": 5.949518508021882, + "learning_rate": 3.3871433264023525e-06, + "loss": 0.7877, + "step": 11156 + }, + { + "epoch": 0.8060396987375151, + "grad_norm": 6.664786296145662, + "learning_rate": 3.3868698538618873e-06, + "loss": 0.808, + "step": 11157 + }, + { + "epoch": 0.8061119439377247, + "grad_norm": 5.691640834647334, + "learning_rate": 3.3865963691808625e-06, + "loss": 0.7602, + "step": 11158 + }, + { + "epoch": 0.8061841891379341, + "grad_norm": 6.662645017036918, + "learning_rate": 3.3863228723630214e-06, + "loss": 0.8016, + "step": 11159 + }, + { + "epoch": 0.8062564343381436, + "grad_norm": 6.345008161583139, + "learning_rate": 3.3860493634121094e-06, + "loss": 0.8088, + "step": 11160 + }, + { + "epoch": 0.8063286795383532, + "grad_norm": 6.387249416504931, + "learning_rate": 3.385775842331869e-06, + "loss": 0.8512, + "step": 11161 + }, + { + "epoch": 0.8064009247385627, + "grad_norm": 7.226463259453713, + "learning_rate": 3.3855023091260447e-06, + "loss": 0.8734, + "step": 11162 + }, + { + "epoch": 0.8064731699387722, + "grad_norm": 5.85282446338678, + "learning_rate": 3.385228763798382e-06, + "loss": 0.7996, + "step": 11163 + }, + { + "epoch": 0.8065454151389817, + "grad_norm": 11.561827603970979, + "learning_rate": 3.3849552063526246e-06, + "loss": 0.9146, + "step": 11164 + }, + { + "epoch": 0.8066176603391912, + "grad_norm": 6.815830641437892, + "learning_rate": 3.3846816367925178e-06, + "loss": 0.8161, + "step": 11165 + }, + { + "epoch": 0.8066899055394007, + "grad_norm": 6.817147166036423, + "learning_rate": 3.384408055121805e-06, + "loss": 0.8036, + "step": 11166 + }, + { + "epoch": 0.8067621507396102, + "grad_norm": 5.540121431913445, + "learning_rate": 3.3841344613442345e-06, + "loss": 0.93, + "step": 11167 + }, + { + "epoch": 0.8068343959398198, + "grad_norm": 8.918351153923437, + "learning_rate": 3.383860855463549e-06, + "loss": 0.8436, + "step": 11168 + }, + { + "epoch": 0.8069066411400293, + "grad_norm": 5.714380147698621, + "learning_rate": 3.3835872374834943e-06, + "loss": 0.8617, + "step": 11169 + }, + { + "epoch": 0.8069788863402387, + "grad_norm": 5.837290738457629, + "learning_rate": 3.3833136074078165e-06, + "loss": 0.8198, + "step": 11170 + }, + { + "epoch": 0.8070511315404483, + "grad_norm": 7.799248282571427, + "learning_rate": 3.3830399652402614e-06, + "loss": 0.7595, + "step": 11171 + }, + { + "epoch": 0.8071233767406578, + "grad_norm": 5.8045284613980215, + "learning_rate": 3.382766310984575e-06, + "loss": 0.8343, + "step": 11172 + }, + { + "epoch": 0.8071956219408674, + "grad_norm": 7.173749691155083, + "learning_rate": 3.382492644644503e-06, + "loss": 0.8269, + "step": 11173 + }, + { + "epoch": 0.8072678671410768, + "grad_norm": 7.204022597782122, + "learning_rate": 3.382218966223792e-06, + "loss": 0.77, + "step": 11174 + }, + { + "epoch": 0.8073401123412863, + "grad_norm": 6.671770653366544, + "learning_rate": 3.3819452757261882e-06, + "loss": 0.7891, + "step": 11175 + }, + { + "epoch": 0.8074123575414959, + "grad_norm": 8.293775076350325, + "learning_rate": 3.3816715731554384e-06, + "loss": 0.8479, + "step": 11176 + }, + { + "epoch": 0.8074846027417053, + "grad_norm": 8.210165319601995, + "learning_rate": 3.381397858515289e-06, + "loss": 0.9174, + "step": 11177 + }, + { + "epoch": 0.8075568479419148, + "grad_norm": 6.589694687944371, + "learning_rate": 3.381124131809488e-06, + "loss": 0.7838, + "step": 11178 + }, + { + "epoch": 0.8076290931421244, + "grad_norm": 4.417805986585131, + "learning_rate": 3.3808503930417812e-06, + "loss": 0.7794, + "step": 11179 + }, + { + "epoch": 0.8077013383423339, + "grad_norm": 5.178101447816742, + "learning_rate": 3.3805766422159164e-06, + "loss": 0.7508, + "step": 11180 + }, + { + "epoch": 0.8077735835425434, + "grad_norm": 5.807196053258109, + "learning_rate": 3.380302879335642e-06, + "loss": 0.8896, + "step": 11181 + }, + { + "epoch": 0.8078458287427529, + "grad_norm": 5.874676634634747, + "learning_rate": 3.3800291044047034e-06, + "loss": 0.8719, + "step": 11182 + }, + { + "epoch": 0.8079180739429624, + "grad_norm": 7.6515543337235625, + "learning_rate": 3.379755317426851e-06, + "loss": 0.756, + "step": 11183 + }, + { + "epoch": 0.8079903191431719, + "grad_norm": 5.379865528964568, + "learning_rate": 3.3794815184058305e-06, + "loss": 0.7991, + "step": 11184 + }, + { + "epoch": 0.8080625643433814, + "grad_norm": 6.7492525605234555, + "learning_rate": 3.379207707345391e-06, + "loss": 0.8335, + "step": 11185 + }, + { + "epoch": 0.808134809543591, + "grad_norm": 7.852296345136606, + "learning_rate": 3.378933884249281e-06, + "loss": 0.9868, + "step": 11186 + }, + { + "epoch": 0.8082070547438005, + "grad_norm": 7.390507574136511, + "learning_rate": 3.378660049121248e-06, + "loss": 0.8746, + "step": 11187 + }, + { + "epoch": 0.8082792999440099, + "grad_norm": 6.385953879562888, + "learning_rate": 3.378386201965042e-06, + "loss": 0.8566, + "step": 11188 + }, + { + "epoch": 0.8083515451442195, + "grad_norm": 6.253558252243996, + "learning_rate": 3.3781123427844097e-06, + "loss": 0.8734, + "step": 11189 + }, + { + "epoch": 0.808423790344429, + "grad_norm": 6.2223610673278404, + "learning_rate": 3.377838471583102e-06, + "loss": 0.814, + "step": 11190 + }, + { + "epoch": 0.8084960355446386, + "grad_norm": 7.187845238396732, + "learning_rate": 3.3775645883648674e-06, + "loss": 0.827, + "step": 11191 + }, + { + "epoch": 0.808568280744848, + "grad_norm": 7.031579853373991, + "learning_rate": 3.377290693133455e-06, + "loss": 0.7874, + "step": 11192 + }, + { + "epoch": 0.8086405259450575, + "grad_norm": 5.910360853406539, + "learning_rate": 3.3770167858926133e-06, + "loss": 0.9004, + "step": 11193 + }, + { + "epoch": 0.8087127711452671, + "grad_norm": 8.458124117463978, + "learning_rate": 3.3767428666460935e-06, + "loss": 0.8542, + "step": 11194 + }, + { + "epoch": 0.8087850163454765, + "grad_norm": 5.254225347535595, + "learning_rate": 3.376468935397645e-06, + "loss": 0.8289, + "step": 11195 + }, + { + "epoch": 0.808857261545686, + "grad_norm": 6.317395417676596, + "learning_rate": 3.3761949921510162e-06, + "loss": 0.8568, + "step": 11196 + }, + { + "epoch": 0.8089295067458956, + "grad_norm": 5.710649934513632, + "learning_rate": 3.3759210369099593e-06, + "loss": 0.7864, + "step": 11197 + }, + { + "epoch": 0.8090017519461051, + "grad_norm": 6.7405300821652165, + "learning_rate": 3.375647069678223e-06, + "loss": 0.7975, + "step": 11198 + }, + { + "epoch": 0.8090739971463146, + "grad_norm": 7.025677587880592, + "learning_rate": 3.3753730904595585e-06, + "loss": 0.9106, + "step": 11199 + }, + { + "epoch": 0.8091462423465241, + "grad_norm": 5.131934150314962, + "learning_rate": 3.3750990992577156e-06, + "loss": 0.8568, + "step": 11200 + }, + { + "epoch": 0.8092184875467336, + "grad_norm": 5.244321385967531, + "learning_rate": 3.3748250960764454e-06, + "loss": 0.8713, + "step": 11201 + }, + { + "epoch": 0.8092907327469431, + "grad_norm": 7.91933663836486, + "learning_rate": 3.3745510809194994e-06, + "loss": 0.7631, + "step": 11202 + }, + { + "epoch": 0.8093629779471526, + "grad_norm": 6.8714063095126425, + "learning_rate": 3.3742770537906283e-06, + "loss": 0.7616, + "step": 11203 + }, + { + "epoch": 0.8094352231473622, + "grad_norm": 7.280145520443127, + "learning_rate": 3.3740030146935827e-06, + "loss": 0.8165, + "step": 11204 + }, + { + "epoch": 0.8095074683475717, + "grad_norm": 6.267692548230999, + "learning_rate": 3.3737289636321146e-06, + "loss": 0.8558, + "step": 11205 + }, + { + "epoch": 0.8095797135477811, + "grad_norm": 6.497953826449066, + "learning_rate": 3.373454900609976e-06, + "loss": 0.8336, + "step": 11206 + }, + { + "epoch": 0.8096519587479907, + "grad_norm": 7.487764423267442, + "learning_rate": 3.3731808256309167e-06, + "loss": 0.8558, + "step": 11207 + }, + { + "epoch": 0.8097242039482002, + "grad_norm": 6.230814976175587, + "learning_rate": 3.372906738698691e-06, + "loss": 0.8213, + "step": 11208 + }, + { + "epoch": 0.8097964491484096, + "grad_norm": 6.502540385389625, + "learning_rate": 3.372632639817049e-06, + "loss": 0.7591, + "step": 11209 + }, + { + "epoch": 0.8098686943486192, + "grad_norm": 5.581994859431328, + "learning_rate": 3.3723585289897445e-06, + "loss": 0.7324, + "step": 11210 + }, + { + "epoch": 0.8099409395488287, + "grad_norm": 6.461573651326252, + "learning_rate": 3.3720844062205287e-06, + "loss": 0.7496, + "step": 11211 + }, + { + "epoch": 0.8100131847490383, + "grad_norm": 5.65638151068628, + "learning_rate": 3.3718102715131547e-06, + "loss": 0.8051, + "step": 11212 + }, + { + "epoch": 0.8100854299492477, + "grad_norm": 6.980074179181408, + "learning_rate": 3.3715361248713746e-06, + "loss": 0.8054, + "step": 11213 + }, + { + "epoch": 0.8101576751494572, + "grad_norm": 6.528661479609126, + "learning_rate": 3.3712619662989413e-06, + "loss": 0.7962, + "step": 11214 + }, + { + "epoch": 0.8102299203496668, + "grad_norm": 5.46264901034334, + "learning_rate": 3.3709877957996096e-06, + "loss": 0.8043, + "step": 11215 + }, + { + "epoch": 0.8103021655498763, + "grad_norm": 8.470882583627604, + "learning_rate": 3.37071361337713e-06, + "loss": 0.8721, + "step": 11216 + }, + { + "epoch": 0.8103744107500858, + "grad_norm": 7.173623131685217, + "learning_rate": 3.370439419035258e-06, + "loss": 0.7935, + "step": 11217 + }, + { + "epoch": 0.8104466559502953, + "grad_norm": 7.282123300465572, + "learning_rate": 3.3701652127777448e-06, + "loss": 0.8223, + "step": 11218 + }, + { + "epoch": 0.8105189011505048, + "grad_norm": 5.161066014084048, + "learning_rate": 3.369890994608347e-06, + "loss": 0.6912, + "step": 11219 + }, + { + "epoch": 0.8105911463507143, + "grad_norm": 6.605274127639937, + "learning_rate": 3.3696167645308166e-06, + "loss": 0.9514, + "step": 11220 + }, + { + "epoch": 0.8106633915509238, + "grad_norm": 5.508061483179925, + "learning_rate": 3.3693425225489074e-06, + "loss": 0.7874, + "step": 11221 + }, + { + "epoch": 0.8107356367511334, + "grad_norm": 7.861571473329835, + "learning_rate": 3.369068268666374e-06, + "loss": 0.8024, + "step": 11222 + }, + { + "epoch": 0.8108078819513429, + "grad_norm": 6.248252624386621, + "learning_rate": 3.3687940028869704e-06, + "loss": 0.7714, + "step": 11223 + }, + { + "epoch": 0.8108801271515523, + "grad_norm": 5.44761296064562, + "learning_rate": 3.368519725214453e-06, + "loss": 0.8005, + "step": 11224 + }, + { + "epoch": 0.8109523723517619, + "grad_norm": 6.650917460318151, + "learning_rate": 3.3682454356525736e-06, + "loss": 0.9325, + "step": 11225 + }, + { + "epoch": 0.8110246175519714, + "grad_norm": 6.701997533202962, + "learning_rate": 3.367971134205089e-06, + "loss": 0.8781, + "step": 11226 + }, + { + "epoch": 0.8110968627521808, + "grad_norm": 6.51382794743679, + "learning_rate": 3.367696820875753e-06, + "loss": 0.7803, + "step": 11227 + }, + { + "epoch": 0.8111691079523904, + "grad_norm": 7.131080308478288, + "learning_rate": 3.3674224956683214e-06, + "loss": 0.875, + "step": 11228 + }, + { + "epoch": 0.8112413531525999, + "grad_norm": 5.942761790118446, + "learning_rate": 3.36714815858655e-06, + "loss": 0.8045, + "step": 11229 + }, + { + "epoch": 0.8113135983528095, + "grad_norm": 7.119649919625778, + "learning_rate": 3.366873809634193e-06, + "loss": 0.7952, + "step": 11230 + }, + { + "epoch": 0.8113858435530189, + "grad_norm": 6.541134012813373, + "learning_rate": 3.3665994488150073e-06, + "loss": 0.8796, + "step": 11231 + }, + { + "epoch": 0.8114580887532284, + "grad_norm": 5.2966076268594104, + "learning_rate": 3.366325076132747e-06, + "loss": 0.7319, + "step": 11232 + }, + { + "epoch": 0.811530333953438, + "grad_norm": 6.73029276240823, + "learning_rate": 3.3660506915911695e-06, + "loss": 0.7501, + "step": 11233 + }, + { + "epoch": 0.8116025791536475, + "grad_norm": 7.001063674765785, + "learning_rate": 3.36577629519403e-06, + "loss": 0.8711, + "step": 11234 + }, + { + "epoch": 0.811674824353857, + "grad_norm": 6.213398379911772, + "learning_rate": 3.365501886945086e-06, + "loss": 0.8098, + "step": 11235 + }, + { + "epoch": 0.8117470695540665, + "grad_norm": 6.29930624396635, + "learning_rate": 3.3652274668480933e-06, + "loss": 0.7675, + "step": 11236 + }, + { + "epoch": 0.811819314754276, + "grad_norm": 5.555323396706279, + "learning_rate": 3.3649530349068073e-06, + "loss": 0.8576, + "step": 11237 + }, + { + "epoch": 0.8118915599544855, + "grad_norm": 6.4156368799931105, + "learning_rate": 3.364678591124987e-06, + "loss": 0.7982, + "step": 11238 + }, + { + "epoch": 0.811963805154695, + "grad_norm": 4.908041851583433, + "learning_rate": 3.364404135506388e-06, + "loss": 0.7528, + "step": 11239 + }, + { + "epoch": 0.8120360503549046, + "grad_norm": 4.902186174633873, + "learning_rate": 3.3641296680547676e-06, + "loss": 0.8001, + "step": 11240 + }, + { + "epoch": 0.8121082955551141, + "grad_norm": 6.146903596781472, + "learning_rate": 3.3638551887738825e-06, + "loss": 0.7777, + "step": 11241 + }, + { + "epoch": 0.8121805407553235, + "grad_norm": 5.818700293176015, + "learning_rate": 3.36358069766749e-06, + "loss": 0.8004, + "step": 11242 + }, + { + "epoch": 0.8122527859555331, + "grad_norm": 7.805273513253666, + "learning_rate": 3.363306194739349e-06, + "loss": 0.7925, + "step": 11243 + }, + { + "epoch": 0.8123250311557426, + "grad_norm": 7.139441060219193, + "learning_rate": 3.3630316799932173e-06, + "loss": 0.8027, + "step": 11244 + }, + { + "epoch": 0.812397276355952, + "grad_norm": 6.779134816018912, + "learning_rate": 3.3627571534328513e-06, + "loss": 0.9045, + "step": 11245 + }, + { + "epoch": 0.8124695215561616, + "grad_norm": 6.964363667816854, + "learning_rate": 3.3624826150620093e-06, + "loss": 0.796, + "step": 11246 + }, + { + "epoch": 0.8125417667563711, + "grad_norm": 6.243061028424973, + "learning_rate": 3.36220806488445e-06, + "loss": 0.8756, + "step": 11247 + }, + { + "epoch": 0.8126140119565807, + "grad_norm": 6.709519117608265, + "learning_rate": 3.3619335029039325e-06, + "loss": 0.8508, + "step": 11248 + }, + { + "epoch": 0.8126862571567901, + "grad_norm": 6.189134786405679, + "learning_rate": 3.361658929124214e-06, + "loss": 0.777, + "step": 11249 + }, + { + "epoch": 0.8127585023569996, + "grad_norm": 8.136347900964372, + "learning_rate": 3.3613843435490546e-06, + "loss": 0.8584, + "step": 11250 + }, + { + "epoch": 0.8128307475572092, + "grad_norm": 6.548341427541798, + "learning_rate": 3.361109746182211e-06, + "loss": 0.9019, + "step": 11251 + }, + { + "epoch": 0.8129029927574187, + "grad_norm": 5.120294411062417, + "learning_rate": 3.360835137027445e-06, + "loss": 0.8202, + "step": 11252 + }, + { + "epoch": 0.8129752379576282, + "grad_norm": 5.923195402349484, + "learning_rate": 3.3605605160885137e-06, + "loss": 0.7361, + "step": 11253 + }, + { + "epoch": 0.8130474831578377, + "grad_norm": 9.472945388628476, + "learning_rate": 3.3602858833691776e-06, + "loss": 0.8898, + "step": 11254 + }, + { + "epoch": 0.8131197283580472, + "grad_norm": 7.085205309470132, + "learning_rate": 3.3600112388731947e-06, + "loss": 0.8119, + "step": 11255 + }, + { + "epoch": 0.8131919735582567, + "grad_norm": 8.00381521803406, + "learning_rate": 3.359736582604326e-06, + "loss": 0.8809, + "step": 11256 + }, + { + "epoch": 0.8132642187584662, + "grad_norm": 6.477913820216984, + "learning_rate": 3.3594619145663316e-06, + "loss": 0.8672, + "step": 11257 + }, + { + "epoch": 0.8133364639586758, + "grad_norm": 4.483457252085093, + "learning_rate": 3.3591872347629705e-06, + "loss": 0.711, + "step": 11258 + }, + { + "epoch": 0.8134087091588853, + "grad_norm": 7.670202877141756, + "learning_rate": 3.3589125431980037e-06, + "loss": 0.8135, + "step": 11259 + }, + { + "epoch": 0.8134809543590947, + "grad_norm": 8.153097076777595, + "learning_rate": 3.3586378398751905e-06, + "loss": 0.8127, + "step": 11260 + }, + { + "epoch": 0.8135531995593043, + "grad_norm": 6.578991033308787, + "learning_rate": 3.358363124798292e-06, + "loss": 0.8254, + "step": 11261 + }, + { + "epoch": 0.8136254447595138, + "grad_norm": 6.37414485207348, + "learning_rate": 3.3580883979710693e-06, + "loss": 0.8749, + "step": 11262 + }, + { + "epoch": 0.8136976899597232, + "grad_norm": 6.157034422185453, + "learning_rate": 3.3578136593972825e-06, + "loss": 0.8423, + "step": 11263 + }, + { + "epoch": 0.8137699351599328, + "grad_norm": 5.057951402568738, + "learning_rate": 3.3575389090806926e-06, + "loss": 0.8191, + "step": 11264 + }, + { + "epoch": 0.8138421803601423, + "grad_norm": 5.895996882278674, + "learning_rate": 3.3572641470250613e-06, + "loss": 0.8218, + "step": 11265 + }, + { + "epoch": 0.8139144255603519, + "grad_norm": 8.241561272863017, + "learning_rate": 3.3569893732341484e-06, + "loss": 0.7856, + "step": 11266 + }, + { + "epoch": 0.8139866707605613, + "grad_norm": 7.126514742272422, + "learning_rate": 3.3567145877117174e-06, + "loss": 0.8109, + "step": 11267 + }, + { + "epoch": 0.8140589159607708, + "grad_norm": 6.326898451251701, + "learning_rate": 3.3564397904615286e-06, + "loss": 0.8569, + "step": 11268 + }, + { + "epoch": 0.8141311611609804, + "grad_norm": 6.490299028228983, + "learning_rate": 3.3561649814873443e-06, + "loss": 0.8171, + "step": 11269 + }, + { + "epoch": 0.8142034063611899, + "grad_norm": 5.878611651671836, + "learning_rate": 3.3558901607929256e-06, + "loss": 0.7776, + "step": 11270 + }, + { + "epoch": 0.8142756515613994, + "grad_norm": 7.793547568181628, + "learning_rate": 3.355615328382036e-06, + "loss": 0.8974, + "step": 11271 + }, + { + "epoch": 0.8143478967616089, + "grad_norm": 5.123844481669878, + "learning_rate": 3.3553404842584363e-06, + "loss": 0.7929, + "step": 11272 + }, + { + "epoch": 0.8144201419618184, + "grad_norm": 6.071423756573475, + "learning_rate": 3.355065628425889e-06, + "loss": 0.8006, + "step": 11273 + }, + { + "epoch": 0.8144923871620279, + "grad_norm": 7.899187514619038, + "learning_rate": 3.354790760888158e-06, + "loss": 0.7992, + "step": 11274 + }, + { + "epoch": 0.8145646323622374, + "grad_norm": 6.7059061465318575, + "learning_rate": 3.354515881649005e-06, + "loss": 0.8997, + "step": 11275 + }, + { + "epoch": 0.814636877562447, + "grad_norm": 5.0179120610096755, + "learning_rate": 3.354240990712193e-06, + "loss": 0.789, + "step": 11276 + }, + { + "epoch": 0.8147091227626565, + "grad_norm": 5.347120148095966, + "learning_rate": 3.353966088081485e-06, + "loss": 0.7797, + "step": 11277 + }, + { + "epoch": 0.8147813679628659, + "grad_norm": 6.543032007623316, + "learning_rate": 3.3536911737606447e-06, + "loss": 0.9313, + "step": 11278 + }, + { + "epoch": 0.8148536131630755, + "grad_norm": 6.489020244075938, + "learning_rate": 3.3534162477534342e-06, + "loss": 0.6895, + "step": 11279 + }, + { + "epoch": 0.814925858363285, + "grad_norm": 5.566318016022186, + "learning_rate": 3.3531413100636184e-06, + "loss": 0.812, + "step": 11280 + }, + { + "epoch": 0.8149981035634944, + "grad_norm": 6.821049356583719, + "learning_rate": 3.352866360694961e-06, + "loss": 0.9714, + "step": 11281 + }, + { + "epoch": 0.815070348763704, + "grad_norm": 5.94187332514423, + "learning_rate": 3.352591399651225e-06, + "loss": 0.7803, + "step": 11282 + }, + { + "epoch": 0.8151425939639135, + "grad_norm": 6.9542186091338145, + "learning_rate": 3.352316426936175e-06, + "loss": 0.9381, + "step": 11283 + }, + { + "epoch": 0.8152148391641231, + "grad_norm": 7.35996148431071, + "learning_rate": 3.352041442553574e-06, + "loss": 0.8092, + "step": 11284 + }, + { + "epoch": 0.8152870843643325, + "grad_norm": 5.245090914248339, + "learning_rate": 3.3517664465071875e-06, + "loss": 0.7674, + "step": 11285 + }, + { + "epoch": 0.815359329564542, + "grad_norm": 4.870029948179681, + "learning_rate": 3.3514914388007804e-06, + "loss": 0.8457, + "step": 11286 + }, + { + "epoch": 0.8154315747647516, + "grad_norm": 5.44222256136872, + "learning_rate": 3.3512164194381157e-06, + "loss": 0.84, + "step": 11287 + }, + { + "epoch": 0.8155038199649611, + "grad_norm": 6.1917139981678275, + "learning_rate": 3.35094138842296e-06, + "loss": 0.84, + "step": 11288 + }, + { + "epoch": 0.8155760651651706, + "grad_norm": 6.5785996363244985, + "learning_rate": 3.3506663457590764e-06, + "loss": 0.8314, + "step": 11289 + }, + { + "epoch": 0.8156483103653801, + "grad_norm": 6.689110962964624, + "learning_rate": 3.3503912914502323e-06, + "loss": 0.7925, + "step": 11290 + }, + { + "epoch": 0.8157205555655896, + "grad_norm": 7.198030138754489, + "learning_rate": 3.3501162255001905e-06, + "loss": 0.8682, + "step": 11291 + }, + { + "epoch": 0.8157928007657991, + "grad_norm": 5.686074885306943, + "learning_rate": 3.349841147912719e-06, + "loss": 0.8238, + "step": 11292 + }, + { + "epoch": 0.8158650459660086, + "grad_norm": 6.9259822878675825, + "learning_rate": 3.3495660586915812e-06, + "loss": 0.7917, + "step": 11293 + }, + { + "epoch": 0.8159372911662182, + "grad_norm": 6.515108010252317, + "learning_rate": 3.3492909578405434e-06, + "loss": 0.777, + "step": 11294 + }, + { + "epoch": 0.8160095363664277, + "grad_norm": 4.8970738996803655, + "learning_rate": 3.3490158453633727e-06, + "loss": 0.7439, + "step": 11295 + }, + { + "epoch": 0.8160817815666371, + "grad_norm": 6.039027126351769, + "learning_rate": 3.3487407212638334e-06, + "loss": 0.7945, + "step": 11296 + }, + { + "epoch": 0.8161540267668467, + "grad_norm": 4.821289952621703, + "learning_rate": 3.348465585545694e-06, + "loss": 0.7613, + "step": 11297 + }, + { + "epoch": 0.8162262719670562, + "grad_norm": 7.105187962600759, + "learning_rate": 3.3481904382127177e-06, + "loss": 0.8573, + "step": 11298 + }, + { + "epoch": 0.8162985171672656, + "grad_norm": 5.938712869542945, + "learning_rate": 3.3479152792686736e-06, + "loss": 0.8111, + "step": 11299 + }, + { + "epoch": 0.8163707623674752, + "grad_norm": 6.273870735838424, + "learning_rate": 3.347640108717328e-06, + "loss": 0.8091, + "step": 11300 + }, + { + "epoch": 0.8164430075676847, + "grad_norm": 6.794316092930495, + "learning_rate": 3.3473649265624476e-06, + "loss": 0.8084, + "step": 11301 + }, + { + "epoch": 0.8165152527678943, + "grad_norm": 6.648541639074969, + "learning_rate": 3.347089732807799e-06, + "loss": 0.8165, + "step": 11302 + }, + { + "epoch": 0.8165874979681037, + "grad_norm": 6.8675782805301155, + "learning_rate": 3.346814527457149e-06, + "loss": 0.7647, + "step": 11303 + }, + { + "epoch": 0.8166597431683132, + "grad_norm": 5.8458189540186805, + "learning_rate": 3.346539310514267e-06, + "loss": 0.7401, + "step": 11304 + }, + { + "epoch": 0.8167319883685228, + "grad_norm": 6.35086847809222, + "learning_rate": 3.346264081982918e-06, + "loss": 0.8217, + "step": 11305 + }, + { + "epoch": 0.8168042335687323, + "grad_norm": 6.387198949837455, + "learning_rate": 3.3459888418668716e-06, + "loss": 0.8561, + "step": 11306 + }, + { + "epoch": 0.8168764787689418, + "grad_norm": 7.6035038349992075, + "learning_rate": 3.345713590169894e-06, + "loss": 0.7938, + "step": 11307 + }, + { + "epoch": 0.8169487239691513, + "grad_norm": 7.148813973607654, + "learning_rate": 3.345438326895755e-06, + "loss": 0.8103, + "step": 11308 + }, + { + "epoch": 0.8170209691693608, + "grad_norm": 6.484740318210815, + "learning_rate": 3.3451630520482203e-06, + "loss": 0.8732, + "step": 11309 + }, + { + "epoch": 0.8170932143695703, + "grad_norm": 5.9481045436637645, + "learning_rate": 3.3448877656310606e-06, + "loss": 0.8769, + "step": 11310 + }, + { + "epoch": 0.8171654595697798, + "grad_norm": 5.036715553866094, + "learning_rate": 3.344612467648044e-06, + "loss": 0.8543, + "step": 11311 + }, + { + "epoch": 0.8172377047699894, + "grad_norm": 5.265061164348526, + "learning_rate": 3.3443371581029376e-06, + "loss": 0.7439, + "step": 11312 + }, + { + "epoch": 0.8173099499701989, + "grad_norm": 6.81914147530355, + "learning_rate": 3.3440618369995115e-06, + "loss": 0.8275, + "step": 11313 + }, + { + "epoch": 0.8173821951704083, + "grad_norm": 6.601159150029898, + "learning_rate": 3.3437865043415337e-06, + "loss": 0.8997, + "step": 11314 + }, + { + "epoch": 0.8174544403706179, + "grad_norm": 5.182644697525677, + "learning_rate": 3.343511160132774e-06, + "loss": 0.7792, + "step": 11315 + }, + { + "epoch": 0.8175266855708274, + "grad_norm": 6.127865179565845, + "learning_rate": 3.343235804377002e-06, + "loss": 0.7674, + "step": 11316 + }, + { + "epoch": 0.8175989307710368, + "grad_norm": 6.627534777284922, + "learning_rate": 3.3429604370779854e-06, + "loss": 0.7202, + "step": 11317 + }, + { + "epoch": 0.8176711759712464, + "grad_norm": 7.767779721029272, + "learning_rate": 3.3426850582394964e-06, + "loss": 0.7436, + "step": 11318 + }, + { + "epoch": 0.8177434211714559, + "grad_norm": 6.906390296581199, + "learning_rate": 3.342409667865303e-06, + "loss": 0.8846, + "step": 11319 + }, + { + "epoch": 0.8178156663716655, + "grad_norm": 5.829938754615554, + "learning_rate": 3.342134265959175e-06, + "loss": 0.8487, + "step": 11320 + }, + { + "epoch": 0.8178879115718749, + "grad_norm": 5.719545069389807, + "learning_rate": 3.3418588525248826e-06, + "loss": 0.8571, + "step": 11321 + }, + { + "epoch": 0.8179601567720844, + "grad_norm": 6.58478851975168, + "learning_rate": 3.341583427566196e-06, + "loss": 0.8773, + "step": 11322 + }, + { + "epoch": 0.818032401972294, + "grad_norm": 8.295525450064979, + "learning_rate": 3.3413079910868856e-06, + "loss": 0.8591, + "step": 11323 + }, + { + "epoch": 0.8181046471725035, + "grad_norm": 5.8172689583295805, + "learning_rate": 3.341032543090723e-06, + "loss": 0.8587, + "step": 11324 + }, + { + "epoch": 0.818176892372713, + "grad_norm": 5.975316455074262, + "learning_rate": 3.340757083581477e-06, + "loss": 0.7819, + "step": 11325 + }, + { + "epoch": 0.8182491375729225, + "grad_norm": 4.913465503034668, + "learning_rate": 3.34048161256292e-06, + "loss": 0.7441, + "step": 11326 + }, + { + "epoch": 0.818321382773132, + "grad_norm": 7.354483823628394, + "learning_rate": 3.340206130038822e-06, + "loss": 0.8677, + "step": 11327 + }, + { + "epoch": 0.8183936279733415, + "grad_norm": 5.2920420155605346, + "learning_rate": 3.3399306360129543e-06, + "loss": 0.8743, + "step": 11328 + }, + { + "epoch": 0.818465873173551, + "grad_norm": 6.726189615190903, + "learning_rate": 3.3396551304890883e-06, + "loss": 0.8536, + "step": 11329 + }, + { + "epoch": 0.8185381183737606, + "grad_norm": 6.252506211379549, + "learning_rate": 3.3393796134709954e-06, + "loss": 0.7993, + "step": 11330 + }, + { + "epoch": 0.8186103635739701, + "grad_norm": 5.506413968055585, + "learning_rate": 3.3391040849624483e-06, + "loss": 0.7562, + "step": 11331 + }, + { + "epoch": 0.8186826087741795, + "grad_norm": 7.920806872948909, + "learning_rate": 3.3388285449672162e-06, + "loss": 0.8834, + "step": 11332 + }, + { + "epoch": 0.8187548539743891, + "grad_norm": 5.819063480181064, + "learning_rate": 3.3385529934890744e-06, + "loss": 0.7967, + "step": 11333 + }, + { + "epoch": 0.8188270991745986, + "grad_norm": 5.545844527686342, + "learning_rate": 3.3382774305317922e-06, + "loss": 0.8035, + "step": 11334 + }, + { + "epoch": 0.818899344374808, + "grad_norm": 5.248410302126054, + "learning_rate": 3.3380018560991435e-06, + "loss": 0.7744, + "step": 11335 + }, + { + "epoch": 0.8189715895750176, + "grad_norm": 6.496076720324942, + "learning_rate": 3.3377262701948994e-06, + "loss": 0.8332, + "step": 11336 + }, + { + "epoch": 0.8190438347752271, + "grad_norm": 5.114495467643882, + "learning_rate": 3.337450672822833e-06, + "loss": 0.8485, + "step": 11337 + }, + { + "epoch": 0.8191160799754367, + "grad_norm": 5.326167040529985, + "learning_rate": 3.3371750639867174e-06, + "loss": 0.8645, + "step": 11338 + }, + { + "epoch": 0.8191883251756461, + "grad_norm": 5.583870458625097, + "learning_rate": 3.336899443690325e-06, + "loss": 0.7246, + "step": 11339 + }, + { + "epoch": 0.8192605703758556, + "grad_norm": 7.460982493319831, + "learning_rate": 3.33662381193743e-06, + "loss": 0.8018, + "step": 11340 + }, + { + "epoch": 0.8193328155760652, + "grad_norm": 7.586335506664448, + "learning_rate": 3.3363481687318034e-06, + "loss": 0.9752, + "step": 11341 + }, + { + "epoch": 0.8194050607762747, + "grad_norm": 5.856458909777653, + "learning_rate": 3.33607251407722e-06, + "loss": 0.8631, + "step": 11342 + }, + { + "epoch": 0.8194773059764842, + "grad_norm": 6.617631088935681, + "learning_rate": 3.335796847977454e-06, + "loss": 0.8517, + "step": 11343 + }, + { + "epoch": 0.8195495511766937, + "grad_norm": 6.63674289099494, + "learning_rate": 3.3355211704362766e-06, + "loss": 0.8455, + "step": 11344 + }, + { + "epoch": 0.8196217963769032, + "grad_norm": 10.813995434316494, + "learning_rate": 3.3352454814574647e-06, + "loss": 0.864, + "step": 11345 + }, + { + "epoch": 0.8196940415771127, + "grad_norm": 7.690083139513314, + "learning_rate": 3.3349697810447888e-06, + "loss": 0.7788, + "step": 11346 + }, + { + "epoch": 0.8197662867773222, + "grad_norm": 7.091126078506805, + "learning_rate": 3.3346940692020267e-06, + "loss": 0.8192, + "step": 11347 + }, + { + "epoch": 0.8198385319775318, + "grad_norm": 7.312063611695033, + "learning_rate": 3.3344183459329498e-06, + "loss": 0.8429, + "step": 11348 + }, + { + "epoch": 0.8199107771777413, + "grad_norm": 6.519639203438913, + "learning_rate": 3.3341426112413346e-06, + "loss": 0.8723, + "step": 11349 + }, + { + "epoch": 0.8199830223779507, + "grad_norm": 6.614148735927895, + "learning_rate": 3.3338668651309538e-06, + "loss": 0.8091, + "step": 11350 + }, + { + "epoch": 0.8200552675781603, + "grad_norm": 4.7384054320211995, + "learning_rate": 3.3335911076055834e-06, + "loss": 0.784, + "step": 11351 + }, + { + "epoch": 0.8201275127783698, + "grad_norm": 5.3756442016641826, + "learning_rate": 3.3333153386689983e-06, + "loss": 0.7946, + "step": 11352 + }, + { + "epoch": 0.8201997579785792, + "grad_norm": 6.634256769565185, + "learning_rate": 3.3330395583249725e-06, + "loss": 0.883, + "step": 11353 + }, + { + "epoch": 0.8202720031787888, + "grad_norm": 6.519997280096879, + "learning_rate": 3.3327637665772828e-06, + "loss": 0.8101, + "step": 11354 + }, + { + "epoch": 0.8203442483789983, + "grad_norm": 8.734407755836388, + "learning_rate": 3.332487963429703e-06, + "loss": 0.851, + "step": 11355 + }, + { + "epoch": 0.8204164935792079, + "grad_norm": 5.139079409438394, + "learning_rate": 3.3322121488860097e-06, + "loss": 0.8581, + "step": 11356 + }, + { + "epoch": 0.8204887387794173, + "grad_norm": 5.777226548105481, + "learning_rate": 3.331936322949978e-06, + "loss": 0.8054, + "step": 11357 + }, + { + "epoch": 0.8205609839796268, + "grad_norm": 9.607086554234165, + "learning_rate": 3.3316604856253843e-06, + "loss": 0.7881, + "step": 11358 + }, + { + "epoch": 0.8206332291798364, + "grad_norm": 8.911362208091903, + "learning_rate": 3.3313846369160045e-06, + "loss": 0.878, + "step": 11359 + }, + { + "epoch": 0.8207054743800459, + "grad_norm": 5.570459564536111, + "learning_rate": 3.331108776825614e-06, + "loss": 0.7213, + "step": 11360 + }, + { + "epoch": 0.8207777195802554, + "grad_norm": 5.631729847302519, + "learning_rate": 3.33083290535799e-06, + "loss": 0.8186, + "step": 11361 + }, + { + "epoch": 0.8208499647804649, + "grad_norm": 6.708597150397269, + "learning_rate": 3.3305570225169087e-06, + "loss": 0.8091, + "step": 11362 + }, + { + "epoch": 0.8209222099806744, + "grad_norm": 7.01257176218016, + "learning_rate": 3.3302811283061463e-06, + "loss": 0.7812, + "step": 11363 + }, + { + "epoch": 0.8209944551808839, + "grad_norm": 8.032481058373246, + "learning_rate": 3.3300052227294804e-06, + "loss": 0.8887, + "step": 11364 + }, + { + "epoch": 0.8210667003810934, + "grad_norm": 6.305073817257372, + "learning_rate": 3.329729305790687e-06, + "loss": 0.7606, + "step": 11365 + }, + { + "epoch": 0.821138945581303, + "grad_norm": 7.555502183887741, + "learning_rate": 3.3294533774935435e-06, + "loss": 0.8236, + "step": 11366 + }, + { + "epoch": 0.8212111907815125, + "grad_norm": 8.124686836663162, + "learning_rate": 3.329177437841828e-06, + "loss": 0.8566, + "step": 11367 + }, + { + "epoch": 0.8212834359817219, + "grad_norm": 6.229945853841499, + "learning_rate": 3.328901486839317e-06, + "loss": 0.866, + "step": 11368 + }, + { + "epoch": 0.8213556811819315, + "grad_norm": 6.39404026539772, + "learning_rate": 3.328625524489788e-06, + "loss": 0.7499, + "step": 11369 + }, + { + "epoch": 0.821427926382141, + "grad_norm": 7.2352405137457625, + "learning_rate": 3.3283495507970185e-06, + "loss": 0.7754, + "step": 11370 + }, + { + "epoch": 0.8215001715823504, + "grad_norm": 7.239740939906146, + "learning_rate": 3.328073565764788e-06, + "loss": 0.8481, + "step": 11371 + }, + { + "epoch": 0.82157241678256, + "grad_norm": 7.239044066911428, + "learning_rate": 3.3277975693968727e-06, + "loss": 0.7812, + "step": 11372 + }, + { + "epoch": 0.8216446619827695, + "grad_norm": 7.170520928903681, + "learning_rate": 3.3275215616970514e-06, + "loss": 0.8122, + "step": 11373 + }, + { + "epoch": 0.8217169071829791, + "grad_norm": 6.966336914484076, + "learning_rate": 3.3272455426691017e-06, + "loss": 0.9223, + "step": 11374 + }, + { + "epoch": 0.8217891523831885, + "grad_norm": 5.09060759239897, + "learning_rate": 3.326969512316804e-06, + "loss": 0.8003, + "step": 11375 + }, + { + "epoch": 0.821861397583398, + "grad_norm": 6.577128935220715, + "learning_rate": 3.3266934706439357e-06, + "loss": 0.7849, + "step": 11376 + }, + { + "epoch": 0.8219336427836076, + "grad_norm": 6.4159585471205425, + "learning_rate": 3.3264174176542754e-06, + "loss": 0.7488, + "step": 11377 + }, + { + "epoch": 0.8220058879838171, + "grad_norm": 5.832597013959227, + "learning_rate": 3.326141353351602e-06, + "loss": 0.812, + "step": 11378 + }, + { + "epoch": 0.8220781331840266, + "grad_norm": 7.64789633156333, + "learning_rate": 3.3258652777396947e-06, + "loss": 0.8093, + "step": 11379 + }, + { + "epoch": 0.8221503783842361, + "grad_norm": 5.410283871464068, + "learning_rate": 3.325589190822334e-06, + "loss": 0.7977, + "step": 11380 + }, + { + "epoch": 0.8222226235844456, + "grad_norm": 5.048932580217818, + "learning_rate": 3.3253130926032977e-06, + "loss": 0.7586, + "step": 11381 + }, + { + "epoch": 0.8222948687846551, + "grad_norm": 5.706404237031079, + "learning_rate": 3.325036983086366e-06, + "loss": 0.8648, + "step": 11382 + }, + { + "epoch": 0.8223671139848646, + "grad_norm": 5.1742032161008895, + "learning_rate": 3.324760862275319e-06, + "loss": 0.8243, + "step": 11383 + }, + { + "epoch": 0.8224393591850742, + "grad_norm": 5.43076319296213, + "learning_rate": 3.3244847301739357e-06, + "loss": 0.7542, + "step": 11384 + }, + { + "epoch": 0.8225116043852837, + "grad_norm": 5.5603254546822445, + "learning_rate": 3.3242085867859965e-06, + "loss": 0.7587, + "step": 11385 + }, + { + "epoch": 0.8225838495854931, + "grad_norm": 5.858137564646153, + "learning_rate": 3.323932432115282e-06, + "loss": 0.8244, + "step": 11386 + }, + { + "epoch": 0.8226560947857027, + "grad_norm": 7.243448388417223, + "learning_rate": 3.3236562661655725e-06, + "loss": 0.8902, + "step": 11387 + }, + { + "epoch": 0.8227283399859122, + "grad_norm": 5.629340637515516, + "learning_rate": 3.3233800889406475e-06, + "loss": 0.7866, + "step": 11388 + }, + { + "epoch": 0.8228005851861216, + "grad_norm": 5.986196215920261, + "learning_rate": 3.323103900444289e-06, + "loss": 0.7757, + "step": 11389 + }, + { + "epoch": 0.8228728303863312, + "grad_norm": 8.149446302456449, + "learning_rate": 3.3228277006802774e-06, + "loss": 0.7958, + "step": 11390 + }, + { + "epoch": 0.8229450755865407, + "grad_norm": 6.89655683550152, + "learning_rate": 3.3225514896523927e-06, + "loss": 0.8654, + "step": 11391 + }, + { + "epoch": 0.8230173207867503, + "grad_norm": 5.09375299558961, + "learning_rate": 3.322275267364418e-06, + "loss": 0.7877, + "step": 11392 + }, + { + "epoch": 0.8230895659869597, + "grad_norm": 6.349511447645346, + "learning_rate": 3.321999033820132e-06, + "loss": 0.8139, + "step": 11393 + }, + { + "epoch": 0.8231618111871692, + "grad_norm": 6.189915041240343, + "learning_rate": 3.321722789023318e-06, + "loss": 0.9134, + "step": 11394 + }, + { + "epoch": 0.8232340563873788, + "grad_norm": 6.329317861144465, + "learning_rate": 3.321446532977757e-06, + "loss": 0.7969, + "step": 11395 + }, + { + "epoch": 0.8233063015875883, + "grad_norm": 6.503469274922677, + "learning_rate": 3.321170265687231e-06, + "loss": 0.8145, + "step": 11396 + }, + { + "epoch": 0.8233785467877978, + "grad_norm": 5.779601815839756, + "learning_rate": 3.3208939871555218e-06, + "loss": 0.7862, + "step": 11397 + }, + { + "epoch": 0.8234507919880073, + "grad_norm": 6.982757039595709, + "learning_rate": 3.3206176973864102e-06, + "loss": 0.879, + "step": 11398 + }, + { + "epoch": 0.8235230371882168, + "grad_norm": 8.051958630095022, + "learning_rate": 3.3203413963836805e-06, + "loss": 0.8713, + "step": 11399 + }, + { + "epoch": 0.8235952823884263, + "grad_norm": 6.4432220213080615, + "learning_rate": 3.320065084151114e-06, + "loss": 0.8132, + "step": 11400 + }, + { + "epoch": 0.8236675275886358, + "grad_norm": 6.513498229038486, + "learning_rate": 3.319788760692493e-06, + "loss": 0.7659, + "step": 11401 + }, + { + "epoch": 0.8237397727888454, + "grad_norm": 5.385007369267514, + "learning_rate": 3.3195124260116002e-06, + "loss": 0.7342, + "step": 11402 + }, + { + "epoch": 0.8238120179890549, + "grad_norm": 6.454338790867343, + "learning_rate": 3.3192360801122187e-06, + "loss": 0.8599, + "step": 11403 + }, + { + "epoch": 0.8238842631892643, + "grad_norm": 7.019784488684686, + "learning_rate": 3.318959722998132e-06, + "loss": 0.8666, + "step": 11404 + }, + { + "epoch": 0.8239565083894739, + "grad_norm": 7.316036314810031, + "learning_rate": 3.318683354673122e-06, + "loss": 0.8089, + "step": 11405 + }, + { + "epoch": 0.8240287535896834, + "grad_norm": 6.045875959642636, + "learning_rate": 3.3184069751409732e-06, + "loss": 0.8353, + "step": 11406 + }, + { + "epoch": 0.8241009987898928, + "grad_norm": 8.870877020139853, + "learning_rate": 3.3181305844054675e-06, + "loss": 0.8995, + "step": 11407 + }, + { + "epoch": 0.8241732439901024, + "grad_norm": 5.431615341769635, + "learning_rate": 3.3178541824703892e-06, + "loss": 0.7225, + "step": 11408 + }, + { + "epoch": 0.8242454891903119, + "grad_norm": 7.104871191028961, + "learning_rate": 3.317577769339523e-06, + "loss": 0.7835, + "step": 11409 + }, + { + "epoch": 0.8243177343905215, + "grad_norm": 6.379297640415421, + "learning_rate": 3.3173013450166515e-06, + "loss": 0.7706, + "step": 11410 + }, + { + "epoch": 0.8243899795907309, + "grad_norm": 5.713126664461386, + "learning_rate": 3.3170249095055595e-06, + "loss": 0.755, + "step": 11411 + }, + { + "epoch": 0.8244622247909404, + "grad_norm": 6.385826342469068, + "learning_rate": 3.31674846281003e-06, + "loss": 0.7898, + "step": 11412 + }, + { + "epoch": 0.82453446999115, + "grad_norm": 6.8038981315975215, + "learning_rate": 3.3164720049338488e-06, + "loss": 0.786, + "step": 11413 + }, + { + "epoch": 0.8246067151913595, + "grad_norm": 4.94214980973071, + "learning_rate": 3.3161955358807996e-06, + "loss": 0.7663, + "step": 11414 + }, + { + "epoch": 0.824678960391569, + "grad_norm": 5.961289461676763, + "learning_rate": 3.3159190556546676e-06, + "loss": 0.7968, + "step": 11415 + }, + { + "epoch": 0.8247512055917785, + "grad_norm": 7.305971801786937, + "learning_rate": 3.3156425642592373e-06, + "loss": 0.8662, + "step": 11416 + }, + { + "epoch": 0.824823450791988, + "grad_norm": 7.339636376960107, + "learning_rate": 3.315366061698293e-06, + "loss": 0.8161, + "step": 11417 + }, + { + "epoch": 0.8248956959921975, + "grad_norm": 8.131297326738231, + "learning_rate": 3.3150895479756207e-06, + "loss": 0.8697, + "step": 11418 + }, + { + "epoch": 0.824967941192407, + "grad_norm": 7.917147518077331, + "learning_rate": 3.3148130230950053e-06, + "loss": 0.8376, + "step": 11419 + }, + { + "epoch": 0.8250401863926166, + "grad_norm": 6.520438997900509, + "learning_rate": 3.314536487060233e-06, + "loss": 0.7796, + "step": 11420 + }, + { + "epoch": 0.8251124315928261, + "grad_norm": 5.521864865460511, + "learning_rate": 3.314259939875088e-06, + "loss": 0.8931, + "step": 11421 + }, + { + "epoch": 0.8251846767930355, + "grad_norm": 6.43703007140224, + "learning_rate": 3.3139833815433565e-06, + "loss": 0.791, + "step": 11422 + }, + { + "epoch": 0.8252569219932451, + "grad_norm": 7.707127272273089, + "learning_rate": 3.3137068120688243e-06, + "loss": 0.7622, + "step": 11423 + }, + { + "epoch": 0.8253291671934546, + "grad_norm": 7.540765220201539, + "learning_rate": 3.3134302314552785e-06, + "loss": 0.7377, + "step": 11424 + }, + { + "epoch": 0.825401412393664, + "grad_norm": 6.074021236238106, + "learning_rate": 3.3131536397065046e-06, + "loss": 0.7955, + "step": 11425 + }, + { + "epoch": 0.8254736575938736, + "grad_norm": 5.137076872318951, + "learning_rate": 3.312877036826288e-06, + "loss": 0.7216, + "step": 11426 + }, + { + "epoch": 0.8255459027940831, + "grad_norm": 5.88234008675461, + "learning_rate": 3.3126004228184163e-06, + "loss": 0.8223, + "step": 11427 + }, + { + "epoch": 0.8256181479942927, + "grad_norm": 7.383456538128971, + "learning_rate": 3.312323797686675e-06, + "loss": 0.8327, + "step": 11428 + }, + { + "epoch": 0.8256903931945021, + "grad_norm": 5.74188887669506, + "learning_rate": 3.3120471614348524e-06, + "loss": 0.7838, + "step": 11429 + }, + { + "epoch": 0.8257626383947116, + "grad_norm": 6.737138150323397, + "learning_rate": 3.3117705140667345e-06, + "loss": 0.7729, + "step": 11430 + }, + { + "epoch": 0.8258348835949212, + "grad_norm": 5.247179636197096, + "learning_rate": 3.311493855586108e-06, + "loss": 0.8495, + "step": 11431 + }, + { + "epoch": 0.8259071287951306, + "grad_norm": 7.578462258925681, + "learning_rate": 3.3112171859967614e-06, + "loss": 0.7664, + "step": 11432 + }, + { + "epoch": 0.8259793739953402, + "grad_norm": 6.099338257963613, + "learning_rate": 3.310940505302481e-06, + "loss": 0.8137, + "step": 11433 + }, + { + "epoch": 0.8260516191955497, + "grad_norm": 7.640487950505921, + "learning_rate": 3.3106638135070558e-06, + "loss": 0.8539, + "step": 11434 + }, + { + "epoch": 0.8261238643957592, + "grad_norm": 6.15308186296538, + "learning_rate": 3.310387110614271e-06, + "loss": 0.7892, + "step": 11435 + }, + { + "epoch": 0.8261961095959687, + "grad_norm": 5.439555338750144, + "learning_rate": 3.3101103966279164e-06, + "loss": 0.7547, + "step": 11436 + }, + { + "epoch": 0.8262683547961782, + "grad_norm": 6.492671650118308, + "learning_rate": 3.309833671551779e-06, + "loss": 0.8263, + "step": 11437 + }, + { + "epoch": 0.8263405999963878, + "grad_norm": 5.660675172468663, + "learning_rate": 3.3095569353896485e-06, + "loss": 0.7068, + "step": 11438 + }, + { + "epoch": 0.8264128451965973, + "grad_norm": 6.352949115440711, + "learning_rate": 3.309280188145312e-06, + "loss": 0.7906, + "step": 11439 + }, + { + "epoch": 0.8264850903968067, + "grad_norm": 7.425784332258697, + "learning_rate": 3.3090034298225576e-06, + "loss": 0.7775, + "step": 11440 + }, + { + "epoch": 0.8265573355970163, + "grad_norm": 6.026253800105108, + "learning_rate": 3.308726660425174e-06, + "loss": 0.7823, + "step": 11441 + }, + { + "epoch": 0.8266295807972258, + "grad_norm": 7.687332523661619, + "learning_rate": 3.308449879956951e-06, + "loss": 0.8376, + "step": 11442 + }, + { + "epoch": 0.8267018259974352, + "grad_norm": 6.375880760236189, + "learning_rate": 3.308173088421677e-06, + "loss": 0.8607, + "step": 11443 + }, + { + "epoch": 0.8267740711976448, + "grad_norm": 7.605065976269895, + "learning_rate": 3.30789628582314e-06, + "loss": 0.9042, + "step": 11444 + }, + { + "epoch": 0.8268463163978543, + "grad_norm": 6.029835589493973, + "learning_rate": 3.307619472165131e-06, + "loss": 0.9088, + "step": 11445 + }, + { + "epoch": 0.8269185615980639, + "grad_norm": 5.5759396158941374, + "learning_rate": 3.307342647451438e-06, + "loss": 0.7395, + "step": 11446 + }, + { + "epoch": 0.8269908067982733, + "grad_norm": 6.549695411181198, + "learning_rate": 3.3070658116858517e-06, + "loss": 0.7701, + "step": 11447 + }, + { + "epoch": 0.8270630519984828, + "grad_norm": 6.349268725482382, + "learning_rate": 3.3067889648721606e-06, + "loss": 0.7561, + "step": 11448 + }, + { + "epoch": 0.8271352971986924, + "grad_norm": 7.109709276997053, + "learning_rate": 3.306512107014155e-06, + "loss": 0.8537, + "step": 11449 + }, + { + "epoch": 0.8272075423989018, + "grad_norm": 9.007088307022853, + "learning_rate": 3.306235238115625e-06, + "loss": 0.7677, + "step": 11450 + }, + { + "epoch": 0.8272797875991114, + "grad_norm": 5.125693669709801, + "learning_rate": 3.30595835818036e-06, + "loss": 0.7584, + "step": 11451 + }, + { + "epoch": 0.8273520327993209, + "grad_norm": 6.122705769505947, + "learning_rate": 3.305681467212152e-06, + "loss": 0.8021, + "step": 11452 + }, + { + "epoch": 0.8274242779995304, + "grad_norm": 6.549769669770898, + "learning_rate": 3.30540456521479e-06, + "loss": 0.8702, + "step": 11453 + }, + { + "epoch": 0.8274965231997399, + "grad_norm": 6.499217353065873, + "learning_rate": 3.3051276521920646e-06, + "loss": 0.8069, + "step": 11454 + }, + { + "epoch": 0.8275687683999494, + "grad_norm": 6.474568427000982, + "learning_rate": 3.304850728147766e-06, + "loss": 0.8687, + "step": 11455 + }, + { + "epoch": 0.827641013600159, + "grad_norm": 6.480573528010706, + "learning_rate": 3.304573793085687e-06, + "loss": 0.7499, + "step": 11456 + }, + { + "epoch": 0.8277132588003685, + "grad_norm": 5.952117914836629, + "learning_rate": 3.3042968470096163e-06, + "loss": 0.8592, + "step": 11457 + }, + { + "epoch": 0.8277855040005779, + "grad_norm": 4.9891218106035495, + "learning_rate": 3.3040198899233477e-06, + "loss": 0.8395, + "step": 11458 + }, + { + "epoch": 0.8278577492007875, + "grad_norm": 5.744775139746551, + "learning_rate": 3.3037429218306704e-06, + "loss": 0.7696, + "step": 11459 + }, + { + "epoch": 0.827929994400997, + "grad_norm": 5.909648583512833, + "learning_rate": 3.3034659427353768e-06, + "loss": 0.7802, + "step": 11460 + }, + { + "epoch": 0.8280022396012064, + "grad_norm": 7.223285257730777, + "learning_rate": 3.303188952641258e-06, + "loss": 0.7946, + "step": 11461 + }, + { + "epoch": 0.828074484801416, + "grad_norm": 8.09807597727333, + "learning_rate": 3.302911951552106e-06, + "loss": 0.8491, + "step": 11462 + }, + { + "epoch": 0.8281467300016255, + "grad_norm": 5.3708217260660955, + "learning_rate": 3.3026349394717132e-06, + "loss": 0.8884, + "step": 11463 + }, + { + "epoch": 0.8282189752018351, + "grad_norm": 5.901358056964846, + "learning_rate": 3.302357916403871e-06, + "loss": 0.8783, + "step": 11464 + }, + { + "epoch": 0.8282912204020445, + "grad_norm": 6.11597260046261, + "learning_rate": 3.3020808823523716e-06, + "loss": 0.8766, + "step": 11465 + }, + { + "epoch": 0.828363465602254, + "grad_norm": 7.149664234733936, + "learning_rate": 3.3018038373210083e-06, + "loss": 0.8096, + "step": 11466 + }, + { + "epoch": 0.8284357108024636, + "grad_norm": 6.768670434035038, + "learning_rate": 3.3015267813135726e-06, + "loss": 0.8222, + "step": 11467 + }, + { + "epoch": 0.828507956002673, + "grad_norm": 6.979863768726469, + "learning_rate": 3.3012497143338584e-06, + "loss": 0.8518, + "step": 11468 + }, + { + "epoch": 0.8285802012028826, + "grad_norm": 7.8237007110549746, + "learning_rate": 3.3009726363856563e-06, + "loss": 0.888, + "step": 11469 + }, + { + "epoch": 0.8286524464030921, + "grad_norm": 6.6958500436013715, + "learning_rate": 3.300695547472762e-06, + "loss": 0.7668, + "step": 11470 + }, + { + "epoch": 0.8287246916033016, + "grad_norm": 7.130756395192457, + "learning_rate": 3.3004184475989665e-06, + "loss": 0.8211, + "step": 11471 + }, + { + "epoch": 0.8287969368035111, + "grad_norm": 7.15298294052976, + "learning_rate": 3.3001413367680645e-06, + "loss": 0.8437, + "step": 11472 + }, + { + "epoch": 0.8288691820037206, + "grad_norm": 6.516021643259015, + "learning_rate": 3.299864214983849e-06, + "loss": 0.7933, + "step": 11473 + }, + { + "epoch": 0.8289414272039302, + "grad_norm": 8.555148121874163, + "learning_rate": 3.2995870822501132e-06, + "loss": 0.7827, + "step": 11474 + }, + { + "epoch": 0.8290136724041397, + "grad_norm": 6.935556500362935, + "learning_rate": 3.2993099385706505e-06, + "loss": 0.8539, + "step": 11475 + }, + { + "epoch": 0.8290859176043491, + "grad_norm": 6.853332499690815, + "learning_rate": 3.2990327839492564e-06, + "loss": 0.8411, + "step": 11476 + }, + { + "epoch": 0.8291581628045587, + "grad_norm": 4.9197636995712655, + "learning_rate": 3.2987556183897235e-06, + "loss": 0.7658, + "step": 11477 + }, + { + "epoch": 0.8292304080047682, + "grad_norm": 5.529544200910616, + "learning_rate": 3.298478441895846e-06, + "loss": 0.7458, + "step": 11478 + }, + { + "epoch": 0.8293026532049776, + "grad_norm": 7.016191013672022, + "learning_rate": 3.298201254471419e-06, + "loss": 0.8148, + "step": 11479 + }, + { + "epoch": 0.8293748984051872, + "grad_norm": 7.806622056363956, + "learning_rate": 3.2979240561202366e-06, + "loss": 0.8909, + "step": 11480 + }, + { + "epoch": 0.8294471436053967, + "grad_norm": 5.16775431516101, + "learning_rate": 3.297646846846093e-06, + "loss": 0.7619, + "step": 11481 + }, + { + "epoch": 0.8295193888056063, + "grad_norm": 5.88619772080402, + "learning_rate": 3.297369626652784e-06, + "loss": 0.8419, + "step": 11482 + }, + { + "epoch": 0.8295916340058157, + "grad_norm": 4.475972561135959, + "learning_rate": 3.2970923955441033e-06, + "loss": 0.8222, + "step": 11483 + }, + { + "epoch": 0.8296638792060252, + "grad_norm": 7.105847500127508, + "learning_rate": 3.296815153523847e-06, + "loss": 0.8349, + "step": 11484 + }, + { + "epoch": 0.8297361244062348, + "grad_norm": 6.879549775140999, + "learning_rate": 3.2965379005958097e-06, + "loss": 0.8247, + "step": 11485 + }, + { + "epoch": 0.8298083696064442, + "grad_norm": 6.795317653391786, + "learning_rate": 3.296260636763788e-06, + "loss": 0.8702, + "step": 11486 + }, + { + "epoch": 0.8298806148066538, + "grad_norm": 6.424216481685876, + "learning_rate": 3.295983362031575e-06, + "loss": 0.84, + "step": 11487 + }, + { + "epoch": 0.8299528600068633, + "grad_norm": 7.571377674353282, + "learning_rate": 3.295706076402968e-06, + "loss": 0.8233, + "step": 11488 + }, + { + "epoch": 0.8300251052070728, + "grad_norm": 6.158340021868813, + "learning_rate": 3.295428779881763e-06, + "loss": 0.8013, + "step": 11489 + }, + { + "epoch": 0.8300973504072823, + "grad_norm": 6.029759988895217, + "learning_rate": 3.2951514724717566e-06, + "loss": 0.8013, + "step": 11490 + }, + { + "epoch": 0.8301695956074918, + "grad_norm": 7.916590024342892, + "learning_rate": 3.2948741541767432e-06, + "loss": 0.8656, + "step": 11491 + }, + { + "epoch": 0.8302418408077014, + "grad_norm": 6.198793219977031, + "learning_rate": 3.2945968250005197e-06, + "loss": 0.9329, + "step": 11492 + }, + { + "epoch": 0.8303140860079109, + "grad_norm": 6.210048395972633, + "learning_rate": 3.294319484946882e-06, + "loss": 0.773, + "step": 11493 + }, + { + "epoch": 0.8303863312081203, + "grad_norm": 5.146362812843263, + "learning_rate": 3.2940421340196278e-06, + "loss": 0.7592, + "step": 11494 + }, + { + "epoch": 0.8304585764083299, + "grad_norm": 8.29839579694379, + "learning_rate": 3.2937647722225535e-06, + "loss": 0.8261, + "step": 11495 + }, + { + "epoch": 0.8305308216085394, + "grad_norm": 8.52888159661409, + "learning_rate": 3.2934873995594555e-06, + "loss": 0.8148, + "step": 11496 + }, + { + "epoch": 0.8306030668087488, + "grad_norm": 6.586334831949761, + "learning_rate": 3.293210016034131e-06, + "loss": 0.83, + "step": 11497 + }, + { + "epoch": 0.8306753120089584, + "grad_norm": 5.596934855709216, + "learning_rate": 3.2929326216503776e-06, + "loss": 0.8686, + "step": 11498 + }, + { + "epoch": 0.8307475572091679, + "grad_norm": 6.030551375806381, + "learning_rate": 3.292655216411992e-06, + "loss": 0.7803, + "step": 11499 + }, + { + "epoch": 0.8308198024093775, + "grad_norm": 8.135243223900053, + "learning_rate": 3.292377800322773e-06, + "loss": 0.8807, + "step": 11500 + }, + { + "epoch": 0.8308920476095869, + "grad_norm": 6.144364865833749, + "learning_rate": 3.2921003733865166e-06, + "loss": 0.7987, + "step": 11501 + }, + { + "epoch": 0.8309642928097964, + "grad_norm": 6.586326723374057, + "learning_rate": 3.29182293560702e-06, + "loss": 0.8224, + "step": 11502 + }, + { + "epoch": 0.831036538010006, + "grad_norm": 6.810062357299141, + "learning_rate": 3.291545486988083e-06, + "loss": 0.7869, + "step": 11503 + }, + { + "epoch": 0.8311087832102154, + "grad_norm": 7.298946503171012, + "learning_rate": 3.2912680275335036e-06, + "loss": 0.8519, + "step": 11504 + }, + { + "epoch": 0.831181028410425, + "grad_norm": 6.961592620950206, + "learning_rate": 3.2909905572470785e-06, + "loss": 0.8845, + "step": 11505 + }, + { + "epoch": 0.8312532736106345, + "grad_norm": 5.07895482252337, + "learning_rate": 3.2907130761326073e-06, + "loss": 0.7546, + "step": 11506 + }, + { + "epoch": 0.831325518810844, + "grad_norm": 5.6028993253689405, + "learning_rate": 3.2904355841938873e-06, + "loss": 0.8311, + "step": 11507 + }, + { + "epoch": 0.8313977640110535, + "grad_norm": 5.113984714549363, + "learning_rate": 3.290158081434718e-06, + "loss": 0.7281, + "step": 11508 + }, + { + "epoch": 0.831470009211263, + "grad_norm": 6.329938011890401, + "learning_rate": 3.2898805678588986e-06, + "loss": 0.815, + "step": 11509 + }, + { + "epoch": 0.8315422544114726, + "grad_norm": 6.585924467569401, + "learning_rate": 3.2896030434702273e-06, + "loss": 0.822, + "step": 11510 + }, + { + "epoch": 0.8316144996116821, + "grad_norm": 9.232537772635677, + "learning_rate": 3.2893255082725034e-06, + "loss": 0.7827, + "step": 11511 + }, + { + "epoch": 0.8316867448118915, + "grad_norm": 5.81509833556955, + "learning_rate": 3.2890479622695258e-06, + "loss": 0.8371, + "step": 11512 + }, + { + "epoch": 0.8317589900121011, + "grad_norm": 5.6302540295275945, + "learning_rate": 3.2887704054650946e-06, + "loss": 0.8429, + "step": 11513 + }, + { + "epoch": 0.8318312352123106, + "grad_norm": 8.26332634329901, + "learning_rate": 3.2884928378630094e-06, + "loss": 0.8338, + "step": 11514 + }, + { + "epoch": 0.83190348041252, + "grad_norm": 6.497907741973054, + "learning_rate": 3.288215259467069e-06, + "loss": 0.8494, + "step": 11515 + }, + { + "epoch": 0.8319757256127296, + "grad_norm": 5.4124542093432355, + "learning_rate": 3.287937670281074e-06, + "loss": 0.7068, + "step": 11516 + }, + { + "epoch": 0.8320479708129391, + "grad_norm": 6.32005839550226, + "learning_rate": 3.2876600703088236e-06, + "loss": 0.868, + "step": 11517 + }, + { + "epoch": 0.8321202160131487, + "grad_norm": 6.242831276936364, + "learning_rate": 3.2873824595541193e-06, + "loss": 0.8102, + "step": 11518 + }, + { + "epoch": 0.8321924612133581, + "grad_norm": 7.287472926722599, + "learning_rate": 3.2871048380207593e-06, + "loss": 0.7913, + "step": 11519 + }, + { + "epoch": 0.8322647064135676, + "grad_norm": 6.717854284861123, + "learning_rate": 3.2868272057125465e-06, + "loss": 0.8224, + "step": 11520 + }, + { + "epoch": 0.8323369516137772, + "grad_norm": 6.613976862781291, + "learning_rate": 3.2865495626332793e-06, + "loss": 0.8045, + "step": 11521 + }, + { + "epoch": 0.8324091968139866, + "grad_norm": 5.878497117719554, + "learning_rate": 3.2862719087867593e-06, + "loss": 0.7492, + "step": 11522 + }, + { + "epoch": 0.8324814420141962, + "grad_norm": 5.821255560133632, + "learning_rate": 3.285994244176787e-06, + "loss": 0.8902, + "step": 11523 + }, + { + "epoch": 0.8325536872144057, + "grad_norm": 6.599289144901713, + "learning_rate": 3.2857165688071653e-06, + "loss": 0.7644, + "step": 11524 + }, + { + "epoch": 0.8326259324146152, + "grad_norm": 6.528778922863969, + "learning_rate": 3.285438882681693e-06, + "loss": 0.7089, + "step": 11525 + }, + { + "epoch": 0.8326981776148247, + "grad_norm": 7.5582800827197385, + "learning_rate": 3.2851611858041714e-06, + "loss": 0.8247, + "step": 11526 + }, + { + "epoch": 0.8327704228150342, + "grad_norm": 5.955029761098294, + "learning_rate": 3.284883478178404e-06, + "loss": 0.8233, + "step": 11527 + }, + { + "epoch": 0.8328426680152438, + "grad_norm": 6.221254697387467, + "learning_rate": 3.2846057598081902e-06, + "loss": 0.7908, + "step": 11528 + }, + { + "epoch": 0.8329149132154533, + "grad_norm": 5.054161645813933, + "learning_rate": 3.284328030697334e-06, + "loss": 0.7594, + "step": 11529 + }, + { + "epoch": 0.8329871584156627, + "grad_norm": 6.292210974316778, + "learning_rate": 3.2840502908496354e-06, + "loss": 0.8054, + "step": 11530 + }, + { + "epoch": 0.8330594036158723, + "grad_norm": 7.106951960884245, + "learning_rate": 3.283772540268897e-06, + "loss": 0.7805, + "step": 11531 + }, + { + "epoch": 0.8331316488160818, + "grad_norm": 7.9073989696724025, + "learning_rate": 3.2834947789589212e-06, + "loss": 0.9024, + "step": 11532 + }, + { + "epoch": 0.8332038940162912, + "grad_norm": 5.370632082431616, + "learning_rate": 3.28321700692351e-06, + "loss": 0.8062, + "step": 11533 + }, + { + "epoch": 0.8332761392165008, + "grad_norm": 7.901639174364121, + "learning_rate": 3.2829392241664665e-06, + "loss": 0.8643, + "step": 11534 + }, + { + "epoch": 0.8333483844167103, + "grad_norm": 6.475756697459186, + "learning_rate": 3.2826614306915926e-06, + "loss": 0.7635, + "step": 11535 + }, + { + "epoch": 0.8334206296169199, + "grad_norm": 4.944892082001054, + "learning_rate": 3.2823836265026914e-06, + "loss": 0.7205, + "step": 11536 + }, + { + "epoch": 0.8334928748171293, + "grad_norm": 5.111875709527112, + "learning_rate": 3.282105811603566e-06, + "loss": 0.7717, + "step": 11537 + }, + { + "epoch": 0.8335651200173388, + "grad_norm": 8.206613376587992, + "learning_rate": 3.2818279859980194e-06, + "loss": 0.7974, + "step": 11538 + }, + { + "epoch": 0.8336373652175484, + "grad_norm": 6.48241245950657, + "learning_rate": 3.2815501496898545e-06, + "loss": 0.8463, + "step": 11539 + }, + { + "epoch": 0.8337096104177578, + "grad_norm": 6.535339243459333, + "learning_rate": 3.2812723026828745e-06, + "loss": 0.8187, + "step": 11540 + }, + { + "epoch": 0.8337818556179674, + "grad_norm": 6.517750199069515, + "learning_rate": 3.280994444980884e-06, + "loss": 0.8316, + "step": 11541 + }, + { + "epoch": 0.8338541008181769, + "grad_norm": 6.15904273187375, + "learning_rate": 3.2807165765876856e-06, + "loss": 0.8753, + "step": 11542 + }, + { + "epoch": 0.8339263460183864, + "grad_norm": 7.8353664384074015, + "learning_rate": 3.280438697507084e-06, + "loss": 0.8784, + "step": 11543 + }, + { + "epoch": 0.8339985912185959, + "grad_norm": 7.129195467164104, + "learning_rate": 3.280160807742882e-06, + "loss": 0.8384, + "step": 11544 + }, + { + "epoch": 0.8340708364188054, + "grad_norm": 5.695009409266227, + "learning_rate": 3.279882907298884e-06, + "loss": 0.8428, + "step": 11545 + }, + { + "epoch": 0.834143081619015, + "grad_norm": 7.430294104360534, + "learning_rate": 3.279604996178895e-06, + "loss": 0.8645, + "step": 11546 + }, + { + "epoch": 0.8342153268192245, + "grad_norm": 5.1828638524480555, + "learning_rate": 3.2793270743867195e-06, + "loss": 0.7696, + "step": 11547 + }, + { + "epoch": 0.8342875720194339, + "grad_norm": 6.195264164277868, + "learning_rate": 3.2790491419261604e-06, + "loss": 0.813, + "step": 11548 + }, + { + "epoch": 0.8343598172196435, + "grad_norm": 6.584222500143543, + "learning_rate": 3.2787711988010244e-06, + "loss": 0.8239, + "step": 11549 + }, + { + "epoch": 0.834432062419853, + "grad_norm": 7.236429602442995, + "learning_rate": 3.278493245015115e-06, + "loss": 0.8006, + "step": 11550 + }, + { + "epoch": 0.8345043076200624, + "grad_norm": 5.40031239347721, + "learning_rate": 3.2782152805722374e-06, + "loss": 0.7628, + "step": 11551 + }, + { + "epoch": 0.834576552820272, + "grad_norm": 7.918677653499308, + "learning_rate": 3.277937305476197e-06, + "loss": 0.8147, + "step": 11552 + }, + { + "epoch": 0.8346487980204815, + "grad_norm": 7.380883924691244, + "learning_rate": 3.2776593197307983e-06, + "loss": 0.9229, + "step": 11553 + }, + { + "epoch": 0.8347210432206911, + "grad_norm": 8.21206172429749, + "learning_rate": 3.2773813233398477e-06, + "loss": 0.8647, + "step": 11554 + }, + { + "epoch": 0.8347932884209005, + "grad_norm": 7.1789436162511056, + "learning_rate": 3.2771033163071508e-06, + "loss": 0.8669, + "step": 11555 + }, + { + "epoch": 0.83486553362111, + "grad_norm": 7.991396330662164, + "learning_rate": 3.2768252986365124e-06, + "loss": 0.8435, + "step": 11556 + }, + { + "epoch": 0.8349377788213196, + "grad_norm": 7.091321621968038, + "learning_rate": 3.2765472703317397e-06, + "loss": 0.8232, + "step": 11557 + }, + { + "epoch": 0.835010024021529, + "grad_norm": 6.3470678813857, + "learning_rate": 3.276269231396637e-06, + "loss": 0.7398, + "step": 11558 + }, + { + "epoch": 0.8350822692217386, + "grad_norm": 7.120176857884138, + "learning_rate": 3.275991181835011e-06, + "loss": 0.8881, + "step": 11559 + }, + { + "epoch": 0.8351545144219481, + "grad_norm": 5.914544956377931, + "learning_rate": 3.275713121650669e-06, + "loss": 0.7927, + "step": 11560 + }, + { + "epoch": 0.8352267596221576, + "grad_norm": 5.146540707708772, + "learning_rate": 3.2754350508474165e-06, + "loss": 0.7465, + "step": 11561 + }, + { + "epoch": 0.8352990048223671, + "grad_norm": 5.723672717960402, + "learning_rate": 3.2751569694290595e-06, + "loss": 0.8673, + "step": 11562 + }, + { + "epoch": 0.8353712500225766, + "grad_norm": 9.141742970493878, + "learning_rate": 3.2748788773994066e-06, + "loss": 0.8792, + "step": 11563 + }, + { + "epoch": 0.8354434952227862, + "grad_norm": 5.7658185228573915, + "learning_rate": 3.2746007747622623e-06, + "loss": 0.8922, + "step": 11564 + }, + { + "epoch": 0.8355157404229957, + "grad_norm": 5.712996126162596, + "learning_rate": 3.2743226615214353e-06, + "loss": 0.8799, + "step": 11565 + }, + { + "epoch": 0.8355879856232051, + "grad_norm": 7.2665106777048925, + "learning_rate": 3.274044537680733e-06, + "loss": 0.7749, + "step": 11566 + }, + { + "epoch": 0.8356602308234147, + "grad_norm": 5.715361711831572, + "learning_rate": 3.2737664032439613e-06, + "loss": 0.7872, + "step": 11567 + }, + { + "epoch": 0.8357324760236242, + "grad_norm": 6.252763671196284, + "learning_rate": 3.2734882582149287e-06, + "loss": 0.7866, + "step": 11568 + }, + { + "epoch": 0.8358047212238336, + "grad_norm": 6.431395367609716, + "learning_rate": 3.273210102597442e-06, + "loss": 0.7927, + "step": 11569 + }, + { + "epoch": 0.8358769664240432, + "grad_norm": 6.046671546056448, + "learning_rate": 3.2729319363953093e-06, + "loss": 0.7249, + "step": 11570 + }, + { + "epoch": 0.8359492116242527, + "grad_norm": 6.2134619231008585, + "learning_rate": 3.272653759612339e-06, + "loss": 0.7491, + "step": 11571 + }, + { + "epoch": 0.8360214568244623, + "grad_norm": 5.716322086429418, + "learning_rate": 3.272375572252339e-06, + "loss": 0.7713, + "step": 11572 + }, + { + "epoch": 0.8360937020246717, + "grad_norm": 5.909131190022645, + "learning_rate": 3.272097374319116e-06, + "loss": 0.7934, + "step": 11573 + }, + { + "epoch": 0.8361659472248812, + "grad_norm": 6.503228779468355, + "learning_rate": 3.2718191658164797e-06, + "loss": 0.8186, + "step": 11574 + }, + { + "epoch": 0.8362381924250908, + "grad_norm": 6.937551687237072, + "learning_rate": 3.271540946748239e-06, + "loss": 0.8134, + "step": 11575 + }, + { + "epoch": 0.8363104376253002, + "grad_norm": 5.875697642359061, + "learning_rate": 3.271262717118201e-06, + "loss": 0.8263, + "step": 11576 + }, + { + "epoch": 0.8363826828255098, + "grad_norm": 6.0410591960116395, + "learning_rate": 3.2709844769301757e-06, + "loss": 0.8234, + "step": 11577 + }, + { + "epoch": 0.8364549280257193, + "grad_norm": 7.055796454748383, + "learning_rate": 3.2707062261879712e-06, + "loss": 0.8886, + "step": 11578 + }, + { + "epoch": 0.8365271732259288, + "grad_norm": 5.071199454153681, + "learning_rate": 3.2704279648953975e-06, + "loss": 0.8318, + "step": 11579 + }, + { + "epoch": 0.8365994184261383, + "grad_norm": 5.3046520833109865, + "learning_rate": 3.2701496930562625e-06, + "loss": 0.8593, + "step": 11580 + }, + { + "epoch": 0.8366716636263478, + "grad_norm": 7.039719020698259, + "learning_rate": 3.269871410674377e-06, + "loss": 0.811, + "step": 11581 + }, + { + "epoch": 0.8367439088265574, + "grad_norm": 6.321662824599231, + "learning_rate": 3.269593117753549e-06, + "loss": 0.9437, + "step": 11582 + }, + { + "epoch": 0.8368161540267669, + "grad_norm": 6.900843925282147, + "learning_rate": 3.2693148142975884e-06, + "loss": 0.8931, + "step": 11583 + }, + { + "epoch": 0.8368883992269763, + "grad_norm": 6.21200548502624, + "learning_rate": 3.2690365003103065e-06, + "loss": 0.7997, + "step": 11584 + }, + { + "epoch": 0.8369606444271859, + "grad_norm": 6.077063303502391, + "learning_rate": 3.2687581757955113e-06, + "loss": 0.7806, + "step": 11585 + }, + { + "epoch": 0.8370328896273954, + "grad_norm": 5.876189740425212, + "learning_rate": 3.268479840757014e-06, + "loss": 0.775, + "step": 11586 + }, + { + "epoch": 0.8371051348276048, + "grad_norm": 6.730631130604303, + "learning_rate": 3.2682014951986236e-06, + "loss": 0.8075, + "step": 11587 + }, + { + "epoch": 0.8371773800278144, + "grad_norm": 5.596382075956563, + "learning_rate": 3.2679231391241516e-06, + "loss": 0.7684, + "step": 11588 + }, + { + "epoch": 0.8372496252280239, + "grad_norm": 6.287513456842117, + "learning_rate": 3.2676447725374077e-06, + "loss": 0.8405, + "step": 11589 + }, + { + "epoch": 0.8373218704282335, + "grad_norm": 6.444228128478615, + "learning_rate": 3.267366395442204e-06, + "loss": 0.7527, + "step": 11590 + }, + { + "epoch": 0.8373941156284429, + "grad_norm": 5.459701987716705, + "learning_rate": 3.2670880078423495e-06, + "loss": 0.7485, + "step": 11591 + }, + { + "epoch": 0.8374663608286524, + "grad_norm": 5.583470964396064, + "learning_rate": 3.266809609741655e-06, + "loss": 0.7833, + "step": 11592 + }, + { + "epoch": 0.837538606028862, + "grad_norm": 7.664781449219974, + "learning_rate": 3.2665312011439337e-06, + "loss": 0.8433, + "step": 11593 + }, + { + "epoch": 0.8376108512290714, + "grad_norm": 5.199275611893231, + "learning_rate": 3.266252782052994e-06, + "loss": 0.8576, + "step": 11594 + }, + { + "epoch": 0.837683096429281, + "grad_norm": 5.399132602397941, + "learning_rate": 3.2659743524726506e-06, + "loss": 0.8157, + "step": 11595 + }, + { + "epoch": 0.8377553416294905, + "grad_norm": 6.238487932941212, + "learning_rate": 3.2656959124067117e-06, + "loss": 0.8295, + "step": 11596 + }, + { + "epoch": 0.8378275868297, + "grad_norm": 7.805768584409826, + "learning_rate": 3.26541746185899e-06, + "loss": 0.7925, + "step": 11597 + }, + { + "epoch": 0.8378998320299095, + "grad_norm": 5.6044658973344434, + "learning_rate": 3.265139000833298e-06, + "loss": 0.8061, + "step": 11598 + }, + { + "epoch": 0.837972077230119, + "grad_norm": 6.042529691245222, + "learning_rate": 3.264860529333448e-06, + "loss": 0.8264, + "step": 11599 + }, + { + "epoch": 0.8380443224303286, + "grad_norm": 7.164284805625964, + "learning_rate": 3.2645820473632508e-06, + "loss": 0.8212, + "step": 11600 + }, + { + "epoch": 0.8381165676305381, + "grad_norm": 7.441129167659631, + "learning_rate": 3.2643035549265183e-06, + "loss": 0.7826, + "step": 11601 + }, + { + "epoch": 0.8381888128307475, + "grad_norm": 6.804679651617173, + "learning_rate": 3.264025052027064e-06, + "loss": 0.8012, + "step": 11602 + }, + { + "epoch": 0.8382610580309571, + "grad_norm": 6.456840720022465, + "learning_rate": 3.2637465386686993e-06, + "loss": 0.8893, + "step": 11603 + }, + { + "epoch": 0.8383333032311666, + "grad_norm": 5.769761005390619, + "learning_rate": 3.263468014855239e-06, + "loss": 0.8209, + "step": 11604 + }, + { + "epoch": 0.838405548431376, + "grad_norm": 6.723303666712089, + "learning_rate": 3.263189480590493e-06, + "loss": 0.813, + "step": 11605 + }, + { + "epoch": 0.8384777936315856, + "grad_norm": 8.857271782424561, + "learning_rate": 3.2629109358782763e-06, + "loss": 0.873, + "step": 11606 + }, + { + "epoch": 0.8385500388317951, + "grad_norm": 7.016658579382989, + "learning_rate": 3.2626323807224014e-06, + "loss": 0.7405, + "step": 11607 + }, + { + "epoch": 0.8386222840320047, + "grad_norm": 7.626984134860135, + "learning_rate": 3.2623538151266803e-06, + "loss": 0.9326, + "step": 11608 + }, + { + "epoch": 0.8386945292322141, + "grad_norm": 6.8819852541848965, + "learning_rate": 3.2620752390949284e-06, + "loss": 0.865, + "step": 11609 + }, + { + "epoch": 0.8387667744324236, + "grad_norm": 6.968841962677963, + "learning_rate": 3.261796652630958e-06, + "loss": 0.8184, + "step": 11610 + }, + { + "epoch": 0.8388390196326332, + "grad_norm": 7.771827231928745, + "learning_rate": 3.2615180557385826e-06, + "loss": 0.8434, + "step": 11611 + }, + { + "epoch": 0.8389112648328426, + "grad_norm": 6.995258360229963, + "learning_rate": 3.2612394484216163e-06, + "loss": 0.8235, + "step": 11612 + }, + { + "epoch": 0.8389835100330522, + "grad_norm": 7.939186457698763, + "learning_rate": 3.2609608306838734e-06, + "loss": 0.7352, + "step": 11613 + }, + { + "epoch": 0.8390557552332617, + "grad_norm": 6.019598740800266, + "learning_rate": 3.2606822025291673e-06, + "loss": 0.7951, + "step": 11614 + }, + { + "epoch": 0.8391280004334712, + "grad_norm": 6.926448232653508, + "learning_rate": 3.260403563961313e-06, + "loss": 0.7761, + "step": 11615 + }, + { + "epoch": 0.8392002456336807, + "grad_norm": 7.005468956921905, + "learning_rate": 3.2601249149841243e-06, + "loss": 0.7521, + "step": 11616 + }, + { + "epoch": 0.8392724908338902, + "grad_norm": 6.9267792216687685, + "learning_rate": 3.259846255601415e-06, + "loss": 0.8, + "step": 11617 + }, + { + "epoch": 0.8393447360340998, + "grad_norm": 6.7069058370445624, + "learning_rate": 3.2595675858170007e-06, + "loss": 0.7933, + "step": 11618 + }, + { + "epoch": 0.8394169812343093, + "grad_norm": 6.262649552621104, + "learning_rate": 3.259288905634696e-06, + "loss": 0.8941, + "step": 11619 + }, + { + "epoch": 0.8394892264345187, + "grad_norm": 7.189888996652152, + "learning_rate": 3.2590102150583156e-06, + "loss": 0.8544, + "step": 11620 + }, + { + "epoch": 0.8395614716347283, + "grad_norm": 6.952058105910723, + "learning_rate": 3.2587315140916744e-06, + "loss": 0.8114, + "step": 11621 + }, + { + "epoch": 0.8396337168349378, + "grad_norm": 7.438754448765159, + "learning_rate": 3.2584528027385885e-06, + "loss": 0.8239, + "step": 11622 + }, + { + "epoch": 0.8397059620351472, + "grad_norm": 5.296428830420823, + "learning_rate": 3.2581740810028726e-06, + "loss": 0.806, + "step": 11623 + }, + { + "epoch": 0.8397782072353568, + "grad_norm": 5.390149966331725, + "learning_rate": 3.2578953488883426e-06, + "loss": 0.7966, + "step": 11624 + }, + { + "epoch": 0.8398504524355663, + "grad_norm": 6.45533755151911, + "learning_rate": 3.2576166063988126e-06, + "loss": 0.7744, + "step": 11625 + }, + { + "epoch": 0.8399226976357759, + "grad_norm": 5.062044099601326, + "learning_rate": 3.2573378535381002e-06, + "loss": 0.8584, + "step": 11626 + }, + { + "epoch": 0.8399949428359853, + "grad_norm": 6.581487308292023, + "learning_rate": 3.2570590903100206e-06, + "loss": 0.8672, + "step": 11627 + }, + { + "epoch": 0.8400671880361948, + "grad_norm": 7.271823265504761, + "learning_rate": 3.25678031671839e-06, + "loss": 0.8339, + "step": 11628 + }, + { + "epoch": 0.8401394332364044, + "grad_norm": 5.96657374787393, + "learning_rate": 3.256501532767024e-06, + "loss": 0.8826, + "step": 11629 + }, + { + "epoch": 0.8402116784366138, + "grad_norm": 5.489620297890798, + "learning_rate": 3.25622273845974e-06, + "loss": 0.8067, + "step": 11630 + }, + { + "epoch": 0.8402839236368234, + "grad_norm": 5.804581693811943, + "learning_rate": 3.255943933800353e-06, + "loss": 0.7998, + "step": 11631 + }, + { + "epoch": 0.8403561688370329, + "grad_norm": 5.183361010871089, + "learning_rate": 3.2556651187926813e-06, + "loss": 0.8616, + "step": 11632 + }, + { + "epoch": 0.8404284140372424, + "grad_norm": 5.3465465344435845, + "learning_rate": 3.2553862934405405e-06, + "loss": 0.7519, + "step": 11633 + }, + { + "epoch": 0.8405006592374519, + "grad_norm": 6.81875882799397, + "learning_rate": 3.2551074577477482e-06, + "loss": 0.7523, + "step": 11634 + }, + { + "epoch": 0.8405729044376614, + "grad_norm": 7.884377361090114, + "learning_rate": 3.2548286117181203e-06, + "loss": 0.8021, + "step": 11635 + }, + { + "epoch": 0.840645149637871, + "grad_norm": 5.702997577236624, + "learning_rate": 3.2545497553554757e-06, + "loss": 0.8478, + "step": 11636 + }, + { + "epoch": 0.8407173948380804, + "grad_norm": 6.412012997105419, + "learning_rate": 3.2542708886636306e-06, + "loss": 0.7689, + "step": 11637 + }, + { + "epoch": 0.8407896400382899, + "grad_norm": 6.650041290384471, + "learning_rate": 3.2539920116464026e-06, + "loss": 0.8308, + "step": 11638 + }, + { + "epoch": 0.8408618852384995, + "grad_norm": 6.688487327634971, + "learning_rate": 3.2537131243076094e-06, + "loss": 0.8993, + "step": 11639 + }, + { + "epoch": 0.840934130438709, + "grad_norm": 4.528003871606041, + "learning_rate": 3.2534342266510684e-06, + "loss": 0.7351, + "step": 11640 + }, + { + "epoch": 0.8410063756389184, + "grad_norm": 6.4249292985759165, + "learning_rate": 3.2531553186805985e-06, + "loss": 0.8561, + "step": 11641 + }, + { + "epoch": 0.841078620839128, + "grad_norm": 6.715330025295318, + "learning_rate": 3.252876400400016e-06, + "loss": 0.816, + "step": 11642 + }, + { + "epoch": 0.8411508660393375, + "grad_norm": 5.852914081119489, + "learning_rate": 3.2525974718131413e-06, + "loss": 0.7914, + "step": 11643 + }, + { + "epoch": 0.8412231112395471, + "grad_norm": 6.600797385395713, + "learning_rate": 3.2523185329237916e-06, + "loss": 0.7818, + "step": 11644 + }, + { + "epoch": 0.8412953564397565, + "grad_norm": 6.697235436648206, + "learning_rate": 3.252039583735784e-06, + "loss": 0.8399, + "step": 11645 + }, + { + "epoch": 0.841367601639966, + "grad_norm": 6.058890930010031, + "learning_rate": 3.25176062425294e-06, + "loss": 0.7463, + "step": 11646 + }, + { + "epoch": 0.8414398468401756, + "grad_norm": 5.627382918113897, + "learning_rate": 3.251481654479076e-06, + "loss": 0.847, + "step": 11647 + }, + { + "epoch": 0.841512092040385, + "grad_norm": 6.983956889837689, + "learning_rate": 3.251202674418012e-06, + "loss": 0.911, + "step": 11648 + }, + { + "epoch": 0.8415843372405946, + "grad_norm": 6.215149110093286, + "learning_rate": 3.2509236840735657e-06, + "loss": 0.8401, + "step": 11649 + }, + { + "epoch": 0.8416565824408041, + "grad_norm": 5.882873453784485, + "learning_rate": 3.250644683449558e-06, + "loss": 0.8055, + "step": 11650 + }, + { + "epoch": 0.8417288276410136, + "grad_norm": 5.7126933058872424, + "learning_rate": 3.250365672549807e-06, + "loss": 0.7502, + "step": 11651 + }, + { + "epoch": 0.8418010728412231, + "grad_norm": 6.625468687357026, + "learning_rate": 3.2500866513781333e-06, + "loss": 0.8519, + "step": 11652 + }, + { + "epoch": 0.8418733180414326, + "grad_norm": 6.599826417432081, + "learning_rate": 3.2498076199383554e-06, + "loss": 0.8531, + "step": 11653 + }, + { + "epoch": 0.8419455632416422, + "grad_norm": 7.123190666820516, + "learning_rate": 3.249528578234293e-06, + "loss": 0.8396, + "step": 11654 + }, + { + "epoch": 0.8420178084418516, + "grad_norm": 5.7420217477140145, + "learning_rate": 3.2492495262697665e-06, + "loss": 0.7699, + "step": 11655 + }, + { + "epoch": 0.8420900536420611, + "grad_norm": 6.906760080410323, + "learning_rate": 3.2489704640485957e-06, + "loss": 0.8548, + "step": 11656 + }, + { + "epoch": 0.8421622988422707, + "grad_norm": 7.027839884862605, + "learning_rate": 3.2486913915746014e-06, + "loss": 0.8013, + "step": 11657 + }, + { + "epoch": 0.8422345440424802, + "grad_norm": 4.717608383219391, + "learning_rate": 3.248412308851603e-06, + "loss": 0.8035, + "step": 11658 + }, + { + "epoch": 0.8423067892426896, + "grad_norm": 7.1738618911708745, + "learning_rate": 3.2481332158834204e-06, + "loss": 0.802, + "step": 11659 + }, + { + "epoch": 0.8423790344428992, + "grad_norm": 6.638348357445165, + "learning_rate": 3.2478541126738755e-06, + "loss": 0.905, + "step": 11660 + }, + { + "epoch": 0.8424512796431087, + "grad_norm": 5.180966962899329, + "learning_rate": 3.247574999226789e-06, + "loss": 0.8547, + "step": 11661 + }, + { + "epoch": 0.8425235248433183, + "grad_norm": 6.311227500242283, + "learning_rate": 3.2472958755459803e-06, + "loss": 0.8299, + "step": 11662 + }, + { + "epoch": 0.8425957700435277, + "grad_norm": 6.41193535830349, + "learning_rate": 3.2470167416352714e-06, + "loss": 0.85, + "step": 11663 + }, + { + "epoch": 0.8426680152437372, + "grad_norm": 6.208753976930067, + "learning_rate": 3.2467375974984845e-06, + "loss": 0.7287, + "step": 11664 + }, + { + "epoch": 0.8427402604439468, + "grad_norm": 5.893147138585756, + "learning_rate": 3.2464584431394384e-06, + "loss": 0.7652, + "step": 11665 + }, + { + "epoch": 0.8428125056441562, + "grad_norm": 6.327909489482295, + "learning_rate": 3.2461792785619568e-06, + "loss": 0.7552, + "step": 11666 + }, + { + "epoch": 0.8428847508443658, + "grad_norm": 7.53716631351582, + "learning_rate": 3.2459001037698595e-06, + "loss": 0.7991, + "step": 11667 + }, + { + "epoch": 0.8429569960445753, + "grad_norm": 6.789579136969959, + "learning_rate": 3.2456209187669686e-06, + "loss": 0.7851, + "step": 11668 + }, + { + "epoch": 0.8430292412447848, + "grad_norm": 7.161913367409256, + "learning_rate": 3.2453417235571066e-06, + "loss": 0.8827, + "step": 11669 + }, + { + "epoch": 0.8431014864449943, + "grad_norm": 9.084912177476507, + "learning_rate": 3.245062518144096e-06, + "loss": 0.9136, + "step": 11670 + }, + { + "epoch": 0.8431737316452038, + "grad_norm": 6.648459016471499, + "learning_rate": 3.244783302531757e-06, + "loss": 0.742, + "step": 11671 + }, + { + "epoch": 0.8432459768454134, + "grad_norm": 6.087519664066835, + "learning_rate": 3.2445040767239133e-06, + "loss": 0.8171, + "step": 11672 + }, + { + "epoch": 0.8433182220456228, + "grad_norm": 7.196291381653345, + "learning_rate": 3.244224840724387e-06, + "loss": 0.8727, + "step": 11673 + }, + { + "epoch": 0.8433904672458323, + "grad_norm": 5.937690169651937, + "learning_rate": 3.2439455945370002e-06, + "loss": 0.8295, + "step": 11674 + }, + { + "epoch": 0.8434627124460419, + "grad_norm": 7.521967188639079, + "learning_rate": 3.2436663381655763e-06, + "loss": 0.9085, + "step": 11675 + }, + { + "epoch": 0.8435349576462514, + "grad_norm": 5.979568343897886, + "learning_rate": 3.243387071613937e-06, + "loss": 0.8551, + "step": 11676 + }, + { + "epoch": 0.8436072028464608, + "grad_norm": 5.689682761736301, + "learning_rate": 3.243107794885906e-06, + "loss": 0.7917, + "step": 11677 + }, + { + "epoch": 0.8436794480466704, + "grad_norm": 6.79649586990378, + "learning_rate": 3.2428285079853063e-06, + "loss": 0.8582, + "step": 11678 + }, + { + "epoch": 0.8437516932468799, + "grad_norm": 6.5043907008014115, + "learning_rate": 3.2425492109159614e-06, + "loss": 0.8355, + "step": 11679 + }, + { + "epoch": 0.8438239384470895, + "grad_norm": 7.260173368459454, + "learning_rate": 3.242269903681694e-06, + "loss": 0.83, + "step": 11680 + }, + { + "epoch": 0.8438961836472989, + "grad_norm": 6.281517212910727, + "learning_rate": 3.241990586286329e-06, + "loss": 0.8123, + "step": 11681 + }, + { + "epoch": 0.8439684288475084, + "grad_norm": 5.994011751730456, + "learning_rate": 3.2417112587336874e-06, + "loss": 0.8966, + "step": 11682 + }, + { + "epoch": 0.844040674047718, + "grad_norm": 6.546943463898434, + "learning_rate": 3.241431921027595e-06, + "loss": 0.8307, + "step": 11683 + }, + { + "epoch": 0.8441129192479274, + "grad_norm": 6.661455151932717, + "learning_rate": 3.2411525731718763e-06, + "loss": 0.8247, + "step": 11684 + }, + { + "epoch": 0.844185164448137, + "grad_norm": 6.240363359386834, + "learning_rate": 3.2408732151703533e-06, + "loss": 0.8511, + "step": 11685 + }, + { + "epoch": 0.8442574096483465, + "grad_norm": 8.410666968062417, + "learning_rate": 3.2405938470268515e-06, + "loss": 0.8687, + "step": 11686 + }, + { + "epoch": 0.844329654848556, + "grad_norm": 7.188279350155303, + "learning_rate": 3.2403144687451947e-06, + "loss": 0.7789, + "step": 11687 + }, + { + "epoch": 0.8444019000487655, + "grad_norm": 7.535190422016216, + "learning_rate": 3.240035080329208e-06, + "loss": 0.7997, + "step": 11688 + }, + { + "epoch": 0.844474145248975, + "grad_norm": 6.245286003020317, + "learning_rate": 3.2397556817827164e-06, + "loss": 0.7332, + "step": 11689 + }, + { + "epoch": 0.8445463904491846, + "grad_norm": 6.404153024745385, + "learning_rate": 3.2394762731095433e-06, + "loss": 0.7529, + "step": 11690 + }, + { + "epoch": 0.844618635649394, + "grad_norm": 6.294372212660582, + "learning_rate": 3.2391968543135132e-06, + "loss": 0.817, + "step": 11691 + }, + { + "epoch": 0.8446908808496035, + "grad_norm": 6.158996588987065, + "learning_rate": 3.238917425398453e-06, + "loss": 0.9345, + "step": 11692 + }, + { + "epoch": 0.8447631260498131, + "grad_norm": 6.173363460250297, + "learning_rate": 3.238637986368187e-06, + "loss": 0.8587, + "step": 11693 + }, + { + "epoch": 0.8448353712500226, + "grad_norm": 6.983808319520148, + "learning_rate": 3.2383585372265403e-06, + "loss": 0.7946, + "step": 11694 + }, + { + "epoch": 0.844907616450232, + "grad_norm": 5.76106127074969, + "learning_rate": 3.238079077977339e-06, + "loss": 0.7399, + "step": 11695 + }, + { + "epoch": 0.8449798616504416, + "grad_norm": 8.641996140362426, + "learning_rate": 3.2377996086244077e-06, + "loss": 0.8426, + "step": 11696 + }, + { + "epoch": 0.8450521068506511, + "grad_norm": 6.380986712457608, + "learning_rate": 3.2375201291715724e-06, + "loss": 0.7465, + "step": 11697 + }, + { + "epoch": 0.8451243520508607, + "grad_norm": 6.307744718409044, + "learning_rate": 3.2372406396226597e-06, + "loss": 0.8029, + "step": 11698 + }, + { + "epoch": 0.8451965972510701, + "grad_norm": 5.48433639643198, + "learning_rate": 3.236961139981495e-06, + "loss": 0.8225, + "step": 11699 + }, + { + "epoch": 0.8452688424512796, + "grad_norm": 5.939314073203761, + "learning_rate": 3.2366816302519046e-06, + "loss": 0.883, + "step": 11700 + }, + { + "epoch": 0.8453410876514892, + "grad_norm": 5.604209965913217, + "learning_rate": 3.2364021104377135e-06, + "loss": 0.778, + "step": 11701 + }, + { + "epoch": 0.8454133328516986, + "grad_norm": 7.466475519503804, + "learning_rate": 3.236122580542751e-06, + "loss": 0.7767, + "step": 11702 + }, + { + "epoch": 0.8454855780519082, + "grad_norm": 6.822373820445376, + "learning_rate": 3.2358430405708408e-06, + "loss": 0.788, + "step": 11703 + }, + { + "epoch": 0.8455578232521177, + "grad_norm": 5.032944103795383, + "learning_rate": 3.2355634905258117e-06, + "loss": 0.8681, + "step": 11704 + }, + { + "epoch": 0.8456300684523272, + "grad_norm": 5.68435481373546, + "learning_rate": 3.2352839304114887e-06, + "loss": 0.7869, + "step": 11705 + }, + { + "epoch": 0.8457023136525367, + "grad_norm": 6.42379516704452, + "learning_rate": 3.2350043602316996e-06, + "loss": 0.7986, + "step": 11706 + }, + { + "epoch": 0.8457745588527462, + "grad_norm": 5.840235015111831, + "learning_rate": 3.234724779990272e-06, + "loss": 0.8376, + "step": 11707 + }, + { + "epoch": 0.8458468040529558, + "grad_norm": 7.412310088685583, + "learning_rate": 3.234445189691032e-06, + "loss": 0.8416, + "step": 11708 + }, + { + "epoch": 0.8459190492531652, + "grad_norm": 6.21002658905873, + "learning_rate": 3.234165589337809e-06, + "loss": 0.8348, + "step": 11709 + }, + { + "epoch": 0.8459912944533747, + "grad_norm": 6.177018043554704, + "learning_rate": 3.233885978934428e-06, + "loss": 0.8176, + "step": 11710 + }, + { + "epoch": 0.8460635396535843, + "grad_norm": 7.910822695690775, + "learning_rate": 3.233606358484717e-06, + "loss": 0.8103, + "step": 11711 + }, + { + "epoch": 0.8461357848537938, + "grad_norm": 6.261341638090571, + "learning_rate": 3.233326727992506e-06, + "loss": 0.867, + "step": 11712 + }, + { + "epoch": 0.8462080300540032, + "grad_norm": 7.353660357141425, + "learning_rate": 3.233047087461621e-06, + "loss": 0.7917, + "step": 11713 + }, + { + "epoch": 0.8462802752542128, + "grad_norm": 6.570473323128358, + "learning_rate": 3.2327674368958905e-06, + "loss": 0.8808, + "step": 11714 + }, + { + "epoch": 0.8463525204544223, + "grad_norm": 8.861569840832114, + "learning_rate": 3.232487776299143e-06, + "loss": 0.7705, + "step": 11715 + }, + { + "epoch": 0.8464247656546319, + "grad_norm": 6.976680331714198, + "learning_rate": 3.2322081056752058e-06, + "loss": 0.7995, + "step": 11716 + }, + { + "epoch": 0.8464970108548413, + "grad_norm": 7.197422774318954, + "learning_rate": 3.231928425027909e-06, + "loss": 0.7511, + "step": 11717 + }, + { + "epoch": 0.8465692560550508, + "grad_norm": 5.466324622610767, + "learning_rate": 3.2316487343610805e-06, + "loss": 0.7874, + "step": 11718 + }, + { + "epoch": 0.8466415012552604, + "grad_norm": 7.210677748741197, + "learning_rate": 3.2313690336785482e-06, + "loss": 0.8816, + "step": 11719 + }, + { + "epoch": 0.8467137464554698, + "grad_norm": 6.060716504333332, + "learning_rate": 3.2310893229841416e-06, + "loss": 0.8462, + "step": 11720 + }, + { + "epoch": 0.8467859916556794, + "grad_norm": 6.208276564916719, + "learning_rate": 3.2308096022816896e-06, + "loss": 0.8431, + "step": 11721 + }, + { + "epoch": 0.8468582368558889, + "grad_norm": 7.973569123386947, + "learning_rate": 3.2305298715750226e-06, + "loss": 0.8598, + "step": 11722 + }, + { + "epoch": 0.8469304820560984, + "grad_norm": 6.278825434285342, + "learning_rate": 3.230250130867969e-06, + "loss": 0.8542, + "step": 11723 + }, + { + "epoch": 0.8470027272563079, + "grad_norm": 5.845967081306792, + "learning_rate": 3.229970380164357e-06, + "loss": 0.8103, + "step": 11724 + }, + { + "epoch": 0.8470749724565174, + "grad_norm": 6.444646035618563, + "learning_rate": 3.2296906194680176e-06, + "loss": 0.8217, + "step": 11725 + }, + { + "epoch": 0.847147217656727, + "grad_norm": 5.13549400204299, + "learning_rate": 3.2294108487827807e-06, + "loss": 0.7905, + "step": 11726 + }, + { + "epoch": 0.8472194628569364, + "grad_norm": 5.84035291206372, + "learning_rate": 3.2291310681124756e-06, + "loss": 0.7593, + "step": 11727 + }, + { + "epoch": 0.8472917080571459, + "grad_norm": 5.352429218136028, + "learning_rate": 3.228851277460932e-06, + "loss": 0.7756, + "step": 11728 + }, + { + "epoch": 0.8473639532573555, + "grad_norm": 7.181139445304635, + "learning_rate": 3.228571476831981e-06, + "loss": 0.8346, + "step": 11729 + }, + { + "epoch": 0.847436198457565, + "grad_norm": 5.5913994821462465, + "learning_rate": 3.228291666229451e-06, + "loss": 0.8155, + "step": 11730 + }, + { + "epoch": 0.8475084436577744, + "grad_norm": 6.285794108986263, + "learning_rate": 3.2280118456571743e-06, + "loss": 0.7788, + "step": 11731 + }, + { + "epoch": 0.847580688857984, + "grad_norm": 6.836091795133689, + "learning_rate": 3.2277320151189804e-06, + "loss": 0.8714, + "step": 11732 + }, + { + "epoch": 0.8476529340581935, + "grad_norm": 6.028413570096128, + "learning_rate": 3.2274521746187004e-06, + "loss": 0.8928, + "step": 11733 + }, + { + "epoch": 0.8477251792584031, + "grad_norm": 7.69818538735309, + "learning_rate": 3.227172324160165e-06, + "loss": 0.8296, + "step": 11734 + }, + { + "epoch": 0.8477974244586125, + "grad_norm": 6.33113715927022, + "learning_rate": 3.226892463747205e-06, + "loss": 0.843, + "step": 11735 + }, + { + "epoch": 0.847869669658822, + "grad_norm": 5.5368661763170115, + "learning_rate": 3.2266125933836517e-06, + "loss": 0.8055, + "step": 11736 + }, + { + "epoch": 0.8479419148590316, + "grad_norm": 8.629936243133368, + "learning_rate": 3.2263327130733364e-06, + "loss": 0.8303, + "step": 11737 + }, + { + "epoch": 0.848014160059241, + "grad_norm": 6.111343730005644, + "learning_rate": 3.2260528228200898e-06, + "loss": 0.8592, + "step": 11738 + }, + { + "epoch": 0.8480864052594506, + "grad_norm": 5.939450234811036, + "learning_rate": 3.225772922627744e-06, + "loss": 0.7409, + "step": 11739 + }, + { + "epoch": 0.8481586504596601, + "grad_norm": 6.805977314792686, + "learning_rate": 3.22549301250013e-06, + "loss": 0.8383, + "step": 11740 + }, + { + "epoch": 0.8482308956598696, + "grad_norm": 5.612104766455093, + "learning_rate": 3.2252130924410807e-06, + "loss": 0.8064, + "step": 11741 + }, + { + "epoch": 0.8483031408600791, + "grad_norm": 7.148497801145936, + "learning_rate": 3.224933162454427e-06, + "loss": 0.7755, + "step": 11742 + }, + { + "epoch": 0.8483753860602886, + "grad_norm": 6.034629388916295, + "learning_rate": 3.2246532225440007e-06, + "loss": 0.8026, + "step": 11743 + }, + { + "epoch": 0.8484476312604982, + "grad_norm": 6.2657714217811815, + "learning_rate": 3.2243732727136346e-06, + "loss": 0.8491, + "step": 11744 + }, + { + "epoch": 0.8485198764607076, + "grad_norm": 6.588047255308935, + "learning_rate": 3.2240933129671613e-06, + "loss": 0.7798, + "step": 11745 + }, + { + "epoch": 0.8485921216609171, + "grad_norm": 5.603975465294704, + "learning_rate": 3.2238133433084125e-06, + "loss": 0.8465, + "step": 11746 + }, + { + "epoch": 0.8486643668611267, + "grad_norm": 8.244209338066524, + "learning_rate": 3.2235333637412213e-06, + "loss": 0.8596, + "step": 11747 + }, + { + "epoch": 0.8487366120613362, + "grad_norm": 8.272946437710718, + "learning_rate": 3.2232533742694193e-06, + "loss": 0.7418, + "step": 11748 + }, + { + "epoch": 0.8488088572615456, + "grad_norm": 6.259013276258711, + "learning_rate": 3.2229733748968407e-06, + "loss": 0.88, + "step": 11749 + }, + { + "epoch": 0.8488811024617552, + "grad_norm": 6.2196446067620155, + "learning_rate": 3.2226933656273186e-06, + "loss": 0.7556, + "step": 11750 + }, + { + "epoch": 0.8489533476619647, + "grad_norm": 6.361975735572226, + "learning_rate": 3.2224133464646846e-06, + "loss": 0.7689, + "step": 11751 + }, + { + "epoch": 0.8490255928621743, + "grad_norm": 8.338967313432292, + "learning_rate": 3.2221333174127732e-06, + "loss": 0.806, + "step": 11752 + }, + { + "epoch": 0.8490978380623837, + "grad_norm": 8.681544176559305, + "learning_rate": 3.2218532784754177e-06, + "loss": 0.7933, + "step": 11753 + }, + { + "epoch": 0.8491700832625932, + "grad_norm": 5.850390898857408, + "learning_rate": 3.22157322965645e-06, + "loss": 0.7988, + "step": 11754 + }, + { + "epoch": 0.8492423284628028, + "grad_norm": 5.130467173699697, + "learning_rate": 3.221293170959706e-06, + "loss": 0.9204, + "step": 11755 + }, + { + "epoch": 0.8493145736630122, + "grad_norm": 6.809005541011076, + "learning_rate": 3.221013102389019e-06, + "loss": 0.7734, + "step": 11756 + }, + { + "epoch": 0.8493868188632218, + "grad_norm": 6.215981945684227, + "learning_rate": 3.220733023948222e-06, + "loss": 0.8094, + "step": 11757 + }, + { + "epoch": 0.8494590640634313, + "grad_norm": 4.881845216690771, + "learning_rate": 3.2204529356411484e-06, + "loss": 0.8224, + "step": 11758 + }, + { + "epoch": 0.8495313092636408, + "grad_norm": 5.285141273133344, + "learning_rate": 3.2201728374716353e-06, + "loss": 0.7461, + "step": 11759 + }, + { + "epoch": 0.8496035544638503, + "grad_norm": 6.675304665916943, + "learning_rate": 3.219892729443514e-06, + "loss": 0.8215, + "step": 11760 + }, + { + "epoch": 0.8496757996640598, + "grad_norm": 5.768709014383283, + "learning_rate": 3.2196126115606208e-06, + "loss": 0.7984, + "step": 11761 + }, + { + "epoch": 0.8497480448642694, + "grad_norm": 5.49149393013566, + "learning_rate": 3.2193324838267893e-06, + "loss": 0.7521, + "step": 11762 + }, + { + "epoch": 0.8498202900644788, + "grad_norm": 5.402684441648002, + "learning_rate": 3.219052346245855e-06, + "loss": 0.7472, + "step": 11763 + }, + { + "epoch": 0.8498925352646883, + "grad_norm": 7.35683285487315, + "learning_rate": 3.2187721988216526e-06, + "loss": 0.8016, + "step": 11764 + }, + { + "epoch": 0.8499647804648979, + "grad_norm": 5.384990899123468, + "learning_rate": 3.218492041558016e-06, + "loss": 0.7845, + "step": 11765 + }, + { + "epoch": 0.8500370256651074, + "grad_norm": 5.34211250781398, + "learning_rate": 3.218211874458782e-06, + "loss": 0.8541, + "step": 11766 + }, + { + "epoch": 0.8501092708653168, + "grad_norm": 7.630211721860027, + "learning_rate": 3.217931697527785e-06, + "loss": 0.7766, + "step": 11767 + }, + { + "epoch": 0.8501815160655264, + "grad_norm": 5.631378456877584, + "learning_rate": 3.21765151076886e-06, + "loss": 0.8412, + "step": 11768 + }, + { + "epoch": 0.8502537612657359, + "grad_norm": 5.177231518513448, + "learning_rate": 3.217371314185843e-06, + "loss": 0.7644, + "step": 11769 + }, + { + "epoch": 0.8503260064659455, + "grad_norm": 6.38619640209131, + "learning_rate": 3.2170911077825705e-06, + "loss": 0.8778, + "step": 11770 + }, + { + "epoch": 0.8503982516661549, + "grad_norm": 5.296813424351718, + "learning_rate": 3.2168108915628776e-06, + "loss": 0.7983, + "step": 11771 + }, + { + "epoch": 0.8504704968663644, + "grad_norm": 7.900568552925331, + "learning_rate": 3.2165306655305994e-06, + "loss": 0.8997, + "step": 11772 + }, + { + "epoch": 0.850542742066574, + "grad_norm": 6.450260293129348, + "learning_rate": 3.216250429689573e-06, + "loss": 0.7782, + "step": 11773 + }, + { + "epoch": 0.8506149872667834, + "grad_norm": 7.346792297027599, + "learning_rate": 3.215970184043634e-06, + "loss": 0.7214, + "step": 11774 + }, + { + "epoch": 0.850687232466993, + "grad_norm": 5.83287693236412, + "learning_rate": 3.2156899285966202e-06, + "loss": 0.7359, + "step": 11775 + }, + { + "epoch": 0.8507594776672025, + "grad_norm": 5.50389169511028, + "learning_rate": 3.215409663352366e-06, + "loss": 0.7934, + "step": 11776 + }, + { + "epoch": 0.850831722867412, + "grad_norm": 6.1139963140105325, + "learning_rate": 3.215129388314709e-06, + "loss": 0.8224, + "step": 11777 + }, + { + "epoch": 0.8509039680676215, + "grad_norm": 6.6731559006269485, + "learning_rate": 3.214849103487486e-06, + "loss": 0.771, + "step": 11778 + }, + { + "epoch": 0.850976213267831, + "grad_norm": 6.652412275318672, + "learning_rate": 3.214568808874534e-06, + "loss": 0.6574, + "step": 11779 + }, + { + "epoch": 0.8510484584680406, + "grad_norm": 7.322672493030502, + "learning_rate": 3.2142885044796905e-06, + "loss": 0.8225, + "step": 11780 + }, + { + "epoch": 0.85112070366825, + "grad_norm": 7.451612914945306, + "learning_rate": 3.214008190306791e-06, + "loss": 0.8451, + "step": 11781 + }, + { + "epoch": 0.8511929488684595, + "grad_norm": 7.670922742740776, + "learning_rate": 3.213727866359674e-06, + "loss": 0.82, + "step": 11782 + }, + { + "epoch": 0.8512651940686691, + "grad_norm": 5.959891731251683, + "learning_rate": 3.2134475326421764e-06, + "loss": 0.7642, + "step": 11783 + }, + { + "epoch": 0.8513374392688786, + "grad_norm": 7.259308988194674, + "learning_rate": 3.2131671891581367e-06, + "loss": 0.9033, + "step": 11784 + }, + { + "epoch": 0.851409684469088, + "grad_norm": 6.5827802916707014, + "learning_rate": 3.2128868359113918e-06, + "loss": 0.8549, + "step": 11785 + }, + { + "epoch": 0.8514819296692976, + "grad_norm": 11.79521860784428, + "learning_rate": 3.2126064729057795e-06, + "loss": 0.8422, + "step": 11786 + }, + { + "epoch": 0.8515541748695071, + "grad_norm": 5.419380251653479, + "learning_rate": 3.2123261001451374e-06, + "loss": 0.7304, + "step": 11787 + }, + { + "epoch": 0.8516264200697167, + "grad_norm": 5.223897439811575, + "learning_rate": 3.2120457176333046e-06, + "loss": 0.7566, + "step": 11788 + }, + { + "epoch": 0.8516986652699261, + "grad_norm": 7.660584013266639, + "learning_rate": 3.2117653253741186e-06, + "loss": 0.8733, + "step": 11789 + }, + { + "epoch": 0.8517709104701356, + "grad_norm": 6.726662026974931, + "learning_rate": 3.2114849233714186e-06, + "loss": 0.7144, + "step": 11790 + }, + { + "epoch": 0.8518431556703452, + "grad_norm": 6.839664441628132, + "learning_rate": 3.211204511629041e-06, + "loss": 0.8141, + "step": 11791 + }, + { + "epoch": 0.8519154008705546, + "grad_norm": 7.154596795771135, + "learning_rate": 3.210924090150827e-06, + "loss": 0.7293, + "step": 11792 + }, + { + "epoch": 0.8519876460707642, + "grad_norm": 7.848092507084499, + "learning_rate": 3.2106436589406144e-06, + "loss": 0.8429, + "step": 11793 + }, + { + "epoch": 0.8520598912709737, + "grad_norm": 6.08093134171617, + "learning_rate": 3.210363218002241e-06, + "loss": 0.7883, + "step": 11794 + }, + { + "epoch": 0.8521321364711832, + "grad_norm": 5.042720631967462, + "learning_rate": 3.2100827673395474e-06, + "loss": 0.7595, + "step": 11795 + }, + { + "epoch": 0.8522043816713927, + "grad_norm": 7.883836662671395, + "learning_rate": 3.2098023069563716e-06, + "loss": 0.8881, + "step": 11796 + }, + { + "epoch": 0.8522766268716022, + "grad_norm": 5.301841185710497, + "learning_rate": 3.2095218368565535e-06, + "loss": 0.7328, + "step": 11797 + }, + { + "epoch": 0.8523488720718118, + "grad_norm": 5.104397557025316, + "learning_rate": 3.2092413570439327e-06, + "loss": 0.7303, + "step": 11798 + }, + { + "epoch": 0.8524211172720212, + "grad_norm": 7.006742499921421, + "learning_rate": 3.2089608675223476e-06, + "loss": 0.8151, + "step": 11799 + }, + { + "epoch": 0.8524933624722307, + "grad_norm": 5.986332904259919, + "learning_rate": 3.2086803682956393e-06, + "loss": 0.813, + "step": 11800 + }, + { + "epoch": 0.8525656076724403, + "grad_norm": 9.898700328248985, + "learning_rate": 3.2083998593676467e-06, + "loss": 0.8025, + "step": 11801 + }, + { + "epoch": 0.8526378528726498, + "grad_norm": 5.247019512101701, + "learning_rate": 3.2081193407422106e-06, + "loss": 0.8719, + "step": 11802 + }, + { + "epoch": 0.8527100980728592, + "grad_norm": 7.407899310768629, + "learning_rate": 3.2078388124231702e-06, + "loss": 0.8054, + "step": 11803 + }, + { + "epoch": 0.8527823432730688, + "grad_norm": 5.855068078561499, + "learning_rate": 3.2075582744143664e-06, + "loss": 0.8124, + "step": 11804 + }, + { + "epoch": 0.8528545884732783, + "grad_norm": 7.970169479501827, + "learning_rate": 3.207277726719639e-06, + "loss": 0.8305, + "step": 11805 + }, + { + "epoch": 0.8529268336734879, + "grad_norm": 7.316033968438347, + "learning_rate": 3.2069971693428283e-06, + "loss": 0.862, + "step": 11806 + }, + { + "epoch": 0.8529990788736973, + "grad_norm": 8.523575345471047, + "learning_rate": 3.2067166022877757e-06, + "loss": 0.886, + "step": 11807 + }, + { + "epoch": 0.8530713240739068, + "grad_norm": 6.674625160465181, + "learning_rate": 3.206436025558321e-06, + "loss": 0.8015, + "step": 11808 + }, + { + "epoch": 0.8531435692741164, + "grad_norm": 5.215584897297987, + "learning_rate": 3.206155439158306e-06, + "loss": 0.7482, + "step": 11809 + }, + { + "epoch": 0.8532158144743258, + "grad_norm": 6.493917406878033, + "learning_rate": 3.2058748430915715e-06, + "loss": 0.8118, + "step": 11810 + }, + { + "epoch": 0.8532880596745354, + "grad_norm": 6.761269098820144, + "learning_rate": 3.205594237361958e-06, + "loss": 0.7938, + "step": 11811 + }, + { + "epoch": 0.8533603048747449, + "grad_norm": 5.425028386898748, + "learning_rate": 3.2053136219733076e-06, + "loss": 0.7691, + "step": 11812 + }, + { + "epoch": 0.8534325500749544, + "grad_norm": 7.2715761812630495, + "learning_rate": 3.205032996929462e-06, + "loss": 0.7974, + "step": 11813 + }, + { + "epoch": 0.8535047952751639, + "grad_norm": 6.344243974667701, + "learning_rate": 3.2047523622342614e-06, + "loss": 0.8043, + "step": 11814 + }, + { + "epoch": 0.8535770404753734, + "grad_norm": 6.817792883674856, + "learning_rate": 3.2044717178915473e-06, + "loss": 0.7964, + "step": 11815 + }, + { + "epoch": 0.853649285675583, + "grad_norm": 6.223707513796351, + "learning_rate": 3.2041910639051636e-06, + "loss": 0.8915, + "step": 11816 + }, + { + "epoch": 0.8537215308757924, + "grad_norm": 6.012011901965514, + "learning_rate": 3.203910400278951e-06, + "loss": 0.7916, + "step": 11817 + }, + { + "epoch": 0.8537937760760019, + "grad_norm": 5.476483615192835, + "learning_rate": 3.2036297270167514e-06, + "loss": 0.8908, + "step": 11818 + }, + { + "epoch": 0.8538660212762115, + "grad_norm": 5.353511705922498, + "learning_rate": 3.2033490441224068e-06, + "loss": 0.8038, + "step": 11819 + }, + { + "epoch": 0.853938266476421, + "grad_norm": 7.488729018791809, + "learning_rate": 3.2030683515997603e-06, + "loss": 0.8453, + "step": 11820 + }, + { + "epoch": 0.8540105116766304, + "grad_norm": 6.7350835936299704, + "learning_rate": 3.202787649452654e-06, + "loss": 0.7741, + "step": 11821 + }, + { + "epoch": 0.85408275687684, + "grad_norm": 7.337395438857842, + "learning_rate": 3.20250693768493e-06, + "loss": 0.8094, + "step": 11822 + }, + { + "epoch": 0.8541550020770495, + "grad_norm": 6.24930843341854, + "learning_rate": 3.202226216300432e-06, + "loss": 0.7921, + "step": 11823 + }, + { + "epoch": 0.8542272472772591, + "grad_norm": 6.125078784669484, + "learning_rate": 3.201945485303002e-06, + "loss": 0.8807, + "step": 11824 + }, + { + "epoch": 0.8542994924774685, + "grad_norm": 5.954931750927388, + "learning_rate": 3.201664744696483e-06, + "loss": 0.7781, + "step": 11825 + }, + { + "epoch": 0.854371737677678, + "grad_norm": 6.219165577253505, + "learning_rate": 3.2013839944847185e-06, + "loss": 0.8766, + "step": 11826 + }, + { + "epoch": 0.8544439828778876, + "grad_norm": 6.010418429669189, + "learning_rate": 3.2011032346715525e-06, + "loss": 0.7877, + "step": 11827 + }, + { + "epoch": 0.854516228078097, + "grad_norm": 6.5613553456711955, + "learning_rate": 3.2008224652608273e-06, + "loss": 0.8273, + "step": 11828 + }, + { + "epoch": 0.8545884732783066, + "grad_norm": 5.234517109777997, + "learning_rate": 3.2005416862563858e-06, + "loss": 0.7794, + "step": 11829 + }, + { + "epoch": 0.8546607184785161, + "grad_norm": 6.0484282792853055, + "learning_rate": 3.200260897662074e-06, + "loss": 0.7987, + "step": 11830 + }, + { + "epoch": 0.8547329636787256, + "grad_norm": 7.877904689300396, + "learning_rate": 3.1999800994817332e-06, + "loss": 0.7378, + "step": 11831 + }, + { + "epoch": 0.8548052088789351, + "grad_norm": 6.905690442369962, + "learning_rate": 3.1996992917192094e-06, + "loss": 0.7411, + "step": 11832 + }, + { + "epoch": 0.8548774540791446, + "grad_norm": 6.528712897811542, + "learning_rate": 3.199418474378344e-06, + "loss": 0.8274, + "step": 11833 + }, + { + "epoch": 0.8549496992793542, + "grad_norm": 9.75105671169176, + "learning_rate": 3.199137647462984e-06, + "loss": 0.6573, + "step": 11834 + }, + { + "epoch": 0.8550219444795636, + "grad_norm": 5.907463796716954, + "learning_rate": 3.198856810976972e-06, + "loss": 0.7886, + "step": 11835 + }, + { + "epoch": 0.8550941896797731, + "grad_norm": 7.635429378844153, + "learning_rate": 3.1985759649241534e-06, + "loss": 0.8621, + "step": 11836 + }, + { + "epoch": 0.8551664348799827, + "grad_norm": 6.770354487775121, + "learning_rate": 3.1982951093083715e-06, + "loss": 0.8115, + "step": 11837 + }, + { + "epoch": 0.8552386800801922, + "grad_norm": 5.477208685743371, + "learning_rate": 3.198014244133472e-06, + "loss": 0.761, + "step": 11838 + }, + { + "epoch": 0.8553109252804016, + "grad_norm": 6.5752784931106065, + "learning_rate": 3.197733369403299e-06, + "loss": 0.819, + "step": 11839 + }, + { + "epoch": 0.8553831704806112, + "grad_norm": 7.382928756525136, + "learning_rate": 3.1974524851216985e-06, + "loss": 0.8628, + "step": 11840 + }, + { + "epoch": 0.8554554156808207, + "grad_norm": 8.002532081432443, + "learning_rate": 3.1971715912925157e-06, + "loss": 0.8658, + "step": 11841 + }, + { + "epoch": 0.8555276608810303, + "grad_norm": 6.207710627448002, + "learning_rate": 3.1968906879195936e-06, + "loss": 0.8326, + "step": 11842 + }, + { + "epoch": 0.8555999060812397, + "grad_norm": 8.020237597515766, + "learning_rate": 3.1966097750067797e-06, + "loss": 0.7756, + "step": 11843 + }, + { + "epoch": 0.8556721512814492, + "grad_norm": 5.60300298282422, + "learning_rate": 3.196328852557919e-06, + "loss": 0.7712, + "step": 11844 + }, + { + "epoch": 0.8557443964816588, + "grad_norm": 6.1658727117584435, + "learning_rate": 3.1960479205768576e-06, + "loss": 0.8772, + "step": 11845 + }, + { + "epoch": 0.8558166416818682, + "grad_norm": 5.686528720293142, + "learning_rate": 3.19576697906744e-06, + "loss": 0.8437, + "step": 11846 + }, + { + "epoch": 0.8558888868820778, + "grad_norm": 5.694047449844793, + "learning_rate": 3.1954860280335127e-06, + "loss": 0.7675, + "step": 11847 + }, + { + "epoch": 0.8559611320822873, + "grad_norm": 5.849679804459152, + "learning_rate": 3.1952050674789215e-06, + "loss": 0.78, + "step": 11848 + }, + { + "epoch": 0.8560333772824968, + "grad_norm": 6.583149999971073, + "learning_rate": 3.1949240974075124e-06, + "loss": 0.8117, + "step": 11849 + }, + { + "epoch": 0.8561056224827063, + "grad_norm": 5.94603108338085, + "learning_rate": 3.194643117823133e-06, + "loss": 0.773, + "step": 11850 + }, + { + "epoch": 0.8561778676829158, + "grad_norm": 4.689372388088308, + "learning_rate": 3.194362128729628e-06, + "loss": 0.7757, + "step": 11851 + }, + { + "epoch": 0.8562501128831254, + "grad_norm": 5.788840429909017, + "learning_rate": 3.194081130130845e-06, + "loss": 0.7504, + "step": 11852 + }, + { + "epoch": 0.8563223580833348, + "grad_norm": 6.936075150167387, + "learning_rate": 3.193800122030629e-06, + "loss": 0.7986, + "step": 11853 + }, + { + "epoch": 0.8563946032835443, + "grad_norm": 5.653286826884116, + "learning_rate": 3.1935191044328294e-06, + "loss": 0.7788, + "step": 11854 + }, + { + "epoch": 0.8564668484837539, + "grad_norm": 7.249298324830658, + "learning_rate": 3.1932380773412917e-06, + "loss": 0.8462, + "step": 11855 + }, + { + "epoch": 0.8565390936839634, + "grad_norm": 5.7132602043917045, + "learning_rate": 3.1929570407598633e-06, + "loss": 0.7955, + "step": 11856 + }, + { + "epoch": 0.8566113388841728, + "grad_norm": 6.325036922543077, + "learning_rate": 3.1926759946923896e-06, + "loss": 0.9241, + "step": 11857 + }, + { + "epoch": 0.8566835840843824, + "grad_norm": 6.4073789020574745, + "learning_rate": 3.19239493914272e-06, + "loss": 0.8587, + "step": 11858 + }, + { + "epoch": 0.8567558292845919, + "grad_norm": 8.090265768658007, + "learning_rate": 3.192113874114702e-06, + "loss": 0.8365, + "step": 11859 + }, + { + "epoch": 0.8568280744848014, + "grad_norm": 5.463740730356327, + "learning_rate": 3.191832799612182e-06, + "loss": 0.8066, + "step": 11860 + }, + { + "epoch": 0.8569003196850109, + "grad_norm": 5.5959313257896675, + "learning_rate": 3.191551715639008e-06, + "loss": 0.8538, + "step": 11861 + }, + { + "epoch": 0.8569725648852204, + "grad_norm": 5.656276808195707, + "learning_rate": 3.191270622199028e-06, + "loss": 0.808, + "step": 11862 + }, + { + "epoch": 0.85704481008543, + "grad_norm": 6.592938269237219, + "learning_rate": 3.1909895192960895e-06, + "loss": 0.8161, + "step": 11863 + }, + { + "epoch": 0.8571170552856394, + "grad_norm": 5.706139172985059, + "learning_rate": 3.1907084069340423e-06, + "loss": 0.8474, + "step": 11864 + }, + { + "epoch": 0.857189300485849, + "grad_norm": 8.234094256658656, + "learning_rate": 3.190427285116732e-06, + "loss": 0.884, + "step": 11865 + }, + { + "epoch": 0.8572615456860585, + "grad_norm": 6.203411566507301, + "learning_rate": 3.190146153848009e-06, + "loss": 0.9281, + "step": 11866 + }, + { + "epoch": 0.857333790886268, + "grad_norm": 6.296703856913545, + "learning_rate": 3.1898650131317197e-06, + "loss": 0.8232, + "step": 11867 + }, + { + "epoch": 0.8574060360864775, + "grad_norm": 6.764930601363163, + "learning_rate": 3.189583862971716e-06, + "loss": 0.8365, + "step": 11868 + }, + { + "epoch": 0.857478281286687, + "grad_norm": 6.4473582217487175, + "learning_rate": 3.189302703371843e-06, + "loss": 0.7354, + "step": 11869 + }, + { + "epoch": 0.8575505264868966, + "grad_norm": 6.186638049574689, + "learning_rate": 3.1890215343359526e-06, + "loss": 0.8286, + "step": 11870 + }, + { + "epoch": 0.857622771687106, + "grad_norm": 6.224711718644054, + "learning_rate": 3.1887403558678916e-06, + "loss": 0.8408, + "step": 11871 + }, + { + "epoch": 0.8576950168873155, + "grad_norm": 7.127131093197943, + "learning_rate": 3.1884591679715094e-06, + "loss": 0.8283, + "step": 11872 + }, + { + "epoch": 0.8577672620875251, + "grad_norm": 5.002326996045562, + "learning_rate": 3.1881779706506566e-06, + "loss": 0.7822, + "step": 11873 + }, + { + "epoch": 0.8578395072877346, + "grad_norm": 5.818859600014227, + "learning_rate": 3.1878967639091813e-06, + "loss": 0.8843, + "step": 11874 + }, + { + "epoch": 0.857911752487944, + "grad_norm": 7.004738974653327, + "learning_rate": 3.187615547750934e-06, + "loss": 0.7804, + "step": 11875 + }, + { + "epoch": 0.8579839976881536, + "grad_norm": 7.860611865662106, + "learning_rate": 3.187334322179763e-06, + "loss": 0.8209, + "step": 11876 + }, + { + "epoch": 0.8580562428883631, + "grad_norm": 6.982602434291322, + "learning_rate": 3.187053087199519e-06, + "loss": 0.807, + "step": 11877 + }, + { + "epoch": 0.8581284880885726, + "grad_norm": 6.423242873442547, + "learning_rate": 3.1867718428140514e-06, + "loss": 0.7731, + "step": 11878 + }, + { + "epoch": 0.8582007332887821, + "grad_norm": 5.876133261606165, + "learning_rate": 3.1864905890272113e-06, + "loss": 0.8251, + "step": 11879 + }, + { + "epoch": 0.8582729784889916, + "grad_norm": 7.091569607433203, + "learning_rate": 3.1862093258428485e-06, + "loss": 0.8146, + "step": 11880 + }, + { + "epoch": 0.8583452236892012, + "grad_norm": 6.454951362692562, + "learning_rate": 3.185928053264811e-06, + "loss": 0.7994, + "step": 11881 + }, + { + "epoch": 0.8584174688894106, + "grad_norm": 7.070184358731749, + "learning_rate": 3.1856467712969524e-06, + "loss": 0.8859, + "step": 11882 + }, + { + "epoch": 0.8584897140896202, + "grad_norm": 6.522433668554504, + "learning_rate": 3.1853654799431215e-06, + "loss": 0.95, + "step": 11883 + }, + { + "epoch": 0.8585619592898297, + "grad_norm": 7.075269919055402, + "learning_rate": 3.1850841792071695e-06, + "loss": 0.8607, + "step": 11884 + }, + { + "epoch": 0.8586342044900392, + "grad_norm": 7.676060546763432, + "learning_rate": 3.1848028690929467e-06, + "loss": 0.87, + "step": 11885 + }, + { + "epoch": 0.8587064496902487, + "grad_norm": 7.095438079965948, + "learning_rate": 3.1845215496043045e-06, + "loss": 0.8155, + "step": 11886 + }, + { + "epoch": 0.8587786948904582, + "grad_norm": 5.578040883688256, + "learning_rate": 3.184240220745094e-06, + "loss": 0.8395, + "step": 11887 + }, + { + "epoch": 0.8588509400906678, + "grad_norm": 6.679167217493381, + "learning_rate": 3.183958882519166e-06, + "loss": 0.9021, + "step": 11888 + }, + { + "epoch": 0.8589231852908772, + "grad_norm": 6.296606015501774, + "learning_rate": 3.1836775349303722e-06, + "loss": 0.7594, + "step": 11889 + }, + { + "epoch": 0.8589954304910867, + "grad_norm": 5.749317958020923, + "learning_rate": 3.1833961779825636e-06, + "loss": 0.7761, + "step": 11890 + }, + { + "epoch": 0.8590676756912963, + "grad_norm": 5.641475032254489, + "learning_rate": 3.183114811679591e-06, + "loss": 0.8302, + "step": 11891 + }, + { + "epoch": 0.8591399208915058, + "grad_norm": 6.562257598759779, + "learning_rate": 3.182833436025308e-06, + "loss": 0.7851, + "step": 11892 + }, + { + "epoch": 0.8592121660917152, + "grad_norm": 7.976750206303302, + "learning_rate": 3.1825520510235658e-06, + "loss": 0.8432, + "step": 11893 + }, + { + "epoch": 0.8592844112919248, + "grad_norm": 6.829476578457046, + "learning_rate": 3.1822706566782153e-06, + "loss": 0.8689, + "step": 11894 + }, + { + "epoch": 0.8593566564921343, + "grad_norm": 5.977907515626255, + "learning_rate": 3.1819892529931095e-06, + "loss": 0.8298, + "step": 11895 + }, + { + "epoch": 0.8594289016923438, + "grad_norm": 7.103063980693005, + "learning_rate": 3.181707839972101e-06, + "loss": 0.7798, + "step": 11896 + }, + { + "epoch": 0.8595011468925533, + "grad_norm": 6.612090867190675, + "learning_rate": 3.1814264176190402e-06, + "loss": 0.8073, + "step": 11897 + }, + { + "epoch": 0.8595733920927628, + "grad_norm": 7.8838006147916415, + "learning_rate": 3.181144985937782e-06, + "loss": 0.7453, + "step": 11898 + }, + { + "epoch": 0.8596456372929724, + "grad_norm": 6.079386680688281, + "learning_rate": 3.180863544932177e-06, + "loss": 0.7369, + "step": 11899 + }, + { + "epoch": 0.8597178824931818, + "grad_norm": 5.949935579351889, + "learning_rate": 3.1805820946060785e-06, + "loss": 0.8119, + "step": 11900 + }, + { + "epoch": 0.8597901276933914, + "grad_norm": 6.595912199177443, + "learning_rate": 3.18030063496334e-06, + "loss": 0.8122, + "step": 11901 + }, + { + "epoch": 0.8598623728936009, + "grad_norm": 6.938863259478474, + "learning_rate": 3.1800191660078146e-06, + "loss": 0.8248, + "step": 11902 + }, + { + "epoch": 0.8599346180938104, + "grad_norm": 5.986704400388975, + "learning_rate": 3.1797376877433543e-06, + "loss": 0.7971, + "step": 11903 + }, + { + "epoch": 0.8600068632940199, + "grad_norm": 6.476087159313063, + "learning_rate": 3.1794562001738126e-06, + "loss": 0.6817, + "step": 11904 + }, + { + "epoch": 0.8600791084942294, + "grad_norm": 6.404872243710405, + "learning_rate": 3.1791747033030436e-06, + "loss": 0.8221, + "step": 11905 + }, + { + "epoch": 0.860151353694439, + "grad_norm": 6.1996931676926685, + "learning_rate": 3.1788931971348997e-06, + "loss": 0.9214, + "step": 11906 + }, + { + "epoch": 0.8602235988946484, + "grad_norm": 8.34547709331462, + "learning_rate": 3.178611681673236e-06, + "loss": 0.9349, + "step": 11907 + }, + { + "epoch": 0.8602958440948579, + "grad_norm": 6.606501538586493, + "learning_rate": 3.1783301569219037e-06, + "loss": 0.838, + "step": 11908 + }, + { + "epoch": 0.8603680892950675, + "grad_norm": 5.712433208183618, + "learning_rate": 3.17804862288476e-06, + "loss": 0.8745, + "step": 11909 + }, + { + "epoch": 0.860440334495277, + "grad_norm": 7.251370267615401, + "learning_rate": 3.177767079565656e-06, + "loss": 0.8786, + "step": 11910 + }, + { + "epoch": 0.8605125796954864, + "grad_norm": 6.246092833900026, + "learning_rate": 3.177485526968447e-06, + "loss": 0.761, + "step": 11911 + }, + { + "epoch": 0.860584824895696, + "grad_norm": 7.093389426322708, + "learning_rate": 3.1772039650969875e-06, + "loss": 0.8138, + "step": 11912 + }, + { + "epoch": 0.8606570700959055, + "grad_norm": 6.908379826161107, + "learning_rate": 3.176922393955132e-06, + "loss": 0.8153, + "step": 11913 + }, + { + "epoch": 0.860729315296115, + "grad_norm": 5.732633820081021, + "learning_rate": 3.176640813546733e-06, + "loss": 0.8883, + "step": 11914 + }, + { + "epoch": 0.8608015604963245, + "grad_norm": 5.281739703165254, + "learning_rate": 3.176359223875648e-06, + "loss": 0.8153, + "step": 11915 + }, + { + "epoch": 0.860873805696534, + "grad_norm": 7.189237964978614, + "learning_rate": 3.17607762494573e-06, + "loss": 0.8065, + "step": 11916 + }, + { + "epoch": 0.8609460508967436, + "grad_norm": 5.81878551959906, + "learning_rate": 3.1757960167608343e-06, + "loss": 0.7443, + "step": 11917 + }, + { + "epoch": 0.861018296096953, + "grad_norm": 5.591793634571625, + "learning_rate": 3.175514399324816e-06, + "loss": 0.7965, + "step": 11918 + }, + { + "epoch": 0.8610905412971626, + "grad_norm": 7.432030213178604, + "learning_rate": 3.17523277264153e-06, + "loss": 0.8006, + "step": 11919 + }, + { + "epoch": 0.8611627864973721, + "grad_norm": 6.717518112447815, + "learning_rate": 3.1749511367148313e-06, + "loss": 0.8232, + "step": 11920 + }, + { + "epoch": 0.8612350316975816, + "grad_norm": 6.414763362382226, + "learning_rate": 3.174669491548576e-06, + "loss": 0.8633, + "step": 11921 + }, + { + "epoch": 0.8613072768977911, + "grad_norm": 7.876491587023945, + "learning_rate": 3.1743878371466187e-06, + "loss": 0.8786, + "step": 11922 + }, + { + "epoch": 0.8613795220980006, + "grad_norm": 5.655102186552993, + "learning_rate": 3.174106173512816e-06, + "loss": 0.8209, + "step": 11923 + }, + { + "epoch": 0.8614517672982102, + "grad_norm": 5.620807526427631, + "learning_rate": 3.1738245006510227e-06, + "loss": 0.7693, + "step": 11924 + }, + { + "epoch": 0.8615240124984196, + "grad_norm": 5.933535245967342, + "learning_rate": 3.1735428185650958e-06, + "loss": 0.8391, + "step": 11925 + }, + { + "epoch": 0.8615962576986291, + "grad_norm": 5.490600182917378, + "learning_rate": 3.17326112725889e-06, + "loss": 0.7571, + "step": 11926 + }, + { + "epoch": 0.8616685028988387, + "grad_norm": 6.276889179247527, + "learning_rate": 3.172979426736263e-06, + "loss": 0.7738, + "step": 11927 + }, + { + "epoch": 0.8617407480990482, + "grad_norm": 8.08774417467324, + "learning_rate": 3.172697717001069e-06, + "loss": 0.7624, + "step": 11928 + }, + { + "epoch": 0.8618129932992576, + "grad_norm": 9.037610622377949, + "learning_rate": 3.172415998057167e-06, + "loss": 0.845, + "step": 11929 + }, + { + "epoch": 0.8618852384994672, + "grad_norm": 6.704171479173841, + "learning_rate": 3.1721342699084113e-06, + "loss": 0.7853, + "step": 11930 + }, + { + "epoch": 0.8619574836996767, + "grad_norm": 7.231289100376409, + "learning_rate": 3.171852532558659e-06, + "loss": 0.8627, + "step": 11931 + }, + { + "epoch": 0.8620297288998862, + "grad_norm": 6.054890684594775, + "learning_rate": 3.1715707860117678e-06, + "loss": 0.848, + "step": 11932 + }, + { + "epoch": 0.8621019741000957, + "grad_norm": 6.990353886237376, + "learning_rate": 3.1712890302715937e-06, + "loss": 0.7995, + "step": 11933 + }, + { + "epoch": 0.8621742193003052, + "grad_norm": 8.502124296533834, + "learning_rate": 3.1710072653419936e-06, + "loss": 0.8144, + "step": 11934 + }, + { + "epoch": 0.8622464645005148, + "grad_norm": 6.780520351550681, + "learning_rate": 3.1707254912268255e-06, + "loss": 0.7989, + "step": 11935 + }, + { + "epoch": 0.8623187097007242, + "grad_norm": 7.475405995242076, + "learning_rate": 3.1704437079299465e-06, + "loss": 0.8756, + "step": 11936 + }, + { + "epoch": 0.8623909549009338, + "grad_norm": 6.217753728299413, + "learning_rate": 3.170161915455214e-06, + "loss": 0.7581, + "step": 11937 + }, + { + "epoch": 0.8624632001011433, + "grad_norm": 7.47060457451338, + "learning_rate": 3.169880113806484e-06, + "loss": 0.8221, + "step": 11938 + }, + { + "epoch": 0.8625354453013528, + "grad_norm": 6.6464975756554185, + "learning_rate": 3.169598302987616e-06, + "loss": 0.8097, + "step": 11939 + }, + { + "epoch": 0.8626076905015623, + "grad_norm": 5.596992277602498, + "learning_rate": 3.169316483002467e-06, + "loss": 0.799, + "step": 11940 + }, + { + "epoch": 0.8626799357017718, + "grad_norm": 7.517325924582257, + "learning_rate": 3.1690346538548954e-06, + "loss": 0.8547, + "step": 11941 + }, + { + "epoch": 0.8627521809019814, + "grad_norm": 5.833470079544877, + "learning_rate": 3.1687528155487584e-06, + "loss": 0.8128, + "step": 11942 + }, + { + "epoch": 0.8628244261021908, + "grad_norm": 7.440984342534248, + "learning_rate": 3.1684709680879148e-06, + "loss": 0.7763, + "step": 11943 + }, + { + "epoch": 0.8628966713024003, + "grad_norm": 5.332324568397387, + "learning_rate": 3.1681891114762227e-06, + "loss": 0.7569, + "step": 11944 + }, + { + "epoch": 0.8629689165026099, + "grad_norm": 4.974511411051997, + "learning_rate": 3.1679072457175408e-06, + "loss": 0.839, + "step": 11945 + }, + { + "epoch": 0.8630411617028194, + "grad_norm": 5.71421125227233, + "learning_rate": 3.1676253708157273e-06, + "loss": 0.8337, + "step": 11946 + }, + { + "epoch": 0.8631134069030288, + "grad_norm": 4.70060221283721, + "learning_rate": 3.1673434867746406e-06, + "loss": 0.8034, + "step": 11947 + }, + { + "epoch": 0.8631856521032384, + "grad_norm": 6.11238792644332, + "learning_rate": 3.167061593598139e-06, + "loss": 0.8386, + "step": 11948 + }, + { + "epoch": 0.8632578973034479, + "grad_norm": 6.334489951010509, + "learning_rate": 3.166779691290083e-06, + "loss": 0.805, + "step": 11949 + }, + { + "epoch": 0.8633301425036574, + "grad_norm": 7.7674968466799905, + "learning_rate": 3.1664977798543307e-06, + "loss": 0.8571, + "step": 11950 + }, + { + "epoch": 0.8634023877038669, + "grad_norm": 6.042333666935162, + "learning_rate": 3.166215859294741e-06, + "loss": 0.8206, + "step": 11951 + }, + { + "epoch": 0.8634746329040764, + "grad_norm": 6.9534220771332205, + "learning_rate": 3.1659339296151735e-06, + "loss": 0.7475, + "step": 11952 + }, + { + "epoch": 0.863546878104286, + "grad_norm": 6.344236157960903, + "learning_rate": 3.1656519908194884e-06, + "loss": 0.7532, + "step": 11953 + }, + { + "epoch": 0.8636191233044954, + "grad_norm": 6.089001492729888, + "learning_rate": 3.165370042911543e-06, + "loss": 0.8321, + "step": 11954 + }, + { + "epoch": 0.863691368504705, + "grad_norm": 5.540053952777231, + "learning_rate": 3.1650880858951993e-06, + "loss": 0.8332, + "step": 11955 + }, + { + "epoch": 0.8637636137049145, + "grad_norm": 6.188934468151543, + "learning_rate": 3.1648061197743156e-06, + "loss": 0.7817, + "step": 11956 + }, + { + "epoch": 0.863835858905124, + "grad_norm": 6.495724959465528, + "learning_rate": 3.1645241445527524e-06, + "loss": 0.8652, + "step": 11957 + }, + { + "epoch": 0.8639081041053335, + "grad_norm": 6.784611163428308, + "learning_rate": 3.16424216023437e-06, + "loss": 0.766, + "step": 11958 + }, + { + "epoch": 0.863980349305543, + "grad_norm": 6.326317196926111, + "learning_rate": 3.163960166823028e-06, + "loss": 0.8517, + "step": 11959 + }, + { + "epoch": 0.8640525945057526, + "grad_norm": 6.84520577053865, + "learning_rate": 3.163678164322587e-06, + "loss": 0.7331, + "step": 11960 + }, + { + "epoch": 0.864124839705962, + "grad_norm": 7.161400686321012, + "learning_rate": 3.1633961527369073e-06, + "loss": 0.7634, + "step": 11961 + }, + { + "epoch": 0.8641970849061715, + "grad_norm": 6.391555531848192, + "learning_rate": 3.1631141320698487e-06, + "loss": 0.7968, + "step": 11962 + }, + { + "epoch": 0.8642693301063811, + "grad_norm": 5.5832051884004095, + "learning_rate": 3.1628321023252727e-06, + "loss": 0.778, + "step": 11963 + }, + { + "epoch": 0.8643415753065906, + "grad_norm": 5.756360392386931, + "learning_rate": 3.1625500635070405e-06, + "loss": 0.8263, + "step": 11964 + }, + { + "epoch": 0.8644138205068, + "grad_norm": 6.352910986061915, + "learning_rate": 3.1622680156190116e-06, + "loss": 0.8965, + "step": 11965 + }, + { + "epoch": 0.8644860657070096, + "grad_norm": 5.925396599352582, + "learning_rate": 3.161985958665048e-06, + "loss": 0.8057, + "step": 11966 + }, + { + "epoch": 0.8645583109072191, + "grad_norm": 7.076964024198843, + "learning_rate": 3.1617038926490105e-06, + "loss": 0.8171, + "step": 11967 + }, + { + "epoch": 0.8646305561074286, + "grad_norm": 5.355412299973734, + "learning_rate": 3.161421817574761e-06, + "loss": 0.7898, + "step": 11968 + }, + { + "epoch": 0.8647028013076381, + "grad_norm": 5.08265948389219, + "learning_rate": 3.1611397334461608e-06, + "loss": 0.708, + "step": 11969 + }, + { + "epoch": 0.8647750465078476, + "grad_norm": 5.551225186769021, + "learning_rate": 3.16085764026707e-06, + "loss": 0.7743, + "step": 11970 + }, + { + "epoch": 0.8648472917080572, + "grad_norm": 6.474592583405828, + "learning_rate": 3.1605755380413516e-06, + "loss": 0.8378, + "step": 11971 + }, + { + "epoch": 0.8649195369082666, + "grad_norm": 6.951713504467324, + "learning_rate": 3.1602934267728664e-06, + "loss": 0.8824, + "step": 11972 + }, + { + "epoch": 0.8649917821084762, + "grad_norm": 6.572557571524639, + "learning_rate": 3.160011306465478e-06, + "loss": 0.7805, + "step": 11973 + }, + { + "epoch": 0.8650640273086857, + "grad_norm": 5.625260749707276, + "learning_rate": 3.1597291771230466e-06, + "loss": 0.7983, + "step": 11974 + }, + { + "epoch": 0.8651362725088952, + "grad_norm": 7.328633044755857, + "learning_rate": 3.159447038749435e-06, + "loss": 0.8139, + "step": 11975 + }, + { + "epoch": 0.8652085177091047, + "grad_norm": 6.847371297283456, + "learning_rate": 3.1591648913485053e-06, + "loss": 0.7243, + "step": 11976 + }, + { + "epoch": 0.8652807629093142, + "grad_norm": 5.303664635540616, + "learning_rate": 3.1588827349241203e-06, + "loss": 0.793, + "step": 11977 + }, + { + "epoch": 0.8653530081095238, + "grad_norm": 6.868622752461006, + "learning_rate": 3.1586005694801423e-06, + "loss": 0.8087, + "step": 11978 + }, + { + "epoch": 0.8654252533097332, + "grad_norm": 6.679116386403594, + "learning_rate": 3.1583183950204345e-06, + "loss": 0.8416, + "step": 11979 + }, + { + "epoch": 0.8654974985099427, + "grad_norm": 6.223863196078147, + "learning_rate": 3.1580362115488577e-06, + "loss": 0.8566, + "step": 11980 + }, + { + "epoch": 0.8655697437101523, + "grad_norm": 5.59519828929095, + "learning_rate": 3.157754019069277e-06, + "loss": 0.7249, + "step": 11981 + }, + { + "epoch": 0.8656419889103618, + "grad_norm": 5.827039029395432, + "learning_rate": 3.157471817585554e-06, + "loss": 0.8061, + "step": 11982 + }, + { + "epoch": 0.8657142341105712, + "grad_norm": 6.393836224255174, + "learning_rate": 3.157189607101553e-06, + "loss": 0.7693, + "step": 11983 + }, + { + "epoch": 0.8657864793107808, + "grad_norm": 4.803734860144488, + "learning_rate": 3.156907387621136e-06, + "loss": 0.8584, + "step": 11984 + }, + { + "epoch": 0.8658587245109903, + "grad_norm": 6.265730935435501, + "learning_rate": 3.1566251591481667e-06, + "loss": 0.8592, + "step": 11985 + }, + { + "epoch": 0.8659309697111998, + "grad_norm": 4.786564485066732, + "learning_rate": 3.1563429216865095e-06, + "loss": 0.7639, + "step": 11986 + }, + { + "epoch": 0.8660032149114093, + "grad_norm": 6.517637532037466, + "learning_rate": 3.156060675240027e-06, + "loss": 0.768, + "step": 11987 + }, + { + "epoch": 0.8660754601116188, + "grad_norm": 8.007823932918667, + "learning_rate": 3.155778419812583e-06, + "loss": 0.886, + "step": 11988 + }, + { + "epoch": 0.8661477053118284, + "grad_norm": 5.472409838425596, + "learning_rate": 3.1554961554080423e-06, + "loss": 0.693, + "step": 11989 + }, + { + "epoch": 0.8662199505120378, + "grad_norm": 7.771685869701902, + "learning_rate": 3.155213882030267e-06, + "loss": 0.8646, + "step": 11990 + }, + { + "epoch": 0.8662921957122474, + "grad_norm": 6.143427008116295, + "learning_rate": 3.154931599683123e-06, + "loss": 0.841, + "step": 11991 + }, + { + "epoch": 0.8663644409124569, + "grad_norm": 7.467817048735773, + "learning_rate": 3.1546493083704744e-06, + "loss": 0.7645, + "step": 11992 + }, + { + "epoch": 0.8664366861126664, + "grad_norm": 6.37373608794324, + "learning_rate": 3.154367008096185e-06, + "loss": 0.7968, + "step": 11993 + }, + { + "epoch": 0.8665089313128759, + "grad_norm": 5.7429902845510075, + "learning_rate": 3.1540846988641182e-06, + "loss": 0.8267, + "step": 11994 + }, + { + "epoch": 0.8665811765130854, + "grad_norm": 5.991851996189348, + "learning_rate": 3.153802380678141e-06, + "loss": 0.8058, + "step": 11995 + }, + { + "epoch": 0.866653421713295, + "grad_norm": 6.191025471334999, + "learning_rate": 3.153520053542116e-06, + "loss": 0.7829, + "step": 11996 + }, + { + "epoch": 0.8667256669135044, + "grad_norm": 6.473642756370587, + "learning_rate": 3.1532377174599093e-06, + "loss": 0.8414, + "step": 11997 + }, + { + "epoch": 0.8667979121137139, + "grad_norm": 8.430817394879382, + "learning_rate": 3.152955372435386e-06, + "loss": 0.8077, + "step": 11998 + }, + { + "epoch": 0.8668701573139235, + "grad_norm": 5.307772878207558, + "learning_rate": 3.1526730184724102e-06, + "loss": 0.7253, + "step": 11999 + }, + { + "epoch": 0.866942402514133, + "grad_norm": 5.890833519601794, + "learning_rate": 3.1523906555748476e-06, + "loss": 0.8284, + "step": 12000 + }, + { + "epoch": 0.8670146477143424, + "grad_norm": 7.385817789036079, + "learning_rate": 3.152108283746563e-06, + "loss": 0.7869, + "step": 12001 + }, + { + "epoch": 0.867086892914552, + "grad_norm": 6.343418958322912, + "learning_rate": 3.1518259029914226e-06, + "loss": 0.866, + "step": 12002 + }, + { + "epoch": 0.8671591381147615, + "grad_norm": 6.459017743998321, + "learning_rate": 3.151543513313292e-06, + "loss": 0.8249, + "step": 12003 + }, + { + "epoch": 0.867231383314971, + "grad_norm": 6.82761938112943, + "learning_rate": 3.1512611147160367e-06, + "loss": 0.8134, + "step": 12004 + }, + { + "epoch": 0.8673036285151805, + "grad_norm": 6.403075981045592, + "learning_rate": 3.150978707203521e-06, + "loss": 0.8118, + "step": 12005 + }, + { + "epoch": 0.86737587371539, + "grad_norm": 7.044510607537406, + "learning_rate": 3.1506962907796134e-06, + "loss": 0.8954, + "step": 12006 + }, + { + "epoch": 0.8674481189155996, + "grad_norm": 7.063727078134814, + "learning_rate": 3.1504138654481797e-06, + "loss": 0.835, + "step": 12007 + }, + { + "epoch": 0.867520364115809, + "grad_norm": 7.645722543466374, + "learning_rate": 3.1501314312130837e-06, + "loss": 0.8532, + "step": 12008 + }, + { + "epoch": 0.8675926093160186, + "grad_norm": 6.347517426366575, + "learning_rate": 3.1498489880781936e-06, + "loss": 0.8495, + "step": 12009 + }, + { + "epoch": 0.8676648545162281, + "grad_norm": 6.351784715131867, + "learning_rate": 3.149566536047376e-06, + "loss": 0.8718, + "step": 12010 + }, + { + "epoch": 0.8677370997164376, + "grad_norm": 5.556273844166797, + "learning_rate": 3.1492840751244965e-06, + "loss": 0.7649, + "step": 12011 + }, + { + "epoch": 0.8678093449166471, + "grad_norm": 7.131421056863997, + "learning_rate": 3.149001605313422e-06, + "loss": 0.8053, + "step": 12012 + }, + { + "epoch": 0.8678815901168566, + "grad_norm": 6.807991425774416, + "learning_rate": 3.1487191266180195e-06, + "loss": 0.8049, + "step": 12013 + }, + { + "epoch": 0.8679538353170662, + "grad_norm": 5.873117023400491, + "learning_rate": 3.1484366390421554e-06, + "loss": 0.8482, + "step": 12014 + }, + { + "epoch": 0.8680260805172756, + "grad_norm": 5.873124168097971, + "learning_rate": 3.1481541425896976e-06, + "loss": 0.7926, + "step": 12015 + }, + { + "epoch": 0.8680983257174851, + "grad_norm": 6.60522417171761, + "learning_rate": 3.1478716372645135e-06, + "loss": 0.8951, + "step": 12016 + }, + { + "epoch": 0.8681705709176947, + "grad_norm": 7.696832466085405, + "learning_rate": 3.1475891230704687e-06, + "loss": 0.8801, + "step": 12017 + }, + { + "epoch": 0.8682428161179042, + "grad_norm": 5.764117945928941, + "learning_rate": 3.1473066000114325e-06, + "loss": 0.826, + "step": 12018 + }, + { + "epoch": 0.8683150613181136, + "grad_norm": 5.270055158062252, + "learning_rate": 3.147024068091271e-06, + "loss": 0.782, + "step": 12019 + }, + { + "epoch": 0.8683873065183232, + "grad_norm": 6.022002250779921, + "learning_rate": 3.1467415273138522e-06, + "loss": 0.8506, + "step": 12020 + }, + { + "epoch": 0.8684595517185327, + "grad_norm": 6.926668526603637, + "learning_rate": 3.1464589776830444e-06, + "loss": 0.7298, + "step": 12021 + }, + { + "epoch": 0.8685317969187422, + "grad_norm": 4.835766914082901, + "learning_rate": 3.146176419202715e-06, + "loss": 0.7291, + "step": 12022 + }, + { + "epoch": 0.8686040421189517, + "grad_norm": 6.9243479287425505, + "learning_rate": 3.1458938518767325e-06, + "loss": 0.85, + "step": 12023 + }, + { + "epoch": 0.8686762873191612, + "grad_norm": 5.895800838672968, + "learning_rate": 3.145611275708964e-06, + "loss": 0.8492, + "step": 12024 + }, + { + "epoch": 0.8687485325193708, + "grad_norm": 5.293914777945283, + "learning_rate": 3.1453286907032795e-06, + "loss": 0.8038, + "step": 12025 + }, + { + "epoch": 0.8688207777195802, + "grad_norm": 6.150000173677271, + "learning_rate": 3.145046096863545e-06, + "loss": 0.7541, + "step": 12026 + }, + { + "epoch": 0.8688930229197898, + "grad_norm": 5.748767886630619, + "learning_rate": 3.144763494193631e-06, + "loss": 0.7476, + "step": 12027 + }, + { + "epoch": 0.8689652681199993, + "grad_norm": 5.811436053288795, + "learning_rate": 3.1444808826974055e-06, + "loss": 0.7905, + "step": 12028 + }, + { + "epoch": 0.8690375133202088, + "grad_norm": 5.079725746801665, + "learning_rate": 3.1441982623787365e-06, + "loss": 0.8704, + "step": 12029 + }, + { + "epoch": 0.8691097585204183, + "grad_norm": 4.782737749793603, + "learning_rate": 3.1439156332414945e-06, + "loss": 0.7236, + "step": 12030 + }, + { + "epoch": 0.8691820037206278, + "grad_norm": 5.204833703806236, + "learning_rate": 3.1436329952895466e-06, + "loss": 0.78, + "step": 12031 + }, + { + "epoch": 0.8692542489208374, + "grad_norm": 5.494205196474446, + "learning_rate": 3.1433503485267636e-06, + "loss": 0.7941, + "step": 12032 + }, + { + "epoch": 0.8693264941210468, + "grad_norm": 6.436277699398877, + "learning_rate": 3.143067692957012e-06, + "loss": 0.7918, + "step": 12033 + }, + { + "epoch": 0.8693987393212563, + "grad_norm": 6.864608401849459, + "learning_rate": 3.142785028584165e-06, + "loss": 0.7386, + "step": 12034 + }, + { + "epoch": 0.8694709845214659, + "grad_norm": 6.35931531423542, + "learning_rate": 3.1425023554120893e-06, + "loss": 0.8894, + "step": 12035 + }, + { + "epoch": 0.8695432297216754, + "grad_norm": 5.965876821455443, + "learning_rate": 3.1422196734446553e-06, + "loss": 0.8252, + "step": 12036 + }, + { + "epoch": 0.8696154749218848, + "grad_norm": 5.490527231717056, + "learning_rate": 3.141936982685732e-06, + "loss": 0.7817, + "step": 12037 + }, + { + "epoch": 0.8696877201220944, + "grad_norm": 5.376813183510841, + "learning_rate": 3.1416542831391906e-06, + "loss": 0.8127, + "step": 12038 + }, + { + "epoch": 0.8697599653223039, + "grad_norm": 5.541578746279863, + "learning_rate": 3.1413715748089e-06, + "loss": 0.7278, + "step": 12039 + }, + { + "epoch": 0.8698322105225134, + "grad_norm": 6.589680794610124, + "learning_rate": 3.14108885769873e-06, + "loss": 0.8536, + "step": 12040 + }, + { + "epoch": 0.8699044557227229, + "grad_norm": 6.566109963199529, + "learning_rate": 3.1408061318125527e-06, + "loss": 0.8281, + "step": 12041 + }, + { + "epoch": 0.8699767009229324, + "grad_norm": 6.849739136566711, + "learning_rate": 3.140523397154236e-06, + "loss": 0.8343, + "step": 12042 + }, + { + "epoch": 0.870048946123142, + "grad_norm": 6.669979925168324, + "learning_rate": 3.1402406537276513e-06, + "loss": 0.8204, + "step": 12043 + }, + { + "epoch": 0.8701211913233514, + "grad_norm": 5.511194627310196, + "learning_rate": 3.13995790153667e-06, + "loss": 0.786, + "step": 12044 + }, + { + "epoch": 0.870193436523561, + "grad_norm": 5.322545505829057, + "learning_rate": 3.139675140585161e-06, + "loss": 0.8092, + "step": 12045 + }, + { + "epoch": 0.8702656817237705, + "grad_norm": 7.14036509502039, + "learning_rate": 3.1393923708769968e-06, + "loss": 0.9214, + "step": 12046 + }, + { + "epoch": 0.87033792692398, + "grad_norm": 8.452116788014834, + "learning_rate": 3.1391095924160463e-06, + "loss": 0.7815, + "step": 12047 + }, + { + "epoch": 0.8704101721241895, + "grad_norm": 8.938931863967436, + "learning_rate": 3.1388268052061827e-06, + "loss": 0.7579, + "step": 12048 + }, + { + "epoch": 0.870482417324399, + "grad_norm": 5.498853737419928, + "learning_rate": 3.1385440092512753e-06, + "loss": 0.7773, + "step": 12049 + }, + { + "epoch": 0.8705546625246086, + "grad_norm": 7.320611351206941, + "learning_rate": 3.1382612045551975e-06, + "loss": 0.8084, + "step": 12050 + }, + { + "epoch": 0.870626907724818, + "grad_norm": 7.908046834865841, + "learning_rate": 3.1379783911218185e-06, + "loss": 0.79, + "step": 12051 + }, + { + "epoch": 0.8706991529250275, + "grad_norm": 6.926948840525518, + "learning_rate": 3.1376955689550102e-06, + "loss": 0.7945, + "step": 12052 + }, + { + "epoch": 0.8707713981252371, + "grad_norm": 7.970171154678128, + "learning_rate": 3.137412738058646e-06, + "loss": 0.7397, + "step": 12053 + }, + { + "epoch": 0.8708436433254466, + "grad_norm": 8.627632955512135, + "learning_rate": 3.1371298984365958e-06, + "loss": 0.7913, + "step": 12054 + }, + { + "epoch": 0.870915888525656, + "grad_norm": 8.966763053669062, + "learning_rate": 3.136847050092732e-06, + "loss": 0.8595, + "step": 12055 + }, + { + "epoch": 0.8709881337258656, + "grad_norm": 6.400842384689891, + "learning_rate": 3.1365641930309266e-06, + "loss": 0.7675, + "step": 12056 + }, + { + "epoch": 0.8710603789260751, + "grad_norm": 6.548722109006178, + "learning_rate": 3.1362813272550506e-06, + "loss": 0.8527, + "step": 12057 + }, + { + "epoch": 0.8711326241262846, + "grad_norm": 6.280886511462617, + "learning_rate": 3.1359984527689785e-06, + "loss": 0.7801, + "step": 12058 + }, + { + "epoch": 0.8712048693264941, + "grad_norm": 7.394937752957572, + "learning_rate": 3.135715569576581e-06, + "loss": 0.7768, + "step": 12059 + }, + { + "epoch": 0.8712771145267036, + "grad_norm": 6.635406957246952, + "learning_rate": 3.135432677681732e-06, + "loss": 0.8127, + "step": 12060 + }, + { + "epoch": 0.8713493597269132, + "grad_norm": 6.042166046901564, + "learning_rate": 3.135149777088301e-06, + "loss": 0.8084, + "step": 12061 + }, + { + "epoch": 0.8714216049271226, + "grad_norm": 5.780265435588523, + "learning_rate": 3.134866867800164e-06, + "loss": 0.852, + "step": 12062 + }, + { + "epoch": 0.8714938501273322, + "grad_norm": 6.946489438208102, + "learning_rate": 3.1345839498211922e-06, + "loss": 0.8052, + "step": 12063 + }, + { + "epoch": 0.8715660953275417, + "grad_norm": 6.263018972247211, + "learning_rate": 3.1343010231552597e-06, + "loss": 0.8817, + "step": 12064 + }, + { + "epoch": 0.8716383405277511, + "grad_norm": 5.982055533286222, + "learning_rate": 3.1340180878062378e-06, + "loss": 0.754, + "step": 12065 + }, + { + "epoch": 0.8717105857279607, + "grad_norm": 5.780515221992712, + "learning_rate": 3.133735143778e-06, + "loss": 0.8626, + "step": 12066 + }, + { + "epoch": 0.8717828309281702, + "grad_norm": 6.166284738346522, + "learning_rate": 3.133452191074421e-06, + "loss": 0.8408, + "step": 12067 + }, + { + "epoch": 0.8718550761283798, + "grad_norm": 6.703185608062974, + "learning_rate": 3.133169229699373e-06, + "loss": 0.784, + "step": 12068 + }, + { + "epoch": 0.8719273213285892, + "grad_norm": 8.807292725427445, + "learning_rate": 3.1328862596567304e-06, + "loss": 0.8029, + "step": 12069 + }, + { + "epoch": 0.8719995665287987, + "grad_norm": 6.009657399192076, + "learning_rate": 3.132603280950366e-06, + "loss": 0.8746, + "step": 12070 + }, + { + "epoch": 0.8720718117290083, + "grad_norm": 7.2924930067947535, + "learning_rate": 3.1323202935841536e-06, + "loss": 0.7225, + "step": 12071 + }, + { + "epoch": 0.8721440569292178, + "grad_norm": 4.971515965280811, + "learning_rate": 3.1320372975619673e-06, + "loss": 0.7223, + "step": 12072 + }, + { + "epoch": 0.8722163021294272, + "grad_norm": 6.8142255645044, + "learning_rate": 3.131754292887682e-06, + "loss": 0.8472, + "step": 12073 + }, + { + "epoch": 0.8722885473296368, + "grad_norm": 6.217791459476984, + "learning_rate": 3.13147127956517e-06, + "loss": 0.7235, + "step": 12074 + }, + { + "epoch": 0.8723607925298463, + "grad_norm": 5.8096562473945585, + "learning_rate": 3.131188257598307e-06, + "loss": 0.7647, + "step": 12075 + }, + { + "epoch": 0.8724330377300558, + "grad_norm": 7.121737352206859, + "learning_rate": 3.1309052269909668e-06, + "loss": 0.8374, + "step": 12076 + }, + { + "epoch": 0.8725052829302653, + "grad_norm": 6.052128365415331, + "learning_rate": 3.130622187747024e-06, + "loss": 0.7682, + "step": 12077 + }, + { + "epoch": 0.8725775281304748, + "grad_norm": 5.919791063797241, + "learning_rate": 3.130339139870353e-06, + "loss": 0.8519, + "step": 12078 + }, + { + "epoch": 0.8726497733306844, + "grad_norm": 6.077507713372605, + "learning_rate": 3.1300560833648285e-06, + "loss": 0.7757, + "step": 12079 + }, + { + "epoch": 0.8727220185308938, + "grad_norm": 5.780784794416366, + "learning_rate": 3.129773018234325e-06, + "loss": 0.8415, + "step": 12080 + }, + { + "epoch": 0.8727942637311034, + "grad_norm": 7.821543593783335, + "learning_rate": 3.129489944482718e-06, + "loss": 0.7518, + "step": 12081 + }, + { + "epoch": 0.8728665089313129, + "grad_norm": 6.431680066972137, + "learning_rate": 3.1292068621138833e-06, + "loss": 0.8497, + "step": 12082 + }, + { + "epoch": 0.8729387541315223, + "grad_norm": 5.96625918199243, + "learning_rate": 3.1289237711316943e-06, + "loss": 0.8606, + "step": 12083 + }, + { + "epoch": 0.8730109993317319, + "grad_norm": 6.5258097646741735, + "learning_rate": 3.1286406715400282e-06, + "loss": 0.8004, + "step": 12084 + }, + { + "epoch": 0.8730832445319414, + "grad_norm": 6.284610892961292, + "learning_rate": 3.1283575633427585e-06, + "loss": 0.7706, + "step": 12085 + }, + { + "epoch": 0.873155489732151, + "grad_norm": 6.525007705391613, + "learning_rate": 3.1280744465437617e-06, + "loss": 0.7754, + "step": 12086 + }, + { + "epoch": 0.8732277349323604, + "grad_norm": 5.7597702611035935, + "learning_rate": 3.127791321146914e-06, + "loss": 0.7471, + "step": 12087 + }, + { + "epoch": 0.8732999801325699, + "grad_norm": 9.5674995866132, + "learning_rate": 3.12750818715609e-06, + "loss": 0.8168, + "step": 12088 + }, + { + "epoch": 0.8733722253327795, + "grad_norm": 7.071175706974495, + "learning_rate": 3.127225044575166e-06, + "loss": 0.8085, + "step": 12089 + }, + { + "epoch": 0.873444470532989, + "grad_norm": 7.394120855374015, + "learning_rate": 3.1269418934080186e-06, + "loss": 0.8242, + "step": 12090 + }, + { + "epoch": 0.8735167157331984, + "grad_norm": 6.008991815229743, + "learning_rate": 3.1266587336585234e-06, + "loss": 0.7883, + "step": 12091 + }, + { + "epoch": 0.873588960933408, + "grad_norm": 6.6171183216572835, + "learning_rate": 3.1263755653305568e-06, + "loss": 0.7643, + "step": 12092 + }, + { + "epoch": 0.8736612061336175, + "grad_norm": 6.134342153483604, + "learning_rate": 3.1260923884279947e-06, + "loss": 0.7704, + "step": 12093 + }, + { + "epoch": 0.873733451333827, + "grad_norm": 5.489664597123946, + "learning_rate": 3.1258092029547145e-06, + "loss": 0.6943, + "step": 12094 + }, + { + "epoch": 0.8738056965340365, + "grad_norm": 7.026411095350127, + "learning_rate": 3.1255260089145913e-06, + "loss": 0.7421, + "step": 12095 + }, + { + "epoch": 0.873877941734246, + "grad_norm": 5.5553245983860045, + "learning_rate": 3.125242806311504e-06, + "loss": 0.7245, + "step": 12096 + }, + { + "epoch": 0.8739501869344556, + "grad_norm": 7.425353831071053, + "learning_rate": 3.124959595149327e-06, + "loss": 0.8148, + "step": 12097 + }, + { + "epoch": 0.874022432134665, + "grad_norm": 5.471860513281259, + "learning_rate": 3.1246763754319392e-06, + "loss": 0.8119, + "step": 12098 + }, + { + "epoch": 0.8740946773348746, + "grad_norm": 6.131608473071402, + "learning_rate": 3.124393147163216e-06, + "loss": 0.7927, + "step": 12099 + }, + { + "epoch": 0.8741669225350841, + "grad_norm": 6.249048084246793, + "learning_rate": 3.1241099103470358e-06, + "loss": 0.7281, + "step": 12100 + }, + { + "epoch": 0.8742391677352935, + "grad_norm": 5.816171122577087, + "learning_rate": 3.123826664987276e-06, + "loss": 0.8328, + "step": 12101 + }, + { + "epoch": 0.8743114129355031, + "grad_norm": 6.310202605415037, + "learning_rate": 3.123543411087813e-06, + "loss": 0.788, + "step": 12102 + }, + { + "epoch": 0.8743836581357126, + "grad_norm": 6.007160682191706, + "learning_rate": 3.1232601486525255e-06, + "loss": 0.8152, + "step": 12103 + }, + { + "epoch": 0.8744559033359222, + "grad_norm": 6.471595912130357, + "learning_rate": 3.1229768776852893e-06, + "loss": 0.8975, + "step": 12104 + }, + { + "epoch": 0.8745281485361316, + "grad_norm": 8.859006843789025, + "learning_rate": 3.122693598189984e-06, + "loss": 0.8683, + "step": 12105 + }, + { + "epoch": 0.8746003937363411, + "grad_norm": 5.402361579473923, + "learning_rate": 3.122410310170487e-06, + "loss": 0.7754, + "step": 12106 + }, + { + "epoch": 0.8746726389365507, + "grad_norm": 6.467724999936584, + "learning_rate": 3.1221270136306764e-06, + "loss": 0.8217, + "step": 12107 + }, + { + "epoch": 0.8747448841367602, + "grad_norm": 6.05089094679453, + "learning_rate": 3.121843708574429e-06, + "loss": 0.8196, + "step": 12108 + }, + { + "epoch": 0.8748171293369696, + "grad_norm": 6.658849647310584, + "learning_rate": 3.121560395005625e-06, + "loss": 0.7885, + "step": 12109 + }, + { + "epoch": 0.8748893745371792, + "grad_norm": 6.870033828674274, + "learning_rate": 3.1212770729281418e-06, + "loss": 0.7689, + "step": 12110 + }, + { + "epoch": 0.8749616197373887, + "grad_norm": 5.09138836555331, + "learning_rate": 3.120993742345857e-06, + "loss": 0.8195, + "step": 12111 + }, + { + "epoch": 0.8750338649375982, + "grad_norm": 6.932255317560209, + "learning_rate": 3.120710403262651e-06, + "loss": 0.7713, + "step": 12112 + }, + { + "epoch": 0.8751061101378077, + "grad_norm": 6.092027000001724, + "learning_rate": 3.1204270556824013e-06, + "loss": 0.7956, + "step": 12113 + }, + { + "epoch": 0.8751783553380172, + "grad_norm": 6.840042294377074, + "learning_rate": 3.1201436996089864e-06, + "loss": 0.8749, + "step": 12114 + }, + { + "epoch": 0.8752506005382268, + "grad_norm": 6.188183062833599, + "learning_rate": 3.119860335046286e-06, + "loss": 0.8332, + "step": 12115 + }, + { + "epoch": 0.8753228457384362, + "grad_norm": 7.067246993845728, + "learning_rate": 3.119576961998179e-06, + "loss": 0.9409, + "step": 12116 + }, + { + "epoch": 0.8753950909386458, + "grad_norm": 7.634741392021532, + "learning_rate": 3.1192935804685443e-06, + "loss": 0.835, + "step": 12117 + }, + { + "epoch": 0.8754673361388553, + "grad_norm": 5.777376764188971, + "learning_rate": 3.119010190461261e-06, + "loss": 0.7747, + "step": 12118 + }, + { + "epoch": 0.8755395813390647, + "grad_norm": 6.44738573426443, + "learning_rate": 3.11872679198021e-06, + "loss": 0.8257, + "step": 12119 + }, + { + "epoch": 0.8756118265392743, + "grad_norm": 6.654439903896519, + "learning_rate": 3.118443385029269e-06, + "loss": 0.7707, + "step": 12120 + }, + { + "epoch": 0.8756840717394838, + "grad_norm": 5.397269491103323, + "learning_rate": 3.118159969612319e-06, + "loss": 0.8519, + "step": 12121 + }, + { + "epoch": 0.8757563169396934, + "grad_norm": 6.666971231497203, + "learning_rate": 3.1178765457332376e-06, + "loss": 0.8382, + "step": 12122 + }, + { + "epoch": 0.8758285621399028, + "grad_norm": 6.3367788412381625, + "learning_rate": 3.1175931133959065e-06, + "loss": 0.8114, + "step": 12123 + }, + { + "epoch": 0.8759008073401123, + "grad_norm": 7.267236411947713, + "learning_rate": 3.1173096726042053e-06, + "loss": 0.8688, + "step": 12124 + }, + { + "epoch": 0.8759730525403219, + "grad_norm": 8.700873092162531, + "learning_rate": 3.117026223362014e-06, + "loss": 0.8398, + "step": 12125 + }, + { + "epoch": 0.8760452977405314, + "grad_norm": 7.359511841880632, + "learning_rate": 3.1167427656732135e-06, + "loss": 0.933, + "step": 12126 + }, + { + "epoch": 0.8761175429407408, + "grad_norm": 6.335122541704162, + "learning_rate": 3.1164592995416826e-06, + "loss": 0.9002, + "step": 12127 + }, + { + "epoch": 0.8761897881409504, + "grad_norm": 5.77422305739083, + "learning_rate": 3.1161758249713027e-06, + "loss": 0.7862, + "step": 12128 + }, + { + "epoch": 0.8762620333411599, + "grad_norm": 5.481871639481622, + "learning_rate": 3.1158923419659536e-06, + "loss": 0.788, + "step": 12129 + }, + { + "epoch": 0.8763342785413694, + "grad_norm": 5.290227358859244, + "learning_rate": 3.115608850529517e-06, + "loss": 0.855, + "step": 12130 + }, + { + "epoch": 0.8764065237415789, + "grad_norm": 6.397084895474822, + "learning_rate": 3.1153253506658737e-06, + "loss": 0.7937, + "step": 12131 + }, + { + "epoch": 0.8764787689417884, + "grad_norm": 6.7698222965600765, + "learning_rate": 3.1150418423789034e-06, + "loss": 0.8083, + "step": 12132 + }, + { + "epoch": 0.876551014141998, + "grad_norm": 6.276112747317484, + "learning_rate": 3.1147583256724884e-06, + "loss": 0.7898, + "step": 12133 + }, + { + "epoch": 0.8766232593422074, + "grad_norm": 6.511288157816478, + "learning_rate": 3.1144748005505092e-06, + "loss": 0.7281, + "step": 12134 + }, + { + "epoch": 0.876695504542417, + "grad_norm": 6.068536329190997, + "learning_rate": 3.1141912670168474e-06, + "loss": 0.7945, + "step": 12135 + }, + { + "epoch": 0.8767677497426265, + "grad_norm": 7.29634932068249, + "learning_rate": 3.1139077250753837e-06, + "loss": 0.829, + "step": 12136 + }, + { + "epoch": 0.8768399949428359, + "grad_norm": 6.528260637465664, + "learning_rate": 3.1136241747299988e-06, + "loss": 0.7724, + "step": 12137 + }, + { + "epoch": 0.8769122401430455, + "grad_norm": 6.235592830713007, + "learning_rate": 3.1133406159845762e-06, + "loss": 0.6996, + "step": 12138 + }, + { + "epoch": 0.876984485343255, + "grad_norm": 6.170225130545975, + "learning_rate": 3.113057048842998e-06, + "loss": 0.8499, + "step": 12139 + }, + { + "epoch": 0.8770567305434646, + "grad_norm": 6.725977784727253, + "learning_rate": 3.112773473309143e-06, + "loss": 0.7746, + "step": 12140 + }, + { + "epoch": 0.877128975743674, + "grad_norm": 6.192238582657389, + "learning_rate": 3.1124898893868966e-06, + "loss": 0.7692, + "step": 12141 + }, + { + "epoch": 0.8772012209438835, + "grad_norm": 9.059347090952828, + "learning_rate": 3.112206297080138e-06, + "loss": 0.7712, + "step": 12142 + }, + { + "epoch": 0.8772734661440931, + "grad_norm": 6.858964286447603, + "learning_rate": 3.1119226963927505e-06, + "loss": 0.783, + "step": 12143 + }, + { + "epoch": 0.8773457113443026, + "grad_norm": 7.300509542161787, + "learning_rate": 3.1116390873286174e-06, + "loss": 0.8156, + "step": 12144 + }, + { + "epoch": 0.877417956544512, + "grad_norm": 6.272751862147287, + "learning_rate": 3.1113554698916188e-06, + "loss": 0.8794, + "step": 12145 + }, + { + "epoch": 0.8774902017447216, + "grad_norm": 6.4982752345657016, + "learning_rate": 3.111071844085639e-06, + "loss": 0.7666, + "step": 12146 + }, + { + "epoch": 0.8775624469449311, + "grad_norm": 6.483020613449726, + "learning_rate": 3.1107882099145604e-06, + "loss": 0.8677, + "step": 12147 + }, + { + "epoch": 0.8776346921451406, + "grad_norm": 7.6601323463421025, + "learning_rate": 3.1105045673822654e-06, + "loss": 0.8479, + "step": 12148 + }, + { + "epoch": 0.8777069373453501, + "grad_norm": 6.915121465072684, + "learning_rate": 3.110220916492637e-06, + "loss": 0.9173, + "step": 12149 + }, + { + "epoch": 0.8777791825455596, + "grad_norm": 6.1441255254837035, + "learning_rate": 3.109937257249558e-06, + "loss": 0.8478, + "step": 12150 + }, + { + "epoch": 0.8778514277457692, + "grad_norm": 7.837493954888892, + "learning_rate": 3.1096535896569115e-06, + "loss": 0.7265, + "step": 12151 + }, + { + "epoch": 0.8779236729459786, + "grad_norm": 7.039792716269217, + "learning_rate": 3.1093699137185802e-06, + "loss": 0.8454, + "step": 12152 + }, + { + "epoch": 0.8779959181461882, + "grad_norm": 6.419395692138481, + "learning_rate": 3.1090862294384487e-06, + "loss": 0.8578, + "step": 12153 + }, + { + "epoch": 0.8780681633463977, + "grad_norm": 6.2513359167964175, + "learning_rate": 3.1088025368203994e-06, + "loss": 0.7931, + "step": 12154 + }, + { + "epoch": 0.8781404085466071, + "grad_norm": 6.186017099568376, + "learning_rate": 3.108518835868316e-06, + "loss": 0.7899, + "step": 12155 + }, + { + "epoch": 0.8782126537468167, + "grad_norm": 5.9486400300225855, + "learning_rate": 3.1082351265860815e-06, + "loss": 0.8051, + "step": 12156 + }, + { + "epoch": 0.8782848989470262, + "grad_norm": 6.924712622012623, + "learning_rate": 3.1079514089775815e-06, + "loss": 0.7883, + "step": 12157 + }, + { + "epoch": 0.8783571441472358, + "grad_norm": 6.42693521173353, + "learning_rate": 3.1076676830466983e-06, + "loss": 0.8462, + "step": 12158 + }, + { + "epoch": 0.8784293893474452, + "grad_norm": 5.375831539682209, + "learning_rate": 3.1073839487973167e-06, + "loss": 0.7919, + "step": 12159 + }, + { + "epoch": 0.8785016345476547, + "grad_norm": 5.962905978730642, + "learning_rate": 3.10710020623332e-06, + "loss": 0.7934, + "step": 12160 + }, + { + "epoch": 0.8785738797478643, + "grad_norm": 5.144715325401294, + "learning_rate": 3.106816455358593e-06, + "loss": 0.8412, + "step": 12161 + }, + { + "epoch": 0.8786461249480738, + "grad_norm": 6.59762849683328, + "learning_rate": 3.1065326961770204e-06, + "loss": 0.8408, + "step": 12162 + }, + { + "epoch": 0.8787183701482832, + "grad_norm": 6.10906123065345, + "learning_rate": 3.106248928692485e-06, + "loss": 0.8162, + "step": 12163 + }, + { + "epoch": 0.8787906153484928, + "grad_norm": 6.6868816339403025, + "learning_rate": 3.1059651529088738e-06, + "loss": 0.9187, + "step": 12164 + }, + { + "epoch": 0.8788628605487023, + "grad_norm": 6.333132422757295, + "learning_rate": 3.105681368830069e-06, + "loss": 0.8097, + "step": 12165 + }, + { + "epoch": 0.8789351057489118, + "grad_norm": 5.839343689396985, + "learning_rate": 3.105397576459957e-06, + "loss": 0.8276, + "step": 12166 + }, + { + "epoch": 0.8790073509491213, + "grad_norm": 7.106487383605884, + "learning_rate": 3.1051137758024225e-06, + "loss": 0.8634, + "step": 12167 + }, + { + "epoch": 0.8790795961493308, + "grad_norm": 7.366405846076588, + "learning_rate": 3.1048299668613495e-06, + "loss": 0.7889, + "step": 12168 + }, + { + "epoch": 0.8791518413495404, + "grad_norm": 5.590279125689968, + "learning_rate": 3.1045461496406247e-06, + "loss": 0.7896, + "step": 12169 + }, + { + "epoch": 0.8792240865497498, + "grad_norm": 5.547146986619528, + "learning_rate": 3.1042623241441318e-06, + "loss": 0.7707, + "step": 12170 + }, + { + "epoch": 0.8792963317499594, + "grad_norm": 6.178509895402604, + "learning_rate": 3.1039784903757573e-06, + "loss": 0.7232, + "step": 12171 + }, + { + "epoch": 0.8793685769501689, + "grad_norm": 5.5161490475086925, + "learning_rate": 3.1036946483393863e-06, + "loss": 0.8192, + "step": 12172 + }, + { + "epoch": 0.8794408221503783, + "grad_norm": 6.471398442588308, + "learning_rate": 3.1034107980389044e-06, + "loss": 0.8416, + "step": 12173 + }, + { + "epoch": 0.8795130673505879, + "grad_norm": 5.5801743448017636, + "learning_rate": 3.103126939478197e-06, + "loss": 0.783, + "step": 12174 + }, + { + "epoch": 0.8795853125507974, + "grad_norm": 6.711145542104171, + "learning_rate": 3.1028430726611496e-06, + "loss": 0.8357, + "step": 12175 + }, + { + "epoch": 0.879657557751007, + "grad_norm": 6.342030860712712, + "learning_rate": 3.10255919759165e-06, + "loss": 0.763, + "step": 12176 + }, + { + "epoch": 0.8797298029512164, + "grad_norm": 5.480500420324565, + "learning_rate": 3.102275314273581e-06, + "loss": 0.8179, + "step": 12177 + }, + { + "epoch": 0.8798020481514259, + "grad_norm": 6.2991354697304, + "learning_rate": 3.1019914227108323e-06, + "loss": 0.8371, + "step": 12178 + }, + { + "epoch": 0.8798742933516355, + "grad_norm": 6.689416913679363, + "learning_rate": 3.1017075229072873e-06, + "loss": 0.7862, + "step": 12179 + }, + { + "epoch": 0.879946538551845, + "grad_norm": 8.251372714425356, + "learning_rate": 3.1014236148668336e-06, + "loss": 0.8173, + "step": 12180 + }, + { + "epoch": 0.8800187837520544, + "grad_norm": 6.873820810627892, + "learning_rate": 3.1011396985933576e-06, + "loss": 0.825, + "step": 12181 + }, + { + "epoch": 0.880091028952264, + "grad_norm": 5.354707425884377, + "learning_rate": 3.100855774090746e-06, + "loss": 0.8725, + "step": 12182 + }, + { + "epoch": 0.8801632741524735, + "grad_norm": 6.08555733077284, + "learning_rate": 3.100571841362886e-06, + "loss": 0.8819, + "step": 12183 + }, + { + "epoch": 0.880235519352683, + "grad_norm": 5.509698812626637, + "learning_rate": 3.1002879004136633e-06, + "loss": 0.7214, + "step": 12184 + }, + { + "epoch": 0.8803077645528925, + "grad_norm": 5.7424635217460445, + "learning_rate": 3.1000039512469656e-06, + "loss": 0.7438, + "step": 12185 + }, + { + "epoch": 0.880380009753102, + "grad_norm": 6.111953230354358, + "learning_rate": 3.0997199938666788e-06, + "loss": 0.8164, + "step": 12186 + }, + { + "epoch": 0.8804522549533116, + "grad_norm": 8.224096147680246, + "learning_rate": 3.0994360282766924e-06, + "loss": 0.8786, + "step": 12187 + }, + { + "epoch": 0.880524500153521, + "grad_norm": 5.6867853071948264, + "learning_rate": 3.0991520544808914e-06, + "loss": 0.8595, + "step": 12188 + }, + { + "epoch": 0.8805967453537306, + "grad_norm": 7.073601563617402, + "learning_rate": 3.098868072483163e-06, + "loss": 0.8092, + "step": 12189 + }, + { + "epoch": 0.8806689905539401, + "grad_norm": 6.763030296870384, + "learning_rate": 3.098584082287397e-06, + "loss": 0.7241, + "step": 12190 + }, + { + "epoch": 0.8807412357541495, + "grad_norm": 6.170208128845532, + "learning_rate": 3.0983000838974797e-06, + "loss": 0.7118, + "step": 12191 + }, + { + "epoch": 0.8808134809543591, + "grad_norm": 6.258223349886956, + "learning_rate": 3.0980160773172985e-06, + "loss": 0.8681, + "step": 12192 + }, + { + "epoch": 0.8808857261545686, + "grad_norm": 8.158512430170232, + "learning_rate": 3.0977320625507413e-06, + "loss": 0.7476, + "step": 12193 + }, + { + "epoch": 0.8809579713547782, + "grad_norm": 9.606817336196972, + "learning_rate": 3.0974480396016963e-06, + "loss": 0.8359, + "step": 12194 + }, + { + "epoch": 0.8810302165549876, + "grad_norm": 9.065863044503773, + "learning_rate": 3.0971640084740514e-06, + "loss": 0.8979, + "step": 12195 + }, + { + "epoch": 0.8811024617551971, + "grad_norm": 7.046963777849871, + "learning_rate": 3.0968799691716957e-06, + "loss": 0.801, + "step": 12196 + }, + { + "epoch": 0.8811747069554067, + "grad_norm": 8.233386796579513, + "learning_rate": 3.096595921698516e-06, + "loss": 0.8493, + "step": 12197 + }, + { + "epoch": 0.8812469521556162, + "grad_norm": 7.177701158511577, + "learning_rate": 3.096311866058401e-06, + "loss": 0.937, + "step": 12198 + }, + { + "epoch": 0.8813191973558256, + "grad_norm": 6.596979444174829, + "learning_rate": 3.0960278022552398e-06, + "loss": 0.7736, + "step": 12199 + }, + { + "epoch": 0.8813914425560352, + "grad_norm": 10.59341727215317, + "learning_rate": 3.0957437302929217e-06, + "loss": 0.7976, + "step": 12200 + }, + { + "epoch": 0.8814636877562447, + "grad_norm": 6.567134419675296, + "learning_rate": 3.0954596501753335e-06, + "loss": 0.818, + "step": 12201 + }, + { + "epoch": 0.8815359329564542, + "grad_norm": 5.481197121168287, + "learning_rate": 3.0951755619063657e-06, + "loss": 0.8326, + "step": 12202 + }, + { + "epoch": 0.8816081781566637, + "grad_norm": 8.793484130831175, + "learning_rate": 3.094891465489906e-06, + "loss": 0.7881, + "step": 12203 + }, + { + "epoch": 0.8816804233568732, + "grad_norm": 7.022892711247368, + "learning_rate": 3.094607360929844e-06, + "loss": 0.8238, + "step": 12204 + }, + { + "epoch": 0.8817526685570828, + "grad_norm": 5.9827524879057945, + "learning_rate": 3.0943232482300696e-06, + "loss": 0.7896, + "step": 12205 + }, + { + "epoch": 0.8818249137572922, + "grad_norm": 6.284010550972709, + "learning_rate": 3.09403912739447e-06, + "loss": 0.9508, + "step": 12206 + }, + { + "epoch": 0.8818971589575018, + "grad_norm": 7.806091121281509, + "learning_rate": 3.0937549984269376e-06, + "loss": 0.7749, + "step": 12207 + }, + { + "epoch": 0.8819694041577113, + "grad_norm": 6.518144957277495, + "learning_rate": 3.0934708613313595e-06, + "loss": 0.868, + "step": 12208 + }, + { + "epoch": 0.8820416493579207, + "grad_norm": 5.767164071448111, + "learning_rate": 3.093186716111626e-06, + "loss": 0.861, + "step": 12209 + }, + { + "epoch": 0.8821138945581303, + "grad_norm": 5.574108740209565, + "learning_rate": 3.0929025627716282e-06, + "loss": 0.7889, + "step": 12210 + }, + { + "epoch": 0.8821861397583398, + "grad_norm": 7.555951017738979, + "learning_rate": 3.0926184013152534e-06, + "loss": 0.7985, + "step": 12211 + }, + { + "epoch": 0.8822583849585494, + "grad_norm": 5.792522925149188, + "learning_rate": 3.0923342317463934e-06, + "loss": 0.8116, + "step": 12212 + }, + { + "epoch": 0.8823306301587588, + "grad_norm": 5.559060051044875, + "learning_rate": 3.0920500540689365e-06, + "loss": 0.857, + "step": 12213 + }, + { + "epoch": 0.8824028753589683, + "grad_norm": 5.783263149806699, + "learning_rate": 3.0917658682867753e-06, + "loss": 0.849, + "step": 12214 + }, + { + "epoch": 0.8824751205591779, + "grad_norm": 6.121522188559745, + "learning_rate": 3.0914816744037986e-06, + "loss": 0.8283, + "step": 12215 + }, + { + "epoch": 0.8825473657593874, + "grad_norm": 8.02048111839966, + "learning_rate": 3.091197472423897e-06, + "loss": 0.8523, + "step": 12216 + }, + { + "epoch": 0.8826196109595968, + "grad_norm": 8.043482389348155, + "learning_rate": 3.090913262350961e-06, + "loss": 0.8072, + "step": 12217 + }, + { + "epoch": 0.8826918561598064, + "grad_norm": 6.062338600763136, + "learning_rate": 3.0906290441888807e-06, + "loss": 0.7467, + "step": 12218 + }, + { + "epoch": 0.8827641013600159, + "grad_norm": 5.3312528048974155, + "learning_rate": 3.090344817941548e-06, + "loss": 0.8168, + "step": 12219 + }, + { + "epoch": 0.8828363465602254, + "grad_norm": 6.048755915071891, + "learning_rate": 3.0900605836128526e-06, + "loss": 0.838, + "step": 12220 + }, + { + "epoch": 0.8829085917604349, + "grad_norm": 5.270313021142132, + "learning_rate": 3.089776341206687e-06, + "loss": 0.8194, + "step": 12221 + }, + { + "epoch": 0.8829808369606444, + "grad_norm": 5.6369901236217, + "learning_rate": 3.0894920907269403e-06, + "loss": 0.8038, + "step": 12222 + }, + { + "epoch": 0.883053082160854, + "grad_norm": 10.221219478942752, + "learning_rate": 3.0892078321775047e-06, + "loss": 0.9285, + "step": 12223 + }, + { + "epoch": 0.8831253273610634, + "grad_norm": 7.529725992959686, + "learning_rate": 3.088923565562271e-06, + "loss": 0.8364, + "step": 12224 + }, + { + "epoch": 0.883197572561273, + "grad_norm": 6.007530255378421, + "learning_rate": 3.088639290885132e-06, + "loss": 0.8, + "step": 12225 + }, + { + "epoch": 0.8832698177614825, + "grad_norm": 6.478845062942112, + "learning_rate": 3.088355008149978e-06, + "loss": 0.7643, + "step": 12226 + }, + { + "epoch": 0.8833420629616919, + "grad_norm": 6.162890265992416, + "learning_rate": 3.0880707173606998e-06, + "loss": 0.8571, + "step": 12227 + }, + { + "epoch": 0.8834143081619015, + "grad_norm": 5.9519980658046086, + "learning_rate": 3.0877864185211916e-06, + "loss": 0.7356, + "step": 12228 + }, + { + "epoch": 0.883486553362111, + "grad_norm": 6.60006859483619, + "learning_rate": 3.0875021116353423e-06, + "loss": 0.8571, + "step": 12229 + }, + { + "epoch": 0.8835587985623206, + "grad_norm": 7.494128917439382, + "learning_rate": 3.087217796707046e-06, + "loss": 0.8115, + "step": 12230 + }, + { + "epoch": 0.88363104376253, + "grad_norm": 5.526487034699543, + "learning_rate": 3.0869334737401935e-06, + "loss": 0.8495, + "step": 12231 + }, + { + "epoch": 0.8837032889627395, + "grad_norm": 6.199417505281053, + "learning_rate": 3.0866491427386775e-06, + "loss": 0.828, + "step": 12232 + }, + { + "epoch": 0.8837755341629491, + "grad_norm": 5.7665052284766745, + "learning_rate": 3.086364803706391e-06, + "loss": 0.777, + "step": 12233 + }, + { + "epoch": 0.8838477793631586, + "grad_norm": 5.970573106921785, + "learning_rate": 3.0860804566472245e-06, + "loss": 0.8673, + "step": 12234 + }, + { + "epoch": 0.883920024563368, + "grad_norm": 5.8896428511258945, + "learning_rate": 3.085796101565073e-06, + "loss": 0.7449, + "step": 12235 + }, + { + "epoch": 0.8839922697635776, + "grad_norm": 7.927186528326605, + "learning_rate": 3.0855117384638267e-06, + "loss": 0.9266, + "step": 12236 + }, + { + "epoch": 0.8840645149637871, + "grad_norm": 6.39695519498156, + "learning_rate": 3.085227367347379e-06, + "loss": 0.8471, + "step": 12237 + }, + { + "epoch": 0.8841367601639966, + "grad_norm": 6.352891470936987, + "learning_rate": 3.0849429882196238e-06, + "loss": 0.7075, + "step": 12238 + }, + { + "epoch": 0.8842090053642061, + "grad_norm": 6.557708153338728, + "learning_rate": 3.0846586010844538e-06, + "loss": 0.877, + "step": 12239 + }, + { + "epoch": 0.8842812505644156, + "grad_norm": 6.67432281830957, + "learning_rate": 3.0843742059457606e-06, + "loss": 0.7369, + "step": 12240 + }, + { + "epoch": 0.8843534957646252, + "grad_norm": 5.317890618687471, + "learning_rate": 3.084089802807438e-06, + "loss": 0.8354, + "step": 12241 + }, + { + "epoch": 0.8844257409648346, + "grad_norm": 5.850420566667388, + "learning_rate": 3.08380539167338e-06, + "loss": 0.8039, + "step": 12242 + }, + { + "epoch": 0.8844979861650442, + "grad_norm": 5.582345258290969, + "learning_rate": 3.083520972547479e-06, + "loss": 0.7629, + "step": 12243 + }, + { + "epoch": 0.8845702313652537, + "grad_norm": 5.810399486050851, + "learning_rate": 3.08323654543363e-06, + "loss": 0.8144, + "step": 12244 + }, + { + "epoch": 0.8846424765654631, + "grad_norm": 5.415753605499286, + "learning_rate": 3.0829521103357246e-06, + "loss": 0.7579, + "step": 12245 + }, + { + "epoch": 0.8847147217656727, + "grad_norm": 5.60230100224679, + "learning_rate": 3.082667667257658e-06, + "loss": 0.7792, + "step": 12246 + }, + { + "epoch": 0.8847869669658822, + "grad_norm": 5.475213465600815, + "learning_rate": 3.082383216203323e-06, + "loss": 0.7406, + "step": 12247 + }, + { + "epoch": 0.8848592121660918, + "grad_norm": 6.608311200034689, + "learning_rate": 3.082098757176614e-06, + "loss": 0.908, + "step": 12248 + }, + { + "epoch": 0.8849314573663012, + "grad_norm": 7.5968317756380355, + "learning_rate": 3.0818142901814254e-06, + "loss": 0.7737, + "step": 12249 + }, + { + "epoch": 0.8850037025665107, + "grad_norm": 5.283421177339377, + "learning_rate": 3.081529815221651e-06, + "loss": 0.7891, + "step": 12250 + }, + { + "epoch": 0.8850759477667203, + "grad_norm": 6.215431439745574, + "learning_rate": 3.081245332301184e-06, + "loss": 0.778, + "step": 12251 + }, + { + "epoch": 0.8851481929669298, + "grad_norm": 5.424013447162534, + "learning_rate": 3.0809608414239205e-06, + "loss": 0.8207, + "step": 12252 + }, + { + "epoch": 0.8852204381671392, + "grad_norm": 6.9588111515417825, + "learning_rate": 3.080676342593755e-06, + "loss": 0.8477, + "step": 12253 + }, + { + "epoch": 0.8852926833673488, + "grad_norm": 7.20972595286601, + "learning_rate": 3.0803918358145796e-06, + "loss": 0.8153, + "step": 12254 + }, + { + "epoch": 0.8853649285675583, + "grad_norm": 5.732340688309245, + "learning_rate": 3.080107321090291e-06, + "loss": 0.7879, + "step": 12255 + }, + { + "epoch": 0.8854371737677678, + "grad_norm": 5.880061140715688, + "learning_rate": 3.0798227984247837e-06, + "loss": 0.7966, + "step": 12256 + }, + { + "epoch": 0.8855094189679773, + "grad_norm": 7.741292676593974, + "learning_rate": 3.079538267821953e-06, + "loss": 0.7884, + "step": 12257 + }, + { + "epoch": 0.8855816641681868, + "grad_norm": 6.863423536515983, + "learning_rate": 3.0792537292856933e-06, + "loss": 0.8172, + "step": 12258 + }, + { + "epoch": 0.8856539093683964, + "grad_norm": 6.2380754774883735, + "learning_rate": 3.0789691828199e-06, + "loss": 0.8132, + "step": 12259 + }, + { + "epoch": 0.8857261545686058, + "grad_norm": 6.402533816390159, + "learning_rate": 3.078684628428467e-06, + "loss": 0.8502, + "step": 12260 + }, + { + "epoch": 0.8857983997688154, + "grad_norm": 6.433591674730236, + "learning_rate": 3.0784000661152914e-06, + "loss": 0.8411, + "step": 12261 + }, + { + "epoch": 0.8858706449690249, + "grad_norm": 5.399183296326768, + "learning_rate": 3.0781154958842683e-06, + "loss": 0.7745, + "step": 12262 + }, + { + "epoch": 0.8859428901692343, + "grad_norm": 6.353727865441353, + "learning_rate": 3.0778309177392923e-06, + "loss": 0.8849, + "step": 12263 + }, + { + "epoch": 0.8860151353694439, + "grad_norm": 5.188275750307656, + "learning_rate": 3.077546331684261e-06, + "loss": 0.8247, + "step": 12264 + }, + { + "epoch": 0.8860873805696534, + "grad_norm": 5.749790685409808, + "learning_rate": 3.077261737723067e-06, + "loss": 0.773, + "step": 12265 + }, + { + "epoch": 0.886159625769863, + "grad_norm": 5.218285088760303, + "learning_rate": 3.076977135859609e-06, + "loss": 0.7856, + "step": 12266 + }, + { + "epoch": 0.8862318709700724, + "grad_norm": 6.353490708047514, + "learning_rate": 3.0766925260977827e-06, + "loss": 0.7909, + "step": 12267 + }, + { + "epoch": 0.8863041161702819, + "grad_norm": 7.692592019915217, + "learning_rate": 3.0764079084414822e-06, + "loss": 0.8202, + "step": 12268 + }, + { + "epoch": 0.8863763613704915, + "grad_norm": 6.561128018786575, + "learning_rate": 3.0761232828946053e-06, + "loss": 0.8616, + "step": 12269 + }, + { + "epoch": 0.886448606570701, + "grad_norm": 5.576374708986075, + "learning_rate": 3.0758386494610483e-06, + "loss": 0.7168, + "step": 12270 + }, + { + "epoch": 0.8865208517709104, + "grad_norm": 7.3572243303716185, + "learning_rate": 3.075554008144708e-06, + "loss": 0.8684, + "step": 12271 + }, + { + "epoch": 0.88659309697112, + "grad_norm": 7.5698922338839925, + "learning_rate": 3.0752693589494787e-06, + "loss": 0.8297, + "step": 12272 + }, + { + "epoch": 0.8866653421713295, + "grad_norm": 6.795738949598767, + "learning_rate": 3.0749847018792597e-06, + "loss": 0.8358, + "step": 12273 + }, + { + "epoch": 0.886737587371539, + "grad_norm": 6.052225431874019, + "learning_rate": 3.074700036937946e-06, + "loss": 0.8288, + "step": 12274 + }, + { + "epoch": 0.8868098325717485, + "grad_norm": 5.75141060153687, + "learning_rate": 3.074415364129435e-06, + "loss": 0.8175, + "step": 12275 + }, + { + "epoch": 0.886882077771958, + "grad_norm": 7.240292330999494, + "learning_rate": 3.074130683457624e-06, + "loss": 0.7703, + "step": 12276 + }, + { + "epoch": 0.8869543229721676, + "grad_norm": 6.700983732512595, + "learning_rate": 3.073845994926409e-06, + "loss": 0.8419, + "step": 12277 + }, + { + "epoch": 0.887026568172377, + "grad_norm": 6.280744997287126, + "learning_rate": 3.0735612985396897e-06, + "loss": 0.7895, + "step": 12278 + }, + { + "epoch": 0.8870988133725866, + "grad_norm": 6.546567340623061, + "learning_rate": 3.0732765943013594e-06, + "loss": 0.7191, + "step": 12279 + }, + { + "epoch": 0.8871710585727961, + "grad_norm": 5.501434832594598, + "learning_rate": 3.0729918822153188e-06, + "loss": 0.7695, + "step": 12280 + }, + { + "epoch": 0.8872433037730055, + "grad_norm": 6.485389721997546, + "learning_rate": 3.072707162285464e-06, + "loss": 0.8447, + "step": 12281 + }, + { + "epoch": 0.8873155489732151, + "grad_norm": 5.746265691470649, + "learning_rate": 3.0724224345156926e-06, + "loss": 0.775, + "step": 12282 + }, + { + "epoch": 0.8873877941734246, + "grad_norm": 6.3855452739797345, + "learning_rate": 3.0721376989099026e-06, + "loss": 0.8573, + "step": 12283 + }, + { + "epoch": 0.8874600393736342, + "grad_norm": 6.7984883816111905, + "learning_rate": 3.071852955471992e-06, + "loss": 0.8556, + "step": 12284 + }, + { + "epoch": 0.8875322845738436, + "grad_norm": 6.090491729577887, + "learning_rate": 3.0715682042058585e-06, + "loss": 0.7352, + "step": 12285 + }, + { + "epoch": 0.8876045297740531, + "grad_norm": 6.998073857835923, + "learning_rate": 3.0712834451154e-06, + "loss": 0.836, + "step": 12286 + }, + { + "epoch": 0.8876767749742627, + "grad_norm": 6.084432981554752, + "learning_rate": 3.070998678204515e-06, + "loss": 0.7856, + "step": 12287 + }, + { + "epoch": 0.8877490201744721, + "grad_norm": 6.89053420787691, + "learning_rate": 3.0707139034771013e-06, + "loss": 0.8702, + "step": 12288 + }, + { + "epoch": 0.8878212653746816, + "grad_norm": 6.551173144397199, + "learning_rate": 3.070429120937057e-06, + "loss": 0.8719, + "step": 12289 + }, + { + "epoch": 0.8878935105748912, + "grad_norm": 6.659208544651692, + "learning_rate": 3.0701443305882806e-06, + "loss": 0.7901, + "step": 12290 + }, + { + "epoch": 0.8879657557751007, + "grad_norm": 7.574106436925157, + "learning_rate": 3.069859532434672e-06, + "loss": 0.8258, + "step": 12291 + }, + { + "epoch": 0.8880380009753102, + "grad_norm": 6.56695754025022, + "learning_rate": 3.0695747264801286e-06, + "loss": 0.7954, + "step": 12292 + }, + { + "epoch": 0.8881102461755197, + "grad_norm": 5.59523698020236, + "learning_rate": 3.069289912728549e-06, + "loss": 0.8767, + "step": 12293 + }, + { + "epoch": 0.8881824913757292, + "grad_norm": 5.910950097462124, + "learning_rate": 3.0690050911838325e-06, + "loss": 0.7875, + "step": 12294 + }, + { + "epoch": 0.8882547365759388, + "grad_norm": 7.101359180859406, + "learning_rate": 3.0687202618498786e-06, + "loss": 0.8294, + "step": 12295 + }, + { + "epoch": 0.8883269817761482, + "grad_norm": 6.9159246146228694, + "learning_rate": 3.0684354247305857e-06, + "loss": 0.8581, + "step": 12296 + }, + { + "epoch": 0.8883992269763578, + "grad_norm": 7.669084527041683, + "learning_rate": 3.0681505798298527e-06, + "loss": 0.7467, + "step": 12297 + }, + { + "epoch": 0.8884714721765673, + "grad_norm": 5.506386949825715, + "learning_rate": 3.06786572715158e-06, + "loss": 0.7912, + "step": 12298 + }, + { + "epoch": 0.8885437173767767, + "grad_norm": 5.966627772255971, + "learning_rate": 3.0675808666996665e-06, + "loss": 0.817, + "step": 12299 + }, + { + "epoch": 0.8886159625769863, + "grad_norm": 5.035075374217138, + "learning_rate": 3.0672959984780115e-06, + "loss": 0.7807, + "step": 12300 + }, + { + "epoch": 0.8886882077771958, + "grad_norm": 6.787758231124561, + "learning_rate": 3.0670111224905146e-06, + "loss": 0.8065, + "step": 12301 + }, + { + "epoch": 0.8887604529774054, + "grad_norm": 5.9638236122169745, + "learning_rate": 3.066726238741076e-06, + "loss": 0.7278, + "step": 12302 + }, + { + "epoch": 0.8888326981776148, + "grad_norm": 9.382396171029885, + "learning_rate": 3.0664413472335945e-06, + "loss": 0.9062, + "step": 12303 + }, + { + "epoch": 0.8889049433778243, + "grad_norm": 6.463224105554676, + "learning_rate": 3.066156447971971e-06, + "loss": 0.823, + "step": 12304 + }, + { + "epoch": 0.8889771885780339, + "grad_norm": 6.537647089570791, + "learning_rate": 3.0658715409601057e-06, + "loss": 0.8055, + "step": 12305 + }, + { + "epoch": 0.8890494337782433, + "grad_norm": 6.765826004249251, + "learning_rate": 3.0655866262018987e-06, + "loss": 0.7913, + "step": 12306 + }, + { + "epoch": 0.8891216789784528, + "grad_norm": 7.9830789428615185, + "learning_rate": 3.0653017037012493e-06, + "loss": 0.86, + "step": 12307 + }, + { + "epoch": 0.8891939241786624, + "grad_norm": 7.035575383995734, + "learning_rate": 3.065016773462059e-06, + "loss": 0.8494, + "step": 12308 + }, + { + "epoch": 0.8892661693788719, + "grad_norm": 5.798483078000605, + "learning_rate": 3.0647318354882273e-06, + "loss": 0.7853, + "step": 12309 + }, + { + "epoch": 0.8893384145790814, + "grad_norm": 6.108388055172032, + "learning_rate": 3.0644468897836554e-06, + "loss": 0.813, + "step": 12310 + }, + { + "epoch": 0.8894106597792909, + "grad_norm": 11.249924723055441, + "learning_rate": 3.064161936352244e-06, + "loss": 0.7566, + "step": 12311 + }, + { + "epoch": 0.8894829049795004, + "grad_norm": 5.856753319736302, + "learning_rate": 3.063876975197893e-06, + "loss": 0.731, + "step": 12312 + }, + { + "epoch": 0.88955515017971, + "grad_norm": 8.135964844454241, + "learning_rate": 3.063592006324505e-06, + "loss": 0.801, + "step": 12313 + }, + { + "epoch": 0.8896273953799194, + "grad_norm": 6.497530835009888, + "learning_rate": 3.0633070297359797e-06, + "loss": 0.7664, + "step": 12314 + }, + { + "epoch": 0.889699640580129, + "grad_norm": 5.705435506929376, + "learning_rate": 3.0630220454362185e-06, + "loss": 0.8277, + "step": 12315 + }, + { + "epoch": 0.8897718857803385, + "grad_norm": 7.213010149124771, + "learning_rate": 3.062737053429123e-06, + "loss": 0.7488, + "step": 12316 + }, + { + "epoch": 0.8898441309805479, + "grad_norm": 7.58398638697501, + "learning_rate": 3.0624520537185935e-06, + "loss": 0.8676, + "step": 12317 + }, + { + "epoch": 0.8899163761807575, + "grad_norm": 5.954100521388502, + "learning_rate": 3.0621670463085324e-06, + "loss": 0.7988, + "step": 12318 + }, + { + "epoch": 0.889988621380967, + "grad_norm": 7.186911915479372, + "learning_rate": 3.0618820312028415e-06, + "loss": 0.7808, + "step": 12319 + }, + { + "epoch": 0.8900608665811766, + "grad_norm": 5.578857715685643, + "learning_rate": 3.0615970084054207e-06, + "loss": 0.8116, + "step": 12320 + }, + { + "epoch": 0.890133111781386, + "grad_norm": 5.688074774571207, + "learning_rate": 3.0613119779201738e-06, + "loss": 0.7689, + "step": 12321 + }, + { + "epoch": 0.8902053569815955, + "grad_norm": 7.11784110224351, + "learning_rate": 3.061026939751001e-06, + "loss": 0.799, + "step": 12322 + }, + { + "epoch": 0.8902776021818051, + "grad_norm": 5.496529958369367, + "learning_rate": 3.060741893901806e-06, + "loss": 0.8212, + "step": 12323 + }, + { + "epoch": 0.8903498473820145, + "grad_norm": 6.189181320948998, + "learning_rate": 3.0604568403764895e-06, + "loss": 0.8607, + "step": 12324 + }, + { + "epoch": 0.890422092582224, + "grad_norm": 6.535207033387564, + "learning_rate": 3.0601717791789537e-06, + "loss": 0.8358, + "step": 12325 + }, + { + "epoch": 0.8904943377824336, + "grad_norm": 6.259610835644713, + "learning_rate": 3.0598867103131015e-06, + "loss": 0.7749, + "step": 12326 + }, + { + "epoch": 0.8905665829826431, + "grad_norm": 5.807082738382494, + "learning_rate": 3.0596016337828344e-06, + "loss": 0.7582, + "step": 12327 + }, + { + "epoch": 0.8906388281828526, + "grad_norm": 5.007653291882258, + "learning_rate": 3.0593165495920564e-06, + "loss": 0.7427, + "step": 12328 + }, + { + "epoch": 0.8907110733830621, + "grad_norm": 8.125617840090877, + "learning_rate": 3.0590314577446685e-06, + "loss": 0.8657, + "step": 12329 + }, + { + "epoch": 0.8907833185832716, + "grad_norm": 6.499422194401898, + "learning_rate": 3.0587463582445743e-06, + "loss": 0.7843, + "step": 12330 + }, + { + "epoch": 0.8908555637834812, + "grad_norm": 5.454518662011206, + "learning_rate": 3.0584612510956755e-06, + "loss": 0.8661, + "step": 12331 + }, + { + "epoch": 0.8909278089836906, + "grad_norm": 6.3459781435672165, + "learning_rate": 3.0581761363018762e-06, + "loss": 0.785, + "step": 12332 + }, + { + "epoch": 0.8910000541839002, + "grad_norm": 6.4119588582699505, + "learning_rate": 3.0578910138670796e-06, + "loss": 0.7747, + "step": 12333 + }, + { + "epoch": 0.8910722993841097, + "grad_norm": 6.757487912183105, + "learning_rate": 3.0576058837951878e-06, + "loss": 0.8813, + "step": 12334 + }, + { + "epoch": 0.8911445445843191, + "grad_norm": 5.1685494509701675, + "learning_rate": 3.057320746090105e-06, + "loss": 0.7849, + "step": 12335 + }, + { + "epoch": 0.8912167897845287, + "grad_norm": 5.445353656979954, + "learning_rate": 3.057035600755732e-06, + "loss": 0.8016, + "step": 12336 + }, + { + "epoch": 0.8912890349847382, + "grad_norm": 5.771359122477604, + "learning_rate": 3.0567504477959764e-06, + "loss": 0.7967, + "step": 12337 + }, + { + "epoch": 0.8913612801849478, + "grad_norm": 6.156908029156534, + "learning_rate": 3.0564652872147384e-06, + "loss": 0.7928, + "step": 12338 + }, + { + "epoch": 0.8914335253851572, + "grad_norm": 6.924545151959328, + "learning_rate": 3.056180119015923e-06, + "loss": 0.8455, + "step": 12339 + }, + { + "epoch": 0.8915057705853667, + "grad_norm": 7.323726025093033, + "learning_rate": 3.055894943203433e-06, + "loss": 0.7925, + "step": 12340 + }, + { + "epoch": 0.8915780157855763, + "grad_norm": 5.186054258075227, + "learning_rate": 3.055609759781173e-06, + "loss": 0.816, + "step": 12341 + }, + { + "epoch": 0.8916502609857857, + "grad_norm": 8.001791753391764, + "learning_rate": 3.0553245687530474e-06, + "loss": 0.8441, + "step": 12342 + }, + { + "epoch": 0.8917225061859952, + "grad_norm": 7.292416633911479, + "learning_rate": 3.055039370122959e-06, + "loss": 0.8824, + "step": 12343 + }, + { + "epoch": 0.8917947513862048, + "grad_norm": 7.769469877051468, + "learning_rate": 3.054754163894813e-06, + "loss": 0.8206, + "step": 12344 + }, + { + "epoch": 0.8918669965864143, + "grad_norm": 8.535483403341075, + "learning_rate": 3.054468950072513e-06, + "loss": 0.8466, + "step": 12345 + }, + { + "epoch": 0.8919392417866238, + "grad_norm": 6.5939617394172405, + "learning_rate": 3.0541837286599634e-06, + "loss": 0.8802, + "step": 12346 + }, + { + "epoch": 0.8920114869868333, + "grad_norm": 5.167173524831324, + "learning_rate": 3.0538984996610693e-06, + "loss": 0.8031, + "step": 12347 + }, + { + "epoch": 0.8920837321870428, + "grad_norm": 5.502846761381607, + "learning_rate": 3.053613263079735e-06, + "loss": 0.8696, + "step": 12348 + }, + { + "epoch": 0.8921559773872524, + "grad_norm": 5.9513202644980385, + "learning_rate": 3.053328018919865e-06, + "loss": 0.8053, + "step": 12349 + }, + { + "epoch": 0.8922282225874618, + "grad_norm": 9.712624536057078, + "learning_rate": 3.053042767185362e-06, + "loss": 0.7897, + "step": 12350 + }, + { + "epoch": 0.8923004677876714, + "grad_norm": 6.976198603419036, + "learning_rate": 3.0527575078801358e-06, + "loss": 0.7761, + "step": 12351 + }, + { + "epoch": 0.8923727129878809, + "grad_norm": 8.696972848696094, + "learning_rate": 3.052472241008087e-06, + "loss": 0.801, + "step": 12352 + }, + { + "epoch": 0.8924449581880903, + "grad_norm": 5.023330616147615, + "learning_rate": 3.052186966573123e-06, + "loss": 0.7928, + "step": 12353 + }, + { + "epoch": 0.8925172033882999, + "grad_norm": 7.099634628565815, + "learning_rate": 3.051901684579147e-06, + "loss": 0.9359, + "step": 12354 + }, + { + "epoch": 0.8925894485885094, + "grad_norm": 5.802772503871701, + "learning_rate": 3.0516163950300663e-06, + "loss": 0.7898, + "step": 12355 + }, + { + "epoch": 0.892661693788719, + "grad_norm": 5.91422472067395, + "learning_rate": 3.051331097929785e-06, + "loss": 0.8172, + "step": 12356 + }, + { + "epoch": 0.8927339389889284, + "grad_norm": 5.0503392096177215, + "learning_rate": 3.0510457932822094e-06, + "loss": 0.805, + "step": 12357 + }, + { + "epoch": 0.8928061841891379, + "grad_norm": 8.101361440519822, + "learning_rate": 3.0507604810912455e-06, + "loss": 0.8539, + "step": 12358 + }, + { + "epoch": 0.8928784293893475, + "grad_norm": 7.72128192595492, + "learning_rate": 3.050475161360797e-06, + "loss": 0.7608, + "step": 12359 + }, + { + "epoch": 0.8929506745895569, + "grad_norm": 6.946169273644647, + "learning_rate": 3.050189834094771e-06, + "loss": 0.7629, + "step": 12360 + }, + { + "epoch": 0.8930229197897664, + "grad_norm": 5.941767393794787, + "learning_rate": 3.049904499297074e-06, + "loss": 0.7637, + "step": 12361 + }, + { + "epoch": 0.893095164989976, + "grad_norm": 5.374830021499263, + "learning_rate": 3.0496191569716116e-06, + "loss": 0.7519, + "step": 12362 + }, + { + "epoch": 0.8931674101901855, + "grad_norm": 7.663451750798488, + "learning_rate": 3.049333807122289e-06, + "loss": 0.7921, + "step": 12363 + }, + { + "epoch": 0.893239655390395, + "grad_norm": 5.752540690490764, + "learning_rate": 3.0490484497530125e-06, + "loss": 0.829, + "step": 12364 + }, + { + "epoch": 0.8933119005906045, + "grad_norm": 5.920105199136891, + "learning_rate": 3.0487630848676907e-06, + "loss": 0.8054, + "step": 12365 + }, + { + "epoch": 0.893384145790814, + "grad_norm": 7.742544956982083, + "learning_rate": 3.048477712470227e-06, + "loss": 0.8465, + "step": 12366 + }, + { + "epoch": 0.8934563909910236, + "grad_norm": 8.560986016072144, + "learning_rate": 3.04819233256453e-06, + "loss": 0.8314, + "step": 12367 + }, + { + "epoch": 0.893528636191233, + "grad_norm": 7.205166541880919, + "learning_rate": 3.0479069451545058e-06, + "loss": 0.8124, + "step": 12368 + }, + { + "epoch": 0.8936008813914426, + "grad_norm": 6.138239351012818, + "learning_rate": 3.0476215502440606e-06, + "loss": 0.7837, + "step": 12369 + }, + { + "epoch": 0.8936731265916521, + "grad_norm": 5.07474440508855, + "learning_rate": 3.047336147837101e-06, + "loss": 0.8359, + "step": 12370 + }, + { + "epoch": 0.8937453717918615, + "grad_norm": 6.233997763786612, + "learning_rate": 3.0470507379375354e-06, + "loss": 0.8447, + "step": 12371 + }, + { + "epoch": 0.8938176169920711, + "grad_norm": 6.559857226888266, + "learning_rate": 3.0467653205492693e-06, + "loss": 0.7428, + "step": 12372 + }, + { + "epoch": 0.8938898621922806, + "grad_norm": 7.329526722449251, + "learning_rate": 3.046479895676211e-06, + "loss": 0.7836, + "step": 12373 + }, + { + "epoch": 0.8939621073924902, + "grad_norm": 6.364599682609508, + "learning_rate": 3.0461944633222673e-06, + "loss": 0.8351, + "step": 12374 + }, + { + "epoch": 0.8940343525926996, + "grad_norm": 5.972305916290932, + "learning_rate": 3.0459090234913445e-06, + "loss": 0.9225, + "step": 12375 + }, + { + "epoch": 0.8941065977929091, + "grad_norm": 5.309871247593073, + "learning_rate": 3.0456235761873528e-06, + "loss": 0.7915, + "step": 12376 + }, + { + "epoch": 0.8941788429931187, + "grad_norm": 7.796774701102122, + "learning_rate": 3.0453381214141966e-06, + "loss": 0.8494, + "step": 12377 + }, + { + "epoch": 0.8942510881933281, + "grad_norm": 5.045521085083237, + "learning_rate": 3.045052659175785e-06, + "loss": 0.7808, + "step": 12378 + }, + { + "epoch": 0.8943233333935376, + "grad_norm": 5.56104490986199, + "learning_rate": 3.0447671894760257e-06, + "loss": 0.7466, + "step": 12379 + }, + { + "epoch": 0.8943955785937472, + "grad_norm": 5.681355605282235, + "learning_rate": 3.0444817123188274e-06, + "loss": 0.8995, + "step": 12380 + }, + { + "epoch": 0.8944678237939567, + "grad_norm": 5.219846690181058, + "learning_rate": 3.044196227708097e-06, + "loss": 0.775, + "step": 12381 + }, + { + "epoch": 0.8945400689941662, + "grad_norm": 8.09329234121752, + "learning_rate": 3.043910735647742e-06, + "loss": 0.8415, + "step": 12382 + }, + { + "epoch": 0.8946123141943757, + "grad_norm": 7.3602506922444935, + "learning_rate": 3.043625236141672e-06, + "loss": 0.787, + "step": 12383 + }, + { + "epoch": 0.8946845593945852, + "grad_norm": 5.908181172479877, + "learning_rate": 3.0433397291937937e-06, + "loss": 0.7857, + "step": 12384 + }, + { + "epoch": 0.8947568045947948, + "grad_norm": 6.794295319093559, + "learning_rate": 3.0430542148080176e-06, + "loss": 0.7527, + "step": 12385 + }, + { + "epoch": 0.8948290497950042, + "grad_norm": 6.8352293159956306, + "learning_rate": 3.0427686929882505e-06, + "loss": 0.8478, + "step": 12386 + }, + { + "epoch": 0.8949012949952138, + "grad_norm": 6.749078475778512, + "learning_rate": 3.0424831637384015e-06, + "loss": 0.8365, + "step": 12387 + }, + { + "epoch": 0.8949735401954233, + "grad_norm": 6.807239426225364, + "learning_rate": 3.042197627062379e-06, + "loss": 0.8563, + "step": 12388 + }, + { + "epoch": 0.8950457853956327, + "grad_norm": 5.716114541723386, + "learning_rate": 3.041912082964092e-06, + "loss": 0.7682, + "step": 12389 + }, + { + "epoch": 0.8951180305958423, + "grad_norm": 6.1594815365477205, + "learning_rate": 3.04162653144745e-06, + "loss": 0.8225, + "step": 12390 + }, + { + "epoch": 0.8951902757960518, + "grad_norm": 6.125633985444511, + "learning_rate": 3.0413409725163606e-06, + "loss": 0.8141, + "step": 12391 + }, + { + "epoch": 0.8952625209962614, + "grad_norm": 8.061026031842047, + "learning_rate": 3.041055406174734e-06, + "loss": 0.764, + "step": 12392 + }, + { + "epoch": 0.8953347661964708, + "grad_norm": 6.252758790540043, + "learning_rate": 3.0407698324264777e-06, + "loss": 0.7617, + "step": 12393 + }, + { + "epoch": 0.8954070113966803, + "grad_norm": 7.159850168631733, + "learning_rate": 3.040484251275504e-06, + "loss": 0.7401, + "step": 12394 + }, + { + "epoch": 0.8954792565968899, + "grad_norm": 7.172080575161669, + "learning_rate": 3.0401986627257198e-06, + "loss": 0.8304, + "step": 12395 + }, + { + "epoch": 0.8955515017970993, + "grad_norm": 7.151867719941758, + "learning_rate": 3.0399130667810356e-06, + "loss": 0.7574, + "step": 12396 + }, + { + "epoch": 0.8956237469973088, + "grad_norm": 7.377593796643421, + "learning_rate": 3.0396274634453605e-06, + "loss": 0.8624, + "step": 12397 + }, + { + "epoch": 0.8956959921975184, + "grad_norm": 6.20266652755103, + "learning_rate": 3.0393418527226043e-06, + "loss": 0.7989, + "step": 12398 + }, + { + "epoch": 0.8957682373977279, + "grad_norm": 6.387177449124266, + "learning_rate": 3.0390562346166774e-06, + "loss": 0.848, + "step": 12399 + }, + { + "epoch": 0.8958404825979374, + "grad_norm": 5.396318431046572, + "learning_rate": 3.0387706091314888e-06, + "loss": 0.8074, + "step": 12400 + }, + { + "epoch": 0.8959127277981469, + "grad_norm": 7.298880389329869, + "learning_rate": 3.0384849762709497e-06, + "loss": 0.7726, + "step": 12401 + }, + { + "epoch": 0.8959849729983564, + "grad_norm": 5.216760130381171, + "learning_rate": 3.0381993360389683e-06, + "loss": 0.8195, + "step": 12402 + }, + { + "epoch": 0.896057218198566, + "grad_norm": 7.541244017277587, + "learning_rate": 3.0379136884394554e-06, + "loss": 0.8667, + "step": 12403 + }, + { + "epoch": 0.8961294633987754, + "grad_norm": 5.581779244716076, + "learning_rate": 3.037628033476323e-06, + "loss": 0.7717, + "step": 12404 + }, + { + "epoch": 0.896201708598985, + "grad_norm": 6.168633259110189, + "learning_rate": 3.03734237115348e-06, + "loss": 0.8147, + "step": 12405 + }, + { + "epoch": 0.8962739537991945, + "grad_norm": 6.917482657347055, + "learning_rate": 3.0370567014748374e-06, + "loss": 0.8138, + "step": 12406 + }, + { + "epoch": 0.8963461989994039, + "grad_norm": 6.111413327845657, + "learning_rate": 3.0367710244443046e-06, + "loss": 0.8366, + "step": 12407 + }, + { + "epoch": 0.8964184441996135, + "grad_norm": 8.958334800069526, + "learning_rate": 3.036485340065794e-06, + "loss": 0.7896, + "step": 12408 + }, + { + "epoch": 0.896490689399823, + "grad_norm": 6.916713055681066, + "learning_rate": 3.0361996483432155e-06, + "loss": 0.8319, + "step": 12409 + }, + { + "epoch": 0.8965629346000326, + "grad_norm": 6.378702621807288, + "learning_rate": 3.0359139492804803e-06, + "loss": 0.8442, + "step": 12410 + }, + { + "epoch": 0.896635179800242, + "grad_norm": 7.422362237296198, + "learning_rate": 3.0356282428814987e-06, + "loss": 0.7937, + "step": 12411 + }, + { + "epoch": 0.8967074250004515, + "grad_norm": 8.40729411281159, + "learning_rate": 3.0353425291501827e-06, + "loss": 0.8268, + "step": 12412 + }, + { + "epoch": 0.8967796702006611, + "grad_norm": 8.451092657287031, + "learning_rate": 3.035056808090443e-06, + "loss": 0.8795, + "step": 12413 + }, + { + "epoch": 0.8968519154008705, + "grad_norm": 6.548529295299555, + "learning_rate": 3.034771079706192e-06, + "loss": 0.8218, + "step": 12414 + }, + { + "epoch": 0.89692416060108, + "grad_norm": 7.445455703879229, + "learning_rate": 3.0344853440013393e-06, + "loss": 0.8926, + "step": 12415 + }, + { + "epoch": 0.8969964058012896, + "grad_norm": 6.318151380367075, + "learning_rate": 3.034199600979797e-06, + "loss": 0.8194, + "step": 12416 + }, + { + "epoch": 0.8970686510014991, + "grad_norm": 6.385321844080393, + "learning_rate": 3.0339138506454776e-06, + "loss": 0.8162, + "step": 12417 + }, + { + "epoch": 0.8971408962017086, + "grad_norm": 7.160302759514444, + "learning_rate": 3.033628093002291e-06, + "loss": 0.7718, + "step": 12418 + }, + { + "epoch": 0.8972131414019181, + "grad_norm": 9.59526760759833, + "learning_rate": 3.0333423280541513e-06, + "loss": 0.8739, + "step": 12419 + }, + { + "epoch": 0.8972853866021276, + "grad_norm": 6.371883752656905, + "learning_rate": 3.033056555804969e-06, + "loss": 0.8845, + "step": 12420 + }, + { + "epoch": 0.8973576318023372, + "grad_norm": 5.312968334423141, + "learning_rate": 3.0327707762586558e-06, + "loss": 0.7405, + "step": 12421 + }, + { + "epoch": 0.8974298770025466, + "grad_norm": 5.4927764620198, + "learning_rate": 3.0324849894191244e-06, + "loss": 0.7683, + "step": 12422 + }, + { + "epoch": 0.8975021222027562, + "grad_norm": 6.05356308863757, + "learning_rate": 3.032199195290288e-06, + "loss": 0.8312, + "step": 12423 + }, + { + "epoch": 0.8975743674029657, + "grad_norm": 6.610869398782861, + "learning_rate": 3.0319133938760577e-06, + "loss": 0.8961, + "step": 12424 + }, + { + "epoch": 0.8976466126031751, + "grad_norm": 6.983753970397509, + "learning_rate": 3.031627585180345e-06, + "loss": 0.8124, + "step": 12425 + }, + { + "epoch": 0.8977188578033847, + "grad_norm": 5.735082099413375, + "learning_rate": 3.0313417692070634e-06, + "loss": 0.8343, + "step": 12426 + }, + { + "epoch": 0.8977911030035942, + "grad_norm": 6.957942228309332, + "learning_rate": 3.0310559459601265e-06, + "loss": 0.8243, + "step": 12427 + }, + { + "epoch": 0.8978633482038038, + "grad_norm": 6.2577265472795585, + "learning_rate": 3.0307701154434454e-06, + "loss": 0.7563, + "step": 12428 + }, + { + "epoch": 0.8979355934040132, + "grad_norm": 7.63431392962232, + "learning_rate": 3.0304842776609335e-06, + "loss": 0.7546, + "step": 12429 + }, + { + "epoch": 0.8980078386042227, + "grad_norm": 6.942562421298984, + "learning_rate": 3.0301984326165045e-06, + "loss": 0.8913, + "step": 12430 + }, + { + "epoch": 0.8980800838044323, + "grad_norm": 7.520941679817807, + "learning_rate": 3.02991258031407e-06, + "loss": 0.7637, + "step": 12431 + }, + { + "epoch": 0.8981523290046417, + "grad_norm": 6.629167217811675, + "learning_rate": 3.0296267207575436e-06, + "loss": 0.7686, + "step": 12432 + }, + { + "epoch": 0.8982245742048512, + "grad_norm": 5.865211936458078, + "learning_rate": 3.029340853950839e-06, + "loss": 0.8087, + "step": 12433 + }, + { + "epoch": 0.8982968194050608, + "grad_norm": 5.770190408563154, + "learning_rate": 3.0290549798978687e-06, + "loss": 0.8496, + "step": 12434 + }, + { + "epoch": 0.8983690646052703, + "grad_norm": 4.470377378982949, + "learning_rate": 3.0287690986025462e-06, + "loss": 0.7326, + "step": 12435 + }, + { + "epoch": 0.8984413098054798, + "grad_norm": 7.639952709990336, + "learning_rate": 3.028483210068786e-06, + "loss": 0.8741, + "step": 12436 + }, + { + "epoch": 0.8985135550056893, + "grad_norm": 5.483750701758169, + "learning_rate": 3.0281973143005017e-06, + "loss": 0.7917, + "step": 12437 + }, + { + "epoch": 0.8985858002058988, + "grad_norm": 5.341231438595377, + "learning_rate": 3.027911411301605e-06, + "loss": 0.8106, + "step": 12438 + }, + { + "epoch": 0.8986580454061084, + "grad_norm": 6.523245693287966, + "learning_rate": 3.027625501076012e-06, + "loss": 0.7643, + "step": 12439 + }, + { + "epoch": 0.8987302906063178, + "grad_norm": 5.892368048645903, + "learning_rate": 3.0273395836276344e-06, + "loss": 0.7834, + "step": 12440 + }, + { + "epoch": 0.8988025358065274, + "grad_norm": 7.138034075047763, + "learning_rate": 3.0270536589603876e-06, + "loss": 0.7364, + "step": 12441 + }, + { + "epoch": 0.8988747810067369, + "grad_norm": 6.26435667980491, + "learning_rate": 3.0267677270781865e-06, + "loss": 0.7761, + "step": 12442 + }, + { + "epoch": 0.8989470262069463, + "grad_norm": 6.612676134269566, + "learning_rate": 3.0264817879849433e-06, + "loss": 0.8514, + "step": 12443 + }, + { + "epoch": 0.8990192714071559, + "grad_norm": 7.493800080221916, + "learning_rate": 3.0261958416845737e-06, + "loss": 0.8682, + "step": 12444 + }, + { + "epoch": 0.8990915166073654, + "grad_norm": 8.368503613082089, + "learning_rate": 3.0259098881809906e-06, + "loss": 0.8762, + "step": 12445 + }, + { + "epoch": 0.899163761807575, + "grad_norm": 5.642002772241646, + "learning_rate": 3.0256239274781107e-06, + "loss": 0.8136, + "step": 12446 + }, + { + "epoch": 0.8992360070077844, + "grad_norm": 6.032688423941794, + "learning_rate": 3.0253379595798472e-06, + "loss": 0.8305, + "step": 12447 + }, + { + "epoch": 0.8993082522079939, + "grad_norm": 6.854832424616964, + "learning_rate": 3.025051984490114e-06, + "loss": 0.8575, + "step": 12448 + }, + { + "epoch": 0.8993804974082035, + "grad_norm": 6.544713503378784, + "learning_rate": 3.024766002212827e-06, + "loss": 0.7601, + "step": 12449 + }, + { + "epoch": 0.8994527426084129, + "grad_norm": 6.656400150521841, + "learning_rate": 3.0244800127519013e-06, + "loss": 0.8136, + "step": 12450 + }, + { + "epoch": 0.8995249878086224, + "grad_norm": 5.311245938553383, + "learning_rate": 3.024194016111252e-06, + "loss": 0.747, + "step": 12451 + }, + { + "epoch": 0.899597233008832, + "grad_norm": 6.684938029171057, + "learning_rate": 3.0239080122947927e-06, + "loss": 0.8441, + "step": 12452 + }, + { + "epoch": 0.8996694782090415, + "grad_norm": 6.3218103621417505, + "learning_rate": 3.02362200130644e-06, + "loss": 0.812, + "step": 12453 + }, + { + "epoch": 0.899741723409251, + "grad_norm": 5.785706158452322, + "learning_rate": 3.0233359831501087e-06, + "loss": 0.7764, + "step": 12454 + }, + { + "epoch": 0.8998139686094605, + "grad_norm": 6.117391414824457, + "learning_rate": 3.0230499578297136e-06, + "loss": 0.7579, + "step": 12455 + }, + { + "epoch": 0.89988621380967, + "grad_norm": 5.847147723901539, + "learning_rate": 3.0227639253491715e-06, + "loss": 0.7326, + "step": 12456 + }, + { + "epoch": 0.8999584590098796, + "grad_norm": 4.908921213139058, + "learning_rate": 3.0224778857123966e-06, + "loss": 0.7748, + "step": 12457 + }, + { + "epoch": 0.900030704210089, + "grad_norm": 6.299136680911495, + "learning_rate": 3.022191838923305e-06, + "loss": 0.8085, + "step": 12458 + }, + { + "epoch": 0.9001029494102986, + "grad_norm": 4.918837418318638, + "learning_rate": 3.0219057849858126e-06, + "loss": 0.7898, + "step": 12459 + }, + { + "epoch": 0.9001751946105081, + "grad_norm": 5.398261546291359, + "learning_rate": 3.021619723903836e-06, + "loss": 0.7898, + "step": 12460 + }, + { + "epoch": 0.9002474398107175, + "grad_norm": 5.437357758163315, + "learning_rate": 3.0213336556812893e-06, + "loss": 0.847, + "step": 12461 + }, + { + "epoch": 0.9003196850109271, + "grad_norm": 5.169511791029476, + "learning_rate": 3.021047580322091e-06, + "loss": 0.754, + "step": 12462 + }, + { + "epoch": 0.9003919302111366, + "grad_norm": 5.375970974537287, + "learning_rate": 3.020761497830155e-06, + "loss": 0.7292, + "step": 12463 + }, + { + "epoch": 0.9004641754113462, + "grad_norm": 6.49534234651926, + "learning_rate": 3.020475408209398e-06, + "loss": 0.8386, + "step": 12464 + }, + { + "epoch": 0.9005364206115556, + "grad_norm": 6.377881165721301, + "learning_rate": 3.0201893114637376e-06, + "loss": 0.8903, + "step": 12465 + }, + { + "epoch": 0.9006086658117651, + "grad_norm": 6.216696548730998, + "learning_rate": 3.0199032075970887e-06, + "loss": 0.6787, + "step": 12466 + }, + { + "epoch": 0.9006809110119747, + "grad_norm": 5.68142644190655, + "learning_rate": 3.019617096613369e-06, + "loss": 0.7551, + "step": 12467 + }, + { + "epoch": 0.9007531562121841, + "grad_norm": 6.609901641714101, + "learning_rate": 3.019330978516495e-06, + "loss": 0.7773, + "step": 12468 + }, + { + "epoch": 0.9008254014123936, + "grad_norm": 6.220520994194054, + "learning_rate": 3.0190448533103827e-06, + "loss": 0.8138, + "step": 12469 + }, + { + "epoch": 0.9008976466126032, + "grad_norm": 8.396658605208495, + "learning_rate": 3.0187587209989496e-06, + "loss": 0.7507, + "step": 12470 + }, + { + "epoch": 0.9009698918128127, + "grad_norm": 5.434695869085328, + "learning_rate": 3.018472581586112e-06, + "loss": 0.8066, + "step": 12471 + }, + { + "epoch": 0.9010421370130222, + "grad_norm": 7.044487322420432, + "learning_rate": 3.0181864350757885e-06, + "loss": 0.8508, + "step": 12472 + }, + { + "epoch": 0.9011143822132317, + "grad_norm": 8.365986998815828, + "learning_rate": 3.0179002814718935e-06, + "loss": 0.8399, + "step": 12473 + }, + { + "epoch": 0.9011866274134412, + "grad_norm": 6.838620009388773, + "learning_rate": 3.0176141207783467e-06, + "loss": 0.8381, + "step": 12474 + }, + { + "epoch": 0.9012588726136508, + "grad_norm": 7.970426973897963, + "learning_rate": 3.017327952999064e-06, + "loss": 0.8446, + "step": 12475 + }, + { + "epoch": 0.9013311178138602, + "grad_norm": 5.815406318828992, + "learning_rate": 3.0170417781379637e-06, + "loss": 0.7614, + "step": 12476 + }, + { + "epoch": 0.9014033630140698, + "grad_norm": 5.601244120680948, + "learning_rate": 3.0167555961989625e-06, + "loss": 0.7948, + "step": 12477 + }, + { + "epoch": 0.9014756082142793, + "grad_norm": 5.820864002289892, + "learning_rate": 3.0164694071859783e-06, + "loss": 0.8149, + "step": 12478 + }, + { + "epoch": 0.9015478534144887, + "grad_norm": 8.680857362374896, + "learning_rate": 3.0161832111029295e-06, + "loss": 0.8591, + "step": 12479 + }, + { + "epoch": 0.9016200986146983, + "grad_norm": 8.911942653697507, + "learning_rate": 3.0158970079537337e-06, + "loss": 0.8812, + "step": 12480 + }, + { + "epoch": 0.9016923438149078, + "grad_norm": 10.036658424450916, + "learning_rate": 3.0156107977423076e-06, + "loss": 0.8819, + "step": 12481 + }, + { + "epoch": 0.9017645890151174, + "grad_norm": 5.189747518818428, + "learning_rate": 3.0153245804725707e-06, + "loss": 0.8642, + "step": 12482 + }, + { + "epoch": 0.9018368342153268, + "grad_norm": 7.355678788324822, + "learning_rate": 3.0150383561484397e-06, + "loss": 0.876, + "step": 12483 + }, + { + "epoch": 0.9019090794155363, + "grad_norm": 6.029710958711493, + "learning_rate": 3.014752124773833e-06, + "loss": 0.7998, + "step": 12484 + }, + { + "epoch": 0.9019813246157459, + "grad_norm": 6.497243736999446, + "learning_rate": 3.014465886352671e-06, + "loss": 0.8473, + "step": 12485 + }, + { + "epoch": 0.9020535698159553, + "grad_norm": 7.058250836907752, + "learning_rate": 3.0141796408888697e-06, + "loss": 0.7495, + "step": 12486 + }, + { + "epoch": 0.9021258150161648, + "grad_norm": 8.598636306164112, + "learning_rate": 3.0138933883863485e-06, + "loss": 0.8095, + "step": 12487 + }, + { + "epoch": 0.9021980602163744, + "grad_norm": 6.38595955445635, + "learning_rate": 3.013607128849025e-06, + "loss": 0.8614, + "step": 12488 + }, + { + "epoch": 0.9022703054165839, + "grad_norm": 6.645138918513133, + "learning_rate": 3.0133208622808195e-06, + "loss": 0.7728, + "step": 12489 + }, + { + "epoch": 0.9023425506167934, + "grad_norm": 5.555532313404209, + "learning_rate": 3.0130345886856506e-06, + "loss": 0.7845, + "step": 12490 + }, + { + "epoch": 0.9024147958170029, + "grad_norm": 5.616992591585445, + "learning_rate": 3.012748308067435e-06, + "loss": 0.8654, + "step": 12491 + }, + { + "epoch": 0.9024870410172124, + "grad_norm": 5.407130528004202, + "learning_rate": 3.0124620204300944e-06, + "loss": 0.7953, + "step": 12492 + }, + { + "epoch": 0.902559286217422, + "grad_norm": 6.278929931930112, + "learning_rate": 3.012175725777546e-06, + "loss": 0.8247, + "step": 12493 + }, + { + "epoch": 0.9026315314176314, + "grad_norm": 5.920944422575221, + "learning_rate": 3.0118894241137096e-06, + "loss": 0.8173, + "step": 12494 + }, + { + "epoch": 0.902703776617841, + "grad_norm": 5.5884979995296655, + "learning_rate": 3.0116031154425045e-06, + "loss": 0.8046, + "step": 12495 + }, + { + "epoch": 0.9027760218180505, + "grad_norm": 6.14838600379483, + "learning_rate": 3.0113167997678505e-06, + "loss": 0.7925, + "step": 12496 + }, + { + "epoch": 0.9028482670182599, + "grad_norm": 7.012267126884789, + "learning_rate": 3.011030477093666e-06, + "loss": 0.7872, + "step": 12497 + }, + { + "epoch": 0.9029205122184695, + "grad_norm": 5.345441227143162, + "learning_rate": 3.010744147423871e-06, + "loss": 0.7421, + "step": 12498 + }, + { + "epoch": 0.902992757418679, + "grad_norm": 5.42770081544366, + "learning_rate": 3.0104578107623854e-06, + "loss": 0.819, + "step": 12499 + }, + { + "epoch": 0.9030650026188886, + "grad_norm": 5.825434250259938, + "learning_rate": 3.010171467113129e-06, + "loss": 0.8146, + "step": 12500 + }, + { + "epoch": 0.903137247819098, + "grad_norm": 5.561998323255805, + "learning_rate": 3.0098851164800208e-06, + "loss": 0.8074, + "step": 12501 + }, + { + "epoch": 0.9032094930193075, + "grad_norm": 5.561330951042339, + "learning_rate": 3.0095987588669813e-06, + "loss": 0.8335, + "step": 12502 + }, + { + "epoch": 0.9032817382195171, + "grad_norm": 6.876873801374378, + "learning_rate": 3.0093123942779306e-06, + "loss": 0.879, + "step": 12503 + }, + { + "epoch": 0.9033539834197265, + "grad_norm": 9.311086252928423, + "learning_rate": 3.009026022716788e-06, + "loss": 0.8622, + "step": 12504 + }, + { + "epoch": 0.903426228619936, + "grad_norm": 6.828930839980467, + "learning_rate": 3.0087396441874763e-06, + "loss": 0.9034, + "step": 12505 + }, + { + "epoch": 0.9034984738201456, + "grad_norm": 6.89643846451725, + "learning_rate": 3.008453258693912e-06, + "loss": 0.9339, + "step": 12506 + }, + { + "epoch": 0.9035707190203551, + "grad_norm": 5.782643742999154, + "learning_rate": 3.0081668662400184e-06, + "loss": 0.7321, + "step": 12507 + }, + { + "epoch": 0.9036429642205646, + "grad_norm": 6.141319177425455, + "learning_rate": 3.0078804668297146e-06, + "loss": 0.711, + "step": 12508 + }, + { + "epoch": 0.9037152094207741, + "grad_norm": 5.582439730669523, + "learning_rate": 3.0075940604669213e-06, + "loss": 0.7957, + "step": 12509 + }, + { + "epoch": 0.9037874546209836, + "grad_norm": 6.257196784238105, + "learning_rate": 3.00730764715556e-06, + "loss": 0.8626, + "step": 12510 + }, + { + "epoch": 0.9038596998211931, + "grad_norm": 6.124911249257355, + "learning_rate": 3.007021226899551e-06, + "loss": 0.7918, + "step": 12511 + }, + { + "epoch": 0.9039319450214026, + "grad_norm": 6.78049897284859, + "learning_rate": 3.006734799702814e-06, + "loss": 0.8645, + "step": 12512 + }, + { + "epoch": 0.9040041902216122, + "grad_norm": 5.694018977121204, + "learning_rate": 3.0064483655692723e-06, + "loss": 0.7723, + "step": 12513 + }, + { + "epoch": 0.9040764354218217, + "grad_norm": 7.583959476765078, + "learning_rate": 3.006161924502845e-06, + "loss": 0.8152, + "step": 12514 + }, + { + "epoch": 0.9041486806220311, + "grad_norm": 6.574818992298886, + "learning_rate": 3.0058754765074543e-06, + "loss": 0.8481, + "step": 12515 + }, + { + "epoch": 0.9042209258222407, + "grad_norm": 7.316318395344789, + "learning_rate": 3.0055890215870205e-06, + "loss": 0.7803, + "step": 12516 + }, + { + "epoch": 0.9042931710224502, + "grad_norm": 6.641239631482472, + "learning_rate": 3.005302559745466e-06, + "loss": 0.7834, + "step": 12517 + }, + { + "epoch": 0.9043654162226598, + "grad_norm": 6.202526303857875, + "learning_rate": 3.0050160909867114e-06, + "loss": 0.9191, + "step": 12518 + }, + { + "epoch": 0.9044376614228692, + "grad_norm": 6.693946939486607, + "learning_rate": 3.0047296153146797e-06, + "loss": 0.7401, + "step": 12519 + }, + { + "epoch": 0.9045099066230787, + "grad_norm": 6.675209516458869, + "learning_rate": 3.0044431327332906e-06, + "loss": 0.868, + "step": 12520 + }, + { + "epoch": 0.9045821518232883, + "grad_norm": 6.686517946383936, + "learning_rate": 3.004156643246466e-06, + "loss": 0.8033, + "step": 12521 + }, + { + "epoch": 0.9046543970234977, + "grad_norm": 6.055027397213481, + "learning_rate": 3.00387014685813e-06, + "loss": 0.8383, + "step": 12522 + }, + { + "epoch": 0.9047266422237072, + "grad_norm": 7.182515414493961, + "learning_rate": 3.003583643572202e-06, + "loss": 0.7498, + "step": 12523 + }, + { + "epoch": 0.9047988874239168, + "grad_norm": 7.9850500609011625, + "learning_rate": 3.003297133392606e-06, + "loss": 0.8047, + "step": 12524 + }, + { + "epoch": 0.9048711326241263, + "grad_norm": 6.970851974629591, + "learning_rate": 3.003010616323262e-06, + "loss": 0.8371, + "step": 12525 + }, + { + "epoch": 0.9049433778243358, + "grad_norm": 5.161209957589243, + "learning_rate": 3.0027240923680928e-06, + "loss": 0.7604, + "step": 12526 + }, + { + "epoch": 0.9050156230245453, + "grad_norm": 6.8143677557170514, + "learning_rate": 3.0024375615310215e-06, + "loss": 0.8193, + "step": 12527 + }, + { + "epoch": 0.9050878682247548, + "grad_norm": 6.018733977457905, + "learning_rate": 3.0021510238159707e-06, + "loss": 0.7889, + "step": 12528 + }, + { + "epoch": 0.9051601134249643, + "grad_norm": 6.061645132926317, + "learning_rate": 3.0018644792268613e-06, + "loss": 0.7846, + "step": 12529 + }, + { + "epoch": 0.9052323586251738, + "grad_norm": 6.906135385520238, + "learning_rate": 3.0015779277676175e-06, + "loss": 0.8441, + "step": 12530 + }, + { + "epoch": 0.9053046038253834, + "grad_norm": 6.490846791562192, + "learning_rate": 3.001291369442162e-06, + "loss": 0.8196, + "step": 12531 + }, + { + "epoch": 0.9053768490255929, + "grad_norm": 6.657403263960667, + "learning_rate": 3.0010048042544155e-06, + "loss": 0.8874, + "step": 12532 + }, + { + "epoch": 0.9054490942258023, + "grad_norm": 6.42778808422942, + "learning_rate": 3.000718232208303e-06, + "loss": 0.8466, + "step": 12533 + }, + { + "epoch": 0.9055213394260119, + "grad_norm": 5.818226936013675, + "learning_rate": 3.0004316533077466e-06, + "loss": 0.7966, + "step": 12534 + }, + { + "epoch": 0.9055935846262214, + "grad_norm": 6.89332607207417, + "learning_rate": 3.000145067556669e-06, + "loss": 0.8117, + "step": 12535 + }, + { + "epoch": 0.905665829826431, + "grad_norm": 7.324001298855344, + "learning_rate": 2.9998584749589944e-06, + "loss": 0.8193, + "step": 12536 + }, + { + "epoch": 0.9057380750266404, + "grad_norm": 6.3899035804411, + "learning_rate": 2.9995718755186452e-06, + "loss": 0.782, + "step": 12537 + }, + { + "epoch": 0.9058103202268499, + "grad_norm": 6.41239106381336, + "learning_rate": 2.999285269239545e-06, + "loss": 0.6683, + "step": 12538 + }, + { + "epoch": 0.9058825654270595, + "grad_norm": 5.530922228051035, + "learning_rate": 2.998998656125617e-06, + "loss": 0.7883, + "step": 12539 + }, + { + "epoch": 0.9059548106272689, + "grad_norm": 6.651515369279671, + "learning_rate": 2.9987120361807846e-06, + "loss": 0.8248, + "step": 12540 + }, + { + "epoch": 0.9060270558274784, + "grad_norm": 6.141166061365033, + "learning_rate": 2.9984254094089713e-06, + "loss": 0.7796, + "step": 12541 + }, + { + "epoch": 0.906099301027688, + "grad_norm": 5.948715379024662, + "learning_rate": 2.9981387758141023e-06, + "loss": 0.8034, + "step": 12542 + }, + { + "epoch": 0.9061715462278975, + "grad_norm": 5.733990149615595, + "learning_rate": 2.9978521354001e-06, + "loss": 0.8564, + "step": 12543 + }, + { + "epoch": 0.906243791428107, + "grad_norm": 6.0152340452302, + "learning_rate": 2.9975654881708877e-06, + "loss": 0.7318, + "step": 12544 + }, + { + "epoch": 0.9063160366283165, + "grad_norm": 5.31340252558808, + "learning_rate": 2.9972788341303903e-06, + "loss": 0.8217, + "step": 12545 + }, + { + "epoch": 0.906388281828526, + "grad_norm": 5.241511612596633, + "learning_rate": 2.9969921732825326e-06, + "loss": 0.8327, + "step": 12546 + }, + { + "epoch": 0.9064605270287355, + "grad_norm": 6.0564437976926175, + "learning_rate": 2.9967055056312377e-06, + "loss": 0.8262, + "step": 12547 + }, + { + "epoch": 0.906532772228945, + "grad_norm": 6.164330460945315, + "learning_rate": 2.99641883118043e-06, + "loss": 0.7676, + "step": 12548 + }, + { + "epoch": 0.9066050174291546, + "grad_norm": 9.214893840606758, + "learning_rate": 2.9961321499340334e-06, + "loss": 0.8007, + "step": 12549 + }, + { + "epoch": 0.9066772626293641, + "grad_norm": 6.761603378113798, + "learning_rate": 2.995845461895973e-06, + "loss": 0.779, + "step": 12550 + }, + { + "epoch": 0.9067495078295735, + "grad_norm": 6.130777553511833, + "learning_rate": 2.995558767070174e-06, + "loss": 0.8747, + "step": 12551 + }, + { + "epoch": 0.9068217530297831, + "grad_norm": 6.676236658873992, + "learning_rate": 2.99527206546056e-06, + "loss": 0.7583, + "step": 12552 + }, + { + "epoch": 0.9068939982299926, + "grad_norm": 7.270569396670844, + "learning_rate": 2.9949853570710557e-06, + "loss": 0.7565, + "step": 12553 + }, + { + "epoch": 0.9069662434302022, + "grad_norm": 6.55487460391105, + "learning_rate": 2.9946986419055857e-06, + "loss": 0.8264, + "step": 12554 + }, + { + "epoch": 0.9070384886304116, + "grad_norm": 8.130101876271581, + "learning_rate": 2.9944119199680767e-06, + "loss": 0.7701, + "step": 12555 + }, + { + "epoch": 0.9071107338306211, + "grad_norm": 7.1488177088949305, + "learning_rate": 2.9941251912624517e-06, + "loss": 0.8296, + "step": 12556 + }, + { + "epoch": 0.9071829790308307, + "grad_norm": 6.067673510886754, + "learning_rate": 2.9938384557926364e-06, + "loss": 0.851, + "step": 12557 + }, + { + "epoch": 0.9072552242310401, + "grad_norm": 7.273893512071178, + "learning_rate": 2.9935517135625557e-06, + "loss": 0.7674, + "step": 12558 + }, + { + "epoch": 0.9073274694312496, + "grad_norm": 8.52200162630529, + "learning_rate": 2.993264964576136e-06, + "loss": 0.7661, + "step": 12559 + }, + { + "epoch": 0.9073997146314592, + "grad_norm": 8.224413874650459, + "learning_rate": 2.9929782088373015e-06, + "loss": 0.7799, + "step": 12560 + }, + { + "epoch": 0.9074719598316687, + "grad_norm": 7.151233764053901, + "learning_rate": 2.9926914463499786e-06, + "loss": 0.8287, + "step": 12561 + }, + { + "epoch": 0.9075442050318782, + "grad_norm": 7.5164606656804045, + "learning_rate": 2.992404677118092e-06, + "loss": 0.866, + "step": 12562 + }, + { + "epoch": 0.9076164502320877, + "grad_norm": 5.561678707912641, + "learning_rate": 2.9921179011455674e-06, + "loss": 0.8108, + "step": 12563 + }, + { + "epoch": 0.9076886954322972, + "grad_norm": 5.862113943178902, + "learning_rate": 2.9918311184363307e-06, + "loss": 0.7906, + "step": 12564 + }, + { + "epoch": 0.9077609406325067, + "grad_norm": 6.6691134095495475, + "learning_rate": 2.991544328994309e-06, + "loss": 0.8674, + "step": 12565 + }, + { + "epoch": 0.9078331858327162, + "grad_norm": 5.767554645122373, + "learning_rate": 2.9912575328234263e-06, + "loss": 0.8066, + "step": 12566 + }, + { + "epoch": 0.9079054310329258, + "grad_norm": 6.657247692431078, + "learning_rate": 2.9909707299276098e-06, + "loss": 0.8372, + "step": 12567 + }, + { + "epoch": 0.9079776762331353, + "grad_norm": 8.91944395171105, + "learning_rate": 2.9906839203107846e-06, + "loss": 0.8351, + "step": 12568 + }, + { + "epoch": 0.9080499214333447, + "grad_norm": 8.427106318935612, + "learning_rate": 2.9903971039768776e-06, + "loss": 0.7826, + "step": 12569 + }, + { + "epoch": 0.9081221666335543, + "grad_norm": 5.807884106260046, + "learning_rate": 2.9901102809298147e-06, + "loss": 0.7836, + "step": 12570 + }, + { + "epoch": 0.9081944118337638, + "grad_norm": 5.891794751333901, + "learning_rate": 2.9898234511735242e-06, + "loss": 0.8569, + "step": 12571 + }, + { + "epoch": 0.9082666570339734, + "grad_norm": 6.839014932294023, + "learning_rate": 2.98953661471193e-06, + "loss": 0.8637, + "step": 12572 + }, + { + "epoch": 0.9083389022341828, + "grad_norm": 4.845492338787569, + "learning_rate": 2.9892497715489586e-06, + "loss": 0.7208, + "step": 12573 + }, + { + "epoch": 0.9084111474343923, + "grad_norm": 5.8791657957657195, + "learning_rate": 2.9889629216885397e-06, + "loss": 0.8728, + "step": 12574 + }, + { + "epoch": 0.9084833926346019, + "grad_norm": 5.824164714600947, + "learning_rate": 2.9886760651345963e-06, + "loss": 0.7077, + "step": 12575 + }, + { + "epoch": 0.9085556378348113, + "grad_norm": 5.718994135413888, + "learning_rate": 2.988389201891058e-06, + "loss": 0.8083, + "step": 12576 + }, + { + "epoch": 0.9086278830350208, + "grad_norm": 7.722290462616235, + "learning_rate": 2.9881023319618507e-06, + "loss": 0.88, + "step": 12577 + }, + { + "epoch": 0.9087001282352304, + "grad_norm": 7.013237833791898, + "learning_rate": 2.987815455350901e-06, + "loss": 0.8169, + "step": 12578 + }, + { + "epoch": 0.9087723734354399, + "grad_norm": 6.154447494983271, + "learning_rate": 2.987528572062137e-06, + "loss": 0.8517, + "step": 12579 + }, + { + "epoch": 0.9088446186356494, + "grad_norm": 6.93727950012056, + "learning_rate": 2.987241682099485e-06, + "loss": 0.7929, + "step": 12580 + }, + { + "epoch": 0.9089168638358589, + "grad_norm": 6.487780896759343, + "learning_rate": 2.986954785466873e-06, + "loss": 0.8532, + "step": 12581 + }, + { + "epoch": 0.9089891090360684, + "grad_norm": 5.0558865523829155, + "learning_rate": 2.986667882168228e-06, + "loss": 0.7937, + "step": 12582 + }, + { + "epoch": 0.9090613542362779, + "grad_norm": 6.2203486702391295, + "learning_rate": 2.9863809722074772e-06, + "loss": 0.7854, + "step": 12583 + }, + { + "epoch": 0.9091335994364874, + "grad_norm": 7.371210450106268, + "learning_rate": 2.986094055588549e-06, + "loss": 0.8597, + "step": 12584 + }, + { + "epoch": 0.909205844636697, + "grad_norm": 6.999681465530877, + "learning_rate": 2.98580713231537e-06, + "loss": 0.7874, + "step": 12585 + }, + { + "epoch": 0.9092780898369065, + "grad_norm": 5.855044298012565, + "learning_rate": 2.985520202391869e-06, + "loss": 0.7759, + "step": 12586 + }, + { + "epoch": 0.9093503350371159, + "grad_norm": 6.041393545707902, + "learning_rate": 2.9852332658219727e-06, + "loss": 0.7054, + "step": 12587 + }, + { + "epoch": 0.9094225802373255, + "grad_norm": 4.987310328498132, + "learning_rate": 2.984946322609611e-06, + "loss": 0.7023, + "step": 12588 + }, + { + "epoch": 0.909494825437535, + "grad_norm": 6.051141224564909, + "learning_rate": 2.9846593727587103e-06, + "loss": 0.8253, + "step": 12589 + }, + { + "epoch": 0.9095670706377446, + "grad_norm": 7.031172417106697, + "learning_rate": 2.984372416273199e-06, + "loss": 0.8472, + "step": 12590 + }, + { + "epoch": 0.909639315837954, + "grad_norm": 6.372564411518732, + "learning_rate": 2.984085453157005e-06, + "loss": 0.8005, + "step": 12591 + }, + { + "epoch": 0.9097115610381635, + "grad_norm": 7.3093160887028485, + "learning_rate": 2.9837984834140575e-06, + "loss": 0.8091, + "step": 12592 + }, + { + "epoch": 0.9097838062383731, + "grad_norm": 7.019501088989878, + "learning_rate": 2.983511507048284e-06, + "loss": 0.8357, + "step": 12593 + }, + { + "epoch": 0.9098560514385825, + "grad_norm": 7.2257688014306485, + "learning_rate": 2.9832245240636136e-06, + "loss": 0.7594, + "step": 12594 + }, + { + "epoch": 0.909928296638792, + "grad_norm": 6.0478054385503, + "learning_rate": 2.982937534463975e-06, + "loss": 0.7958, + "step": 12595 + }, + { + "epoch": 0.9100005418390016, + "grad_norm": 6.74903551911178, + "learning_rate": 2.9826505382532965e-06, + "loss": 0.7902, + "step": 12596 + }, + { + "epoch": 0.9100727870392111, + "grad_norm": 5.314702553522809, + "learning_rate": 2.9823635354355062e-06, + "loss": 0.8194, + "step": 12597 + }, + { + "epoch": 0.9101450322394206, + "grad_norm": 7.795294775438984, + "learning_rate": 2.9820765260145342e-06, + "loss": 0.8315, + "step": 12598 + }, + { + "epoch": 0.9102172774396301, + "grad_norm": 6.981973597935302, + "learning_rate": 2.981789509994309e-06, + "loss": 0.7937, + "step": 12599 + }, + { + "epoch": 0.9102895226398396, + "grad_norm": 5.829773861407189, + "learning_rate": 2.9815024873787592e-06, + "loss": 0.8415, + "step": 12600 + }, + { + "epoch": 0.9103617678400491, + "grad_norm": 5.974889663252126, + "learning_rate": 2.9812154581718133e-06, + "loss": 0.8251, + "step": 12601 + }, + { + "epoch": 0.9104340130402586, + "grad_norm": 7.943283737268971, + "learning_rate": 2.980928422377403e-06, + "loss": 0.8594, + "step": 12602 + }, + { + "epoch": 0.9105062582404682, + "grad_norm": 5.779779360562746, + "learning_rate": 2.980641379999455e-06, + "loss": 0.8492, + "step": 12603 + }, + { + "epoch": 0.9105785034406777, + "grad_norm": 7.105802137072947, + "learning_rate": 2.9803543310419005e-06, + "loss": 0.8703, + "step": 12604 + }, + { + "epoch": 0.9106507486408871, + "grad_norm": 7.89790983918409, + "learning_rate": 2.9800672755086675e-06, + "loss": 0.8115, + "step": 12605 + }, + { + "epoch": 0.9107229938410967, + "grad_norm": 5.78648082088996, + "learning_rate": 2.979780213403686e-06, + "loss": 0.767, + "step": 12606 + }, + { + "epoch": 0.9107952390413062, + "grad_norm": 5.2270151806135265, + "learning_rate": 2.9794931447308867e-06, + "loss": 0.8361, + "step": 12607 + }, + { + "epoch": 0.9108674842415158, + "grad_norm": 9.084875646652337, + "learning_rate": 2.979206069494198e-06, + "loss": 0.8597, + "step": 12608 + }, + { + "epoch": 0.9109397294417252, + "grad_norm": 9.039824220416852, + "learning_rate": 2.978918987697551e-06, + "loss": 0.9859, + "step": 12609 + }, + { + "epoch": 0.9110119746419347, + "grad_norm": 6.804564727831892, + "learning_rate": 2.978631899344875e-06, + "loss": 0.8263, + "step": 12610 + }, + { + "epoch": 0.9110842198421443, + "grad_norm": 6.311759074323471, + "learning_rate": 2.9783448044400982e-06, + "loss": 0.7841, + "step": 12611 + }, + { + "epoch": 0.9111564650423537, + "grad_norm": 7.286295309326736, + "learning_rate": 2.978057702987154e-06, + "loss": 0.8115, + "step": 12612 + }, + { + "epoch": 0.9112287102425632, + "grad_norm": 5.7472627178125935, + "learning_rate": 2.977770594989971e-06, + "loss": 0.8057, + "step": 12613 + }, + { + "epoch": 0.9113009554427728, + "grad_norm": 7.456332430259992, + "learning_rate": 2.9774834804524787e-06, + "loss": 0.7904, + "step": 12614 + }, + { + "epoch": 0.9113732006429823, + "grad_norm": 5.3839154490708845, + "learning_rate": 2.9771963593786086e-06, + "loss": 0.8646, + "step": 12615 + }, + { + "epoch": 0.9114454458431918, + "grad_norm": 6.582513428184296, + "learning_rate": 2.9769092317722907e-06, + "loss": 0.8077, + "step": 12616 + }, + { + "epoch": 0.9115176910434013, + "grad_norm": 5.9004568925234375, + "learning_rate": 2.9766220976374567e-06, + "loss": 0.8172, + "step": 12617 + }, + { + "epoch": 0.9115899362436108, + "grad_norm": 7.210872959827641, + "learning_rate": 2.9763349569780354e-06, + "loss": 0.7779, + "step": 12618 + }, + { + "epoch": 0.9116621814438203, + "grad_norm": 6.289589882293692, + "learning_rate": 2.976047809797959e-06, + "loss": 0.7772, + "step": 12619 + }, + { + "epoch": 0.9117344266440298, + "grad_norm": 7.287675502751096, + "learning_rate": 2.975760656101157e-06, + "loss": 0.7784, + "step": 12620 + }, + { + "epoch": 0.9118066718442394, + "grad_norm": 5.868301793737505, + "learning_rate": 2.975473495891561e-06, + "loss": 0.8477, + "step": 12621 + }, + { + "epoch": 0.9118789170444489, + "grad_norm": 7.062391432206177, + "learning_rate": 2.9751863291731024e-06, + "loss": 0.7931, + "step": 12622 + }, + { + "epoch": 0.9119511622446583, + "grad_norm": 5.509674753050257, + "learning_rate": 2.974899155949712e-06, + "loss": 0.7789, + "step": 12623 + }, + { + "epoch": 0.9120234074448679, + "grad_norm": 5.854230488964363, + "learning_rate": 2.974611976225321e-06, + "loss": 0.7747, + "step": 12624 + }, + { + "epoch": 0.9120956526450774, + "grad_norm": 8.150532205441163, + "learning_rate": 2.9743247900038595e-06, + "loss": 0.8934, + "step": 12625 + }, + { + "epoch": 0.912167897845287, + "grad_norm": 6.6391407463506535, + "learning_rate": 2.974037597289261e-06, + "loss": 0.775, + "step": 12626 + }, + { + "epoch": 0.9122401430454964, + "grad_norm": 5.477207641042178, + "learning_rate": 2.9737503980854555e-06, + "loss": 0.7984, + "step": 12627 + }, + { + "epoch": 0.9123123882457059, + "grad_norm": 7.204485121469716, + "learning_rate": 2.9734631923963753e-06, + "loss": 0.8135, + "step": 12628 + }, + { + "epoch": 0.9123846334459155, + "grad_norm": 5.6341468193578415, + "learning_rate": 2.9731759802259506e-06, + "loss": 0.7977, + "step": 12629 + }, + { + "epoch": 0.9124568786461249, + "grad_norm": 6.005889227568084, + "learning_rate": 2.972888761578115e-06, + "loss": 0.7874, + "step": 12630 + }, + { + "epoch": 0.9125291238463344, + "grad_norm": 5.4621727328174785, + "learning_rate": 2.9726015364567996e-06, + "loss": 0.8033, + "step": 12631 + }, + { + "epoch": 0.912601369046544, + "grad_norm": 6.52751556663454, + "learning_rate": 2.9723143048659355e-06, + "loss": 0.8613, + "step": 12632 + }, + { + "epoch": 0.9126736142467535, + "grad_norm": 5.859846986198543, + "learning_rate": 2.972027066809456e-06, + "loss": 0.767, + "step": 12633 + }, + { + "epoch": 0.912745859446963, + "grad_norm": 6.775760520675514, + "learning_rate": 2.9717398222912918e-06, + "loss": 0.8129, + "step": 12634 + }, + { + "epoch": 0.9128181046471725, + "grad_norm": 5.648042066141877, + "learning_rate": 2.9714525713153756e-06, + "loss": 0.8403, + "step": 12635 + }, + { + "epoch": 0.912890349847382, + "grad_norm": 5.650391300120941, + "learning_rate": 2.97116531388564e-06, + "loss": 0.7624, + "step": 12636 + }, + { + "epoch": 0.9129625950475915, + "grad_norm": 6.814234801425255, + "learning_rate": 2.9708780500060173e-06, + "loss": 0.7627, + "step": 12637 + }, + { + "epoch": 0.913034840247801, + "grad_norm": 6.765100049098787, + "learning_rate": 2.97059077968044e-06, + "loss": 0.7584, + "step": 12638 + }, + { + "epoch": 0.9131070854480106, + "grad_norm": 6.680828438743932, + "learning_rate": 2.970303502912839e-06, + "loss": 0.7704, + "step": 12639 + }, + { + "epoch": 0.9131793306482201, + "grad_norm": 7.764690626948801, + "learning_rate": 2.9700162197071497e-06, + "loss": 0.8567, + "step": 12640 + }, + { + "epoch": 0.9132515758484295, + "grad_norm": 6.548816183723829, + "learning_rate": 2.9697289300673025e-06, + "loss": 0.8683, + "step": 12641 + }, + { + "epoch": 0.9133238210486391, + "grad_norm": 6.554574303905626, + "learning_rate": 2.9694416339972314e-06, + "loss": 0.7438, + "step": 12642 + }, + { + "epoch": 0.9133960662488486, + "grad_norm": 6.596552104266792, + "learning_rate": 2.9691543315008687e-06, + "loss": 0.8977, + "step": 12643 + }, + { + "epoch": 0.9134683114490582, + "grad_norm": 8.91352798504255, + "learning_rate": 2.968867022582147e-06, + "loss": 0.8299, + "step": 12644 + }, + { + "epoch": 0.9135405566492676, + "grad_norm": 6.977121294460717, + "learning_rate": 2.9685797072450005e-06, + "loss": 0.7945, + "step": 12645 + }, + { + "epoch": 0.9136128018494771, + "grad_norm": 6.780210635083804, + "learning_rate": 2.968292385493361e-06, + "loss": 0.8369, + "step": 12646 + }, + { + "epoch": 0.9136850470496867, + "grad_norm": 6.236412843438814, + "learning_rate": 2.968005057331163e-06, + "loss": 0.8559, + "step": 12647 + }, + { + "epoch": 0.9137572922498961, + "grad_norm": 5.690016975578636, + "learning_rate": 2.9677177227623386e-06, + "loss": 0.792, + "step": 12648 + }, + { + "epoch": 0.9138295374501056, + "grad_norm": 7.162145859516426, + "learning_rate": 2.967430381790822e-06, + "loss": 0.8276, + "step": 12649 + }, + { + "epoch": 0.9139017826503152, + "grad_norm": 5.428074702862737, + "learning_rate": 2.967143034420546e-06, + "loss": 0.6904, + "step": 12650 + }, + { + "epoch": 0.9139740278505247, + "grad_norm": 5.872794528686652, + "learning_rate": 2.9668556806554456e-06, + "loss": 0.8545, + "step": 12651 + }, + { + "epoch": 0.9140462730507342, + "grad_norm": 5.772948867706208, + "learning_rate": 2.966568320499452e-06, + "loss": 0.7346, + "step": 12652 + }, + { + "epoch": 0.9141185182509437, + "grad_norm": 5.333693829909608, + "learning_rate": 2.966280953956501e-06, + "loss": 0.7516, + "step": 12653 + }, + { + "epoch": 0.9141907634511532, + "grad_norm": 6.100491125628899, + "learning_rate": 2.965993581030526e-06, + "loss": 0.8162, + "step": 12654 + }, + { + "epoch": 0.9142630086513627, + "grad_norm": 7.156131926649433, + "learning_rate": 2.9657062017254606e-06, + "loss": 0.8945, + "step": 12655 + }, + { + "epoch": 0.9143352538515722, + "grad_norm": 6.001802809397408, + "learning_rate": 2.9654188160452397e-06, + "loss": 0.7904, + "step": 12656 + }, + { + "epoch": 0.9144074990517818, + "grad_norm": 6.606518861032764, + "learning_rate": 2.9651314239937958e-06, + "loss": 0.7414, + "step": 12657 + }, + { + "epoch": 0.9144797442519913, + "grad_norm": 6.454653505795862, + "learning_rate": 2.9648440255750633e-06, + "loss": 0.7465, + "step": 12658 + }, + { + "epoch": 0.9145519894522007, + "grad_norm": 5.297272703831179, + "learning_rate": 2.964556620792977e-06, + "loss": 0.7441, + "step": 12659 + }, + { + "epoch": 0.9146242346524103, + "grad_norm": 4.854746880734461, + "learning_rate": 2.9642692096514727e-06, + "loss": 0.8405, + "step": 12660 + }, + { + "epoch": 0.9146964798526198, + "grad_norm": 8.701513171968056, + "learning_rate": 2.9639817921544822e-06, + "loss": 0.8311, + "step": 12661 + }, + { + "epoch": 0.9147687250528294, + "grad_norm": 6.4253989247962995, + "learning_rate": 2.9636943683059415e-06, + "loss": 0.7824, + "step": 12662 + }, + { + "epoch": 0.9148409702530388, + "grad_norm": 7.07387362838435, + "learning_rate": 2.963406938109785e-06, + "loss": 0.8026, + "step": 12663 + }, + { + "epoch": 0.9149132154532483, + "grad_norm": 7.075808788211014, + "learning_rate": 2.9631195015699473e-06, + "loss": 0.7834, + "step": 12664 + }, + { + "epoch": 0.9149854606534579, + "grad_norm": 7.557374336391678, + "learning_rate": 2.9628320586903635e-06, + "loss": 0.8948, + "step": 12665 + }, + { + "epoch": 0.9150577058536673, + "grad_norm": 7.220658747665904, + "learning_rate": 2.9625446094749677e-06, + "loss": 0.8591, + "step": 12666 + }, + { + "epoch": 0.9151299510538768, + "grad_norm": 5.310873163994607, + "learning_rate": 2.962257153927695e-06, + "loss": 0.7664, + "step": 12667 + }, + { + "epoch": 0.9152021962540864, + "grad_norm": 6.572157955563081, + "learning_rate": 2.9619696920524817e-06, + "loss": 0.8589, + "step": 12668 + }, + { + "epoch": 0.9152744414542959, + "grad_norm": 6.526644456388552, + "learning_rate": 2.9616822238532615e-06, + "loss": 0.88, + "step": 12669 + }, + { + "epoch": 0.9153466866545054, + "grad_norm": 5.406252998836325, + "learning_rate": 2.9613947493339707e-06, + "loss": 0.7808, + "step": 12670 + }, + { + "epoch": 0.9154189318547149, + "grad_norm": 7.357288882939915, + "learning_rate": 2.9611072684985433e-06, + "loss": 0.7831, + "step": 12671 + }, + { + "epoch": 0.9154911770549244, + "grad_norm": 7.7743352477045375, + "learning_rate": 2.9608197813509153e-06, + "loss": 0.7564, + "step": 12672 + }, + { + "epoch": 0.9155634222551339, + "grad_norm": 7.323164507459894, + "learning_rate": 2.9605322878950226e-06, + "loss": 0.8221, + "step": 12673 + }, + { + "epoch": 0.9156356674553434, + "grad_norm": 6.454139611236953, + "learning_rate": 2.9602447881348014e-06, + "loss": 0.8495, + "step": 12674 + }, + { + "epoch": 0.915707912655553, + "grad_norm": 6.343003703600268, + "learning_rate": 2.959957282074185e-06, + "loss": 0.8639, + "step": 12675 + }, + { + "epoch": 0.9157801578557625, + "grad_norm": 6.370387783733163, + "learning_rate": 2.959669769717112e-06, + "loss": 0.826, + "step": 12676 + }, + { + "epoch": 0.9158524030559719, + "grad_norm": 6.240696505775398, + "learning_rate": 2.9593822510675154e-06, + "loss": 0.8185, + "step": 12677 + }, + { + "epoch": 0.9159246482561815, + "grad_norm": 5.747260726587493, + "learning_rate": 2.9590947261293325e-06, + "loss": 0.8238, + "step": 12678 + }, + { + "epoch": 0.915996893456391, + "grad_norm": 5.7207134607494545, + "learning_rate": 2.9588071949065e-06, + "loss": 0.8417, + "step": 12679 + }, + { + "epoch": 0.9160691386566006, + "grad_norm": 7.181318726638197, + "learning_rate": 2.958519657402953e-06, + "loss": 0.8024, + "step": 12680 + }, + { + "epoch": 0.91614138385681, + "grad_norm": 5.089486426504278, + "learning_rate": 2.9582321136226283e-06, + "loss": 0.7685, + "step": 12681 + }, + { + "epoch": 0.9162136290570195, + "grad_norm": 6.189723376377511, + "learning_rate": 2.9579445635694603e-06, + "loss": 0.8191, + "step": 12682 + }, + { + "epoch": 0.9162858742572291, + "grad_norm": 5.62170347468411, + "learning_rate": 2.9576570072473883e-06, + "loss": 0.7554, + "step": 12683 + }, + { + "epoch": 0.9163581194574385, + "grad_norm": 6.22775643842529, + "learning_rate": 2.9573694446603464e-06, + "loss": 0.82, + "step": 12684 + }, + { + "epoch": 0.916430364657648, + "grad_norm": 5.387803552842609, + "learning_rate": 2.957081875812273e-06, + "loss": 0.7622, + "step": 12685 + }, + { + "epoch": 0.9165026098578576, + "grad_norm": 7.924522544365202, + "learning_rate": 2.956794300707102e-06, + "loss": 0.7249, + "step": 12686 + }, + { + "epoch": 0.9165748550580671, + "grad_norm": 5.264916618534625, + "learning_rate": 2.956506719348773e-06, + "loss": 0.7819, + "step": 12687 + }, + { + "epoch": 0.9166471002582766, + "grad_norm": 5.830936520768213, + "learning_rate": 2.9562191317412214e-06, + "loss": 0.8035, + "step": 12688 + }, + { + "epoch": 0.9167193454584861, + "grad_norm": 6.839130671620908, + "learning_rate": 2.9559315378883833e-06, + "loss": 0.8258, + "step": 12689 + }, + { + "epoch": 0.9167915906586956, + "grad_norm": 7.434732900104645, + "learning_rate": 2.955643937794197e-06, + "loss": 0.7062, + "step": 12690 + }, + { + "epoch": 0.9168638358589051, + "grad_norm": 7.1724180460596, + "learning_rate": 2.9553563314625988e-06, + "loss": 0.8361, + "step": 12691 + }, + { + "epoch": 0.9169360810591146, + "grad_norm": 7.487746337506617, + "learning_rate": 2.955068718897526e-06, + "loss": 0.9254, + "step": 12692 + }, + { + "epoch": 0.9170083262593242, + "grad_norm": 6.846996077525131, + "learning_rate": 2.954781100102916e-06, + "loss": 0.772, + "step": 12693 + }, + { + "epoch": 0.9170805714595337, + "grad_norm": 5.46595684110685, + "learning_rate": 2.954493475082706e-06, + "loss": 0.81, + "step": 12694 + }, + { + "epoch": 0.9171528166597431, + "grad_norm": 5.804205770513727, + "learning_rate": 2.9542058438408332e-06, + "loss": 0.7956, + "step": 12695 + }, + { + "epoch": 0.9172250618599527, + "grad_norm": 7.928231181695584, + "learning_rate": 2.953918206381234e-06, + "loss": 0.7971, + "step": 12696 + }, + { + "epoch": 0.9172973070601622, + "grad_norm": 6.512408809792224, + "learning_rate": 2.953630562707848e-06, + "loss": 0.9343, + "step": 12697 + }, + { + "epoch": 0.9173695522603718, + "grad_norm": 5.827326415629802, + "learning_rate": 2.9533429128246115e-06, + "loss": 0.8068, + "step": 12698 + }, + { + "epoch": 0.9174417974605812, + "grad_norm": 5.207699342923808, + "learning_rate": 2.953055256735463e-06, + "loss": 0.713, + "step": 12699 + }, + { + "epoch": 0.9175140426607907, + "grad_norm": 7.4423773665058475, + "learning_rate": 2.952767594444339e-06, + "loss": 0.8187, + "step": 12700 + }, + { + "epoch": 0.9175862878610003, + "grad_norm": 6.525034598189373, + "learning_rate": 2.9524799259551783e-06, + "loss": 0.7674, + "step": 12701 + }, + { + "epoch": 0.9176585330612097, + "grad_norm": 5.232316708477936, + "learning_rate": 2.9521922512719194e-06, + "loss": 0.7973, + "step": 12702 + }, + { + "epoch": 0.9177307782614192, + "grad_norm": 5.01111778170491, + "learning_rate": 2.9519045703984993e-06, + "loss": 0.7752, + "step": 12703 + }, + { + "epoch": 0.9178030234616288, + "grad_norm": 7.018418195699552, + "learning_rate": 2.951616883338857e-06, + "loss": 0.7746, + "step": 12704 + }, + { + "epoch": 0.9178752686618383, + "grad_norm": 6.7220992855604, + "learning_rate": 2.95132919009693e-06, + "loss": 0.8162, + "step": 12705 + }, + { + "epoch": 0.9179475138620478, + "grad_norm": 6.3536540173791805, + "learning_rate": 2.951041490676656e-06, + "loss": 0.7646, + "step": 12706 + }, + { + "epoch": 0.9180197590622573, + "grad_norm": 5.645877833267269, + "learning_rate": 2.9507537850819747e-06, + "loss": 0.8087, + "step": 12707 + }, + { + "epoch": 0.9180920042624668, + "grad_norm": 6.74121709141125, + "learning_rate": 2.9504660733168247e-06, + "loss": 0.8, + "step": 12708 + }, + { + "epoch": 0.9181642494626763, + "grad_norm": 7.436800627323327, + "learning_rate": 2.950178355385143e-06, + "loss": 0.7941, + "step": 12709 + }, + { + "epoch": 0.9182364946628858, + "grad_norm": 5.815149831243771, + "learning_rate": 2.9498906312908693e-06, + "loss": 0.7612, + "step": 12710 + }, + { + "epoch": 0.9183087398630954, + "grad_norm": 7.277096328993132, + "learning_rate": 2.9496029010379428e-06, + "loss": 0.8501, + "step": 12711 + }, + { + "epoch": 0.9183809850633049, + "grad_norm": 6.264971691898772, + "learning_rate": 2.9493151646303016e-06, + "loss": 0.7972, + "step": 12712 + }, + { + "epoch": 0.9184532302635143, + "grad_norm": 6.765582712230718, + "learning_rate": 2.9490274220718845e-06, + "loss": 0.8042, + "step": 12713 + }, + { + "epoch": 0.9185254754637239, + "grad_norm": 5.367876166926453, + "learning_rate": 2.9487396733666302e-06, + "loss": 0.785, + "step": 12714 + }, + { + "epoch": 0.9185977206639334, + "grad_norm": 6.2783582113347105, + "learning_rate": 2.9484519185184783e-06, + "loss": 0.8063, + "step": 12715 + }, + { + "epoch": 0.9186699658641428, + "grad_norm": 5.909802856771806, + "learning_rate": 2.948164157531368e-06, + "loss": 0.783, + "step": 12716 + }, + { + "epoch": 0.9187422110643524, + "grad_norm": 5.115562868294422, + "learning_rate": 2.9478763904092388e-06, + "loss": 0.7372, + "step": 12717 + }, + { + "epoch": 0.9188144562645619, + "grad_norm": 6.090039811027704, + "learning_rate": 2.9475886171560286e-06, + "loss": 0.8503, + "step": 12718 + }, + { + "epoch": 0.9188867014647715, + "grad_norm": 7.050173645045416, + "learning_rate": 2.947300837775679e-06, + "loss": 0.8301, + "step": 12719 + }, + { + "epoch": 0.9189589466649809, + "grad_norm": 5.127635557260389, + "learning_rate": 2.9470130522721274e-06, + "loss": 0.7378, + "step": 12720 + }, + { + "epoch": 0.9190311918651904, + "grad_norm": 6.990453477374407, + "learning_rate": 2.946725260649314e-06, + "loss": 0.7943, + "step": 12721 + }, + { + "epoch": 0.9191034370654, + "grad_norm": 4.78086622106606, + "learning_rate": 2.9464374629111796e-06, + "loss": 0.7019, + "step": 12722 + }, + { + "epoch": 0.9191756822656095, + "grad_norm": 5.2087857266091815, + "learning_rate": 2.946149659061662e-06, + "loss": 0.774, + "step": 12723 + }, + { + "epoch": 0.919247927465819, + "grad_norm": 6.238290422465079, + "learning_rate": 2.9458618491047026e-06, + "loss": 0.8067, + "step": 12724 + }, + { + "epoch": 0.9193201726660285, + "grad_norm": 5.086302572762884, + "learning_rate": 2.94557403304424e-06, + "loss": 0.8073, + "step": 12725 + }, + { + "epoch": 0.919392417866238, + "grad_norm": 7.768193946389386, + "learning_rate": 2.9452862108842157e-06, + "loss": 0.8601, + "step": 12726 + }, + { + "epoch": 0.9194646630664475, + "grad_norm": 7.281060343224721, + "learning_rate": 2.9449983826285684e-06, + "loss": 0.7961, + "step": 12727 + }, + { + "epoch": 0.919536908266657, + "grad_norm": 5.95607382162548, + "learning_rate": 2.9447105482812387e-06, + "loss": 0.764, + "step": 12728 + }, + { + "epoch": 0.9196091534668666, + "grad_norm": 6.448003403712432, + "learning_rate": 2.9444227078461673e-06, + "loss": 0.7933, + "step": 12729 + }, + { + "epoch": 0.9196813986670761, + "grad_norm": 5.6659833084652345, + "learning_rate": 2.9441348613272936e-06, + "loss": 0.74, + "step": 12730 + }, + { + "epoch": 0.9197536438672855, + "grad_norm": 7.319054950514605, + "learning_rate": 2.9438470087285592e-06, + "loss": 0.8964, + "step": 12731 + }, + { + "epoch": 0.9198258890674951, + "grad_norm": 6.018518163349543, + "learning_rate": 2.943559150053903e-06, + "loss": 0.7838, + "step": 12732 + }, + { + "epoch": 0.9198981342677046, + "grad_norm": 5.684361524602637, + "learning_rate": 2.9432712853072677e-06, + "loss": 0.7626, + "step": 12733 + }, + { + "epoch": 0.919970379467914, + "grad_norm": 5.0585277656169465, + "learning_rate": 2.9429834144925905e-06, + "loss": 0.7621, + "step": 12734 + }, + { + "epoch": 0.9200426246681236, + "grad_norm": 5.887224505998833, + "learning_rate": 2.942695537613816e-06, + "loss": 0.7829, + "step": 12735 + }, + { + "epoch": 0.9201148698683331, + "grad_norm": 6.989642245806665, + "learning_rate": 2.9424076546748837e-06, + "loss": 0.8871, + "step": 12736 + }, + { + "epoch": 0.9201871150685427, + "grad_norm": 6.058885263578362, + "learning_rate": 2.9421197656797327e-06, + "loss": 0.8074, + "step": 12737 + }, + { + "epoch": 0.9202593602687521, + "grad_norm": 7.823369391929422, + "learning_rate": 2.9418318706323064e-06, + "loss": 0.7978, + "step": 12738 + }, + { + "epoch": 0.9203316054689616, + "grad_norm": 6.137160393978311, + "learning_rate": 2.9415439695365444e-06, + "loss": 0.818, + "step": 12739 + }, + { + "epoch": 0.9204038506691712, + "grad_norm": 6.962778312558427, + "learning_rate": 2.9412560623963887e-06, + "loss": 0.7176, + "step": 12740 + }, + { + "epoch": 0.9204760958693807, + "grad_norm": 5.875857026917967, + "learning_rate": 2.9409681492157794e-06, + "loss": 0.7386, + "step": 12741 + }, + { + "epoch": 0.9205483410695902, + "grad_norm": 6.343539530452542, + "learning_rate": 2.9406802299986597e-06, + "loss": 0.7538, + "step": 12742 + }, + { + "epoch": 0.9206205862697997, + "grad_norm": 6.059761293953522, + "learning_rate": 2.9403923047489693e-06, + "loss": 0.8086, + "step": 12743 + }, + { + "epoch": 0.9206928314700092, + "grad_norm": 6.051721803915989, + "learning_rate": 2.9401043734706493e-06, + "loss": 0.8853, + "step": 12744 + }, + { + "epoch": 0.9207650766702187, + "grad_norm": 9.072298212064725, + "learning_rate": 2.9398164361676428e-06, + "loss": 0.8462, + "step": 12745 + }, + { + "epoch": 0.9208373218704282, + "grad_norm": 5.463215496071755, + "learning_rate": 2.939528492843891e-06, + "loss": 0.8488, + "step": 12746 + }, + { + "epoch": 0.9209095670706378, + "grad_norm": 6.053727872541444, + "learning_rate": 2.9392405435033357e-06, + "loss": 0.7921, + "step": 12747 + }, + { + "epoch": 0.9209818122708473, + "grad_norm": 6.145261921154366, + "learning_rate": 2.938952588149917e-06, + "loss": 0.7572, + "step": 12748 + }, + { + "epoch": 0.9210540574710567, + "grad_norm": 6.51821284508627, + "learning_rate": 2.9386646267875794e-06, + "loss": 0.7906, + "step": 12749 + }, + { + "epoch": 0.9211263026712663, + "grad_norm": 5.32705525597354, + "learning_rate": 2.9383766594202633e-06, + "loss": 0.7565, + "step": 12750 + }, + { + "epoch": 0.9211985478714758, + "grad_norm": 6.826962299998841, + "learning_rate": 2.9380886860519114e-06, + "loss": 0.7925, + "step": 12751 + }, + { + "epoch": 0.9212707930716852, + "grad_norm": 8.748734736924552, + "learning_rate": 2.9378007066864654e-06, + "loss": 0.809, + "step": 12752 + }, + { + "epoch": 0.9213430382718948, + "grad_norm": 6.876917623630319, + "learning_rate": 2.9375127213278675e-06, + "loss": 0.8053, + "step": 12753 + }, + { + "epoch": 0.9214152834721043, + "grad_norm": 5.3765492535278, + "learning_rate": 2.9372247299800606e-06, + "loss": 0.7433, + "step": 12754 + }, + { + "epoch": 0.9214875286723139, + "grad_norm": 5.833448499723451, + "learning_rate": 2.9369367326469863e-06, + "loss": 0.718, + "step": 12755 + }, + { + "epoch": 0.9215597738725233, + "grad_norm": 5.716568994753526, + "learning_rate": 2.9366487293325875e-06, + "loss": 0.8311, + "step": 12756 + }, + { + "epoch": 0.9216320190727328, + "grad_norm": 5.893325469988472, + "learning_rate": 2.936360720040806e-06, + "loss": 0.782, + "step": 12757 + }, + { + "epoch": 0.9217042642729424, + "grad_norm": 6.134567573311377, + "learning_rate": 2.9360727047755855e-06, + "loss": 0.8102, + "step": 12758 + }, + { + "epoch": 0.9217765094731519, + "grad_norm": 6.377850063739887, + "learning_rate": 2.935784683540868e-06, + "loss": 0.8794, + "step": 12759 + }, + { + "epoch": 0.9218487546733614, + "grad_norm": 6.601326733955645, + "learning_rate": 2.9354966563405973e-06, + "loss": 0.778, + "step": 12760 + }, + { + "epoch": 0.9219209998735709, + "grad_norm": 6.598981616895667, + "learning_rate": 2.9352086231787154e-06, + "loss": 0.7163, + "step": 12761 + }, + { + "epoch": 0.9219932450737804, + "grad_norm": 6.999756945068811, + "learning_rate": 2.934920584059164e-06, + "loss": 0.7861, + "step": 12762 + }, + { + "epoch": 0.9220654902739899, + "grad_norm": 5.83472124573067, + "learning_rate": 2.934632538985889e-06, + "loss": 0.8171, + "step": 12763 + }, + { + "epoch": 0.9221377354741994, + "grad_norm": 5.9941874005384115, + "learning_rate": 2.9343444879628314e-06, + "loss": 0.8267, + "step": 12764 + }, + { + "epoch": 0.922209980674409, + "grad_norm": 8.440433698602686, + "learning_rate": 2.9340564309939357e-06, + "loss": 0.8065, + "step": 12765 + }, + { + "epoch": 0.9222822258746185, + "grad_norm": 6.135499013489551, + "learning_rate": 2.9337683680831437e-06, + "loss": 0.7642, + "step": 12766 + }, + { + "epoch": 0.9223544710748279, + "grad_norm": 6.546832464621668, + "learning_rate": 2.9334802992343996e-06, + "loss": 0.8722, + "step": 12767 + }, + { + "epoch": 0.9224267162750375, + "grad_norm": 5.926918639815151, + "learning_rate": 2.933192224451647e-06, + "loss": 0.8155, + "step": 12768 + }, + { + "epoch": 0.922498961475247, + "grad_norm": 6.231050374497967, + "learning_rate": 2.9329041437388294e-06, + "loss": 0.8027, + "step": 12769 + }, + { + "epoch": 0.9225712066754564, + "grad_norm": 9.469498104342412, + "learning_rate": 2.93261605709989e-06, + "loss": 0.7583, + "step": 12770 + }, + { + "epoch": 0.922643451875666, + "grad_norm": 6.068195302738512, + "learning_rate": 2.9323279645387727e-06, + "loss": 0.7729, + "step": 12771 + }, + { + "epoch": 0.9227156970758755, + "grad_norm": 7.6877811039338235, + "learning_rate": 2.932039866059421e-06, + "loss": 0.8462, + "step": 12772 + }, + { + "epoch": 0.9227879422760851, + "grad_norm": 5.747474115619556, + "learning_rate": 2.931751761665779e-06, + "loss": 0.7653, + "step": 12773 + }, + { + "epoch": 0.9228601874762945, + "grad_norm": 4.901499787289082, + "learning_rate": 2.9314636513617913e-06, + "loss": 0.837, + "step": 12774 + }, + { + "epoch": 0.922932432676504, + "grad_norm": 5.667326645658758, + "learning_rate": 2.9311755351514002e-06, + "loss": 0.8073, + "step": 12775 + }, + { + "epoch": 0.9230046778767136, + "grad_norm": 5.71216775608287, + "learning_rate": 2.9308874130385516e-06, + "loss": 0.7042, + "step": 12776 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 7.220564444925363, + "learning_rate": 2.9305992850271887e-06, + "loss": 0.7665, + "step": 12777 + }, + { + "epoch": 0.9231491682771326, + "grad_norm": 6.038126607724303, + "learning_rate": 2.9303111511212558e-06, + "loss": 0.7648, + "step": 12778 + }, + { + "epoch": 0.9232214134773421, + "grad_norm": 7.152022933223405, + "learning_rate": 2.9300230113246974e-06, + "loss": 0.8526, + "step": 12779 + }, + { + "epoch": 0.9232936586775516, + "grad_norm": 8.902362175688479, + "learning_rate": 2.9297348656414583e-06, + "loss": 0.7806, + "step": 12780 + }, + { + "epoch": 0.9233659038777611, + "grad_norm": 6.654468280005519, + "learning_rate": 2.9294467140754813e-06, + "loss": 0.7869, + "step": 12781 + }, + { + "epoch": 0.9234381490779706, + "grad_norm": 6.745774536002235, + "learning_rate": 2.929158556630713e-06, + "loss": 0.7507, + "step": 12782 + }, + { + "epoch": 0.9235103942781802, + "grad_norm": 5.8668702061341875, + "learning_rate": 2.928870393311098e-06, + "loss": 0.802, + "step": 12783 + }, + { + "epoch": 0.9235826394783897, + "grad_norm": 5.91749849874641, + "learning_rate": 2.928582224120579e-06, + "loss": 0.7292, + "step": 12784 + }, + { + "epoch": 0.9236548846785991, + "grad_norm": 4.850444671066775, + "learning_rate": 2.9282940490631034e-06, + "loss": 0.7547, + "step": 12785 + }, + { + "epoch": 0.9237271298788087, + "grad_norm": 5.438561872956092, + "learning_rate": 2.928005868142614e-06, + "loss": 0.7855, + "step": 12786 + }, + { + "epoch": 0.9237993750790182, + "grad_norm": 7.765163476995374, + "learning_rate": 2.9277176813630566e-06, + "loss": 0.7886, + "step": 12787 + }, + { + "epoch": 0.9238716202792276, + "grad_norm": 6.911881442271813, + "learning_rate": 2.9274294887283767e-06, + "loss": 0.8623, + "step": 12788 + }, + { + "epoch": 0.9239438654794372, + "grad_norm": 5.463806359273782, + "learning_rate": 2.927141290242518e-06, + "loss": 0.7879, + "step": 12789 + }, + { + "epoch": 0.9240161106796467, + "grad_norm": 6.236602155935235, + "learning_rate": 2.9268530859094278e-06, + "loss": 0.7914, + "step": 12790 + }, + { + "epoch": 0.9240883558798563, + "grad_norm": 7.595677009629829, + "learning_rate": 2.9265648757330497e-06, + "loss": 0.8522, + "step": 12791 + }, + { + "epoch": 0.9241606010800657, + "grad_norm": 8.256226010977167, + "learning_rate": 2.9262766597173297e-06, + "loss": 0.8845, + "step": 12792 + }, + { + "epoch": 0.9242328462802752, + "grad_norm": 6.32903458451387, + "learning_rate": 2.9259884378662135e-06, + "loss": 0.7705, + "step": 12793 + }, + { + "epoch": 0.9243050914804848, + "grad_norm": 7.935370933165244, + "learning_rate": 2.9257002101836464e-06, + "loss": 0.7683, + "step": 12794 + }, + { + "epoch": 0.9243773366806943, + "grad_norm": 5.671033224057953, + "learning_rate": 2.9254119766735735e-06, + "loss": 0.7574, + "step": 12795 + }, + { + "epoch": 0.9244495818809038, + "grad_norm": 6.170548463059093, + "learning_rate": 2.925123737339941e-06, + "loss": 0.8522, + "step": 12796 + }, + { + "epoch": 0.9245218270811133, + "grad_norm": 6.112856290141768, + "learning_rate": 2.924835492186695e-06, + "loss": 0.7851, + "step": 12797 + }, + { + "epoch": 0.9245940722813228, + "grad_norm": 6.5340952539241055, + "learning_rate": 2.9245472412177805e-06, + "loss": 0.794, + "step": 12798 + }, + { + "epoch": 0.9246663174815323, + "grad_norm": 4.804863555908071, + "learning_rate": 2.9242589844371448e-06, + "loss": 0.7619, + "step": 12799 + }, + { + "epoch": 0.9247385626817418, + "grad_norm": 6.6403891846916565, + "learning_rate": 2.9239707218487324e-06, + "loss": 0.7885, + "step": 12800 + }, + { + "epoch": 0.9248108078819514, + "grad_norm": 6.160305178625393, + "learning_rate": 2.9236824534564896e-06, + "loss": 0.8476, + "step": 12801 + }, + { + "epoch": 0.9248830530821609, + "grad_norm": 7.6045738912435334, + "learning_rate": 2.923394179264364e-06, + "loss": 0.8429, + "step": 12802 + }, + { + "epoch": 0.9249552982823703, + "grad_norm": 6.663997815300078, + "learning_rate": 2.9231058992762995e-06, + "loss": 0.7724, + "step": 12803 + }, + { + "epoch": 0.9250275434825799, + "grad_norm": 6.200118943580904, + "learning_rate": 2.922817613496245e-06, + "loss": 0.7864, + "step": 12804 + }, + { + "epoch": 0.9250997886827894, + "grad_norm": 5.894374311922576, + "learning_rate": 2.9225293219281447e-06, + "loss": 0.7858, + "step": 12805 + }, + { + "epoch": 0.9251720338829988, + "grad_norm": 6.098915141208164, + "learning_rate": 2.922241024575947e-06, + "loss": 0.8214, + "step": 12806 + }, + { + "epoch": 0.9252442790832084, + "grad_norm": 6.838670770530984, + "learning_rate": 2.9219527214435967e-06, + "loss": 0.8645, + "step": 12807 + }, + { + "epoch": 0.9253165242834179, + "grad_norm": 6.891676219240123, + "learning_rate": 2.921664412535042e-06, + "loss": 0.7272, + "step": 12808 + }, + { + "epoch": 0.9253887694836275, + "grad_norm": 6.6778489108889065, + "learning_rate": 2.921376097854228e-06, + "loss": 0.824, + "step": 12809 + }, + { + "epoch": 0.9254610146838369, + "grad_norm": 6.414543031634249, + "learning_rate": 2.9210877774051027e-06, + "loss": 0.7617, + "step": 12810 + }, + { + "epoch": 0.9255332598840464, + "grad_norm": 6.349773084535986, + "learning_rate": 2.920799451191613e-06, + "loss": 0.7721, + "step": 12811 + }, + { + "epoch": 0.925605505084256, + "grad_norm": 6.273196699145649, + "learning_rate": 2.9205111192177053e-06, + "loss": 0.7969, + "step": 12812 + }, + { + "epoch": 0.9256777502844655, + "grad_norm": 5.82138367083098, + "learning_rate": 2.9202227814873272e-06, + "loss": 0.8454, + "step": 12813 + }, + { + "epoch": 0.925749995484675, + "grad_norm": 6.530078837691383, + "learning_rate": 2.9199344380044253e-06, + "loss": 0.8098, + "step": 12814 + }, + { + "epoch": 0.9258222406848845, + "grad_norm": 5.514752283836074, + "learning_rate": 2.919646088772947e-06, + "loss": 0.778, + "step": 12815 + }, + { + "epoch": 0.925894485885094, + "grad_norm": 6.713963990779398, + "learning_rate": 2.9193577337968393e-06, + "loss": 0.7832, + "step": 12816 + }, + { + "epoch": 0.9259667310853035, + "grad_norm": 5.904323687604518, + "learning_rate": 2.919069373080051e-06, + "loss": 0.8045, + "step": 12817 + }, + { + "epoch": 0.926038976285513, + "grad_norm": 5.854307378885619, + "learning_rate": 2.9187810066265277e-06, + "loss": 0.8048, + "step": 12818 + }, + { + "epoch": 0.9261112214857226, + "grad_norm": 8.327490118565418, + "learning_rate": 2.9184926344402164e-06, + "loss": 0.803, + "step": 12819 + }, + { + "epoch": 0.9261834666859321, + "grad_norm": 6.627715220142356, + "learning_rate": 2.9182042565250673e-06, + "loss": 0.7559, + "step": 12820 + }, + { + "epoch": 0.9262557118861415, + "grad_norm": 5.568405274275929, + "learning_rate": 2.917915872885026e-06, + "loss": 0.8096, + "step": 12821 + }, + { + "epoch": 0.9263279570863511, + "grad_norm": 5.825486964225508, + "learning_rate": 2.9176274835240416e-06, + "loss": 0.7297, + "step": 12822 + }, + { + "epoch": 0.9264002022865606, + "grad_norm": 8.380886002273595, + "learning_rate": 2.91733908844606e-06, + "loss": 0.8694, + "step": 12823 + }, + { + "epoch": 0.92647244748677, + "grad_norm": 6.896783338801016, + "learning_rate": 2.917050687655031e-06, + "loss": 0.8161, + "step": 12824 + }, + { + "epoch": 0.9265446926869796, + "grad_norm": 5.95526260885937, + "learning_rate": 2.9167622811549016e-06, + "loss": 0.8086, + "step": 12825 + }, + { + "epoch": 0.9266169378871891, + "grad_norm": 5.235539457695447, + "learning_rate": 2.916473868949621e-06, + "loss": 0.881, + "step": 12826 + }, + { + "epoch": 0.9266891830873987, + "grad_norm": 5.783413538894243, + "learning_rate": 2.9161854510431364e-06, + "loss": 0.7755, + "step": 12827 + }, + { + "epoch": 0.9267614282876081, + "grad_norm": 6.556915231998897, + "learning_rate": 2.9158970274393956e-06, + "loss": 0.79, + "step": 12828 + }, + { + "epoch": 0.9268336734878176, + "grad_norm": 5.899799356443324, + "learning_rate": 2.915608598142347e-06, + "loss": 0.744, + "step": 12829 + }, + { + "epoch": 0.9269059186880272, + "grad_norm": 5.672208918050337, + "learning_rate": 2.91532016315594e-06, + "loss": 0.7522, + "step": 12830 + }, + { + "epoch": 0.9269781638882367, + "grad_norm": 5.689251640463412, + "learning_rate": 2.9150317224841222e-06, + "loss": 0.7986, + "step": 12831 + }, + { + "epoch": 0.9270504090884462, + "grad_norm": 5.717346686278829, + "learning_rate": 2.9147432761308425e-06, + "loss": 0.8424, + "step": 12832 + }, + { + "epoch": 0.9271226542886557, + "grad_norm": 6.005711380290399, + "learning_rate": 2.9144548241000492e-06, + "loss": 0.7457, + "step": 12833 + }, + { + "epoch": 0.9271948994888652, + "grad_norm": 5.606472103690777, + "learning_rate": 2.9141663663956916e-06, + "loss": 0.7727, + "step": 12834 + }, + { + "epoch": 0.9272671446890747, + "grad_norm": 5.206590203095376, + "learning_rate": 2.913877903021718e-06, + "loss": 0.7239, + "step": 12835 + }, + { + "epoch": 0.9273393898892842, + "grad_norm": 5.982341530435017, + "learning_rate": 2.9135894339820776e-06, + "loss": 0.8151, + "step": 12836 + }, + { + "epoch": 0.9274116350894938, + "grad_norm": 5.9467139779049205, + "learning_rate": 2.913300959280718e-06, + "loss": 0.8422, + "step": 12837 + }, + { + "epoch": 0.9274838802897033, + "grad_norm": 8.057775724962136, + "learning_rate": 2.9130124789215897e-06, + "loss": 0.9009, + "step": 12838 + }, + { + "epoch": 0.9275561254899127, + "grad_norm": 6.446564154035066, + "learning_rate": 2.9127239929086405e-06, + "loss": 0.8287, + "step": 12839 + }, + { + "epoch": 0.9276283706901223, + "grad_norm": 6.10648052986906, + "learning_rate": 2.9124355012458214e-06, + "loss": 0.7207, + "step": 12840 + }, + { + "epoch": 0.9277006158903318, + "grad_norm": 5.166318553559254, + "learning_rate": 2.91214700393708e-06, + "loss": 0.791, + "step": 12841 + }, + { + "epoch": 0.9277728610905412, + "grad_norm": 6.046254207474761, + "learning_rate": 2.9118585009863663e-06, + "loss": 0.7974, + "step": 12842 + }, + { + "epoch": 0.9278451062907508, + "grad_norm": 6.529684508853108, + "learning_rate": 2.9115699923976292e-06, + "loss": 0.7568, + "step": 12843 + }, + { + "epoch": 0.9279173514909603, + "grad_norm": 5.543859559431251, + "learning_rate": 2.911281478174819e-06, + "loss": 0.8268, + "step": 12844 + }, + { + "epoch": 0.9279895966911699, + "grad_norm": 6.266302590582287, + "learning_rate": 2.910992958321884e-06, + "loss": 0.8338, + "step": 12845 + }, + { + "epoch": 0.9280618418913793, + "grad_norm": 5.971920110271637, + "learning_rate": 2.9107044328427753e-06, + "loss": 0.8151, + "step": 12846 + }, + { + "epoch": 0.9281340870915888, + "grad_norm": 5.661452456286719, + "learning_rate": 2.9104159017414415e-06, + "loss": 0.8413, + "step": 12847 + }, + { + "epoch": 0.9282063322917984, + "grad_norm": 6.05893437247677, + "learning_rate": 2.9101273650218322e-06, + "loss": 0.7215, + "step": 12848 + }, + { + "epoch": 0.9282785774920079, + "grad_norm": 7.397267233953197, + "learning_rate": 2.909838822687898e-06, + "loss": 0.8247, + "step": 12849 + }, + { + "epoch": 0.9283508226922174, + "grad_norm": 5.634713496292667, + "learning_rate": 2.909550274743589e-06, + "loss": 0.8002, + "step": 12850 + }, + { + "epoch": 0.9284230678924269, + "grad_norm": 6.584843554886781, + "learning_rate": 2.9092617211928544e-06, + "loss": 0.8127, + "step": 12851 + }, + { + "epoch": 0.9284953130926364, + "grad_norm": 6.049705926983436, + "learning_rate": 2.9089731620396444e-06, + "loss": 0.7129, + "step": 12852 + }, + { + "epoch": 0.9285675582928459, + "grad_norm": 6.862783779815309, + "learning_rate": 2.9086845972879096e-06, + "loss": 0.7635, + "step": 12853 + }, + { + "epoch": 0.9286398034930554, + "grad_norm": 6.290448641401297, + "learning_rate": 2.9083960269416e-06, + "loss": 0.8223, + "step": 12854 + }, + { + "epoch": 0.928712048693265, + "grad_norm": 6.410483547783759, + "learning_rate": 2.9081074510046656e-06, + "loss": 0.8208, + "step": 12855 + }, + { + "epoch": 0.9287842938934745, + "grad_norm": 5.751142637083528, + "learning_rate": 2.9078188694810576e-06, + "loss": 0.8102, + "step": 12856 + }, + { + "epoch": 0.9288565390936839, + "grad_norm": 6.03877508311036, + "learning_rate": 2.9075302823747253e-06, + "loss": 0.7265, + "step": 12857 + }, + { + "epoch": 0.9289287842938935, + "grad_norm": 5.8564947347891705, + "learning_rate": 2.90724168968962e-06, + "loss": 0.8189, + "step": 12858 + }, + { + "epoch": 0.929001029494103, + "grad_norm": 6.123197640687733, + "learning_rate": 2.9069530914296923e-06, + "loss": 0.8613, + "step": 12859 + }, + { + "epoch": 0.9290732746943124, + "grad_norm": 5.974600436206128, + "learning_rate": 2.906664487598893e-06, + "loss": 0.7896, + "step": 12860 + }, + { + "epoch": 0.929145519894522, + "grad_norm": 6.810705386024608, + "learning_rate": 2.9063758782011726e-06, + "loss": 0.9054, + "step": 12861 + }, + { + "epoch": 0.9292177650947315, + "grad_norm": 5.776542767962309, + "learning_rate": 2.906087263240481e-06, + "loss": 0.789, + "step": 12862 + }, + { + "epoch": 0.9292900102949411, + "grad_norm": 6.137711705054681, + "learning_rate": 2.905798642720772e-06, + "loss": 0.7597, + "step": 12863 + }, + { + "epoch": 0.9293622554951505, + "grad_norm": 6.418462064500927, + "learning_rate": 2.9055100166459925e-06, + "loss": 0.7337, + "step": 12864 + }, + { + "epoch": 0.92943450069536, + "grad_norm": 5.43626407301312, + "learning_rate": 2.9052213850200972e-06, + "loss": 0.7798, + "step": 12865 + }, + { + "epoch": 0.9295067458955696, + "grad_norm": 6.308720429745906, + "learning_rate": 2.9049327478470357e-06, + "loss": 0.8246, + "step": 12866 + }, + { + "epoch": 0.9295789910957791, + "grad_norm": 8.570665016678985, + "learning_rate": 2.9046441051307585e-06, + "loss": 0.8935, + "step": 12867 + }, + { + "epoch": 0.9296512362959886, + "grad_norm": 6.382579112834544, + "learning_rate": 2.9043554568752187e-06, + "loss": 0.7924, + "step": 12868 + }, + { + "epoch": 0.9297234814961981, + "grad_norm": 6.239420366998306, + "learning_rate": 2.9040668030843656e-06, + "loss": 0.7498, + "step": 12869 + }, + { + "epoch": 0.9297957266964076, + "grad_norm": 5.2998871917235295, + "learning_rate": 2.9037781437621527e-06, + "loss": 0.7605, + "step": 12870 + }, + { + "epoch": 0.9298679718966171, + "grad_norm": 5.507953614936494, + "learning_rate": 2.9034894789125293e-06, + "loss": 0.9292, + "step": 12871 + }, + { + "epoch": 0.9299402170968266, + "grad_norm": 5.318910568090768, + "learning_rate": 2.903200808539449e-06, + "loss": 0.7705, + "step": 12872 + }, + { + "epoch": 0.9300124622970362, + "grad_norm": 8.21280121272673, + "learning_rate": 2.9029121326468623e-06, + "loss": 0.8258, + "step": 12873 + }, + { + "epoch": 0.9300847074972457, + "grad_norm": 6.496910241175013, + "learning_rate": 2.902623451238722e-06, + "loss": 0.7616, + "step": 12874 + }, + { + "epoch": 0.9301569526974551, + "grad_norm": 6.742788913464737, + "learning_rate": 2.9023347643189787e-06, + "loss": 0.7614, + "step": 12875 + }, + { + "epoch": 0.9302291978976647, + "grad_norm": 6.661074899987403, + "learning_rate": 2.902046071891585e-06, + "loss": 0.8119, + "step": 12876 + }, + { + "epoch": 0.9303014430978742, + "grad_norm": 5.887380986721728, + "learning_rate": 2.901757373960493e-06, + "loss": 0.8143, + "step": 12877 + }, + { + "epoch": 0.9303736882980836, + "grad_norm": 6.5085475181409285, + "learning_rate": 2.901468670529654e-06, + "loss": 0.8334, + "step": 12878 + }, + { + "epoch": 0.9304459334982932, + "grad_norm": 7.138759511260982, + "learning_rate": 2.9011799616030217e-06, + "loss": 0.8445, + "step": 12879 + }, + { + "epoch": 0.9305181786985027, + "grad_norm": 6.416833685480647, + "learning_rate": 2.900891247184546e-06, + "loss": 0.8232, + "step": 12880 + }, + { + "epoch": 0.9305904238987123, + "grad_norm": 5.558284748252051, + "learning_rate": 2.9006025272781806e-06, + "loss": 0.7876, + "step": 12881 + }, + { + "epoch": 0.9306626690989217, + "grad_norm": 5.72271324363684, + "learning_rate": 2.900313801887878e-06, + "loss": 0.7956, + "step": 12882 + }, + { + "epoch": 0.9307349142991312, + "grad_norm": 7.180758557216031, + "learning_rate": 2.9000250710175905e-06, + "loss": 0.7781, + "step": 12883 + }, + { + "epoch": 0.9308071594993408, + "grad_norm": 5.73083253627585, + "learning_rate": 2.8997363346712705e-06, + "loss": 0.8189, + "step": 12884 + }, + { + "epoch": 0.9308794046995503, + "grad_norm": 5.7481271762144885, + "learning_rate": 2.89944759285287e-06, + "loss": 0.7804, + "step": 12885 + }, + { + "epoch": 0.9309516498997598, + "grad_norm": 6.16613317020522, + "learning_rate": 2.8991588455663417e-06, + "loss": 0.716, + "step": 12886 + }, + { + "epoch": 0.9310238950999693, + "grad_norm": 6.852238933791036, + "learning_rate": 2.8988700928156388e-06, + "loss": 0.8556, + "step": 12887 + }, + { + "epoch": 0.9310961403001788, + "grad_norm": 6.303152585288392, + "learning_rate": 2.898581334604715e-06, + "loss": 0.8416, + "step": 12888 + }, + { + "epoch": 0.9311683855003883, + "grad_norm": 5.5275982370191725, + "learning_rate": 2.8982925709375217e-06, + "loss": 0.8686, + "step": 12889 + }, + { + "epoch": 0.9312406307005978, + "grad_norm": 6.59390880520036, + "learning_rate": 2.8980038018180116e-06, + "loss": 0.7938, + "step": 12890 + }, + { + "epoch": 0.9313128759008074, + "grad_norm": 5.671561576653597, + "learning_rate": 2.897715027250139e-06, + "loss": 0.8667, + "step": 12891 + }, + { + "epoch": 0.9313851211010169, + "grad_norm": 6.405642596967, + "learning_rate": 2.8974262472378566e-06, + "loss": 0.7699, + "step": 12892 + }, + { + "epoch": 0.9314573663012263, + "grad_norm": 6.9108918069686895, + "learning_rate": 2.8971374617851182e-06, + "loss": 0.8183, + "step": 12893 + }, + { + "epoch": 0.9315296115014359, + "grad_norm": 8.565803725818045, + "learning_rate": 2.8968486708958755e-06, + "loss": 0.8084, + "step": 12894 + }, + { + "epoch": 0.9316018567016454, + "grad_norm": 7.56129862172655, + "learning_rate": 2.8965598745740824e-06, + "loss": 0.8195, + "step": 12895 + }, + { + "epoch": 0.9316741019018548, + "grad_norm": 6.298219750174201, + "learning_rate": 2.8962710728236927e-06, + "loss": 0.8088, + "step": 12896 + }, + { + "epoch": 0.9317463471020644, + "grad_norm": 6.152615011381053, + "learning_rate": 2.89598226564866e-06, + "loss": 0.8217, + "step": 12897 + }, + { + "epoch": 0.9318185923022739, + "grad_norm": 6.722656817438965, + "learning_rate": 2.8956934530529374e-06, + "loss": 0.8186, + "step": 12898 + }, + { + "epoch": 0.9318908375024835, + "grad_norm": 7.257954673072638, + "learning_rate": 2.895404635040479e-06, + "loss": 0.787, + "step": 12899 + }, + { + "epoch": 0.9319630827026929, + "grad_norm": 7.00683750272195, + "learning_rate": 2.8951158116152368e-06, + "loss": 0.8539, + "step": 12900 + }, + { + "epoch": 0.9320353279029024, + "grad_norm": 5.488757520760423, + "learning_rate": 2.8948269827811677e-06, + "loss": 0.7864, + "step": 12901 + }, + { + "epoch": 0.932107573103112, + "grad_norm": 6.083922301047701, + "learning_rate": 2.894538148542223e-06, + "loss": 0.7629, + "step": 12902 + }, + { + "epoch": 0.9321798183033215, + "grad_norm": 6.392631234502878, + "learning_rate": 2.8942493089023572e-06, + "loss": 0.8265, + "step": 12903 + }, + { + "epoch": 0.932252063503531, + "grad_norm": 7.751002339179294, + "learning_rate": 2.8939604638655245e-06, + "loss": 0.8079, + "step": 12904 + }, + { + "epoch": 0.9323243087037405, + "grad_norm": 6.027029191200423, + "learning_rate": 2.893671613435679e-06, + "loss": 0.7937, + "step": 12905 + }, + { + "epoch": 0.93239655390395, + "grad_norm": 8.16590654637445, + "learning_rate": 2.893382757616775e-06, + "loss": 0.7981, + "step": 12906 + }, + { + "epoch": 0.9324687991041595, + "grad_norm": 8.003749922216636, + "learning_rate": 2.8930938964127665e-06, + "loss": 0.8135, + "step": 12907 + }, + { + "epoch": 0.932541044304369, + "grad_norm": 6.516542072844938, + "learning_rate": 2.892805029827608e-06, + "loss": 0.7944, + "step": 12908 + }, + { + "epoch": 0.9326132895045786, + "grad_norm": 6.926357910108731, + "learning_rate": 2.892516157865253e-06, + "loss": 0.8406, + "step": 12909 + }, + { + "epoch": 0.9326855347047881, + "grad_norm": 6.35966472293812, + "learning_rate": 2.8922272805296564e-06, + "loss": 0.8919, + "step": 12910 + }, + { + "epoch": 0.9327577799049975, + "grad_norm": 6.044571312027959, + "learning_rate": 2.891938397824774e-06, + "loss": 0.7468, + "step": 12911 + }, + { + "epoch": 0.9328300251052071, + "grad_norm": 5.5094931777508895, + "learning_rate": 2.8916495097545584e-06, + "loss": 0.7857, + "step": 12912 + }, + { + "epoch": 0.9329022703054166, + "grad_norm": 5.8237360160022735, + "learning_rate": 2.891360616322966e-06, + "loss": 0.7533, + "step": 12913 + }, + { + "epoch": 0.932974515505626, + "grad_norm": 5.005476432021575, + "learning_rate": 2.8910717175339488e-06, + "loss": 0.6873, + "step": 12914 + }, + { + "epoch": 0.9330467607058356, + "grad_norm": 5.7911384762475, + "learning_rate": 2.8907828133914645e-06, + "loss": 0.8396, + "step": 12915 + }, + { + "epoch": 0.9331190059060451, + "grad_norm": 7.452900561266268, + "learning_rate": 2.8904939038994674e-06, + "loss": 0.8193, + "step": 12916 + }, + { + "epoch": 0.9331912511062547, + "grad_norm": 5.915416567556727, + "learning_rate": 2.8902049890619114e-06, + "loss": 0.739, + "step": 12917 + }, + { + "epoch": 0.9332634963064641, + "grad_norm": 6.3362770601057115, + "learning_rate": 2.8899160688827522e-06, + "loss": 0.8778, + "step": 12918 + }, + { + "epoch": 0.9333357415066736, + "grad_norm": 6.336297529459609, + "learning_rate": 2.8896271433659446e-06, + "loss": 0.8251, + "step": 12919 + }, + { + "epoch": 0.9334079867068832, + "grad_norm": 6.055424287327895, + "learning_rate": 2.889338212515444e-06, + "loss": 0.847, + "step": 12920 + }, + { + "epoch": 0.9334802319070927, + "grad_norm": 5.304120265201358, + "learning_rate": 2.889049276335206e-06, + "loss": 0.7744, + "step": 12921 + }, + { + "epoch": 0.9335524771073022, + "grad_norm": 6.161066175563913, + "learning_rate": 2.888760334829185e-06, + "loss": 0.7789, + "step": 12922 + }, + { + "epoch": 0.9336247223075117, + "grad_norm": 5.495275896138666, + "learning_rate": 2.888471388001337e-06, + "loss": 0.8368, + "step": 12923 + }, + { + "epoch": 0.9336969675077212, + "grad_norm": 5.62118265109511, + "learning_rate": 2.8881824358556173e-06, + "loss": 0.8262, + "step": 12924 + }, + { + "epoch": 0.9337692127079307, + "grad_norm": 6.969353482925852, + "learning_rate": 2.8878934783959813e-06, + "loss": 0.8231, + "step": 12925 + }, + { + "epoch": 0.9338414579081402, + "grad_norm": 5.244152991139982, + "learning_rate": 2.8876045156263856e-06, + "loss": 0.7874, + "step": 12926 + }, + { + "epoch": 0.9339137031083498, + "grad_norm": 7.045732968106393, + "learning_rate": 2.887315547550784e-06, + "loss": 0.8521, + "step": 12927 + }, + { + "epoch": 0.9339859483085593, + "grad_norm": 7.5754543357038315, + "learning_rate": 2.8870265741731336e-06, + "loss": 0.7505, + "step": 12928 + }, + { + "epoch": 0.9340581935087687, + "grad_norm": 6.701103563663769, + "learning_rate": 2.8867375954973895e-06, + "loss": 0.883, + "step": 12929 + }, + { + "epoch": 0.9341304387089783, + "grad_norm": 7.865852735190825, + "learning_rate": 2.8864486115275087e-06, + "loss": 0.8281, + "step": 12930 + }, + { + "epoch": 0.9342026839091878, + "grad_norm": 7.616823909591619, + "learning_rate": 2.8861596222674467e-06, + "loss": 0.8371, + "step": 12931 + }, + { + "epoch": 0.9342749291093972, + "grad_norm": 6.116962063314861, + "learning_rate": 2.8858706277211584e-06, + "loss": 0.8505, + "step": 12932 + }, + { + "epoch": 0.9343471743096068, + "grad_norm": 7.582136653617558, + "learning_rate": 2.8855816278926015e-06, + "loss": 0.7766, + "step": 12933 + }, + { + "epoch": 0.9344194195098163, + "grad_norm": 6.108975995002725, + "learning_rate": 2.8852926227857313e-06, + "loss": 0.7637, + "step": 12934 + }, + { + "epoch": 0.9344916647100259, + "grad_norm": 7.650172970879482, + "learning_rate": 2.8850036124045043e-06, + "loss": 0.7745, + "step": 12935 + }, + { + "epoch": 0.9345639099102353, + "grad_norm": 8.025588121581222, + "learning_rate": 2.884714596752877e-06, + "loss": 0.798, + "step": 12936 + }, + { + "epoch": 0.9346361551104448, + "grad_norm": 6.460070403102207, + "learning_rate": 2.884425575834805e-06, + "loss": 0.8786, + "step": 12937 + }, + { + "epoch": 0.9347084003106544, + "grad_norm": 7.811178599187093, + "learning_rate": 2.8841365496542457e-06, + "loss": 0.7454, + "step": 12938 + }, + { + "epoch": 0.9347806455108638, + "grad_norm": 7.137392744070167, + "learning_rate": 2.883847518215155e-06, + "loss": 0.7611, + "step": 12939 + }, + { + "epoch": 0.9348528907110734, + "grad_norm": 6.059551347799695, + "learning_rate": 2.88355848152149e-06, + "loss": 0.8206, + "step": 12940 + }, + { + "epoch": 0.9349251359112829, + "grad_norm": 7.178672610873236, + "learning_rate": 2.883269439577207e-06, + "loss": 0.7939, + "step": 12941 + }, + { + "epoch": 0.9349973811114924, + "grad_norm": 6.5907490486636595, + "learning_rate": 2.8829803923862633e-06, + "loss": 0.8394, + "step": 12942 + }, + { + "epoch": 0.9350696263117019, + "grad_norm": 6.327544461218064, + "learning_rate": 2.8826913399526156e-06, + "loss": 0.7796, + "step": 12943 + }, + { + "epoch": 0.9351418715119114, + "grad_norm": 5.0274437195649995, + "learning_rate": 2.88240228228022e-06, + "loss": 0.7681, + "step": 12944 + }, + { + "epoch": 0.935214116712121, + "grad_norm": 6.391084314082923, + "learning_rate": 2.882113219373035e-06, + "loss": 0.8211, + "step": 12945 + }, + { + "epoch": 0.9352863619123305, + "grad_norm": 8.34275371875596, + "learning_rate": 2.8818241512350158e-06, + "loss": 0.8107, + "step": 12946 + }, + { + "epoch": 0.9353586071125399, + "grad_norm": 10.389220270412427, + "learning_rate": 2.8815350778701205e-06, + "loss": 0.8015, + "step": 12947 + }, + { + "epoch": 0.9354308523127495, + "grad_norm": 6.602888238275596, + "learning_rate": 2.881245999282306e-06, + "loss": 0.7827, + "step": 12948 + }, + { + "epoch": 0.935503097512959, + "grad_norm": 6.400036287204867, + "learning_rate": 2.880956915475531e-06, + "loss": 0.8963, + "step": 12949 + }, + { + "epoch": 0.9355753427131684, + "grad_norm": 6.504577492107392, + "learning_rate": 2.8806678264537515e-06, + "loss": 0.7873, + "step": 12950 + }, + { + "epoch": 0.935647587913378, + "grad_norm": 6.923688181766281, + "learning_rate": 2.8803787322209235e-06, + "loss": 0.716, + "step": 12951 + }, + { + "epoch": 0.9357198331135875, + "grad_norm": 7.3122975900138165, + "learning_rate": 2.880089632781007e-06, + "loss": 0.746, + "step": 12952 + }, + { + "epoch": 0.9357920783137971, + "grad_norm": 5.95276582690687, + "learning_rate": 2.879800528137958e-06, + "loss": 0.7818, + "step": 12953 + }, + { + "epoch": 0.9358643235140065, + "grad_norm": 7.29830755184638, + "learning_rate": 2.8795114182957355e-06, + "loss": 0.9018, + "step": 12954 + }, + { + "epoch": 0.935936568714216, + "grad_norm": 6.461174551179572, + "learning_rate": 2.879222303258296e-06, + "loss": 0.8308, + "step": 12955 + }, + { + "epoch": 0.9360088139144256, + "grad_norm": 6.187886235676431, + "learning_rate": 2.8789331830295973e-06, + "loss": 0.8807, + "step": 12956 + }, + { + "epoch": 0.936081059114635, + "grad_norm": 5.465828076838107, + "learning_rate": 2.8786440576135978e-06, + "loss": 0.7712, + "step": 12957 + }, + { + "epoch": 0.9361533043148446, + "grad_norm": 5.143513618683347, + "learning_rate": 2.878354927014256e-06, + "loss": 0.7513, + "step": 12958 + }, + { + "epoch": 0.9362255495150541, + "grad_norm": 7.342307870368911, + "learning_rate": 2.8780657912355282e-06, + "loss": 0.7376, + "step": 12959 + }, + { + "epoch": 0.9362977947152636, + "grad_norm": 5.822963689387878, + "learning_rate": 2.877776650281373e-06, + "loss": 0.7864, + "step": 12960 + }, + { + "epoch": 0.9363700399154731, + "grad_norm": 5.338498038757199, + "learning_rate": 2.8774875041557486e-06, + "loss": 0.7687, + "step": 12961 + }, + { + "epoch": 0.9364422851156826, + "grad_norm": 4.917113310790918, + "learning_rate": 2.8771983528626142e-06, + "loss": 0.7622, + "step": 12962 + }, + { + "epoch": 0.9365145303158922, + "grad_norm": 7.973401196976684, + "learning_rate": 2.876909196405927e-06, + "loss": 0.8654, + "step": 12963 + }, + { + "epoch": 0.9365867755161017, + "grad_norm": 5.1106980037058545, + "learning_rate": 2.876620034789645e-06, + "loss": 0.7471, + "step": 12964 + }, + { + "epoch": 0.9366590207163111, + "grad_norm": 6.381975434230003, + "learning_rate": 2.876330868017728e-06, + "loss": 0.8797, + "step": 12965 + }, + { + "epoch": 0.9367312659165207, + "grad_norm": 6.643209515438725, + "learning_rate": 2.876041696094133e-06, + "loss": 0.8048, + "step": 12966 + }, + { + "epoch": 0.9368035111167302, + "grad_norm": 8.351790879127176, + "learning_rate": 2.875752519022819e-06, + "loss": 0.7794, + "step": 12967 + }, + { + "epoch": 0.9368757563169396, + "grad_norm": 7.391202295189072, + "learning_rate": 2.8754633368077457e-06, + "loss": 0.75, + "step": 12968 + }, + { + "epoch": 0.9369480015171492, + "grad_norm": 5.972006982675342, + "learning_rate": 2.8751741494528697e-06, + "loss": 0.7677, + "step": 12969 + }, + { + "epoch": 0.9370202467173587, + "grad_norm": 6.846839799533688, + "learning_rate": 2.8748849569621518e-06, + "loss": 0.7873, + "step": 12970 + }, + { + "epoch": 0.9370924919175683, + "grad_norm": 7.047620374416781, + "learning_rate": 2.874595759339549e-06, + "loss": 0.7004, + "step": 12971 + }, + { + "epoch": 0.9371647371177777, + "grad_norm": 8.275536981314358, + "learning_rate": 2.8743065565890223e-06, + "loss": 0.8062, + "step": 12972 + }, + { + "epoch": 0.9372369823179872, + "grad_norm": 7.514665633610117, + "learning_rate": 2.8740173487145284e-06, + "loss": 0.8384, + "step": 12973 + }, + { + "epoch": 0.9373092275181968, + "grad_norm": 6.080302105977233, + "learning_rate": 2.873728135720028e-06, + "loss": 0.8316, + "step": 12974 + }, + { + "epoch": 0.9373814727184062, + "grad_norm": 5.18656526899554, + "learning_rate": 2.8734389176094795e-06, + "loss": 0.7944, + "step": 12975 + }, + { + "epoch": 0.9374537179186158, + "grad_norm": 6.446551431587855, + "learning_rate": 2.873149694386842e-06, + "loss": 0.7712, + "step": 12976 + }, + { + "epoch": 0.9375259631188253, + "grad_norm": 5.91778858275666, + "learning_rate": 2.872860466056075e-06, + "loss": 0.8133, + "step": 12977 + }, + { + "epoch": 0.9375982083190348, + "grad_norm": 6.492410777598574, + "learning_rate": 2.8725712326211374e-06, + "loss": 0.773, + "step": 12978 + }, + { + "epoch": 0.9376704535192443, + "grad_norm": 8.441826509378947, + "learning_rate": 2.8722819940859896e-06, + "loss": 0.8764, + "step": 12979 + }, + { + "epoch": 0.9377426987194538, + "grad_norm": 6.437488444790145, + "learning_rate": 2.8719927504545902e-06, + "loss": 0.7832, + "step": 12980 + }, + { + "epoch": 0.9378149439196634, + "grad_norm": 6.1565452543609345, + "learning_rate": 2.8717035017308987e-06, + "loss": 0.8055, + "step": 12981 + }, + { + "epoch": 0.9378871891198729, + "grad_norm": 6.843421126872623, + "learning_rate": 2.8714142479188743e-06, + "loss": 0.841, + "step": 12982 + }, + { + "epoch": 0.9379594343200823, + "grad_norm": 5.353605406717055, + "learning_rate": 2.8711249890224784e-06, + "loss": 0.7505, + "step": 12983 + }, + { + "epoch": 0.9380316795202919, + "grad_norm": 5.93948459585546, + "learning_rate": 2.870835725045669e-06, + "loss": 0.8551, + "step": 12984 + }, + { + "epoch": 0.9381039247205014, + "grad_norm": 6.385458352314314, + "learning_rate": 2.8705464559924058e-06, + "loss": 0.9152, + "step": 12985 + }, + { + "epoch": 0.9381761699207108, + "grad_norm": 6.995548467722728, + "learning_rate": 2.87025718186665e-06, + "loss": 0.8142, + "step": 12986 + }, + { + "epoch": 0.9382484151209204, + "grad_norm": 5.553798770763378, + "learning_rate": 2.869967902672361e-06, + "loss": 0.8431, + "step": 12987 + }, + { + "epoch": 0.9383206603211299, + "grad_norm": 6.710017148946801, + "learning_rate": 2.869678618413499e-06, + "loss": 0.8692, + "step": 12988 + }, + { + "epoch": 0.9383929055213395, + "grad_norm": 7.056624137210879, + "learning_rate": 2.869389329094023e-06, + "loss": 0.8331, + "step": 12989 + }, + { + "epoch": 0.9384651507215489, + "grad_norm": 6.2951618448287645, + "learning_rate": 2.869100034717894e-06, + "loss": 0.8231, + "step": 12990 + }, + { + "epoch": 0.9385373959217584, + "grad_norm": 8.162485362765638, + "learning_rate": 2.868810735289072e-06, + "loss": 0.8058, + "step": 12991 + }, + { + "epoch": 0.938609641121968, + "grad_norm": 5.442027169515521, + "learning_rate": 2.868521430811518e-06, + "loss": 0.8497, + "step": 12992 + }, + { + "epoch": 0.9386818863221774, + "grad_norm": 6.555828663512496, + "learning_rate": 2.8682321212891917e-06, + "loss": 0.7501, + "step": 12993 + }, + { + "epoch": 0.938754131522387, + "grad_norm": 7.975368249696168, + "learning_rate": 2.8679428067260527e-06, + "loss": 0.8482, + "step": 12994 + }, + { + "epoch": 0.9388263767225965, + "grad_norm": 7.829835746669544, + "learning_rate": 2.867653487126063e-06, + "loss": 0.8604, + "step": 12995 + }, + { + "epoch": 0.938898621922806, + "grad_norm": 8.913448810766146, + "learning_rate": 2.8673641624931825e-06, + "loss": 0.7822, + "step": 12996 + }, + { + "epoch": 0.9389708671230155, + "grad_norm": 8.552167686823891, + "learning_rate": 2.867074832831372e-06, + "loss": 0.7987, + "step": 12997 + }, + { + "epoch": 0.939043112323225, + "grad_norm": 5.61227843393133, + "learning_rate": 2.866785498144592e-06, + "loss": 0.7882, + "step": 12998 + }, + { + "epoch": 0.9391153575234346, + "grad_norm": 7.169357090514365, + "learning_rate": 2.866496158436803e-06, + "loss": 0.8271, + "step": 12999 + }, + { + "epoch": 0.9391876027236441, + "grad_norm": 6.943648903949447, + "learning_rate": 2.8662068137119666e-06, + "loss": 0.8356, + "step": 13000 + }, + { + "epoch": 0.9392598479238535, + "grad_norm": 6.317632420506496, + "learning_rate": 2.865917463974043e-06, + "loss": 0.8414, + "step": 13001 + }, + { + "epoch": 0.9393320931240631, + "grad_norm": 12.100723345865076, + "learning_rate": 2.8656281092269933e-06, + "loss": 0.8078, + "step": 13002 + }, + { + "epoch": 0.9394043383242726, + "grad_norm": 9.240083895215724, + "learning_rate": 2.8653387494747787e-06, + "loss": 0.8306, + "step": 13003 + }, + { + "epoch": 0.939476583524482, + "grad_norm": 6.477513076727803, + "learning_rate": 2.86504938472136e-06, + "loss": 0.8243, + "step": 13004 + }, + { + "epoch": 0.9395488287246916, + "grad_norm": 6.607405709594473, + "learning_rate": 2.8647600149706994e-06, + "loss": 0.7617, + "step": 13005 + }, + { + "epoch": 0.9396210739249011, + "grad_norm": 5.633772557909392, + "learning_rate": 2.8644706402267576e-06, + "loss": 0.8693, + "step": 13006 + }, + { + "epoch": 0.9396933191251107, + "grad_norm": 5.764949438530107, + "learning_rate": 2.8641812604934948e-06, + "loss": 0.7904, + "step": 13007 + }, + { + "epoch": 0.9397655643253201, + "grad_norm": 6.334224972785493, + "learning_rate": 2.863891875774874e-06, + "loss": 0.7848, + "step": 13008 + }, + { + "epoch": 0.9398378095255296, + "grad_norm": 9.553619849129422, + "learning_rate": 2.8636024860748556e-06, + "loss": 0.8392, + "step": 13009 + }, + { + "epoch": 0.9399100547257392, + "grad_norm": 6.576947684505828, + "learning_rate": 2.863313091397401e-06, + "loss": 0.784, + "step": 13010 + }, + { + "epoch": 0.9399822999259486, + "grad_norm": 7.384797393560998, + "learning_rate": 2.8630236917464736e-06, + "loss": 0.8439, + "step": 13011 + }, + { + "epoch": 0.9400545451261582, + "grad_norm": 5.6649645698698645, + "learning_rate": 2.8627342871260327e-06, + "loss": 0.8228, + "step": 13012 + }, + { + "epoch": 0.9401267903263677, + "grad_norm": 5.96925104245185, + "learning_rate": 2.862444877540041e-06, + "loss": 0.8209, + "step": 13013 + }, + { + "epoch": 0.9401990355265772, + "grad_norm": 6.899582349880549, + "learning_rate": 2.8621554629924608e-06, + "loss": 0.8583, + "step": 13014 + }, + { + "epoch": 0.9402712807267867, + "grad_norm": 6.105193519368206, + "learning_rate": 2.8618660434872534e-06, + "loss": 0.7455, + "step": 13015 + }, + { + "epoch": 0.9403435259269962, + "grad_norm": 5.986753145597232, + "learning_rate": 2.861576619028381e-06, + "loss": 0.7351, + "step": 13016 + }, + { + "epoch": 0.9404157711272058, + "grad_norm": 6.738060175268332, + "learning_rate": 2.8612871896198054e-06, + "loss": 0.8136, + "step": 13017 + }, + { + "epoch": 0.9404880163274153, + "grad_norm": 5.564806856297925, + "learning_rate": 2.8609977552654883e-06, + "loss": 0.7072, + "step": 13018 + }, + { + "epoch": 0.9405602615276247, + "grad_norm": 7.32740321397065, + "learning_rate": 2.8607083159693925e-06, + "loss": 0.864, + "step": 13019 + }, + { + "epoch": 0.9406325067278343, + "grad_norm": 7.528340771006869, + "learning_rate": 2.86041887173548e-06, + "loss": 0.773, + "step": 13020 + }, + { + "epoch": 0.9407047519280438, + "grad_norm": 7.806263624168706, + "learning_rate": 2.860129422567713e-06, + "loss": 0.8584, + "step": 13021 + }, + { + "epoch": 0.9407769971282532, + "grad_norm": 7.484527395206049, + "learning_rate": 2.859839968470054e-06, + "loss": 0.7732, + "step": 13022 + }, + { + "epoch": 0.9408492423284628, + "grad_norm": 6.428590405148547, + "learning_rate": 2.8595505094464643e-06, + "loss": 0.7402, + "step": 13023 + }, + { + "epoch": 0.9409214875286723, + "grad_norm": 6.226921808269221, + "learning_rate": 2.8592610455009085e-06, + "loss": 0.8031, + "step": 13024 + }, + { + "epoch": 0.9409937327288819, + "grad_norm": 5.686950321673439, + "learning_rate": 2.8589715766373473e-06, + "loss": 0.8091, + "step": 13025 + }, + { + "epoch": 0.9410659779290913, + "grad_norm": 7.948922656494135, + "learning_rate": 2.858682102859744e-06, + "loss": 0.9392, + "step": 13026 + }, + { + "epoch": 0.9411382231293008, + "grad_norm": 5.486716267920466, + "learning_rate": 2.858392624172061e-06, + "loss": 0.8047, + "step": 13027 + }, + { + "epoch": 0.9412104683295104, + "grad_norm": 6.655569982044263, + "learning_rate": 2.858103140578261e-06, + "loss": 0.7916, + "step": 13028 + }, + { + "epoch": 0.9412827135297198, + "grad_norm": 7.121812341606084, + "learning_rate": 2.857813652082308e-06, + "loss": 0.8135, + "step": 13029 + }, + { + "epoch": 0.9413549587299294, + "grad_norm": 7.407555831199248, + "learning_rate": 2.8575241586881634e-06, + "loss": 0.7019, + "step": 13030 + }, + { + "epoch": 0.9414272039301389, + "grad_norm": 5.262563658848008, + "learning_rate": 2.8572346603997914e-06, + "loss": 0.8039, + "step": 13031 + }, + { + "epoch": 0.9414994491303484, + "grad_norm": 5.9572912340808255, + "learning_rate": 2.856945157221153e-06, + "loss": 0.7979, + "step": 13032 + }, + { + "epoch": 0.9415716943305579, + "grad_norm": 8.093403576834627, + "learning_rate": 2.8566556491562133e-06, + "loss": 0.8269, + "step": 13033 + }, + { + "epoch": 0.9416439395307674, + "grad_norm": 5.48744189811656, + "learning_rate": 2.8563661362089345e-06, + "loss": 0.7581, + "step": 13034 + }, + { + "epoch": 0.941716184730977, + "grad_norm": 6.031664582071456, + "learning_rate": 2.85607661838328e-06, + "loss": 0.7998, + "step": 13035 + }, + { + "epoch": 0.9417884299311865, + "grad_norm": 7.124691337458257, + "learning_rate": 2.8557870956832135e-06, + "loss": 0.8242, + "step": 13036 + }, + { + "epoch": 0.9418606751313959, + "grad_norm": 6.43236625926722, + "learning_rate": 2.8554975681126966e-06, + "loss": 0.8676, + "step": 13037 + }, + { + "epoch": 0.9419329203316055, + "grad_norm": 8.766032761934031, + "learning_rate": 2.8552080356756958e-06, + "loss": 0.8231, + "step": 13038 + }, + { + "epoch": 0.942005165531815, + "grad_norm": 5.149161350148515, + "learning_rate": 2.854918498376171e-06, + "loss": 0.7074, + "step": 13039 + }, + { + "epoch": 0.9420774107320244, + "grad_norm": 4.942595930466131, + "learning_rate": 2.8546289562180896e-06, + "loss": 0.7888, + "step": 13040 + }, + { + "epoch": 0.942149655932234, + "grad_norm": 6.025629300378191, + "learning_rate": 2.8543394092054115e-06, + "loss": 0.7701, + "step": 13041 + }, + { + "epoch": 0.9422219011324435, + "grad_norm": 7.672122120033481, + "learning_rate": 2.8540498573421022e-06, + "loss": 0.7849, + "step": 13042 + }, + { + "epoch": 0.9422941463326531, + "grad_norm": 6.689737391314108, + "learning_rate": 2.8537603006321264e-06, + "loss": 0.7776, + "step": 13043 + }, + { + "epoch": 0.9423663915328625, + "grad_norm": 6.383511116334277, + "learning_rate": 2.8534707390794455e-06, + "loss": 0.844, + "step": 13044 + }, + { + "epoch": 0.942438636733072, + "grad_norm": 6.4317785225689255, + "learning_rate": 2.8531811726880253e-06, + "loss": 0.7622, + "step": 13045 + }, + { + "epoch": 0.9425108819332816, + "grad_norm": 6.1319686795086135, + "learning_rate": 2.8528916014618284e-06, + "loss": 0.7697, + "step": 13046 + }, + { + "epoch": 0.942583127133491, + "grad_norm": 5.760767267835089, + "learning_rate": 2.8526020254048204e-06, + "loss": 0.8318, + "step": 13047 + }, + { + "epoch": 0.9426553723337006, + "grad_norm": 6.763806951167091, + "learning_rate": 2.8523124445209634e-06, + "loss": 0.8234, + "step": 13048 + }, + { + "epoch": 0.9427276175339101, + "grad_norm": 5.187103624022529, + "learning_rate": 2.8520228588142236e-06, + "loss": 0.7558, + "step": 13049 + }, + { + "epoch": 0.9427998627341196, + "grad_norm": 5.94626556657474, + "learning_rate": 2.851733268288564e-06, + "loss": 0.7779, + "step": 13050 + }, + { + "epoch": 0.9428721079343291, + "grad_norm": 7.300786736304119, + "learning_rate": 2.8514436729479484e-06, + "loss": 0.7884, + "step": 13051 + }, + { + "epoch": 0.9429443531345386, + "grad_norm": 6.6512352047221395, + "learning_rate": 2.851154072796342e-06, + "loss": 0.8347, + "step": 13052 + }, + { + "epoch": 0.9430165983347482, + "grad_norm": 4.8318473022282875, + "learning_rate": 2.8508644678377097e-06, + "loss": 0.7437, + "step": 13053 + }, + { + "epoch": 0.9430888435349577, + "grad_norm": 6.165704428716554, + "learning_rate": 2.850574858076015e-06, + "loss": 0.7292, + "step": 13054 + }, + { + "epoch": 0.9431610887351671, + "grad_norm": 4.748764479721258, + "learning_rate": 2.850285243515222e-06, + "loss": 0.7021, + "step": 13055 + }, + { + "epoch": 0.9432333339353767, + "grad_norm": 7.018714139715966, + "learning_rate": 2.8499956241592964e-06, + "loss": 0.8089, + "step": 13056 + }, + { + "epoch": 0.9433055791355862, + "grad_norm": 8.050651893520458, + "learning_rate": 2.849706000012203e-06, + "loss": 0.8292, + "step": 13057 + }, + { + "epoch": 0.9433778243357956, + "grad_norm": 6.341469340652646, + "learning_rate": 2.849416371077905e-06, + "loss": 0.7755, + "step": 13058 + }, + { + "epoch": 0.9434500695360052, + "grad_norm": 5.561582682547699, + "learning_rate": 2.849126737360369e-06, + "loss": 0.8708, + "step": 13059 + }, + { + "epoch": 0.9435223147362147, + "grad_norm": 6.517683184368267, + "learning_rate": 2.848837098863559e-06, + "loss": 0.7816, + "step": 13060 + }, + { + "epoch": 0.9435945599364243, + "grad_norm": 6.3163506877580975, + "learning_rate": 2.8485474555914393e-06, + "loss": 0.7894, + "step": 13061 + }, + { + "epoch": 0.9436668051366337, + "grad_norm": 6.12926381455777, + "learning_rate": 2.8482578075479755e-06, + "loss": 0.8349, + "step": 13062 + }, + { + "epoch": 0.9437390503368432, + "grad_norm": 6.881923172095926, + "learning_rate": 2.8479681547371342e-06, + "loss": 0.7443, + "step": 13063 + }, + { + "epoch": 0.9438112955370528, + "grad_norm": 5.507884356523734, + "learning_rate": 2.8476784971628773e-06, + "loss": 0.8928, + "step": 13064 + }, + { + "epoch": 0.9438835407372622, + "grad_norm": 7.008404182623297, + "learning_rate": 2.847388834829173e-06, + "loss": 0.7889, + "step": 13065 + }, + { + "epoch": 0.9439557859374718, + "grad_norm": 7.6476828462927555, + "learning_rate": 2.8470991677399844e-06, + "loss": 0.8028, + "step": 13066 + }, + { + "epoch": 0.9440280311376813, + "grad_norm": 6.35866113353533, + "learning_rate": 2.846809495899278e-06, + "loss": 0.8187, + "step": 13067 + }, + { + "epoch": 0.9441002763378908, + "grad_norm": 5.967469082941224, + "learning_rate": 2.8465198193110193e-06, + "loss": 0.814, + "step": 13068 + }, + { + "epoch": 0.9441725215381003, + "grad_norm": 5.143828440331148, + "learning_rate": 2.846230137979173e-06, + "loss": 0.8277, + "step": 13069 + }, + { + "epoch": 0.9442447667383098, + "grad_norm": 6.140276794655941, + "learning_rate": 2.8459404519077042e-06, + "loss": 0.8139, + "step": 13070 + }, + { + "epoch": 0.9443170119385194, + "grad_norm": 6.214684465765997, + "learning_rate": 2.8456507611005802e-06, + "loss": 0.8123, + "step": 13071 + }, + { + "epoch": 0.9443892571387289, + "grad_norm": 10.118787578586026, + "learning_rate": 2.8453610655617654e-06, + "loss": 0.8508, + "step": 13072 + }, + { + "epoch": 0.9444615023389383, + "grad_norm": 5.527190534537246, + "learning_rate": 2.8450713652952256e-06, + "loss": 0.7434, + "step": 13073 + }, + { + "epoch": 0.9445337475391479, + "grad_norm": 5.37622996607021, + "learning_rate": 2.844781660304927e-06, + "loss": 0.8412, + "step": 13074 + }, + { + "epoch": 0.9446059927393574, + "grad_norm": 7.239119158530701, + "learning_rate": 2.8444919505948355e-06, + "loss": 0.7552, + "step": 13075 + }, + { + "epoch": 0.9446782379395668, + "grad_norm": 7.945603446922215, + "learning_rate": 2.844202236168916e-06, + "loss": 0.8528, + "step": 13076 + }, + { + "epoch": 0.9447504831397764, + "grad_norm": 5.321924623536902, + "learning_rate": 2.8439125170311356e-06, + "loss": 0.7721, + "step": 13077 + }, + { + "epoch": 0.9448227283399859, + "grad_norm": 7.275174598302849, + "learning_rate": 2.8436227931854593e-06, + "loss": 0.7616, + "step": 13078 + }, + { + "epoch": 0.9448949735401955, + "grad_norm": 6.322478914478358, + "learning_rate": 2.843333064635855e-06, + "loss": 0.8587, + "step": 13079 + }, + { + "epoch": 0.9449672187404049, + "grad_norm": 6.842517480517781, + "learning_rate": 2.8430433313862854e-06, + "loss": 0.8254, + "step": 13080 + }, + { + "epoch": 0.9450394639406144, + "grad_norm": 5.891867266328494, + "learning_rate": 2.842753593440721e-06, + "loss": 0.7817, + "step": 13081 + }, + { + "epoch": 0.945111709140824, + "grad_norm": 7.426299565189489, + "learning_rate": 2.8424638508031256e-06, + "loss": 0.7752, + "step": 13082 + }, + { + "epoch": 0.9451839543410334, + "grad_norm": 7.1850916269044, + "learning_rate": 2.842174103477465e-06, + "loss": 0.8686, + "step": 13083 + }, + { + "epoch": 0.945256199541243, + "grad_norm": 6.9457104185960805, + "learning_rate": 2.8418843514677074e-06, + "loss": 0.8105, + "step": 13084 + }, + { + "epoch": 0.9453284447414525, + "grad_norm": 5.9505356491427515, + "learning_rate": 2.8415945947778186e-06, + "loss": 0.6346, + "step": 13085 + }, + { + "epoch": 0.945400689941662, + "grad_norm": 6.740011949387097, + "learning_rate": 2.841304833411765e-06, + "loss": 0.8332, + "step": 13086 + }, + { + "epoch": 0.9454729351418715, + "grad_norm": 6.011656881687008, + "learning_rate": 2.8410150673735124e-06, + "loss": 0.7682, + "step": 13087 + }, + { + "epoch": 0.945545180342081, + "grad_norm": 7.233242003864837, + "learning_rate": 2.8407252966670296e-06, + "loss": 0.846, + "step": 13088 + }, + { + "epoch": 0.9456174255422906, + "grad_norm": 7.527356166067212, + "learning_rate": 2.8404355212962815e-06, + "loss": 0.7525, + "step": 13089 + }, + { + "epoch": 0.9456896707425001, + "grad_norm": 5.4295863299927145, + "learning_rate": 2.840145741265235e-06, + "loss": 0.7682, + "step": 13090 + }, + { + "epoch": 0.9457619159427095, + "grad_norm": 6.0794506833736435, + "learning_rate": 2.839855956577858e-06, + "loss": 0.8446, + "step": 13091 + }, + { + "epoch": 0.9458341611429191, + "grad_norm": 7.623775446383817, + "learning_rate": 2.8395661672381163e-06, + "loss": 0.8754, + "step": 13092 + }, + { + "epoch": 0.9459064063431286, + "grad_norm": 5.762355301171853, + "learning_rate": 2.8392763732499783e-06, + "loss": 0.7966, + "step": 13093 + }, + { + "epoch": 0.945978651543338, + "grad_norm": 5.770093225423496, + "learning_rate": 2.8389865746174087e-06, + "loss": 0.7967, + "step": 13094 + }, + { + "epoch": 0.9460508967435476, + "grad_norm": 7.022943770013921, + "learning_rate": 2.8386967713443776e-06, + "loss": 0.7386, + "step": 13095 + }, + { + "epoch": 0.9461231419437571, + "grad_norm": 7.627815102139135, + "learning_rate": 2.83840696343485e-06, + "loss": 0.7485, + "step": 13096 + }, + { + "epoch": 0.9461953871439667, + "grad_norm": 6.2348329763059125, + "learning_rate": 2.838117150892794e-06, + "loss": 0.7619, + "step": 13097 + }, + { + "epoch": 0.9462676323441761, + "grad_norm": 5.495201792183469, + "learning_rate": 2.8378273337221766e-06, + "loss": 0.7366, + "step": 13098 + }, + { + "epoch": 0.9463398775443856, + "grad_norm": 7.211875383380536, + "learning_rate": 2.8375375119269655e-06, + "loss": 0.785, + "step": 13099 + }, + { + "epoch": 0.9464121227445952, + "grad_norm": 5.242111455647212, + "learning_rate": 2.8372476855111283e-06, + "loss": 0.762, + "step": 13100 + }, + { + "epoch": 0.9464843679448046, + "grad_norm": 7.79492407722031, + "learning_rate": 2.8369578544786315e-06, + "loss": 0.8852, + "step": 13101 + }, + { + "epoch": 0.9465566131450142, + "grad_norm": 5.337840063621221, + "learning_rate": 2.8366680188334433e-06, + "loss": 0.7808, + "step": 13102 + }, + { + "epoch": 0.9466288583452237, + "grad_norm": 6.060352062913907, + "learning_rate": 2.8363781785795314e-06, + "loss": 0.8157, + "step": 13103 + }, + { + "epoch": 0.9467011035454332, + "grad_norm": 6.476495942623929, + "learning_rate": 2.8360883337208633e-06, + "loss": 0.805, + "step": 13104 + }, + { + "epoch": 0.9467733487456427, + "grad_norm": 5.5839268193409275, + "learning_rate": 2.835798484261407e-06, + "loss": 0.6774, + "step": 13105 + }, + { + "epoch": 0.9468455939458522, + "grad_norm": 6.30631519658212, + "learning_rate": 2.8355086302051306e-06, + "loss": 0.8341, + "step": 13106 + }, + { + "epoch": 0.9469178391460618, + "grad_norm": 6.670046839513842, + "learning_rate": 2.8352187715560013e-06, + "loss": 0.7484, + "step": 13107 + }, + { + "epoch": 0.9469900843462713, + "grad_norm": 6.357528080910107, + "learning_rate": 2.8349289083179863e-06, + "loss": 0.89, + "step": 13108 + }, + { + "epoch": 0.9470623295464807, + "grad_norm": 5.467771431700606, + "learning_rate": 2.8346390404950565e-06, + "loss": 0.9268, + "step": 13109 + }, + { + "epoch": 0.9471345747466903, + "grad_norm": 6.315926707600497, + "learning_rate": 2.8343491680911765e-06, + "loss": 0.7485, + "step": 13110 + }, + { + "epoch": 0.9472068199468998, + "grad_norm": 6.33981637249557, + "learning_rate": 2.834059291110317e-06, + "loss": 0.752, + "step": 13111 + }, + { + "epoch": 0.9472790651471092, + "grad_norm": 6.443435747122409, + "learning_rate": 2.833769409556445e-06, + "loss": 0.8099, + "step": 13112 + }, + { + "epoch": 0.9473513103473188, + "grad_norm": 8.20912773303029, + "learning_rate": 2.8334795234335284e-06, + "loss": 0.7528, + "step": 13113 + }, + { + "epoch": 0.9474235555475283, + "grad_norm": 5.84740085082296, + "learning_rate": 2.833189632745536e-06, + "loss": 0.7577, + "step": 13114 + }, + { + "epoch": 0.9474958007477379, + "grad_norm": 6.472193589190378, + "learning_rate": 2.832899737496437e-06, + "loss": 0.8507, + "step": 13115 + }, + { + "epoch": 0.9475680459479473, + "grad_norm": 6.60023418357835, + "learning_rate": 2.832609837690199e-06, + "loss": 0.7546, + "step": 13116 + }, + { + "epoch": 0.9476402911481568, + "grad_norm": 6.320475761275962, + "learning_rate": 2.83231993333079e-06, + "loss": 0.7365, + "step": 13117 + }, + { + "epoch": 0.9477125363483664, + "grad_norm": 6.685645015530038, + "learning_rate": 2.8320300244221792e-06, + "loss": 0.7843, + "step": 13118 + }, + { + "epoch": 0.9477847815485758, + "grad_norm": 9.007963260338563, + "learning_rate": 2.831740110968335e-06, + "loss": 0.7544, + "step": 13119 + }, + { + "epoch": 0.9478570267487854, + "grad_norm": 6.246608271588802, + "learning_rate": 2.831450192973227e-06, + "loss": 0.7867, + "step": 13120 + }, + { + "epoch": 0.9479292719489949, + "grad_norm": 6.297638271823365, + "learning_rate": 2.8311602704408224e-06, + "loss": 0.7696, + "step": 13121 + }, + { + "epoch": 0.9480015171492044, + "grad_norm": 5.305900513695151, + "learning_rate": 2.830870343375091e-06, + "loss": 0.715, + "step": 13122 + }, + { + "epoch": 0.9480737623494139, + "grad_norm": 5.380563030632699, + "learning_rate": 2.8305804117800024e-06, + "loss": 0.7999, + "step": 13123 + }, + { + "epoch": 0.9481460075496234, + "grad_norm": 6.402570756538382, + "learning_rate": 2.830290475659524e-06, + "loss": 0.8131, + "step": 13124 + }, + { + "epoch": 0.948218252749833, + "grad_norm": 6.805886794698826, + "learning_rate": 2.830000535017626e-06, + "loss": 0.7824, + "step": 13125 + }, + { + "epoch": 0.9482904979500425, + "grad_norm": 5.686276146976836, + "learning_rate": 2.8297105898582756e-06, + "loss": 0.7144, + "step": 13126 + }, + { + "epoch": 0.9483627431502519, + "grad_norm": 6.98862404832791, + "learning_rate": 2.829420640185444e-06, + "loss": 0.7512, + "step": 13127 + }, + { + "epoch": 0.9484349883504615, + "grad_norm": 6.102834962046296, + "learning_rate": 2.829130686003099e-06, + "loss": 0.7706, + "step": 13128 + }, + { + "epoch": 0.948507233550671, + "grad_norm": 5.106221477003989, + "learning_rate": 2.828840727315212e-06, + "loss": 0.8326, + "step": 13129 + }, + { + "epoch": 0.9485794787508804, + "grad_norm": 7.338796170335863, + "learning_rate": 2.8285507641257493e-06, + "loss": 0.8174, + "step": 13130 + }, + { + "epoch": 0.94865172395109, + "grad_norm": 8.641112825995389, + "learning_rate": 2.8282607964386828e-06, + "loss": 0.7724, + "step": 13131 + }, + { + "epoch": 0.9487239691512995, + "grad_norm": 7.301240256521303, + "learning_rate": 2.82797082425798e-06, + "loss": 0.8508, + "step": 13132 + }, + { + "epoch": 0.9487962143515091, + "grad_norm": 8.056767754678797, + "learning_rate": 2.8276808475876115e-06, + "loss": 0.7158, + "step": 13133 + }, + { + "epoch": 0.9488684595517185, + "grad_norm": 5.602558382641648, + "learning_rate": 2.827390866431547e-06, + "loss": 0.8275, + "step": 13134 + }, + { + "epoch": 0.948940704751928, + "grad_norm": 5.812698853075306, + "learning_rate": 2.827100880793755e-06, + "loss": 0.7722, + "step": 13135 + }, + { + "epoch": 0.9490129499521376, + "grad_norm": 6.858604161912589, + "learning_rate": 2.8268108906782067e-06, + "loss": 0.8133, + "step": 13136 + }, + { + "epoch": 0.949085195152347, + "grad_norm": 7.026660014420219, + "learning_rate": 2.8265208960888702e-06, + "loss": 0.8719, + "step": 13137 + }, + { + "epoch": 0.9491574403525566, + "grad_norm": 6.6548901812107015, + "learning_rate": 2.8262308970297168e-06, + "loss": 0.8171, + "step": 13138 + }, + { + "epoch": 0.9492296855527661, + "grad_norm": 6.0429678031067455, + "learning_rate": 2.8259408935047155e-06, + "loss": 0.8011, + "step": 13139 + }, + { + "epoch": 0.9493019307529756, + "grad_norm": 9.058405510253094, + "learning_rate": 2.8256508855178373e-06, + "loss": 0.768, + "step": 13140 + }, + { + "epoch": 0.9493741759531851, + "grad_norm": 7.4071580032464945, + "learning_rate": 2.82536087307305e-06, + "loss": 0.8271, + "step": 13141 + }, + { + "epoch": 0.9494464211533946, + "grad_norm": 7.63072115028701, + "learning_rate": 2.8250708561743257e-06, + "loss": 0.8518, + "step": 13142 + }, + { + "epoch": 0.9495186663536042, + "grad_norm": 6.556128615136595, + "learning_rate": 2.824780834825634e-06, + "loss": 0.7915, + "step": 13143 + }, + { + "epoch": 0.9495909115538136, + "grad_norm": 5.930166433742376, + "learning_rate": 2.824490809030944e-06, + "loss": 0.7709, + "step": 13144 + }, + { + "epoch": 0.9496631567540231, + "grad_norm": 6.137886659830547, + "learning_rate": 2.8242007787942277e-06, + "loss": 0.8405, + "step": 13145 + }, + { + "epoch": 0.9497354019542327, + "grad_norm": 6.248090223356972, + "learning_rate": 2.8239107441194543e-06, + "loss": 0.6966, + "step": 13146 + }, + { + "epoch": 0.9498076471544422, + "grad_norm": 5.8391787351288755, + "learning_rate": 2.8236207050105934e-06, + "loss": 0.7839, + "step": 13147 + }, + { + "epoch": 0.9498798923546516, + "grad_norm": 6.182846322978152, + "learning_rate": 2.8233306614716177e-06, + "loss": 0.8597, + "step": 13148 + }, + { + "epoch": 0.9499521375548612, + "grad_norm": 5.47721739157889, + "learning_rate": 2.8230406135064954e-06, + "loss": 0.7181, + "step": 13149 + }, + { + "epoch": 0.9500243827550707, + "grad_norm": 7.0522408028793855, + "learning_rate": 2.822750561119198e-06, + "loss": 0.8089, + "step": 13150 + }, + { + "epoch": 0.9500966279552803, + "grad_norm": 8.03051849477491, + "learning_rate": 2.8224605043136956e-06, + "loss": 0.8621, + "step": 13151 + }, + { + "epoch": 0.9501688731554897, + "grad_norm": 7.416140105453827, + "learning_rate": 2.82217044309396e-06, + "loss": 0.8068, + "step": 13152 + }, + { + "epoch": 0.9502411183556992, + "grad_norm": 6.16559244370602, + "learning_rate": 2.821880377463961e-06, + "loss": 0.7043, + "step": 13153 + }, + { + "epoch": 0.9503133635559088, + "grad_norm": 6.052063443473484, + "learning_rate": 2.82159030742767e-06, + "loss": 0.8991, + "step": 13154 + }, + { + "epoch": 0.9503856087561182, + "grad_norm": 5.912188738317666, + "learning_rate": 2.8213002329890566e-06, + "loss": 0.7246, + "step": 13155 + }, + { + "epoch": 0.9504578539563278, + "grad_norm": 6.614628861238084, + "learning_rate": 2.8210101541520928e-06, + "loss": 0.8151, + "step": 13156 + }, + { + "epoch": 0.9505300991565373, + "grad_norm": 6.026540548192131, + "learning_rate": 2.8207200709207495e-06, + "loss": 0.7241, + "step": 13157 + }, + { + "epoch": 0.9506023443567468, + "grad_norm": 5.870656132330718, + "learning_rate": 2.820429983298997e-06, + "loss": 0.7578, + "step": 13158 + }, + { + "epoch": 0.9506745895569563, + "grad_norm": 5.393127744989849, + "learning_rate": 2.8201398912908075e-06, + "loss": 0.7156, + "step": 13159 + }, + { + "epoch": 0.9507468347571658, + "grad_norm": 6.546750306464674, + "learning_rate": 2.819849794900151e-06, + "loss": 0.809, + "step": 13160 + }, + { + "epoch": 0.9508190799573754, + "grad_norm": 7.0563051855127235, + "learning_rate": 2.819559694130998e-06, + "loss": 0.8104, + "step": 13161 + }, + { + "epoch": 0.9508913251575848, + "grad_norm": 6.648912280620106, + "learning_rate": 2.8192695889873224e-06, + "loss": 0.7676, + "step": 13162 + }, + { + "epoch": 0.9509635703577943, + "grad_norm": 10.400047859668648, + "learning_rate": 2.818979479473094e-06, + "loss": 0.8878, + "step": 13163 + }, + { + "epoch": 0.9510358155580039, + "grad_norm": 4.795144828383278, + "learning_rate": 2.8186893655922836e-06, + "loss": 0.7755, + "step": 13164 + }, + { + "epoch": 0.9511080607582134, + "grad_norm": 6.660678017501824, + "learning_rate": 2.818399247348863e-06, + "loss": 0.8107, + "step": 13165 + }, + { + "epoch": 0.9511803059584228, + "grad_norm": 6.806750189666964, + "learning_rate": 2.818109124746805e-06, + "loss": 0.7661, + "step": 13166 + }, + { + "epoch": 0.9512525511586324, + "grad_norm": 6.3881780514587, + "learning_rate": 2.8178189977900794e-06, + "loss": 0.8776, + "step": 13167 + }, + { + "epoch": 0.9513247963588419, + "grad_norm": 7.0568127984635245, + "learning_rate": 2.817528866482659e-06, + "loss": 0.8104, + "step": 13168 + }, + { + "epoch": 0.9513970415590515, + "grad_norm": 6.097640609965528, + "learning_rate": 2.8172387308285143e-06, + "loss": 0.7895, + "step": 13169 + }, + { + "epoch": 0.9514692867592609, + "grad_norm": 7.00645476151779, + "learning_rate": 2.816948590831618e-06, + "loss": 0.7774, + "step": 13170 + }, + { + "epoch": 0.9515415319594704, + "grad_norm": 6.722155466440078, + "learning_rate": 2.816658446495941e-06, + "loss": 0.7613, + "step": 13171 + }, + { + "epoch": 0.95161377715968, + "grad_norm": 7.255060205821031, + "learning_rate": 2.816368297825457e-06, + "loss": 0.7601, + "step": 13172 + }, + { + "epoch": 0.9516860223598894, + "grad_norm": 7.5814415664559425, + "learning_rate": 2.816078144824136e-06, + "loss": 0.7553, + "step": 13173 + }, + { + "epoch": 0.951758267560099, + "grad_norm": 7.2981300989975875, + "learning_rate": 2.8157879874959504e-06, + "loss": 0.8868, + "step": 13174 + }, + { + "epoch": 0.9518305127603085, + "grad_norm": 6.345290424502968, + "learning_rate": 2.815497825844872e-06, + "loss": 0.8529, + "step": 13175 + }, + { + "epoch": 0.951902757960518, + "grad_norm": 6.203275049350192, + "learning_rate": 2.815207659874874e-06, + "loss": 0.7118, + "step": 13176 + }, + { + "epoch": 0.9519750031607275, + "grad_norm": 7.034606132624776, + "learning_rate": 2.8149174895899285e-06, + "loss": 0.7268, + "step": 13177 + }, + { + "epoch": 0.952047248360937, + "grad_norm": 5.703828951096723, + "learning_rate": 2.8146273149940058e-06, + "loss": 0.7021, + "step": 13178 + }, + { + "epoch": 0.9521194935611466, + "grad_norm": 5.90114312170297, + "learning_rate": 2.8143371360910797e-06, + "loss": 0.7332, + "step": 13179 + }, + { + "epoch": 0.952191738761356, + "grad_norm": 6.553775473833825, + "learning_rate": 2.814046952885123e-06, + "loss": 0.7977, + "step": 13180 + }, + { + "epoch": 0.9522639839615655, + "grad_norm": 8.138543681120826, + "learning_rate": 2.8137567653801067e-06, + "loss": 0.732, + "step": 13181 + }, + { + "epoch": 0.9523362291617751, + "grad_norm": 6.415219758953239, + "learning_rate": 2.813466573580005e-06, + "loss": 0.7419, + "step": 13182 + }, + { + "epoch": 0.9524084743619846, + "grad_norm": 7.29731935377482, + "learning_rate": 2.8131763774887878e-06, + "loss": 0.8813, + "step": 13183 + }, + { + "epoch": 0.952480719562194, + "grad_norm": 5.659263187611984, + "learning_rate": 2.8128861771104297e-06, + "loss": 0.6738, + "step": 13184 + }, + { + "epoch": 0.9525529647624036, + "grad_norm": 7.466567738195542, + "learning_rate": 2.8125959724489027e-06, + "loss": 0.8873, + "step": 13185 + }, + { + "epoch": 0.9526252099626131, + "grad_norm": 7.179378531929928, + "learning_rate": 2.8123057635081804e-06, + "loss": 0.7885, + "step": 13186 + }, + { + "epoch": 0.9526974551628227, + "grad_norm": 7.436423343979208, + "learning_rate": 2.8120155502922338e-06, + "loss": 0.8655, + "step": 13187 + }, + { + "epoch": 0.9527697003630321, + "grad_norm": 6.132316734539634, + "learning_rate": 2.811725332805037e-06, + "loss": 0.8082, + "step": 13188 + }, + { + "epoch": 0.9528419455632416, + "grad_norm": 6.6964775505091225, + "learning_rate": 2.8114351110505622e-06, + "loss": 0.8824, + "step": 13189 + }, + { + "epoch": 0.9529141907634512, + "grad_norm": 6.734038519363654, + "learning_rate": 2.811144885032782e-06, + "loss": 0.7509, + "step": 13190 + }, + { + "epoch": 0.9529864359636606, + "grad_norm": 6.028415152061566, + "learning_rate": 2.810854654755671e-06, + "loss": 0.6967, + "step": 13191 + }, + { + "epoch": 0.9530586811638702, + "grad_norm": 5.654868278554571, + "learning_rate": 2.8105644202232003e-06, + "loss": 0.8312, + "step": 13192 + }, + { + "epoch": 0.9531309263640797, + "grad_norm": 6.8533066168436285, + "learning_rate": 2.810274181439344e-06, + "loss": 0.7322, + "step": 13193 + }, + { + "epoch": 0.9532031715642892, + "grad_norm": 6.107410319884336, + "learning_rate": 2.8099839384080752e-06, + "loss": 0.8294, + "step": 13194 + }, + { + "epoch": 0.9532754167644987, + "grad_norm": 5.474262010140185, + "learning_rate": 2.809693691133367e-06, + "loss": 0.7682, + "step": 13195 + }, + { + "epoch": 0.9533476619647082, + "grad_norm": 5.8402487317770495, + "learning_rate": 2.8094034396191926e-06, + "loss": 0.7949, + "step": 13196 + }, + { + "epoch": 0.9534199071649178, + "grad_norm": 7.282395170474156, + "learning_rate": 2.809113183869526e-06, + "loss": 0.8328, + "step": 13197 + }, + { + "epoch": 0.9534921523651272, + "grad_norm": 5.94463489896825, + "learning_rate": 2.8088229238883393e-06, + "loss": 0.7906, + "step": 13198 + }, + { + "epoch": 0.9535643975653367, + "grad_norm": 5.467702187566004, + "learning_rate": 2.8085326596796057e-06, + "loss": 0.8073, + "step": 13199 + }, + { + "epoch": 0.9536366427655463, + "grad_norm": 6.069506813890828, + "learning_rate": 2.8082423912473012e-06, + "loss": 0.8158, + "step": 13200 + }, + { + "epoch": 0.9537088879657558, + "grad_norm": 5.930602554663139, + "learning_rate": 2.8079521185953966e-06, + "loss": 0.7408, + "step": 13201 + }, + { + "epoch": 0.9537811331659652, + "grad_norm": 6.492184268294172, + "learning_rate": 2.8076618417278675e-06, + "loss": 0.7065, + "step": 13202 + }, + { + "epoch": 0.9538533783661748, + "grad_norm": 6.316815402545335, + "learning_rate": 2.807371560648685e-06, + "loss": 0.815, + "step": 13203 + }, + { + "epoch": 0.9539256235663843, + "grad_norm": 6.701675080385249, + "learning_rate": 2.807081275361826e-06, + "loss": 0.8522, + "step": 13204 + }, + { + "epoch": 0.9539978687665939, + "grad_norm": 6.650562417476435, + "learning_rate": 2.806790985871262e-06, + "loss": 0.7508, + "step": 13205 + }, + { + "epoch": 0.9540701139668033, + "grad_norm": 7.5144280572988595, + "learning_rate": 2.8065006921809683e-06, + "loss": 0.8043, + "step": 13206 + }, + { + "epoch": 0.9541423591670128, + "grad_norm": 5.573971866521611, + "learning_rate": 2.8062103942949175e-06, + "loss": 0.7893, + "step": 13207 + }, + { + "epoch": 0.9542146043672224, + "grad_norm": 6.987479593045866, + "learning_rate": 2.805920092217084e-06, + "loss": 0.8465, + "step": 13208 + }, + { + "epoch": 0.9542868495674318, + "grad_norm": 5.340176642421636, + "learning_rate": 2.805629785951443e-06, + "loss": 0.7367, + "step": 13209 + }, + { + "epoch": 0.9543590947676414, + "grad_norm": 5.503259040027601, + "learning_rate": 2.8053394755019668e-06, + "loss": 0.7808, + "step": 13210 + }, + { + "epoch": 0.9544313399678509, + "grad_norm": 5.399612674244362, + "learning_rate": 2.805049160872631e-06, + "loss": 0.7964, + "step": 13211 + }, + { + "epoch": 0.9545035851680604, + "grad_norm": 7.340203390603675, + "learning_rate": 2.8047588420674084e-06, + "loss": 0.7893, + "step": 13212 + }, + { + "epoch": 0.9545758303682699, + "grad_norm": 6.190172947453469, + "learning_rate": 2.8044685190902736e-06, + "loss": 0.7806, + "step": 13213 + }, + { + "epoch": 0.9546480755684794, + "grad_norm": 6.829350062621774, + "learning_rate": 2.8041781919452015e-06, + "loss": 0.7326, + "step": 13214 + }, + { + "epoch": 0.954720320768689, + "grad_norm": 6.53004407929152, + "learning_rate": 2.803887860636166e-06, + "loss": 0.805, + "step": 13215 + }, + { + "epoch": 0.9547925659688984, + "grad_norm": 7.138871994031548, + "learning_rate": 2.803597525167142e-06, + "loss": 0.7302, + "step": 13216 + }, + { + "epoch": 0.9548648111691079, + "grad_norm": 5.141861508890659, + "learning_rate": 2.803307185542103e-06, + "loss": 0.7951, + "step": 13217 + }, + { + "epoch": 0.9549370563693175, + "grad_norm": 6.313341141359898, + "learning_rate": 2.803016841765025e-06, + "loss": 0.8347, + "step": 13218 + }, + { + "epoch": 0.955009301569527, + "grad_norm": 7.954887749379284, + "learning_rate": 2.8027264938398807e-06, + "loss": 0.8378, + "step": 13219 + }, + { + "epoch": 0.9550815467697364, + "grad_norm": 6.038263699899292, + "learning_rate": 2.802436141770647e-06, + "loss": 0.785, + "step": 13220 + }, + { + "epoch": 0.955153791969946, + "grad_norm": 5.234670511131281, + "learning_rate": 2.802145785561296e-06, + "loss": 0.7168, + "step": 13221 + }, + { + "epoch": 0.9552260371701555, + "grad_norm": 7.235711849601183, + "learning_rate": 2.8018554252158043e-06, + "loss": 0.8132, + "step": 13222 + }, + { + "epoch": 0.9552982823703651, + "grad_norm": 6.501560904079352, + "learning_rate": 2.8015650607381467e-06, + "loss": 0.7859, + "step": 13223 + }, + { + "epoch": 0.9553705275705745, + "grad_norm": 7.855707909477872, + "learning_rate": 2.801274692132297e-06, + "loss": 0.7903, + "step": 13224 + }, + { + "epoch": 0.955442772770784, + "grad_norm": 6.703441692420516, + "learning_rate": 2.8009843194022313e-06, + "loss": 0.8027, + "step": 13225 + }, + { + "epoch": 0.9555150179709936, + "grad_norm": 6.730574453700564, + "learning_rate": 2.8006939425519233e-06, + "loss": 0.7839, + "step": 13226 + }, + { + "epoch": 0.955587263171203, + "grad_norm": 6.358105581995071, + "learning_rate": 2.8004035615853485e-06, + "loss": 0.8064, + "step": 13227 + }, + { + "epoch": 0.9556595083714126, + "grad_norm": 6.080309634606246, + "learning_rate": 2.8001131765064825e-06, + "loss": 0.8816, + "step": 13228 + }, + { + "epoch": 0.9557317535716221, + "grad_norm": 5.372214726772994, + "learning_rate": 2.7998227873193005e-06, + "loss": 0.7823, + "step": 13229 + }, + { + "epoch": 0.9558039987718316, + "grad_norm": 7.646367635603119, + "learning_rate": 2.7995323940277776e-06, + "loss": 0.8352, + "step": 13230 + }, + { + "epoch": 0.9558762439720411, + "grad_norm": 6.258177938280258, + "learning_rate": 2.799241996635888e-06, + "loss": 0.7525, + "step": 13231 + }, + { + "epoch": 0.9559484891722506, + "grad_norm": 5.6121912610068865, + "learning_rate": 2.7989515951476083e-06, + "loss": 0.8401, + "step": 13232 + }, + { + "epoch": 0.9560207343724602, + "grad_norm": 6.242459745999162, + "learning_rate": 2.798661189566913e-06, + "loss": 0.8227, + "step": 13233 + }, + { + "epoch": 0.9560929795726696, + "grad_norm": 5.465660225093305, + "learning_rate": 2.7983707798977784e-06, + "loss": 0.7548, + "step": 13234 + }, + { + "epoch": 0.9561652247728791, + "grad_norm": 7.630925861944804, + "learning_rate": 2.798080366144179e-06, + "loss": 0.7763, + "step": 13235 + }, + { + "epoch": 0.9562374699730887, + "grad_norm": 7.6857109158954735, + "learning_rate": 2.797789948310091e-06, + "loss": 0.8011, + "step": 13236 + }, + { + "epoch": 0.9563097151732982, + "grad_norm": 6.50328391823634, + "learning_rate": 2.7974995263994896e-06, + "loss": 0.8059, + "step": 13237 + }, + { + "epoch": 0.9563819603735076, + "grad_norm": 6.65113799031129, + "learning_rate": 2.7972091004163515e-06, + "loss": 0.7637, + "step": 13238 + }, + { + "epoch": 0.9564542055737172, + "grad_norm": 6.95655556390666, + "learning_rate": 2.796918670364651e-06, + "loss": 0.8758, + "step": 13239 + }, + { + "epoch": 0.9565264507739267, + "grad_norm": 7.567630005177557, + "learning_rate": 2.7966282362483644e-06, + "loss": 0.8196, + "step": 13240 + }, + { + "epoch": 0.9565986959741363, + "grad_norm": 6.156330553486311, + "learning_rate": 2.7963377980714675e-06, + "loss": 0.8717, + "step": 13241 + }, + { + "epoch": 0.9566709411743457, + "grad_norm": 5.905436464825263, + "learning_rate": 2.7960473558379366e-06, + "loss": 0.736, + "step": 13242 + }, + { + "epoch": 0.9567431863745552, + "grad_norm": 6.3257328739698035, + "learning_rate": 2.7957569095517465e-06, + "loss": 0.7803, + "step": 13243 + }, + { + "epoch": 0.9568154315747648, + "grad_norm": 4.912920068539027, + "learning_rate": 2.7954664592168745e-06, + "loss": 0.8124, + "step": 13244 + }, + { + "epoch": 0.9568876767749742, + "grad_norm": 6.119528370056293, + "learning_rate": 2.795176004837296e-06, + "loss": 0.7546, + "step": 13245 + }, + { + "epoch": 0.9569599219751838, + "grad_norm": 5.749323597804823, + "learning_rate": 2.794885546416987e-06, + "loss": 0.757, + "step": 13246 + }, + { + "epoch": 0.9570321671753933, + "grad_norm": 5.860039675841914, + "learning_rate": 2.7945950839599246e-06, + "loss": 0.7191, + "step": 13247 + }, + { + "epoch": 0.9571044123756028, + "grad_norm": 7.8927598539149955, + "learning_rate": 2.794304617470084e-06, + "loss": 0.8169, + "step": 13248 + }, + { + "epoch": 0.9571766575758123, + "grad_norm": 5.802773818655912, + "learning_rate": 2.7940141469514413e-06, + "loss": 0.7726, + "step": 13249 + }, + { + "epoch": 0.9572489027760218, + "grad_norm": 6.225585248161726, + "learning_rate": 2.793723672407973e-06, + "loss": 0.8912, + "step": 13250 + }, + { + "epoch": 0.9573211479762314, + "grad_norm": 6.279381924968344, + "learning_rate": 2.793433193843656e-06, + "loss": 0.8217, + "step": 13251 + }, + { + "epoch": 0.9573933931764408, + "grad_norm": 6.392802793084543, + "learning_rate": 2.793142711262467e-06, + "loss": 0.7656, + "step": 13252 + }, + { + "epoch": 0.9574656383766503, + "grad_norm": 6.2893369626640006, + "learning_rate": 2.792852224668381e-06, + "loss": 0.8165, + "step": 13253 + }, + { + "epoch": 0.9575378835768599, + "grad_norm": 6.504725865680005, + "learning_rate": 2.792561734065376e-06, + "loss": 0.8168, + "step": 13254 + }, + { + "epoch": 0.9576101287770694, + "grad_norm": 6.230013514422948, + "learning_rate": 2.7922712394574273e-06, + "loss": 0.7993, + "step": 13255 + }, + { + "epoch": 0.9576823739772788, + "grad_norm": 7.793484671205333, + "learning_rate": 2.791980740848513e-06, + "loss": 0.822, + "step": 13256 + }, + { + "epoch": 0.9577546191774884, + "grad_norm": 5.949821777239063, + "learning_rate": 2.791690238242609e-06, + "loss": 0.7396, + "step": 13257 + }, + { + "epoch": 0.9578268643776979, + "grad_norm": 8.089588170455782, + "learning_rate": 2.7913997316436917e-06, + "loss": 0.8474, + "step": 13258 + }, + { + "epoch": 0.9578991095779075, + "grad_norm": 6.712139053159232, + "learning_rate": 2.7911092210557387e-06, + "loss": 0.872, + "step": 13259 + }, + { + "epoch": 0.9579713547781169, + "grad_norm": 5.439012514414543, + "learning_rate": 2.7908187064827257e-06, + "loss": 0.7815, + "step": 13260 + }, + { + "epoch": 0.9580435999783264, + "grad_norm": 6.982759497956294, + "learning_rate": 2.7905281879286315e-06, + "loss": 0.8903, + "step": 13261 + }, + { + "epoch": 0.958115845178536, + "grad_norm": 5.523353068827377, + "learning_rate": 2.790237665397432e-06, + "loss": 0.7885, + "step": 13262 + }, + { + "epoch": 0.9581880903787454, + "grad_norm": 5.945779909866515, + "learning_rate": 2.7899471388931038e-06, + "loss": 0.8589, + "step": 13263 + }, + { + "epoch": 0.958260335578955, + "grad_norm": 7.9073808788549735, + "learning_rate": 2.7896566084196242e-06, + "loss": 0.798, + "step": 13264 + }, + { + "epoch": 0.9583325807791645, + "grad_norm": 8.596265168159286, + "learning_rate": 2.7893660739809704e-06, + "loss": 0.8045, + "step": 13265 + }, + { + "epoch": 0.958404825979374, + "grad_norm": 6.045060075156893, + "learning_rate": 2.7890755355811205e-06, + "loss": 0.8549, + "step": 13266 + }, + { + "epoch": 0.9584770711795835, + "grad_norm": 7.131292408876419, + "learning_rate": 2.7887849932240503e-06, + "loss": 0.7951, + "step": 13267 + }, + { + "epoch": 0.958549316379793, + "grad_norm": 5.812190878246841, + "learning_rate": 2.788494446913738e-06, + "loss": 0.72, + "step": 13268 + }, + { + "epoch": 0.9586215615800026, + "grad_norm": 8.079143998532484, + "learning_rate": 2.788203896654161e-06, + "loss": 0.861, + "step": 13269 + }, + { + "epoch": 0.958693806780212, + "grad_norm": 8.083763766544688, + "learning_rate": 2.7879133424492954e-06, + "loss": 0.9154, + "step": 13270 + }, + { + "epoch": 0.9587660519804215, + "grad_norm": 9.174296538698783, + "learning_rate": 2.7876227843031208e-06, + "loss": 0.8401, + "step": 13271 + }, + { + "epoch": 0.9588382971806311, + "grad_norm": 6.997488933852953, + "learning_rate": 2.787332222219613e-06, + "loss": 0.8061, + "step": 13272 + }, + { + "epoch": 0.9589105423808406, + "grad_norm": 6.112134539516485, + "learning_rate": 2.7870416562027514e-06, + "loss": 0.7419, + "step": 13273 + }, + { + "epoch": 0.95898278758105, + "grad_norm": 6.558214804884173, + "learning_rate": 2.7867510862565108e-06, + "loss": 0.8237, + "step": 13274 + }, + { + "epoch": 0.9590550327812596, + "grad_norm": 7.11346921300604, + "learning_rate": 2.786460512384871e-06, + "loss": 0.743, + "step": 13275 + }, + { + "epoch": 0.9591272779814691, + "grad_norm": 6.928046581631496, + "learning_rate": 2.7861699345918093e-06, + "loss": 0.7771, + "step": 13276 + }, + { + "epoch": 0.9591995231816787, + "grad_norm": 5.890263958701158, + "learning_rate": 2.7858793528813034e-06, + "loss": 0.8166, + "step": 13277 + }, + { + "epoch": 0.9592717683818881, + "grad_norm": 8.255719138787072, + "learning_rate": 2.7855887672573312e-06, + "loss": 0.8115, + "step": 13278 + }, + { + "epoch": 0.9593440135820976, + "grad_norm": 6.282585647308876, + "learning_rate": 2.7852981777238704e-06, + "loss": 0.7487, + "step": 13279 + }, + { + "epoch": 0.9594162587823072, + "grad_norm": 5.78357414748299, + "learning_rate": 2.785007584284899e-06, + "loss": 0.7512, + "step": 13280 + }, + { + "epoch": 0.9594885039825166, + "grad_norm": 7.293251722717239, + "learning_rate": 2.7847169869443954e-06, + "loss": 0.8847, + "step": 13281 + }, + { + "epoch": 0.9595607491827262, + "grad_norm": 6.2232324748898336, + "learning_rate": 2.784426385706337e-06, + "loss": 0.8926, + "step": 13282 + }, + { + "epoch": 0.9596329943829357, + "grad_norm": 6.483662540867442, + "learning_rate": 2.784135780574702e-06, + "loss": 0.8044, + "step": 13283 + }, + { + "epoch": 0.9597052395831452, + "grad_norm": 7.037144068225611, + "learning_rate": 2.783845171553469e-06, + "loss": 0.7872, + "step": 13284 + }, + { + "epoch": 0.9597774847833547, + "grad_norm": 5.665990377722964, + "learning_rate": 2.783554558646616e-06, + "loss": 0.7509, + "step": 13285 + }, + { + "epoch": 0.9598497299835642, + "grad_norm": 5.800193967535495, + "learning_rate": 2.783263941858122e-06, + "loss": 0.7257, + "step": 13286 + }, + { + "epoch": 0.9599219751837738, + "grad_norm": 7.732109832441434, + "learning_rate": 2.7829733211919635e-06, + "loss": 0.8612, + "step": 13287 + }, + { + "epoch": 0.9599942203839832, + "grad_norm": 8.294206035526585, + "learning_rate": 2.7826826966521205e-06, + "loss": 0.7651, + "step": 13288 + }, + { + "epoch": 0.9600664655841927, + "grad_norm": 7.064741740976752, + "learning_rate": 2.782392068242571e-06, + "loss": 0.7422, + "step": 13289 + }, + { + "epoch": 0.9601387107844023, + "grad_norm": 6.463369297021309, + "learning_rate": 2.782101435967293e-06, + "loss": 0.7906, + "step": 13290 + }, + { + "epoch": 0.9602109559846118, + "grad_norm": 5.769548440648053, + "learning_rate": 2.7818107998302656e-06, + "loss": 0.8109, + "step": 13291 + }, + { + "epoch": 0.9602832011848212, + "grad_norm": 6.579689111756957, + "learning_rate": 2.781520159835467e-06, + "loss": 0.7774, + "step": 13292 + }, + { + "epoch": 0.9603554463850308, + "grad_norm": 9.394149956529422, + "learning_rate": 2.7812295159868757e-06, + "loss": 0.8098, + "step": 13293 + }, + { + "epoch": 0.9604276915852403, + "grad_norm": 5.8523923237121185, + "learning_rate": 2.780938868288471e-06, + "loss": 0.679, + "step": 13294 + }, + { + "epoch": 0.9604999367854499, + "grad_norm": 6.983491231488261, + "learning_rate": 2.780648216744231e-06, + "loss": 0.7427, + "step": 13295 + }, + { + "epoch": 0.9605721819856593, + "grad_norm": 6.922542359018278, + "learning_rate": 2.780357561358135e-06, + "loss": 0.7494, + "step": 13296 + }, + { + "epoch": 0.9606444271858688, + "grad_norm": 5.907618449892623, + "learning_rate": 2.7800669021341627e-06, + "loss": 0.7979, + "step": 13297 + }, + { + "epoch": 0.9607166723860784, + "grad_norm": 6.371234754245143, + "learning_rate": 2.7797762390762905e-06, + "loss": 0.7844, + "step": 13298 + }, + { + "epoch": 0.9607889175862878, + "grad_norm": 6.886979087889787, + "learning_rate": 2.7794855721884996e-06, + "loss": 0.8873, + "step": 13299 + }, + { + "epoch": 0.9608611627864974, + "grad_norm": 5.437925826312866, + "learning_rate": 2.7791949014747684e-06, + "loss": 0.8919, + "step": 13300 + }, + { + "epoch": 0.9609334079867069, + "grad_norm": 6.548263074899763, + "learning_rate": 2.7789042269390754e-06, + "loss": 0.7616, + "step": 13301 + }, + { + "epoch": 0.9610056531869164, + "grad_norm": 5.7899434478580645, + "learning_rate": 2.7786135485853993e-06, + "loss": 0.8549, + "step": 13302 + }, + { + "epoch": 0.9610778983871259, + "grad_norm": 5.592501708820408, + "learning_rate": 2.778322866417721e-06, + "loss": 0.809, + "step": 13303 + }, + { + "epoch": 0.9611501435873354, + "grad_norm": 5.8702305040478855, + "learning_rate": 2.778032180440019e-06, + "loss": 0.793, + "step": 13304 + }, + { + "epoch": 0.961222388787545, + "grad_norm": 7.595964273763471, + "learning_rate": 2.7777414906562723e-06, + "loss": 0.788, + "step": 13305 + }, + { + "epoch": 0.9612946339877544, + "grad_norm": 6.760986711619633, + "learning_rate": 2.7774507970704594e-06, + "loss": 0.8015, + "step": 13306 + }, + { + "epoch": 0.9613668791879639, + "grad_norm": 6.717755195247982, + "learning_rate": 2.777160099686561e-06, + "loss": 0.7877, + "step": 13307 + }, + { + "epoch": 0.9614391243881735, + "grad_norm": 5.762536356184459, + "learning_rate": 2.776869398508556e-06, + "loss": 0.7639, + "step": 13308 + }, + { + "epoch": 0.961511369588383, + "grad_norm": 6.1812182048600866, + "learning_rate": 2.7765786935404243e-06, + "loss": 0.8387, + "step": 13309 + }, + { + "epoch": 0.9615836147885924, + "grad_norm": 6.520423786923974, + "learning_rate": 2.7762879847861444e-06, + "loss": 0.7908, + "step": 13310 + }, + { + "epoch": 0.961655859988802, + "grad_norm": 9.291641121274676, + "learning_rate": 2.775997272249697e-06, + "loss": 0.8055, + "step": 13311 + }, + { + "epoch": 0.9617281051890115, + "grad_norm": 4.82501950432981, + "learning_rate": 2.7757065559350605e-06, + "loss": 0.6834, + "step": 13312 + }, + { + "epoch": 0.9618003503892211, + "grad_norm": 6.851740105131084, + "learning_rate": 2.7754158358462165e-06, + "loss": 0.8155, + "step": 13313 + }, + { + "epoch": 0.9618725955894305, + "grad_norm": 6.071317572340372, + "learning_rate": 2.7751251119871433e-06, + "loss": 0.8239, + "step": 13314 + }, + { + "epoch": 0.96194484078964, + "grad_norm": 6.783059718106888, + "learning_rate": 2.77483438436182e-06, + "loss": 0.8092, + "step": 13315 + }, + { + "epoch": 0.9620170859898496, + "grad_norm": 6.482108214014431, + "learning_rate": 2.7745436529742275e-06, + "loss": 0.7991, + "step": 13316 + }, + { + "epoch": 0.962089331190059, + "grad_norm": 5.985885867635744, + "learning_rate": 2.774252917828346e-06, + "loss": 0.7647, + "step": 13317 + }, + { + "epoch": 0.9621615763902686, + "grad_norm": 5.362644455037781, + "learning_rate": 2.7739621789281557e-06, + "loss": 0.7789, + "step": 13318 + }, + { + "epoch": 0.9622338215904781, + "grad_norm": 7.853543554492118, + "learning_rate": 2.7736714362776345e-06, + "loss": 0.808, + "step": 13319 + }, + { + "epoch": 0.9623060667906876, + "grad_norm": 6.968823898676284, + "learning_rate": 2.7733806898807653e-06, + "loss": 0.7064, + "step": 13320 + }, + { + "epoch": 0.9623783119908971, + "grad_norm": 5.803554090536859, + "learning_rate": 2.7730899397415255e-06, + "loss": 0.8376, + "step": 13321 + }, + { + "epoch": 0.9624505571911066, + "grad_norm": 6.554731730333486, + "learning_rate": 2.772799185863897e-06, + "loss": 0.7503, + "step": 13322 + }, + { + "epoch": 0.9625228023913162, + "grad_norm": 6.599022370960824, + "learning_rate": 2.7725084282518595e-06, + "loss": 0.7478, + "step": 13323 + }, + { + "epoch": 0.9625950475915256, + "grad_norm": 6.7114561722040635, + "learning_rate": 2.7722176669093926e-06, + "loss": 0.8939, + "step": 13324 + }, + { + "epoch": 0.9626672927917351, + "grad_norm": 6.613646080918042, + "learning_rate": 2.7719269018404783e-06, + "loss": 0.8027, + "step": 13325 + }, + { + "epoch": 0.9627395379919447, + "grad_norm": 5.500108371013749, + "learning_rate": 2.7716361330490944e-06, + "loss": 0.7293, + "step": 13326 + }, + { + "epoch": 0.9628117831921542, + "grad_norm": 7.029229310334499, + "learning_rate": 2.7713453605392236e-06, + "loss": 0.8108, + "step": 13327 + }, + { + "epoch": 0.9628840283923636, + "grad_norm": 5.7384055626000245, + "learning_rate": 2.7710545843148456e-06, + "loss": 0.7888, + "step": 13328 + }, + { + "epoch": 0.9629562735925732, + "grad_norm": 6.493154295398918, + "learning_rate": 2.770763804379941e-06, + "loss": 0.7663, + "step": 13329 + }, + { + "epoch": 0.9630285187927827, + "grad_norm": 5.427981936206151, + "learning_rate": 2.7704730207384894e-06, + "loss": 0.7683, + "step": 13330 + }, + { + "epoch": 0.9631007639929923, + "grad_norm": 5.9915645429726805, + "learning_rate": 2.770182233394473e-06, + "loss": 0.7735, + "step": 13331 + }, + { + "epoch": 0.9631730091932017, + "grad_norm": 8.518408196267785, + "learning_rate": 2.7698914423518715e-06, + "loss": 0.8597, + "step": 13332 + }, + { + "epoch": 0.9632452543934112, + "grad_norm": 9.639958239579663, + "learning_rate": 2.7696006476146647e-06, + "loss": 0.838, + "step": 13333 + }, + { + "epoch": 0.9633174995936208, + "grad_norm": 8.102215084915146, + "learning_rate": 2.7693098491868355e-06, + "loss": 0.8002, + "step": 13334 + }, + { + "epoch": 0.9633897447938302, + "grad_norm": 5.994561273526241, + "learning_rate": 2.769019047072363e-06, + "loss": 0.7696, + "step": 13335 + }, + { + "epoch": 0.9634619899940398, + "grad_norm": 5.908843586154127, + "learning_rate": 2.7687282412752287e-06, + "loss": 0.8168, + "step": 13336 + }, + { + "epoch": 0.9635342351942493, + "grad_norm": 5.953922728908766, + "learning_rate": 2.7684374317994136e-06, + "loss": 0.8587, + "step": 13337 + }, + { + "epoch": 0.9636064803944588, + "grad_norm": 6.063915598661917, + "learning_rate": 2.768146618648898e-06, + "loss": 0.7174, + "step": 13338 + }, + { + "epoch": 0.9636787255946683, + "grad_norm": 6.429286717175583, + "learning_rate": 2.7678558018276646e-06, + "loss": 0.878, + "step": 13339 + }, + { + "epoch": 0.9637509707948778, + "grad_norm": 5.848388462946471, + "learning_rate": 2.7675649813396923e-06, + "loss": 0.8713, + "step": 13340 + }, + { + "epoch": 0.9638232159950874, + "grad_norm": 6.329005352054767, + "learning_rate": 2.767274157188963e-06, + "loss": 0.7295, + "step": 13341 + }, + { + "epoch": 0.9638954611952968, + "grad_norm": 6.61560082807346, + "learning_rate": 2.766983329379458e-06, + "loss": 0.7994, + "step": 13342 + }, + { + "epoch": 0.9639677063955063, + "grad_norm": 6.523261190076255, + "learning_rate": 2.766692497915159e-06, + "loss": 0.7376, + "step": 13343 + }, + { + "epoch": 0.9640399515957159, + "grad_norm": 8.06540314207779, + "learning_rate": 2.766401662800046e-06, + "loss": 0.7172, + "step": 13344 + }, + { + "epoch": 0.9641121967959254, + "grad_norm": 7.068276262809242, + "learning_rate": 2.7661108240381016e-06, + "loss": 0.7964, + "step": 13345 + }, + { + "epoch": 0.9641844419961348, + "grad_norm": 5.869477943717577, + "learning_rate": 2.7658199816333077e-06, + "loss": 0.7786, + "step": 13346 + }, + { + "epoch": 0.9642566871963444, + "grad_norm": 6.894481730080661, + "learning_rate": 2.765529135589643e-06, + "loss": 0.7619, + "step": 13347 + }, + { + "epoch": 0.9643289323965539, + "grad_norm": 7.938179527691925, + "learning_rate": 2.7652382859110916e-06, + "loss": 0.7764, + "step": 13348 + }, + { + "epoch": 0.9644011775967635, + "grad_norm": 5.720866827740547, + "learning_rate": 2.7649474326016336e-06, + "loss": 0.8587, + "step": 13349 + }, + { + "epoch": 0.9644734227969729, + "grad_norm": 8.911192262216078, + "learning_rate": 2.764656575665251e-06, + "loss": 0.7988, + "step": 13350 + }, + { + "epoch": 0.9645456679971824, + "grad_norm": 5.75435506672005, + "learning_rate": 2.764365715105925e-06, + "loss": 0.7623, + "step": 13351 + }, + { + "epoch": 0.964617913197392, + "grad_norm": 6.008276317535781, + "learning_rate": 2.7640748509276383e-06, + "loss": 0.7739, + "step": 13352 + }, + { + "epoch": 0.9646901583976014, + "grad_norm": 7.496858829100812, + "learning_rate": 2.7637839831343718e-06, + "loss": 0.8689, + "step": 13353 + }, + { + "epoch": 0.964762403597811, + "grad_norm": 6.996280772032615, + "learning_rate": 2.7634931117301072e-06, + "loss": 0.7963, + "step": 13354 + }, + { + "epoch": 0.9648346487980205, + "grad_norm": 6.207440237371368, + "learning_rate": 2.7632022367188266e-06, + "loss": 0.8179, + "step": 13355 + }, + { + "epoch": 0.96490689399823, + "grad_norm": 5.897411373389381, + "learning_rate": 2.7629113581045113e-06, + "loss": 0.8234, + "step": 13356 + }, + { + "epoch": 0.9649791391984395, + "grad_norm": 7.513783060653825, + "learning_rate": 2.7626204758911446e-06, + "loss": 0.7954, + "step": 13357 + }, + { + "epoch": 0.965051384398649, + "grad_norm": 6.4049991037694225, + "learning_rate": 2.762329590082707e-06, + "loss": 0.7353, + "step": 13358 + }, + { + "epoch": 0.9651236295988586, + "grad_norm": 7.28950759166592, + "learning_rate": 2.7620387006831805e-06, + "loss": 0.8024, + "step": 13359 + }, + { + "epoch": 0.965195874799068, + "grad_norm": 6.142515711626477, + "learning_rate": 2.7617478076965474e-06, + "loss": 0.8376, + "step": 13360 + }, + { + "epoch": 0.9652681199992775, + "grad_norm": 6.375219303921686, + "learning_rate": 2.7614569111267913e-06, + "loss": 0.8013, + "step": 13361 + }, + { + "epoch": 0.9653403651994871, + "grad_norm": 7.010186005000654, + "learning_rate": 2.7611660109778924e-06, + "loss": 0.8312, + "step": 13362 + }, + { + "epoch": 0.9654126103996966, + "grad_norm": 6.358050084169102, + "learning_rate": 2.7608751072538343e-06, + "loss": 0.8133, + "step": 13363 + }, + { + "epoch": 0.965484855599906, + "grad_norm": 6.781080393274277, + "learning_rate": 2.7605841999585976e-06, + "loss": 0.852, + "step": 13364 + }, + { + "epoch": 0.9655571008001156, + "grad_norm": 6.480321881341313, + "learning_rate": 2.7602932890961655e-06, + "loss": 0.8489, + "step": 13365 + }, + { + "epoch": 0.9656293460003251, + "grad_norm": 6.0803902530909, + "learning_rate": 2.760002374670521e-06, + "loss": 0.7653, + "step": 13366 + }, + { + "epoch": 0.9657015912005346, + "grad_norm": 6.67695856399328, + "learning_rate": 2.7597114566856457e-06, + "loss": 0.7633, + "step": 13367 + }, + { + "epoch": 0.9657738364007441, + "grad_norm": 7.297141615511921, + "learning_rate": 2.7594205351455222e-06, + "loss": 0.7527, + "step": 13368 + }, + { + "epoch": 0.9658460816009536, + "grad_norm": 6.245025180738397, + "learning_rate": 2.7591296100541326e-06, + "loss": 0.6989, + "step": 13369 + }, + { + "epoch": 0.9659183268011632, + "grad_norm": 6.530650718244771, + "learning_rate": 2.7588386814154605e-06, + "loss": 0.8557, + "step": 13370 + }, + { + "epoch": 0.9659905720013726, + "grad_norm": 6.1855663013785565, + "learning_rate": 2.758547749233488e-06, + "loss": 0.8486, + "step": 13371 + }, + { + "epoch": 0.9660628172015822, + "grad_norm": 7.403536400046037, + "learning_rate": 2.758256813512197e-06, + "loss": 0.8046, + "step": 13372 + }, + { + "epoch": 0.9661350624017917, + "grad_norm": 7.0062339816561146, + "learning_rate": 2.7579658742555704e-06, + "loss": 0.9052, + "step": 13373 + }, + { + "epoch": 0.9662073076020012, + "grad_norm": 6.66525561976483, + "learning_rate": 2.757674931467591e-06, + "loss": 0.7931, + "step": 13374 + }, + { + "epoch": 0.9662795528022107, + "grad_norm": 6.0284306553008875, + "learning_rate": 2.7573839851522435e-06, + "loss": 0.8067, + "step": 13375 + }, + { + "epoch": 0.9663517980024202, + "grad_norm": 6.8804496666981585, + "learning_rate": 2.7570930353135077e-06, + "loss": 0.7662, + "step": 13376 + }, + { + "epoch": 0.9664240432026298, + "grad_norm": 8.140490042067492, + "learning_rate": 2.7568020819553687e-06, + "loss": 0.7919, + "step": 13377 + }, + { + "epoch": 0.9664962884028392, + "grad_norm": 7.166736365873901, + "learning_rate": 2.756511125081808e-06, + "loss": 0.76, + "step": 13378 + }, + { + "epoch": 0.9665685336030487, + "grad_norm": 6.457814579300416, + "learning_rate": 2.756220164696809e-06, + "loss": 0.795, + "step": 13379 + }, + { + "epoch": 0.9666407788032583, + "grad_norm": 7.617784682119518, + "learning_rate": 2.755929200804356e-06, + "loss": 0.8446, + "step": 13380 + }, + { + "epoch": 0.9667130240034678, + "grad_norm": 5.758924110385523, + "learning_rate": 2.7556382334084296e-06, + "loss": 0.7612, + "step": 13381 + }, + { + "epoch": 0.9667852692036772, + "grad_norm": 7.063439230791884, + "learning_rate": 2.755347262513015e-06, + "loss": 0.8122, + "step": 13382 + }, + { + "epoch": 0.9668575144038868, + "grad_norm": 5.8781236501526895, + "learning_rate": 2.755056288122094e-06, + "loss": 0.8205, + "step": 13383 + }, + { + "epoch": 0.9669297596040963, + "grad_norm": 8.167213525433233, + "learning_rate": 2.7547653102396515e-06, + "loss": 0.8413, + "step": 13384 + }, + { + "epoch": 0.9670020048043058, + "grad_norm": 5.232717860935959, + "learning_rate": 2.754474328869669e-06, + "loss": 0.7452, + "step": 13385 + }, + { + "epoch": 0.9670742500045153, + "grad_norm": 6.2581645280783444, + "learning_rate": 2.754183344016131e-06, + "loss": 0.8443, + "step": 13386 + }, + { + "epoch": 0.9671464952047248, + "grad_norm": 5.4273483397152935, + "learning_rate": 2.7538923556830198e-06, + "loss": 0.7989, + "step": 13387 + }, + { + "epoch": 0.9672187404049344, + "grad_norm": 6.851014623191772, + "learning_rate": 2.75360136387432e-06, + "loss": 0.7449, + "step": 13388 + }, + { + "epoch": 0.9672909856051438, + "grad_norm": 6.165575738585361, + "learning_rate": 2.753310368594014e-06, + "loss": 0.7812, + "step": 13389 + }, + { + "epoch": 0.9673632308053534, + "grad_norm": 7.023035566224418, + "learning_rate": 2.7530193698460855e-06, + "loss": 0.7593, + "step": 13390 + }, + { + "epoch": 0.9674354760055629, + "grad_norm": 6.2196722065698715, + "learning_rate": 2.752728367634519e-06, + "loss": 0.8579, + "step": 13391 + }, + { + "epoch": 0.9675077212057724, + "grad_norm": 6.2171144107760865, + "learning_rate": 2.752437361963297e-06, + "loss": 0.8214, + "step": 13392 + }, + { + "epoch": 0.9675799664059819, + "grad_norm": 6.947308728693244, + "learning_rate": 2.7521463528364034e-06, + "loss": 0.8299, + "step": 13393 + }, + { + "epoch": 0.9676522116061914, + "grad_norm": 5.35756908719317, + "learning_rate": 2.751855340257822e-06, + "loss": 0.7788, + "step": 13394 + }, + { + "epoch": 0.967724456806401, + "grad_norm": 6.246384757624462, + "learning_rate": 2.751564324231537e-06, + "loss": 0.829, + "step": 13395 + }, + { + "epoch": 0.9677967020066104, + "grad_norm": 6.130612351665736, + "learning_rate": 2.7512733047615313e-06, + "loss": 0.7899, + "step": 13396 + }, + { + "epoch": 0.9678689472068199, + "grad_norm": 7.24898627198559, + "learning_rate": 2.750982281851789e-06, + "loss": 0.7844, + "step": 13397 + }, + { + "epoch": 0.9679411924070295, + "grad_norm": 5.334779225494306, + "learning_rate": 2.750691255506295e-06, + "loss": 0.7318, + "step": 13398 + }, + { + "epoch": 0.968013437607239, + "grad_norm": 6.8199274013380435, + "learning_rate": 2.7504002257290313e-06, + "loss": 0.8309, + "step": 13399 + }, + { + "epoch": 0.9680856828074484, + "grad_norm": 6.998151807666402, + "learning_rate": 2.7501091925239835e-06, + "loss": 0.7615, + "step": 13400 + }, + { + "epoch": 0.968157928007658, + "grad_norm": 6.276160156482183, + "learning_rate": 2.749818155895135e-06, + "loss": 0.6709, + "step": 13401 + }, + { + "epoch": 0.9682301732078675, + "grad_norm": 7.231418343235497, + "learning_rate": 2.74952711584647e-06, + "loss": 0.777, + "step": 13402 + }, + { + "epoch": 0.968302418408077, + "grad_norm": 5.709353873177763, + "learning_rate": 2.7492360723819715e-06, + "loss": 0.8381, + "step": 13403 + }, + { + "epoch": 0.9683746636082865, + "grad_norm": 6.978693005634689, + "learning_rate": 2.748945025505626e-06, + "loss": 0.7795, + "step": 13404 + }, + { + "epoch": 0.968446908808496, + "grad_norm": 6.785504250395022, + "learning_rate": 2.7486539752214166e-06, + "loss": 0.7503, + "step": 13405 + }, + { + "epoch": 0.9685191540087056, + "grad_norm": 5.135615264374538, + "learning_rate": 2.748362921533327e-06, + "loss": 0.8172, + "step": 13406 + }, + { + "epoch": 0.968591399208915, + "grad_norm": 5.984372450227917, + "learning_rate": 2.7480718644453407e-06, + "loss": 0.7827, + "step": 13407 + }, + { + "epoch": 0.9686636444091246, + "grad_norm": 6.152789232383535, + "learning_rate": 2.747780803961444e-06, + "loss": 0.774, + "step": 13408 + }, + { + "epoch": 0.9687358896093341, + "grad_norm": 6.346656471746468, + "learning_rate": 2.747489740085621e-06, + "loss": 0.8385, + "step": 13409 + }, + { + "epoch": 0.9688081348095436, + "grad_norm": 5.7755818391790354, + "learning_rate": 2.7471986728218553e-06, + "loss": 0.7914, + "step": 13410 + }, + { + "epoch": 0.9688803800097531, + "grad_norm": 5.714270332860681, + "learning_rate": 2.746907602174131e-06, + "loss": 0.8257, + "step": 13411 + }, + { + "epoch": 0.9689526252099626, + "grad_norm": 5.740410146913298, + "learning_rate": 2.7466165281464345e-06, + "loss": 0.7466, + "step": 13412 + }, + { + "epoch": 0.9690248704101722, + "grad_norm": 6.570212927028631, + "learning_rate": 2.7463254507427484e-06, + "loss": 0.7412, + "step": 13413 + }, + { + "epoch": 0.9690971156103816, + "grad_norm": 6.120395720410083, + "learning_rate": 2.746034369967059e-06, + "loss": 0.8272, + "step": 13414 + }, + { + "epoch": 0.9691693608105911, + "grad_norm": 6.743431568847605, + "learning_rate": 2.745743285823349e-06, + "loss": 0.7606, + "step": 13415 + }, + { + "epoch": 0.9692416060108007, + "grad_norm": 6.299703185340455, + "learning_rate": 2.7454521983156047e-06, + "loss": 0.8146, + "step": 13416 + }, + { + "epoch": 0.9693138512110102, + "grad_norm": 5.8759329846366875, + "learning_rate": 2.7451611074478103e-06, + "loss": 0.7557, + "step": 13417 + }, + { + "epoch": 0.9693860964112196, + "grad_norm": 7.414472048093049, + "learning_rate": 2.744870013223951e-06, + "loss": 0.8045, + "step": 13418 + }, + { + "epoch": 0.9694583416114292, + "grad_norm": 6.480038436022137, + "learning_rate": 2.744578915648011e-06, + "loss": 0.7877, + "step": 13419 + }, + { + "epoch": 0.9695305868116387, + "grad_norm": 7.223887807386749, + "learning_rate": 2.744287814723976e-06, + "loss": 0.8416, + "step": 13420 + }, + { + "epoch": 0.9696028320118482, + "grad_norm": 6.419229004100109, + "learning_rate": 2.7439967104558295e-06, + "loss": 0.8463, + "step": 13421 + }, + { + "epoch": 0.9696750772120577, + "grad_norm": 6.372834080783165, + "learning_rate": 2.7437056028475584e-06, + "loss": 0.778, + "step": 13422 + }, + { + "epoch": 0.9697473224122672, + "grad_norm": 5.666979313621643, + "learning_rate": 2.7434144919031468e-06, + "loss": 0.7995, + "step": 13423 + }, + { + "epoch": 0.9698195676124768, + "grad_norm": 6.39901339555743, + "learning_rate": 2.743123377626579e-06, + "loss": 0.8123, + "step": 13424 + }, + { + "epoch": 0.9698918128126862, + "grad_norm": 7.714513936779278, + "learning_rate": 2.7428322600218415e-06, + "loss": 0.7817, + "step": 13425 + }, + { + "epoch": 0.9699640580128958, + "grad_norm": 5.77367271594601, + "learning_rate": 2.742541139092919e-06, + "loss": 0.775, + "step": 13426 + }, + { + "epoch": 0.9700363032131053, + "grad_norm": 5.937510279596364, + "learning_rate": 2.742250014843797e-06, + "loss": 0.724, + "step": 13427 + }, + { + "epoch": 0.9701085484133148, + "grad_norm": 9.459235015327366, + "learning_rate": 2.74195888727846e-06, + "loss": 0.8878, + "step": 13428 + }, + { + "epoch": 0.9701807936135243, + "grad_norm": 7.043867801740634, + "learning_rate": 2.7416677564008938e-06, + "loss": 0.8259, + "step": 13429 + }, + { + "epoch": 0.9702530388137338, + "grad_norm": 5.691931035989195, + "learning_rate": 2.7413766222150827e-06, + "loss": 0.7478, + "step": 13430 + }, + { + "epoch": 0.9703252840139434, + "grad_norm": 5.3855492620420655, + "learning_rate": 2.7410854847250142e-06, + "loss": 0.7667, + "step": 13431 + }, + { + "epoch": 0.9703975292141528, + "grad_norm": 6.922005335698004, + "learning_rate": 2.7407943439346725e-06, + "loss": 0.7681, + "step": 13432 + }, + { + "epoch": 0.9704697744143623, + "grad_norm": 9.639405011061895, + "learning_rate": 2.740503199848043e-06, + "loss": 0.7923, + "step": 13433 + }, + { + "epoch": 0.9705420196145719, + "grad_norm": 7.1447779225459485, + "learning_rate": 2.7402120524691115e-06, + "loss": 0.7366, + "step": 13434 + }, + { + "epoch": 0.9706142648147814, + "grad_norm": 7.996216117544994, + "learning_rate": 2.7399209018018634e-06, + "loss": 0.8315, + "step": 13435 + }, + { + "epoch": 0.9706865100149908, + "grad_norm": 7.183076243509348, + "learning_rate": 2.7396297478502843e-06, + "loss": 0.8158, + "step": 13436 + }, + { + "epoch": 0.9707587552152004, + "grad_norm": 5.983841755268902, + "learning_rate": 2.73933859061836e-06, + "loss": 0.7812, + "step": 13437 + }, + { + "epoch": 0.9708310004154099, + "grad_norm": 5.302903607163731, + "learning_rate": 2.739047430110077e-06, + "loss": 0.8077, + "step": 13438 + }, + { + "epoch": 0.9709032456156194, + "grad_norm": 6.429235987146371, + "learning_rate": 2.738756266329419e-06, + "loss": 0.7704, + "step": 13439 + }, + { + "epoch": 0.9709754908158289, + "grad_norm": 6.390737220133283, + "learning_rate": 2.7384650992803737e-06, + "loss": 0.7908, + "step": 13440 + }, + { + "epoch": 0.9710477360160384, + "grad_norm": 6.146068227988194, + "learning_rate": 2.738173928966927e-06, + "loss": 0.8595, + "step": 13441 + }, + { + "epoch": 0.971119981216248, + "grad_norm": 7.196812443330896, + "learning_rate": 2.737882755393063e-06, + "loss": 0.7831, + "step": 13442 + }, + { + "epoch": 0.9711922264164574, + "grad_norm": 8.096747001834672, + "learning_rate": 2.73759157856277e-06, + "loss": 0.8513, + "step": 13443 + }, + { + "epoch": 0.971264471616667, + "grad_norm": 7.2503777767281505, + "learning_rate": 2.7373003984800318e-06, + "loss": 0.8131, + "step": 13444 + }, + { + "epoch": 0.9713367168168765, + "grad_norm": 6.305943171406309, + "learning_rate": 2.7370092151488354e-06, + "loss": 0.788, + "step": 13445 + }, + { + "epoch": 0.971408962017086, + "grad_norm": 6.816794905198558, + "learning_rate": 2.7367180285731675e-06, + "loss": 0.795, + "step": 13446 + }, + { + "epoch": 0.9714812072172955, + "grad_norm": 5.8022554418855625, + "learning_rate": 2.736426838757013e-06, + "loss": 0.7591, + "step": 13447 + }, + { + "epoch": 0.971553452417505, + "grad_norm": 6.30088175098087, + "learning_rate": 2.73613564570436e-06, + "loss": 0.7805, + "step": 13448 + }, + { + "epoch": 0.9716256976177146, + "grad_norm": 6.4779491527666435, + "learning_rate": 2.7358444494191914e-06, + "loss": 0.7962, + "step": 13449 + }, + { + "epoch": 0.971697942817924, + "grad_norm": 6.953576234041513, + "learning_rate": 2.735553249905496e-06, + "loss": 0.8839, + "step": 13450 + }, + { + "epoch": 0.9717701880181335, + "grad_norm": 5.362744931760693, + "learning_rate": 2.7352620471672597e-06, + "loss": 0.7648, + "step": 13451 + }, + { + "epoch": 0.9718424332183431, + "grad_norm": 5.854309333700962, + "learning_rate": 2.7349708412084693e-06, + "loss": 0.8536, + "step": 13452 + }, + { + "epoch": 0.9719146784185526, + "grad_norm": 6.887824841158525, + "learning_rate": 2.7346796320331106e-06, + "loss": 0.8237, + "step": 13453 + }, + { + "epoch": 0.971986923618762, + "grad_norm": 6.9289013611297445, + "learning_rate": 2.734388419645168e-06, + "loss": 0.7507, + "step": 13454 + }, + { + "epoch": 0.9720591688189716, + "grad_norm": 7.486278952763766, + "learning_rate": 2.7340972040486318e-06, + "loss": 0.815, + "step": 13455 + }, + { + "epoch": 0.9721314140191811, + "grad_norm": 6.999417144487734, + "learning_rate": 2.7338059852474857e-06, + "loss": 0.8305, + "step": 13456 + }, + { + "epoch": 0.9722036592193906, + "grad_norm": 6.31449329895587, + "learning_rate": 2.7335147632457184e-06, + "loss": 0.7868, + "step": 13457 + }, + { + "epoch": 0.9722759044196001, + "grad_norm": 7.126018785955659, + "learning_rate": 2.7332235380473144e-06, + "loss": 0.8315, + "step": 13458 + }, + { + "epoch": 0.9723481496198096, + "grad_norm": 6.12378945357599, + "learning_rate": 2.732932309656261e-06, + "loss": 0.7459, + "step": 13459 + }, + { + "epoch": 0.9724203948200192, + "grad_norm": 5.365622568546515, + "learning_rate": 2.732641078076545e-06, + "loss": 0.766, + "step": 13460 + }, + { + "epoch": 0.9724926400202286, + "grad_norm": 6.347636117756536, + "learning_rate": 2.7323498433121542e-06, + "loss": 0.818, + "step": 13461 + }, + { + "epoch": 0.9725648852204382, + "grad_norm": 6.586901538743654, + "learning_rate": 2.7320586053670746e-06, + "loss": 0.8401, + "step": 13462 + }, + { + "epoch": 0.9726371304206477, + "grad_norm": 5.333348731177673, + "learning_rate": 2.731767364245292e-06, + "loss": 0.7844, + "step": 13463 + }, + { + "epoch": 0.9727093756208572, + "grad_norm": 5.475312224939441, + "learning_rate": 2.7314761199507944e-06, + "loss": 0.7625, + "step": 13464 + }, + { + "epoch": 0.9727816208210667, + "grad_norm": 5.649368230556311, + "learning_rate": 2.731184872487568e-06, + "loss": 0.7527, + "step": 13465 + }, + { + "epoch": 0.9728538660212762, + "grad_norm": 6.05229507925038, + "learning_rate": 2.7308936218596012e-06, + "loss": 0.827, + "step": 13466 + }, + { + "epoch": 0.9729261112214858, + "grad_norm": 6.242857246610922, + "learning_rate": 2.730602368070879e-06, + "loss": 0.7948, + "step": 13467 + }, + { + "epoch": 0.9729983564216952, + "grad_norm": 6.496937543853294, + "learning_rate": 2.7303111111253898e-06, + "loss": 0.7377, + "step": 13468 + }, + { + "epoch": 0.9730706016219047, + "grad_norm": 6.392030594008925, + "learning_rate": 2.73001985102712e-06, + "loss": 0.771, + "step": 13469 + }, + { + "epoch": 0.9731428468221143, + "grad_norm": 7.726909078109253, + "learning_rate": 2.7297285877800584e-06, + "loss": 0.8523, + "step": 13470 + }, + { + "epoch": 0.9732150920223238, + "grad_norm": 8.026662265780606, + "learning_rate": 2.7294373213881895e-06, + "loss": 0.7516, + "step": 13471 + }, + { + "epoch": 0.9732873372225332, + "grad_norm": 7.033348211239571, + "learning_rate": 2.7291460518555026e-06, + "loss": 0.8001, + "step": 13472 + }, + { + "epoch": 0.9733595824227428, + "grad_norm": 6.432683531733361, + "learning_rate": 2.7288547791859832e-06, + "loss": 0.7329, + "step": 13473 + }, + { + "epoch": 0.9734318276229523, + "grad_norm": 5.2794592716305395, + "learning_rate": 2.72856350338362e-06, + "loss": 0.8125, + "step": 13474 + }, + { + "epoch": 0.9735040728231618, + "grad_norm": 7.221294004343541, + "learning_rate": 2.7282722244524005e-06, + "loss": 0.8327, + "step": 13475 + }, + { + "epoch": 0.9735763180233713, + "grad_norm": 5.286870910881058, + "learning_rate": 2.727980942396311e-06, + "loss": 0.6935, + "step": 13476 + }, + { + "epoch": 0.9736485632235808, + "grad_norm": 5.886043800992078, + "learning_rate": 2.7276896572193394e-06, + "loss": 0.7721, + "step": 13477 + }, + { + "epoch": 0.9737208084237904, + "grad_norm": 6.557892553437548, + "learning_rate": 2.727398368925473e-06, + "loss": 0.8441, + "step": 13478 + }, + { + "epoch": 0.9737930536239998, + "grad_norm": 5.56246245296536, + "learning_rate": 2.7271070775187e-06, + "loss": 0.7868, + "step": 13479 + }, + { + "epoch": 0.9738652988242094, + "grad_norm": 7.291580664717897, + "learning_rate": 2.7268157830030075e-06, + "loss": 0.8189, + "step": 13480 + }, + { + "epoch": 0.9739375440244189, + "grad_norm": 5.73181827103221, + "learning_rate": 2.7265244853823823e-06, + "loss": 0.8097, + "step": 13481 + }, + { + "epoch": 0.9740097892246284, + "grad_norm": 7.587792088324163, + "learning_rate": 2.7262331846608127e-06, + "loss": 0.8012, + "step": 13482 + }, + { + "epoch": 0.9740820344248379, + "grad_norm": 7.295501775812865, + "learning_rate": 2.7259418808422865e-06, + "loss": 0.8067, + "step": 13483 + }, + { + "epoch": 0.9741542796250474, + "grad_norm": 7.092315125393867, + "learning_rate": 2.7256505739307924e-06, + "loss": 0.8525, + "step": 13484 + }, + { + "epoch": 0.974226524825257, + "grad_norm": 5.725895321064052, + "learning_rate": 2.7253592639303165e-06, + "loss": 0.8248, + "step": 13485 + }, + { + "epoch": 0.9742987700254664, + "grad_norm": 7.024214144698746, + "learning_rate": 2.7250679508448476e-06, + "loss": 0.7918, + "step": 13486 + }, + { + "epoch": 0.9743710152256759, + "grad_norm": 5.96358758056379, + "learning_rate": 2.724776634678373e-06, + "loss": 0.7329, + "step": 13487 + }, + { + "epoch": 0.9744432604258855, + "grad_norm": 6.641416830088767, + "learning_rate": 2.7244853154348804e-06, + "loss": 0.8327, + "step": 13488 + }, + { + "epoch": 0.974515505626095, + "grad_norm": 6.600898808631072, + "learning_rate": 2.7241939931183587e-06, + "loss": 0.752, + "step": 13489 + }, + { + "epoch": 0.9745877508263044, + "grad_norm": 5.81469389827317, + "learning_rate": 2.7239026677327952e-06, + "loss": 0.8366, + "step": 13490 + }, + { + "epoch": 0.974659996026514, + "grad_norm": 5.84724069060336, + "learning_rate": 2.723611339282178e-06, + "loss": 0.8035, + "step": 13491 + }, + { + "epoch": 0.9747322412267235, + "grad_norm": 6.984914327872935, + "learning_rate": 2.7233200077704943e-06, + "loss": 0.7286, + "step": 13492 + }, + { + "epoch": 0.974804486426933, + "grad_norm": 6.187045302204696, + "learning_rate": 2.723028673201734e-06, + "loss": 0.888, + "step": 13493 + }, + { + "epoch": 0.9748767316271425, + "grad_norm": 5.440083208008733, + "learning_rate": 2.722737335579885e-06, + "loss": 0.7603, + "step": 13494 + }, + { + "epoch": 0.974948976827352, + "grad_norm": 5.685333845523814, + "learning_rate": 2.722445994908934e-06, + "loss": 0.796, + "step": 13495 + }, + { + "epoch": 0.9750212220275616, + "grad_norm": 9.704558679686597, + "learning_rate": 2.7221546511928697e-06, + "loss": 0.9522, + "step": 13496 + }, + { + "epoch": 0.975093467227771, + "grad_norm": 6.735687623580177, + "learning_rate": 2.7218633044356807e-06, + "loss": 0.8148, + "step": 13497 + }, + { + "epoch": 0.9751657124279806, + "grad_norm": 6.9215652706521835, + "learning_rate": 2.721571954641356e-06, + "loss": 0.8162, + "step": 13498 + }, + { + "epoch": 0.9752379576281901, + "grad_norm": 5.710694690140083, + "learning_rate": 2.7212806018138825e-06, + "loss": 0.7545, + "step": 13499 + }, + { + "epoch": 0.9753102028283996, + "grad_norm": 5.5143607530493055, + "learning_rate": 2.7209892459572507e-06, + "loss": 0.736, + "step": 13500 + }, + { + "epoch": 0.9753824480286091, + "grad_norm": 5.86568833054267, + "learning_rate": 2.720697887075447e-06, + "loss": 0.8289, + "step": 13501 + }, + { + "epoch": 0.9754546932288186, + "grad_norm": 6.039611080860807, + "learning_rate": 2.72040652517246e-06, + "loss": 0.8454, + "step": 13502 + }, + { + "epoch": 0.9755269384290282, + "grad_norm": 8.230847408381386, + "learning_rate": 2.72011516025228e-06, + "loss": 0.732, + "step": 13503 + }, + { + "epoch": 0.9755991836292376, + "grad_norm": 5.504264738646784, + "learning_rate": 2.719823792318893e-06, + "loss": 0.7268, + "step": 13504 + }, + { + "epoch": 0.9756714288294471, + "grad_norm": 6.2740230453306305, + "learning_rate": 2.71953242137629e-06, + "loss": 0.8824, + "step": 13505 + }, + { + "epoch": 0.9757436740296567, + "grad_norm": 7.035467756238548, + "learning_rate": 2.7192410474284577e-06, + "loss": 0.7756, + "step": 13506 + }, + { + "epoch": 0.9758159192298662, + "grad_norm": 6.317434667167187, + "learning_rate": 2.7189496704793862e-06, + "loss": 0.7432, + "step": 13507 + }, + { + "epoch": 0.9758881644300756, + "grad_norm": 5.030626909992824, + "learning_rate": 2.7186582905330637e-06, + "loss": 0.7589, + "step": 13508 + }, + { + "epoch": 0.9759604096302852, + "grad_norm": 5.371840857130444, + "learning_rate": 2.718366907593479e-06, + "loss": 0.769, + "step": 13509 + }, + { + "epoch": 0.9760326548304947, + "grad_norm": 7.240071041959944, + "learning_rate": 2.718075521664621e-06, + "loss": 0.8794, + "step": 13510 + }, + { + "epoch": 0.9761049000307042, + "grad_norm": 6.149401578243839, + "learning_rate": 2.7177841327504785e-06, + "loss": 0.8366, + "step": 13511 + }, + { + "epoch": 0.9761771452309137, + "grad_norm": 6.400981541706368, + "learning_rate": 2.71749274085504e-06, + "loss": 0.8387, + "step": 13512 + }, + { + "epoch": 0.9762493904311232, + "grad_norm": 5.054429202430075, + "learning_rate": 2.717201345982295e-06, + "loss": 0.7438, + "step": 13513 + }, + { + "epoch": 0.9763216356313328, + "grad_norm": 5.572967453009196, + "learning_rate": 2.716909948136232e-06, + "loss": 0.7045, + "step": 13514 + }, + { + "epoch": 0.9763938808315422, + "grad_norm": 6.4414796842387805, + "learning_rate": 2.7166185473208404e-06, + "loss": 0.7685, + "step": 13515 + }, + { + "epoch": 0.9764661260317518, + "grad_norm": 6.243496983072605, + "learning_rate": 2.716327143540109e-06, + "loss": 0.7764, + "step": 13516 + }, + { + "epoch": 0.9765383712319613, + "grad_norm": 6.647026441218201, + "learning_rate": 2.7160357367980265e-06, + "loss": 0.7961, + "step": 13517 + }, + { + "epoch": 0.9766106164321708, + "grad_norm": 5.705355273459026, + "learning_rate": 2.7157443270985833e-06, + "loss": 0.7273, + "step": 13518 + }, + { + "epoch": 0.9766828616323803, + "grad_norm": 6.754931591034845, + "learning_rate": 2.715452914445768e-06, + "loss": 0.8892, + "step": 13519 + }, + { + "epoch": 0.9767551068325898, + "grad_norm": 6.908701741971605, + "learning_rate": 2.7151614988435683e-06, + "loss": 0.843, + "step": 13520 + }, + { + "epoch": 0.9768273520327994, + "grad_norm": 8.304507895076897, + "learning_rate": 2.714870080295976e-06, + "loss": 0.8833, + "step": 13521 + }, + { + "epoch": 0.9768995972330088, + "grad_norm": 6.346130826325892, + "learning_rate": 2.7145786588069786e-06, + "loss": 0.8356, + "step": 13522 + }, + { + "epoch": 0.9769718424332183, + "grad_norm": 6.553535369263039, + "learning_rate": 2.714287234380566e-06, + "loss": 0.8028, + "step": 13523 + }, + { + "epoch": 0.9770440876334279, + "grad_norm": 6.059255459311103, + "learning_rate": 2.7139958070207272e-06, + "loss": 0.7238, + "step": 13524 + }, + { + "epoch": 0.9771163328336374, + "grad_norm": 5.242318300983186, + "learning_rate": 2.7137043767314525e-06, + "loss": 0.7633, + "step": 13525 + }, + { + "epoch": 0.9771885780338468, + "grad_norm": 5.840486808762612, + "learning_rate": 2.7134129435167305e-06, + "loss": 0.7698, + "step": 13526 + }, + { + "epoch": 0.9772608232340564, + "grad_norm": 5.959658104223621, + "learning_rate": 2.7131215073805513e-06, + "loss": 0.7678, + "step": 13527 + }, + { + "epoch": 0.9773330684342659, + "grad_norm": 5.963239592740651, + "learning_rate": 2.7128300683269042e-06, + "loss": 0.8431, + "step": 13528 + }, + { + "epoch": 0.9774053136344754, + "grad_norm": 7.076213115305577, + "learning_rate": 2.7125386263597785e-06, + "loss": 0.8343, + "step": 13529 + }, + { + "epoch": 0.9774775588346849, + "grad_norm": 6.318971847729846, + "learning_rate": 2.7122471814831637e-06, + "loss": 0.7884, + "step": 13530 + }, + { + "epoch": 0.9775498040348944, + "grad_norm": 6.053825858548854, + "learning_rate": 2.71195573370105e-06, + "loss": 0.8187, + "step": 13531 + }, + { + "epoch": 0.977622049235104, + "grad_norm": 8.1014786865843, + "learning_rate": 2.7116642830174272e-06, + "loss": 0.766, + "step": 13532 + }, + { + "epoch": 0.9776942944353134, + "grad_norm": 5.10017242888133, + "learning_rate": 2.7113728294362847e-06, + "loss": 0.7437, + "step": 13533 + }, + { + "epoch": 0.977766539635523, + "grad_norm": 7.146015961948166, + "learning_rate": 2.7110813729616115e-06, + "loss": 0.794, + "step": 13534 + }, + { + "epoch": 0.9778387848357325, + "grad_norm": 6.143434148917838, + "learning_rate": 2.7107899135973985e-06, + "loss": 0.7607, + "step": 13535 + }, + { + "epoch": 0.977911030035942, + "grad_norm": 5.064424878801871, + "learning_rate": 2.7104984513476364e-06, + "loss": 0.8198, + "step": 13536 + }, + { + "epoch": 0.9779832752361515, + "grad_norm": 5.665740928579014, + "learning_rate": 2.710206986216313e-06, + "loss": 0.6682, + "step": 13537 + }, + { + "epoch": 0.978055520436361, + "grad_norm": 6.106115071456576, + "learning_rate": 2.7099155182074196e-06, + "loss": 0.8096, + "step": 13538 + }, + { + "epoch": 0.9781277656365706, + "grad_norm": 7.140548605478926, + "learning_rate": 2.7096240473249447e-06, + "loss": 0.7774, + "step": 13539 + }, + { + "epoch": 0.97820001083678, + "grad_norm": 5.8167079337477, + "learning_rate": 2.7093325735728797e-06, + "loss": 0.8187, + "step": 13540 + }, + { + "epoch": 0.9782722560369895, + "grad_norm": 6.449626278954742, + "learning_rate": 2.7090410969552154e-06, + "loss": 0.843, + "step": 13541 + }, + { + "epoch": 0.9783445012371991, + "grad_norm": 4.941951628129861, + "learning_rate": 2.7087496174759397e-06, + "loss": 0.7328, + "step": 13542 + }, + { + "epoch": 0.9784167464374086, + "grad_norm": 7.060015486141003, + "learning_rate": 2.708458135139045e-06, + "loss": 0.8508, + "step": 13543 + }, + { + "epoch": 0.978488991637618, + "grad_norm": 5.527909299136738, + "learning_rate": 2.7081666499485187e-06, + "loss": 0.7958, + "step": 13544 + }, + { + "epoch": 0.9785612368378276, + "grad_norm": 6.584939720439829, + "learning_rate": 2.707875161908353e-06, + "loss": 0.8067, + "step": 13545 + }, + { + "epoch": 0.9786334820380371, + "grad_norm": 5.740648377380782, + "learning_rate": 2.707583671022539e-06, + "loss": 0.8593, + "step": 13546 + }, + { + "epoch": 0.9787057272382466, + "grad_norm": 5.228411839124484, + "learning_rate": 2.7072921772950646e-06, + "loss": 0.7332, + "step": 13547 + }, + { + "epoch": 0.9787779724384561, + "grad_norm": 6.594584769248826, + "learning_rate": 2.7070006807299213e-06, + "loss": 0.7226, + "step": 13548 + }, + { + "epoch": 0.9788502176386656, + "grad_norm": 6.116222398026346, + "learning_rate": 2.7067091813310993e-06, + "loss": 0.7849, + "step": 13549 + }, + { + "epoch": 0.9789224628388752, + "grad_norm": 6.4512239108179905, + "learning_rate": 2.70641767910259e-06, + "loss": 0.7848, + "step": 13550 + }, + { + "epoch": 0.9789947080390846, + "grad_norm": 6.6653008333438795, + "learning_rate": 2.7061261740483818e-06, + "loss": 0.7627, + "step": 13551 + }, + { + "epoch": 0.9790669532392942, + "grad_norm": 7.524586115509397, + "learning_rate": 2.705834666172467e-06, + "loss": 0.7716, + "step": 13552 + }, + { + "epoch": 0.9791391984395037, + "grad_norm": 7.780172372982511, + "learning_rate": 2.7055431554788355e-06, + "loss": 0.8266, + "step": 13553 + }, + { + "epoch": 0.9792114436397132, + "grad_norm": 6.659155842662434, + "learning_rate": 2.705251641971477e-06, + "loss": 0.8202, + "step": 13554 + }, + { + "epoch": 0.9792836888399227, + "grad_norm": 5.905110890173371, + "learning_rate": 2.704960125654384e-06, + "loss": 0.8039, + "step": 13555 + }, + { + "epoch": 0.9793559340401322, + "grad_norm": 6.0317341506413955, + "learning_rate": 2.7046686065315453e-06, + "loss": 0.7588, + "step": 13556 + }, + { + "epoch": 0.9794281792403418, + "grad_norm": 8.153392305740512, + "learning_rate": 2.7043770846069527e-06, + "loss": 0.9228, + "step": 13557 + }, + { + "epoch": 0.9795004244405512, + "grad_norm": 6.32056991373593, + "learning_rate": 2.704085559884596e-06, + "loss": 0.8493, + "step": 13558 + }, + { + "epoch": 0.9795726696407607, + "grad_norm": 6.696599456059944, + "learning_rate": 2.7037940323684663e-06, + "loss": 0.8136, + "step": 13559 + }, + { + "epoch": 0.9796449148409703, + "grad_norm": 6.386291974942031, + "learning_rate": 2.7035025020625555e-06, + "loss": 0.8203, + "step": 13560 + }, + { + "epoch": 0.9797171600411798, + "grad_norm": 6.016578024143414, + "learning_rate": 2.703210968970853e-06, + "loss": 0.7819, + "step": 13561 + }, + { + "epoch": 0.9797894052413892, + "grad_norm": 8.33417318562485, + "learning_rate": 2.70291943309735e-06, + "loss": 0.7877, + "step": 13562 + }, + { + "epoch": 0.9798616504415988, + "grad_norm": 8.504184366145788, + "learning_rate": 2.7026278944460373e-06, + "loss": 0.8104, + "step": 13563 + }, + { + "epoch": 0.9799338956418083, + "grad_norm": 5.683529316645077, + "learning_rate": 2.702336353020906e-06, + "loss": 0.8405, + "step": 13564 + }, + { + "epoch": 0.9800061408420178, + "grad_norm": 5.354209078411738, + "learning_rate": 2.702044808825948e-06, + "loss": 0.7392, + "step": 13565 + }, + { + "epoch": 0.9800783860422273, + "grad_norm": 5.954153697927601, + "learning_rate": 2.7017532618651526e-06, + "loss": 0.817, + "step": 13566 + }, + { + "epoch": 0.9801506312424368, + "grad_norm": 6.188040353319186, + "learning_rate": 2.701461712142512e-06, + "loss": 0.7218, + "step": 13567 + }, + { + "epoch": 0.9802228764426464, + "grad_norm": 6.603517358029941, + "learning_rate": 2.7011701596620167e-06, + "loss": 0.7675, + "step": 13568 + }, + { + "epoch": 0.9802951216428558, + "grad_norm": 6.558435252979883, + "learning_rate": 2.7008786044276586e-06, + "loss": 0.8234, + "step": 13569 + }, + { + "epoch": 0.9803673668430654, + "grad_norm": 6.013110142955024, + "learning_rate": 2.700587046443428e-06, + "loss": 0.7991, + "step": 13570 + }, + { + "epoch": 0.9804396120432749, + "grad_norm": 6.145551806537468, + "learning_rate": 2.700295485713317e-06, + "loss": 0.8178, + "step": 13571 + }, + { + "epoch": 0.9805118572434844, + "grad_norm": 5.534041912409561, + "learning_rate": 2.700003922241316e-06, + "loss": 0.7848, + "step": 13572 + }, + { + "epoch": 0.9805841024436939, + "grad_norm": 7.204543629791843, + "learning_rate": 2.6997123560314166e-06, + "loss": 0.8665, + "step": 13573 + }, + { + "epoch": 0.9806563476439034, + "grad_norm": 6.623410304095352, + "learning_rate": 2.6994207870876095e-06, + "loss": 0.8698, + "step": 13574 + }, + { + "epoch": 0.980728592844113, + "grad_norm": 6.6541893863310015, + "learning_rate": 2.6991292154138877e-06, + "loss": 0.7589, + "step": 13575 + }, + { + "epoch": 0.9808008380443224, + "grad_norm": 7.019875782794121, + "learning_rate": 2.6988376410142407e-06, + "loss": 0.8077, + "step": 13576 + }, + { + "epoch": 0.9808730832445319, + "grad_norm": 5.965497953441903, + "learning_rate": 2.698546063892661e-06, + "loss": 0.8608, + "step": 13577 + }, + { + "epoch": 0.9809453284447415, + "grad_norm": 5.915611961096467, + "learning_rate": 2.6982544840531404e-06, + "loss": 0.7681, + "step": 13578 + }, + { + "epoch": 0.981017573644951, + "grad_norm": 6.129247943988457, + "learning_rate": 2.6979629014996696e-06, + "loss": 0.802, + "step": 13579 + }, + { + "epoch": 0.9810898188451604, + "grad_norm": 7.200426767205039, + "learning_rate": 2.69767131623624e-06, + "loss": 0.8176, + "step": 13580 + }, + { + "epoch": 0.98116206404537, + "grad_norm": 8.966163607906672, + "learning_rate": 2.697379728266844e-06, + "loss": 0.8198, + "step": 13581 + }, + { + "epoch": 0.9812343092455795, + "grad_norm": 6.50839249131803, + "learning_rate": 2.6970881375954722e-06, + "loss": 0.8213, + "step": 13582 + }, + { + "epoch": 0.981306554445789, + "grad_norm": 5.211821382472996, + "learning_rate": 2.6967965442261166e-06, + "loss": 0.7405, + "step": 13583 + }, + { + "epoch": 0.9813787996459985, + "grad_norm": 6.012373563265826, + "learning_rate": 2.69650494816277e-06, + "loss": 0.8446, + "step": 13584 + }, + { + "epoch": 0.981451044846208, + "grad_norm": 6.334880472574572, + "learning_rate": 2.696213349409424e-06, + "loss": 0.8096, + "step": 13585 + }, + { + "epoch": 0.9815232900464176, + "grad_norm": 6.7339742235914555, + "learning_rate": 2.6959217479700684e-06, + "loss": 0.868, + "step": 13586 + }, + { + "epoch": 0.981595535246627, + "grad_norm": 6.156817879602083, + "learning_rate": 2.695630143848696e-06, + "loss": 0.7861, + "step": 13587 + }, + { + "epoch": 0.9816677804468366, + "grad_norm": 5.557542284600018, + "learning_rate": 2.695338537049299e-06, + "loss": 0.7801, + "step": 13588 + }, + { + "epoch": 0.9817400256470461, + "grad_norm": 5.714867446443524, + "learning_rate": 2.6950469275758694e-06, + "loss": 0.6739, + "step": 13589 + }, + { + "epoch": 0.9818122708472555, + "grad_norm": 5.685674688854478, + "learning_rate": 2.6947553154323987e-06, + "loss": 0.6638, + "step": 13590 + }, + { + "epoch": 0.9818845160474651, + "grad_norm": 6.242911629814712, + "learning_rate": 2.6944637006228786e-06, + "loss": 0.7486, + "step": 13591 + }, + { + "epoch": 0.9819567612476746, + "grad_norm": 5.762578722898079, + "learning_rate": 2.6941720831513017e-06, + "loss": 0.7469, + "step": 13592 + }, + { + "epoch": 0.9820290064478842, + "grad_norm": 6.710822107746294, + "learning_rate": 2.6938804630216604e-06, + "loss": 0.8617, + "step": 13593 + }, + { + "epoch": 0.9821012516480936, + "grad_norm": 6.149839520105128, + "learning_rate": 2.6935888402379456e-06, + "loss": 0.8134, + "step": 13594 + }, + { + "epoch": 0.9821734968483031, + "grad_norm": 5.525438607183851, + "learning_rate": 2.6932972148041497e-06, + "loss": 0.8196, + "step": 13595 + }, + { + "epoch": 0.9822457420485127, + "grad_norm": 7.191138366649663, + "learning_rate": 2.693005586724265e-06, + "loss": 0.8284, + "step": 13596 + }, + { + "epoch": 0.9823179872487222, + "grad_norm": 6.245311962487468, + "learning_rate": 2.6927139560022835e-06, + "loss": 0.7782, + "step": 13597 + }, + { + "epoch": 0.9823902324489316, + "grad_norm": 6.291112643811041, + "learning_rate": 2.692422322642198e-06, + "loss": 0.8144, + "step": 13598 + }, + { + "epoch": 0.9824624776491412, + "grad_norm": 7.552274237664352, + "learning_rate": 2.6921306866480003e-06, + "loss": 0.7705, + "step": 13599 + }, + { + "epoch": 0.9825347228493507, + "grad_norm": 5.758905232024186, + "learning_rate": 2.6918390480236826e-06, + "loss": 0.8846, + "step": 13600 + }, + { + "epoch": 0.9826069680495602, + "grad_norm": 8.108461890879173, + "learning_rate": 2.6915474067732367e-06, + "loss": 0.723, + "step": 13601 + }, + { + "epoch": 0.9826792132497697, + "grad_norm": 5.927156775298325, + "learning_rate": 2.6912557629006563e-06, + "loss": 0.8301, + "step": 13602 + }, + { + "epoch": 0.9827514584499792, + "grad_norm": 4.8126831019854945, + "learning_rate": 2.690964116409933e-06, + "loss": 0.6634, + "step": 13603 + }, + { + "epoch": 0.9828237036501888, + "grad_norm": 6.122650007074621, + "learning_rate": 2.690672467305059e-06, + "loss": 0.8086, + "step": 13604 + }, + { + "epoch": 0.9828959488503982, + "grad_norm": 7.190499650044158, + "learning_rate": 2.6903808155900267e-06, + "loss": 0.7681, + "step": 13605 + }, + { + "epoch": 0.9829681940506078, + "grad_norm": 6.377845278806207, + "learning_rate": 2.690089161268829e-06, + "loss": 0.8277, + "step": 13606 + }, + { + "epoch": 0.9830404392508173, + "grad_norm": 6.33311977361036, + "learning_rate": 2.689797504345459e-06, + "loss": 0.7936, + "step": 13607 + }, + { + "epoch": 0.9831126844510267, + "grad_norm": 6.148782761609235, + "learning_rate": 2.6895058448239075e-06, + "loss": 0.8, + "step": 13608 + }, + { + "epoch": 0.9831849296512363, + "grad_norm": 6.751123405502829, + "learning_rate": 2.6892141827081687e-06, + "loss": 0.8454, + "step": 13609 + }, + { + "epoch": 0.9832571748514458, + "grad_norm": 5.676533281002766, + "learning_rate": 2.6889225180022344e-06, + "loss": 0.7609, + "step": 13610 + }, + { + "epoch": 0.9833294200516554, + "grad_norm": 6.413620593219522, + "learning_rate": 2.6886308507100972e-06, + "loss": 0.8345, + "step": 13611 + }, + { + "epoch": 0.9834016652518648, + "grad_norm": 6.281652333293921, + "learning_rate": 2.6883391808357513e-06, + "loss": 0.7837, + "step": 13612 + }, + { + "epoch": 0.9834739104520743, + "grad_norm": 6.811886261058221, + "learning_rate": 2.688047508383187e-06, + "loss": 0.7956, + "step": 13613 + }, + { + "epoch": 0.9835461556522839, + "grad_norm": 6.632499049388302, + "learning_rate": 2.687755833356399e-06, + "loss": 0.7429, + "step": 13614 + }, + { + "epoch": 0.9836184008524934, + "grad_norm": 6.403352705638096, + "learning_rate": 2.687464155759379e-06, + "loss": 0.8397, + "step": 13615 + }, + { + "epoch": 0.9836906460527028, + "grad_norm": 6.104839543913795, + "learning_rate": 2.6871724755961203e-06, + "loss": 0.8528, + "step": 13616 + }, + { + "epoch": 0.9837628912529124, + "grad_norm": 6.777356132913759, + "learning_rate": 2.686880792870616e-06, + "loss": 0.845, + "step": 13617 + }, + { + "epoch": 0.9838351364531219, + "grad_norm": 6.142284062416681, + "learning_rate": 2.686589107586859e-06, + "loss": 0.8678, + "step": 13618 + }, + { + "epoch": 0.9839073816533314, + "grad_norm": 5.634689462765173, + "learning_rate": 2.6862974197488416e-06, + "loss": 0.758, + "step": 13619 + }, + { + "epoch": 0.9839796268535409, + "grad_norm": 7.840333232415093, + "learning_rate": 2.6860057293605566e-06, + "loss": 0.9605, + "step": 13620 + }, + { + "epoch": 0.9840518720537504, + "grad_norm": 5.284976344348871, + "learning_rate": 2.685714036425999e-06, + "loss": 0.7768, + "step": 13621 + }, + { + "epoch": 0.98412411725396, + "grad_norm": 6.458386656838418, + "learning_rate": 2.6854223409491596e-06, + "loss": 0.8562, + "step": 13622 + }, + { + "epoch": 0.9841963624541694, + "grad_norm": 7.388730731823469, + "learning_rate": 2.685130642934033e-06, + "loss": 0.7517, + "step": 13623 + }, + { + "epoch": 0.984268607654379, + "grad_norm": 5.784585516348364, + "learning_rate": 2.6848389423846107e-06, + "loss": 0.7474, + "step": 13624 + }, + { + "epoch": 0.9843408528545885, + "grad_norm": 5.103461433528779, + "learning_rate": 2.6845472393048868e-06, + "loss": 0.7448, + "step": 13625 + }, + { + "epoch": 0.9844130980547979, + "grad_norm": 5.549727788899541, + "learning_rate": 2.6842555336988554e-06, + "loss": 0.7609, + "step": 13626 + }, + { + "epoch": 0.9844853432550075, + "grad_norm": 5.46601843053151, + "learning_rate": 2.683963825570508e-06, + "loss": 0.7439, + "step": 13627 + }, + { + "epoch": 0.984557588455217, + "grad_norm": 5.029943450549187, + "learning_rate": 2.6836721149238394e-06, + "loss": 0.7239, + "step": 13628 + }, + { + "epoch": 0.9846298336554266, + "grad_norm": 6.951039342214351, + "learning_rate": 2.6833804017628412e-06, + "loss": 0.7724, + "step": 13629 + }, + { + "epoch": 0.984702078855636, + "grad_norm": 6.071113995007906, + "learning_rate": 2.683088686091508e-06, + "loss": 0.8186, + "step": 13630 + }, + { + "epoch": 0.9847743240558455, + "grad_norm": 6.046356415505152, + "learning_rate": 2.6827969679138326e-06, + "loss": 0.7276, + "step": 13631 + }, + { + "epoch": 0.9848465692560551, + "grad_norm": 7.354537507848179, + "learning_rate": 2.6825052472338097e-06, + "loss": 0.8152, + "step": 13632 + }, + { + "epoch": 0.9849188144562646, + "grad_norm": 5.987791356143518, + "learning_rate": 2.682213524055431e-06, + "loss": 0.7121, + "step": 13633 + }, + { + "epoch": 0.984991059656474, + "grad_norm": 7.882413492983303, + "learning_rate": 2.68192179838269e-06, + "loss": 0.8537, + "step": 13634 + }, + { + "epoch": 0.9850633048566836, + "grad_norm": 7.373901285345203, + "learning_rate": 2.6816300702195818e-06, + "loss": 0.8019, + "step": 13635 + }, + { + "epoch": 0.9851355500568931, + "grad_norm": 7.023023344904684, + "learning_rate": 2.681338339570099e-06, + "loss": 0.8087, + "step": 13636 + }, + { + "epoch": 0.9852077952571026, + "grad_norm": 5.118707841599417, + "learning_rate": 2.6810466064382346e-06, + "loss": 0.7688, + "step": 13637 + }, + { + "epoch": 0.9852800404573121, + "grad_norm": 5.978100866656887, + "learning_rate": 2.680754870827983e-06, + "loss": 0.764, + "step": 13638 + }, + { + "epoch": 0.9853522856575216, + "grad_norm": 5.834429646968067, + "learning_rate": 2.680463132743337e-06, + "loss": 0.7967, + "step": 13639 + }, + { + "epoch": 0.9854245308577312, + "grad_norm": 6.729833642680501, + "learning_rate": 2.680171392188291e-06, + "loss": 0.8032, + "step": 13640 + }, + { + "epoch": 0.9854967760579406, + "grad_norm": 6.810294818070155, + "learning_rate": 2.6798796491668394e-06, + "loss": 0.7652, + "step": 13641 + }, + { + "epoch": 0.9855690212581502, + "grad_norm": 7.016933395619803, + "learning_rate": 2.679587903682974e-06, + "loss": 0.8299, + "step": 13642 + }, + { + "epoch": 0.9856412664583597, + "grad_norm": 6.378613214717502, + "learning_rate": 2.679296155740691e-06, + "loss": 0.7418, + "step": 13643 + }, + { + "epoch": 0.9857135116585691, + "grad_norm": 5.984714428710269, + "learning_rate": 2.6790044053439817e-06, + "loss": 0.9055, + "step": 13644 + }, + { + "epoch": 0.9857857568587787, + "grad_norm": 6.313327848330689, + "learning_rate": 2.678712652496841e-06, + "loss": 0.7805, + "step": 13645 + }, + { + "epoch": 0.9858580020589882, + "grad_norm": 5.9505814853195425, + "learning_rate": 2.6784208972032637e-06, + "loss": 0.8306, + "step": 13646 + }, + { + "epoch": 0.9859302472591978, + "grad_norm": 6.813480971516597, + "learning_rate": 2.678129139467242e-06, + "loss": 0.8184, + "step": 13647 + }, + { + "epoch": 0.9860024924594072, + "grad_norm": 6.229473433920079, + "learning_rate": 2.6778373792927708e-06, + "loss": 0.8634, + "step": 13648 + }, + { + "epoch": 0.9860747376596167, + "grad_norm": 5.610384114488898, + "learning_rate": 2.677545616683844e-06, + "loss": 0.8127, + "step": 13649 + }, + { + "epoch": 0.9861469828598263, + "grad_norm": 6.534983466783433, + "learning_rate": 2.6772538516444563e-06, + "loss": 0.8249, + "step": 13650 + }, + { + "epoch": 0.9862192280600358, + "grad_norm": 5.739207546811742, + "learning_rate": 2.6769620841786008e-06, + "loss": 0.7399, + "step": 13651 + }, + { + "epoch": 0.9862914732602452, + "grad_norm": 5.428293787517724, + "learning_rate": 2.6766703142902717e-06, + "loss": 0.773, + "step": 13652 + }, + { + "epoch": 0.9863637184604548, + "grad_norm": 7.780899959182401, + "learning_rate": 2.6763785419834627e-06, + "loss": 0.8066, + "step": 13653 + }, + { + "epoch": 0.9864359636606643, + "grad_norm": 7.447341690455362, + "learning_rate": 2.676086767262168e-06, + "loss": 0.7551, + "step": 13654 + }, + { + "epoch": 0.9865082088608738, + "grad_norm": 6.015227703499774, + "learning_rate": 2.675794990130383e-06, + "loss": 0.8737, + "step": 13655 + }, + { + "epoch": 0.9865804540610833, + "grad_norm": 7.156824613598176, + "learning_rate": 2.6755032105921006e-06, + "loss": 0.7935, + "step": 13656 + }, + { + "epoch": 0.9866526992612928, + "grad_norm": 7.956069970622519, + "learning_rate": 2.6752114286513164e-06, + "loss": 0.786, + "step": 13657 + }, + { + "epoch": 0.9867249444615024, + "grad_norm": 5.562128976170778, + "learning_rate": 2.674919644312023e-06, + "loss": 0.7364, + "step": 13658 + }, + { + "epoch": 0.9867971896617118, + "grad_norm": 8.853571422240346, + "learning_rate": 2.674627857578216e-06, + "loss": 0.8545, + "step": 13659 + }, + { + "epoch": 0.9868694348619214, + "grad_norm": 5.822312143390146, + "learning_rate": 2.674336068453889e-06, + "loss": 0.7902, + "step": 13660 + }, + { + "epoch": 0.9869416800621309, + "grad_norm": 6.738430987919597, + "learning_rate": 2.6740442769430363e-06, + "loss": 0.7651, + "step": 13661 + }, + { + "epoch": 0.9870139252623403, + "grad_norm": 6.628558085104004, + "learning_rate": 2.6737524830496525e-06, + "loss": 0.767, + "step": 13662 + }, + { + "epoch": 0.9870861704625499, + "grad_norm": 5.907050830297531, + "learning_rate": 2.6734606867777323e-06, + "loss": 0.7851, + "step": 13663 + }, + { + "epoch": 0.9871584156627594, + "grad_norm": 5.910843289234843, + "learning_rate": 2.6731688881312705e-06, + "loss": 0.8023, + "step": 13664 + }, + { + "epoch": 0.987230660862969, + "grad_norm": 5.8858931176983456, + "learning_rate": 2.6728770871142608e-06, + "loss": 0.8404, + "step": 13665 + }, + { + "epoch": 0.9873029060631784, + "grad_norm": 6.306853233196261, + "learning_rate": 2.6725852837306978e-06, + "loss": 0.787, + "step": 13666 + }, + { + "epoch": 0.9873751512633879, + "grad_norm": 8.20775864294985, + "learning_rate": 2.672293477984576e-06, + "loss": 0.8007, + "step": 13667 + }, + { + "epoch": 0.9874473964635975, + "grad_norm": 7.334691731034552, + "learning_rate": 2.6720016698798906e-06, + "loss": 0.8277, + "step": 13668 + }, + { + "epoch": 0.987519641663807, + "grad_norm": 7.456947609945419, + "learning_rate": 2.6717098594206358e-06, + "loss": 0.7821, + "step": 13669 + }, + { + "epoch": 0.9875918868640164, + "grad_norm": 6.755196725506443, + "learning_rate": 2.6714180466108065e-06, + "loss": 0.831, + "step": 13670 + }, + { + "epoch": 0.987664132064226, + "grad_norm": 7.041695528892943, + "learning_rate": 2.6711262314543977e-06, + "loss": 0.8604, + "step": 13671 + }, + { + "epoch": 0.9877363772644355, + "grad_norm": 6.566401602467124, + "learning_rate": 2.670834413955402e-06, + "loss": 0.7813, + "step": 13672 + }, + { + "epoch": 0.987808622464645, + "grad_norm": 7.603845737600198, + "learning_rate": 2.6705425941178174e-06, + "loss": 0.7674, + "step": 13673 + }, + { + "epoch": 0.9878808676648545, + "grad_norm": 7.626338153424033, + "learning_rate": 2.670250771945637e-06, + "loss": 0.8047, + "step": 13674 + }, + { + "epoch": 0.987953112865064, + "grad_norm": 6.912734081864589, + "learning_rate": 2.6699589474428556e-06, + "loss": 0.7654, + "step": 13675 + }, + { + "epoch": 0.9880253580652736, + "grad_norm": 5.549937088068527, + "learning_rate": 2.6696671206134676e-06, + "loss": 0.7593, + "step": 13676 + }, + { + "epoch": 0.988097603265483, + "grad_norm": 6.116700446566543, + "learning_rate": 2.6693752914614683e-06, + "loss": 0.8228, + "step": 13677 + }, + { + "epoch": 0.9881698484656926, + "grad_norm": 5.522518874723715, + "learning_rate": 2.669083459990854e-06, + "loss": 0.7731, + "step": 13678 + }, + { + "epoch": 0.9882420936659021, + "grad_norm": 7.080763706030906, + "learning_rate": 2.6687916262056173e-06, + "loss": 0.7575, + "step": 13679 + }, + { + "epoch": 0.9883143388661115, + "grad_norm": 7.690554624481183, + "learning_rate": 2.6684997901097554e-06, + "loss": 0.7866, + "step": 13680 + }, + { + "epoch": 0.9883865840663211, + "grad_norm": 5.983627551539042, + "learning_rate": 2.6682079517072614e-06, + "loss": 0.7855, + "step": 13681 + }, + { + "epoch": 0.9884588292665306, + "grad_norm": 6.9270570529831215, + "learning_rate": 2.6679161110021307e-06, + "loss": 0.7899, + "step": 13682 + }, + { + "epoch": 0.9885310744667402, + "grad_norm": 6.089709071835029, + "learning_rate": 2.6676242679983593e-06, + "loss": 0.7943, + "step": 13683 + }, + { + "epoch": 0.9886033196669496, + "grad_norm": 5.969849490212548, + "learning_rate": 2.667332422699942e-06, + "loss": 0.8016, + "step": 13684 + }, + { + "epoch": 0.9886755648671591, + "grad_norm": 6.137143922254141, + "learning_rate": 2.667040575110874e-06, + "loss": 0.726, + "step": 13685 + }, + { + "epoch": 0.9887478100673687, + "grad_norm": 7.062296906648449, + "learning_rate": 2.6667487252351493e-06, + "loss": 0.7821, + "step": 13686 + }, + { + "epoch": 0.9888200552675782, + "grad_norm": 5.028145157872602, + "learning_rate": 2.666456873076765e-06, + "loss": 0.7751, + "step": 13687 + }, + { + "epoch": 0.9888923004677876, + "grad_norm": 6.230882015095067, + "learning_rate": 2.666165018639715e-06, + "loss": 0.7507, + "step": 13688 + }, + { + "epoch": 0.9889645456679972, + "grad_norm": 8.193898736344307, + "learning_rate": 2.665873161927995e-06, + "loss": 0.8292, + "step": 13689 + }, + { + "epoch": 0.9890367908682067, + "grad_norm": 7.661818367291108, + "learning_rate": 2.6655813029455997e-06, + "loss": 0.7081, + "step": 13690 + }, + { + "epoch": 0.9891090360684162, + "grad_norm": 6.575168262257777, + "learning_rate": 2.665289441696525e-06, + "loss": 0.8098, + "step": 13691 + }, + { + "epoch": 0.9891812812686257, + "grad_norm": 4.872861735586248, + "learning_rate": 2.664997578184767e-06, + "loss": 0.7558, + "step": 13692 + }, + { + "epoch": 0.9892535264688352, + "grad_norm": 5.7545201323068635, + "learning_rate": 2.6647057124143194e-06, + "loss": 0.8289, + "step": 13693 + }, + { + "epoch": 0.9893257716690448, + "grad_norm": 6.623017788291334, + "learning_rate": 2.664413844389179e-06, + "loss": 0.8087, + "step": 13694 + }, + { + "epoch": 0.9893980168692542, + "grad_norm": 6.237770322396513, + "learning_rate": 2.6641219741133405e-06, + "loss": 0.8594, + "step": 13695 + }, + { + "epoch": 0.9894702620694638, + "grad_norm": 5.7137168873637165, + "learning_rate": 2.6638301015907996e-06, + "loss": 0.7317, + "step": 13696 + }, + { + "epoch": 0.9895425072696733, + "grad_norm": 6.085750395771969, + "learning_rate": 2.6635382268255517e-06, + "loss": 0.8161, + "step": 13697 + }, + { + "epoch": 0.9896147524698827, + "grad_norm": 5.821395466047565, + "learning_rate": 2.6632463498215932e-06, + "loss": 0.7185, + "step": 13698 + }, + { + "epoch": 0.9896869976700923, + "grad_norm": 6.013640475214387, + "learning_rate": 2.662954470582918e-06, + "loss": 0.8165, + "step": 13699 + }, + { + "epoch": 0.9897592428703018, + "grad_norm": 4.456998726327435, + "learning_rate": 2.662662589113523e-06, + "loss": 0.74, + "step": 13700 + }, + { + "epoch": 0.9898314880705114, + "grad_norm": 5.693925853572082, + "learning_rate": 2.6623707054174035e-06, + "loss": 0.763, + "step": 13701 + }, + { + "epoch": 0.9899037332707208, + "grad_norm": 5.488554576434581, + "learning_rate": 2.662078819498555e-06, + "loss": 0.7888, + "step": 13702 + }, + { + "epoch": 0.9899759784709303, + "grad_norm": 8.066172628275483, + "learning_rate": 2.6617869313609735e-06, + "loss": 0.7666, + "step": 13703 + }, + { + "epoch": 0.9900482236711399, + "grad_norm": 6.485361488402471, + "learning_rate": 2.661495041008654e-06, + "loss": 0.8049, + "step": 13704 + }, + { + "epoch": 0.9901204688713494, + "grad_norm": 6.447881235283138, + "learning_rate": 2.661203148445593e-06, + "loss": 0.7897, + "step": 13705 + }, + { + "epoch": 0.9901927140715588, + "grad_norm": 6.762918895785074, + "learning_rate": 2.6609112536757865e-06, + "loss": 0.7619, + "step": 13706 + }, + { + "epoch": 0.9902649592717684, + "grad_norm": 6.62930906259928, + "learning_rate": 2.6606193567032295e-06, + "loss": 0.8383, + "step": 13707 + }, + { + "epoch": 0.9903372044719779, + "grad_norm": 6.897962473912417, + "learning_rate": 2.660327457531918e-06, + "loss": 0.8101, + "step": 13708 + }, + { + "epoch": 0.9904094496721874, + "grad_norm": 5.696125574249324, + "learning_rate": 2.6600355561658492e-06, + "loss": 0.7814, + "step": 13709 + }, + { + "epoch": 0.9904816948723969, + "grad_norm": 6.024945853215586, + "learning_rate": 2.659743652609017e-06, + "loss": 0.7515, + "step": 13710 + }, + { + "epoch": 0.9905539400726064, + "grad_norm": 6.215218773058005, + "learning_rate": 2.659451746865418e-06, + "loss": 0.7815, + "step": 13711 + }, + { + "epoch": 0.990626185272816, + "grad_norm": 6.354759547188854, + "learning_rate": 2.659159838939049e-06, + "loss": 0.8228, + "step": 13712 + }, + { + "epoch": 0.9906984304730254, + "grad_norm": 6.1593697480634635, + "learning_rate": 2.6588679288339054e-06, + "loss": 0.7878, + "step": 13713 + }, + { + "epoch": 0.990770675673235, + "grad_norm": 6.24796719871439, + "learning_rate": 2.6585760165539825e-06, + "loss": 0.8003, + "step": 13714 + }, + { + "epoch": 0.9908429208734445, + "grad_norm": 5.237993225326322, + "learning_rate": 2.658284102103277e-06, + "loss": 0.8329, + "step": 13715 + }, + { + "epoch": 0.9909151660736539, + "grad_norm": 5.8170250127345415, + "learning_rate": 2.657992185485786e-06, + "loss": 0.7569, + "step": 13716 + }, + { + "epoch": 0.9909874112738635, + "grad_norm": 6.361836324975773, + "learning_rate": 2.6577002667055046e-06, + "loss": 0.7883, + "step": 13717 + }, + { + "epoch": 0.991059656474073, + "grad_norm": 4.950992161124536, + "learning_rate": 2.6574083457664283e-06, + "loss": 0.7089, + "step": 13718 + }, + { + "epoch": 0.9911319016742826, + "grad_norm": 5.737828184228639, + "learning_rate": 2.6571164226725543e-06, + "loss": 0.738, + "step": 13719 + }, + { + "epoch": 0.991204146874492, + "grad_norm": 6.2548159545851085, + "learning_rate": 2.656824497427878e-06, + "loss": 0.8226, + "step": 13720 + }, + { + "epoch": 0.9912763920747015, + "grad_norm": 7.064768199101175, + "learning_rate": 2.656532570036397e-06, + "loss": 0.8277, + "step": 13721 + }, + { + "epoch": 0.9913486372749111, + "grad_norm": 6.027594689060944, + "learning_rate": 2.6562406405021056e-06, + "loss": 0.8083, + "step": 13722 + }, + { + "epoch": 0.9914208824751206, + "grad_norm": 7.257658760733785, + "learning_rate": 2.6559487088290025e-06, + "loss": 0.6747, + "step": 13723 + }, + { + "epoch": 0.99149312767533, + "grad_norm": 5.987917177961545, + "learning_rate": 2.6556567750210816e-06, + "loss": 0.7573, + "step": 13724 + }, + { + "epoch": 0.9915653728755396, + "grad_norm": 5.51425162445306, + "learning_rate": 2.6553648390823406e-06, + "loss": 0.7651, + "step": 13725 + }, + { + "epoch": 0.9916376180757491, + "grad_norm": 5.490899793753656, + "learning_rate": 2.655072901016776e-06, + "loss": 0.7674, + "step": 13726 + }, + { + "epoch": 0.9917098632759586, + "grad_norm": 6.306522069379625, + "learning_rate": 2.6547809608283825e-06, + "loss": 0.8074, + "step": 13727 + }, + { + "epoch": 0.9917821084761681, + "grad_norm": 6.220382706131688, + "learning_rate": 2.6544890185211585e-06, + "loss": 0.757, + "step": 13728 + }, + { + "epoch": 0.9918543536763776, + "grad_norm": 5.071928874615186, + "learning_rate": 2.6541970740991e-06, + "loss": 0.7851, + "step": 13729 + }, + { + "epoch": 0.9919265988765872, + "grad_norm": 5.630550443528514, + "learning_rate": 2.6539051275662036e-06, + "loss": 0.7702, + "step": 13730 + }, + { + "epoch": 0.9919988440767966, + "grad_norm": 8.129957828576579, + "learning_rate": 2.653613178926465e-06, + "loss": 0.8696, + "step": 13731 + }, + { + "epoch": 0.9920710892770062, + "grad_norm": 5.067383282996461, + "learning_rate": 2.6533212281838815e-06, + "loss": 0.7505, + "step": 13732 + }, + { + "epoch": 0.9921433344772157, + "grad_norm": 6.1703249759504475, + "learning_rate": 2.6530292753424498e-06, + "loss": 0.7974, + "step": 13733 + }, + { + "epoch": 0.9922155796774251, + "grad_norm": 5.642518462766527, + "learning_rate": 2.6527373204061653e-06, + "loss": 0.7809, + "step": 13734 + }, + { + "epoch": 0.9922878248776347, + "grad_norm": 6.5460836801748545, + "learning_rate": 2.6524453633790258e-06, + "loss": 0.6873, + "step": 13735 + }, + { + "epoch": 0.9923600700778442, + "grad_norm": 5.952187772250053, + "learning_rate": 2.6521534042650275e-06, + "loss": 0.791, + "step": 13736 + }, + { + "epoch": 0.9924323152780538, + "grad_norm": 6.394284867542668, + "learning_rate": 2.6518614430681683e-06, + "loss": 0.8012, + "step": 13737 + }, + { + "epoch": 0.9925045604782632, + "grad_norm": 6.169546262605199, + "learning_rate": 2.651569479792442e-06, + "loss": 0.7903, + "step": 13738 + }, + { + "epoch": 0.9925768056784727, + "grad_norm": 9.606599732624003, + "learning_rate": 2.6512775144418483e-06, + "loss": 0.8549, + "step": 13739 + }, + { + "epoch": 0.9926490508786823, + "grad_norm": 6.563100151594025, + "learning_rate": 2.650985547020383e-06, + "loss": 0.8458, + "step": 13740 + }, + { + "epoch": 0.9927212960788918, + "grad_norm": 6.4761448853221, + "learning_rate": 2.650693577532043e-06, + "loss": 0.7832, + "step": 13741 + }, + { + "epoch": 0.9927935412791012, + "grad_norm": 8.061393247832003, + "learning_rate": 2.6504016059808246e-06, + "loss": 0.8191, + "step": 13742 + }, + { + "epoch": 0.9928657864793108, + "grad_norm": 6.40053336782007, + "learning_rate": 2.6501096323707243e-06, + "loss": 0.8322, + "step": 13743 + }, + { + "epoch": 0.9929380316795203, + "grad_norm": 6.509279523086097, + "learning_rate": 2.6498176567057403e-06, + "loss": 0.8281, + "step": 13744 + }, + { + "epoch": 0.9930102768797298, + "grad_norm": 5.6792089373479415, + "learning_rate": 2.649525678989869e-06, + "loss": 0.7378, + "step": 13745 + }, + { + "epoch": 0.9930825220799393, + "grad_norm": 5.551914730219734, + "learning_rate": 2.6492336992271073e-06, + "loss": 0.8139, + "step": 13746 + }, + { + "epoch": 0.9931547672801488, + "grad_norm": 5.7054632540753, + "learning_rate": 2.648941717421452e-06, + "loss": 0.7903, + "step": 13747 + }, + { + "epoch": 0.9932270124803584, + "grad_norm": 7.125284557348805, + "learning_rate": 2.6486497335769e-06, + "loss": 0.7724, + "step": 13748 + }, + { + "epoch": 0.9932992576805678, + "grad_norm": 6.215023898925006, + "learning_rate": 2.6483577476974487e-06, + "loss": 0.8468, + "step": 13749 + }, + { + "epoch": 0.9933715028807774, + "grad_norm": 6.356298803931678, + "learning_rate": 2.6480657597870953e-06, + "loss": 0.8055, + "step": 13750 + }, + { + "epoch": 0.9934437480809869, + "grad_norm": 7.4831302380206965, + "learning_rate": 2.647773769849837e-06, + "loss": 0.7949, + "step": 13751 + }, + { + "epoch": 0.9935159932811963, + "grad_norm": 5.81045627555418, + "learning_rate": 2.6474817778896695e-06, + "loss": 0.6953, + "step": 13752 + }, + { + "epoch": 0.9935882384814059, + "grad_norm": 8.21601619144616, + "learning_rate": 2.6471897839105915e-06, + "loss": 0.8398, + "step": 13753 + }, + { + "epoch": 0.9936604836816154, + "grad_norm": 5.771938103463588, + "learning_rate": 2.646897787916599e-06, + "loss": 0.756, + "step": 13754 + }, + { + "epoch": 0.993732728881825, + "grad_norm": 6.491400972869891, + "learning_rate": 2.646605789911691e-06, + "loss": 0.8195, + "step": 13755 + }, + { + "epoch": 0.9938049740820344, + "grad_norm": 6.843091959213749, + "learning_rate": 2.6463137898998627e-06, + "loss": 0.7507, + "step": 13756 + }, + { + "epoch": 0.9938772192822439, + "grad_norm": 7.291763246895726, + "learning_rate": 2.6460217878851123e-06, + "loss": 0.7439, + "step": 13757 + }, + { + "epoch": 0.9939494644824535, + "grad_norm": 6.2180098663477565, + "learning_rate": 2.6457297838714373e-06, + "loss": 0.8604, + "step": 13758 + }, + { + "epoch": 0.994021709682663, + "grad_norm": 6.006678361334444, + "learning_rate": 2.645437777862835e-06, + "loss": 0.7097, + "step": 13759 + }, + { + "epoch": 0.9940939548828724, + "grad_norm": 7.07731007312056, + "learning_rate": 2.645145769863302e-06, + "loss": 0.8738, + "step": 13760 + }, + { + "epoch": 0.994166200083082, + "grad_norm": 5.541618672022058, + "learning_rate": 2.6448537598768357e-06, + "loss": 0.7878, + "step": 13761 + }, + { + "epoch": 0.9942384452832915, + "grad_norm": 7.969384979679781, + "learning_rate": 2.6445617479074348e-06, + "loss": 0.8028, + "step": 13762 + }, + { + "epoch": 0.994310690483501, + "grad_norm": 5.8405589810758505, + "learning_rate": 2.644269733959095e-06, + "loss": 0.8018, + "step": 13763 + }, + { + "epoch": 0.9943829356837105, + "grad_norm": 7.334000759385826, + "learning_rate": 2.643977718035815e-06, + "loss": 0.8106, + "step": 13764 + }, + { + "epoch": 0.99445518088392, + "grad_norm": 6.689356465998745, + "learning_rate": 2.6436857001415917e-06, + "loss": 0.8075, + "step": 13765 + }, + { + "epoch": 0.9945274260841296, + "grad_norm": 6.352840731331615, + "learning_rate": 2.6433936802804233e-06, + "loss": 0.787, + "step": 13766 + }, + { + "epoch": 0.994599671284339, + "grad_norm": 5.437830772424008, + "learning_rate": 2.6431016584563064e-06, + "loss": 0.7653, + "step": 13767 + }, + { + "epoch": 0.9946719164845486, + "grad_norm": 5.4313405546270905, + "learning_rate": 2.642809634673238e-06, + "loss": 0.7768, + "step": 13768 + }, + { + "epoch": 0.9947441616847581, + "grad_norm": 6.99780402479475, + "learning_rate": 2.642517608935218e-06, + "loss": 0.8486, + "step": 13769 + }, + { + "epoch": 0.9948164068849675, + "grad_norm": 6.1297819193992975, + "learning_rate": 2.6422255812462416e-06, + "loss": 0.8228, + "step": 13770 + }, + { + "epoch": 0.9948886520851771, + "grad_norm": 5.315118839804995, + "learning_rate": 2.641933551610308e-06, + "loss": 0.7139, + "step": 13771 + }, + { + "epoch": 0.9949608972853866, + "grad_norm": 5.824368409063935, + "learning_rate": 2.641641520031413e-06, + "loss": 0.7538, + "step": 13772 + }, + { + "epoch": 0.9950331424855962, + "grad_norm": 6.820336830121981, + "learning_rate": 2.6413494865135575e-06, + "loss": 0.7745, + "step": 13773 + }, + { + "epoch": 0.9951053876858056, + "grad_norm": 7.535174221968588, + "learning_rate": 2.641057451060736e-06, + "loss": 0.7436, + "step": 13774 + }, + { + "epoch": 0.9951776328860151, + "grad_norm": 5.955843247317805, + "learning_rate": 2.6407654136769483e-06, + "loss": 0.7937, + "step": 13775 + }, + { + "epoch": 0.9952498780862247, + "grad_norm": 6.92754329980364, + "learning_rate": 2.6404733743661907e-06, + "loss": 0.7633, + "step": 13776 + }, + { + "epoch": 0.9953221232864342, + "grad_norm": 7.860716688178416, + "learning_rate": 2.640181333132462e-06, + "loss": 0.834, + "step": 13777 + }, + { + "epoch": 0.9953943684866436, + "grad_norm": 6.30694486723561, + "learning_rate": 2.63988928997976e-06, + "loss": 0.7234, + "step": 13778 + }, + { + "epoch": 0.9954666136868532, + "grad_norm": 5.908219912166234, + "learning_rate": 2.6395972449120815e-06, + "loss": 0.7951, + "step": 13779 + }, + { + "epoch": 0.9955388588870627, + "grad_norm": 6.568878271300517, + "learning_rate": 2.639305197933426e-06, + "loss": 0.7632, + "step": 13780 + }, + { + "epoch": 0.9956111040872722, + "grad_norm": 5.970523271232113, + "learning_rate": 2.6390131490477894e-06, + "loss": 0.7796, + "step": 13781 + }, + { + "epoch": 0.9956833492874817, + "grad_norm": 7.007571893514432, + "learning_rate": 2.6387210982591717e-06, + "loss": 0.824, + "step": 13782 + }, + { + "epoch": 0.9957555944876912, + "grad_norm": 5.9338107242052605, + "learning_rate": 2.63842904557157e-06, + "loss": 0.8238, + "step": 13783 + }, + { + "epoch": 0.9958278396879008, + "grad_norm": 5.41619170625237, + "learning_rate": 2.6381369909889816e-06, + "loss": 0.8342, + "step": 13784 + }, + { + "epoch": 0.9959000848881102, + "grad_norm": 6.6809020962609695, + "learning_rate": 2.6378449345154044e-06, + "loss": 0.8234, + "step": 13785 + }, + { + "epoch": 0.9959723300883198, + "grad_norm": 7.401490561401641, + "learning_rate": 2.637552876154838e-06, + "loss": 0.8604, + "step": 13786 + }, + { + "epoch": 0.9960445752885293, + "grad_norm": 7.664740638400955, + "learning_rate": 2.6372608159112795e-06, + "loss": 0.7951, + "step": 13787 + }, + { + "epoch": 0.9961168204887387, + "grad_norm": 6.519596490367851, + "learning_rate": 2.6369687537887265e-06, + "loss": 0.7986, + "step": 13788 + }, + { + "epoch": 0.9961890656889483, + "grad_norm": 10.549275985389611, + "learning_rate": 2.6366766897911785e-06, + "loss": 0.7861, + "step": 13789 + }, + { + "epoch": 0.9962613108891578, + "grad_norm": 5.837263617947211, + "learning_rate": 2.636384623922632e-06, + "loss": 0.8416, + "step": 13790 + }, + { + "epoch": 0.9963335560893674, + "grad_norm": 6.97774674124541, + "learning_rate": 2.636092556187086e-06, + "loss": 0.7504, + "step": 13791 + }, + { + "epoch": 0.9964058012895768, + "grad_norm": 6.17930754254247, + "learning_rate": 2.635800486588539e-06, + "loss": 0.7776, + "step": 13792 + }, + { + "epoch": 0.9964780464897863, + "grad_norm": 6.951845201303095, + "learning_rate": 2.635508415130988e-06, + "loss": 0.8004, + "step": 13793 + }, + { + "epoch": 0.9965502916899959, + "grad_norm": 6.039694137407746, + "learning_rate": 2.635216341818433e-06, + "loss": 0.7851, + "step": 13794 + }, + { + "epoch": 0.9966225368902053, + "grad_norm": 8.357108063677787, + "learning_rate": 2.6349242666548697e-06, + "loss": 0.7712, + "step": 13795 + }, + { + "epoch": 0.9966947820904148, + "grad_norm": 7.151757575140129, + "learning_rate": 2.6346321896442993e-06, + "loss": 0.758, + "step": 13796 + }, + { + "epoch": 0.9967670272906244, + "grad_norm": 7.31097239652924, + "learning_rate": 2.6343401107907183e-06, + "loss": 0.817, + "step": 13797 + }, + { + "epoch": 0.9968392724908339, + "grad_norm": 6.0862474472783274, + "learning_rate": 2.6340480300981263e-06, + "loss": 0.8363, + "step": 13798 + }, + { + "epoch": 0.9969115176910434, + "grad_norm": 7.92791457628099, + "learning_rate": 2.63375594757052e-06, + "loss": 0.816, + "step": 13799 + }, + { + "epoch": 0.9969837628912529, + "grad_norm": 7.517325924582257, + "learning_rate": 2.6334638632118986e-06, + "loss": 0.745, + "step": 13800 + }, + { + "epoch": 0.9970560080914624, + "grad_norm": 5.513107979046215, + "learning_rate": 2.633171777026261e-06, + "loss": 0.7691, + "step": 13801 + }, + { + "epoch": 0.997128253291672, + "grad_norm": 6.514087669450205, + "learning_rate": 2.6328796890176045e-06, + "loss": 0.7126, + "step": 13802 + }, + { + "epoch": 0.9972004984918814, + "grad_norm": 7.255102795346522, + "learning_rate": 2.632587599189929e-06, + "loss": 0.8054, + "step": 13803 + }, + { + "epoch": 0.997272743692091, + "grad_norm": 6.016443925082499, + "learning_rate": 2.6322955075472317e-06, + "loss": 0.8155, + "step": 13804 + }, + { + "epoch": 0.9973449888923005, + "grad_norm": 5.735016581636917, + "learning_rate": 2.6320034140935114e-06, + "loss": 0.8293, + "step": 13805 + }, + { + "epoch": 0.9974172340925099, + "grad_norm": 6.740489617740334, + "learning_rate": 2.631711318832767e-06, + "loss": 0.7515, + "step": 13806 + }, + { + "epoch": 0.9974894792927195, + "grad_norm": 6.085365199944801, + "learning_rate": 2.6314192217689976e-06, + "loss": 0.776, + "step": 13807 + }, + { + "epoch": 0.997561724492929, + "grad_norm": 7.186300691806365, + "learning_rate": 2.631127122906201e-06, + "loss": 0.7624, + "step": 13808 + }, + { + "epoch": 0.9976339696931386, + "grad_norm": 6.853974808285147, + "learning_rate": 2.6308350222483747e-06, + "loss": 0.8056, + "step": 13809 + }, + { + "epoch": 0.997706214893348, + "grad_norm": 6.771744918101969, + "learning_rate": 2.63054291979952e-06, + "loss": 0.8473, + "step": 13810 + }, + { + "epoch": 0.9977784600935575, + "grad_norm": 5.614684076295952, + "learning_rate": 2.630250815563633e-06, + "loss": 0.7899, + "step": 13811 + }, + { + "epoch": 0.9978507052937671, + "grad_norm": 5.920536906665103, + "learning_rate": 2.6299587095447144e-06, + "loss": 0.9055, + "step": 13812 + }, + { + "epoch": 0.9979229504939765, + "grad_norm": 6.488970862847722, + "learning_rate": 2.6296666017467615e-06, + "loss": 0.8036, + "step": 13813 + }, + { + "epoch": 0.997995195694186, + "grad_norm": 5.804803161700985, + "learning_rate": 2.6293744921737734e-06, + "loss": 0.8632, + "step": 13814 + }, + { + "epoch": 0.9980674408943956, + "grad_norm": 6.621488684294509, + "learning_rate": 2.629082380829749e-06, + "loss": 0.8108, + "step": 13815 + }, + { + "epoch": 0.9981396860946051, + "grad_norm": 8.39176606388062, + "learning_rate": 2.6287902677186872e-06, + "loss": 0.7507, + "step": 13816 + }, + { + "epoch": 0.9982119312948146, + "grad_norm": 5.462827778285261, + "learning_rate": 2.6284981528445867e-06, + "loss": 0.7019, + "step": 13817 + }, + { + "epoch": 0.9982841764950241, + "grad_norm": 6.28137844580657, + "learning_rate": 2.628206036211446e-06, + "loss": 0.7715, + "step": 13818 + }, + { + "epoch": 0.9983564216952336, + "grad_norm": 5.482418048070449, + "learning_rate": 2.627913917823264e-06, + "loss": 0.8435, + "step": 13819 + }, + { + "epoch": 0.9984286668954432, + "grad_norm": 5.782041093467165, + "learning_rate": 2.6276217976840403e-06, + "loss": 0.7382, + "step": 13820 + }, + { + "epoch": 0.9985009120956526, + "grad_norm": 6.06584625705342, + "learning_rate": 2.627329675797773e-06, + "loss": 0.7585, + "step": 13821 + }, + { + "epoch": 0.9985731572958622, + "grad_norm": 7.1606354581695415, + "learning_rate": 2.627037552168461e-06, + "loss": 0.7948, + "step": 13822 + }, + { + "epoch": 0.9986454024960717, + "grad_norm": 6.759062723044, + "learning_rate": 2.626745426800104e-06, + "loss": 0.8038, + "step": 13823 + }, + { + "epoch": 0.9987176476962811, + "grad_norm": 7.193098193483372, + "learning_rate": 2.6264532996967006e-06, + "loss": 0.8901, + "step": 13824 + }, + { + "epoch": 0.9987898928964907, + "grad_norm": 5.827551928615728, + "learning_rate": 2.62616117086225e-06, + "loss": 0.7969, + "step": 13825 + }, + { + "epoch": 0.9988621380967002, + "grad_norm": 6.636959293849322, + "learning_rate": 2.62586904030075e-06, + "loss": 0.8271, + "step": 13826 + }, + { + "epoch": 0.9989343832969098, + "grad_norm": 7.853910757359693, + "learning_rate": 2.625576908016201e-06, + "loss": 0.7918, + "step": 13827 + }, + { + "epoch": 0.9990066284971192, + "grad_norm": 6.483951416887892, + "learning_rate": 2.625284774012602e-06, + "loss": 0.7528, + "step": 13828 + }, + { + "epoch": 0.9990788736973287, + "grad_norm": 6.49008831510717, + "learning_rate": 2.624992638293951e-06, + "loss": 0.7866, + "step": 13829 + }, + { + "epoch": 0.9991511188975383, + "grad_norm": 7.134526359165611, + "learning_rate": 2.6247005008642486e-06, + "loss": 0.7511, + "step": 13830 + }, + { + "epoch": 0.9992233640977477, + "grad_norm": 5.780413262928039, + "learning_rate": 2.6244083617274924e-06, + "loss": 0.7768, + "step": 13831 + }, + { + "epoch": 0.9992956092979572, + "grad_norm": 6.135569580797739, + "learning_rate": 2.624116220887683e-06, + "loss": 0.7964, + "step": 13832 + }, + { + "epoch": 0.9993678544981668, + "grad_norm": 5.287968983153823, + "learning_rate": 2.6238240783488184e-06, + "loss": 0.7712, + "step": 13833 + }, + { + "epoch": 0.9994400996983763, + "grad_norm": 5.165107655912698, + "learning_rate": 2.6235319341148983e-06, + "loss": 0.8034, + "step": 13834 + }, + { + "epoch": 0.9995123448985858, + "grad_norm": 6.439620178110668, + "learning_rate": 2.623239788189923e-06, + "loss": 0.7864, + "step": 13835 + }, + { + "epoch": 0.9995845900987953, + "grad_norm": 5.068275453905416, + "learning_rate": 2.622947640577889e-06, + "loss": 0.7623, + "step": 13836 + }, + { + "epoch": 0.9996568352990048, + "grad_norm": 6.601893887300061, + "learning_rate": 2.622655491282799e-06, + "loss": 0.7453, + "step": 13837 + }, + { + "epoch": 0.9997290804992144, + "grad_norm": 6.435535612660859, + "learning_rate": 2.622363340308649e-06, + "loss": 0.7949, + "step": 13838 + }, + { + "epoch": 0.9998013256994238, + "grad_norm": 6.020475101923507, + "learning_rate": 2.6220711876594406e-06, + "loss": 0.8014, + "step": 13839 + }, + { + "epoch": 0.9998735708996334, + "grad_norm": 6.252861283521091, + "learning_rate": 2.6217790333391724e-06, + "loss": 0.6852, + "step": 13840 + }, + { + "epoch": 0.9999458160998429, + "grad_norm": 6.146381970071619, + "learning_rate": 2.6214868773518443e-06, + "loss": 0.7262, + "step": 13841 + }, + { + "epoch": 1.0000180613000524, + "grad_norm": 5.167364545060928, + "learning_rate": 2.6211947197014542e-06, + "loss": 0.7619, + "step": 13842 + }, + { + "epoch": 1.000090306500262, + "grad_norm": 5.6034443140395345, + "learning_rate": 2.6209025603920028e-06, + "loss": 0.6801, + "step": 13843 + }, + { + "epoch": 1.0001625517004713, + "grad_norm": 5.073089826423625, + "learning_rate": 2.6206103994274896e-06, + "loss": 0.6713, + "step": 13844 + }, + { + "epoch": 1.0002347969006808, + "grad_norm": 5.897282326866625, + "learning_rate": 2.6203182368119135e-06, + "loss": 0.6659, + "step": 13845 + }, + { + "epoch": 1.0003070421008904, + "grad_norm": 5.582303744606334, + "learning_rate": 2.6200260725492742e-06, + "loss": 0.6558, + "step": 13846 + }, + { + "epoch": 1.0003792873011, + "grad_norm": 6.710523954534755, + "learning_rate": 2.619733906643571e-06, + "loss": 0.6382, + "step": 13847 + }, + { + "epoch": 1.0004515325013095, + "grad_norm": 5.326781520217977, + "learning_rate": 2.6194417390988036e-06, + "loss": 0.6843, + "step": 13848 + }, + { + "epoch": 1.000523777701519, + "grad_norm": 6.367263887099307, + "learning_rate": 2.6191495699189716e-06, + "loss": 0.7318, + "step": 13849 + }, + { + "epoch": 1.0005960229017286, + "grad_norm": 6.157385087298118, + "learning_rate": 2.6188573991080744e-06, + "loss": 0.6286, + "step": 13850 + }, + { + "epoch": 1.0006682681019379, + "grad_norm": 5.884036642223708, + "learning_rate": 2.6185652266701124e-06, + "loss": 0.6301, + "step": 13851 + }, + { + "epoch": 1.0007405133021474, + "grad_norm": 5.545975217239029, + "learning_rate": 2.6182730526090832e-06, + "loss": 0.6632, + "step": 13852 + }, + { + "epoch": 1.000812758502357, + "grad_norm": 5.635881876003406, + "learning_rate": 2.6179808769289887e-06, + "loss": 0.6333, + "step": 13853 + }, + { + "epoch": 1.0008850037025665, + "grad_norm": 5.683978321396508, + "learning_rate": 2.617688699633827e-06, + "loss": 0.6202, + "step": 13854 + }, + { + "epoch": 1.000957248902776, + "grad_norm": 8.589659449051725, + "learning_rate": 2.6173965207275994e-06, + "loss": 0.6568, + "step": 13855 + }, + { + "epoch": 1.0010294941029856, + "grad_norm": 5.601869284769378, + "learning_rate": 2.6171043402143035e-06, + "loss": 0.6867, + "step": 13856 + }, + { + "epoch": 1.0011017393031951, + "grad_norm": 5.885668219757754, + "learning_rate": 2.61681215809794e-06, + "loss": 0.6729, + "step": 13857 + }, + { + "epoch": 1.0011739845034044, + "grad_norm": 5.644604914095447, + "learning_rate": 2.61651997438251e-06, + "loss": 0.7458, + "step": 13858 + }, + { + "epoch": 1.001246229703614, + "grad_norm": 6.737513543223285, + "learning_rate": 2.6162277890720113e-06, + "loss": 0.7019, + "step": 13859 + }, + { + "epoch": 1.0013184749038235, + "grad_norm": 6.766482257052757, + "learning_rate": 2.6159356021704446e-06, + "loss": 0.6685, + "step": 13860 + }, + { + "epoch": 1.001390720104033, + "grad_norm": 6.349810932378469, + "learning_rate": 2.615643413681809e-06, + "loss": 0.7272, + "step": 13861 + }, + { + "epoch": 1.0014629653042426, + "grad_norm": 6.355667420642494, + "learning_rate": 2.615351223610105e-06, + "loss": 0.759, + "step": 13862 + }, + { + "epoch": 1.0015352105044522, + "grad_norm": 5.825325546663412, + "learning_rate": 2.615059031959332e-06, + "loss": 0.7097, + "step": 13863 + }, + { + "epoch": 1.0016074557046617, + "grad_norm": 5.8075238328264875, + "learning_rate": 2.6147668387334913e-06, + "loss": 0.6466, + "step": 13864 + }, + { + "epoch": 1.0016797009048712, + "grad_norm": 5.344359424484945, + "learning_rate": 2.6144746439365814e-06, + "loss": 0.7173, + "step": 13865 + }, + { + "epoch": 1.0017519461050806, + "grad_norm": 6.194544625529449, + "learning_rate": 2.6141824475726013e-06, + "loss": 0.7129, + "step": 13866 + }, + { + "epoch": 1.00182419130529, + "grad_norm": 5.850502070865471, + "learning_rate": 2.6138902496455536e-06, + "loss": 0.6939, + "step": 13867 + }, + { + "epoch": 1.0018964365054996, + "grad_norm": 5.6900494908223616, + "learning_rate": 2.613598050159436e-06, + "loss": 0.6434, + "step": 13868 + }, + { + "epoch": 1.0019686817057092, + "grad_norm": 6.27841684395089, + "learning_rate": 2.61330584911825e-06, + "loss": 0.6908, + "step": 13869 + }, + { + "epoch": 1.0020409269059187, + "grad_norm": 6.400901385250453, + "learning_rate": 2.6130136465259943e-06, + "loss": 0.7206, + "step": 13870 + }, + { + "epoch": 1.0021131721061283, + "grad_norm": 5.625428755632088, + "learning_rate": 2.6127214423866693e-06, + "loss": 0.6318, + "step": 13871 + }, + { + "epoch": 1.0021854173063378, + "grad_norm": 5.7486593922713505, + "learning_rate": 2.6124292367042752e-06, + "loss": 0.6628, + "step": 13872 + }, + { + "epoch": 1.0022576625065471, + "grad_norm": 5.934379962519318, + "learning_rate": 2.612137029482813e-06, + "loss": 0.7028, + "step": 13873 + }, + { + "epoch": 1.0023299077067567, + "grad_norm": 5.56086861336495, + "learning_rate": 2.611844820726282e-06, + "loss": 0.6175, + "step": 13874 + }, + { + "epoch": 1.0024021529069662, + "grad_norm": 6.2363792008752466, + "learning_rate": 2.6115526104386817e-06, + "loss": 0.6621, + "step": 13875 + }, + { + "epoch": 1.0024743981071758, + "grad_norm": 7.171222864890596, + "learning_rate": 2.6112603986240126e-06, + "loss": 0.6592, + "step": 13876 + }, + { + "epoch": 1.0025466433073853, + "grad_norm": 6.149190660271251, + "learning_rate": 2.610968185286275e-06, + "loss": 0.6979, + "step": 13877 + }, + { + "epoch": 1.0026188885075948, + "grad_norm": 6.3193648374059075, + "learning_rate": 2.61067597042947e-06, + "loss": 0.6503, + "step": 13878 + }, + { + "epoch": 1.0026911337078044, + "grad_norm": 6.81647424527333, + "learning_rate": 2.610383754057596e-06, + "loss": 0.6801, + "step": 13879 + }, + { + "epoch": 1.0027633789080137, + "grad_norm": 6.362761474512696, + "learning_rate": 2.6100915361746544e-06, + "loss": 0.6224, + "step": 13880 + }, + { + "epoch": 1.0028356241082232, + "grad_norm": 6.423157352768027, + "learning_rate": 2.6097993167846448e-06, + "loss": 0.7114, + "step": 13881 + }, + { + "epoch": 1.0029078693084328, + "grad_norm": 5.657229849736875, + "learning_rate": 2.609507095891568e-06, + "loss": 0.6721, + "step": 13882 + }, + { + "epoch": 1.0029801145086423, + "grad_norm": 6.345402244040316, + "learning_rate": 2.609214873499425e-06, + "loss": 0.7171, + "step": 13883 + }, + { + "epoch": 1.0030523597088519, + "grad_norm": 6.0410642477028045, + "learning_rate": 2.608922649612214e-06, + "loss": 0.7278, + "step": 13884 + }, + { + "epoch": 1.0031246049090614, + "grad_norm": 5.7336441946403, + "learning_rate": 2.6086304242339367e-06, + "loss": 0.6708, + "step": 13885 + }, + { + "epoch": 1.003196850109271, + "grad_norm": 5.851350299160556, + "learning_rate": 2.608338197368593e-06, + "loss": 0.7443, + "step": 13886 + }, + { + "epoch": 1.0032690953094803, + "grad_norm": 6.086443310776251, + "learning_rate": 2.608045969020185e-06, + "loss": 0.6344, + "step": 13887 + }, + { + "epoch": 1.0033413405096898, + "grad_norm": 8.17443597463568, + "learning_rate": 2.60775373919271e-06, + "loss": 0.5964, + "step": 13888 + }, + { + "epoch": 1.0034135857098994, + "grad_norm": 5.977562914125955, + "learning_rate": 2.607461507890171e-06, + "loss": 0.704, + "step": 13889 + }, + { + "epoch": 1.003485830910109, + "grad_norm": 6.093457337222985, + "learning_rate": 2.607169275116567e-06, + "loss": 0.6939, + "step": 13890 + }, + { + "epoch": 1.0035580761103184, + "grad_norm": 6.325125277639332, + "learning_rate": 2.6068770408758983e-06, + "loss": 0.6955, + "step": 13891 + }, + { + "epoch": 1.003630321310528, + "grad_norm": 5.388138792044428, + "learning_rate": 2.6065848051721666e-06, + "loss": 0.6243, + "step": 13892 + }, + { + "epoch": 1.0037025665107375, + "grad_norm": 7.962835293451983, + "learning_rate": 2.606292568009371e-06, + "loss": 0.6747, + "step": 13893 + }, + { + "epoch": 1.0037748117109468, + "grad_norm": 5.576277226456629, + "learning_rate": 2.606000329391513e-06, + "loss": 0.6532, + "step": 13894 + }, + { + "epoch": 1.0038470569111564, + "grad_norm": 6.633880421427627, + "learning_rate": 2.605708089322593e-06, + "loss": 0.7527, + "step": 13895 + }, + { + "epoch": 1.003919302111366, + "grad_norm": 7.110568506016925, + "learning_rate": 2.605415847806611e-06, + "loss": 0.6748, + "step": 13896 + }, + { + "epoch": 1.0039915473115755, + "grad_norm": 5.670694191501195, + "learning_rate": 2.605123604847568e-06, + "loss": 0.6796, + "step": 13897 + }, + { + "epoch": 1.004063792511785, + "grad_norm": 7.823440337823407, + "learning_rate": 2.604831360449465e-06, + "loss": 0.618, + "step": 13898 + }, + { + "epoch": 1.0041360377119946, + "grad_norm": 7.222348327839938, + "learning_rate": 2.6045391146163016e-06, + "loss": 0.7067, + "step": 13899 + }, + { + "epoch": 1.004208282912204, + "grad_norm": 5.9657636431514165, + "learning_rate": 2.604246867352079e-06, + "loss": 0.6709, + "step": 13900 + }, + { + "epoch": 1.0042805281124136, + "grad_norm": 5.977834768074834, + "learning_rate": 2.603954618660798e-06, + "loss": 0.7054, + "step": 13901 + }, + { + "epoch": 1.004352773312623, + "grad_norm": 6.174891717247077, + "learning_rate": 2.6036623685464587e-06, + "loss": 0.6557, + "step": 13902 + }, + { + "epoch": 1.0044250185128325, + "grad_norm": 7.724708635582229, + "learning_rate": 2.603370117013062e-06, + "loss": 0.6287, + "step": 13903 + }, + { + "epoch": 1.004497263713042, + "grad_norm": 6.759441977498436, + "learning_rate": 2.6030778640646077e-06, + "loss": 0.7692, + "step": 13904 + }, + { + "epoch": 1.0045695089132516, + "grad_norm": 6.442918589519184, + "learning_rate": 2.602785609705099e-06, + "loss": 0.647, + "step": 13905 + }, + { + "epoch": 1.0046417541134611, + "grad_norm": 6.051066520571142, + "learning_rate": 2.6024933539385345e-06, + "loss": 0.6599, + "step": 13906 + }, + { + "epoch": 1.0047139993136707, + "grad_norm": 7.258258719631954, + "learning_rate": 2.6022010967689158e-06, + "loss": 0.5928, + "step": 13907 + }, + { + "epoch": 1.0047862445138802, + "grad_norm": 5.944881949957105, + "learning_rate": 2.6019088382002434e-06, + "loss": 0.6315, + "step": 13908 + }, + { + "epoch": 1.0048584897140895, + "grad_norm": 6.148956781001872, + "learning_rate": 2.601616578236518e-06, + "loss": 0.67, + "step": 13909 + }, + { + "epoch": 1.004930734914299, + "grad_norm": 5.4813467506560585, + "learning_rate": 2.6013243168817402e-06, + "loss": 0.6557, + "step": 13910 + }, + { + "epoch": 1.0050029801145086, + "grad_norm": 5.267389652669694, + "learning_rate": 2.6010320541399114e-06, + "loss": 0.6514, + "step": 13911 + }, + { + "epoch": 1.0050752253147182, + "grad_norm": 7.2473315885937835, + "learning_rate": 2.6007397900150328e-06, + "loss": 0.5826, + "step": 13912 + }, + { + "epoch": 1.0051474705149277, + "grad_norm": 6.857357901652605, + "learning_rate": 2.6004475245111036e-06, + "loss": 0.58, + "step": 13913 + }, + { + "epoch": 1.0052197157151372, + "grad_norm": 7.148058338219673, + "learning_rate": 2.6001552576321266e-06, + "loss": 0.6757, + "step": 13914 + }, + { + "epoch": 1.0052919609153468, + "grad_norm": 6.268604207649275, + "learning_rate": 2.599862989382102e-06, + "loss": 0.6047, + "step": 13915 + }, + { + "epoch": 1.005364206115556, + "grad_norm": 7.324958035605984, + "learning_rate": 2.5995707197650307e-06, + "loss": 0.6838, + "step": 13916 + }, + { + "epoch": 1.0054364513157656, + "grad_norm": 7.02122739446287, + "learning_rate": 2.5992784487849132e-06, + "loss": 0.6435, + "step": 13917 + }, + { + "epoch": 1.0055086965159752, + "grad_norm": 5.478314390243555, + "learning_rate": 2.59898617644575e-06, + "loss": 0.6437, + "step": 13918 + }, + { + "epoch": 1.0055809417161847, + "grad_norm": 6.642508349659258, + "learning_rate": 2.598693902751544e-06, + "loss": 0.6593, + "step": 13919 + }, + { + "epoch": 1.0056531869163943, + "grad_norm": 6.361837524219737, + "learning_rate": 2.5984016277062947e-06, + "loss": 0.6486, + "step": 13920 + }, + { + "epoch": 1.0057254321166038, + "grad_norm": 5.854373190643236, + "learning_rate": 2.598109351314004e-06, + "loss": 0.7044, + "step": 13921 + }, + { + "epoch": 1.0057976773168134, + "grad_norm": 6.763833740483017, + "learning_rate": 2.5978170735786717e-06, + "loss": 0.6848, + "step": 13922 + }, + { + "epoch": 1.0058699225170227, + "grad_norm": 7.501790913697995, + "learning_rate": 2.5975247945042998e-06, + "loss": 0.6726, + "step": 13923 + }, + { + "epoch": 1.0059421677172322, + "grad_norm": 6.005170501651577, + "learning_rate": 2.59723251409489e-06, + "loss": 0.6791, + "step": 13924 + }, + { + "epoch": 1.0060144129174418, + "grad_norm": 7.305006573078141, + "learning_rate": 2.5969402323544417e-06, + "loss": 0.7246, + "step": 13925 + }, + { + "epoch": 1.0060866581176513, + "grad_norm": 6.815336704234784, + "learning_rate": 2.5966479492869574e-06, + "loss": 0.6805, + "step": 13926 + }, + { + "epoch": 1.0061589033178608, + "grad_norm": 7.378389775632102, + "learning_rate": 2.5963556648964373e-06, + "loss": 0.6593, + "step": 13927 + }, + { + "epoch": 1.0062311485180704, + "grad_norm": 5.79500119753176, + "learning_rate": 2.596063379186883e-06, + "loss": 0.6836, + "step": 13928 + }, + { + "epoch": 1.00630339371828, + "grad_norm": 5.369226770691689, + "learning_rate": 2.5957710921622953e-06, + "loss": 0.6012, + "step": 13929 + }, + { + "epoch": 1.0063756389184892, + "grad_norm": 5.923624308746452, + "learning_rate": 2.5954788038266765e-06, + "loss": 0.6682, + "step": 13930 + }, + { + "epoch": 1.0064478841186988, + "grad_norm": 6.425688639150257, + "learning_rate": 2.595186514184027e-06, + "loss": 0.6912, + "step": 13931 + }, + { + "epoch": 1.0065201293189083, + "grad_norm": 7.802938484892947, + "learning_rate": 2.5948942232383477e-06, + "loss": 0.7839, + "step": 13932 + }, + { + "epoch": 1.0065923745191179, + "grad_norm": 5.672466152690377, + "learning_rate": 2.59460193099364e-06, + "loss": 0.6844, + "step": 13933 + }, + { + "epoch": 1.0066646197193274, + "grad_norm": 6.350585562072301, + "learning_rate": 2.5943096374539055e-06, + "loss": 0.6862, + "step": 13934 + }, + { + "epoch": 1.006736864919537, + "grad_norm": 6.575309821542068, + "learning_rate": 2.5940173426231457e-06, + "loss": 0.7065, + "step": 13935 + }, + { + "epoch": 1.0068091101197465, + "grad_norm": 7.457710311128798, + "learning_rate": 2.5937250465053605e-06, + "loss": 0.622, + "step": 13936 + }, + { + "epoch": 1.006881355319956, + "grad_norm": 7.414331075548998, + "learning_rate": 2.5934327491045524e-06, + "loss": 0.7146, + "step": 13937 + }, + { + "epoch": 1.0069536005201654, + "grad_norm": 6.9113573892480815, + "learning_rate": 2.5931404504247233e-06, + "loss": 0.6393, + "step": 13938 + }, + { + "epoch": 1.007025845720375, + "grad_norm": 6.611765472055909, + "learning_rate": 2.5928481504698733e-06, + "loss": 0.6422, + "step": 13939 + }, + { + "epoch": 1.0070980909205844, + "grad_norm": 5.7567251930876875, + "learning_rate": 2.5925558492440046e-06, + "loss": 0.6081, + "step": 13940 + }, + { + "epoch": 1.007170336120794, + "grad_norm": 7.949570016808987, + "learning_rate": 2.5922635467511177e-06, + "loss": 0.6913, + "step": 13941 + }, + { + "epoch": 1.0072425813210035, + "grad_norm": 6.641204018874326, + "learning_rate": 2.591971242995214e-06, + "loss": 0.7103, + "step": 13942 + }, + { + "epoch": 1.007314826521213, + "grad_norm": 6.23852339855642, + "learning_rate": 2.591678937980296e-06, + "loss": 0.6664, + "step": 13943 + }, + { + "epoch": 1.0073870717214226, + "grad_norm": 6.770984949223945, + "learning_rate": 2.591386631710365e-06, + "loss": 0.6882, + "step": 13944 + }, + { + "epoch": 1.007459316921632, + "grad_norm": 5.8052848416150225, + "learning_rate": 2.5910943241894214e-06, + "loss": 0.6496, + "step": 13945 + }, + { + "epoch": 1.0075315621218415, + "grad_norm": 6.0364502936093825, + "learning_rate": 2.5908020154214675e-06, + "loss": 0.7116, + "step": 13946 + }, + { + "epoch": 1.007603807322051, + "grad_norm": 5.75369807201475, + "learning_rate": 2.590509705410504e-06, + "loss": 0.7014, + "step": 13947 + }, + { + "epoch": 1.0076760525222606, + "grad_norm": 6.319720378436145, + "learning_rate": 2.590217394160533e-06, + "loss": 0.6425, + "step": 13948 + }, + { + "epoch": 1.00774829772247, + "grad_norm": 5.522237903735595, + "learning_rate": 2.589925081675557e-06, + "loss": 0.6496, + "step": 13949 + }, + { + "epoch": 1.0078205429226796, + "grad_norm": 6.271194532094743, + "learning_rate": 2.589632767959575e-06, + "loss": 0.6825, + "step": 13950 + }, + { + "epoch": 1.0078927881228892, + "grad_norm": 5.782765452768968, + "learning_rate": 2.5893404530165904e-06, + "loss": 0.6372, + "step": 13951 + }, + { + "epoch": 1.0079650333230985, + "grad_norm": 6.503575148760578, + "learning_rate": 2.5890481368506043e-06, + "loss": 0.7906, + "step": 13952 + }, + { + "epoch": 1.008037278523308, + "grad_norm": 6.498628911884029, + "learning_rate": 2.588755819465619e-06, + "loss": 0.6875, + "step": 13953 + }, + { + "epoch": 1.0081095237235176, + "grad_norm": 5.907420854720371, + "learning_rate": 2.588463500865635e-06, + "loss": 0.6442, + "step": 13954 + }, + { + "epoch": 1.0081817689237271, + "grad_norm": 6.517017682208238, + "learning_rate": 2.5881711810546552e-06, + "loss": 0.7003, + "step": 13955 + }, + { + "epoch": 1.0082540141239367, + "grad_norm": 5.483389480597093, + "learning_rate": 2.587878860036679e-06, + "loss": 0.5841, + "step": 13956 + }, + { + "epoch": 1.0083262593241462, + "grad_norm": 8.229431869160845, + "learning_rate": 2.58758653781571e-06, + "loss": 0.7299, + "step": 13957 + }, + { + "epoch": 1.0083985045243558, + "grad_norm": 6.664978608356782, + "learning_rate": 2.5872942143957496e-06, + "loss": 0.6554, + "step": 13958 + }, + { + "epoch": 1.008470749724565, + "grad_norm": 6.6770408339395155, + "learning_rate": 2.5870018897807987e-06, + "loss": 0.6156, + "step": 13959 + }, + { + "epoch": 1.0085429949247746, + "grad_norm": 6.898932675726658, + "learning_rate": 2.58670956397486e-06, + "loss": 0.6776, + "step": 13960 + }, + { + "epoch": 1.0086152401249842, + "grad_norm": 4.872113277928659, + "learning_rate": 2.586417236981934e-06, + "loss": 0.6334, + "step": 13961 + }, + { + "epoch": 1.0086874853251937, + "grad_norm": 6.749957331769801, + "learning_rate": 2.5861249088060237e-06, + "loss": 0.7138, + "step": 13962 + }, + { + "epoch": 1.0087597305254032, + "grad_norm": 5.58984562669212, + "learning_rate": 2.58583257945113e-06, + "loss": 0.6758, + "step": 13963 + }, + { + "epoch": 1.0088319757256128, + "grad_norm": 5.487435467803304, + "learning_rate": 2.5855402489212554e-06, + "loss": 0.66, + "step": 13964 + }, + { + "epoch": 1.0089042209258223, + "grad_norm": 5.4788141560263215, + "learning_rate": 2.585247917220401e-06, + "loss": 0.6015, + "step": 13965 + }, + { + "epoch": 1.0089764661260316, + "grad_norm": 7.034885941416551, + "learning_rate": 2.584955584352568e-06, + "loss": 0.6395, + "step": 13966 + }, + { + "epoch": 1.0090487113262412, + "grad_norm": 6.358422960255468, + "learning_rate": 2.58466325032176e-06, + "loss": 0.6941, + "step": 13967 + }, + { + "epoch": 1.0091209565264507, + "grad_norm": 8.322981342801219, + "learning_rate": 2.5843709151319773e-06, + "loss": 0.7364, + "step": 13968 + }, + { + "epoch": 1.0091932017266603, + "grad_norm": 7.738676597359243, + "learning_rate": 2.584078578787223e-06, + "loss": 0.6818, + "step": 13969 + }, + { + "epoch": 1.0092654469268698, + "grad_norm": 8.011129267379966, + "learning_rate": 2.5837862412914976e-06, + "loss": 0.6852, + "step": 13970 + }, + { + "epoch": 1.0093376921270794, + "grad_norm": 6.849486015694843, + "learning_rate": 2.5834939026488035e-06, + "loss": 0.6755, + "step": 13971 + }, + { + "epoch": 1.009409937327289, + "grad_norm": 6.067654650100058, + "learning_rate": 2.583201562863143e-06, + "loss": 0.6384, + "step": 13972 + }, + { + "epoch": 1.0094821825274984, + "grad_norm": 6.813810729877787, + "learning_rate": 2.5829092219385178e-06, + "loss": 0.7113, + "step": 13973 + }, + { + "epoch": 1.0095544277277078, + "grad_norm": 6.338629699781118, + "learning_rate": 2.58261687987893e-06, + "loss": 0.653, + "step": 13974 + }, + { + "epoch": 1.0096266729279173, + "grad_norm": 6.758773470781937, + "learning_rate": 2.58232453668838e-06, + "loss": 0.7055, + "step": 13975 + }, + { + "epoch": 1.0096989181281268, + "grad_norm": 7.229247548056712, + "learning_rate": 2.5820321923708724e-06, + "loss": 0.6839, + "step": 13976 + }, + { + "epoch": 1.0097711633283364, + "grad_norm": 6.097436973055225, + "learning_rate": 2.5817398469304074e-06, + "loss": 0.6392, + "step": 13977 + }, + { + "epoch": 1.009843408528546, + "grad_norm": 5.271802408547349, + "learning_rate": 2.581447500370987e-06, + "loss": 0.7137, + "step": 13978 + }, + { + "epoch": 1.0099156537287555, + "grad_norm": 5.671047013643691, + "learning_rate": 2.5811551526966138e-06, + "loss": 0.636, + "step": 13979 + }, + { + "epoch": 1.009987898928965, + "grad_norm": 6.961074776371557, + "learning_rate": 2.5808628039112893e-06, + "loss": 0.6431, + "step": 13980 + }, + { + "epoch": 1.0100601441291743, + "grad_norm": 6.351244478604327, + "learning_rate": 2.5805704540190164e-06, + "loss": 0.6933, + "step": 13981 + }, + { + "epoch": 1.0101323893293839, + "grad_norm": 6.218403408801157, + "learning_rate": 2.580278103023796e-06, + "loss": 0.7274, + "step": 13982 + }, + { + "epoch": 1.0102046345295934, + "grad_norm": 6.596346231047421, + "learning_rate": 2.579985750929631e-06, + "loss": 0.6497, + "step": 13983 + }, + { + "epoch": 1.010276879729803, + "grad_norm": 5.607335817107828, + "learning_rate": 2.579693397740523e-06, + "loss": 0.6982, + "step": 13984 + }, + { + "epoch": 1.0103491249300125, + "grad_norm": 6.846020465629672, + "learning_rate": 2.5794010434604745e-06, + "loss": 0.6764, + "step": 13985 + }, + { + "epoch": 1.010421370130222, + "grad_norm": 6.103202491967556, + "learning_rate": 2.5791086880934868e-06, + "loss": 0.7011, + "step": 13986 + }, + { + "epoch": 1.0104936153304316, + "grad_norm": 7.369707697916816, + "learning_rate": 2.578816331643563e-06, + "loss": 0.7009, + "step": 13987 + }, + { + "epoch": 1.010565860530641, + "grad_norm": 8.074920313769695, + "learning_rate": 2.578523974114705e-06, + "loss": 0.6933, + "step": 13988 + }, + { + "epoch": 1.0106381057308504, + "grad_norm": 6.036842401994891, + "learning_rate": 2.5782316155109143e-06, + "loss": 0.6322, + "step": 13989 + }, + { + "epoch": 1.01071035093106, + "grad_norm": 6.040929429246846, + "learning_rate": 2.5779392558361944e-06, + "loss": 0.6891, + "step": 13990 + }, + { + "epoch": 1.0107825961312695, + "grad_norm": 5.391241154494305, + "learning_rate": 2.5776468950945455e-06, + "loss": 0.6201, + "step": 13991 + }, + { + "epoch": 1.010854841331479, + "grad_norm": 6.582631938929003, + "learning_rate": 2.577354533289972e-06, + "loss": 0.6639, + "step": 13992 + }, + { + "epoch": 1.0109270865316886, + "grad_norm": 5.969833834842055, + "learning_rate": 2.5770621704264735e-06, + "loss": 0.6801, + "step": 13993 + }, + { + "epoch": 1.0109993317318982, + "grad_norm": 5.905771710441372, + "learning_rate": 2.576769806508055e-06, + "loss": 0.6722, + "step": 13994 + }, + { + "epoch": 1.0110715769321075, + "grad_norm": 6.322364577731955, + "learning_rate": 2.576477441538717e-06, + "loss": 0.6357, + "step": 13995 + }, + { + "epoch": 1.011143822132317, + "grad_norm": 6.083558936268695, + "learning_rate": 2.576185075522462e-06, + "loss": 0.6332, + "step": 13996 + }, + { + "epoch": 1.0112160673325266, + "grad_norm": 6.9475085945529225, + "learning_rate": 2.5758927084632936e-06, + "loss": 0.6844, + "step": 13997 + }, + { + "epoch": 1.011288312532736, + "grad_norm": 7.818334735237841, + "learning_rate": 2.5756003403652117e-06, + "loss": 0.6839, + "step": 13998 + }, + { + "epoch": 1.0113605577329456, + "grad_norm": 5.643245352554756, + "learning_rate": 2.57530797123222e-06, + "loss": 0.5942, + "step": 13999 + }, + { + "epoch": 1.0114328029331552, + "grad_norm": 6.182113306375202, + "learning_rate": 2.575015601068321e-06, + "loss": 0.623, + "step": 14000 + }, + { + "epoch": 1.0115050481333647, + "grad_norm": 6.661266746678905, + "learning_rate": 2.5747232298775164e-06, + "loss": 0.6942, + "step": 14001 + }, + { + "epoch": 1.011577293333574, + "grad_norm": 6.248524300882625, + "learning_rate": 2.574430857663809e-06, + "loss": 0.6471, + "step": 14002 + }, + { + "epoch": 1.0116495385337836, + "grad_norm": 6.250494365213868, + "learning_rate": 2.5741384844312007e-06, + "loss": 0.6528, + "step": 14003 + }, + { + "epoch": 1.0117217837339931, + "grad_norm": 6.035182481233448, + "learning_rate": 2.5738461101836943e-06, + "loss": 0.602, + "step": 14004 + }, + { + "epoch": 1.0117940289342027, + "grad_norm": 5.4462219091868125, + "learning_rate": 2.573553734925292e-06, + "loss": 0.6744, + "step": 14005 + }, + { + "epoch": 1.0118662741344122, + "grad_norm": 7.28064276655834, + "learning_rate": 2.5732613586599964e-06, + "loss": 0.6918, + "step": 14006 + }, + { + "epoch": 1.0119385193346218, + "grad_norm": 5.39180647576891, + "learning_rate": 2.5729689813918097e-06, + "loss": 0.656, + "step": 14007 + }, + { + "epoch": 1.0120107645348313, + "grad_norm": 6.524172804274063, + "learning_rate": 2.5726766031247335e-06, + "loss": 0.751, + "step": 14008 + }, + { + "epoch": 1.0120830097350408, + "grad_norm": 6.883098756550834, + "learning_rate": 2.572384223862772e-06, + "loss": 0.6494, + "step": 14009 + }, + { + "epoch": 1.0121552549352502, + "grad_norm": 7.173824402735217, + "learning_rate": 2.572091843609926e-06, + "loss": 0.675, + "step": 14010 + }, + { + "epoch": 1.0122275001354597, + "grad_norm": 6.2661019997475496, + "learning_rate": 2.5717994623701988e-06, + "loss": 0.7045, + "step": 14011 + }, + { + "epoch": 1.0122997453356692, + "grad_norm": 5.953592757623989, + "learning_rate": 2.5715070801475935e-06, + "loss": 0.6357, + "step": 14012 + }, + { + "epoch": 1.0123719905358788, + "grad_norm": 7.457346362514471, + "learning_rate": 2.5712146969461104e-06, + "loss": 0.7232, + "step": 14013 + }, + { + "epoch": 1.0124442357360883, + "grad_norm": 5.8089276893122195, + "learning_rate": 2.570922312769754e-06, + "loss": 0.6223, + "step": 14014 + }, + { + "epoch": 1.0125164809362979, + "grad_norm": 6.424634503013728, + "learning_rate": 2.5706299276225267e-06, + "loss": 0.6882, + "step": 14015 + }, + { + "epoch": 1.0125887261365074, + "grad_norm": 6.03533702196903, + "learning_rate": 2.5703375415084297e-06, + "loss": 0.6341, + "step": 14016 + }, + { + "epoch": 1.0126609713367167, + "grad_norm": 5.516547712136488, + "learning_rate": 2.570045154431467e-06, + "loss": 0.6284, + "step": 14017 + }, + { + "epoch": 1.0127332165369263, + "grad_norm": 7.081414744522139, + "learning_rate": 2.56975276639564e-06, + "loss": 0.7753, + "step": 14018 + }, + { + "epoch": 1.0128054617371358, + "grad_norm": 6.5688007443516, + "learning_rate": 2.5694603774049528e-06, + "loss": 0.6552, + "step": 14019 + }, + { + "epoch": 1.0128777069373454, + "grad_norm": 7.0042531852514776, + "learning_rate": 2.5691679874634057e-06, + "loss": 0.7464, + "step": 14020 + }, + { + "epoch": 1.012949952137555, + "grad_norm": 8.707684019664937, + "learning_rate": 2.5688755965750033e-06, + "loss": 0.7698, + "step": 14021 + }, + { + "epoch": 1.0130221973377644, + "grad_norm": 7.106384855817629, + "learning_rate": 2.5685832047437474e-06, + "loss": 0.6345, + "step": 14022 + }, + { + "epoch": 1.013094442537974, + "grad_norm": 5.8550742680036, + "learning_rate": 2.5682908119736405e-06, + "loss": 0.6754, + "step": 14023 + }, + { + "epoch": 1.0131666877381833, + "grad_norm": 6.6807976051235824, + "learning_rate": 2.567998418268686e-06, + "loss": 0.6658, + "step": 14024 + }, + { + "epoch": 1.0132389329383928, + "grad_norm": 5.726621120095602, + "learning_rate": 2.5677060236328847e-06, + "loss": 0.6949, + "step": 14025 + }, + { + "epoch": 1.0133111781386024, + "grad_norm": 5.743608986469494, + "learning_rate": 2.5674136280702424e-06, + "loss": 0.6404, + "step": 14026 + }, + { + "epoch": 1.013383423338812, + "grad_norm": 5.990926557660936, + "learning_rate": 2.567121231584758e-06, + "loss": 0.6562, + "step": 14027 + }, + { + "epoch": 1.0134556685390215, + "grad_norm": 6.351555593188896, + "learning_rate": 2.5668288341804366e-06, + "loss": 0.6591, + "step": 14028 + }, + { + "epoch": 1.013527913739231, + "grad_norm": 6.694207366220656, + "learning_rate": 2.5665364358612805e-06, + "loss": 0.7031, + "step": 14029 + }, + { + "epoch": 1.0136001589394406, + "grad_norm": 7.62933251537295, + "learning_rate": 2.5662440366312926e-06, + "loss": 0.6661, + "step": 14030 + }, + { + "epoch": 1.0136724041396499, + "grad_norm": 5.815810378623723, + "learning_rate": 2.5659516364944754e-06, + "loss": 0.683, + "step": 14031 + }, + { + "epoch": 1.0137446493398594, + "grad_norm": 9.013480369376783, + "learning_rate": 2.56565923545483e-06, + "loss": 0.7155, + "step": 14032 + }, + { + "epoch": 1.013816894540069, + "grad_norm": 7.341118004078445, + "learning_rate": 2.565366833516362e-06, + "loss": 0.6627, + "step": 14033 + }, + { + "epoch": 1.0138891397402785, + "grad_norm": 6.439984777207479, + "learning_rate": 2.5650744306830728e-06, + "loss": 0.6438, + "step": 14034 + }, + { + "epoch": 1.013961384940488, + "grad_norm": 6.115388140333661, + "learning_rate": 2.564782026958965e-06, + "loss": 0.6232, + "step": 14035 + }, + { + "epoch": 1.0140336301406976, + "grad_norm": 5.74901273793488, + "learning_rate": 2.5644896223480416e-06, + "loss": 0.6694, + "step": 14036 + }, + { + "epoch": 1.0141058753409071, + "grad_norm": 6.594051697685426, + "learning_rate": 2.5641972168543043e-06, + "loss": 0.6655, + "step": 14037 + }, + { + "epoch": 1.0141781205411164, + "grad_norm": 6.510364778439058, + "learning_rate": 2.5639048104817576e-06, + "loss": 0.6817, + "step": 14038 + }, + { + "epoch": 1.014250365741326, + "grad_norm": 7.16351748923694, + "learning_rate": 2.5636124032344046e-06, + "loss": 0.7354, + "step": 14039 + }, + { + "epoch": 1.0143226109415355, + "grad_norm": 6.084868703286084, + "learning_rate": 2.5633199951162467e-06, + "loss": 0.6647, + "step": 14040 + }, + { + "epoch": 1.014394856141745, + "grad_norm": 6.31247461899291, + "learning_rate": 2.563027586131287e-06, + "loss": 0.7324, + "step": 14041 + }, + { + "epoch": 1.0144671013419546, + "grad_norm": 5.899237774250049, + "learning_rate": 2.5627351762835284e-06, + "loss": 0.6415, + "step": 14042 + }, + { + "epoch": 1.0145393465421642, + "grad_norm": 5.715044332283933, + "learning_rate": 2.562442765576974e-06, + "loss": 0.6213, + "step": 14043 + }, + { + "epoch": 1.0146115917423737, + "grad_norm": 5.5116190857396035, + "learning_rate": 2.562150354015627e-06, + "loss": 0.6764, + "step": 14044 + }, + { + "epoch": 1.014683836942583, + "grad_norm": 7.076569982159602, + "learning_rate": 2.5618579416034896e-06, + "loss": 0.6839, + "step": 14045 + }, + { + "epoch": 1.0147560821427926, + "grad_norm": 6.5044895219879, + "learning_rate": 2.5615655283445646e-06, + "loss": 0.6557, + "step": 14046 + }, + { + "epoch": 1.014828327343002, + "grad_norm": 6.584570981636316, + "learning_rate": 2.5612731142428567e-06, + "loss": 0.6394, + "step": 14047 + }, + { + "epoch": 1.0149005725432116, + "grad_norm": 4.840539274201573, + "learning_rate": 2.5609806993023663e-06, + "loss": 0.6765, + "step": 14048 + }, + { + "epoch": 1.0149728177434212, + "grad_norm": 8.373459403836344, + "learning_rate": 2.560688283527098e-06, + "loss": 0.6888, + "step": 14049 + }, + { + "epoch": 1.0150450629436307, + "grad_norm": 7.8061040713473115, + "learning_rate": 2.560395866921053e-06, + "loss": 0.7696, + "step": 14050 + }, + { + "epoch": 1.0151173081438403, + "grad_norm": 8.066080880241753, + "learning_rate": 2.5601034494882365e-06, + "loss": 0.702, + "step": 14051 + }, + { + "epoch": 1.0151895533440498, + "grad_norm": 5.6459798570976965, + "learning_rate": 2.5598110312326502e-06, + "loss": 0.6219, + "step": 14052 + }, + { + "epoch": 1.0152617985442591, + "grad_norm": 6.883537955702551, + "learning_rate": 2.559518612158298e-06, + "loss": 0.7319, + "step": 14053 + }, + { + "epoch": 1.0153340437444687, + "grad_norm": 5.411609440720164, + "learning_rate": 2.5592261922691813e-06, + "loss": 0.6467, + "step": 14054 + }, + { + "epoch": 1.0154062889446782, + "grad_norm": 8.306270703246161, + "learning_rate": 2.5589337715693052e-06, + "loss": 0.7241, + "step": 14055 + }, + { + "epoch": 1.0154785341448878, + "grad_norm": 5.974850717379532, + "learning_rate": 2.5586413500626705e-06, + "loss": 0.6388, + "step": 14056 + }, + { + "epoch": 1.0155507793450973, + "grad_norm": 6.169539461182328, + "learning_rate": 2.5583489277532815e-06, + "loss": 0.7233, + "step": 14057 + }, + { + "epoch": 1.0156230245453068, + "grad_norm": 5.730802582194224, + "learning_rate": 2.5580565046451413e-06, + "loss": 0.5937, + "step": 14058 + }, + { + "epoch": 1.0156952697455164, + "grad_norm": 5.931907188095882, + "learning_rate": 2.5577640807422516e-06, + "loss": 0.6492, + "step": 14059 + }, + { + "epoch": 1.0157675149457257, + "grad_norm": 5.773736473673841, + "learning_rate": 2.5574716560486173e-06, + "loss": 0.7336, + "step": 14060 + }, + { + "epoch": 1.0158397601459352, + "grad_norm": 5.613999014924673, + "learning_rate": 2.55717923056824e-06, + "loss": 0.6852, + "step": 14061 + }, + { + "epoch": 1.0159120053461448, + "grad_norm": 5.720245332591869, + "learning_rate": 2.556886804305124e-06, + "loss": 0.6721, + "step": 14062 + }, + { + "epoch": 1.0159842505463543, + "grad_norm": 7.675257665428106, + "learning_rate": 2.5565943772632727e-06, + "loss": 0.728, + "step": 14063 + }, + { + "epoch": 1.0160564957465639, + "grad_norm": 7.181512079859911, + "learning_rate": 2.5563019494466872e-06, + "loss": 0.6377, + "step": 14064 + }, + { + "epoch": 1.0161287409467734, + "grad_norm": 6.442316124053238, + "learning_rate": 2.5560095208593717e-06, + "loss": 0.6847, + "step": 14065 + }, + { + "epoch": 1.016200986146983, + "grad_norm": 5.502359057731085, + "learning_rate": 2.555717091505329e-06, + "loss": 0.6491, + "step": 14066 + }, + { + "epoch": 1.0162732313471923, + "grad_norm": 5.7052920888069805, + "learning_rate": 2.555424661388564e-06, + "loss": 0.6474, + "step": 14067 + }, + { + "epoch": 1.0163454765474018, + "grad_norm": 5.1570200951913145, + "learning_rate": 2.555132230513077e-06, + "loss": 0.6446, + "step": 14068 + }, + { + "epoch": 1.0164177217476114, + "grad_norm": 8.067485362141005, + "learning_rate": 2.5548397988828737e-06, + "loss": 0.7901, + "step": 14069 + }, + { + "epoch": 1.016489966947821, + "grad_norm": 7.711258561736666, + "learning_rate": 2.554547366501955e-06, + "loss": 0.6804, + "step": 14070 + }, + { + "epoch": 1.0165622121480304, + "grad_norm": 8.908286413979745, + "learning_rate": 2.554254933374326e-06, + "loss": 0.6982, + "step": 14071 + }, + { + "epoch": 1.01663445734824, + "grad_norm": 6.796333379102466, + "learning_rate": 2.5539624995039893e-06, + "loss": 0.6559, + "step": 14072 + }, + { + "epoch": 1.0167067025484495, + "grad_norm": 5.957016841683395, + "learning_rate": 2.5536700648949475e-06, + "loss": 0.6732, + "step": 14073 + }, + { + "epoch": 1.0167789477486588, + "grad_norm": 8.560684791339982, + "learning_rate": 2.5533776295512038e-06, + "loss": 0.622, + "step": 14074 + }, + { + "epoch": 1.0168511929488684, + "grad_norm": 7.206104382303406, + "learning_rate": 2.553085193476762e-06, + "loss": 0.6627, + "step": 14075 + }, + { + "epoch": 1.016923438149078, + "grad_norm": 6.0009377064700145, + "learning_rate": 2.5527927566756256e-06, + "loss": 0.599, + "step": 14076 + }, + { + "epoch": 1.0169956833492875, + "grad_norm": 8.769698721257253, + "learning_rate": 2.5525003191517965e-06, + "loss": 0.6317, + "step": 14077 + }, + { + "epoch": 1.017067928549497, + "grad_norm": 6.914075189783553, + "learning_rate": 2.55220788090928e-06, + "loss": 0.7144, + "step": 14078 + }, + { + "epoch": 1.0171401737497066, + "grad_norm": 6.861889910373806, + "learning_rate": 2.551915441952077e-06, + "loss": 0.6242, + "step": 14079 + }, + { + "epoch": 1.017212418949916, + "grad_norm": 6.8091234711281565, + "learning_rate": 2.5516230022841927e-06, + "loss": 0.6008, + "step": 14080 + }, + { + "epoch": 1.0172846641501254, + "grad_norm": 7.196597503334475, + "learning_rate": 2.551330561909629e-06, + "loss": 0.6907, + "step": 14081 + }, + { + "epoch": 1.017356909350335, + "grad_norm": 7.097931154813126, + "learning_rate": 2.55103812083239e-06, + "loss": 0.6493, + "step": 14082 + }, + { + "epoch": 1.0174291545505445, + "grad_norm": 6.201334889589644, + "learning_rate": 2.5507456790564795e-06, + "loss": 0.6713, + "step": 14083 + }, + { + "epoch": 1.017501399750754, + "grad_norm": 7.003404879137222, + "learning_rate": 2.550453236585898e-06, + "loss": 0.7026, + "step": 14084 + }, + { + "epoch": 1.0175736449509636, + "grad_norm": 7.644569673670682, + "learning_rate": 2.5501607934246525e-06, + "loss": 0.6598, + "step": 14085 + }, + { + "epoch": 1.0176458901511731, + "grad_norm": 7.7681843705931275, + "learning_rate": 2.5498683495767445e-06, + "loss": 0.6799, + "step": 14086 + }, + { + "epoch": 1.0177181353513827, + "grad_norm": 6.2105063234768, + "learning_rate": 2.5495759050461775e-06, + "loss": 0.6507, + "step": 14087 + }, + { + "epoch": 1.0177903805515922, + "grad_norm": 5.192569323955341, + "learning_rate": 2.5492834598369547e-06, + "loss": 0.643, + "step": 14088 + }, + { + "epoch": 1.0178626257518015, + "grad_norm": 7.085289299872703, + "learning_rate": 2.5489910139530793e-06, + "loss": 0.7004, + "step": 14089 + }, + { + "epoch": 1.017934870952011, + "grad_norm": 6.7583507169162065, + "learning_rate": 2.548698567398556e-06, + "loss": 0.7094, + "step": 14090 + }, + { + "epoch": 1.0180071161522206, + "grad_norm": 5.984864536113161, + "learning_rate": 2.548406120177386e-06, + "loss": 0.7383, + "step": 14091 + }, + { + "epoch": 1.0180793613524302, + "grad_norm": 7.014980091112942, + "learning_rate": 2.5481136722935747e-06, + "loss": 0.7333, + "step": 14092 + }, + { + "epoch": 1.0181516065526397, + "grad_norm": 6.514717752427869, + "learning_rate": 2.5478212237511242e-06, + "loss": 0.6567, + "step": 14093 + }, + { + "epoch": 1.0182238517528492, + "grad_norm": 7.573280658046228, + "learning_rate": 2.5475287745540376e-06, + "loss": 0.6535, + "step": 14094 + }, + { + "epoch": 1.0182960969530588, + "grad_norm": 7.550229545286914, + "learning_rate": 2.547236324706319e-06, + "loss": 0.7245, + "step": 14095 + }, + { + "epoch": 1.018368342153268, + "grad_norm": 7.245079475425001, + "learning_rate": 2.5469438742119728e-06, + "loss": 0.6972, + "step": 14096 + }, + { + "epoch": 1.0184405873534776, + "grad_norm": 6.489172500497343, + "learning_rate": 2.5466514230750016e-06, + "loss": 0.5927, + "step": 14097 + }, + { + "epoch": 1.0185128325536872, + "grad_norm": 6.3587187257345965, + "learning_rate": 2.5463589712994073e-06, + "loss": 0.6784, + "step": 14098 + }, + { + "epoch": 1.0185850777538967, + "grad_norm": 7.794205875722959, + "learning_rate": 2.546066518889196e-06, + "loss": 0.674, + "step": 14099 + }, + { + "epoch": 1.0186573229541063, + "grad_norm": 6.3594154900238955, + "learning_rate": 2.545774065848369e-06, + "loss": 0.7155, + "step": 14100 + }, + { + "epoch": 1.0187295681543158, + "grad_norm": 7.174805152719633, + "learning_rate": 2.5454816121809307e-06, + "loss": 0.6804, + "step": 14101 + }, + { + "epoch": 1.0188018133545254, + "grad_norm": 8.295249535484574, + "learning_rate": 2.5451891578908844e-06, + "loss": 0.7042, + "step": 14102 + }, + { + "epoch": 1.0188740585547347, + "grad_norm": 6.953927051704647, + "learning_rate": 2.5448967029822335e-06, + "loss": 0.6696, + "step": 14103 + }, + { + "epoch": 1.0189463037549442, + "grad_norm": 6.168023793492519, + "learning_rate": 2.544604247458982e-06, + "loss": 0.704, + "step": 14104 + }, + { + "epoch": 1.0190185489551538, + "grad_norm": 5.510384552927552, + "learning_rate": 2.544311791325133e-06, + "loss": 0.622, + "step": 14105 + }, + { + "epoch": 1.0190907941553633, + "grad_norm": 6.74911097570653, + "learning_rate": 2.5440193345846905e-06, + "loss": 0.6627, + "step": 14106 + }, + { + "epoch": 1.0191630393555728, + "grad_norm": 5.6602142099344395, + "learning_rate": 2.5437268772416563e-06, + "loss": 0.6635, + "step": 14107 + }, + { + "epoch": 1.0192352845557824, + "grad_norm": 5.5822178120065855, + "learning_rate": 2.5434344193000353e-06, + "loss": 0.6777, + "step": 14108 + }, + { + "epoch": 1.019307529755992, + "grad_norm": 6.780915845381687, + "learning_rate": 2.5431419607638306e-06, + "loss": 0.75, + "step": 14109 + }, + { + "epoch": 1.0193797749562012, + "grad_norm": 5.936822471109171, + "learning_rate": 2.542849501637047e-06, + "loss": 0.6417, + "step": 14110 + }, + { + "epoch": 1.0194520201564108, + "grad_norm": 7.282861621284273, + "learning_rate": 2.542557041923687e-06, + "loss": 0.6594, + "step": 14111 + }, + { + "epoch": 1.0195242653566203, + "grad_norm": 6.198003312047545, + "learning_rate": 2.542264581627753e-06, + "loss": 0.6748, + "step": 14112 + }, + { + "epoch": 1.0195965105568299, + "grad_norm": 6.946899370089817, + "learning_rate": 2.5419721207532504e-06, + "loss": 0.7079, + "step": 14113 + }, + { + "epoch": 1.0196687557570394, + "grad_norm": 6.750152868729767, + "learning_rate": 2.541679659304182e-06, + "loss": 0.6614, + "step": 14114 + }, + { + "epoch": 1.019741000957249, + "grad_norm": 6.325540198375909, + "learning_rate": 2.541387197284552e-06, + "loss": 0.6699, + "step": 14115 + }, + { + "epoch": 1.0198132461574585, + "grad_norm": 10.24280127832927, + "learning_rate": 2.541094734698362e-06, + "loss": 0.638, + "step": 14116 + }, + { + "epoch": 1.0198854913576678, + "grad_norm": 6.584779829951458, + "learning_rate": 2.540802271549618e-06, + "loss": 0.6992, + "step": 14117 + }, + { + "epoch": 1.0199577365578774, + "grad_norm": 6.029044108626966, + "learning_rate": 2.540509807842322e-06, + "loss": 0.649, + "step": 14118 + }, + { + "epoch": 1.020029981758087, + "grad_norm": 6.277930146265305, + "learning_rate": 2.540217343580479e-06, + "loss": 0.6894, + "step": 14119 + }, + { + "epoch": 1.0201022269582964, + "grad_norm": 6.649500617299968, + "learning_rate": 2.5399248787680906e-06, + "loss": 0.7268, + "step": 14120 + }, + { + "epoch": 1.020174472158506, + "grad_norm": 6.802961478767302, + "learning_rate": 2.539632413409163e-06, + "loss": 0.7076, + "step": 14121 + }, + { + "epoch": 1.0202467173587155, + "grad_norm": 6.17687322335361, + "learning_rate": 2.539339947507698e-06, + "loss": 0.6243, + "step": 14122 + }, + { + "epoch": 1.020318962558925, + "grad_norm": 5.355263247675533, + "learning_rate": 2.539047481067699e-06, + "loss": 0.6788, + "step": 14123 + }, + { + "epoch": 1.0203912077591346, + "grad_norm": 6.316720590218676, + "learning_rate": 2.538755014093171e-06, + "loss": 0.7233, + "step": 14124 + }, + { + "epoch": 1.020463452959344, + "grad_norm": 7.004812493656705, + "learning_rate": 2.538462546588117e-06, + "loss": 0.7047, + "step": 14125 + }, + { + "epoch": 1.0205356981595535, + "grad_norm": 7.2491186198158015, + "learning_rate": 2.5381700785565407e-06, + "loss": 0.7397, + "step": 14126 + }, + { + "epoch": 1.020607943359763, + "grad_norm": 5.969326770337209, + "learning_rate": 2.5378776100024443e-06, + "loss": 0.6612, + "step": 14127 + }, + { + "epoch": 1.0206801885599726, + "grad_norm": 6.031412231160897, + "learning_rate": 2.537585140929834e-06, + "loss": 0.7638, + "step": 14128 + }, + { + "epoch": 1.020752433760182, + "grad_norm": 6.660447780459695, + "learning_rate": 2.5372926713427125e-06, + "loss": 0.6862, + "step": 14129 + }, + { + "epoch": 1.0208246789603916, + "grad_norm": 6.239199958278687, + "learning_rate": 2.537000201245083e-06, + "loss": 0.6196, + "step": 14130 + }, + { + "epoch": 1.0208969241606012, + "grad_norm": 8.455716735310645, + "learning_rate": 2.5367077306409486e-06, + "loss": 0.6242, + "step": 14131 + }, + { + "epoch": 1.0209691693608105, + "grad_norm": 6.0467526130217975, + "learning_rate": 2.5364152595343143e-06, + "loss": 0.677, + "step": 14132 + }, + { + "epoch": 1.02104141456102, + "grad_norm": 5.711661192497403, + "learning_rate": 2.5361227879291845e-06, + "loss": 0.7104, + "step": 14133 + }, + { + "epoch": 1.0211136597612296, + "grad_norm": 7.398283073642434, + "learning_rate": 2.5358303158295607e-06, + "loss": 0.715, + "step": 14134 + }, + { + "epoch": 1.0211859049614391, + "grad_norm": 6.150681819182351, + "learning_rate": 2.5355378432394484e-06, + "loss": 0.6504, + "step": 14135 + }, + { + "epoch": 1.0212581501616487, + "grad_norm": 5.460938023906896, + "learning_rate": 2.53524537016285e-06, + "loss": 0.6623, + "step": 14136 + }, + { + "epoch": 1.0213303953618582, + "grad_norm": 6.147407493934689, + "learning_rate": 2.5349528966037694e-06, + "loss": 0.6571, + "step": 14137 + }, + { + "epoch": 1.0214026405620678, + "grad_norm": 6.7422876450050895, + "learning_rate": 2.5346604225662117e-06, + "loss": 0.6044, + "step": 14138 + }, + { + "epoch": 1.021474885762277, + "grad_norm": 5.884947127345359, + "learning_rate": 2.5343679480541792e-06, + "loss": 0.6521, + "step": 14139 + }, + { + "epoch": 1.0215471309624866, + "grad_norm": 6.990824544187702, + "learning_rate": 2.534075473071677e-06, + "loss": 0.7435, + "step": 14140 + }, + { + "epoch": 1.0216193761626962, + "grad_norm": 6.278396489685042, + "learning_rate": 2.5337829976227067e-06, + "loss": 0.6239, + "step": 14141 + }, + { + "epoch": 1.0216916213629057, + "grad_norm": 6.441587465234453, + "learning_rate": 2.533490521711275e-06, + "loss": 0.6292, + "step": 14142 + }, + { + "epoch": 1.0217638665631152, + "grad_norm": 6.547004352466168, + "learning_rate": 2.533198045341383e-06, + "loss": 0.6239, + "step": 14143 + }, + { + "epoch": 1.0218361117633248, + "grad_norm": 6.2040067954010825, + "learning_rate": 2.5329055685170363e-06, + "loss": 0.6663, + "step": 14144 + }, + { + "epoch": 1.0219083569635343, + "grad_norm": 8.062235509984625, + "learning_rate": 2.532613091242237e-06, + "loss": 0.67, + "step": 14145 + }, + { + "epoch": 1.0219806021637436, + "grad_norm": 6.358189277800244, + "learning_rate": 2.53232061352099e-06, + "loss": 0.637, + "step": 14146 + }, + { + "epoch": 1.0220528473639532, + "grad_norm": 6.6047312337755395, + "learning_rate": 2.532028135357299e-06, + "loss": 0.6146, + "step": 14147 + }, + { + "epoch": 1.0221250925641627, + "grad_norm": 7.661726009240596, + "learning_rate": 2.5317356567551676e-06, + "loss": 0.6935, + "step": 14148 + }, + { + "epoch": 1.0221973377643723, + "grad_norm": 6.934852713607105, + "learning_rate": 2.5314431777186006e-06, + "loss": 0.6953, + "step": 14149 + }, + { + "epoch": 1.0222695829645818, + "grad_norm": 6.667652089086796, + "learning_rate": 2.5311506982516e-06, + "loss": 0.6594, + "step": 14150 + }, + { + "epoch": 1.0223418281647914, + "grad_norm": 9.4139583597715, + "learning_rate": 2.5308582183581705e-06, + "loss": 0.7414, + "step": 14151 + }, + { + "epoch": 1.022414073365001, + "grad_norm": 7.944226876268143, + "learning_rate": 2.530565738042316e-06, + "loss": 0.6291, + "step": 14152 + }, + { + "epoch": 1.0224863185652102, + "grad_norm": 5.669556198916788, + "learning_rate": 2.530273257308041e-06, + "loss": 0.6089, + "step": 14153 + }, + { + "epoch": 1.0225585637654198, + "grad_norm": 6.288756482752789, + "learning_rate": 2.529980776159348e-06, + "loss": 0.6523, + "step": 14154 + }, + { + "epoch": 1.0226308089656293, + "grad_norm": 7.515455723876552, + "learning_rate": 2.5296882946002404e-06, + "loss": 0.7026, + "step": 14155 + }, + { + "epoch": 1.0227030541658388, + "grad_norm": 7.4448257399251165, + "learning_rate": 2.5293958126347245e-06, + "loss": 0.7203, + "step": 14156 + }, + { + "epoch": 1.0227752993660484, + "grad_norm": 6.846576542445319, + "learning_rate": 2.5291033302668027e-06, + "loss": 0.6303, + "step": 14157 + }, + { + "epoch": 1.022847544566258, + "grad_norm": 5.956452328426738, + "learning_rate": 2.528810847500479e-06, + "loss": 0.6417, + "step": 14158 + }, + { + "epoch": 1.0229197897664675, + "grad_norm": 7.167595411343535, + "learning_rate": 2.5285183643397565e-06, + "loss": 0.5875, + "step": 14159 + }, + { + "epoch": 1.022992034966677, + "grad_norm": 7.326614972604136, + "learning_rate": 2.5282258807886403e-06, + "loss": 0.6956, + "step": 14160 + }, + { + "epoch": 1.0230642801668863, + "grad_norm": 6.831287760163394, + "learning_rate": 2.5279333968511326e-06, + "loss": 0.6876, + "step": 14161 + }, + { + "epoch": 1.0231365253670959, + "grad_norm": 7.479010138472015, + "learning_rate": 2.52764091253124e-06, + "loss": 0.7425, + "step": 14162 + }, + { + "epoch": 1.0232087705673054, + "grad_norm": 5.872925087592439, + "learning_rate": 2.527348427832964e-06, + "loss": 0.6127, + "step": 14163 + }, + { + "epoch": 1.023281015767515, + "grad_norm": 7.405286766542304, + "learning_rate": 2.527055942760309e-06, + "loss": 0.5978, + "step": 14164 + }, + { + "epoch": 1.0233532609677245, + "grad_norm": 6.286471649467653, + "learning_rate": 2.5267634573172795e-06, + "loss": 0.6128, + "step": 14165 + }, + { + "epoch": 1.023425506167934, + "grad_norm": 7.060298610348231, + "learning_rate": 2.526470971507879e-06, + "loss": 0.6753, + "step": 14166 + }, + { + "epoch": 1.0234977513681436, + "grad_norm": 7.78539487451027, + "learning_rate": 2.5261784853361114e-06, + "loss": 0.6765, + "step": 14167 + }, + { + "epoch": 1.023569996568353, + "grad_norm": 10.745578655071313, + "learning_rate": 2.5258859988059807e-06, + "loss": 0.7368, + "step": 14168 + }, + { + "epoch": 1.0236422417685624, + "grad_norm": 7.413110316144376, + "learning_rate": 2.525593511921491e-06, + "loss": 0.6699, + "step": 14169 + }, + { + "epoch": 1.023714486968772, + "grad_norm": 6.444203858251166, + "learning_rate": 2.5253010246866457e-06, + "loss": 0.6698, + "step": 14170 + }, + { + "epoch": 1.0237867321689815, + "grad_norm": 5.737486449913364, + "learning_rate": 2.5250085371054487e-06, + "loss": 0.6683, + "step": 14171 + }, + { + "epoch": 1.023858977369191, + "grad_norm": 7.2117026802341, + "learning_rate": 2.5247160491819052e-06, + "loss": 0.7352, + "step": 14172 + }, + { + "epoch": 1.0239312225694006, + "grad_norm": 6.3674238477031535, + "learning_rate": 2.5244235609200174e-06, + "loss": 0.6787, + "step": 14173 + }, + { + "epoch": 1.0240034677696102, + "grad_norm": 6.469305484528339, + "learning_rate": 2.52413107232379e-06, + "loss": 0.6567, + "step": 14174 + }, + { + "epoch": 1.0240757129698195, + "grad_norm": 5.4218187136847815, + "learning_rate": 2.523838583397227e-06, + "loss": 0.6328, + "step": 14175 + }, + { + "epoch": 1.024147958170029, + "grad_norm": 6.326050368835437, + "learning_rate": 2.523546094144333e-06, + "loss": 0.6634, + "step": 14176 + }, + { + "epoch": 1.0242202033702386, + "grad_norm": 7.402833303402741, + "learning_rate": 2.5232536045691103e-06, + "loss": 0.6269, + "step": 14177 + }, + { + "epoch": 1.024292448570448, + "grad_norm": 5.913027149919371, + "learning_rate": 2.5229611146755647e-06, + "loss": 0.7269, + "step": 14178 + }, + { + "epoch": 1.0243646937706576, + "grad_norm": 7.290544725115951, + "learning_rate": 2.5226686244676982e-06, + "loss": 0.6581, + "step": 14179 + }, + { + "epoch": 1.0244369389708672, + "grad_norm": 9.627702606058369, + "learning_rate": 2.5223761339495166e-06, + "loss": 0.7227, + "step": 14180 + }, + { + "epoch": 1.0245091841710767, + "grad_norm": 7.402647276923504, + "learning_rate": 2.5220836431250234e-06, + "loss": 0.6041, + "step": 14181 + }, + { + "epoch": 1.024581429371286, + "grad_norm": 7.0971408083684375, + "learning_rate": 2.5217911519982215e-06, + "loss": 0.6421, + "step": 14182 + }, + { + "epoch": 1.0246536745714956, + "grad_norm": 6.573789060062786, + "learning_rate": 2.5214986605731158e-06, + "loss": 0.6537, + "step": 14183 + }, + { + "epoch": 1.0247259197717051, + "grad_norm": 6.867484128619151, + "learning_rate": 2.5212061688537097e-06, + "loss": 0.6819, + "step": 14184 + }, + { + "epoch": 1.0247981649719147, + "grad_norm": 6.352523674678711, + "learning_rate": 2.520913676844009e-06, + "loss": 0.752, + "step": 14185 + }, + { + "epoch": 1.0248704101721242, + "grad_norm": 6.388478709151745, + "learning_rate": 2.520621184548015e-06, + "loss": 0.5811, + "step": 14186 + }, + { + "epoch": 1.0249426553723338, + "grad_norm": 8.277868192151928, + "learning_rate": 2.520328691969734e-06, + "loss": 0.6358, + "step": 14187 + }, + { + "epoch": 1.0250149005725433, + "grad_norm": 7.601124138262919, + "learning_rate": 2.5200361991131684e-06, + "loss": 0.6525, + "step": 14188 + }, + { + "epoch": 1.0250871457727526, + "grad_norm": 6.401071530286147, + "learning_rate": 2.5197437059823226e-06, + "loss": 0.678, + "step": 14189 + }, + { + "epoch": 1.0251593909729622, + "grad_norm": 6.166213903903383, + "learning_rate": 2.5194512125812016e-06, + "loss": 0.6851, + "step": 14190 + }, + { + "epoch": 1.0252316361731717, + "grad_norm": 6.849511077584525, + "learning_rate": 2.519158718913808e-06, + "loss": 0.6616, + "step": 14191 + }, + { + "epoch": 1.0253038813733812, + "grad_norm": 7.846590173824362, + "learning_rate": 2.518866224984147e-06, + "loss": 0.7538, + "step": 14192 + }, + { + "epoch": 1.0253761265735908, + "grad_norm": 7.729633519416197, + "learning_rate": 2.5185737307962204e-06, + "loss": 0.7222, + "step": 14193 + }, + { + "epoch": 1.0254483717738003, + "grad_norm": 8.387622487840947, + "learning_rate": 2.5182812363540354e-06, + "loss": 0.6474, + "step": 14194 + }, + { + "epoch": 1.0255206169740099, + "grad_norm": 6.506664674060287, + "learning_rate": 2.5179887416615946e-06, + "loss": 0.6549, + "step": 14195 + }, + { + "epoch": 1.0255928621742192, + "grad_norm": 6.986300550773792, + "learning_rate": 2.517696246722901e-06, + "loss": 0.7123, + "step": 14196 + }, + { + "epoch": 1.0256651073744287, + "grad_norm": 7.617595642110391, + "learning_rate": 2.5174037515419596e-06, + "loss": 0.7117, + "step": 14197 + }, + { + "epoch": 1.0257373525746383, + "grad_norm": 5.653945537713696, + "learning_rate": 2.5171112561227746e-06, + "loss": 0.5722, + "step": 14198 + }, + { + "epoch": 1.0258095977748478, + "grad_norm": 5.792106373879704, + "learning_rate": 2.51681876046935e-06, + "loss": 0.7216, + "step": 14199 + }, + { + "epoch": 1.0258818429750574, + "grad_norm": 7.297194937445355, + "learning_rate": 2.5165262645856893e-06, + "loss": 0.6775, + "step": 14200 + }, + { + "epoch": 1.025954088175267, + "grad_norm": 6.297847852500264, + "learning_rate": 2.516233768475797e-06, + "loss": 0.6705, + "step": 14201 + }, + { + "epoch": 1.0260263333754764, + "grad_norm": 6.330654213410758, + "learning_rate": 2.5159412721436772e-06, + "loss": 0.6445, + "step": 14202 + }, + { + "epoch": 1.026098578575686, + "grad_norm": 6.017717270076597, + "learning_rate": 2.5156487755933336e-06, + "loss": 0.5901, + "step": 14203 + }, + { + "epoch": 1.0261708237758953, + "grad_norm": 6.007682967556783, + "learning_rate": 2.5153562788287706e-06, + "loss": 0.7304, + "step": 14204 + }, + { + "epoch": 1.0262430689761048, + "grad_norm": 7.096049066453503, + "learning_rate": 2.5150637818539914e-06, + "loss": 0.7405, + "step": 14205 + }, + { + "epoch": 1.0263153141763144, + "grad_norm": 5.8492471063741, + "learning_rate": 2.5147712846730016e-06, + "loss": 0.5922, + "step": 14206 + }, + { + "epoch": 1.026387559376524, + "grad_norm": 6.1829891523759315, + "learning_rate": 2.5144787872898035e-06, + "loss": 0.6364, + "step": 14207 + }, + { + "epoch": 1.0264598045767335, + "grad_norm": 6.434517178625097, + "learning_rate": 2.5141862897084026e-06, + "loss": 0.6256, + "step": 14208 + }, + { + "epoch": 1.026532049776943, + "grad_norm": 7.557073238133508, + "learning_rate": 2.513893791932802e-06, + "loss": 0.6481, + "step": 14209 + }, + { + "epoch": 1.0266042949771526, + "grad_norm": 6.119043373189841, + "learning_rate": 2.513601293967007e-06, + "loss": 0.6175, + "step": 14210 + }, + { + "epoch": 1.0266765401773619, + "grad_norm": 6.431950223151272, + "learning_rate": 2.5133087958150197e-06, + "loss": 0.6728, + "step": 14211 + }, + { + "epoch": 1.0267487853775714, + "grad_norm": 6.594315491061548, + "learning_rate": 2.513016297480846e-06, + "loss": 0.6353, + "step": 14212 + }, + { + "epoch": 1.026821030577781, + "grad_norm": 6.803180724841397, + "learning_rate": 2.5127237989684892e-06, + "loss": 0.7312, + "step": 14213 + }, + { + "epoch": 1.0268932757779905, + "grad_norm": 6.088683854121468, + "learning_rate": 2.512431300281954e-06, + "loss": 0.6633, + "step": 14214 + }, + { + "epoch": 1.0269655209782, + "grad_norm": 7.565398362305259, + "learning_rate": 2.5121388014252437e-06, + "loss": 0.665, + "step": 14215 + }, + { + "epoch": 1.0270377661784096, + "grad_norm": 6.8669233564280185, + "learning_rate": 2.511846302402362e-06, + "loss": 0.6271, + "step": 14216 + }, + { + "epoch": 1.0271100113786191, + "grad_norm": 6.850499278340873, + "learning_rate": 2.5115538032173135e-06, + "loss": 0.6989, + "step": 14217 + }, + { + "epoch": 1.0271822565788284, + "grad_norm": 6.205709461901766, + "learning_rate": 2.5112613038741028e-06, + "loss": 0.6792, + "step": 14218 + }, + { + "epoch": 1.027254501779038, + "grad_norm": 6.880890698108524, + "learning_rate": 2.5109688043767345e-06, + "loss": 0.6494, + "step": 14219 + }, + { + "epoch": 1.0273267469792475, + "grad_norm": 6.688882560065153, + "learning_rate": 2.5106763047292115e-06, + "loss": 0.6906, + "step": 14220 + }, + { + "epoch": 1.027398992179457, + "grad_norm": 6.185417980974708, + "learning_rate": 2.510383804935537e-06, + "loss": 0.6752, + "step": 14221 + }, + { + "epoch": 1.0274712373796666, + "grad_norm": 7.642679698804316, + "learning_rate": 2.510091304999717e-06, + "loss": 0.6489, + "step": 14222 + }, + { + "epoch": 1.0275434825798762, + "grad_norm": 6.91066576608697, + "learning_rate": 2.509798804925755e-06, + "loss": 0.5989, + "step": 14223 + }, + { + "epoch": 1.0276157277800857, + "grad_norm": 5.9850404533733625, + "learning_rate": 2.509506304717655e-06, + "loss": 0.6887, + "step": 14224 + }, + { + "epoch": 1.027687972980295, + "grad_norm": 5.799029091435937, + "learning_rate": 2.5092138043794205e-06, + "loss": 0.6386, + "step": 14225 + }, + { + "epoch": 1.0277602181805046, + "grad_norm": 7.169548105978878, + "learning_rate": 2.508921303915056e-06, + "loss": 0.6693, + "step": 14226 + }, + { + "epoch": 1.027832463380714, + "grad_norm": 5.650938627882861, + "learning_rate": 2.5086288033285666e-06, + "loss": 0.668, + "step": 14227 + }, + { + "epoch": 1.0279047085809236, + "grad_norm": 6.327391631405293, + "learning_rate": 2.5083363026239553e-06, + "loss": 0.7387, + "step": 14228 + }, + { + "epoch": 1.0279769537811332, + "grad_norm": 6.294199183325469, + "learning_rate": 2.508043801805227e-06, + "loss": 0.6333, + "step": 14229 + }, + { + "epoch": 1.0280491989813427, + "grad_norm": 6.442793364234842, + "learning_rate": 2.5077513008763846e-06, + "loss": 0.7636, + "step": 14230 + }, + { + "epoch": 1.0281214441815523, + "grad_norm": 7.7432741082048056, + "learning_rate": 2.507458799841433e-06, + "loss": 0.6031, + "step": 14231 + }, + { + "epoch": 1.0281936893817618, + "grad_norm": 7.9575768022968605, + "learning_rate": 2.507166298704376e-06, + "loss": 0.6843, + "step": 14232 + }, + { + "epoch": 1.0282659345819711, + "grad_norm": 6.537599826145403, + "learning_rate": 2.5068737974692188e-06, + "loss": 0.6722, + "step": 14233 + }, + { + "epoch": 1.0283381797821807, + "grad_norm": 7.4835502791304505, + "learning_rate": 2.506581296139964e-06, + "loss": 0.6199, + "step": 14234 + }, + { + "epoch": 1.0284104249823902, + "grad_norm": 6.429612744477331, + "learning_rate": 2.5062887947206165e-06, + "loss": 0.6807, + "step": 14235 + }, + { + "epoch": 1.0284826701825998, + "grad_norm": 7.0240867917166545, + "learning_rate": 2.50599629321518e-06, + "loss": 0.6699, + "step": 14236 + }, + { + "epoch": 1.0285549153828093, + "grad_norm": 6.795013944655055, + "learning_rate": 2.505703791627659e-06, + "loss": 0.6104, + "step": 14237 + }, + { + "epoch": 1.0286271605830188, + "grad_norm": 7.3744429038463535, + "learning_rate": 2.5054112899620583e-06, + "loss": 0.6607, + "step": 14238 + }, + { + "epoch": 1.0286994057832284, + "grad_norm": 7.5814317547803975, + "learning_rate": 2.5051187882223804e-06, + "loss": 0.6553, + "step": 14239 + }, + { + "epoch": 1.0287716509834377, + "grad_norm": 8.32252070535804, + "learning_rate": 2.50482628641263e-06, + "loss": 0.7156, + "step": 14240 + }, + { + "epoch": 1.0288438961836472, + "grad_norm": 6.661698811007894, + "learning_rate": 2.504533784536812e-06, + "loss": 0.6255, + "step": 14241 + }, + { + "epoch": 1.0289161413838568, + "grad_norm": 5.959508002026737, + "learning_rate": 2.5042412825989304e-06, + "loss": 0.6626, + "step": 14242 + }, + { + "epoch": 1.0289883865840663, + "grad_norm": 7.080008350949292, + "learning_rate": 2.503948780602988e-06, + "loss": 0.7098, + "step": 14243 + }, + { + "epoch": 1.0290606317842759, + "grad_norm": 6.6953030990125075, + "learning_rate": 2.5036562785529912e-06, + "loss": 0.6831, + "step": 14244 + }, + { + "epoch": 1.0291328769844854, + "grad_norm": 7.02608072785156, + "learning_rate": 2.503363776452941e-06, + "loss": 0.712, + "step": 14245 + }, + { + "epoch": 1.029205122184695, + "grad_norm": 10.360626559365834, + "learning_rate": 2.503071274306845e-06, + "loss": 0.741, + "step": 14246 + }, + { + "epoch": 1.0292773673849043, + "grad_norm": 6.476795739919596, + "learning_rate": 2.5027787721187045e-06, + "loss": 0.6408, + "step": 14247 + }, + { + "epoch": 1.0293496125851138, + "grad_norm": 6.944145525114995, + "learning_rate": 2.502486269892525e-06, + "loss": 0.6415, + "step": 14248 + }, + { + "epoch": 1.0294218577853234, + "grad_norm": 6.422611537750337, + "learning_rate": 2.502193767632311e-06, + "loss": 0.5926, + "step": 14249 + }, + { + "epoch": 1.029494102985533, + "grad_norm": 6.893916513847723, + "learning_rate": 2.501901265342065e-06, + "loss": 0.684, + "step": 14250 + }, + { + "epoch": 1.0295663481857424, + "grad_norm": 7.0284271677345025, + "learning_rate": 2.501608763025793e-06, + "loss": 0.6774, + "step": 14251 + }, + { + "epoch": 1.029638593385952, + "grad_norm": 5.7997990145882765, + "learning_rate": 2.501316260687498e-06, + "loss": 0.6143, + "step": 14252 + }, + { + "epoch": 1.0297108385861615, + "grad_norm": 7.74093245140953, + "learning_rate": 2.5010237583311847e-06, + "loss": 0.6902, + "step": 14253 + }, + { + "epoch": 1.0297830837863708, + "grad_norm": 8.69260700975125, + "learning_rate": 2.5007312559608564e-06, + "loss": 0.7328, + "step": 14254 + }, + { + "epoch": 1.0298553289865804, + "grad_norm": 6.022786106456949, + "learning_rate": 2.500438753580518e-06, + "loss": 0.673, + "step": 14255 + }, + { + "epoch": 1.02992757418679, + "grad_norm": 6.4543042156157195, + "learning_rate": 2.5001462511941736e-06, + "loss": 0.6733, + "step": 14256 + }, + { + "epoch": 1.0299998193869995, + "grad_norm": 8.740008071441764, + "learning_rate": 2.4998537488058277e-06, + "loss": 0.6849, + "step": 14257 + }, + { + "epoch": 1.030072064587209, + "grad_norm": 6.210637767924885, + "learning_rate": 2.4995612464194825e-06, + "loss": 0.6312, + "step": 14258 + }, + { + "epoch": 1.0301443097874186, + "grad_norm": 7.123325619631661, + "learning_rate": 2.499268744039144e-06, + "loss": 0.6656, + "step": 14259 + }, + { + "epoch": 1.030216554987628, + "grad_norm": 7.072934974193522, + "learning_rate": 2.4989762416688153e-06, + "loss": 0.5755, + "step": 14260 + }, + { + "epoch": 1.0302888001878374, + "grad_norm": 6.179048565460253, + "learning_rate": 2.498683739312503e-06, + "loss": 0.6725, + "step": 14261 + }, + { + "epoch": 1.030361045388047, + "grad_norm": 7.138513966704277, + "learning_rate": 2.498391236974208e-06, + "loss": 0.6988, + "step": 14262 + }, + { + "epoch": 1.0304332905882565, + "grad_norm": 5.596904866593085, + "learning_rate": 2.498098734657935e-06, + "loss": 0.665, + "step": 14263 + }, + { + "epoch": 1.030505535788466, + "grad_norm": 5.541766325931694, + "learning_rate": 2.49780623236769e-06, + "loss": 0.6472, + "step": 14264 + }, + { + "epoch": 1.0305777809886756, + "grad_norm": 6.075112034898802, + "learning_rate": 2.4975137301074756e-06, + "loss": 0.6203, + "step": 14265 + }, + { + "epoch": 1.0306500261888851, + "grad_norm": 6.898274645497727, + "learning_rate": 2.497221227881296e-06, + "loss": 0.7123, + "step": 14266 + }, + { + "epoch": 1.0307222713890947, + "grad_norm": 5.906578741057839, + "learning_rate": 2.4969287256931555e-06, + "loss": 0.6295, + "step": 14267 + }, + { + "epoch": 1.030794516589304, + "grad_norm": 7.450214961490179, + "learning_rate": 2.4966362235470594e-06, + "loss": 0.655, + "step": 14268 + }, + { + "epoch": 1.0308667617895135, + "grad_norm": 6.242672401281105, + "learning_rate": 2.49634372144701e-06, + "loss": 0.7276, + "step": 14269 + }, + { + "epoch": 1.030939006989723, + "grad_norm": 6.083583077656139, + "learning_rate": 2.4960512193970123e-06, + "loss": 0.6893, + "step": 14270 + }, + { + "epoch": 1.0310112521899326, + "grad_norm": 6.191394851625943, + "learning_rate": 2.4957587174010713e-06, + "loss": 0.709, + "step": 14271 + }, + { + "epoch": 1.0310834973901422, + "grad_norm": 7.792476047629589, + "learning_rate": 2.495466215463189e-06, + "loss": 0.7321, + "step": 14272 + }, + { + "epoch": 1.0311557425903517, + "grad_norm": 5.924436312230538, + "learning_rate": 2.495173713587371e-06, + "loss": 0.7341, + "step": 14273 + }, + { + "epoch": 1.0312279877905612, + "grad_norm": 7.472235596497899, + "learning_rate": 2.49488121177762e-06, + "loss": 0.6798, + "step": 14274 + }, + { + "epoch": 1.0313002329907708, + "grad_norm": 6.290679989009225, + "learning_rate": 2.494588710037943e-06, + "loss": 0.6146, + "step": 14275 + }, + { + "epoch": 1.03137247819098, + "grad_norm": 5.900325326517194, + "learning_rate": 2.4942962083723415e-06, + "loss": 0.6165, + "step": 14276 + }, + { + "epoch": 1.0314447233911896, + "grad_norm": 6.803385946222068, + "learning_rate": 2.4940037067848204e-06, + "loss": 0.6484, + "step": 14277 + }, + { + "epoch": 1.0315169685913992, + "grad_norm": 5.719998582959834, + "learning_rate": 2.4937112052793847e-06, + "loss": 0.6306, + "step": 14278 + }, + { + "epoch": 1.0315892137916087, + "grad_norm": 7.7921482964098026, + "learning_rate": 2.493418703860037e-06, + "loss": 0.6968, + "step": 14279 + }, + { + "epoch": 1.0316614589918183, + "grad_norm": 5.909691186572637, + "learning_rate": 2.493126202530782e-06, + "loss": 0.6322, + "step": 14280 + }, + { + "epoch": 1.0317337041920278, + "grad_norm": 8.064231982239779, + "learning_rate": 2.492833701295624e-06, + "loss": 0.7131, + "step": 14281 + }, + { + "epoch": 1.0318059493922374, + "grad_norm": 6.6945934287869795, + "learning_rate": 2.4925412001585678e-06, + "loss": 0.7071, + "step": 14282 + }, + { + "epoch": 1.0318781945924467, + "grad_norm": 6.291332143349249, + "learning_rate": 2.492248699123616e-06, + "loss": 0.7356, + "step": 14283 + }, + { + "epoch": 1.0319504397926562, + "grad_norm": 6.390540236678082, + "learning_rate": 2.4919561981947737e-06, + "loss": 0.6377, + "step": 14284 + }, + { + "epoch": 1.0320226849928658, + "grad_norm": 6.236431499691293, + "learning_rate": 2.4916636973760446e-06, + "loss": 0.6873, + "step": 14285 + }, + { + "epoch": 1.0320949301930753, + "grad_norm": 6.852836255602037, + "learning_rate": 2.4913711966714342e-06, + "loss": 0.6534, + "step": 14286 + }, + { + "epoch": 1.0321671753932848, + "grad_norm": 5.820162081214863, + "learning_rate": 2.4910786960849447e-06, + "loss": 0.6899, + "step": 14287 + }, + { + "epoch": 1.0322394205934944, + "grad_norm": 7.30469820562384, + "learning_rate": 2.49078619562058e-06, + "loss": 0.63, + "step": 14288 + }, + { + "epoch": 1.032311665793704, + "grad_norm": 6.057473527360063, + "learning_rate": 2.4904936952823464e-06, + "loss": 0.7197, + "step": 14289 + }, + { + "epoch": 1.0323839109939132, + "grad_norm": 5.854647181147866, + "learning_rate": 2.490201195074246e-06, + "loss": 0.6189, + "step": 14290 + }, + { + "epoch": 1.0324561561941228, + "grad_norm": 6.833407735031868, + "learning_rate": 2.4899086950002837e-06, + "loss": 0.7748, + "step": 14291 + }, + { + "epoch": 1.0325284013943323, + "grad_norm": 7.234666591889414, + "learning_rate": 2.4896161950644633e-06, + "loss": 0.6758, + "step": 14292 + }, + { + "epoch": 1.0326006465945419, + "grad_norm": 8.066490427829542, + "learning_rate": 2.4893236952707898e-06, + "loss": 0.7406, + "step": 14293 + }, + { + "epoch": 1.0326728917947514, + "grad_norm": 5.99703079506297, + "learning_rate": 2.4890311956232663e-06, + "loss": 0.6738, + "step": 14294 + }, + { + "epoch": 1.032745136994961, + "grad_norm": 5.507124880891603, + "learning_rate": 2.488738696125897e-06, + "loss": 0.6383, + "step": 14295 + }, + { + "epoch": 1.0328173821951705, + "grad_norm": 6.85456307452393, + "learning_rate": 2.488446196782687e-06, + "loss": 0.6384, + "step": 14296 + }, + { + "epoch": 1.0328896273953798, + "grad_norm": 6.199258132357578, + "learning_rate": 2.4881536975976387e-06, + "loss": 0.616, + "step": 14297 + }, + { + "epoch": 1.0329618725955894, + "grad_norm": 6.3485417050694295, + "learning_rate": 2.487861198574757e-06, + "loss": 0.679, + "step": 14298 + }, + { + "epoch": 1.033034117795799, + "grad_norm": 8.62673402023947, + "learning_rate": 2.4875686997180465e-06, + "loss": 0.7361, + "step": 14299 + }, + { + "epoch": 1.0331063629960084, + "grad_norm": 7.023948302910756, + "learning_rate": 2.4872762010315116e-06, + "loss": 0.6956, + "step": 14300 + }, + { + "epoch": 1.033178608196218, + "grad_norm": 5.193857181329108, + "learning_rate": 2.4869837025191546e-06, + "loss": 0.6553, + "step": 14301 + }, + { + "epoch": 1.0332508533964275, + "grad_norm": 6.307847225083233, + "learning_rate": 2.4866912041849807e-06, + "loss": 0.6481, + "step": 14302 + }, + { + "epoch": 1.033323098596637, + "grad_norm": 6.034361357947584, + "learning_rate": 2.4863987060329943e-06, + "loss": 0.7163, + "step": 14303 + }, + { + "epoch": 1.0333953437968466, + "grad_norm": 8.817182014081007, + "learning_rate": 2.486106208067199e-06, + "loss": 0.7624, + "step": 14304 + }, + { + "epoch": 1.033467588997056, + "grad_norm": 6.28296087697447, + "learning_rate": 2.4858137102915982e-06, + "loss": 0.6582, + "step": 14305 + }, + { + "epoch": 1.0335398341972655, + "grad_norm": 7.53566754842533, + "learning_rate": 2.485521212710197e-06, + "loss": 0.7724, + "step": 14306 + }, + { + "epoch": 1.033612079397475, + "grad_norm": 5.110007523980919, + "learning_rate": 2.4852287153269996e-06, + "loss": 0.6859, + "step": 14307 + }, + { + "epoch": 1.0336843245976846, + "grad_norm": 5.960234474331834, + "learning_rate": 2.4849362181460094e-06, + "loss": 0.658, + "step": 14308 + }, + { + "epoch": 1.033756569797894, + "grad_norm": 7.102095345796285, + "learning_rate": 2.4846437211712302e-06, + "loss": 0.7514, + "step": 14309 + }, + { + "epoch": 1.0338288149981036, + "grad_norm": 6.270102254468753, + "learning_rate": 2.484351224406667e-06, + "loss": 0.6953, + "step": 14310 + }, + { + "epoch": 1.0339010601983132, + "grad_norm": 6.51872929599945, + "learning_rate": 2.4840587278563236e-06, + "loss": 0.6995, + "step": 14311 + }, + { + "epoch": 1.0339733053985225, + "grad_norm": 6.033603031319742, + "learning_rate": 2.4837662315242035e-06, + "loss": 0.732, + "step": 14312 + }, + { + "epoch": 1.034045550598732, + "grad_norm": 7.327722078808517, + "learning_rate": 2.483473735414311e-06, + "loss": 0.7022, + "step": 14313 + }, + { + "epoch": 1.0341177957989416, + "grad_norm": 7.752270273730957, + "learning_rate": 2.4831812395306513e-06, + "loss": 0.7257, + "step": 14314 + }, + { + "epoch": 1.0341900409991511, + "grad_norm": 6.572207872610298, + "learning_rate": 2.4828887438772266e-06, + "loss": 0.6401, + "step": 14315 + }, + { + "epoch": 1.0342622861993607, + "grad_norm": 7.3303731954998534, + "learning_rate": 2.4825962484580412e-06, + "loss": 0.6823, + "step": 14316 + }, + { + "epoch": 1.0343345313995702, + "grad_norm": 6.019207727252679, + "learning_rate": 2.4823037532770995e-06, + "loss": 0.6206, + "step": 14317 + }, + { + "epoch": 1.0344067765997798, + "grad_norm": 5.7476061938191725, + "learning_rate": 2.482011258338407e-06, + "loss": 0.6299, + "step": 14318 + }, + { + "epoch": 1.034479021799989, + "grad_norm": 5.349067228769471, + "learning_rate": 2.4817187636459654e-06, + "loss": 0.6542, + "step": 14319 + }, + { + "epoch": 1.0345512670001986, + "grad_norm": 7.016337810913573, + "learning_rate": 2.4814262692037796e-06, + "loss": 0.6973, + "step": 14320 + }, + { + "epoch": 1.0346235122004082, + "grad_norm": 5.73872763271871, + "learning_rate": 2.4811337750158544e-06, + "loss": 0.6702, + "step": 14321 + }, + { + "epoch": 1.0346957574006177, + "grad_norm": 7.164679614062862, + "learning_rate": 2.480841281086193e-06, + "loss": 0.7077, + "step": 14322 + }, + { + "epoch": 1.0347680026008272, + "grad_norm": 5.267055419062494, + "learning_rate": 2.480548787418799e-06, + "loss": 0.6431, + "step": 14323 + }, + { + "epoch": 1.0348402478010368, + "grad_norm": 5.647213456313387, + "learning_rate": 2.4802562940176774e-06, + "loss": 0.6321, + "step": 14324 + }, + { + "epoch": 1.0349124930012463, + "grad_norm": 7.139949173946756, + "learning_rate": 2.4799638008868324e-06, + "loss": 0.6917, + "step": 14325 + }, + { + "epoch": 1.0349847382014556, + "grad_norm": 7.274822492498582, + "learning_rate": 2.479671308030267e-06, + "loss": 0.6953, + "step": 14326 + }, + { + "epoch": 1.0350569834016652, + "grad_norm": 9.290211881555727, + "learning_rate": 2.479378815451985e-06, + "loss": 0.6396, + "step": 14327 + }, + { + "epoch": 1.0351292286018747, + "grad_norm": 6.779231601709676, + "learning_rate": 2.4790863231559923e-06, + "loss": 0.6796, + "step": 14328 + }, + { + "epoch": 1.0352014738020843, + "grad_norm": 6.785991364752112, + "learning_rate": 2.4787938311462907e-06, + "loss": 0.7103, + "step": 14329 + }, + { + "epoch": 1.0352737190022938, + "grad_norm": 5.838313383822815, + "learning_rate": 2.478501339426885e-06, + "loss": 0.6346, + "step": 14330 + }, + { + "epoch": 1.0353459642025034, + "grad_norm": 6.829283871284022, + "learning_rate": 2.478208848001779e-06, + "loss": 0.6906, + "step": 14331 + }, + { + "epoch": 1.035418209402713, + "grad_norm": 6.061599192599962, + "learning_rate": 2.4779163568749783e-06, + "loss": 0.6546, + "step": 14332 + }, + { + "epoch": 1.0354904546029222, + "grad_norm": 6.775113049229067, + "learning_rate": 2.477623866050484e-06, + "loss": 0.7143, + "step": 14333 + }, + { + "epoch": 1.0355626998031318, + "grad_norm": 6.490130340725031, + "learning_rate": 2.477331375532302e-06, + "loss": 0.716, + "step": 14334 + }, + { + "epoch": 1.0356349450033413, + "grad_norm": 9.45564960154547, + "learning_rate": 2.4770388853244366e-06, + "loss": 0.772, + "step": 14335 + }, + { + "epoch": 1.0357071902035508, + "grad_norm": 7.738217410845327, + "learning_rate": 2.4767463954308905e-06, + "loss": 0.7569, + "step": 14336 + }, + { + "epoch": 1.0357794354037604, + "grad_norm": 5.7501698344483705, + "learning_rate": 2.476453905855668e-06, + "loss": 0.6699, + "step": 14337 + }, + { + "epoch": 1.03585168060397, + "grad_norm": 6.337582150139407, + "learning_rate": 2.476161416602773e-06, + "loss": 0.6828, + "step": 14338 + }, + { + "epoch": 1.0359239258041795, + "grad_norm": 7.480035531270708, + "learning_rate": 2.475868927676211e-06, + "loss": 0.7399, + "step": 14339 + }, + { + "epoch": 1.0359961710043888, + "grad_norm": 7.655644856107276, + "learning_rate": 2.4755764390799835e-06, + "loss": 0.6749, + "step": 14340 + }, + { + "epoch": 1.0360684162045983, + "grad_norm": 6.088155986055784, + "learning_rate": 2.4752839508180956e-06, + "loss": 0.6224, + "step": 14341 + }, + { + "epoch": 1.0361406614048079, + "grad_norm": 6.74858163130771, + "learning_rate": 2.4749914628945512e-06, + "loss": 0.6552, + "step": 14342 + }, + { + "epoch": 1.0362129066050174, + "grad_norm": 7.89959991987208, + "learning_rate": 2.4746989753133556e-06, + "loss": 0.6856, + "step": 14343 + }, + { + "epoch": 1.036285151805227, + "grad_norm": 6.3999903917240495, + "learning_rate": 2.47440648807851e-06, + "loss": 0.6421, + "step": 14344 + }, + { + "epoch": 1.0363573970054365, + "grad_norm": 7.698895451973609, + "learning_rate": 2.4741140011940197e-06, + "loss": 0.6727, + "step": 14345 + }, + { + "epoch": 1.036429642205646, + "grad_norm": 6.40358533568244, + "learning_rate": 2.47382151466389e-06, + "loss": 0.5884, + "step": 14346 + }, + { + "epoch": 1.0365018874058554, + "grad_norm": 5.594983353849982, + "learning_rate": 2.473529028492122e-06, + "loss": 0.6845, + "step": 14347 + }, + { + "epoch": 1.036574132606065, + "grad_norm": 7.326429093558434, + "learning_rate": 2.4732365426827214e-06, + "loss": 0.7141, + "step": 14348 + }, + { + "epoch": 1.0366463778062744, + "grad_norm": 5.847311474785168, + "learning_rate": 2.4729440572396914e-06, + "loss": 0.7189, + "step": 14349 + }, + { + "epoch": 1.036718623006484, + "grad_norm": 7.036580824238781, + "learning_rate": 2.472651572167037e-06, + "loss": 0.678, + "step": 14350 + }, + { + "epoch": 1.0367908682066935, + "grad_norm": 6.5598240800855265, + "learning_rate": 2.472359087468761e-06, + "loss": 0.6879, + "step": 14351 + }, + { + "epoch": 1.036863113406903, + "grad_norm": 4.852142741569987, + "learning_rate": 2.4720666031488674e-06, + "loss": 0.6124, + "step": 14352 + }, + { + "epoch": 1.0369353586071126, + "grad_norm": 6.5777161532313375, + "learning_rate": 2.471774119211361e-06, + "loss": 0.6327, + "step": 14353 + }, + { + "epoch": 1.0370076038073222, + "grad_norm": 6.745758136646671, + "learning_rate": 2.4714816356602443e-06, + "loss": 0.6781, + "step": 14354 + }, + { + "epoch": 1.0370798490075315, + "grad_norm": 6.332762274177062, + "learning_rate": 2.4711891524995218e-06, + "loss": 0.7004, + "step": 14355 + }, + { + "epoch": 1.037152094207741, + "grad_norm": 5.747374889045814, + "learning_rate": 2.4708966697331977e-06, + "loss": 0.6465, + "step": 14356 + }, + { + "epoch": 1.0372243394079506, + "grad_norm": 5.970184633093239, + "learning_rate": 2.4706041873652763e-06, + "loss": 0.7082, + "step": 14357 + }, + { + "epoch": 1.03729658460816, + "grad_norm": 7.928243451103826, + "learning_rate": 2.47031170539976e-06, + "loss": 0.6796, + "step": 14358 + }, + { + "epoch": 1.0373688298083696, + "grad_norm": 7.212800980606621, + "learning_rate": 2.4700192238406527e-06, + "loss": 0.7227, + "step": 14359 + }, + { + "epoch": 1.0374410750085792, + "grad_norm": 6.580123923853675, + "learning_rate": 2.4697267426919608e-06, + "loss": 0.6681, + "step": 14360 + }, + { + "epoch": 1.0375133202087887, + "grad_norm": 6.3902545998824145, + "learning_rate": 2.4694342619576848e-06, + "loss": 0.6457, + "step": 14361 + }, + { + "epoch": 1.037585565408998, + "grad_norm": 7.314457036464241, + "learning_rate": 2.4691417816418304e-06, + "loss": 0.7059, + "step": 14362 + }, + { + "epoch": 1.0376578106092076, + "grad_norm": 7.855510269578863, + "learning_rate": 2.468849301748401e-06, + "loss": 0.7103, + "step": 14363 + }, + { + "epoch": 1.0377300558094171, + "grad_norm": 5.53221234069774, + "learning_rate": 2.468556822281401e-06, + "loss": 0.6298, + "step": 14364 + }, + { + "epoch": 1.0378023010096267, + "grad_norm": 6.7718784249529875, + "learning_rate": 2.468264343244833e-06, + "loss": 0.6387, + "step": 14365 + }, + { + "epoch": 1.0378745462098362, + "grad_norm": 5.795058467066467, + "learning_rate": 2.4679718646427014e-06, + "loss": 0.6598, + "step": 14366 + }, + { + "epoch": 1.0379467914100458, + "grad_norm": 6.299338036531007, + "learning_rate": 2.4676793864790103e-06, + "loss": 0.6623, + "step": 14367 + }, + { + "epoch": 1.0380190366102553, + "grad_norm": 6.210231755304293, + "learning_rate": 2.467386908757764e-06, + "loss": 0.7236, + "step": 14368 + }, + { + "epoch": 1.0380912818104646, + "grad_norm": 7.955247157469451, + "learning_rate": 2.467094431482965e-06, + "loss": 0.6365, + "step": 14369 + }, + { + "epoch": 1.0381635270106742, + "grad_norm": 7.5817859733800566, + "learning_rate": 2.4668019546586176e-06, + "loss": 0.6617, + "step": 14370 + }, + { + "epoch": 1.0382357722108837, + "grad_norm": 6.694467782823641, + "learning_rate": 2.4665094782887263e-06, + "loss": 0.6767, + "step": 14371 + }, + { + "epoch": 1.0383080174110932, + "grad_norm": 5.4207509219536485, + "learning_rate": 2.4662170023772937e-06, + "loss": 0.6685, + "step": 14372 + }, + { + "epoch": 1.0383802626113028, + "grad_norm": 6.085374289468144, + "learning_rate": 2.4659245269283238e-06, + "loss": 0.6923, + "step": 14373 + }, + { + "epoch": 1.0384525078115123, + "grad_norm": 6.174619272209282, + "learning_rate": 2.4656320519458203e-06, + "loss": 0.6799, + "step": 14374 + }, + { + "epoch": 1.0385247530117219, + "grad_norm": 6.754917472826539, + "learning_rate": 2.465339577433789e-06, + "loss": 0.6813, + "step": 14375 + }, + { + "epoch": 1.0385969982119312, + "grad_norm": 7.091386443300944, + "learning_rate": 2.465047103396231e-06, + "loss": 0.6449, + "step": 14376 + }, + { + "epoch": 1.0386692434121407, + "grad_norm": 5.848094936471734, + "learning_rate": 2.4647546298371508e-06, + "loss": 0.6622, + "step": 14377 + }, + { + "epoch": 1.0387414886123503, + "grad_norm": 5.885294883165703, + "learning_rate": 2.4644621567605532e-06, + "loss": 0.6449, + "step": 14378 + }, + { + "epoch": 1.0388137338125598, + "grad_norm": 6.96885455271205, + "learning_rate": 2.46416968417044e-06, + "loss": 0.7141, + "step": 14379 + }, + { + "epoch": 1.0388859790127694, + "grad_norm": 6.715287988868199, + "learning_rate": 2.4638772120708164e-06, + "loss": 0.6778, + "step": 14380 + }, + { + "epoch": 1.038958224212979, + "grad_norm": 6.905647355104835, + "learning_rate": 2.4635847404656857e-06, + "loss": 0.6162, + "step": 14381 + }, + { + "epoch": 1.0390304694131884, + "grad_norm": 5.863762354824539, + "learning_rate": 2.4632922693590518e-06, + "loss": 0.6772, + "step": 14382 + }, + { + "epoch": 1.039102714613398, + "grad_norm": 9.966927962774523, + "learning_rate": 2.462999798754918e-06, + "loss": 0.6524, + "step": 14383 + }, + { + "epoch": 1.0391749598136073, + "grad_norm": 5.554997045205444, + "learning_rate": 2.4627073286572883e-06, + "loss": 0.721, + "step": 14384 + }, + { + "epoch": 1.0392472050138168, + "grad_norm": 7.495642349781628, + "learning_rate": 2.4624148590701675e-06, + "loss": 0.7053, + "step": 14385 + }, + { + "epoch": 1.0393194502140264, + "grad_norm": 6.017870674587041, + "learning_rate": 2.4621223899975565e-06, + "loss": 0.6416, + "step": 14386 + }, + { + "epoch": 1.039391695414236, + "grad_norm": 6.322772741381756, + "learning_rate": 2.4618299214434606e-06, + "loss": 0.6225, + "step": 14387 + }, + { + "epoch": 1.0394639406144455, + "grad_norm": 7.082541597286593, + "learning_rate": 2.4615374534118836e-06, + "loss": 0.7558, + "step": 14388 + }, + { + "epoch": 1.039536185814655, + "grad_norm": 6.458314005504933, + "learning_rate": 2.4612449859068304e-06, + "loss": 0.6487, + "step": 14389 + }, + { + "epoch": 1.0396084310148646, + "grad_norm": 6.232398310144263, + "learning_rate": 2.460952518932302e-06, + "loss": 0.7139, + "step": 14390 + }, + { + "epoch": 1.0396806762150739, + "grad_norm": 6.083384927330529, + "learning_rate": 2.460660052492303e-06, + "loss": 0.6814, + "step": 14391 + }, + { + "epoch": 1.0397529214152834, + "grad_norm": 5.830953530383031, + "learning_rate": 2.4603675865908374e-06, + "loss": 0.6235, + "step": 14392 + }, + { + "epoch": 1.039825166615493, + "grad_norm": 7.833661769177119, + "learning_rate": 2.46007512123191e-06, + "loss": 0.6269, + "step": 14393 + }, + { + "epoch": 1.0398974118157025, + "grad_norm": 6.580175229785868, + "learning_rate": 2.4597826564195218e-06, + "loss": 0.6805, + "step": 14394 + }, + { + "epoch": 1.039969657015912, + "grad_norm": 6.237543128099584, + "learning_rate": 2.459490192157678e-06, + "loss": 0.6962, + "step": 14395 + }, + { + "epoch": 1.0400419022161216, + "grad_norm": 6.45382634893722, + "learning_rate": 2.459197728450383e-06, + "loss": 0.6785, + "step": 14396 + }, + { + "epoch": 1.0401141474163311, + "grad_norm": 5.789053604180918, + "learning_rate": 2.4589052653016384e-06, + "loss": 0.5935, + "step": 14397 + }, + { + "epoch": 1.0401863926165404, + "grad_norm": 6.254082223964707, + "learning_rate": 2.458612802715449e-06, + "loss": 0.6302, + "step": 14398 + }, + { + "epoch": 1.04025863781675, + "grad_norm": 8.399397655870647, + "learning_rate": 2.4583203406958184e-06, + "loss": 0.6526, + "step": 14399 + }, + { + "epoch": 1.0403308830169595, + "grad_norm": 6.217015623605924, + "learning_rate": 2.458027879246751e-06, + "loss": 0.6994, + "step": 14400 + }, + { + "epoch": 1.040403128217169, + "grad_norm": 6.018996049678461, + "learning_rate": 2.457735418372248e-06, + "loss": 0.6194, + "step": 14401 + }, + { + "epoch": 1.0404753734173786, + "grad_norm": 7.297096134731385, + "learning_rate": 2.4574429580763136e-06, + "loss": 0.6563, + "step": 14402 + }, + { + "epoch": 1.0405476186175882, + "grad_norm": 7.092255960164733, + "learning_rate": 2.457150498362954e-06, + "loss": 0.6619, + "step": 14403 + }, + { + "epoch": 1.0406198638177977, + "grad_norm": 5.905650274921276, + "learning_rate": 2.45685803923617e-06, + "loss": 0.7221, + "step": 14404 + }, + { + "epoch": 1.040692109018007, + "grad_norm": 8.037007090403206, + "learning_rate": 2.4565655806999656e-06, + "loss": 0.6591, + "step": 14405 + }, + { + "epoch": 1.0407643542182166, + "grad_norm": 6.7484680132696075, + "learning_rate": 2.4562731227583446e-06, + "loss": 0.6468, + "step": 14406 + }, + { + "epoch": 1.040836599418426, + "grad_norm": 7.218216517168261, + "learning_rate": 2.455980665415311e-06, + "loss": 0.7385, + "step": 14407 + }, + { + "epoch": 1.0409088446186356, + "grad_norm": 8.73155404328131, + "learning_rate": 2.455688208674868e-06, + "loss": 0.6344, + "step": 14408 + }, + { + "epoch": 1.0409810898188452, + "grad_norm": 6.665795873355569, + "learning_rate": 2.4553957525410187e-06, + "loss": 0.6469, + "step": 14409 + }, + { + "epoch": 1.0410533350190547, + "grad_norm": 6.741561701475756, + "learning_rate": 2.455103297017767e-06, + "loss": 0.6738, + "step": 14410 + }, + { + "epoch": 1.0411255802192643, + "grad_norm": 5.519586021545743, + "learning_rate": 2.4548108421091164e-06, + "loss": 0.5566, + "step": 14411 + }, + { + "epoch": 1.0411978254194736, + "grad_norm": 5.817771899890059, + "learning_rate": 2.4545183878190697e-06, + "loss": 0.5712, + "step": 14412 + }, + { + "epoch": 1.0412700706196831, + "grad_norm": 6.689386404776757, + "learning_rate": 2.4542259341516316e-06, + "loss": 0.6256, + "step": 14413 + }, + { + "epoch": 1.0413423158198927, + "grad_norm": 7.0942143359696805, + "learning_rate": 2.4539334811108056e-06, + "loss": 0.6808, + "step": 14414 + }, + { + "epoch": 1.0414145610201022, + "grad_norm": 6.481360780510422, + "learning_rate": 2.4536410287005935e-06, + "loss": 0.6358, + "step": 14415 + }, + { + "epoch": 1.0414868062203118, + "grad_norm": 6.607087012095451, + "learning_rate": 2.4533485769249993e-06, + "loss": 0.7345, + "step": 14416 + }, + { + "epoch": 1.0415590514205213, + "grad_norm": 7.30834347221457, + "learning_rate": 2.453056125788027e-06, + "loss": 0.6282, + "step": 14417 + }, + { + "epoch": 1.0416312966207308, + "grad_norm": 7.687399514634804, + "learning_rate": 2.4527636752936817e-06, + "loss": 0.6926, + "step": 14418 + }, + { + "epoch": 1.0417035418209402, + "grad_norm": 6.207429790245724, + "learning_rate": 2.452471225445963e-06, + "loss": 0.6404, + "step": 14419 + }, + { + "epoch": 1.0417757870211497, + "grad_norm": 5.788184713309573, + "learning_rate": 2.452178776248877e-06, + "loss": 0.6852, + "step": 14420 + }, + { + "epoch": 1.0418480322213592, + "grad_norm": 6.281836031625042, + "learning_rate": 2.4518863277064266e-06, + "loss": 0.6216, + "step": 14421 + }, + { + "epoch": 1.0419202774215688, + "grad_norm": 6.410868250086778, + "learning_rate": 2.4515938798226146e-06, + "loss": 0.6009, + "step": 14422 + }, + { + "epoch": 1.0419925226217783, + "grad_norm": 7.4188964006589915, + "learning_rate": 2.451301432601445e-06, + "loss": 0.6291, + "step": 14423 + }, + { + "epoch": 1.0420647678219879, + "grad_norm": 7.574699209555082, + "learning_rate": 2.4510089860469207e-06, + "loss": 0.6995, + "step": 14424 + }, + { + "epoch": 1.0421370130221974, + "grad_norm": 7.284782895091197, + "learning_rate": 2.450716540163046e-06, + "loss": 0.6813, + "step": 14425 + }, + { + "epoch": 1.042209258222407, + "grad_norm": 5.68418066396175, + "learning_rate": 2.450424094953823e-06, + "loss": 0.6491, + "step": 14426 + }, + { + "epoch": 1.0422815034226163, + "grad_norm": 6.413694048206084, + "learning_rate": 2.450131650423256e-06, + "loss": 0.6926, + "step": 14427 + }, + { + "epoch": 1.0423537486228258, + "grad_norm": 6.946170372004567, + "learning_rate": 2.4498392065753484e-06, + "loss": 0.7658, + "step": 14428 + }, + { + "epoch": 1.0424259938230354, + "grad_norm": 8.484814711499395, + "learning_rate": 2.4495467634141025e-06, + "loss": 0.7412, + "step": 14429 + }, + { + "epoch": 1.042498239023245, + "grad_norm": 6.469660450723535, + "learning_rate": 2.4492543209435217e-06, + "loss": 0.6045, + "step": 14430 + }, + { + "epoch": 1.0425704842234544, + "grad_norm": 7.7009655631919784, + "learning_rate": 2.44896187916761e-06, + "loss": 0.7607, + "step": 14431 + }, + { + "epoch": 1.042642729423664, + "grad_norm": 6.803629568485911, + "learning_rate": 2.448669438090372e-06, + "loss": 0.6795, + "step": 14432 + }, + { + "epoch": 1.0427149746238735, + "grad_norm": 7.931146196267742, + "learning_rate": 2.4483769977158085e-06, + "loss": 0.7606, + "step": 14433 + }, + { + "epoch": 1.0427872198240828, + "grad_norm": 6.115869372772317, + "learning_rate": 2.4480845580479234e-06, + "loss": 0.6708, + "step": 14434 + }, + { + "epoch": 1.0428594650242924, + "grad_norm": 6.460830632240416, + "learning_rate": 2.4477921190907215e-06, + "loss": 0.622, + "step": 14435 + }, + { + "epoch": 1.042931710224502, + "grad_norm": 6.8214609549073995, + "learning_rate": 2.447499680848204e-06, + "loss": 0.5923, + "step": 14436 + }, + { + "epoch": 1.0430039554247115, + "grad_norm": 6.581436592202913, + "learning_rate": 2.4472072433243752e-06, + "loss": 0.664, + "step": 14437 + }, + { + "epoch": 1.043076200624921, + "grad_norm": 6.379186116028414, + "learning_rate": 2.446914806523238e-06, + "loss": 0.5983, + "step": 14438 + }, + { + "epoch": 1.0431484458251306, + "grad_norm": 6.7299181003887005, + "learning_rate": 2.446622370448797e-06, + "loss": 0.6704, + "step": 14439 + }, + { + "epoch": 1.04322069102534, + "grad_norm": 6.286123027144978, + "learning_rate": 2.4463299351050533e-06, + "loss": 0.6708, + "step": 14440 + }, + { + "epoch": 1.0432929362255494, + "grad_norm": 5.892256048021777, + "learning_rate": 2.446037500496011e-06, + "loss": 0.6165, + "step": 14441 + }, + { + "epoch": 1.043365181425759, + "grad_norm": 6.955584898870863, + "learning_rate": 2.4457450666256752e-06, + "loss": 0.785, + "step": 14442 + }, + { + "epoch": 1.0434374266259685, + "grad_norm": 6.223021300791716, + "learning_rate": 2.4454526334980458e-06, + "loss": 0.6654, + "step": 14443 + }, + { + "epoch": 1.043509671826178, + "grad_norm": 6.4978381743424025, + "learning_rate": 2.4451602011171267e-06, + "loss": 0.6741, + "step": 14444 + }, + { + "epoch": 1.0435819170263876, + "grad_norm": 8.10084111120471, + "learning_rate": 2.444867769486923e-06, + "loss": 0.6488, + "step": 14445 + }, + { + "epoch": 1.0436541622265971, + "grad_norm": 7.961690251304358, + "learning_rate": 2.4445753386114375e-06, + "loss": 0.7373, + "step": 14446 + }, + { + "epoch": 1.0437264074268067, + "grad_norm": 6.1159872580296355, + "learning_rate": 2.4442829084946714e-06, + "loss": 0.6917, + "step": 14447 + }, + { + "epoch": 1.043798652627016, + "grad_norm": 6.81058804341228, + "learning_rate": 2.443990479140629e-06, + "loss": 0.7675, + "step": 14448 + }, + { + "epoch": 1.0438708978272255, + "grad_norm": 6.10757177722178, + "learning_rate": 2.4436980505533136e-06, + "loss": 0.6635, + "step": 14449 + }, + { + "epoch": 1.043943143027435, + "grad_norm": 7.1366179928403355, + "learning_rate": 2.4434056227367285e-06, + "loss": 0.6656, + "step": 14450 + }, + { + "epoch": 1.0440153882276446, + "grad_norm": 5.970831543025271, + "learning_rate": 2.4431131956948763e-06, + "loss": 0.6221, + "step": 14451 + }, + { + "epoch": 1.0440876334278542, + "grad_norm": 6.926539655493289, + "learning_rate": 2.4428207694317598e-06, + "loss": 0.6939, + "step": 14452 + }, + { + "epoch": 1.0441598786280637, + "grad_norm": 5.960037983510892, + "learning_rate": 2.4425283439513835e-06, + "loss": 0.7483, + "step": 14453 + }, + { + "epoch": 1.0442321238282732, + "grad_norm": 6.6430524630331895, + "learning_rate": 2.442235919257749e-06, + "loss": 0.6894, + "step": 14454 + }, + { + "epoch": 1.0443043690284828, + "grad_norm": 5.903439776629305, + "learning_rate": 2.44194349535486e-06, + "loss": 0.6322, + "step": 14455 + }, + { + "epoch": 1.044376614228692, + "grad_norm": 6.673718950306909, + "learning_rate": 2.441651072246719e-06, + "loss": 0.6839, + "step": 14456 + }, + { + "epoch": 1.0444488594289016, + "grad_norm": 5.494178118206587, + "learning_rate": 2.4413586499373308e-06, + "loss": 0.682, + "step": 14457 + }, + { + "epoch": 1.0445211046291112, + "grad_norm": 7.396587007473843, + "learning_rate": 2.4410662284306956e-06, + "loss": 0.6326, + "step": 14458 + }, + { + "epoch": 1.0445933498293207, + "grad_norm": 5.797207640283427, + "learning_rate": 2.4407738077308187e-06, + "loss": 0.693, + "step": 14459 + }, + { + "epoch": 1.0446655950295303, + "grad_norm": 6.37007878660528, + "learning_rate": 2.440481387841703e-06, + "loss": 0.6153, + "step": 14460 + }, + { + "epoch": 1.0447378402297398, + "grad_norm": 5.753041002280948, + "learning_rate": 2.44018896876735e-06, + "loss": 0.5524, + "step": 14461 + }, + { + "epoch": 1.0448100854299494, + "grad_norm": 7.6610356535625295, + "learning_rate": 2.439896550511764e-06, + "loss": 0.6576, + "step": 14462 + }, + { + "epoch": 1.0448823306301587, + "grad_norm": 10.565483800423515, + "learning_rate": 2.4396041330789472e-06, + "loss": 0.7272, + "step": 14463 + }, + { + "epoch": 1.0449545758303682, + "grad_norm": 7.469185660835802, + "learning_rate": 2.4393117164729034e-06, + "loss": 0.7262, + "step": 14464 + }, + { + "epoch": 1.0450268210305778, + "grad_norm": 7.313056337761085, + "learning_rate": 2.439019300697635e-06, + "loss": 0.5905, + "step": 14465 + }, + { + "epoch": 1.0450990662307873, + "grad_norm": 6.350592770280231, + "learning_rate": 2.4387268857571446e-06, + "loss": 0.6168, + "step": 14466 + }, + { + "epoch": 1.0451713114309968, + "grad_norm": 5.9422164655486505, + "learning_rate": 2.438434471655436e-06, + "loss": 0.6499, + "step": 14467 + }, + { + "epoch": 1.0452435566312064, + "grad_norm": 7.202028137236814, + "learning_rate": 2.4381420583965113e-06, + "loss": 0.6858, + "step": 14468 + }, + { + "epoch": 1.045315801831416, + "grad_norm": 7.357896271538802, + "learning_rate": 2.4378496459843736e-06, + "loss": 0.6566, + "step": 14469 + }, + { + "epoch": 1.0453880470316252, + "grad_norm": 6.978279201582176, + "learning_rate": 2.4375572344230263e-06, + "loss": 0.6166, + "step": 14470 + }, + { + "epoch": 1.0454602922318348, + "grad_norm": 7.699132043049307, + "learning_rate": 2.437264823716473e-06, + "loss": 0.6262, + "step": 14471 + }, + { + "epoch": 1.0455325374320443, + "grad_norm": 6.4816138581982115, + "learning_rate": 2.4369724138687135e-06, + "loss": 0.7122, + "step": 14472 + }, + { + "epoch": 1.0456047826322539, + "grad_norm": 7.872236463267205, + "learning_rate": 2.4366800048837537e-06, + "loss": 0.7971, + "step": 14473 + }, + { + "epoch": 1.0456770278324634, + "grad_norm": 8.22275552938878, + "learning_rate": 2.436387596765596e-06, + "loss": 0.6572, + "step": 14474 + }, + { + "epoch": 1.045749273032673, + "grad_norm": 5.897858324731403, + "learning_rate": 2.436095189518243e-06, + "loss": 0.6253, + "step": 14475 + }, + { + "epoch": 1.0458215182328825, + "grad_norm": 6.205968555780156, + "learning_rate": 2.435802783145696e-06, + "loss": 0.6458, + "step": 14476 + }, + { + "epoch": 1.0458937634330918, + "grad_norm": 5.432292680993234, + "learning_rate": 2.4355103776519596e-06, + "loss": 0.6497, + "step": 14477 + }, + { + "epoch": 1.0459660086333014, + "grad_norm": 6.548952487728357, + "learning_rate": 2.4352179730410365e-06, + "loss": 0.6543, + "step": 14478 + }, + { + "epoch": 1.046038253833511, + "grad_norm": 8.456964942725048, + "learning_rate": 2.434925569316928e-06, + "loss": 0.6843, + "step": 14479 + }, + { + "epoch": 1.0461104990337204, + "grad_norm": 9.2628472266964, + "learning_rate": 2.4346331664836382e-06, + "loss": 0.6125, + "step": 14480 + }, + { + "epoch": 1.04618274423393, + "grad_norm": 6.693773695959896, + "learning_rate": 2.43434076454517e-06, + "loss": 0.7502, + "step": 14481 + }, + { + "epoch": 1.0462549894341395, + "grad_norm": 7.401985324848164, + "learning_rate": 2.434048363505526e-06, + "loss": 0.7079, + "step": 14482 + }, + { + "epoch": 1.046327234634349, + "grad_norm": 6.329318162495832, + "learning_rate": 2.4337559633687083e-06, + "loss": 0.6733, + "step": 14483 + }, + { + "epoch": 1.0463994798345584, + "grad_norm": 6.219043513900124, + "learning_rate": 2.43346356413872e-06, + "loss": 0.6713, + "step": 14484 + }, + { + "epoch": 1.046471725034768, + "grad_norm": 5.596358729864017, + "learning_rate": 2.4331711658195647e-06, + "loss": 0.6883, + "step": 14485 + }, + { + "epoch": 1.0465439702349775, + "grad_norm": 6.280925989097275, + "learning_rate": 2.4328787684152428e-06, + "loss": 0.6792, + "step": 14486 + }, + { + "epoch": 1.046616215435187, + "grad_norm": 6.087569481925083, + "learning_rate": 2.432586371929759e-06, + "loss": 0.6697, + "step": 14487 + }, + { + "epoch": 1.0466884606353966, + "grad_norm": 7.462220725707575, + "learning_rate": 2.4322939763671153e-06, + "loss": 0.7716, + "step": 14488 + }, + { + "epoch": 1.046760705835606, + "grad_norm": 8.820014076827185, + "learning_rate": 2.4320015817313154e-06, + "loss": 0.6192, + "step": 14489 + }, + { + "epoch": 1.0468329510358156, + "grad_norm": 8.508697996130516, + "learning_rate": 2.4317091880263603e-06, + "loss": 0.6488, + "step": 14490 + }, + { + "epoch": 1.046905196236025, + "grad_norm": 5.991210539552758, + "learning_rate": 2.4314167952562535e-06, + "loss": 0.6876, + "step": 14491 + }, + { + "epoch": 1.0469774414362345, + "grad_norm": 5.4887241605646935, + "learning_rate": 2.4311244034249975e-06, + "loss": 0.6017, + "step": 14492 + }, + { + "epoch": 1.047049686636444, + "grad_norm": 6.224850523352825, + "learning_rate": 2.430832012536595e-06, + "loss": 0.6248, + "step": 14493 + }, + { + "epoch": 1.0471219318366536, + "grad_norm": 8.350773173644237, + "learning_rate": 2.430539622595048e-06, + "loss": 0.6374, + "step": 14494 + }, + { + "epoch": 1.0471941770368631, + "grad_norm": 6.954406208768131, + "learning_rate": 2.43024723360436e-06, + "loss": 0.6496, + "step": 14495 + }, + { + "epoch": 1.0472664222370727, + "grad_norm": 5.18735862447367, + "learning_rate": 2.4299548455685336e-06, + "loss": 0.6813, + "step": 14496 + }, + { + "epoch": 1.0473386674372822, + "grad_norm": 5.792193967384945, + "learning_rate": 2.4296624584915707e-06, + "loss": 0.5937, + "step": 14497 + }, + { + "epoch": 1.0474109126374918, + "grad_norm": 8.083160188793446, + "learning_rate": 2.429370072377474e-06, + "loss": 0.7188, + "step": 14498 + }, + { + "epoch": 1.047483157837701, + "grad_norm": 6.469725309551765, + "learning_rate": 2.429077687230246e-06, + "loss": 0.6639, + "step": 14499 + }, + { + "epoch": 1.0475554030379106, + "grad_norm": 6.042178358135701, + "learning_rate": 2.42878530305389e-06, + "loss": 0.7263, + "step": 14500 + }, + { + "epoch": 1.0476276482381202, + "grad_norm": 6.128153397933469, + "learning_rate": 2.4284929198524078e-06, + "loss": 0.6124, + "step": 14501 + }, + { + "epoch": 1.0476998934383297, + "grad_norm": 6.396186777428255, + "learning_rate": 2.4282005376298012e-06, + "loss": 0.6551, + "step": 14502 + }, + { + "epoch": 1.0477721386385392, + "grad_norm": 6.56576602136153, + "learning_rate": 2.427908156390075e-06, + "loss": 0.6761, + "step": 14503 + }, + { + "epoch": 1.0478443838387488, + "grad_norm": 6.4169371246543765, + "learning_rate": 2.4276157761372294e-06, + "loss": 0.6606, + "step": 14504 + }, + { + "epoch": 1.0479166290389583, + "grad_norm": 6.506896834966606, + "learning_rate": 2.4273233968752673e-06, + "loss": 0.6077, + "step": 14505 + }, + { + "epoch": 1.0479888742391676, + "grad_norm": 7.347247390687937, + "learning_rate": 2.4270310186081916e-06, + "loss": 0.7181, + "step": 14506 + }, + { + "epoch": 1.0480611194393772, + "grad_norm": 6.755039170813057, + "learning_rate": 2.4267386413400044e-06, + "loss": 0.6254, + "step": 14507 + }, + { + "epoch": 1.0481333646395867, + "grad_norm": 5.64233947228089, + "learning_rate": 2.4264462650747085e-06, + "loss": 0.6438, + "step": 14508 + }, + { + "epoch": 1.0482056098397963, + "grad_norm": 7.064483904066209, + "learning_rate": 2.426153889816306e-06, + "loss": 0.7013, + "step": 14509 + }, + { + "epoch": 1.0482778550400058, + "grad_norm": 6.194709661913959, + "learning_rate": 2.4258615155688e-06, + "loss": 0.6467, + "step": 14510 + }, + { + "epoch": 1.0483501002402154, + "grad_norm": 6.780387014860513, + "learning_rate": 2.425569142336192e-06, + "loss": 0.6442, + "step": 14511 + }, + { + "epoch": 1.048422345440425, + "grad_norm": 6.7107527577525605, + "learning_rate": 2.425276770122484e-06, + "loss": 0.5727, + "step": 14512 + }, + { + "epoch": 1.0484945906406342, + "grad_norm": 6.730505874008904, + "learning_rate": 2.4249843989316795e-06, + "loss": 0.6734, + "step": 14513 + }, + { + "epoch": 1.0485668358408438, + "grad_norm": 6.353832031777132, + "learning_rate": 2.4246920287677813e-06, + "loss": 0.6835, + "step": 14514 + }, + { + "epoch": 1.0486390810410533, + "grad_norm": 6.185583260865511, + "learning_rate": 2.424399659634789e-06, + "loss": 0.6981, + "step": 14515 + }, + { + "epoch": 1.0487113262412628, + "grad_norm": 5.502730298766858, + "learning_rate": 2.4241072915367073e-06, + "loss": 0.6797, + "step": 14516 + }, + { + "epoch": 1.0487835714414724, + "grad_norm": 7.5006914455845495, + "learning_rate": 2.4238149244775393e-06, + "loss": 0.7091, + "step": 14517 + }, + { + "epoch": 1.048855816641682, + "grad_norm": 6.767826699980342, + "learning_rate": 2.423522558461284e-06, + "loss": 0.7302, + "step": 14518 + }, + { + "epoch": 1.0489280618418915, + "grad_norm": 6.099897990390009, + "learning_rate": 2.423230193491946e-06, + "loss": 0.6459, + "step": 14519 + }, + { + "epoch": 1.0490003070421008, + "grad_norm": 6.918190692462545, + "learning_rate": 2.422937829573527e-06, + "loss": 0.6578, + "step": 14520 + }, + { + "epoch": 1.0490725522423103, + "grad_norm": 6.67063216843886, + "learning_rate": 2.4226454667100295e-06, + "loss": 0.6208, + "step": 14521 + }, + { + "epoch": 1.0491447974425199, + "grad_norm": 6.280222338974403, + "learning_rate": 2.4223531049054554e-06, + "loss": 0.6584, + "step": 14522 + }, + { + "epoch": 1.0492170426427294, + "grad_norm": 6.093425096506749, + "learning_rate": 2.422060744163807e-06, + "loss": 0.6552, + "step": 14523 + }, + { + "epoch": 1.049289287842939, + "grad_norm": 7.2582232438099075, + "learning_rate": 2.421768384489086e-06, + "loss": 0.623, + "step": 14524 + }, + { + "epoch": 1.0493615330431485, + "grad_norm": 7.681765022114415, + "learning_rate": 2.421476025885296e-06, + "loss": 0.7174, + "step": 14525 + }, + { + "epoch": 1.049433778243358, + "grad_norm": 8.43063278484077, + "learning_rate": 2.4211836683564372e-06, + "loss": 0.6383, + "step": 14526 + }, + { + "epoch": 1.0495060234435676, + "grad_norm": 6.4662090231339855, + "learning_rate": 2.420891311906513e-06, + "loss": 0.6592, + "step": 14527 + }, + { + "epoch": 1.049578268643777, + "grad_norm": 9.013065179676971, + "learning_rate": 2.420598956539527e-06, + "loss": 0.6365, + "step": 14528 + }, + { + "epoch": 1.0496505138439864, + "grad_norm": 8.442989117995003, + "learning_rate": 2.4203066022594775e-06, + "loss": 0.7166, + "step": 14529 + }, + { + "epoch": 1.049722759044196, + "grad_norm": 6.681305199761741, + "learning_rate": 2.4200142490703694e-06, + "loss": 0.6413, + "step": 14530 + }, + { + "epoch": 1.0497950042444055, + "grad_norm": 6.65384483761433, + "learning_rate": 2.4197218969762043e-06, + "loss": 0.6286, + "step": 14531 + }, + { + "epoch": 1.049867249444615, + "grad_norm": 7.547161088974429, + "learning_rate": 2.419429545980985e-06, + "loss": 0.711, + "step": 14532 + }, + { + "epoch": 1.0499394946448246, + "grad_norm": 7.936828104252058, + "learning_rate": 2.4191371960887115e-06, + "loss": 0.7339, + "step": 14533 + }, + { + "epoch": 1.0500117398450342, + "grad_norm": 8.316198917096136, + "learning_rate": 2.418844847303387e-06, + "loss": 0.6336, + "step": 14534 + }, + { + "epoch": 1.0500839850452435, + "grad_norm": 6.657533906856747, + "learning_rate": 2.418552499629014e-06, + "loss": 0.6161, + "step": 14535 + }, + { + "epoch": 1.050156230245453, + "grad_norm": 7.633450437533173, + "learning_rate": 2.418260153069594e-06, + "loss": 0.6736, + "step": 14536 + }, + { + "epoch": 1.0502284754456626, + "grad_norm": 5.855698390781311, + "learning_rate": 2.4179678076291284e-06, + "loss": 0.6557, + "step": 14537 + }, + { + "epoch": 1.050300720645872, + "grad_norm": 8.72906474658641, + "learning_rate": 2.4176754633116202e-06, + "loss": 0.6088, + "step": 14538 + }, + { + "epoch": 1.0503729658460816, + "grad_norm": 6.254214887371705, + "learning_rate": 2.417383120121071e-06, + "loss": 0.6534, + "step": 14539 + }, + { + "epoch": 1.0504452110462912, + "grad_norm": 6.124985052596557, + "learning_rate": 2.4170907780614826e-06, + "loss": 0.6304, + "step": 14540 + }, + { + "epoch": 1.0505174562465007, + "grad_norm": 6.941160593746812, + "learning_rate": 2.4167984371368573e-06, + "loss": 0.7275, + "step": 14541 + }, + { + "epoch": 1.05058970144671, + "grad_norm": 7.726304037401657, + "learning_rate": 2.4165060973511977e-06, + "loss": 0.7321, + "step": 14542 + }, + { + "epoch": 1.0506619466469196, + "grad_norm": 5.409560408298918, + "learning_rate": 2.416213758708503e-06, + "loss": 0.6234, + "step": 14543 + }, + { + "epoch": 1.0507341918471291, + "grad_norm": 8.394177698755325, + "learning_rate": 2.4159214212127778e-06, + "loss": 0.6756, + "step": 14544 + }, + { + "epoch": 1.0508064370473387, + "grad_norm": 7.039619855587491, + "learning_rate": 2.4156290848680227e-06, + "loss": 0.7404, + "step": 14545 + }, + { + "epoch": 1.0508786822475482, + "grad_norm": 5.855070358883034, + "learning_rate": 2.415336749678241e-06, + "loss": 0.6358, + "step": 14546 + }, + { + "epoch": 1.0509509274477578, + "grad_norm": 8.062251597281685, + "learning_rate": 2.4150444156474326e-06, + "loss": 0.6932, + "step": 14547 + }, + { + "epoch": 1.0510231726479673, + "grad_norm": 7.41531679820717, + "learning_rate": 2.4147520827796e-06, + "loss": 0.764, + "step": 14548 + }, + { + "epoch": 1.0510954178481766, + "grad_norm": 6.39051367329301, + "learning_rate": 2.414459751078745e-06, + "loss": 0.6854, + "step": 14549 + }, + { + "epoch": 1.0511676630483862, + "grad_norm": 7.496624250460195, + "learning_rate": 2.4141674205488707e-06, + "loss": 0.6771, + "step": 14550 + }, + { + "epoch": 1.0512399082485957, + "grad_norm": 6.052469351752741, + "learning_rate": 2.4138750911939767e-06, + "loss": 0.641, + "step": 14551 + }, + { + "epoch": 1.0513121534488052, + "grad_norm": 6.59788838910362, + "learning_rate": 2.4135827630180663e-06, + "loss": 0.6471, + "step": 14552 + }, + { + "epoch": 1.0513843986490148, + "grad_norm": 6.274464752348745, + "learning_rate": 2.413290436025141e-06, + "loss": 0.699, + "step": 14553 + }, + { + "epoch": 1.0514566438492243, + "grad_norm": 6.458746357498338, + "learning_rate": 2.4129981102192017e-06, + "loss": 0.6723, + "step": 14554 + }, + { + "epoch": 1.0515288890494339, + "grad_norm": 8.862630471370807, + "learning_rate": 2.412705785604251e-06, + "loss": 0.7241, + "step": 14555 + }, + { + "epoch": 1.0516011342496432, + "grad_norm": 6.012907132582258, + "learning_rate": 2.41241346218429e-06, + "loss": 0.6682, + "step": 14556 + }, + { + "epoch": 1.0516733794498527, + "grad_norm": 6.101710045431168, + "learning_rate": 2.4121211399633215e-06, + "loss": 0.7036, + "step": 14557 + }, + { + "epoch": 1.0517456246500623, + "grad_norm": 7.509608154233279, + "learning_rate": 2.411828818945346e-06, + "loss": 0.7174, + "step": 14558 + }, + { + "epoch": 1.0518178698502718, + "grad_norm": 6.443007696336302, + "learning_rate": 2.411536499134365e-06, + "loss": 0.6894, + "step": 14559 + }, + { + "epoch": 1.0518901150504814, + "grad_norm": 6.315674238659494, + "learning_rate": 2.411244180534382e-06, + "loss": 0.6852, + "step": 14560 + }, + { + "epoch": 1.051962360250691, + "grad_norm": 6.8857448882467045, + "learning_rate": 2.410951863149396e-06, + "loss": 0.6485, + "step": 14561 + }, + { + "epoch": 1.0520346054509004, + "grad_norm": 7.236168657571513, + "learning_rate": 2.41065954698341e-06, + "loss": 0.6546, + "step": 14562 + }, + { + "epoch": 1.0521068506511098, + "grad_norm": 6.5850402293211046, + "learning_rate": 2.410367232040425e-06, + "loss": 0.6546, + "step": 14563 + }, + { + "epoch": 1.0521790958513193, + "grad_norm": 6.934049830950558, + "learning_rate": 2.4100749183244444e-06, + "loss": 0.6913, + "step": 14564 + }, + { + "epoch": 1.0522513410515288, + "grad_norm": 6.695917553976336, + "learning_rate": 2.4097826058394674e-06, + "loss": 0.7087, + "step": 14565 + }, + { + "epoch": 1.0523235862517384, + "grad_norm": 6.531737678974378, + "learning_rate": 2.4094902945894965e-06, + "loss": 0.6861, + "step": 14566 + }, + { + "epoch": 1.052395831451948, + "grad_norm": 6.090035113153565, + "learning_rate": 2.4091979845785337e-06, + "loss": 0.705, + "step": 14567 + }, + { + "epoch": 1.0524680766521575, + "grad_norm": 8.562281696293324, + "learning_rate": 2.4089056758105795e-06, + "loss": 0.6629, + "step": 14568 + }, + { + "epoch": 1.052540321852367, + "grad_norm": 7.987349282814282, + "learning_rate": 2.4086133682896356e-06, + "loss": 0.7369, + "step": 14569 + }, + { + "epoch": 1.0526125670525763, + "grad_norm": 5.873075778840581, + "learning_rate": 2.408321062019704e-06, + "loss": 0.608, + "step": 14570 + }, + { + "epoch": 1.0526848122527859, + "grad_norm": 6.436221097554182, + "learning_rate": 2.4080287570047863e-06, + "loss": 0.6139, + "step": 14571 + }, + { + "epoch": 1.0527570574529954, + "grad_norm": 5.579663831093911, + "learning_rate": 2.407736453248883e-06, + "loss": 0.6999, + "step": 14572 + }, + { + "epoch": 1.052829302653205, + "grad_norm": 7.080829701487998, + "learning_rate": 2.4074441507559963e-06, + "loss": 0.663, + "step": 14573 + }, + { + "epoch": 1.0529015478534145, + "grad_norm": 5.324595605243181, + "learning_rate": 2.407151849530128e-06, + "loss": 0.5862, + "step": 14574 + }, + { + "epoch": 1.052973793053624, + "grad_norm": 5.660464239593326, + "learning_rate": 2.406859549575278e-06, + "loss": 0.5747, + "step": 14575 + }, + { + "epoch": 1.0530460382538336, + "grad_norm": 6.1633260115309545, + "learning_rate": 2.406567250895448e-06, + "loss": 0.6396, + "step": 14576 + }, + { + "epoch": 1.0531182834540431, + "grad_norm": 7.580421084179728, + "learning_rate": 2.4062749534946395e-06, + "loss": 0.7372, + "step": 14577 + }, + { + "epoch": 1.0531905286542524, + "grad_norm": 8.177590000143638, + "learning_rate": 2.405982657376856e-06, + "loss": 0.6402, + "step": 14578 + }, + { + "epoch": 1.053262773854462, + "grad_norm": 6.171909612244718, + "learning_rate": 2.4056903625460954e-06, + "loss": 0.6663, + "step": 14579 + }, + { + "epoch": 1.0533350190546715, + "grad_norm": 7.090276329778494, + "learning_rate": 2.4053980690063603e-06, + "loss": 0.6939, + "step": 14580 + }, + { + "epoch": 1.053407264254881, + "grad_norm": 7.815068913581213, + "learning_rate": 2.4051057767616527e-06, + "loss": 0.6527, + "step": 14581 + }, + { + "epoch": 1.0534795094550906, + "grad_norm": 7.307872644894282, + "learning_rate": 2.4048134858159743e-06, + "loss": 0.6712, + "step": 14582 + }, + { + "epoch": 1.0535517546553002, + "grad_norm": 5.926371857412762, + "learning_rate": 2.404521196173324e-06, + "loss": 0.663, + "step": 14583 + }, + { + "epoch": 1.0536239998555097, + "grad_norm": 8.192878645387522, + "learning_rate": 2.4042289078377047e-06, + "loss": 0.6292, + "step": 14584 + }, + { + "epoch": 1.053696245055719, + "grad_norm": 6.617337095693184, + "learning_rate": 2.4039366208131176e-06, + "loss": 0.6747, + "step": 14585 + }, + { + "epoch": 1.0537684902559286, + "grad_norm": 7.395974801729808, + "learning_rate": 2.4036443351035635e-06, + "loss": 0.7052, + "step": 14586 + }, + { + "epoch": 1.053840735456138, + "grad_norm": 6.819495852454243, + "learning_rate": 2.4033520507130434e-06, + "loss": 0.6598, + "step": 14587 + }, + { + "epoch": 1.0539129806563476, + "grad_norm": 6.711692332403486, + "learning_rate": 2.4030597676455587e-06, + "loss": 0.6898, + "step": 14588 + }, + { + "epoch": 1.0539852258565572, + "grad_norm": 7.37942914993141, + "learning_rate": 2.4027674859051112e-06, + "loss": 0.6709, + "step": 14589 + }, + { + "epoch": 1.0540574710567667, + "grad_norm": 6.515381440291888, + "learning_rate": 2.4024752054957006e-06, + "loss": 0.6091, + "step": 14590 + }, + { + "epoch": 1.0541297162569763, + "grad_norm": 6.389840000877675, + "learning_rate": 2.4021829264213288e-06, + "loss": 0.691, + "step": 14591 + }, + { + "epoch": 1.0542019614571856, + "grad_norm": 7.055861602446639, + "learning_rate": 2.4018906486859974e-06, + "loss": 0.7513, + "step": 14592 + }, + { + "epoch": 1.0542742066573951, + "grad_norm": 5.673349740482428, + "learning_rate": 2.401598372293706e-06, + "loss": 0.6891, + "step": 14593 + }, + { + "epoch": 1.0543464518576047, + "grad_norm": 6.967441589964735, + "learning_rate": 2.4013060972484566e-06, + "loss": 0.6614, + "step": 14594 + }, + { + "epoch": 1.0544186970578142, + "grad_norm": 5.911997748368058, + "learning_rate": 2.4010138235542503e-06, + "loss": 0.6314, + "step": 14595 + }, + { + "epoch": 1.0544909422580238, + "grad_norm": 7.345257024143704, + "learning_rate": 2.400721551215088e-06, + "loss": 0.7083, + "step": 14596 + }, + { + "epoch": 1.0545631874582333, + "grad_norm": 7.296449963363766, + "learning_rate": 2.4004292802349706e-06, + "loss": 0.7303, + "step": 14597 + }, + { + "epoch": 1.0546354326584428, + "grad_norm": 6.38856170848533, + "learning_rate": 2.4001370106178986e-06, + "loss": 0.6706, + "step": 14598 + }, + { + "epoch": 1.0547076778586522, + "grad_norm": 7.834276778309543, + "learning_rate": 2.399844742367874e-06, + "loss": 0.6963, + "step": 14599 + }, + { + "epoch": 1.0547799230588617, + "grad_norm": 5.761922003830202, + "learning_rate": 2.3995524754888968e-06, + "loss": 0.628, + "step": 14600 + }, + { + "epoch": 1.0548521682590712, + "grad_norm": 8.126533246258152, + "learning_rate": 2.399260209984968e-06, + "loss": 0.7221, + "step": 14601 + }, + { + "epoch": 1.0549244134592808, + "grad_norm": 5.868350222360233, + "learning_rate": 2.3989679458600886e-06, + "loss": 0.5969, + "step": 14602 + }, + { + "epoch": 1.0549966586594903, + "grad_norm": 6.971989310715448, + "learning_rate": 2.398675683118261e-06, + "loss": 0.6468, + "step": 14603 + }, + { + "epoch": 1.0550689038596999, + "grad_norm": 6.629213828401224, + "learning_rate": 2.3983834217634834e-06, + "loss": 0.6163, + "step": 14604 + }, + { + "epoch": 1.0551411490599094, + "grad_norm": 5.815422717906193, + "learning_rate": 2.3980911617997575e-06, + "loss": 0.6853, + "step": 14605 + }, + { + "epoch": 1.055213394260119, + "grad_norm": 6.511344985898891, + "learning_rate": 2.3977989032310847e-06, + "loss": 0.6677, + "step": 14606 + }, + { + "epoch": 1.0552856394603283, + "grad_norm": 6.386206855441521, + "learning_rate": 2.3975066460614663e-06, + "loss": 0.6174, + "step": 14607 + }, + { + "epoch": 1.0553578846605378, + "grad_norm": 5.7017686877154965, + "learning_rate": 2.3972143902949017e-06, + "loss": 0.6663, + "step": 14608 + }, + { + "epoch": 1.0554301298607474, + "grad_norm": 5.681210236363011, + "learning_rate": 2.3969221359353923e-06, + "loss": 0.6383, + "step": 14609 + }, + { + "epoch": 1.055502375060957, + "grad_norm": 5.867342889711596, + "learning_rate": 2.3966298829869393e-06, + "loss": 0.6945, + "step": 14610 + }, + { + "epoch": 1.0555746202611664, + "grad_norm": 5.544724254304664, + "learning_rate": 2.3963376314535426e-06, + "loss": 0.6129, + "step": 14611 + }, + { + "epoch": 1.055646865461376, + "grad_norm": 6.262026088546947, + "learning_rate": 2.396045381339203e-06, + "loss": 0.5894, + "step": 14612 + }, + { + "epoch": 1.0557191106615855, + "grad_norm": 6.264256506275586, + "learning_rate": 2.3957531326479216e-06, + "loss": 0.6917, + "step": 14613 + }, + { + "epoch": 1.0557913558617948, + "grad_norm": 6.828916595454639, + "learning_rate": 2.3954608853836992e-06, + "loss": 0.6826, + "step": 14614 + }, + { + "epoch": 1.0558636010620044, + "grad_norm": 6.274529804987911, + "learning_rate": 2.3951686395505356e-06, + "loss": 0.7871, + "step": 14615 + }, + { + "epoch": 1.055935846262214, + "grad_norm": 6.187301478428293, + "learning_rate": 2.3948763951524322e-06, + "loss": 0.6609, + "step": 14616 + }, + { + "epoch": 1.0560080914624235, + "grad_norm": 5.338374775084437, + "learning_rate": 2.39458415219339e-06, + "loss": 0.5847, + "step": 14617 + }, + { + "epoch": 1.056080336662633, + "grad_norm": 6.067541484148686, + "learning_rate": 2.394291910677408e-06, + "loss": 0.5903, + "step": 14618 + }, + { + "epoch": 1.0561525818628426, + "grad_norm": 6.448468095888465, + "learning_rate": 2.3939996706084874e-06, + "loss": 0.685, + "step": 14619 + }, + { + "epoch": 1.056224827063052, + "grad_norm": 8.335360420365896, + "learning_rate": 2.3937074319906288e-06, + "loss": 0.6957, + "step": 14620 + }, + { + "epoch": 1.0562970722632614, + "grad_norm": 6.840969687988148, + "learning_rate": 2.3934151948278346e-06, + "loss": 0.6341, + "step": 14621 + }, + { + "epoch": 1.056369317463471, + "grad_norm": 7.203840439490364, + "learning_rate": 2.3931229591241026e-06, + "loss": 0.6889, + "step": 14622 + }, + { + "epoch": 1.0564415626636805, + "grad_norm": 6.904008182171585, + "learning_rate": 2.392830724883434e-06, + "loss": 0.7364, + "step": 14623 + }, + { + "epoch": 1.05651380786389, + "grad_norm": 5.770750997715697, + "learning_rate": 2.3925384921098304e-06, + "loss": 0.6664, + "step": 14624 + }, + { + "epoch": 1.0565860530640996, + "grad_norm": 6.202178806043782, + "learning_rate": 2.392246260807291e-06, + "loss": 0.7006, + "step": 14625 + }, + { + "epoch": 1.0566582982643091, + "grad_norm": 5.758078166572878, + "learning_rate": 2.391954030979816e-06, + "loss": 0.5793, + "step": 14626 + }, + { + "epoch": 1.0567305434645187, + "grad_norm": 6.190828295185447, + "learning_rate": 2.3916618026314068e-06, + "loss": 0.6831, + "step": 14627 + }, + { + "epoch": 1.056802788664728, + "grad_norm": 6.431120442826932, + "learning_rate": 2.3913695757660637e-06, + "loss": 0.6696, + "step": 14628 + }, + { + "epoch": 1.0568750338649375, + "grad_norm": 6.177588954085933, + "learning_rate": 2.3910773503877866e-06, + "loss": 0.6527, + "step": 14629 + }, + { + "epoch": 1.056947279065147, + "grad_norm": 6.276075062854178, + "learning_rate": 2.390785126500576e-06, + "loss": 0.6303, + "step": 14630 + }, + { + "epoch": 1.0570195242653566, + "grad_norm": 6.110454493224668, + "learning_rate": 2.390492904108432e-06, + "loss": 0.607, + "step": 14631 + }, + { + "epoch": 1.0570917694655662, + "grad_norm": 6.627436639633269, + "learning_rate": 2.390200683215356e-06, + "loss": 0.6602, + "step": 14632 + }, + { + "epoch": 1.0571640146657757, + "grad_norm": 7.752473744013025, + "learning_rate": 2.389908463825347e-06, + "loss": 0.6347, + "step": 14633 + }, + { + "epoch": 1.0572362598659852, + "grad_norm": 6.842608909757632, + "learning_rate": 2.3896162459424045e-06, + "loss": 0.6621, + "step": 14634 + }, + { + "epoch": 1.0573085050661946, + "grad_norm": 8.77848712721479, + "learning_rate": 2.3893240295705313e-06, + "loss": 0.6827, + "step": 14635 + }, + { + "epoch": 1.057380750266404, + "grad_norm": 6.745350967449527, + "learning_rate": 2.3890318147137255e-06, + "loss": 0.7297, + "step": 14636 + }, + { + "epoch": 1.0574529954666136, + "grad_norm": 5.90076979448279, + "learning_rate": 2.3887396013759883e-06, + "loss": 0.6567, + "step": 14637 + }, + { + "epoch": 1.0575252406668232, + "grad_norm": 5.903584842734229, + "learning_rate": 2.388447389561319e-06, + "loss": 0.631, + "step": 14638 + }, + { + "epoch": 1.0575974858670327, + "grad_norm": 6.061492521595277, + "learning_rate": 2.388155179273719e-06, + "loss": 0.6173, + "step": 14639 + }, + { + "epoch": 1.0576697310672423, + "grad_norm": 7.619843036707038, + "learning_rate": 2.3878629705171875e-06, + "loss": 0.6889, + "step": 14640 + }, + { + "epoch": 1.0577419762674518, + "grad_norm": 8.57962884558124, + "learning_rate": 2.3875707632957248e-06, + "loss": 0.7337, + "step": 14641 + }, + { + "epoch": 1.0578142214676611, + "grad_norm": 6.863319600952896, + "learning_rate": 2.3872785576133315e-06, + "loss": 0.6498, + "step": 14642 + }, + { + "epoch": 1.0578864666678707, + "grad_norm": 5.5153476548099825, + "learning_rate": 2.386986353474007e-06, + "loss": 0.6858, + "step": 14643 + }, + { + "epoch": 1.0579587118680802, + "grad_norm": 6.024135999342496, + "learning_rate": 2.386694150881751e-06, + "loss": 0.6804, + "step": 14644 + }, + { + "epoch": 1.0580309570682898, + "grad_norm": 6.171802066420335, + "learning_rate": 2.386401949840564e-06, + "loss": 0.6717, + "step": 14645 + }, + { + "epoch": 1.0581032022684993, + "grad_norm": 6.909139865720047, + "learning_rate": 2.3861097503544476e-06, + "loss": 0.687, + "step": 14646 + }, + { + "epoch": 1.0581754474687088, + "grad_norm": 7.925431816779567, + "learning_rate": 2.3858175524273995e-06, + "loss": 0.7599, + "step": 14647 + }, + { + "epoch": 1.0582476926689184, + "grad_norm": 6.192809014000571, + "learning_rate": 2.3855253560634194e-06, + "loss": 0.6981, + "step": 14648 + }, + { + "epoch": 1.058319937869128, + "grad_norm": 6.056697310078175, + "learning_rate": 2.38523316126651e-06, + "loss": 0.633, + "step": 14649 + }, + { + "epoch": 1.0583921830693372, + "grad_norm": 6.535411914081526, + "learning_rate": 2.3849409680406684e-06, + "loss": 0.6709, + "step": 14650 + }, + { + "epoch": 1.0584644282695468, + "grad_norm": 5.984617542102765, + "learning_rate": 2.3846487763898955e-06, + "loss": 0.6633, + "step": 14651 + }, + { + "epoch": 1.0585366734697563, + "grad_norm": 6.32065863303975, + "learning_rate": 2.3843565863181916e-06, + "loss": 0.6291, + "step": 14652 + }, + { + "epoch": 1.0586089186699659, + "grad_norm": 6.418562208658261, + "learning_rate": 2.384064397829557e-06, + "loss": 0.7634, + "step": 14653 + }, + { + "epoch": 1.0586811638701754, + "grad_norm": 5.799830585504774, + "learning_rate": 2.38377221092799e-06, + "loss": 0.7291, + "step": 14654 + }, + { + "epoch": 1.058753409070385, + "grad_norm": 6.222750962967997, + "learning_rate": 2.383480025617491e-06, + "loss": 0.6536, + "step": 14655 + }, + { + "epoch": 1.0588256542705945, + "grad_norm": 7.739304576455719, + "learning_rate": 2.3831878419020598e-06, + "loss": 0.6603, + "step": 14656 + }, + { + "epoch": 1.0588978994708038, + "grad_norm": 8.389277344636687, + "learning_rate": 2.3828956597856973e-06, + "loss": 0.7107, + "step": 14657 + }, + { + "epoch": 1.0589701446710134, + "grad_norm": 5.664717769208379, + "learning_rate": 2.3826034792724014e-06, + "loss": 0.616, + "step": 14658 + }, + { + "epoch": 1.059042389871223, + "grad_norm": 6.986096061284173, + "learning_rate": 2.382311300366173e-06, + "loss": 0.6973, + "step": 14659 + }, + { + "epoch": 1.0591146350714324, + "grad_norm": 6.468536225992897, + "learning_rate": 2.3820191230710125e-06, + "loss": 0.6283, + "step": 14660 + }, + { + "epoch": 1.059186880271642, + "grad_norm": 5.753911553799321, + "learning_rate": 2.3817269473909176e-06, + "loss": 0.6261, + "step": 14661 + }, + { + "epoch": 1.0592591254718515, + "grad_norm": 6.943227242501546, + "learning_rate": 2.3814347733298884e-06, + "loss": 0.6964, + "step": 14662 + }, + { + "epoch": 1.059331370672061, + "grad_norm": 6.629439683715324, + "learning_rate": 2.3811426008919256e-06, + "loss": 0.6414, + "step": 14663 + }, + { + "epoch": 1.0594036158722704, + "grad_norm": 6.633248430536198, + "learning_rate": 2.3808504300810296e-06, + "loss": 0.6666, + "step": 14664 + }, + { + "epoch": 1.05947586107248, + "grad_norm": 7.8227669354030205, + "learning_rate": 2.3805582609011972e-06, + "loss": 0.6557, + "step": 14665 + }, + { + "epoch": 1.0595481062726895, + "grad_norm": 7.961140907990812, + "learning_rate": 2.38026609335643e-06, + "loss": 0.6345, + "step": 14666 + }, + { + "epoch": 1.059620351472899, + "grad_norm": 8.299294444050227, + "learning_rate": 2.379973927450727e-06, + "loss": 0.7269, + "step": 14667 + }, + { + "epoch": 1.0596925966731086, + "grad_norm": 6.359138053210133, + "learning_rate": 2.3796817631880873e-06, + "loss": 0.7041, + "step": 14668 + }, + { + "epoch": 1.059764841873318, + "grad_norm": 7.351071867345397, + "learning_rate": 2.379389600572511e-06, + "loss": 0.7346, + "step": 14669 + }, + { + "epoch": 1.0598370870735276, + "grad_norm": 7.806622789337269, + "learning_rate": 2.3790974396079976e-06, + "loss": 0.6743, + "step": 14670 + }, + { + "epoch": 1.059909332273737, + "grad_norm": 7.684370691050795, + "learning_rate": 2.3788052802985466e-06, + "loss": 0.7778, + "step": 14671 + }, + { + "epoch": 1.0599815774739465, + "grad_norm": 6.783233211943784, + "learning_rate": 2.3785131226481565e-06, + "loss": 0.5983, + "step": 14672 + }, + { + "epoch": 1.060053822674156, + "grad_norm": 7.0624962190601055, + "learning_rate": 2.3782209666608276e-06, + "loss": 0.6475, + "step": 14673 + }, + { + "epoch": 1.0601260678743656, + "grad_norm": 8.86705370562607, + "learning_rate": 2.3779288123405607e-06, + "loss": 0.7736, + "step": 14674 + }, + { + "epoch": 1.0601983130745751, + "grad_norm": 6.298008667625842, + "learning_rate": 2.377636659691352e-06, + "loss": 0.6361, + "step": 14675 + }, + { + "epoch": 1.0602705582747847, + "grad_norm": 6.305834886784205, + "learning_rate": 2.3773445087172016e-06, + "loss": 0.6784, + "step": 14676 + }, + { + "epoch": 1.0603428034749942, + "grad_norm": 7.997386266979449, + "learning_rate": 2.3770523594221106e-06, + "loss": 0.6736, + "step": 14677 + }, + { + "epoch": 1.0604150486752038, + "grad_norm": 7.462426737005583, + "learning_rate": 2.3767602118100786e-06, + "loss": 0.6789, + "step": 14678 + }, + { + "epoch": 1.060487293875413, + "grad_norm": 6.601982870769034, + "learning_rate": 2.376468065885102e-06, + "loss": 0.6953, + "step": 14679 + }, + { + "epoch": 1.0605595390756226, + "grad_norm": 6.058145119088477, + "learning_rate": 2.376175921651182e-06, + "loss": 0.6932, + "step": 14680 + }, + { + "epoch": 1.0606317842758322, + "grad_norm": 5.442847067298332, + "learning_rate": 2.3758837791123175e-06, + "loss": 0.677, + "step": 14681 + }, + { + "epoch": 1.0607040294760417, + "grad_norm": 6.037894744289208, + "learning_rate": 2.3755916382725084e-06, + "loss": 0.5829, + "step": 14682 + }, + { + "epoch": 1.0607762746762512, + "grad_norm": 5.7903818951304995, + "learning_rate": 2.375299499135752e-06, + "loss": 0.6753, + "step": 14683 + }, + { + "epoch": 1.0608485198764608, + "grad_norm": 6.596154520291835, + "learning_rate": 2.3750073617060494e-06, + "loss": 0.6738, + "step": 14684 + }, + { + "epoch": 1.0609207650766703, + "grad_norm": 5.996891488057063, + "learning_rate": 2.374715225987399e-06, + "loss": 0.7181, + "step": 14685 + }, + { + "epoch": 1.0609930102768796, + "grad_norm": 7.310443556409903, + "learning_rate": 2.3744230919837996e-06, + "loss": 0.722, + "step": 14686 + }, + { + "epoch": 1.0610652554770892, + "grad_norm": 5.8217670031373085, + "learning_rate": 2.3741309596992503e-06, + "loss": 0.6476, + "step": 14687 + }, + { + "epoch": 1.0611375006772987, + "grad_norm": 5.999013183823444, + "learning_rate": 2.3738388291377506e-06, + "loss": 0.6078, + "step": 14688 + }, + { + "epoch": 1.0612097458775083, + "grad_norm": 6.108682188278629, + "learning_rate": 2.3735467003033007e-06, + "loss": 0.6382, + "step": 14689 + }, + { + "epoch": 1.0612819910777178, + "grad_norm": 6.850290178153211, + "learning_rate": 2.373254573199897e-06, + "loss": 0.6804, + "step": 14690 + }, + { + "epoch": 1.0613542362779274, + "grad_norm": 5.767938909410823, + "learning_rate": 2.372962447831539e-06, + "loss": 0.6776, + "step": 14691 + }, + { + "epoch": 1.061426481478137, + "grad_norm": 5.427554099763482, + "learning_rate": 2.372670324202228e-06, + "loss": 0.6301, + "step": 14692 + }, + { + "epoch": 1.0614987266783462, + "grad_norm": 7.123368193499899, + "learning_rate": 2.372378202315961e-06, + "loss": 0.6171, + "step": 14693 + }, + { + "epoch": 1.0615709718785558, + "grad_norm": 6.577230143446653, + "learning_rate": 2.3720860821767366e-06, + "loss": 0.6873, + "step": 14694 + }, + { + "epoch": 1.0616432170787653, + "grad_norm": 7.2807412686078985, + "learning_rate": 2.3717939637885548e-06, + "loss": 0.6188, + "step": 14695 + }, + { + "epoch": 1.0617154622789748, + "grad_norm": 7.439190840660665, + "learning_rate": 2.3715018471554146e-06, + "loss": 0.6357, + "step": 14696 + }, + { + "epoch": 1.0617877074791844, + "grad_norm": 7.556062086117846, + "learning_rate": 2.3712097322813136e-06, + "loss": 0.666, + "step": 14697 + }, + { + "epoch": 1.061859952679394, + "grad_norm": 6.5818318775442615, + "learning_rate": 2.3709176191702516e-06, + "loss": 0.648, + "step": 14698 + }, + { + "epoch": 1.0619321978796035, + "grad_norm": 7.130447446846259, + "learning_rate": 2.3706255078262274e-06, + "loss": 0.6603, + "step": 14699 + }, + { + "epoch": 1.0620044430798128, + "grad_norm": 6.284324083544046, + "learning_rate": 2.3703333982532394e-06, + "loss": 0.7122, + "step": 14700 + }, + { + "epoch": 1.0620766882800223, + "grad_norm": 5.995664619721298, + "learning_rate": 2.370041290455286e-06, + "loss": 0.5957, + "step": 14701 + }, + { + "epoch": 1.0621489334802319, + "grad_norm": 5.779744050057899, + "learning_rate": 2.369749184436367e-06, + "loss": 0.7597, + "step": 14702 + }, + { + "epoch": 1.0622211786804414, + "grad_norm": 7.022473635109679, + "learning_rate": 2.3694570802004814e-06, + "loss": 0.6333, + "step": 14703 + }, + { + "epoch": 1.062293423880651, + "grad_norm": 6.260480409658415, + "learning_rate": 2.3691649777516257e-06, + "loss": 0.6042, + "step": 14704 + }, + { + "epoch": 1.0623656690808605, + "grad_norm": 7.24793924443463, + "learning_rate": 2.3688728770937997e-06, + "loss": 0.6706, + "step": 14705 + }, + { + "epoch": 1.06243791428107, + "grad_norm": 6.643595096364871, + "learning_rate": 2.3685807782310037e-06, + "loss": 0.6784, + "step": 14706 + }, + { + "epoch": 1.0625101594812794, + "grad_norm": 6.133451586426448, + "learning_rate": 2.3682886811672333e-06, + "loss": 0.666, + "step": 14707 + }, + { + "epoch": 1.062582404681489, + "grad_norm": 6.838090341806335, + "learning_rate": 2.367996585906489e-06, + "loss": 0.7305, + "step": 14708 + }, + { + "epoch": 1.0626546498816984, + "grad_norm": 5.890578697285694, + "learning_rate": 2.3677044924527688e-06, + "loss": 0.6611, + "step": 14709 + }, + { + "epoch": 1.062726895081908, + "grad_norm": 6.9088085837836966, + "learning_rate": 2.3674124008100723e-06, + "loss": 0.6139, + "step": 14710 + }, + { + "epoch": 1.0627991402821175, + "grad_norm": 5.406272755834063, + "learning_rate": 2.3671203109823964e-06, + "loss": 0.6783, + "step": 14711 + }, + { + "epoch": 1.062871385482327, + "grad_norm": 5.872183266547287, + "learning_rate": 2.36682822297374e-06, + "loss": 0.6579, + "step": 14712 + }, + { + "epoch": 1.0629436306825366, + "grad_norm": 7.852869818868122, + "learning_rate": 2.366536136788102e-06, + "loss": 0.6893, + "step": 14713 + }, + { + "epoch": 1.063015875882746, + "grad_norm": 7.381602806813612, + "learning_rate": 2.366244052429481e-06, + "loss": 0.6751, + "step": 14714 + }, + { + "epoch": 1.0630881210829555, + "grad_norm": 5.667668571909937, + "learning_rate": 2.3659519699018745e-06, + "loss": 0.6796, + "step": 14715 + }, + { + "epoch": 1.063160366283165, + "grad_norm": 6.3091358247177025, + "learning_rate": 2.3656598892092817e-06, + "loss": 0.7114, + "step": 14716 + }, + { + "epoch": 1.0632326114833746, + "grad_norm": 5.5598981965988346, + "learning_rate": 2.365367810355702e-06, + "loss": 0.6494, + "step": 14717 + }, + { + "epoch": 1.063304856683584, + "grad_norm": 6.645706350933062, + "learning_rate": 2.3650757333451308e-06, + "loss": 0.6442, + "step": 14718 + }, + { + "epoch": 1.0633771018837936, + "grad_norm": 7.64807439766295, + "learning_rate": 2.364783658181568e-06, + "loss": 0.6579, + "step": 14719 + }, + { + "epoch": 1.0634493470840032, + "grad_norm": 6.835261127252885, + "learning_rate": 2.364491584869012e-06, + "loss": 0.7248, + "step": 14720 + }, + { + "epoch": 1.0635215922842125, + "grad_norm": 6.302188117887484, + "learning_rate": 2.3641995134114623e-06, + "loss": 0.6562, + "step": 14721 + }, + { + "epoch": 1.063593837484422, + "grad_norm": 6.216996295497724, + "learning_rate": 2.363907443812915e-06, + "loss": 0.5578, + "step": 14722 + }, + { + "epoch": 1.0636660826846316, + "grad_norm": 5.810612854142208, + "learning_rate": 2.363615376077369e-06, + "loss": 0.7069, + "step": 14723 + }, + { + "epoch": 1.0637383278848411, + "grad_norm": 5.708627215243363, + "learning_rate": 2.363323310208823e-06, + "loss": 0.6186, + "step": 14724 + }, + { + "epoch": 1.0638105730850507, + "grad_norm": 5.969751723348948, + "learning_rate": 2.363031246211274e-06, + "loss": 0.6752, + "step": 14725 + }, + { + "epoch": 1.0638828182852602, + "grad_norm": 6.292648373575851, + "learning_rate": 2.3627391840887213e-06, + "loss": 0.7358, + "step": 14726 + }, + { + "epoch": 1.0639550634854698, + "grad_norm": 6.313413346188731, + "learning_rate": 2.3624471238451622e-06, + "loss": 0.6427, + "step": 14727 + }, + { + "epoch": 1.0640273086856793, + "grad_norm": 6.636439397814115, + "learning_rate": 2.362155065484596e-06, + "loss": 0.6179, + "step": 14728 + }, + { + "epoch": 1.0640995538858886, + "grad_norm": 7.629745757540485, + "learning_rate": 2.3618630090110192e-06, + "loss": 0.6179, + "step": 14729 + }, + { + "epoch": 1.0641717990860982, + "grad_norm": 6.346175308013407, + "learning_rate": 2.361570954428431e-06, + "loss": 0.6897, + "step": 14730 + }, + { + "epoch": 1.0642440442863077, + "grad_norm": 8.066482861318997, + "learning_rate": 2.3612789017408296e-06, + "loss": 0.6673, + "step": 14731 + }, + { + "epoch": 1.0643162894865172, + "grad_norm": 7.367346461533986, + "learning_rate": 2.3609868509522114e-06, + "loss": 0.6963, + "step": 14732 + }, + { + "epoch": 1.0643885346867268, + "grad_norm": 5.806499378117264, + "learning_rate": 2.3606948020665748e-06, + "loss": 0.6675, + "step": 14733 + }, + { + "epoch": 1.0644607798869363, + "grad_norm": 7.187949257783523, + "learning_rate": 2.3604027550879184e-06, + "loss": 0.6041, + "step": 14734 + }, + { + "epoch": 1.0645330250871459, + "grad_norm": 6.830195691277985, + "learning_rate": 2.3601107100202413e-06, + "loss": 0.6163, + "step": 14735 + }, + { + "epoch": 1.0646052702873552, + "grad_norm": 7.063090071882518, + "learning_rate": 2.3598186668675388e-06, + "loss": 0.703, + "step": 14736 + }, + { + "epoch": 1.0646775154875647, + "grad_norm": 6.372373451242449, + "learning_rate": 2.3595266256338097e-06, + "loss": 0.6544, + "step": 14737 + }, + { + "epoch": 1.0647497606877743, + "grad_norm": 6.27175200398285, + "learning_rate": 2.359234586323052e-06, + "loss": 0.6476, + "step": 14738 + }, + { + "epoch": 1.0648220058879838, + "grad_norm": 7.333442629375363, + "learning_rate": 2.3589425489392644e-06, + "loss": 0.6881, + "step": 14739 + }, + { + "epoch": 1.0648942510881934, + "grad_norm": 7.452755964887618, + "learning_rate": 2.3586505134864433e-06, + "loss": 0.6126, + "step": 14740 + }, + { + "epoch": 1.064966496288403, + "grad_norm": 10.132759041156206, + "learning_rate": 2.3583584799685864e-06, + "loss": 0.5863, + "step": 14741 + }, + { + "epoch": 1.0650387414886124, + "grad_norm": 5.396669929288208, + "learning_rate": 2.358066448389693e-06, + "loss": 0.6253, + "step": 14742 + }, + { + "epoch": 1.0651109866888218, + "grad_norm": 6.047829722656553, + "learning_rate": 2.3577744187537592e-06, + "loss": 0.6305, + "step": 14743 + }, + { + "epoch": 1.0651832318890313, + "grad_norm": 6.686865375389576, + "learning_rate": 2.357482391064783e-06, + "loss": 0.6543, + "step": 14744 + }, + { + "epoch": 1.0652554770892408, + "grad_norm": 6.75106407538118, + "learning_rate": 2.3571903653267618e-06, + "loss": 0.6331, + "step": 14745 + }, + { + "epoch": 1.0653277222894504, + "grad_norm": 7.809352637507373, + "learning_rate": 2.3568983415436953e-06, + "loss": 0.7548, + "step": 14746 + }, + { + "epoch": 1.06539996748966, + "grad_norm": 5.597255524735287, + "learning_rate": 2.3566063197195775e-06, + "loss": 0.6271, + "step": 14747 + }, + { + "epoch": 1.0654722126898695, + "grad_norm": 6.02419077397132, + "learning_rate": 2.3563142998584083e-06, + "loss": 0.7333, + "step": 14748 + }, + { + "epoch": 1.065544457890079, + "grad_norm": 7.498335590055553, + "learning_rate": 2.356022281964186e-06, + "loss": 0.657, + "step": 14749 + }, + { + "epoch": 1.0656167030902886, + "grad_norm": 5.507914310894129, + "learning_rate": 2.355730266040906e-06, + "loss": 0.6718, + "step": 14750 + }, + { + "epoch": 1.0656889482904979, + "grad_norm": 7.383663972063175, + "learning_rate": 2.355438252092566e-06, + "loss": 0.6865, + "step": 14751 + }, + { + "epoch": 1.0657611934907074, + "grad_norm": 5.639411267334443, + "learning_rate": 2.3551462401231647e-06, + "loss": 0.6648, + "step": 14752 + }, + { + "epoch": 1.065833438690917, + "grad_norm": 7.0055939257454405, + "learning_rate": 2.354854230136699e-06, + "loss": 0.7173, + "step": 14753 + }, + { + "epoch": 1.0659056838911265, + "grad_norm": 7.0399354992425955, + "learning_rate": 2.3545622221371662e-06, + "loss": 0.6717, + "step": 14754 + }, + { + "epoch": 1.065977929091336, + "grad_norm": 6.842085962615803, + "learning_rate": 2.354270216128563e-06, + "loss": 0.6938, + "step": 14755 + }, + { + "epoch": 1.0660501742915456, + "grad_norm": 7.437019653195861, + "learning_rate": 2.3539782121148886e-06, + "loss": 0.7046, + "step": 14756 + }, + { + "epoch": 1.0661224194917551, + "grad_norm": 5.412160652217841, + "learning_rate": 2.3536862101001377e-06, + "loss": 0.5941, + "step": 14757 + }, + { + "epoch": 1.0661946646919644, + "grad_norm": 6.043583568028988, + "learning_rate": 2.3533942100883097e-06, + "loss": 0.6408, + "step": 14758 + }, + { + "epoch": 1.066266909892174, + "grad_norm": 6.55679305657633, + "learning_rate": 2.353102212083401e-06, + "loss": 0.7218, + "step": 14759 + }, + { + "epoch": 1.0663391550923835, + "grad_norm": 6.251616612214258, + "learning_rate": 2.35281021608941e-06, + "loss": 0.5738, + "step": 14760 + }, + { + "epoch": 1.066411400292593, + "grad_norm": 6.7225708498897285, + "learning_rate": 2.3525182221103313e-06, + "loss": 0.7068, + "step": 14761 + }, + { + "epoch": 1.0664836454928026, + "grad_norm": 7.957034605884172, + "learning_rate": 2.352226230150164e-06, + "loss": 0.6033, + "step": 14762 + }, + { + "epoch": 1.0665558906930122, + "grad_norm": 8.053996014814052, + "learning_rate": 2.3519342402129047e-06, + "loss": 0.7202, + "step": 14763 + }, + { + "epoch": 1.0666281358932217, + "grad_norm": 7.010757898997752, + "learning_rate": 2.351642252302552e-06, + "loss": 0.8343, + "step": 14764 + }, + { + "epoch": 1.066700381093431, + "grad_norm": 6.248862506351937, + "learning_rate": 2.351350266423101e-06, + "loss": 0.685, + "step": 14765 + }, + { + "epoch": 1.0667726262936406, + "grad_norm": 7.846032771734194, + "learning_rate": 2.351058282578549e-06, + "loss": 0.7023, + "step": 14766 + }, + { + "epoch": 1.06684487149385, + "grad_norm": 7.810199124072703, + "learning_rate": 2.3507663007728936e-06, + "loss": 0.7347, + "step": 14767 + }, + { + "epoch": 1.0669171166940596, + "grad_norm": 6.397418824873795, + "learning_rate": 2.3504743210101315e-06, + "loss": 0.6972, + "step": 14768 + }, + { + "epoch": 1.0669893618942692, + "grad_norm": 6.539095752043591, + "learning_rate": 2.35018234329426e-06, + "loss": 0.6786, + "step": 14769 + }, + { + "epoch": 1.0670616070944787, + "grad_norm": 5.867534683053517, + "learning_rate": 2.349890367629276e-06, + "loss": 0.7053, + "step": 14770 + }, + { + "epoch": 1.0671338522946883, + "grad_norm": 6.014678167166539, + "learning_rate": 2.3495983940191766e-06, + "loss": 0.624, + "step": 14771 + }, + { + "epoch": 1.0672060974948976, + "grad_norm": 5.856301602815105, + "learning_rate": 2.349306422467958e-06, + "loss": 0.6798, + "step": 14772 + }, + { + "epoch": 1.0672783426951071, + "grad_norm": 6.984665286455634, + "learning_rate": 2.349014452979617e-06, + "loss": 0.7174, + "step": 14773 + }, + { + "epoch": 1.0673505878953167, + "grad_norm": 6.501298334594436, + "learning_rate": 2.348722485558153e-06, + "loss": 0.6449, + "step": 14774 + }, + { + "epoch": 1.0674228330955262, + "grad_norm": 8.914947866992348, + "learning_rate": 2.3484305202075582e-06, + "loss": 0.6796, + "step": 14775 + }, + { + "epoch": 1.0674950782957358, + "grad_norm": 7.9986525831387345, + "learning_rate": 2.3481385569318325e-06, + "loss": 0.6767, + "step": 14776 + }, + { + "epoch": 1.0675673234959453, + "grad_norm": 6.639918390714721, + "learning_rate": 2.3478465957349725e-06, + "loss": 0.6643, + "step": 14777 + }, + { + "epoch": 1.0676395686961548, + "grad_norm": 7.529686223315155, + "learning_rate": 2.3475546366209755e-06, + "loss": 0.6579, + "step": 14778 + }, + { + "epoch": 1.0677118138963642, + "grad_norm": 7.853250654181512, + "learning_rate": 2.3472626795938355e-06, + "loss": 0.6419, + "step": 14779 + }, + { + "epoch": 1.0677840590965737, + "grad_norm": 6.632673893391279, + "learning_rate": 2.346970724657551e-06, + "loss": 0.6569, + "step": 14780 + }, + { + "epoch": 1.0678563042967832, + "grad_norm": 7.206703339847177, + "learning_rate": 2.3466787718161193e-06, + "loss": 0.7699, + "step": 14781 + }, + { + "epoch": 1.0679285494969928, + "grad_norm": 7.953143946057369, + "learning_rate": 2.3463868210735356e-06, + "loss": 0.6464, + "step": 14782 + }, + { + "epoch": 1.0680007946972023, + "grad_norm": 6.662934721027941, + "learning_rate": 2.346094872433797e-06, + "loss": 0.6486, + "step": 14783 + }, + { + "epoch": 1.0680730398974119, + "grad_norm": 8.737180201363238, + "learning_rate": 2.3458029259009004e-06, + "loss": 0.6249, + "step": 14784 + }, + { + "epoch": 1.0681452850976214, + "grad_norm": 9.429953279834598, + "learning_rate": 2.345510981478842e-06, + "loss": 0.6774, + "step": 14785 + }, + { + "epoch": 1.0682175302978307, + "grad_norm": 6.6939084729723675, + "learning_rate": 2.345219039171618e-06, + "loss": 0.6381, + "step": 14786 + }, + { + "epoch": 1.0682897754980403, + "grad_norm": 7.510611084966982, + "learning_rate": 2.3449270989832253e-06, + "loss": 0.6632, + "step": 14787 + }, + { + "epoch": 1.0683620206982498, + "grad_norm": 6.844576293671697, + "learning_rate": 2.34463516091766e-06, + "loss": 0.73, + "step": 14788 + }, + { + "epoch": 1.0684342658984594, + "grad_norm": 5.873641486049402, + "learning_rate": 2.344343224978919e-06, + "loss": 0.6935, + "step": 14789 + }, + { + "epoch": 1.068506511098669, + "grad_norm": 6.027468746172309, + "learning_rate": 2.3440512911709983e-06, + "loss": 0.7136, + "step": 14790 + }, + { + "epoch": 1.0685787562988784, + "grad_norm": 6.9136572422722145, + "learning_rate": 2.343759359497894e-06, + "loss": 0.6903, + "step": 14791 + }, + { + "epoch": 1.068651001499088, + "grad_norm": 6.322268641874473, + "learning_rate": 2.343467429963604e-06, + "loss": 0.6106, + "step": 14792 + }, + { + "epoch": 1.0687232466992973, + "grad_norm": 7.941812860424864, + "learning_rate": 2.3431755025721226e-06, + "loss": 0.6729, + "step": 14793 + }, + { + "epoch": 1.0687954918995068, + "grad_norm": 6.9701328701670535, + "learning_rate": 2.3428835773274465e-06, + "loss": 0.6599, + "step": 14794 + }, + { + "epoch": 1.0688677370997164, + "grad_norm": 6.798304335163004, + "learning_rate": 2.342591654233572e-06, + "loss": 0.5982, + "step": 14795 + }, + { + "epoch": 1.068939982299926, + "grad_norm": 6.183512009389459, + "learning_rate": 2.3422997332944966e-06, + "loss": 0.6894, + "step": 14796 + }, + { + "epoch": 1.0690122275001355, + "grad_norm": 5.570407347654104, + "learning_rate": 2.3420078145142146e-06, + "loss": 0.6876, + "step": 14797 + }, + { + "epoch": 1.069084472700345, + "grad_norm": 6.343078878447211, + "learning_rate": 2.341715897896723e-06, + "loss": 0.5852, + "step": 14798 + }, + { + "epoch": 1.0691567179005546, + "grad_norm": 6.534737126054928, + "learning_rate": 2.3414239834460183e-06, + "loss": 0.7036, + "step": 14799 + }, + { + "epoch": 1.069228963100764, + "grad_norm": 6.431674728975024, + "learning_rate": 2.341132071166096e-06, + "loss": 0.6026, + "step": 14800 + }, + { + "epoch": 1.0693012083009734, + "grad_norm": 6.384028306119114, + "learning_rate": 2.3408401610609516e-06, + "loss": 0.6428, + "step": 14801 + }, + { + "epoch": 1.069373453501183, + "grad_norm": 8.75068269518647, + "learning_rate": 2.340548253134582e-06, + "loss": 0.6787, + "step": 14802 + }, + { + "epoch": 1.0694456987013925, + "grad_norm": 6.249096614399057, + "learning_rate": 2.3402563473909845e-06, + "loss": 0.7053, + "step": 14803 + }, + { + "epoch": 1.069517943901602, + "grad_norm": 7.289811630431731, + "learning_rate": 2.3399644438341516e-06, + "loss": 0.6652, + "step": 14804 + }, + { + "epoch": 1.0695901891018116, + "grad_norm": 6.186655931024992, + "learning_rate": 2.3396725424680818e-06, + "loss": 0.6363, + "step": 14805 + }, + { + "epoch": 1.0696624343020211, + "grad_norm": 5.897496756002545, + "learning_rate": 2.3393806432967713e-06, + "loss": 0.6004, + "step": 14806 + }, + { + "epoch": 1.0697346795022307, + "grad_norm": 5.662696494762621, + "learning_rate": 2.3390887463242144e-06, + "loss": 0.6479, + "step": 14807 + }, + { + "epoch": 1.06980692470244, + "grad_norm": 6.266739363282508, + "learning_rate": 2.3387968515544073e-06, + "loss": 0.612, + "step": 14808 + }, + { + "epoch": 1.0698791699026495, + "grad_norm": 6.071048961921397, + "learning_rate": 2.3385049589913463e-06, + "loss": 0.6972, + "step": 14809 + }, + { + "epoch": 1.069951415102859, + "grad_norm": 6.45541791843944, + "learning_rate": 2.3382130686390274e-06, + "loss": 0.7264, + "step": 14810 + }, + { + "epoch": 1.0700236603030686, + "grad_norm": 6.245588653162464, + "learning_rate": 2.3379211805014455e-06, + "loss": 0.6196, + "step": 14811 + }, + { + "epoch": 1.0700959055032782, + "grad_norm": 5.549773670427358, + "learning_rate": 2.337629294582597e-06, + "loss": 0.6272, + "step": 14812 + }, + { + "epoch": 1.0701681507034877, + "grad_norm": 6.745523734603679, + "learning_rate": 2.337337410886478e-06, + "loss": 0.7159, + "step": 14813 + }, + { + "epoch": 1.0702403959036972, + "grad_norm": 6.835593462324813, + "learning_rate": 2.3370455294170825e-06, + "loss": 0.6444, + "step": 14814 + }, + { + "epoch": 1.0703126411039066, + "grad_norm": 8.514291753428466, + "learning_rate": 2.3367536501784076e-06, + "loss": 0.6526, + "step": 14815 + }, + { + "epoch": 1.070384886304116, + "grad_norm": 8.482980181445024, + "learning_rate": 2.3364617731744487e-06, + "loss": 0.7362, + "step": 14816 + }, + { + "epoch": 1.0704571315043256, + "grad_norm": 8.129268991936588, + "learning_rate": 2.3361698984092017e-06, + "loss": 0.7038, + "step": 14817 + }, + { + "epoch": 1.0705293767045352, + "grad_norm": 7.896413360736825, + "learning_rate": 2.3358780258866603e-06, + "loss": 0.6255, + "step": 14818 + }, + { + "epoch": 1.0706016219047447, + "grad_norm": 7.056840097166892, + "learning_rate": 2.3355861556108213e-06, + "loss": 0.6579, + "step": 14819 + }, + { + "epoch": 1.0706738671049543, + "grad_norm": 7.116058092503104, + "learning_rate": 2.3352942875856805e-06, + "loss": 0.7286, + "step": 14820 + }, + { + "epoch": 1.0707461123051638, + "grad_norm": 5.895754900119675, + "learning_rate": 2.3350024218152344e-06, + "loss": 0.6617, + "step": 14821 + }, + { + "epoch": 1.0708183575053734, + "grad_norm": 7.155467694429581, + "learning_rate": 2.3347105583034756e-06, + "loss": 0.6833, + "step": 14822 + }, + { + "epoch": 1.0708906027055827, + "grad_norm": 6.491351022094015, + "learning_rate": 2.3344186970544007e-06, + "loss": 0.6999, + "step": 14823 + }, + { + "epoch": 1.0709628479057922, + "grad_norm": 7.778457569508529, + "learning_rate": 2.3341268380720062e-06, + "loss": 0.6149, + "step": 14824 + }, + { + "epoch": 1.0710350931060018, + "grad_norm": 6.3269288993061235, + "learning_rate": 2.333834981360286e-06, + "loss": 0.7062, + "step": 14825 + }, + { + "epoch": 1.0711073383062113, + "grad_norm": 6.5993073533474185, + "learning_rate": 2.333543126923236e-06, + "loss": 0.6648, + "step": 14826 + }, + { + "epoch": 1.0711795835064208, + "grad_norm": 6.880317989363982, + "learning_rate": 2.3332512747648507e-06, + "loss": 0.6223, + "step": 14827 + }, + { + "epoch": 1.0712518287066304, + "grad_norm": 7.087853220054925, + "learning_rate": 2.3329594248891272e-06, + "loss": 0.6441, + "step": 14828 + }, + { + "epoch": 1.07132407390684, + "grad_norm": 6.9369523115736405, + "learning_rate": 2.3326675773000586e-06, + "loss": 0.6846, + "step": 14829 + }, + { + "epoch": 1.0713963191070492, + "grad_norm": 7.344780253115319, + "learning_rate": 2.332375732001641e-06, + "loss": 0.6541, + "step": 14830 + }, + { + "epoch": 1.0714685643072588, + "grad_norm": 6.288419816249579, + "learning_rate": 2.3320838889978705e-06, + "loss": 0.6371, + "step": 14831 + }, + { + "epoch": 1.0715408095074683, + "grad_norm": 6.329564663107795, + "learning_rate": 2.33179204829274e-06, + "loss": 0.7339, + "step": 14832 + }, + { + "epoch": 1.0716130547076779, + "grad_norm": 5.885765114880888, + "learning_rate": 2.3315002098902454e-06, + "loss": 0.6377, + "step": 14833 + }, + { + "epoch": 1.0716852999078874, + "grad_norm": 8.506475338274017, + "learning_rate": 2.3312083737943827e-06, + "loss": 0.6848, + "step": 14834 + }, + { + "epoch": 1.071757545108097, + "grad_norm": 6.4248960494007585, + "learning_rate": 2.3309165400091474e-06, + "loss": 0.6288, + "step": 14835 + }, + { + "epoch": 1.0718297903083065, + "grad_norm": 6.9962930400670205, + "learning_rate": 2.330624708538532e-06, + "loss": 0.6104, + "step": 14836 + }, + { + "epoch": 1.0719020355085158, + "grad_norm": 6.669269848254596, + "learning_rate": 2.3303328793865332e-06, + "loss": 0.6447, + "step": 14837 + }, + { + "epoch": 1.0719742807087254, + "grad_norm": 6.036552983601496, + "learning_rate": 2.330041052557146e-06, + "loss": 0.6909, + "step": 14838 + }, + { + "epoch": 1.072046525908935, + "grad_norm": 6.321816698032725, + "learning_rate": 2.329749228054364e-06, + "loss": 0.6139, + "step": 14839 + }, + { + "epoch": 1.0721187711091444, + "grad_norm": 6.72917324888618, + "learning_rate": 2.329457405882183e-06, + "loss": 0.6685, + "step": 14840 + }, + { + "epoch": 1.072191016309354, + "grad_norm": 6.738484202106659, + "learning_rate": 2.329165586044598e-06, + "loss": 0.6405, + "step": 14841 + }, + { + "epoch": 1.0722632615095635, + "grad_norm": 6.14936218676011, + "learning_rate": 2.3288737685456035e-06, + "loss": 0.6389, + "step": 14842 + }, + { + "epoch": 1.072335506709773, + "grad_norm": 6.487329310999671, + "learning_rate": 2.328581953389194e-06, + "loss": 0.668, + "step": 14843 + }, + { + "epoch": 1.0724077519099824, + "grad_norm": 6.407558400590854, + "learning_rate": 2.3282901405793646e-06, + "loss": 0.6705, + "step": 14844 + }, + { + "epoch": 1.072479997110192, + "grad_norm": 7.561924179475921, + "learning_rate": 2.3279983301201098e-06, + "loss": 0.7319, + "step": 14845 + }, + { + "epoch": 1.0725522423104015, + "grad_norm": 6.247840813556549, + "learning_rate": 2.327706522015425e-06, + "loss": 0.7048, + "step": 14846 + }, + { + "epoch": 1.072624487510611, + "grad_norm": 6.2768463336487885, + "learning_rate": 2.3274147162693027e-06, + "loss": 0.6679, + "step": 14847 + }, + { + "epoch": 1.0726967327108206, + "grad_norm": 6.167842581040164, + "learning_rate": 2.3271229128857396e-06, + "loss": 0.6738, + "step": 14848 + }, + { + "epoch": 1.07276897791103, + "grad_norm": 7.438273926546914, + "learning_rate": 2.3268311118687307e-06, + "loss": 0.6701, + "step": 14849 + }, + { + "epoch": 1.0728412231112396, + "grad_norm": 6.163796384068654, + "learning_rate": 2.3265393132222685e-06, + "loss": 0.5884, + "step": 14850 + }, + { + "epoch": 1.072913468311449, + "grad_norm": 6.507230698314298, + "learning_rate": 2.3262475169503484e-06, + "loss": 0.67, + "step": 14851 + }, + { + "epoch": 1.0729857135116585, + "grad_norm": 6.106600782769182, + "learning_rate": 2.3259557230569637e-06, + "loss": 0.6315, + "step": 14852 + }, + { + "epoch": 1.073057958711868, + "grad_norm": 7.429603552118403, + "learning_rate": 2.325663931546112e-06, + "loss": 0.6687, + "step": 14853 + }, + { + "epoch": 1.0731302039120776, + "grad_norm": 8.098658660218158, + "learning_rate": 2.3253721424217853e-06, + "loss": 0.6944, + "step": 14854 + }, + { + "epoch": 1.0732024491122871, + "grad_norm": 7.259292697973647, + "learning_rate": 2.3250803556879775e-06, + "loss": 0.6576, + "step": 14855 + }, + { + "epoch": 1.0732746943124967, + "grad_norm": 5.460818921112892, + "learning_rate": 2.3247885713486844e-06, + "loss": 0.6605, + "step": 14856 + }, + { + "epoch": 1.0733469395127062, + "grad_norm": 7.879484080741715, + "learning_rate": 2.3244967894078998e-06, + "loss": 0.6555, + "step": 14857 + }, + { + "epoch": 1.0734191847129155, + "grad_norm": 6.506861659604157, + "learning_rate": 2.3242050098696174e-06, + "loss": 0.6447, + "step": 14858 + }, + { + "epoch": 1.073491429913125, + "grad_norm": 7.233014170319114, + "learning_rate": 2.323913232737832e-06, + "loss": 0.7339, + "step": 14859 + }, + { + "epoch": 1.0735636751133346, + "grad_norm": 6.904787486988902, + "learning_rate": 2.3236214580165385e-06, + "loss": 0.6341, + "step": 14860 + }, + { + "epoch": 1.0736359203135442, + "grad_norm": 8.566322976823937, + "learning_rate": 2.323329685709729e-06, + "loss": 0.7563, + "step": 14861 + }, + { + "epoch": 1.0737081655137537, + "grad_norm": 6.443088513081467, + "learning_rate": 2.3230379158213996e-06, + "loss": 0.6162, + "step": 14862 + }, + { + "epoch": 1.0737804107139632, + "grad_norm": 6.751378801597215, + "learning_rate": 2.322746148355545e-06, + "loss": 0.7449, + "step": 14863 + }, + { + "epoch": 1.0738526559141728, + "grad_norm": 6.453474945111935, + "learning_rate": 2.3224543833161563e-06, + "loss": 0.6594, + "step": 14864 + }, + { + "epoch": 1.073924901114382, + "grad_norm": 5.4188267826681376, + "learning_rate": 2.3221626207072296e-06, + "loss": 0.6336, + "step": 14865 + }, + { + "epoch": 1.0739971463145916, + "grad_norm": 5.178892048649257, + "learning_rate": 2.3218708605327582e-06, + "loss": 0.6568, + "step": 14866 + }, + { + "epoch": 1.0740693915148012, + "grad_norm": 6.517974356774976, + "learning_rate": 2.3215791027967375e-06, + "loss": 0.6918, + "step": 14867 + }, + { + "epoch": 1.0741416367150107, + "grad_norm": 5.723705708485851, + "learning_rate": 2.3212873475031597e-06, + "loss": 0.671, + "step": 14868 + }, + { + "epoch": 1.0742138819152203, + "grad_norm": 6.247209154726644, + "learning_rate": 2.320995594656019e-06, + "loss": 0.6343, + "step": 14869 + }, + { + "epoch": 1.0742861271154298, + "grad_norm": 6.3836118087914615, + "learning_rate": 2.32070384425931e-06, + "loss": 0.6649, + "step": 14870 + }, + { + "epoch": 1.0743583723156394, + "grad_norm": 7.029896245546198, + "learning_rate": 2.3204120963170263e-06, + "loss": 0.6625, + "step": 14871 + }, + { + "epoch": 1.0744306175158487, + "grad_norm": 6.827048908698239, + "learning_rate": 2.3201203508331615e-06, + "loss": 0.6745, + "step": 14872 + }, + { + "epoch": 1.0745028627160582, + "grad_norm": 6.291055039072434, + "learning_rate": 2.3198286078117086e-06, + "loss": 0.5865, + "step": 14873 + }, + { + "epoch": 1.0745751079162678, + "grad_norm": 5.35730830345807, + "learning_rate": 2.3195368672566637e-06, + "loss": 0.638, + "step": 14874 + }, + { + "epoch": 1.0746473531164773, + "grad_norm": 6.799186927206351, + "learning_rate": 2.319245129172018e-06, + "loss": 0.6278, + "step": 14875 + }, + { + "epoch": 1.0747195983166868, + "grad_norm": 7.22957944836656, + "learning_rate": 2.3189533935617658e-06, + "loss": 0.6936, + "step": 14876 + }, + { + "epoch": 1.0747918435168964, + "grad_norm": 6.782964393023986, + "learning_rate": 2.3186616604299016e-06, + "loss": 0.6888, + "step": 14877 + }, + { + "epoch": 1.074864088717106, + "grad_norm": 6.555834482297226, + "learning_rate": 2.3183699297804195e-06, + "loss": 0.7229, + "step": 14878 + }, + { + "epoch": 1.0749363339173155, + "grad_norm": 6.375673146331112, + "learning_rate": 2.3180782016173107e-06, + "loss": 0.5926, + "step": 14879 + }, + { + "epoch": 1.0750085791175248, + "grad_norm": 6.029396523411231, + "learning_rate": 2.31778647594457e-06, + "loss": 0.6187, + "step": 14880 + }, + { + "epoch": 1.0750808243177343, + "grad_norm": 6.46686736644229, + "learning_rate": 2.3174947527661916e-06, + "loss": 0.7121, + "step": 14881 + }, + { + "epoch": 1.0751530695179439, + "grad_norm": 6.484027605058008, + "learning_rate": 2.317203032086168e-06, + "loss": 0.5988, + "step": 14882 + }, + { + "epoch": 1.0752253147181534, + "grad_norm": 6.296629642978841, + "learning_rate": 2.316911313908493e-06, + "loss": 0.6934, + "step": 14883 + }, + { + "epoch": 1.075297559918363, + "grad_norm": 6.597685448518131, + "learning_rate": 2.3166195982371596e-06, + "loss": 0.6333, + "step": 14884 + }, + { + "epoch": 1.0753698051185725, + "grad_norm": 7.457867343204617, + "learning_rate": 2.3163278850761622e-06, + "loss": 0.6827, + "step": 14885 + }, + { + "epoch": 1.075442050318782, + "grad_norm": 6.880637891606429, + "learning_rate": 2.316036174429493e-06, + "loss": 0.635, + "step": 14886 + }, + { + "epoch": 1.0755142955189914, + "grad_norm": 6.917029068732855, + "learning_rate": 2.3157444663011455e-06, + "loss": 0.7231, + "step": 14887 + }, + { + "epoch": 1.075586540719201, + "grad_norm": 5.933094518477314, + "learning_rate": 2.3154527606951136e-06, + "loss": 0.6752, + "step": 14888 + }, + { + "epoch": 1.0756587859194104, + "grad_norm": 6.5163503558701645, + "learning_rate": 2.31516105761539e-06, + "loss": 0.6653, + "step": 14889 + }, + { + "epoch": 1.07573103111962, + "grad_norm": 8.271311197641076, + "learning_rate": 2.314869357065968e-06, + "loss": 0.6615, + "step": 14890 + }, + { + "epoch": 1.0758032763198295, + "grad_norm": 10.852378732126326, + "learning_rate": 2.314577659050841e-06, + "loss": 0.6403, + "step": 14891 + }, + { + "epoch": 1.075875521520039, + "grad_norm": 7.371679819457415, + "learning_rate": 2.314285963574002e-06, + "loss": 0.6345, + "step": 14892 + }, + { + "epoch": 1.0759477667202486, + "grad_norm": 5.110027866474172, + "learning_rate": 2.3139942706394438e-06, + "loss": 0.731, + "step": 14893 + }, + { + "epoch": 1.076020011920458, + "grad_norm": 5.395444799547201, + "learning_rate": 2.3137025802511593e-06, + "loss": 0.5744, + "step": 14894 + }, + { + "epoch": 1.0760922571206675, + "grad_norm": 7.2020474701609105, + "learning_rate": 2.313410892413141e-06, + "loss": 0.7, + "step": 14895 + }, + { + "epoch": 1.076164502320877, + "grad_norm": 6.773153085688229, + "learning_rate": 2.3131192071293845e-06, + "loss": 0.6986, + "step": 14896 + }, + { + "epoch": 1.0762367475210866, + "grad_norm": 7.280379738597678, + "learning_rate": 2.31282752440388e-06, + "loss": 0.6527, + "step": 14897 + }, + { + "epoch": 1.076308992721296, + "grad_norm": 6.987124727438827, + "learning_rate": 2.3125358442406216e-06, + "loss": 0.5603, + "step": 14898 + }, + { + "epoch": 1.0763812379215056, + "grad_norm": 6.732300051479754, + "learning_rate": 2.312244166643602e-06, + "loss": 0.6193, + "step": 14899 + }, + { + "epoch": 1.0764534831217152, + "grad_norm": 6.2013490378236265, + "learning_rate": 2.311952491616814e-06, + "loss": 0.6514, + "step": 14900 + }, + { + "epoch": 1.0765257283219247, + "grad_norm": 6.37977989367182, + "learning_rate": 2.3116608191642495e-06, + "loss": 0.6744, + "step": 14901 + }, + { + "epoch": 1.076597973522134, + "grad_norm": 6.103664998172173, + "learning_rate": 2.3113691492899028e-06, + "loss": 0.6677, + "step": 14902 + }, + { + "epoch": 1.0766702187223436, + "grad_norm": 7.038391826488053, + "learning_rate": 2.3110774819977664e-06, + "loss": 0.6834, + "step": 14903 + }, + { + "epoch": 1.0767424639225531, + "grad_norm": 7.084693269517357, + "learning_rate": 2.3107858172918317e-06, + "loss": 0.6847, + "step": 14904 + }, + { + "epoch": 1.0768147091227627, + "grad_norm": 6.803498366785684, + "learning_rate": 2.310494155176093e-06, + "loss": 0.6194, + "step": 14905 + }, + { + "epoch": 1.0768869543229722, + "grad_norm": 7.783656962144831, + "learning_rate": 2.3102024956545428e-06, + "loss": 0.6318, + "step": 14906 + }, + { + "epoch": 1.0769591995231818, + "grad_norm": 5.942057897983516, + "learning_rate": 2.309910838731172e-06, + "loss": 0.715, + "step": 14907 + }, + { + "epoch": 1.0770314447233913, + "grad_norm": 7.770890168364823, + "learning_rate": 2.309619184409974e-06, + "loss": 0.6689, + "step": 14908 + }, + { + "epoch": 1.0771036899236006, + "grad_norm": 8.365751255409332, + "learning_rate": 2.3093275326949414e-06, + "loss": 0.6421, + "step": 14909 + }, + { + "epoch": 1.0771759351238102, + "grad_norm": 6.48572939780355, + "learning_rate": 2.3090358835900682e-06, + "loss": 0.6625, + "step": 14910 + }, + { + "epoch": 1.0772481803240197, + "grad_norm": 5.517871085972481, + "learning_rate": 2.3087442370993446e-06, + "loss": 0.6757, + "step": 14911 + }, + { + "epoch": 1.0773204255242292, + "grad_norm": 8.083597657381631, + "learning_rate": 2.3084525932267637e-06, + "loss": 0.7001, + "step": 14912 + }, + { + "epoch": 1.0773926707244388, + "grad_norm": 7.58718676352567, + "learning_rate": 2.3081609519763187e-06, + "loss": 0.707, + "step": 14913 + }, + { + "epoch": 1.0774649159246483, + "grad_norm": 6.211960656684434, + "learning_rate": 2.3078693133520006e-06, + "loss": 0.6015, + "step": 14914 + }, + { + "epoch": 1.0775371611248579, + "grad_norm": 6.754140643525239, + "learning_rate": 2.3075776773578028e-06, + "loss": 0.6295, + "step": 14915 + }, + { + "epoch": 1.0776094063250672, + "grad_norm": 6.5144721092958076, + "learning_rate": 2.307286043997717e-06, + "loss": 0.6309, + "step": 14916 + }, + { + "epoch": 1.0776816515252767, + "grad_norm": 7.2974317448538555, + "learning_rate": 2.306994413275736e-06, + "loss": 0.6833, + "step": 14917 + }, + { + "epoch": 1.0777538967254863, + "grad_norm": 6.136462327054361, + "learning_rate": 2.306702785195851e-06, + "loss": 0.7139, + "step": 14918 + }, + { + "epoch": 1.0778261419256958, + "grad_norm": 7.700151407313797, + "learning_rate": 2.306411159762055e-06, + "loss": 0.7135, + "step": 14919 + }, + { + "epoch": 1.0778983871259054, + "grad_norm": 6.672135144774626, + "learning_rate": 2.30611953697834e-06, + "loss": 0.642, + "step": 14920 + }, + { + "epoch": 1.077970632326115, + "grad_norm": 5.740293520025341, + "learning_rate": 2.305827916848699e-06, + "loss": 0.6453, + "step": 14921 + }, + { + "epoch": 1.0780428775263244, + "grad_norm": 6.88054863095165, + "learning_rate": 2.3055362993771218e-06, + "loss": 0.6287, + "step": 14922 + }, + { + "epoch": 1.0781151227265338, + "grad_norm": 5.4952538559566335, + "learning_rate": 2.3052446845676017e-06, + "loss": 0.6875, + "step": 14923 + }, + { + "epoch": 1.0781873679267433, + "grad_norm": 5.718638267884761, + "learning_rate": 2.304953072424132e-06, + "loss": 0.5956, + "step": 14924 + }, + { + "epoch": 1.0782596131269528, + "grad_norm": 6.044396496252231, + "learning_rate": 2.3046614629507016e-06, + "loss": 0.6678, + "step": 14925 + }, + { + "epoch": 1.0783318583271624, + "grad_norm": 6.696736739560872, + "learning_rate": 2.304369856151305e-06, + "loss": 0.6555, + "step": 14926 + }, + { + "epoch": 1.078404103527372, + "grad_norm": 7.120280794328822, + "learning_rate": 2.3040782520299324e-06, + "loss": 0.6514, + "step": 14927 + }, + { + "epoch": 1.0784763487275815, + "grad_norm": 8.185139789075029, + "learning_rate": 2.303786650590577e-06, + "loss": 0.7035, + "step": 14928 + }, + { + "epoch": 1.078548593927791, + "grad_norm": 6.340930631668727, + "learning_rate": 2.3034950518372302e-06, + "loss": 0.7274, + "step": 14929 + }, + { + "epoch": 1.0786208391280003, + "grad_norm": 5.731983985617629, + "learning_rate": 2.303203455773883e-06, + "loss": 0.6274, + "step": 14930 + }, + { + "epoch": 1.0786930843282099, + "grad_norm": 7.634896281898763, + "learning_rate": 2.3029118624045286e-06, + "loss": 0.6601, + "step": 14931 + }, + { + "epoch": 1.0787653295284194, + "grad_norm": 7.482349480040015, + "learning_rate": 2.302620271733157e-06, + "loss": 0.6312, + "step": 14932 + }, + { + "epoch": 1.078837574728629, + "grad_norm": 6.087441959926888, + "learning_rate": 2.3023286837637602e-06, + "loss": 0.6073, + "step": 14933 + }, + { + "epoch": 1.0789098199288385, + "grad_norm": 8.310327231600995, + "learning_rate": 2.3020370985003308e-06, + "loss": 0.6799, + "step": 14934 + }, + { + "epoch": 1.078982065129048, + "grad_norm": 7.630888869280088, + "learning_rate": 2.301745515946861e-06, + "loss": 0.6031, + "step": 14935 + }, + { + "epoch": 1.0790543103292576, + "grad_norm": 6.575095450865202, + "learning_rate": 2.3014539361073395e-06, + "loss": 0.6871, + "step": 14936 + }, + { + "epoch": 1.079126555529467, + "grad_norm": 6.969828569227785, + "learning_rate": 2.3011623589857593e-06, + "loss": 0.7149, + "step": 14937 + }, + { + "epoch": 1.0791988007296764, + "grad_norm": 5.965674122068092, + "learning_rate": 2.3008707845861135e-06, + "loss": 0.6145, + "step": 14938 + }, + { + "epoch": 1.079271045929886, + "grad_norm": 6.93338165388631, + "learning_rate": 2.3005792129123913e-06, + "loss": 0.7204, + "step": 14939 + }, + { + "epoch": 1.0793432911300955, + "grad_norm": 6.984087978669715, + "learning_rate": 2.3002876439685847e-06, + "loss": 0.7144, + "step": 14940 + }, + { + "epoch": 1.079415536330305, + "grad_norm": 7.001628141429633, + "learning_rate": 2.299996077758685e-06, + "loss": 0.6568, + "step": 14941 + }, + { + "epoch": 1.0794877815305146, + "grad_norm": 7.491484576237601, + "learning_rate": 2.299704514286684e-06, + "loss": 0.6453, + "step": 14942 + }, + { + "epoch": 1.0795600267307242, + "grad_norm": 6.077821856394477, + "learning_rate": 2.2994129535565725e-06, + "loss": 0.6133, + "step": 14943 + }, + { + "epoch": 1.0796322719309335, + "grad_norm": 6.703082602484653, + "learning_rate": 2.299121395572342e-06, + "loss": 0.6499, + "step": 14944 + }, + { + "epoch": 1.079704517131143, + "grad_norm": 6.280564607642492, + "learning_rate": 2.2988298403379837e-06, + "loss": 0.5589, + "step": 14945 + }, + { + "epoch": 1.0797767623313526, + "grad_norm": 7.504133229990474, + "learning_rate": 2.298538287857489e-06, + "loss": 0.6863, + "step": 14946 + }, + { + "epoch": 1.079849007531562, + "grad_norm": 6.118359761559874, + "learning_rate": 2.298246738134848e-06, + "loss": 0.7034, + "step": 14947 + }, + { + "epoch": 1.0799212527317716, + "grad_norm": 6.61411297745326, + "learning_rate": 2.2979551911740526e-06, + "loss": 0.6567, + "step": 14948 + }, + { + "epoch": 1.0799934979319812, + "grad_norm": 6.877114542979189, + "learning_rate": 2.297663646979095e-06, + "loss": 0.773, + "step": 14949 + }, + { + "epoch": 1.0800657431321907, + "grad_norm": 7.866982147658164, + "learning_rate": 2.297372105553964e-06, + "loss": 0.6645, + "step": 14950 + }, + { + "epoch": 1.0801379883324003, + "grad_norm": 7.84980351876422, + "learning_rate": 2.2970805669026504e-06, + "loss": 0.6987, + "step": 14951 + }, + { + "epoch": 1.0802102335326096, + "grad_norm": 8.488255184324911, + "learning_rate": 2.2967890310291476e-06, + "loss": 0.6584, + "step": 14952 + }, + { + "epoch": 1.0802824787328191, + "grad_norm": 8.015687343659843, + "learning_rate": 2.2964974979374458e-06, + "loss": 0.6509, + "step": 14953 + }, + { + "epoch": 1.0803547239330287, + "grad_norm": 6.1721469481225695, + "learning_rate": 2.296205967631534e-06, + "loss": 0.6106, + "step": 14954 + }, + { + "epoch": 1.0804269691332382, + "grad_norm": 6.642786585232387, + "learning_rate": 2.2959144401154046e-06, + "loss": 0.7477, + "step": 14955 + }, + { + "epoch": 1.0804992143334478, + "grad_norm": 8.579583049326638, + "learning_rate": 2.2956229153930486e-06, + "loss": 0.6592, + "step": 14956 + }, + { + "epoch": 1.0805714595336573, + "grad_norm": 7.184322019483722, + "learning_rate": 2.2953313934684555e-06, + "loss": 0.6534, + "step": 14957 + }, + { + "epoch": 1.0806437047338668, + "grad_norm": 5.987605007580798, + "learning_rate": 2.295039874345617e-06, + "loss": 0.6024, + "step": 14958 + }, + { + "epoch": 1.0807159499340762, + "grad_norm": 6.655316927705481, + "learning_rate": 2.294748358028523e-06, + "loss": 0.6896, + "step": 14959 + }, + { + "epoch": 1.0807881951342857, + "grad_norm": 6.188241625180863, + "learning_rate": 2.2944568445211658e-06, + "loss": 0.573, + "step": 14960 + }, + { + "epoch": 1.0808604403344952, + "grad_norm": 6.255192850068871, + "learning_rate": 2.2941653338275337e-06, + "loss": 0.5918, + "step": 14961 + }, + { + "epoch": 1.0809326855347048, + "grad_norm": 6.203636013861586, + "learning_rate": 2.2938738259516186e-06, + "loss": 0.5661, + "step": 14962 + }, + { + "epoch": 1.0810049307349143, + "grad_norm": 6.31444617759825, + "learning_rate": 2.2935823208974117e-06, + "loss": 0.6837, + "step": 14963 + }, + { + "epoch": 1.0810771759351239, + "grad_norm": 6.553108689010906, + "learning_rate": 2.2932908186689016e-06, + "loss": 0.6683, + "step": 14964 + }, + { + "epoch": 1.0811494211353334, + "grad_norm": 6.427037894790587, + "learning_rate": 2.292999319270079e-06, + "loss": 0.6452, + "step": 14965 + }, + { + "epoch": 1.0812216663355427, + "grad_norm": 6.54825317152511, + "learning_rate": 2.292707822704936e-06, + "loss": 0.6334, + "step": 14966 + }, + { + "epoch": 1.0812939115357523, + "grad_norm": 8.325692399474296, + "learning_rate": 2.2924163289774627e-06, + "loss": 0.7474, + "step": 14967 + }, + { + "epoch": 1.0813661567359618, + "grad_norm": 8.354856036148416, + "learning_rate": 2.2921248380916474e-06, + "loss": 0.5975, + "step": 14968 + }, + { + "epoch": 1.0814384019361714, + "grad_norm": 8.175190065373432, + "learning_rate": 2.291833350051482e-06, + "loss": 0.6606, + "step": 14969 + }, + { + "epoch": 1.081510647136381, + "grad_norm": 5.808989090026498, + "learning_rate": 2.2915418648609567e-06, + "loss": 0.6293, + "step": 14970 + }, + { + "epoch": 1.0815828923365904, + "grad_norm": 6.755265619235544, + "learning_rate": 2.2912503825240608e-06, + "loss": 0.7196, + "step": 14971 + }, + { + "epoch": 1.0816551375368, + "grad_norm": 6.72636429232149, + "learning_rate": 2.2909589030447855e-06, + "loss": 0.6208, + "step": 14972 + }, + { + "epoch": 1.0817273827370095, + "grad_norm": 6.657750493378381, + "learning_rate": 2.29066742642712e-06, + "loss": 0.6832, + "step": 14973 + }, + { + "epoch": 1.0817996279372188, + "grad_norm": 5.880204513003446, + "learning_rate": 2.2903759526750558e-06, + "loss": 0.6531, + "step": 14974 + }, + { + "epoch": 1.0818718731374284, + "grad_norm": 6.9286376433094805, + "learning_rate": 2.2900844817925817e-06, + "loss": 0.7075, + "step": 14975 + }, + { + "epoch": 1.081944118337638, + "grad_norm": 8.173807356977798, + "learning_rate": 2.2897930137836876e-06, + "loss": 0.6416, + "step": 14976 + }, + { + "epoch": 1.0820163635378475, + "grad_norm": 6.845250352805084, + "learning_rate": 2.289501548652364e-06, + "loss": 0.6602, + "step": 14977 + }, + { + "epoch": 1.082088608738057, + "grad_norm": 7.009366172618299, + "learning_rate": 2.289210086402602e-06, + "loss": 0.6678, + "step": 14978 + }, + { + "epoch": 1.0821608539382666, + "grad_norm": 6.01827223358085, + "learning_rate": 2.288918627038389e-06, + "loss": 0.6594, + "step": 14979 + }, + { + "epoch": 1.082233099138476, + "grad_norm": 7.577467553633416, + "learning_rate": 2.2886271705637158e-06, + "loss": 0.6545, + "step": 14980 + }, + { + "epoch": 1.0823053443386854, + "grad_norm": 5.535079061741265, + "learning_rate": 2.288335716982574e-06, + "loss": 0.6193, + "step": 14981 + }, + { + "epoch": 1.082377589538895, + "grad_norm": 5.6104781147334775, + "learning_rate": 2.2880442662989508e-06, + "loss": 0.6841, + "step": 14982 + }, + { + "epoch": 1.0824498347391045, + "grad_norm": 6.28870219270675, + "learning_rate": 2.2877528185168367e-06, + "loss": 0.6445, + "step": 14983 + }, + { + "epoch": 1.082522079939314, + "grad_norm": 6.831279104716991, + "learning_rate": 2.2874613736402223e-06, + "loss": 0.6867, + "step": 14984 + }, + { + "epoch": 1.0825943251395236, + "grad_norm": 6.3766142353482005, + "learning_rate": 2.287169931673097e-06, + "loss": 0.7331, + "step": 14985 + }, + { + "epoch": 1.0826665703397331, + "grad_norm": 6.049153216663624, + "learning_rate": 2.286878492619449e-06, + "loss": 0.6817, + "step": 14986 + }, + { + "epoch": 1.0827388155399427, + "grad_norm": 6.051873400970724, + "learning_rate": 2.28658705648327e-06, + "loss": 0.6035, + "step": 14987 + }, + { + "epoch": 1.082811060740152, + "grad_norm": 6.626628171833272, + "learning_rate": 2.2862956232685483e-06, + "loss": 0.651, + "step": 14988 + }, + { + "epoch": 1.0828833059403615, + "grad_norm": 7.078156528023739, + "learning_rate": 2.286004192979273e-06, + "loss": 0.6462, + "step": 14989 + }, + { + "epoch": 1.082955551140571, + "grad_norm": 6.884930461235951, + "learning_rate": 2.2857127656194344e-06, + "loss": 0.7065, + "step": 14990 + }, + { + "epoch": 1.0830277963407806, + "grad_norm": 7.608894975314177, + "learning_rate": 2.285421341193022e-06, + "loss": 0.6936, + "step": 14991 + }, + { + "epoch": 1.0831000415409902, + "grad_norm": 5.747223556925661, + "learning_rate": 2.285129919704025e-06, + "loss": 0.6381, + "step": 14992 + }, + { + "epoch": 1.0831722867411997, + "grad_norm": 6.835191644578463, + "learning_rate": 2.284838501156432e-06, + "loss": 0.6635, + "step": 14993 + }, + { + "epoch": 1.0832445319414092, + "grad_norm": 8.021479381618242, + "learning_rate": 2.2845470855542326e-06, + "loss": 0.7593, + "step": 14994 + }, + { + "epoch": 1.0833167771416186, + "grad_norm": 6.591623261602591, + "learning_rate": 2.284255672901418e-06, + "loss": 0.5894, + "step": 14995 + }, + { + "epoch": 1.083389022341828, + "grad_norm": 6.249288289555585, + "learning_rate": 2.283964263201974e-06, + "loss": 0.7371, + "step": 14996 + }, + { + "epoch": 1.0834612675420376, + "grad_norm": 6.382022056988574, + "learning_rate": 2.283672856459892e-06, + "loss": 0.6282, + "step": 14997 + }, + { + "epoch": 1.0835335127422472, + "grad_norm": 6.756411298013886, + "learning_rate": 2.2833814526791604e-06, + "loss": 0.6299, + "step": 14998 + }, + { + "epoch": 1.0836057579424567, + "grad_norm": 5.56464644605388, + "learning_rate": 2.283090051863769e-06, + "loss": 0.6557, + "step": 14999 + }, + { + "epoch": 1.0836780031426663, + "grad_norm": 5.754349431867446, + "learning_rate": 2.282798654017706e-06, + "loss": 0.5909, + "step": 15000 + }, + { + "epoch": 1.0837502483428758, + "grad_norm": 6.2145515723274185, + "learning_rate": 2.2825072591449607e-06, + "loss": 0.654, + "step": 15001 + }, + { + "epoch": 1.0838224935430851, + "grad_norm": 6.36462515543065, + "learning_rate": 2.282215867249522e-06, + "loss": 0.572, + "step": 15002 + }, + { + "epoch": 1.0838947387432947, + "grad_norm": 7.860230416373704, + "learning_rate": 2.28192447833538e-06, + "loss": 0.6552, + "step": 15003 + }, + { + "epoch": 1.0839669839435042, + "grad_norm": 7.2860908624688685, + "learning_rate": 2.2816330924065213e-06, + "loss": 0.6428, + "step": 15004 + }, + { + "epoch": 1.0840392291437138, + "grad_norm": 8.036688124972027, + "learning_rate": 2.2813417094669367e-06, + "loss": 0.6714, + "step": 15005 + }, + { + "epoch": 1.0841114743439233, + "grad_norm": 6.555824008480992, + "learning_rate": 2.281050329520615e-06, + "loss": 0.6526, + "step": 15006 + }, + { + "epoch": 1.0841837195441328, + "grad_norm": 6.596910632175944, + "learning_rate": 2.280758952571543e-06, + "loss": 0.603, + "step": 15007 + }, + { + "epoch": 1.0842559647443424, + "grad_norm": 7.406721011608229, + "learning_rate": 2.2804675786237105e-06, + "loss": 0.7142, + "step": 15008 + }, + { + "epoch": 1.0843282099445517, + "grad_norm": 7.239658741363867, + "learning_rate": 2.2801762076811072e-06, + "loss": 0.7226, + "step": 15009 + }, + { + "epoch": 1.0844004551447612, + "grad_norm": 7.328449559293919, + "learning_rate": 2.2798848397477214e-06, + "loss": 0.6494, + "step": 15010 + }, + { + "epoch": 1.0844727003449708, + "grad_norm": 7.686373496286403, + "learning_rate": 2.279593474827541e-06, + "loss": 0.7002, + "step": 15011 + }, + { + "epoch": 1.0845449455451803, + "grad_norm": 6.634940121803804, + "learning_rate": 2.2793021129245542e-06, + "loss": 0.6318, + "step": 15012 + }, + { + "epoch": 1.0846171907453899, + "grad_norm": 7.07409634160385, + "learning_rate": 2.2790107540427506e-06, + "loss": 0.6944, + "step": 15013 + }, + { + "epoch": 1.0846894359455994, + "grad_norm": 5.715585635692597, + "learning_rate": 2.278719398186118e-06, + "loss": 0.6684, + "step": 15014 + }, + { + "epoch": 1.084761681145809, + "grad_norm": 7.076715526918245, + "learning_rate": 2.278428045358645e-06, + "loss": 0.6819, + "step": 15015 + }, + { + "epoch": 1.0848339263460183, + "grad_norm": 8.697833385563749, + "learning_rate": 2.2781366955643193e-06, + "loss": 0.7073, + "step": 15016 + }, + { + "epoch": 1.0849061715462278, + "grad_norm": 6.274976643674629, + "learning_rate": 2.277845348807131e-06, + "loss": 0.6698, + "step": 15017 + }, + { + "epoch": 1.0849784167464374, + "grad_norm": 6.85618680578856, + "learning_rate": 2.277554005091067e-06, + "loss": 0.734, + "step": 15018 + }, + { + "epoch": 1.085050661946647, + "grad_norm": 6.4795062435086805, + "learning_rate": 2.277262664420116e-06, + "loss": 0.6605, + "step": 15019 + }, + { + "epoch": 1.0851229071468564, + "grad_norm": 7.862652741663359, + "learning_rate": 2.276971326798267e-06, + "loss": 0.7412, + "step": 15020 + }, + { + "epoch": 1.085195152347066, + "grad_norm": 7.22214471172245, + "learning_rate": 2.276679992229506e-06, + "loss": 0.7746, + "step": 15021 + }, + { + "epoch": 1.0852673975472755, + "grad_norm": 6.877491170197355, + "learning_rate": 2.2763886607178226e-06, + "loss": 0.607, + "step": 15022 + }, + { + "epoch": 1.085339642747485, + "grad_norm": 7.869696875118508, + "learning_rate": 2.276097332267205e-06, + "loss": 0.6626, + "step": 15023 + }, + { + "epoch": 1.0854118879476944, + "grad_norm": 7.68571463841677, + "learning_rate": 2.2758060068816426e-06, + "loss": 0.7086, + "step": 15024 + }, + { + "epoch": 1.085484133147904, + "grad_norm": 7.280221498075121, + "learning_rate": 2.2755146845651204e-06, + "loss": 0.6244, + "step": 15025 + }, + { + "epoch": 1.0855563783481135, + "grad_norm": 6.857308669583236, + "learning_rate": 2.275223365321628e-06, + "loss": 0.639, + "step": 15026 + }, + { + "epoch": 1.085628623548323, + "grad_norm": 7.403203538870146, + "learning_rate": 2.274932049155153e-06, + "loss": 0.662, + "step": 15027 + }, + { + "epoch": 1.0857008687485326, + "grad_norm": 7.534713771676649, + "learning_rate": 2.2746407360696843e-06, + "loss": 0.6085, + "step": 15028 + }, + { + "epoch": 1.085773113948742, + "grad_norm": 7.22749835578744, + "learning_rate": 2.274349426069208e-06, + "loss": 0.6487, + "step": 15029 + }, + { + "epoch": 1.0858453591489516, + "grad_norm": 7.36783730481971, + "learning_rate": 2.274058119157713e-06, + "loss": 0.6228, + "step": 15030 + }, + { + "epoch": 1.085917604349161, + "grad_norm": 6.224915788073242, + "learning_rate": 2.2737668153391877e-06, + "loss": 0.6746, + "step": 15031 + }, + { + "epoch": 1.0859898495493705, + "grad_norm": 6.660510781436279, + "learning_rate": 2.2734755146176186e-06, + "loss": 0.6337, + "step": 15032 + }, + { + "epoch": 1.08606209474958, + "grad_norm": 6.760470428801621, + "learning_rate": 2.273184216996994e-06, + "loss": 0.6779, + "step": 15033 + }, + { + "epoch": 1.0861343399497896, + "grad_norm": 5.88986338743576, + "learning_rate": 2.2728929224813005e-06, + "loss": 0.5813, + "step": 15034 + }, + { + "epoch": 1.0862065851499991, + "grad_norm": 7.984939465660568, + "learning_rate": 2.272601631074528e-06, + "loss": 0.6379, + "step": 15035 + }, + { + "epoch": 1.0862788303502087, + "grad_norm": 6.2951903254890444, + "learning_rate": 2.272310342780661e-06, + "loss": 0.5981, + "step": 15036 + }, + { + "epoch": 1.0863510755504182, + "grad_norm": 7.4686907522493104, + "learning_rate": 2.2720190576036893e-06, + "loss": 0.6813, + "step": 15037 + }, + { + "epoch": 1.0864233207506275, + "grad_norm": 6.5148597469149445, + "learning_rate": 2.2717277755476007e-06, + "loss": 0.657, + "step": 15038 + }, + { + "epoch": 1.086495565950837, + "grad_norm": 7.067694450108033, + "learning_rate": 2.2714364966163806e-06, + "loss": 0.6243, + "step": 15039 + }, + { + "epoch": 1.0865678111510466, + "grad_norm": 6.710006347289402, + "learning_rate": 2.271145220814017e-06, + "loss": 0.6772, + "step": 15040 + }, + { + "epoch": 1.0866400563512562, + "grad_norm": 6.207378476167433, + "learning_rate": 2.2708539481444983e-06, + "loss": 0.6823, + "step": 15041 + }, + { + "epoch": 1.0867123015514657, + "grad_norm": 5.392592096086122, + "learning_rate": 2.2705626786118113e-06, + "loss": 0.6869, + "step": 15042 + }, + { + "epoch": 1.0867845467516752, + "grad_norm": 7.229271557254933, + "learning_rate": 2.2702714122199424e-06, + "loss": 0.622, + "step": 15043 + }, + { + "epoch": 1.0868567919518848, + "grad_norm": 6.0720193591506595, + "learning_rate": 2.26998014897288e-06, + "loss": 0.6764, + "step": 15044 + }, + { + "epoch": 1.0869290371520943, + "grad_norm": 18.02812286710206, + "learning_rate": 2.2696888888746106e-06, + "loss": 0.6876, + "step": 15045 + }, + { + "epoch": 1.0870012823523036, + "grad_norm": 6.50330298206332, + "learning_rate": 2.2693976319291217e-06, + "loss": 0.5993, + "step": 15046 + }, + { + "epoch": 1.0870735275525132, + "grad_norm": 7.442934503093117, + "learning_rate": 2.2691063781403996e-06, + "loss": 0.6696, + "step": 15047 + }, + { + "epoch": 1.0871457727527227, + "grad_norm": 6.225467906356611, + "learning_rate": 2.268815127512432e-06, + "loss": 0.6397, + "step": 15048 + }, + { + "epoch": 1.0872180179529323, + "grad_norm": 7.672814461238574, + "learning_rate": 2.2685238800492073e-06, + "loss": 0.6938, + "step": 15049 + }, + { + "epoch": 1.0872902631531418, + "grad_norm": 6.211116533877956, + "learning_rate": 2.268232635754709e-06, + "loss": 0.7458, + "step": 15050 + }, + { + "epoch": 1.0873625083533514, + "grad_norm": 6.987681312460645, + "learning_rate": 2.2679413946329263e-06, + "loss": 0.643, + "step": 15051 + }, + { + "epoch": 1.087434753553561, + "grad_norm": 6.851275482603614, + "learning_rate": 2.2676501566878457e-06, + "loss": 0.6627, + "step": 15052 + }, + { + "epoch": 1.0875069987537702, + "grad_norm": 6.035162254750249, + "learning_rate": 2.2673589219234553e-06, + "loss": 0.6966, + "step": 15053 + }, + { + "epoch": 1.0875792439539798, + "grad_norm": 6.842242906707184, + "learning_rate": 2.26706769034374e-06, + "loss": 0.6884, + "step": 15054 + }, + { + "epoch": 1.0876514891541893, + "grad_norm": 6.770041770637156, + "learning_rate": 2.2667764619526864e-06, + "loss": 0.6631, + "step": 15055 + }, + { + "epoch": 1.0877237343543988, + "grad_norm": 8.22764657060695, + "learning_rate": 2.266485236754283e-06, + "loss": 0.6997, + "step": 15056 + }, + { + "epoch": 1.0877959795546084, + "grad_norm": 6.305886609559134, + "learning_rate": 2.2661940147525148e-06, + "loss": 0.5712, + "step": 15057 + }, + { + "epoch": 1.087868224754818, + "grad_norm": 7.331526042297519, + "learning_rate": 2.2659027959513686e-06, + "loss": 0.6461, + "step": 15058 + }, + { + "epoch": 1.0879404699550275, + "grad_norm": 5.902712453842228, + "learning_rate": 2.265611580354832e-06, + "loss": 0.6338, + "step": 15059 + }, + { + "epoch": 1.0880127151552368, + "grad_norm": 7.402330866578498, + "learning_rate": 2.2653203679668907e-06, + "loss": 0.7973, + "step": 15060 + }, + { + "epoch": 1.0880849603554463, + "grad_norm": 6.5528784564077736, + "learning_rate": 2.2650291587915316e-06, + "loss": 0.6691, + "step": 15061 + }, + { + "epoch": 1.0881572055556559, + "grad_norm": 6.407707532363856, + "learning_rate": 2.2647379528327402e-06, + "loss": 0.6693, + "step": 15062 + }, + { + "epoch": 1.0882294507558654, + "grad_norm": 7.01662894968994, + "learning_rate": 2.264446750094505e-06, + "loss": 0.6192, + "step": 15063 + }, + { + "epoch": 1.088301695956075, + "grad_norm": 7.235616161416494, + "learning_rate": 2.264155550580809e-06, + "loss": 0.7373, + "step": 15064 + }, + { + "epoch": 1.0883739411562845, + "grad_norm": 6.092144407367951, + "learning_rate": 2.2638643542956413e-06, + "loss": 0.6618, + "step": 15065 + }, + { + "epoch": 1.088446186356494, + "grad_norm": 6.996300128254872, + "learning_rate": 2.2635731612429872e-06, + "loss": 0.6537, + "step": 15066 + }, + { + "epoch": 1.0885184315567034, + "grad_norm": 6.369604183337657, + "learning_rate": 2.263281971426834e-06, + "loss": 0.6341, + "step": 15067 + }, + { + "epoch": 1.088590676756913, + "grad_norm": 6.826640720665819, + "learning_rate": 2.2629907848511654e-06, + "loss": 0.6447, + "step": 15068 + }, + { + "epoch": 1.0886629219571224, + "grad_norm": 6.649992531220107, + "learning_rate": 2.262699601519969e-06, + "loss": 0.6367, + "step": 15069 + }, + { + "epoch": 1.088735167157332, + "grad_norm": 7.274518351652922, + "learning_rate": 2.2624084214372317e-06, + "loss": 0.6513, + "step": 15070 + }, + { + "epoch": 1.0888074123575415, + "grad_norm": 7.039057893571606, + "learning_rate": 2.2621172446069374e-06, + "loss": 0.7045, + "step": 15071 + }, + { + "epoch": 1.088879657557751, + "grad_norm": 6.940172385237733, + "learning_rate": 2.261826071033074e-06, + "loss": 0.7459, + "step": 15072 + }, + { + "epoch": 1.0889519027579606, + "grad_norm": 5.439572870932834, + "learning_rate": 2.2615349007196267e-06, + "loss": 0.5842, + "step": 15073 + }, + { + "epoch": 1.08902414795817, + "grad_norm": 6.334926538681272, + "learning_rate": 2.2612437336705818e-06, + "loss": 0.6872, + "step": 15074 + }, + { + "epoch": 1.0890963931583795, + "grad_norm": 8.018669754780431, + "learning_rate": 2.2609525698899244e-06, + "loss": 0.6409, + "step": 15075 + }, + { + "epoch": 1.089168638358589, + "grad_norm": 5.622309910404948, + "learning_rate": 2.2606614093816403e-06, + "loss": 0.6319, + "step": 15076 + }, + { + "epoch": 1.0892408835587986, + "grad_norm": 7.940408729391386, + "learning_rate": 2.2603702521497174e-06, + "loss": 0.755, + "step": 15077 + }, + { + "epoch": 1.089313128759008, + "grad_norm": 7.253178656041438, + "learning_rate": 2.2600790981981375e-06, + "loss": 0.6358, + "step": 15078 + }, + { + "epoch": 1.0893853739592176, + "grad_norm": 6.774571940845374, + "learning_rate": 2.2597879475308893e-06, + "loss": 0.7162, + "step": 15079 + }, + { + "epoch": 1.0894576191594272, + "grad_norm": 7.851301350212314, + "learning_rate": 2.2594968001519572e-06, + "loss": 0.753, + "step": 15080 + }, + { + "epoch": 1.0895298643596365, + "grad_norm": 8.182344866668103, + "learning_rate": 2.2592056560653288e-06, + "loss": 0.7054, + "step": 15081 + }, + { + "epoch": 1.089602109559846, + "grad_norm": 5.5973645684647595, + "learning_rate": 2.2589145152749866e-06, + "loss": 0.6065, + "step": 15082 + }, + { + "epoch": 1.0896743547600556, + "grad_norm": 6.839314734949779, + "learning_rate": 2.2586233777849178e-06, + "loss": 0.6272, + "step": 15083 + }, + { + "epoch": 1.0897465999602651, + "grad_norm": 5.962512846798922, + "learning_rate": 2.258332243599107e-06, + "loss": 0.6098, + "step": 15084 + }, + { + "epoch": 1.0898188451604747, + "grad_norm": 6.00582253545612, + "learning_rate": 2.258041112721541e-06, + "loss": 0.5758, + "step": 15085 + }, + { + "epoch": 1.0898910903606842, + "grad_norm": 6.916103876343178, + "learning_rate": 2.257749985156204e-06, + "loss": 0.6637, + "step": 15086 + }, + { + "epoch": 1.0899633355608938, + "grad_norm": 6.665402419697096, + "learning_rate": 2.2574588609070815e-06, + "loss": 0.6026, + "step": 15087 + }, + { + "epoch": 1.090035580761103, + "grad_norm": 5.962383609811337, + "learning_rate": 2.2571677399781593e-06, + "loss": 0.6517, + "step": 15088 + }, + { + "epoch": 1.0901078259613126, + "grad_norm": 5.616860498240049, + "learning_rate": 2.2568766223734213e-06, + "loss": 0.6133, + "step": 15089 + }, + { + "epoch": 1.0901800711615222, + "grad_norm": 7.345361670723265, + "learning_rate": 2.256585508096854e-06, + "loss": 0.6671, + "step": 15090 + }, + { + "epoch": 1.0902523163617317, + "grad_norm": 5.980304180051171, + "learning_rate": 2.256294397152442e-06, + "loss": 0.627, + "step": 15091 + }, + { + "epoch": 1.0903245615619412, + "grad_norm": 6.217297561718215, + "learning_rate": 2.2560032895441713e-06, + "loss": 0.6403, + "step": 15092 + }, + { + "epoch": 1.0903968067621508, + "grad_norm": 6.662103935335657, + "learning_rate": 2.255712185276025e-06, + "loss": 0.6171, + "step": 15093 + }, + { + "epoch": 1.0904690519623603, + "grad_norm": 6.146281115065447, + "learning_rate": 2.2554210843519895e-06, + "loss": 0.6782, + "step": 15094 + }, + { + "epoch": 1.0905412971625696, + "grad_norm": 8.291204493947138, + "learning_rate": 2.2551299867760503e-06, + "loss": 0.6843, + "step": 15095 + }, + { + "epoch": 1.0906135423627792, + "grad_norm": 6.907965123480054, + "learning_rate": 2.254838892552191e-06, + "loss": 0.6933, + "step": 15096 + }, + { + "epoch": 1.0906857875629887, + "grad_norm": 7.653900159564919, + "learning_rate": 2.2545478016843965e-06, + "loss": 0.6716, + "step": 15097 + }, + { + "epoch": 1.0907580327631983, + "grad_norm": 7.473663629089249, + "learning_rate": 2.2542567141766517e-06, + "loss": 0.7301, + "step": 15098 + }, + { + "epoch": 1.0908302779634078, + "grad_norm": 6.653346614990024, + "learning_rate": 2.2539656300329428e-06, + "loss": 0.7213, + "step": 15099 + }, + { + "epoch": 1.0909025231636174, + "grad_norm": 7.003602871810705, + "learning_rate": 2.2536745492572524e-06, + "loss": 0.6585, + "step": 15100 + }, + { + "epoch": 1.090974768363827, + "grad_norm": 6.810371836455132, + "learning_rate": 2.2533834718535663e-06, + "loss": 0.6782, + "step": 15101 + }, + { + "epoch": 1.0910470135640364, + "grad_norm": 5.952221418876816, + "learning_rate": 2.2530923978258693e-06, + "loss": 0.5818, + "step": 15102 + }, + { + "epoch": 1.0911192587642458, + "grad_norm": 7.786045542378254, + "learning_rate": 2.2528013271781455e-06, + "loss": 0.6538, + "step": 15103 + }, + { + "epoch": 1.0911915039644553, + "grad_norm": 7.7074903147460585, + "learning_rate": 2.2525102599143798e-06, + "loss": 0.6817, + "step": 15104 + }, + { + "epoch": 1.0912637491646648, + "grad_norm": 5.59699636697302, + "learning_rate": 2.252219196038556e-06, + "loss": 0.6389, + "step": 15105 + }, + { + "epoch": 1.0913359943648744, + "grad_norm": 5.900531240566263, + "learning_rate": 2.25192813555466e-06, + "loss": 0.6801, + "step": 15106 + }, + { + "epoch": 1.091408239565084, + "grad_norm": 6.179335322514564, + "learning_rate": 2.2516370784666743e-06, + "loss": 0.6154, + "step": 15107 + }, + { + "epoch": 1.0914804847652935, + "grad_norm": 9.166266785193619, + "learning_rate": 2.251346024778584e-06, + "loss": 0.7113, + "step": 15108 + }, + { + "epoch": 1.091552729965503, + "grad_norm": 6.220763527473625, + "learning_rate": 2.251054974494374e-06, + "loss": 0.6269, + "step": 15109 + }, + { + "epoch": 1.0916249751657123, + "grad_norm": 7.3949573553273025, + "learning_rate": 2.250763927618029e-06, + "loss": 0.7471, + "step": 15110 + }, + { + "epoch": 1.0916972203659219, + "grad_norm": 6.544416234477882, + "learning_rate": 2.2504728841535313e-06, + "loss": 0.6776, + "step": 15111 + }, + { + "epoch": 1.0917694655661314, + "grad_norm": 5.812811074085224, + "learning_rate": 2.2501818441048655e-06, + "loss": 0.6326, + "step": 15112 + }, + { + "epoch": 1.091841710766341, + "grad_norm": 5.5373668577148605, + "learning_rate": 2.2498908074760177e-06, + "loss": 0.6729, + "step": 15113 + }, + { + "epoch": 1.0919139559665505, + "grad_norm": 5.3154400039227925, + "learning_rate": 2.2495997742709695e-06, + "loss": 0.6348, + "step": 15114 + }, + { + "epoch": 1.09198620116676, + "grad_norm": 7.126471116630718, + "learning_rate": 2.249308744493706e-06, + "loss": 0.6996, + "step": 15115 + }, + { + "epoch": 1.0920584463669696, + "grad_norm": 5.780156542496208, + "learning_rate": 2.2490177181482114e-06, + "loss": 0.6369, + "step": 15116 + }, + { + "epoch": 1.092130691567179, + "grad_norm": 7.577590892099371, + "learning_rate": 2.2487266952384695e-06, + "loss": 0.7381, + "step": 15117 + }, + { + "epoch": 1.0922029367673884, + "grad_norm": 7.386746378765754, + "learning_rate": 2.2484356757684637e-06, + "loss": 0.709, + "step": 15118 + }, + { + "epoch": 1.092275181967598, + "grad_norm": 7.727212444621064, + "learning_rate": 2.2481446597421783e-06, + "loss": 0.6806, + "step": 15119 + }, + { + "epoch": 1.0923474271678075, + "grad_norm": 6.6704734741717475, + "learning_rate": 2.2478536471635983e-06, + "loss": 0.6177, + "step": 15120 + }, + { + "epoch": 1.092419672368017, + "grad_norm": 6.3446293864970515, + "learning_rate": 2.2475626380367037e-06, + "loss": 0.6686, + "step": 15121 + }, + { + "epoch": 1.0924919175682266, + "grad_norm": 5.809963862557883, + "learning_rate": 2.2472716323654817e-06, + "loss": 0.6261, + "step": 15122 + }, + { + "epoch": 1.0925641627684362, + "grad_norm": 7.074075850123519, + "learning_rate": 2.2469806301539144e-06, + "loss": 0.7294, + "step": 15123 + }, + { + "epoch": 1.0926364079686457, + "grad_norm": 6.5577052447782815, + "learning_rate": 2.2466896314059873e-06, + "loss": 0.6735, + "step": 15124 + }, + { + "epoch": 1.092708653168855, + "grad_norm": 6.821639623351479, + "learning_rate": 2.2463986361256814e-06, + "loss": 0.5988, + "step": 15125 + }, + { + "epoch": 1.0927808983690646, + "grad_norm": 7.077573371595823, + "learning_rate": 2.2461076443169807e-06, + "loss": 0.6485, + "step": 15126 + }, + { + "epoch": 1.092853143569274, + "grad_norm": 6.133060367701997, + "learning_rate": 2.2458166559838704e-06, + "loss": 0.619, + "step": 15127 + }, + { + "epoch": 1.0929253887694836, + "grad_norm": 6.525379517724758, + "learning_rate": 2.245525671130332e-06, + "loss": 0.6483, + "step": 15128 + }, + { + "epoch": 1.0929976339696932, + "grad_norm": 8.076404498992584, + "learning_rate": 2.2452346897603494e-06, + "loss": 0.724, + "step": 15129 + }, + { + "epoch": 1.0930698791699027, + "grad_norm": 6.4232681137017815, + "learning_rate": 2.244943711877906e-06, + "loss": 0.7432, + "step": 15130 + }, + { + "epoch": 1.0931421243701123, + "grad_norm": 5.20811557568872, + "learning_rate": 2.2446527374869858e-06, + "loss": 0.6486, + "step": 15131 + }, + { + "epoch": 1.0932143695703216, + "grad_norm": 5.969245290927633, + "learning_rate": 2.244361766591571e-06, + "loss": 0.627, + "step": 15132 + }, + { + "epoch": 1.0932866147705311, + "grad_norm": 7.894912974751922, + "learning_rate": 2.244070799195645e-06, + "loss": 0.7182, + "step": 15133 + }, + { + "epoch": 1.0933588599707407, + "grad_norm": 7.065313268412841, + "learning_rate": 2.243779835303191e-06, + "loss": 0.6536, + "step": 15134 + }, + { + "epoch": 1.0934311051709502, + "grad_norm": 6.368523093721469, + "learning_rate": 2.243488874918193e-06, + "loss": 0.7079, + "step": 15135 + }, + { + "epoch": 1.0935033503711598, + "grad_norm": 6.216438822452996, + "learning_rate": 2.243197918044632e-06, + "loss": 0.6642, + "step": 15136 + }, + { + "epoch": 1.0935755955713693, + "grad_norm": 6.395083637182746, + "learning_rate": 2.2429069646864927e-06, + "loss": 0.7123, + "step": 15137 + }, + { + "epoch": 1.0936478407715788, + "grad_norm": 6.4654535564925455, + "learning_rate": 2.242616014847758e-06, + "loss": 0.6692, + "step": 15138 + }, + { + "epoch": 1.0937200859717882, + "grad_norm": 7.671990854045043, + "learning_rate": 2.2423250685324093e-06, + "loss": 0.7381, + "step": 15139 + }, + { + "epoch": 1.0937923311719977, + "grad_norm": 5.869211795013583, + "learning_rate": 2.2420341257444305e-06, + "loss": 0.6754, + "step": 15140 + }, + { + "epoch": 1.0938645763722072, + "grad_norm": 7.287849546133625, + "learning_rate": 2.2417431864878035e-06, + "loss": 0.6796, + "step": 15141 + }, + { + "epoch": 1.0939368215724168, + "grad_norm": 6.528895779727155, + "learning_rate": 2.2414522507665134e-06, + "loss": 0.7311, + "step": 15142 + }, + { + "epoch": 1.0940090667726263, + "grad_norm": 7.526835180422908, + "learning_rate": 2.2411613185845403e-06, + "loss": 0.7104, + "step": 15143 + }, + { + "epoch": 1.0940813119728359, + "grad_norm": 5.4177363977754815, + "learning_rate": 2.2408703899458678e-06, + "loss": 0.6006, + "step": 15144 + }, + { + "epoch": 1.0941535571730454, + "grad_norm": 6.967135802661278, + "learning_rate": 2.2405794648544786e-06, + "loss": 0.6383, + "step": 15145 + }, + { + "epoch": 1.0942258023732547, + "grad_norm": 6.324592112782168, + "learning_rate": 2.240288543314355e-06, + "loss": 0.7444, + "step": 15146 + }, + { + "epoch": 1.0942980475734643, + "grad_norm": 5.554170007649834, + "learning_rate": 2.2399976253294796e-06, + "loss": 0.5962, + "step": 15147 + }, + { + "epoch": 1.0943702927736738, + "grad_norm": 5.151277861340459, + "learning_rate": 2.2397067109038345e-06, + "loss": 0.6306, + "step": 15148 + }, + { + "epoch": 1.0944425379738834, + "grad_norm": 6.576569506930812, + "learning_rate": 2.2394158000414032e-06, + "loss": 0.6825, + "step": 15149 + }, + { + "epoch": 1.094514783174093, + "grad_norm": 7.169837279874143, + "learning_rate": 2.2391248927461665e-06, + "loss": 0.7059, + "step": 15150 + }, + { + "epoch": 1.0945870283743024, + "grad_norm": 7.806314445822099, + "learning_rate": 2.238833989022108e-06, + "loss": 0.6583, + "step": 15151 + }, + { + "epoch": 1.094659273574512, + "grad_norm": 6.551029025483295, + "learning_rate": 2.23854308887321e-06, + "loss": 0.6728, + "step": 15152 + }, + { + "epoch": 1.0947315187747213, + "grad_norm": 5.891803492027876, + "learning_rate": 2.238252192303453e-06, + "loss": 0.6108, + "step": 15153 + }, + { + "epoch": 1.0948037639749308, + "grad_norm": 6.785228493876335, + "learning_rate": 2.2379612993168203e-06, + "loss": 0.6114, + "step": 15154 + }, + { + "epoch": 1.0948760091751404, + "grad_norm": 7.232685855886554, + "learning_rate": 2.2376704099172934e-06, + "loss": 0.6876, + "step": 15155 + }, + { + "epoch": 1.09494825437535, + "grad_norm": 6.005675492611851, + "learning_rate": 2.2373795241088567e-06, + "loss": 0.6448, + "step": 15156 + }, + { + "epoch": 1.0950204995755595, + "grad_norm": 7.748962086733349, + "learning_rate": 2.2370886418954895e-06, + "loss": 0.7687, + "step": 15157 + }, + { + "epoch": 1.095092744775769, + "grad_norm": 7.34224578386169, + "learning_rate": 2.2367977632811742e-06, + "loss": 0.7074, + "step": 15158 + }, + { + "epoch": 1.0951649899759786, + "grad_norm": 5.4687658691176, + "learning_rate": 2.236506888269893e-06, + "loss": 0.7069, + "step": 15159 + }, + { + "epoch": 1.0952372351761879, + "grad_norm": 7.773003678221221, + "learning_rate": 2.236216016865629e-06, + "loss": 0.6495, + "step": 15160 + }, + { + "epoch": 1.0953094803763974, + "grad_norm": 6.514258371740337, + "learning_rate": 2.235925149072362e-06, + "loss": 0.679, + "step": 15161 + }, + { + "epoch": 1.095381725576607, + "grad_norm": 5.8365473294580505, + "learning_rate": 2.235634284894075e-06, + "loss": 0.6326, + "step": 15162 + }, + { + "epoch": 1.0954539707768165, + "grad_norm": 6.758877885221332, + "learning_rate": 2.23534342433475e-06, + "loss": 0.6634, + "step": 15163 + }, + { + "epoch": 1.095526215977026, + "grad_norm": 6.394893349361428, + "learning_rate": 2.235052567398367e-06, + "loss": 0.6439, + "step": 15164 + }, + { + "epoch": 1.0955984611772356, + "grad_norm": 7.486873583947556, + "learning_rate": 2.234761714088909e-06, + "loss": 0.6739, + "step": 15165 + }, + { + "epoch": 1.0956707063774451, + "grad_norm": 6.226089517010506, + "learning_rate": 2.234470864410357e-06, + "loss": 0.6889, + "step": 15166 + }, + { + "epoch": 1.0957429515776544, + "grad_norm": 6.492482753608838, + "learning_rate": 2.234180018366694e-06, + "loss": 0.6585, + "step": 15167 + }, + { + "epoch": 1.095815196777864, + "grad_norm": 6.828177794507712, + "learning_rate": 2.233889175961899e-06, + "loss": 0.7224, + "step": 15168 + }, + { + "epoch": 1.0958874419780735, + "grad_norm": 6.2400812407853765, + "learning_rate": 2.2335983371999543e-06, + "loss": 0.6862, + "step": 15169 + }, + { + "epoch": 1.095959687178283, + "grad_norm": 5.930776222207376, + "learning_rate": 2.233307502084842e-06, + "loss": 0.6159, + "step": 15170 + }, + { + "epoch": 1.0960319323784926, + "grad_norm": 6.129803700625586, + "learning_rate": 2.233016670620543e-06, + "loss": 0.6949, + "step": 15171 + }, + { + "epoch": 1.0961041775787022, + "grad_norm": 6.167468697569372, + "learning_rate": 2.232725842811038e-06, + "loss": 0.701, + "step": 15172 + }, + { + "epoch": 1.0961764227789117, + "grad_norm": 6.750978187043237, + "learning_rate": 2.2324350186603085e-06, + "loss": 0.7385, + "step": 15173 + }, + { + "epoch": 1.0962486679791212, + "grad_norm": 6.16024418334513, + "learning_rate": 2.2321441981723367e-06, + "loss": 0.7045, + "step": 15174 + }, + { + "epoch": 1.0963209131793306, + "grad_norm": 5.867202454000584, + "learning_rate": 2.2318533813511023e-06, + "loss": 0.628, + "step": 15175 + }, + { + "epoch": 1.09639315837954, + "grad_norm": 6.819323560891139, + "learning_rate": 2.231562568200587e-06, + "loss": 0.7203, + "step": 15176 + }, + { + "epoch": 1.0964654035797496, + "grad_norm": 7.609134615443945, + "learning_rate": 2.231271758724772e-06, + "loss": 0.7049, + "step": 15177 + }, + { + "epoch": 1.0965376487799592, + "grad_norm": 6.746273849018231, + "learning_rate": 2.2309809529276377e-06, + "loss": 0.6594, + "step": 15178 + }, + { + "epoch": 1.0966098939801687, + "grad_norm": 5.57905258886369, + "learning_rate": 2.230690150813165e-06, + "loss": 0.6566, + "step": 15179 + }, + { + "epoch": 1.0966821391803783, + "grad_norm": 6.701893086372097, + "learning_rate": 2.2303993523853352e-06, + "loss": 0.6191, + "step": 15180 + }, + { + "epoch": 1.0967543843805878, + "grad_norm": 6.322856301569943, + "learning_rate": 2.23010855764813e-06, + "loss": 0.6575, + "step": 15181 + }, + { + "epoch": 1.0968266295807971, + "grad_norm": 8.466785386021616, + "learning_rate": 2.2298177666055283e-06, + "loss": 0.6743, + "step": 15182 + }, + { + "epoch": 1.0968988747810067, + "grad_norm": 6.009851315606182, + "learning_rate": 2.229526979261511e-06, + "loss": 0.654, + "step": 15183 + }, + { + "epoch": 1.0969711199812162, + "grad_norm": 5.8389245984435005, + "learning_rate": 2.22923619562006e-06, + "loss": 0.6543, + "step": 15184 + }, + { + "epoch": 1.0970433651814258, + "grad_norm": 5.147855271339058, + "learning_rate": 2.228945415685155e-06, + "loss": 0.5757, + "step": 15185 + }, + { + "epoch": 1.0971156103816353, + "grad_norm": 6.272625064142803, + "learning_rate": 2.228654639460777e-06, + "loss": 0.6558, + "step": 15186 + }, + { + "epoch": 1.0971878555818448, + "grad_norm": 7.3572100716958255, + "learning_rate": 2.228363866950906e-06, + "loss": 0.6549, + "step": 15187 + }, + { + "epoch": 1.0972601007820544, + "grad_norm": 5.65978690996498, + "learning_rate": 2.228073098159523e-06, + "loss": 0.6688, + "step": 15188 + }, + { + "epoch": 1.0973323459822637, + "grad_norm": 7.4455138556520755, + "learning_rate": 2.227782333090608e-06, + "loss": 0.6645, + "step": 15189 + }, + { + "epoch": 1.0974045911824732, + "grad_norm": 7.226217527661348, + "learning_rate": 2.2274915717481413e-06, + "loss": 0.679, + "step": 15190 + }, + { + "epoch": 1.0974768363826828, + "grad_norm": 6.588361083761236, + "learning_rate": 2.2272008141361036e-06, + "loss": 0.6646, + "step": 15191 + }, + { + "epoch": 1.0975490815828923, + "grad_norm": 7.174674358355054, + "learning_rate": 2.2269100602584754e-06, + "loss": 0.6342, + "step": 15192 + }, + { + "epoch": 1.0976213267831019, + "grad_norm": 6.512618215004185, + "learning_rate": 2.2266193101192355e-06, + "loss": 0.6799, + "step": 15193 + }, + { + "epoch": 1.0976935719833114, + "grad_norm": 7.2932629681714065, + "learning_rate": 2.2263285637223655e-06, + "loss": 0.6159, + "step": 15194 + }, + { + "epoch": 1.097765817183521, + "grad_norm": 6.403779832933175, + "learning_rate": 2.226037821071846e-06, + "loss": 0.7156, + "step": 15195 + }, + { + "epoch": 1.0978380623837305, + "grad_norm": 6.306004572195518, + "learning_rate": 2.2257470821716547e-06, + "loss": 0.6585, + "step": 15196 + }, + { + "epoch": 1.0979103075839398, + "grad_norm": 6.439917545620034, + "learning_rate": 2.225456347025773e-06, + "loss": 0.6352, + "step": 15197 + }, + { + "epoch": 1.0979825527841494, + "grad_norm": 4.713987056139856, + "learning_rate": 2.2251656156381802e-06, + "loss": 0.5852, + "step": 15198 + }, + { + "epoch": 1.098054797984359, + "grad_norm": 8.12872698474848, + "learning_rate": 2.224874888012858e-06, + "loss": 0.6897, + "step": 15199 + }, + { + "epoch": 1.0981270431845684, + "grad_norm": 8.55549591285867, + "learning_rate": 2.2245841641537848e-06, + "loss": 0.6165, + "step": 15200 + }, + { + "epoch": 1.098199288384778, + "grad_norm": 6.29491975401376, + "learning_rate": 2.22429344406494e-06, + "loss": 0.6592, + "step": 15201 + }, + { + "epoch": 1.0982715335849875, + "grad_norm": 6.142821251647121, + "learning_rate": 2.224002727750304e-06, + "loss": 0.6626, + "step": 15202 + }, + { + "epoch": 1.098343778785197, + "grad_norm": 7.2056080806049465, + "learning_rate": 2.2237120152138564e-06, + "loss": 0.7126, + "step": 15203 + }, + { + "epoch": 1.0984160239854064, + "grad_norm": 5.905601183257582, + "learning_rate": 2.2234213064595765e-06, + "loss": 0.6356, + "step": 15204 + }, + { + "epoch": 1.098488269185616, + "grad_norm": 6.772841342619427, + "learning_rate": 2.223130601491444e-06, + "loss": 0.6728, + "step": 15205 + }, + { + "epoch": 1.0985605143858255, + "grad_norm": 6.250129088022371, + "learning_rate": 2.2228399003134395e-06, + "loss": 0.661, + "step": 15206 + }, + { + "epoch": 1.098632759586035, + "grad_norm": 6.581589029098461, + "learning_rate": 2.222549202929541e-06, + "loss": 0.685, + "step": 15207 + }, + { + "epoch": 1.0987050047862446, + "grad_norm": 8.50984339999173, + "learning_rate": 2.2222585093437286e-06, + "loss": 0.7333, + "step": 15208 + }, + { + "epoch": 1.098777249986454, + "grad_norm": 5.563466877432548, + "learning_rate": 2.2219678195599823e-06, + "loss": 0.6662, + "step": 15209 + }, + { + "epoch": 1.0988494951866636, + "grad_norm": 8.30276538786551, + "learning_rate": 2.22167713358228e-06, + "loss": 0.6876, + "step": 15210 + }, + { + "epoch": 1.098921740386873, + "grad_norm": 6.4572519032738676, + "learning_rate": 2.221386451414601e-06, + "loss": 0.686, + "step": 15211 + }, + { + "epoch": 1.0989939855870825, + "grad_norm": 7.52598647435068, + "learning_rate": 2.221095773060925e-06, + "loss": 0.6668, + "step": 15212 + }, + { + "epoch": 1.099066230787292, + "grad_norm": 6.394590308791108, + "learning_rate": 2.220805098525233e-06, + "loss": 0.653, + "step": 15213 + }, + { + "epoch": 1.0991384759875016, + "grad_norm": 7.9061395087079225, + "learning_rate": 2.2205144278115013e-06, + "loss": 0.7085, + "step": 15214 + }, + { + "epoch": 1.0992107211877111, + "grad_norm": 6.978876394431969, + "learning_rate": 2.22022376092371e-06, + "loss": 0.621, + "step": 15215 + }, + { + "epoch": 1.0992829663879207, + "grad_norm": 7.919711628297499, + "learning_rate": 2.219933097865838e-06, + "loss": 0.6728, + "step": 15216 + }, + { + "epoch": 1.0993552115881302, + "grad_norm": 6.529040679332823, + "learning_rate": 2.2196424386418654e-06, + "loss": 0.698, + "step": 15217 + }, + { + "epoch": 1.0994274567883395, + "grad_norm": 6.7848006414981015, + "learning_rate": 2.219351783255769e-06, + "loss": 0.6648, + "step": 15218 + }, + { + "epoch": 1.099499701988549, + "grad_norm": 5.7287796537134135, + "learning_rate": 2.2190611317115295e-06, + "loss": 0.5914, + "step": 15219 + }, + { + "epoch": 1.0995719471887586, + "grad_norm": 6.882742943125407, + "learning_rate": 2.218770484013125e-06, + "loss": 0.7459, + "step": 15220 + }, + { + "epoch": 1.0996441923889682, + "grad_norm": 5.722657249893257, + "learning_rate": 2.218479840164534e-06, + "loss": 0.7022, + "step": 15221 + }, + { + "epoch": 1.0997164375891777, + "grad_norm": 5.898807418036134, + "learning_rate": 2.2181892001697352e-06, + "loss": 0.5693, + "step": 15222 + }, + { + "epoch": 1.0997886827893872, + "grad_norm": 6.445386185658342, + "learning_rate": 2.2178985640327076e-06, + "loss": 0.6593, + "step": 15223 + }, + { + "epoch": 1.0998609279895968, + "grad_norm": 6.158243389100848, + "learning_rate": 2.2176079317574307e-06, + "loss": 0.6718, + "step": 15224 + }, + { + "epoch": 1.099933173189806, + "grad_norm": 8.753978151896812, + "learning_rate": 2.2173173033478808e-06, + "loss": 0.6661, + "step": 15225 + }, + { + "epoch": 1.1000054183900156, + "grad_norm": 7.440636750732269, + "learning_rate": 2.2170266788080365e-06, + "loss": 0.6378, + "step": 15226 + }, + { + "epoch": 1.1000776635902252, + "grad_norm": 7.684359025104761, + "learning_rate": 2.2167360581418797e-06, + "loss": 0.6697, + "step": 15227 + }, + { + "epoch": 1.1001499087904347, + "grad_norm": 6.910933757576564, + "learning_rate": 2.2164454413533843e-06, + "loss": 0.6839, + "step": 15228 + }, + { + "epoch": 1.1002221539906443, + "grad_norm": 6.654568311990291, + "learning_rate": 2.2161548284465313e-06, + "loss": 0.6736, + "step": 15229 + }, + { + "epoch": 1.1002943991908538, + "grad_norm": 6.259674214426175, + "learning_rate": 2.2158642194252984e-06, + "loss": 0.6613, + "step": 15230 + }, + { + "epoch": 1.1003666443910634, + "grad_norm": 6.960814195561947, + "learning_rate": 2.215573614293664e-06, + "loss": 0.6687, + "step": 15231 + }, + { + "epoch": 1.1004388895912727, + "grad_norm": 6.77174012983268, + "learning_rate": 2.2152830130556055e-06, + "loss": 0.6505, + "step": 15232 + }, + { + "epoch": 1.1005111347914822, + "grad_norm": 7.3241859374265, + "learning_rate": 2.2149924157151013e-06, + "loss": 0.6922, + "step": 15233 + }, + { + "epoch": 1.1005833799916918, + "grad_norm": 6.749837237620993, + "learning_rate": 2.214701822276131e-06, + "loss": 0.7033, + "step": 15234 + }, + { + "epoch": 1.1006556251919013, + "grad_norm": 7.659585778811966, + "learning_rate": 2.2144112327426696e-06, + "loss": 0.6654, + "step": 15235 + }, + { + "epoch": 1.1007278703921108, + "grad_norm": 6.3064228680662655, + "learning_rate": 2.214120647118697e-06, + "loss": 0.6455, + "step": 15236 + }, + { + "epoch": 1.1008001155923204, + "grad_norm": 6.291052613598184, + "learning_rate": 2.213830065408191e-06, + "loss": 0.6363, + "step": 15237 + }, + { + "epoch": 1.10087236079253, + "grad_norm": 6.084730153428257, + "learning_rate": 2.2135394876151304e-06, + "loss": 0.7064, + "step": 15238 + }, + { + "epoch": 1.1009446059927392, + "grad_norm": 6.563257373520729, + "learning_rate": 2.2132489137434905e-06, + "loss": 0.6515, + "step": 15239 + }, + { + "epoch": 1.1010168511929488, + "grad_norm": 6.7989257524949815, + "learning_rate": 2.2129583437972494e-06, + "loss": 0.6415, + "step": 15240 + }, + { + "epoch": 1.1010890963931583, + "grad_norm": 7.0207153068907555, + "learning_rate": 2.2126677777803866e-06, + "loss": 0.6846, + "step": 15241 + }, + { + "epoch": 1.1011613415933679, + "grad_norm": 5.647889929857879, + "learning_rate": 2.21237721569688e-06, + "loss": 0.6811, + "step": 15242 + }, + { + "epoch": 1.1012335867935774, + "grad_norm": 5.956254111705854, + "learning_rate": 2.212086657550705e-06, + "loss": 0.6222, + "step": 15243 + }, + { + "epoch": 1.101305831993787, + "grad_norm": 8.857257139110297, + "learning_rate": 2.21179610334584e-06, + "loss": 0.6604, + "step": 15244 + }, + { + "epoch": 1.1013780771939965, + "grad_norm": 7.043338674942545, + "learning_rate": 2.211505553086263e-06, + "loss": 0.7025, + "step": 15245 + }, + { + "epoch": 1.101450322394206, + "grad_norm": 6.223286723102147, + "learning_rate": 2.2112150067759505e-06, + "loss": 0.6694, + "step": 15246 + }, + { + "epoch": 1.1015225675944154, + "grad_norm": 6.124446610344398, + "learning_rate": 2.2109244644188803e-06, + "loss": 0.6635, + "step": 15247 + }, + { + "epoch": 1.101594812794625, + "grad_norm": 6.600713298283949, + "learning_rate": 2.2106339260190295e-06, + "loss": 0.647, + "step": 15248 + }, + { + "epoch": 1.1016670579948344, + "grad_norm": 6.282011830452045, + "learning_rate": 2.2103433915803766e-06, + "loss": 0.6661, + "step": 15249 + }, + { + "epoch": 1.101739303195044, + "grad_norm": 6.527245567035364, + "learning_rate": 2.210052861106897e-06, + "loss": 0.6569, + "step": 15250 + }, + { + "epoch": 1.1018115483952535, + "grad_norm": 8.454472404455249, + "learning_rate": 2.2097623346025686e-06, + "loss": 0.6642, + "step": 15251 + }, + { + "epoch": 1.101883793595463, + "grad_norm": 6.577276542112684, + "learning_rate": 2.2094718120713697e-06, + "loss": 0.6758, + "step": 15252 + }, + { + "epoch": 1.1019560387956726, + "grad_norm": 6.516703344893917, + "learning_rate": 2.2091812935172747e-06, + "loss": 0.712, + "step": 15253 + }, + { + "epoch": 1.102028283995882, + "grad_norm": 7.335447382372839, + "learning_rate": 2.2088907789442617e-06, + "loss": 0.7335, + "step": 15254 + }, + { + "epoch": 1.1021005291960915, + "grad_norm": 6.913588547491376, + "learning_rate": 2.2086002683563087e-06, + "loss": 0.7213, + "step": 15255 + }, + { + "epoch": 1.102172774396301, + "grad_norm": 7.122443292565449, + "learning_rate": 2.208309761757392e-06, + "loss": 0.7078, + "step": 15256 + }, + { + "epoch": 1.1022450195965106, + "grad_norm": 7.377741675818382, + "learning_rate": 2.208019259151488e-06, + "loss": 0.7099, + "step": 15257 + }, + { + "epoch": 1.10231726479672, + "grad_norm": 6.877049366072519, + "learning_rate": 2.207728760542573e-06, + "loss": 0.7119, + "step": 15258 + }, + { + "epoch": 1.1023895099969296, + "grad_norm": 5.709829909268177, + "learning_rate": 2.207438265934625e-06, + "loss": 0.6475, + "step": 15259 + }, + { + "epoch": 1.1024617551971392, + "grad_norm": 6.669709687327659, + "learning_rate": 2.20714777533162e-06, + "loss": 0.7068, + "step": 15260 + }, + { + "epoch": 1.1025340003973485, + "grad_norm": 6.646609493202301, + "learning_rate": 2.206857288737534e-06, + "loss": 0.6955, + "step": 15261 + }, + { + "epoch": 1.102606245597558, + "grad_norm": 8.345035982505792, + "learning_rate": 2.2065668061563443e-06, + "loss": 0.663, + "step": 15262 + }, + { + "epoch": 1.1026784907977676, + "grad_norm": 6.521133692184792, + "learning_rate": 2.2062763275920275e-06, + "loss": 0.6516, + "step": 15263 + }, + { + "epoch": 1.1027507359979771, + "grad_norm": 5.562764365096889, + "learning_rate": 2.2059858530485595e-06, + "loss": 0.6796, + "step": 15264 + }, + { + "epoch": 1.1028229811981867, + "grad_norm": 6.987455571942697, + "learning_rate": 2.2056953825299164e-06, + "loss": 0.7205, + "step": 15265 + }, + { + "epoch": 1.1028952263983962, + "grad_norm": 5.983137276641961, + "learning_rate": 2.205404916040076e-06, + "loss": 0.68, + "step": 15266 + }, + { + "epoch": 1.1029674715986058, + "grad_norm": 5.710186659773314, + "learning_rate": 2.2051144535830137e-06, + "loss": 0.6174, + "step": 15267 + }, + { + "epoch": 1.1030397167988153, + "grad_norm": 6.763022400145018, + "learning_rate": 2.2048239951627044e-06, + "loss": 0.6296, + "step": 15268 + }, + { + "epoch": 1.1031119619990246, + "grad_norm": 10.627996044728178, + "learning_rate": 2.2045335407831255e-06, + "loss": 0.6212, + "step": 15269 + }, + { + "epoch": 1.1031842071992342, + "grad_norm": 6.719212715716252, + "learning_rate": 2.2042430904482543e-06, + "loss": 0.6488, + "step": 15270 + }, + { + "epoch": 1.1032564523994437, + "grad_norm": 7.359150034711783, + "learning_rate": 2.2039526441620647e-06, + "loss": 0.6536, + "step": 15271 + }, + { + "epoch": 1.1033286975996532, + "grad_norm": 8.109191539651645, + "learning_rate": 2.203662201928533e-06, + "loss": 0.7152, + "step": 15272 + }, + { + "epoch": 1.1034009427998628, + "grad_norm": 7.284012037322789, + "learning_rate": 2.2033717637516364e-06, + "loss": 0.587, + "step": 15273 + }, + { + "epoch": 1.1034731880000723, + "grad_norm": 6.002832062229793, + "learning_rate": 2.20308132963535e-06, + "loss": 0.6019, + "step": 15274 + }, + { + "epoch": 1.1035454332002819, + "grad_norm": 6.5888234034274085, + "learning_rate": 2.2027908995836493e-06, + "loss": 0.5993, + "step": 15275 + }, + { + "epoch": 1.1036176784004912, + "grad_norm": 6.911375327475446, + "learning_rate": 2.202500473600511e-06, + "loss": 0.6896, + "step": 15276 + }, + { + "epoch": 1.1036899236007007, + "grad_norm": 6.20245157809501, + "learning_rate": 2.2022100516899096e-06, + "loss": 0.7027, + "step": 15277 + }, + { + "epoch": 1.1037621688009103, + "grad_norm": 5.901943029084125, + "learning_rate": 2.2019196338558218e-06, + "loss": 0.6379, + "step": 15278 + }, + { + "epoch": 1.1038344140011198, + "grad_norm": 6.809473327795119, + "learning_rate": 2.201629220102222e-06, + "loss": 0.6674, + "step": 15279 + }, + { + "epoch": 1.1039066592013294, + "grad_norm": 7.263827062489167, + "learning_rate": 2.201338810433087e-06, + "loss": 0.66, + "step": 15280 + }, + { + "epoch": 1.103978904401539, + "grad_norm": 6.743439205669214, + "learning_rate": 2.201048404852393e-06, + "loss": 0.6556, + "step": 15281 + }, + { + "epoch": 1.1040511496017484, + "grad_norm": 5.852416766852772, + "learning_rate": 2.2007580033641128e-06, + "loss": 0.6048, + "step": 15282 + }, + { + "epoch": 1.1041233948019578, + "grad_norm": 6.367655394043019, + "learning_rate": 2.2004676059722228e-06, + "loss": 0.7014, + "step": 15283 + }, + { + "epoch": 1.1041956400021673, + "grad_norm": 5.72422620049928, + "learning_rate": 2.2001772126807003e-06, + "loss": 0.5501, + "step": 15284 + }, + { + "epoch": 1.1042678852023768, + "grad_norm": 5.470293797164365, + "learning_rate": 2.199886823493518e-06, + "loss": 0.6487, + "step": 15285 + }, + { + "epoch": 1.1043401304025864, + "grad_norm": 7.219196784655214, + "learning_rate": 2.199596438414652e-06, + "loss": 0.6158, + "step": 15286 + }, + { + "epoch": 1.104412375602796, + "grad_norm": 10.385399443759749, + "learning_rate": 2.199306057448077e-06, + "loss": 0.6675, + "step": 15287 + }, + { + "epoch": 1.1044846208030055, + "grad_norm": 6.202201255605873, + "learning_rate": 2.19901568059777e-06, + "loss": 0.6627, + "step": 15288 + }, + { + "epoch": 1.104556866003215, + "grad_norm": 6.01687918135372, + "learning_rate": 2.1987253078677037e-06, + "loss": 0.6896, + "step": 15289 + }, + { + "epoch": 1.1046291112034243, + "grad_norm": 5.664496548562956, + "learning_rate": 2.198434939261854e-06, + "loss": 0.6167, + "step": 15290 + }, + { + "epoch": 1.1047013564036339, + "grad_norm": 5.999062146958173, + "learning_rate": 2.1981445747841957e-06, + "loss": 0.6549, + "step": 15291 + }, + { + "epoch": 1.1047736016038434, + "grad_norm": 6.977796216941709, + "learning_rate": 2.1978542144387045e-06, + "loss": 0.6265, + "step": 15292 + }, + { + "epoch": 1.104845846804053, + "grad_norm": 6.658753116951384, + "learning_rate": 2.197563858229354e-06, + "loss": 0.7126, + "step": 15293 + }, + { + "epoch": 1.1049180920042625, + "grad_norm": 6.791885684485793, + "learning_rate": 2.1972735061601192e-06, + "loss": 0.6207, + "step": 15294 + }, + { + "epoch": 1.104990337204472, + "grad_norm": 7.719570100900996, + "learning_rate": 2.1969831582349767e-06, + "loss": 0.692, + "step": 15295 + }, + { + "epoch": 1.1050625824046816, + "grad_norm": 5.403024758798487, + "learning_rate": 2.1966928144578976e-06, + "loss": 0.6371, + "step": 15296 + }, + { + "epoch": 1.105134827604891, + "grad_norm": 6.519944037916917, + "learning_rate": 2.1964024748328584e-06, + "loss": 0.7075, + "step": 15297 + }, + { + "epoch": 1.1052070728051004, + "grad_norm": 6.042084287043527, + "learning_rate": 2.1961121393638342e-06, + "loss": 0.6337, + "step": 15298 + }, + { + "epoch": 1.10527931800531, + "grad_norm": 7.373194586682623, + "learning_rate": 2.1958218080547994e-06, + "loss": 0.6714, + "step": 15299 + }, + { + "epoch": 1.1053515632055195, + "grad_norm": 7.838395559442574, + "learning_rate": 2.195531480909727e-06, + "loss": 0.6609, + "step": 15300 + }, + { + "epoch": 1.105423808405729, + "grad_norm": 6.1478964580883275, + "learning_rate": 2.1952411579325925e-06, + "loss": 0.5931, + "step": 15301 + }, + { + "epoch": 1.1054960536059386, + "grad_norm": 7.581380683546214, + "learning_rate": 2.1949508391273703e-06, + "loss": 0.6072, + "step": 15302 + }, + { + "epoch": 1.1055682988061482, + "grad_norm": 6.494952075713901, + "learning_rate": 2.1946605244980336e-06, + "loss": 0.6155, + "step": 15303 + }, + { + "epoch": 1.1056405440063575, + "grad_norm": 7.192188889633664, + "learning_rate": 2.1943702140485573e-06, + "loss": 0.7468, + "step": 15304 + }, + { + "epoch": 1.105712789206567, + "grad_norm": 8.986593645895274, + "learning_rate": 2.1940799077829155e-06, + "loss": 0.6433, + "step": 15305 + }, + { + "epoch": 1.1057850344067766, + "grad_norm": 7.505372983850235, + "learning_rate": 2.193789605705083e-06, + "loss": 0.6487, + "step": 15306 + }, + { + "epoch": 1.105857279606986, + "grad_norm": 7.02563089273255, + "learning_rate": 2.193499307819032e-06, + "loss": 0.6884, + "step": 15307 + }, + { + "epoch": 1.1059295248071956, + "grad_norm": 6.903623055432843, + "learning_rate": 2.193209014128738e-06, + "loss": 0.6915, + "step": 15308 + }, + { + "epoch": 1.1060017700074052, + "grad_norm": 6.721112922517842, + "learning_rate": 2.1929187246381753e-06, + "loss": 0.6331, + "step": 15309 + }, + { + "epoch": 1.1060740152076147, + "grad_norm": 5.787649540792121, + "learning_rate": 2.1926284393513157e-06, + "loss": 0.7298, + "step": 15310 + }, + { + "epoch": 1.106146260407824, + "grad_norm": 6.494834314454588, + "learning_rate": 2.1923381582721334e-06, + "loss": 0.6651, + "step": 15311 + }, + { + "epoch": 1.1062185056080336, + "grad_norm": 7.090308072777552, + "learning_rate": 2.192047881404604e-06, + "loss": 0.752, + "step": 15312 + }, + { + "epoch": 1.1062907508082431, + "grad_norm": 7.483178921177645, + "learning_rate": 2.1917576087527e-06, + "loss": 0.601, + "step": 15313 + }, + { + "epoch": 1.1063629960084527, + "grad_norm": 7.34271336807538, + "learning_rate": 2.1914673403203947e-06, + "loss": 0.7017, + "step": 15314 + }, + { + "epoch": 1.1064352412086622, + "grad_norm": 6.762734727434778, + "learning_rate": 2.191177076111662e-06, + "loss": 0.6254, + "step": 15315 + }, + { + "epoch": 1.1065074864088718, + "grad_norm": 6.313286458492314, + "learning_rate": 2.1908868161304754e-06, + "loss": 0.6282, + "step": 15316 + }, + { + "epoch": 1.1065797316090813, + "grad_norm": 6.827127134966228, + "learning_rate": 2.190596560380808e-06, + "loss": 0.7732, + "step": 15317 + }, + { + "epoch": 1.1066519768092906, + "grad_norm": 8.222481812282732, + "learning_rate": 2.1903063088666333e-06, + "loss": 0.6092, + "step": 15318 + }, + { + "epoch": 1.1067242220095002, + "grad_norm": 6.313557149161827, + "learning_rate": 2.190016061591925e-06, + "loss": 0.6107, + "step": 15319 + }, + { + "epoch": 1.1067964672097097, + "grad_norm": 5.814979926588815, + "learning_rate": 2.1897258185606564e-06, + "loss": 0.6456, + "step": 15320 + }, + { + "epoch": 1.1068687124099192, + "grad_norm": 7.1518775875530585, + "learning_rate": 2.1894355797768e-06, + "loss": 0.6147, + "step": 15321 + }, + { + "epoch": 1.1069409576101288, + "grad_norm": 6.4652440987253925, + "learning_rate": 2.1891453452443296e-06, + "loss": 0.68, + "step": 15322 + }, + { + "epoch": 1.1070132028103383, + "grad_norm": 5.570499112078379, + "learning_rate": 2.188855114967218e-06, + "loss": 0.6131, + "step": 15323 + }, + { + "epoch": 1.1070854480105479, + "grad_norm": 6.092044846067217, + "learning_rate": 2.1885648889494394e-06, + "loss": 0.5504, + "step": 15324 + }, + { + "epoch": 1.1071576932107574, + "grad_norm": 7.106885134480313, + "learning_rate": 2.1882746671949637e-06, + "loss": 0.6591, + "step": 15325 + }, + { + "epoch": 1.1072299384109667, + "grad_norm": 8.210891970318583, + "learning_rate": 2.1879844497077666e-06, + "loss": 0.6458, + "step": 15326 + }, + { + "epoch": 1.1073021836111763, + "grad_norm": 7.979938625778485, + "learning_rate": 2.187694236491821e-06, + "loss": 0.6899, + "step": 15327 + }, + { + "epoch": 1.1073744288113858, + "grad_norm": 7.081221082313527, + "learning_rate": 2.187404027551098e-06, + "loss": 0.6502, + "step": 15328 + }, + { + "epoch": 1.1074466740115954, + "grad_norm": 6.91418387958428, + "learning_rate": 2.187113822889571e-06, + "loss": 0.7501, + "step": 15329 + }, + { + "epoch": 1.107518919211805, + "grad_norm": 6.863683924317862, + "learning_rate": 2.1868236225112127e-06, + "loss": 0.6736, + "step": 15330 + }, + { + "epoch": 1.1075911644120144, + "grad_norm": 6.632491859980311, + "learning_rate": 2.1865334264199967e-06, + "loss": 0.5809, + "step": 15331 + }, + { + "epoch": 1.107663409612224, + "grad_norm": 8.064771701823869, + "learning_rate": 2.1862432346198937e-06, + "loss": 0.7099, + "step": 15332 + }, + { + "epoch": 1.1077356548124333, + "grad_norm": 6.342985962206447, + "learning_rate": 2.1859530471148775e-06, + "loss": 0.6517, + "step": 15333 + }, + { + "epoch": 1.1078079000126428, + "grad_norm": 5.8230187185917845, + "learning_rate": 2.1856628639089207e-06, + "loss": 0.6485, + "step": 15334 + }, + { + "epoch": 1.1078801452128524, + "grad_norm": 6.284334099331566, + "learning_rate": 2.1853726850059947e-06, + "loss": 0.6431, + "step": 15335 + }, + { + "epoch": 1.107952390413062, + "grad_norm": 6.788074346869151, + "learning_rate": 2.1850825104100727e-06, + "loss": 0.7178, + "step": 15336 + }, + { + "epoch": 1.1080246356132715, + "grad_norm": 7.044385516819073, + "learning_rate": 2.1847923401251262e-06, + "loss": 0.6571, + "step": 15337 + }, + { + "epoch": 1.108096880813481, + "grad_norm": 6.976808823426551, + "learning_rate": 2.184502174155129e-06, + "loss": 0.6226, + "step": 15338 + }, + { + "epoch": 1.1081691260136906, + "grad_norm": 5.69882148471921, + "learning_rate": 2.1842120125040504e-06, + "loss": 0.6239, + "step": 15339 + }, + { + "epoch": 1.1082413712138999, + "grad_norm": 6.29097014691727, + "learning_rate": 2.183921855175865e-06, + "loss": 0.7242, + "step": 15340 + }, + { + "epoch": 1.1083136164141094, + "grad_norm": 7.298415223636658, + "learning_rate": 2.1836317021745444e-06, + "loss": 0.6905, + "step": 15341 + }, + { + "epoch": 1.108385861614319, + "grad_norm": 7.941593586390974, + "learning_rate": 2.1833415535040598e-06, + "loss": 0.6501, + "step": 15342 + }, + { + "epoch": 1.1084581068145285, + "grad_norm": 7.37949066513446, + "learning_rate": 2.183051409168383e-06, + "loss": 0.6875, + "step": 15343 + }, + { + "epoch": 1.108530352014738, + "grad_norm": 6.751155330542998, + "learning_rate": 2.1827612691714866e-06, + "loss": 0.7027, + "step": 15344 + }, + { + "epoch": 1.1086025972149476, + "grad_norm": 7.846827416254819, + "learning_rate": 2.1824711335173424e-06, + "loss": 0.7229, + "step": 15345 + }, + { + "epoch": 1.1086748424151571, + "grad_norm": 6.930498043510546, + "learning_rate": 2.1821810022099214e-06, + "loss": 0.6545, + "step": 15346 + }, + { + "epoch": 1.1087470876153667, + "grad_norm": 6.7631930237654005, + "learning_rate": 2.1818908752531958e-06, + "loss": 0.5882, + "step": 15347 + }, + { + "epoch": 1.108819332815576, + "grad_norm": 6.629815420858143, + "learning_rate": 2.181600752651137e-06, + "loss": 0.6989, + "step": 15348 + }, + { + "epoch": 1.1088915780157855, + "grad_norm": 6.248617095541177, + "learning_rate": 2.181310634407717e-06, + "loss": 0.7364, + "step": 15349 + }, + { + "epoch": 1.108963823215995, + "grad_norm": 6.351012634274388, + "learning_rate": 2.181020520526907e-06, + "loss": 0.6055, + "step": 15350 + }, + { + "epoch": 1.1090360684162046, + "grad_norm": 7.166937298063588, + "learning_rate": 2.180730411012678e-06, + "loss": 0.6644, + "step": 15351 + }, + { + "epoch": 1.1091083136164142, + "grad_norm": 6.072940920153738, + "learning_rate": 2.1804403058690028e-06, + "loss": 0.6495, + "step": 15352 + }, + { + "epoch": 1.1091805588166237, + "grad_norm": 6.443632593574572, + "learning_rate": 2.1801502050998504e-06, + "loss": 0.5911, + "step": 15353 + }, + { + "epoch": 1.1092528040168332, + "grad_norm": 5.996183771209421, + "learning_rate": 2.179860108709194e-06, + "loss": 0.6464, + "step": 15354 + }, + { + "epoch": 1.1093250492170426, + "grad_norm": 5.56959595425269, + "learning_rate": 2.1795700167010035e-06, + "loss": 0.7184, + "step": 15355 + }, + { + "epoch": 1.109397294417252, + "grad_norm": 7.966839969085457, + "learning_rate": 2.1792799290792518e-06, + "loss": 0.6747, + "step": 15356 + }, + { + "epoch": 1.1094695396174616, + "grad_norm": 6.258042920617361, + "learning_rate": 2.178989845847908e-06, + "loss": 0.6477, + "step": 15357 + }, + { + "epoch": 1.1095417848176712, + "grad_norm": 5.491372016716457, + "learning_rate": 2.178699767010944e-06, + "loss": 0.6246, + "step": 15358 + }, + { + "epoch": 1.1096140300178807, + "grad_norm": 5.560564883236118, + "learning_rate": 2.1784096925723315e-06, + "loss": 0.6507, + "step": 15359 + }, + { + "epoch": 1.1096862752180903, + "grad_norm": 5.927103356549365, + "learning_rate": 2.1781196225360395e-06, + "loss": 0.6016, + "step": 15360 + }, + { + "epoch": 1.1097585204182998, + "grad_norm": 5.808840019787796, + "learning_rate": 2.1778295569060404e-06, + "loss": 0.6621, + "step": 15361 + }, + { + "epoch": 1.1098307656185091, + "grad_norm": 5.667282893763926, + "learning_rate": 2.1775394956863043e-06, + "loss": 0.6505, + "step": 15362 + }, + { + "epoch": 1.1099030108187187, + "grad_norm": 7.522162688918306, + "learning_rate": 2.177249438880803e-06, + "loss": 0.7381, + "step": 15363 + }, + { + "epoch": 1.1099752560189282, + "grad_norm": 7.548166610730688, + "learning_rate": 2.1769593864935055e-06, + "loss": 0.6128, + "step": 15364 + }, + { + "epoch": 1.1100475012191378, + "grad_norm": 5.794317539898364, + "learning_rate": 2.176669338528383e-06, + "loss": 0.6636, + "step": 15365 + }, + { + "epoch": 1.1101197464193473, + "grad_norm": 5.8583179385044915, + "learning_rate": 2.1763792949894074e-06, + "loss": 0.663, + "step": 15366 + }, + { + "epoch": 1.1101919916195568, + "grad_norm": 6.382610789442041, + "learning_rate": 2.1760892558805465e-06, + "loss": 0.6339, + "step": 15367 + }, + { + "epoch": 1.1102642368197664, + "grad_norm": 6.96278543486462, + "learning_rate": 2.1757992212057727e-06, + "loss": 0.6779, + "step": 15368 + }, + { + "epoch": 1.1103364820199757, + "grad_norm": 5.859486652582042, + "learning_rate": 2.175509190969056e-06, + "loss": 0.6577, + "step": 15369 + }, + { + "epoch": 1.1104087272201852, + "grad_norm": 6.1746977326703005, + "learning_rate": 2.175219165174367e-06, + "loss": 0.6722, + "step": 15370 + }, + { + "epoch": 1.1104809724203948, + "grad_norm": 6.390495466864187, + "learning_rate": 2.174929143825675e-06, + "loss": 0.6132, + "step": 15371 + }, + { + "epoch": 1.1105532176206043, + "grad_norm": 7.031226942236499, + "learning_rate": 2.1746391269269502e-06, + "loss": 0.6254, + "step": 15372 + }, + { + "epoch": 1.1106254628208139, + "grad_norm": 7.131127917943982, + "learning_rate": 2.1743491144821635e-06, + "loss": 0.7094, + "step": 15373 + }, + { + "epoch": 1.1106977080210234, + "grad_norm": 7.50363172975208, + "learning_rate": 2.174059106495285e-06, + "loss": 0.6036, + "step": 15374 + }, + { + "epoch": 1.110769953221233, + "grad_norm": 5.789336616317214, + "learning_rate": 2.1737691029702836e-06, + "loss": 0.6546, + "step": 15375 + }, + { + "epoch": 1.1108421984214423, + "grad_norm": 6.346453010959153, + "learning_rate": 2.17347910391113e-06, + "loss": 0.681, + "step": 15376 + }, + { + "epoch": 1.1109144436216518, + "grad_norm": 7.051763154447672, + "learning_rate": 2.1731891093217946e-06, + "loss": 0.6411, + "step": 15377 + }, + { + "epoch": 1.1109866888218614, + "grad_norm": 6.8767104361998435, + "learning_rate": 2.1728991192062456e-06, + "loss": 0.6474, + "step": 15378 + }, + { + "epoch": 1.111058934022071, + "grad_norm": 7.807471281958295, + "learning_rate": 2.1726091335684537e-06, + "loss": 0.6778, + "step": 15379 + }, + { + "epoch": 1.1111311792222804, + "grad_norm": 6.9342379762332325, + "learning_rate": 2.172319152412389e-06, + "loss": 0.6605, + "step": 15380 + }, + { + "epoch": 1.11120342442249, + "grad_norm": 8.223329841768688, + "learning_rate": 2.1720291757420214e-06, + "loss": 0.6752, + "step": 15381 + }, + { + "epoch": 1.1112756696226995, + "grad_norm": 5.009164422381164, + "learning_rate": 2.171739203561318e-06, + "loss": 0.645, + "step": 15382 + }, + { + "epoch": 1.1113479148229088, + "grad_norm": 6.848505745403461, + "learning_rate": 2.1714492358742507e-06, + "loss": 0.5843, + "step": 15383 + }, + { + "epoch": 1.1114201600231184, + "grad_norm": 7.411049362197352, + "learning_rate": 2.17115927268479e-06, + "loss": 0.6969, + "step": 15384 + }, + { + "epoch": 1.111492405223328, + "grad_norm": 7.097799212580571, + "learning_rate": 2.1708693139969012e-06, + "loss": 0.6801, + "step": 15385 + }, + { + "epoch": 1.1115646504235375, + "grad_norm": 6.455180951336837, + "learning_rate": 2.170579359814557e-06, + "loss": 0.6497, + "step": 15386 + }, + { + "epoch": 1.111636895623747, + "grad_norm": 6.836380286998874, + "learning_rate": 2.170289410141725e-06, + "loss": 0.7175, + "step": 15387 + }, + { + "epoch": 1.1117091408239566, + "grad_norm": 7.488391029695435, + "learning_rate": 2.169999464982376e-06, + "loss": 0.5516, + "step": 15388 + }, + { + "epoch": 1.111781386024166, + "grad_norm": 6.807980499401023, + "learning_rate": 2.169709524340477e-06, + "loss": 0.6204, + "step": 15389 + }, + { + "epoch": 1.1118536312243754, + "grad_norm": 5.9774355979905796, + "learning_rate": 2.1694195882199984e-06, + "loss": 0.5831, + "step": 15390 + }, + { + "epoch": 1.111925876424585, + "grad_norm": 7.0045143284729745, + "learning_rate": 2.1691296566249093e-06, + "loss": 0.6515, + "step": 15391 + }, + { + "epoch": 1.1119981216247945, + "grad_norm": 5.950245238307081, + "learning_rate": 2.168839729559178e-06, + "loss": 0.6024, + "step": 15392 + }, + { + "epoch": 1.112070366825004, + "grad_norm": 8.935023678330362, + "learning_rate": 2.1685498070267735e-06, + "loss": 0.6251, + "step": 15393 + }, + { + "epoch": 1.1121426120252136, + "grad_norm": 6.030858160542983, + "learning_rate": 2.168259889031665e-06, + "loss": 0.682, + "step": 15394 + }, + { + "epoch": 1.1122148572254231, + "grad_norm": 6.624997984687931, + "learning_rate": 2.167969975577822e-06, + "loss": 0.6628, + "step": 15395 + }, + { + "epoch": 1.1122871024256327, + "grad_norm": 6.3663202174861, + "learning_rate": 2.1676800666692106e-06, + "loss": 0.6521, + "step": 15396 + }, + { + "epoch": 1.1123593476258422, + "grad_norm": 9.147131848679612, + "learning_rate": 2.1673901623098018e-06, + "loss": 0.6238, + "step": 15397 + }, + { + "epoch": 1.1124315928260515, + "grad_norm": 8.212990719143324, + "learning_rate": 2.1671002625035635e-06, + "loss": 0.7094, + "step": 15398 + }, + { + "epoch": 1.112503838026261, + "grad_norm": 5.845937717164694, + "learning_rate": 2.166810367254465e-06, + "loss": 0.6696, + "step": 15399 + }, + { + "epoch": 1.1125760832264706, + "grad_norm": 6.072841358099562, + "learning_rate": 2.166520476566473e-06, + "loss": 0.7259, + "step": 15400 + }, + { + "epoch": 1.1126483284266802, + "grad_norm": 6.5700460010236155, + "learning_rate": 2.166230590443556e-06, + "loss": 0.6466, + "step": 15401 + }, + { + "epoch": 1.1127205736268897, + "grad_norm": 6.400999420332856, + "learning_rate": 2.165940708889684e-06, + "loss": 0.6786, + "step": 15402 + }, + { + "epoch": 1.1127928188270992, + "grad_norm": 6.467569289512748, + "learning_rate": 2.165650831908824e-06, + "loss": 0.6094, + "step": 15403 + }, + { + "epoch": 1.1128650640273088, + "grad_norm": 6.340942062038642, + "learning_rate": 2.1653609595049443e-06, + "loss": 0.659, + "step": 15404 + }, + { + "epoch": 1.112937309227518, + "grad_norm": 10.002333178131373, + "learning_rate": 2.1650710916820132e-06, + "loss": 0.6865, + "step": 15405 + }, + { + "epoch": 1.1130095544277276, + "grad_norm": 6.492055292539237, + "learning_rate": 2.164781228444e-06, + "loss": 0.6337, + "step": 15406 + }, + { + "epoch": 1.1130817996279372, + "grad_norm": 6.29041619747851, + "learning_rate": 2.16449136979487e-06, + "loss": 0.6966, + "step": 15407 + }, + { + "epoch": 1.1131540448281467, + "grad_norm": 6.243437106031633, + "learning_rate": 2.1642015157385933e-06, + "loss": 0.6817, + "step": 15408 + }, + { + "epoch": 1.1132262900283563, + "grad_norm": 6.0975367591190945, + "learning_rate": 2.163911666279138e-06, + "loss": 0.7048, + "step": 15409 + }, + { + "epoch": 1.1132985352285658, + "grad_norm": 7.0503865562138515, + "learning_rate": 2.163621821420469e-06, + "loss": 0.6583, + "step": 15410 + }, + { + "epoch": 1.1133707804287754, + "grad_norm": 5.963746854980765, + "learning_rate": 2.163331981166557e-06, + "loss": 0.6634, + "step": 15411 + }, + { + "epoch": 1.1134430256289847, + "grad_norm": 6.318223530224366, + "learning_rate": 2.163042145521369e-06, + "loss": 0.5923, + "step": 15412 + }, + { + "epoch": 1.1135152708291942, + "grad_norm": 6.613497266942319, + "learning_rate": 2.162752314488873e-06, + "loss": 0.7693, + "step": 15413 + }, + { + "epoch": 1.1135875160294038, + "grad_norm": 5.95081354557913, + "learning_rate": 2.1624624880730353e-06, + "loss": 0.7194, + "step": 15414 + }, + { + "epoch": 1.1136597612296133, + "grad_norm": 6.57995232144506, + "learning_rate": 2.162172666277824e-06, + "loss": 0.6236, + "step": 15415 + }, + { + "epoch": 1.1137320064298228, + "grad_norm": 7.364378700129158, + "learning_rate": 2.1618828491072068e-06, + "loss": 0.7524, + "step": 15416 + }, + { + "epoch": 1.1138042516300324, + "grad_norm": 5.933028936971684, + "learning_rate": 2.161593036565151e-06, + "loss": 0.6252, + "step": 15417 + }, + { + "epoch": 1.113876496830242, + "grad_norm": 7.1825231155676805, + "learning_rate": 2.161303228655623e-06, + "loss": 0.6606, + "step": 15418 + }, + { + "epoch": 1.1139487420304515, + "grad_norm": 5.51471994555307, + "learning_rate": 2.1610134253825913e-06, + "loss": 0.6336, + "step": 15419 + }, + { + "epoch": 1.1140209872306608, + "grad_norm": 7.324467964510102, + "learning_rate": 2.160723626750023e-06, + "loss": 0.7366, + "step": 15420 + }, + { + "epoch": 1.1140932324308703, + "grad_norm": 8.129365188570423, + "learning_rate": 2.160433832761884e-06, + "loss": 0.6789, + "step": 15421 + }, + { + "epoch": 1.1141654776310799, + "grad_norm": 5.655852077249419, + "learning_rate": 2.1601440434221427e-06, + "loss": 0.6648, + "step": 15422 + }, + { + "epoch": 1.1142377228312894, + "grad_norm": 6.798614630104568, + "learning_rate": 2.1598542587347652e-06, + "loss": 0.6756, + "step": 15423 + }, + { + "epoch": 1.114309968031499, + "grad_norm": 7.351717129417899, + "learning_rate": 2.1595644787037194e-06, + "loss": 0.6914, + "step": 15424 + }, + { + "epoch": 1.1143822132317085, + "grad_norm": 7.739029780256519, + "learning_rate": 2.159274703332971e-06, + "loss": 0.6732, + "step": 15425 + }, + { + "epoch": 1.114454458431918, + "grad_norm": 6.179110301149311, + "learning_rate": 2.158984932626487e-06, + "loss": 0.6536, + "step": 15426 + }, + { + "epoch": 1.1145267036321274, + "grad_norm": 5.759304976386922, + "learning_rate": 2.1586951665882362e-06, + "loss": 0.6551, + "step": 15427 + }, + { + "epoch": 1.114598948832337, + "grad_norm": 6.854726689272092, + "learning_rate": 2.1584054052221826e-06, + "loss": 0.6706, + "step": 15428 + }, + { + "epoch": 1.1146711940325464, + "grad_norm": 6.342534592428515, + "learning_rate": 2.158115648532293e-06, + "loss": 0.6473, + "step": 15429 + }, + { + "epoch": 1.114743439232756, + "grad_norm": 6.983114585394124, + "learning_rate": 2.157825896522535e-06, + "loss": 0.7092, + "step": 15430 + }, + { + "epoch": 1.1148156844329655, + "grad_norm": 6.690447291544677, + "learning_rate": 2.1575361491968757e-06, + "loss": 0.6978, + "step": 15431 + }, + { + "epoch": 1.114887929633175, + "grad_norm": 6.840839481258833, + "learning_rate": 2.15724640655928e-06, + "loss": 0.6632, + "step": 15432 + }, + { + "epoch": 1.1149601748333846, + "grad_norm": 6.533654167446051, + "learning_rate": 2.1569566686137145e-06, + "loss": 0.6626, + "step": 15433 + }, + { + "epoch": 1.115032420033594, + "grad_norm": 7.718664500886542, + "learning_rate": 2.1566669353641467e-06, + "loss": 0.6498, + "step": 15434 + }, + { + "epoch": 1.1151046652338035, + "grad_norm": 7.25611356595466, + "learning_rate": 2.1563772068145415e-06, + "loss": 0.6903, + "step": 15435 + }, + { + "epoch": 1.115176910434013, + "grad_norm": 5.781994580958073, + "learning_rate": 2.156087482968865e-06, + "loss": 0.5421, + "step": 15436 + }, + { + "epoch": 1.1152491556342226, + "grad_norm": 6.625520217911872, + "learning_rate": 2.1557977638310844e-06, + "loss": 0.6832, + "step": 15437 + }, + { + "epoch": 1.115321400834432, + "grad_norm": 6.237497260144689, + "learning_rate": 2.1555080494051657e-06, + "loss": 0.625, + "step": 15438 + }, + { + "epoch": 1.1153936460346416, + "grad_norm": 6.703332146420493, + "learning_rate": 2.1552183396950734e-06, + "loss": 0.6584, + "step": 15439 + }, + { + "epoch": 1.1154658912348512, + "grad_norm": 8.17942353472123, + "learning_rate": 2.1549286347047744e-06, + "loss": 0.6972, + "step": 15440 + }, + { + "epoch": 1.1155381364350605, + "grad_norm": 6.88015165649757, + "learning_rate": 2.154638934438236e-06, + "loss": 0.6653, + "step": 15441 + }, + { + "epoch": 1.11561038163527, + "grad_norm": 8.03997922069085, + "learning_rate": 2.1543492388994206e-06, + "loss": 0.7356, + "step": 15442 + }, + { + "epoch": 1.1156826268354796, + "grad_norm": 6.142981157286688, + "learning_rate": 2.154059548092296e-06, + "loss": 0.6378, + "step": 15443 + }, + { + "epoch": 1.1157548720356891, + "grad_norm": 6.277853279822532, + "learning_rate": 2.1537698620208275e-06, + "loss": 0.7737, + "step": 15444 + }, + { + "epoch": 1.1158271172358987, + "grad_norm": 5.526272705694155, + "learning_rate": 2.153480180688982e-06, + "loss": 0.6943, + "step": 15445 + }, + { + "epoch": 1.1158993624361082, + "grad_norm": 5.979358771912101, + "learning_rate": 2.1531905041007226e-06, + "loss": 0.7305, + "step": 15446 + }, + { + "epoch": 1.1159716076363178, + "grad_norm": 6.773170545128657, + "learning_rate": 2.152900832260016e-06, + "loss": 0.6579, + "step": 15447 + }, + { + "epoch": 1.116043852836527, + "grad_norm": 7.890111321073712, + "learning_rate": 2.152611165170828e-06, + "loss": 0.7451, + "step": 15448 + }, + { + "epoch": 1.1161160980367366, + "grad_norm": 6.074079015173276, + "learning_rate": 2.1523215028371235e-06, + "loss": 0.625, + "step": 15449 + }, + { + "epoch": 1.1161883432369462, + "grad_norm": 7.433153437104016, + "learning_rate": 2.152031845262867e-06, + "loss": 0.6272, + "step": 15450 + }, + { + "epoch": 1.1162605884371557, + "grad_norm": 6.49012299360867, + "learning_rate": 2.1517421924520245e-06, + "loss": 0.6896, + "step": 15451 + }, + { + "epoch": 1.1163328336373652, + "grad_norm": 7.210567444205181, + "learning_rate": 2.1514525444085616e-06, + "loss": 0.6907, + "step": 15452 + }, + { + "epoch": 1.1164050788375748, + "grad_norm": 7.621751687466628, + "learning_rate": 2.151162901136442e-06, + "loss": 0.6093, + "step": 15453 + }, + { + "epoch": 1.1164773240377843, + "grad_norm": 5.3218807200268365, + "learning_rate": 2.1508732626396312e-06, + "loss": 0.6692, + "step": 15454 + }, + { + "epoch": 1.1165495692379936, + "grad_norm": 6.074368215604813, + "learning_rate": 2.150583628922095e-06, + "loss": 0.6901, + "step": 15455 + }, + { + "epoch": 1.1166218144382032, + "grad_norm": 7.358884369841747, + "learning_rate": 2.1502939999877983e-06, + "loss": 0.6122, + "step": 15456 + }, + { + "epoch": 1.1166940596384127, + "grad_norm": 6.6722817933473095, + "learning_rate": 2.1500043758407044e-06, + "loss": 0.6599, + "step": 15457 + }, + { + "epoch": 1.1167663048386223, + "grad_norm": 7.467257681798382, + "learning_rate": 2.1497147564847787e-06, + "loss": 0.7119, + "step": 15458 + }, + { + "epoch": 1.1168385500388318, + "grad_norm": 6.963364234856967, + "learning_rate": 2.1494251419239865e-06, + "loss": 0.7366, + "step": 15459 + }, + { + "epoch": 1.1169107952390414, + "grad_norm": 5.773756955281553, + "learning_rate": 2.1491355321622915e-06, + "loss": 0.695, + "step": 15460 + }, + { + "epoch": 1.116983040439251, + "grad_norm": 5.269164211874519, + "learning_rate": 2.1488459272036584e-06, + "loss": 0.5834, + "step": 15461 + }, + { + "epoch": 1.1170552856394602, + "grad_norm": 5.889246124052549, + "learning_rate": 2.148556327052052e-06, + "loss": 0.6796, + "step": 15462 + }, + { + "epoch": 1.1171275308396698, + "grad_norm": 6.5743339288224485, + "learning_rate": 2.1482667317114373e-06, + "loss": 0.6464, + "step": 15463 + }, + { + "epoch": 1.1171997760398793, + "grad_norm": 5.781134526929557, + "learning_rate": 2.147977141185777e-06, + "loss": 0.6784, + "step": 15464 + }, + { + "epoch": 1.1172720212400888, + "grad_norm": 6.540036656454873, + "learning_rate": 2.1476875554790366e-06, + "loss": 0.6401, + "step": 15465 + }, + { + "epoch": 1.1173442664402984, + "grad_norm": 6.538622915425461, + "learning_rate": 2.1473979745951804e-06, + "loss": 0.6209, + "step": 15466 + }, + { + "epoch": 1.117416511640508, + "grad_norm": 6.163436180949765, + "learning_rate": 2.147108398538172e-06, + "loss": 0.6238, + "step": 15467 + }, + { + "epoch": 1.1174887568407175, + "grad_norm": 6.861505477354914, + "learning_rate": 2.146818827311975e-06, + "loss": 0.6739, + "step": 15468 + }, + { + "epoch": 1.117561002040927, + "grad_norm": 6.84052831338349, + "learning_rate": 2.1465292609205545e-06, + "loss": 0.5943, + "step": 15469 + }, + { + "epoch": 1.1176332472411363, + "grad_norm": 6.534554991393589, + "learning_rate": 2.1462396993678753e-06, + "loss": 0.672, + "step": 15470 + }, + { + "epoch": 1.1177054924413459, + "grad_norm": 6.22384511506813, + "learning_rate": 2.145950142657898e-06, + "loss": 0.6896, + "step": 15471 + }, + { + "epoch": 1.1177777376415554, + "grad_norm": 7.480516175600315, + "learning_rate": 2.1456605907945894e-06, + "loss": 0.7314, + "step": 15472 + }, + { + "epoch": 1.117849982841765, + "grad_norm": 6.049536934778078, + "learning_rate": 2.145371043781912e-06, + "loss": 0.6917, + "step": 15473 + }, + { + "epoch": 1.1179222280419745, + "grad_norm": 5.945872617545891, + "learning_rate": 2.1450815016238293e-06, + "loss": 0.6725, + "step": 15474 + }, + { + "epoch": 1.117994473242184, + "grad_norm": 6.524421590256284, + "learning_rate": 2.144791964324305e-06, + "loss": 0.7558, + "step": 15475 + }, + { + "epoch": 1.1180667184423936, + "grad_norm": 6.816829880281539, + "learning_rate": 2.1445024318873034e-06, + "loss": 0.6151, + "step": 15476 + }, + { + "epoch": 1.118138963642603, + "grad_norm": 5.656643679918121, + "learning_rate": 2.1442129043167877e-06, + "loss": 0.677, + "step": 15477 + }, + { + "epoch": 1.1182112088428124, + "grad_norm": 6.297158511762716, + "learning_rate": 2.143923381616721e-06, + "loss": 0.6276, + "step": 15478 + }, + { + "epoch": 1.118283454043022, + "grad_norm": 7.347213123350352, + "learning_rate": 2.143633863791066e-06, + "loss": 0.6288, + "step": 15479 + }, + { + "epoch": 1.1183556992432315, + "grad_norm": 6.343362730636005, + "learning_rate": 2.143344350843787e-06, + "loss": 0.6499, + "step": 15480 + }, + { + "epoch": 1.118427944443441, + "grad_norm": 6.202311657064438, + "learning_rate": 2.1430548427788474e-06, + "loss": 0.6484, + "step": 15481 + }, + { + "epoch": 1.1185001896436506, + "grad_norm": 6.483401011658812, + "learning_rate": 2.1427653396002094e-06, + "loss": 0.7296, + "step": 15482 + }, + { + "epoch": 1.1185724348438602, + "grad_norm": 6.309693874181906, + "learning_rate": 2.142475841311837e-06, + "loss": 0.6827, + "step": 15483 + }, + { + "epoch": 1.1186446800440695, + "grad_norm": 6.599992590235394, + "learning_rate": 2.142186347917693e-06, + "loss": 0.7558, + "step": 15484 + }, + { + "epoch": 1.118716925244279, + "grad_norm": 6.976028268144361, + "learning_rate": 2.1418968594217393e-06, + "loss": 0.6915, + "step": 15485 + }, + { + "epoch": 1.1187891704444886, + "grad_norm": 5.873072531225138, + "learning_rate": 2.1416073758279397e-06, + "loss": 0.6176, + "step": 15486 + }, + { + "epoch": 1.118861415644698, + "grad_norm": 7.7202744928291835, + "learning_rate": 2.141317897140256e-06, + "loss": 0.6827, + "step": 15487 + }, + { + "epoch": 1.1189336608449076, + "grad_norm": 6.949539871901761, + "learning_rate": 2.141028423362654e-06, + "loss": 0.6789, + "step": 15488 + }, + { + "epoch": 1.1190059060451172, + "grad_norm": 7.024615196467865, + "learning_rate": 2.1407389544990927e-06, + "loss": 0.6831, + "step": 15489 + }, + { + "epoch": 1.1190781512453267, + "grad_norm": 6.861771497292422, + "learning_rate": 2.140449490553536e-06, + "loss": 0.6849, + "step": 15490 + }, + { + "epoch": 1.1191503964455363, + "grad_norm": 6.61551058620286, + "learning_rate": 2.1401600315299472e-06, + "loss": 0.6616, + "step": 15491 + }, + { + "epoch": 1.1192226416457456, + "grad_norm": 6.713127302981462, + "learning_rate": 2.139870577432288e-06, + "loss": 0.6326, + "step": 15492 + }, + { + "epoch": 1.1192948868459551, + "grad_norm": 6.187196049786696, + "learning_rate": 2.1395811282645207e-06, + "loss": 0.7243, + "step": 15493 + }, + { + "epoch": 1.1193671320461647, + "grad_norm": 5.844827318079965, + "learning_rate": 2.139291684030608e-06, + "loss": 0.6149, + "step": 15494 + }, + { + "epoch": 1.1194393772463742, + "grad_norm": 7.8659114162598645, + "learning_rate": 2.1390022447345125e-06, + "loss": 0.6668, + "step": 15495 + }, + { + "epoch": 1.1195116224465838, + "grad_norm": 6.740536873373559, + "learning_rate": 2.138712810380195e-06, + "loss": 0.6804, + "step": 15496 + }, + { + "epoch": 1.1195838676467933, + "grad_norm": 5.252472613125721, + "learning_rate": 2.1384233809716195e-06, + "loss": 0.6077, + "step": 15497 + }, + { + "epoch": 1.1196561128470028, + "grad_norm": 7.023870639365799, + "learning_rate": 2.1381339565127474e-06, + "loss": 0.6763, + "step": 15498 + }, + { + "epoch": 1.1197283580472122, + "grad_norm": 7.974281934022727, + "learning_rate": 2.13784453700754e-06, + "loss": 0.7059, + "step": 15499 + }, + { + "epoch": 1.1198006032474217, + "grad_norm": 7.794222516233955, + "learning_rate": 2.1375551224599593e-06, + "loss": 0.7348, + "step": 15500 + }, + { + "epoch": 1.1198728484476312, + "grad_norm": 6.788242935913713, + "learning_rate": 2.1372657128739673e-06, + "loss": 0.6184, + "step": 15501 + }, + { + "epoch": 1.1199450936478408, + "grad_norm": 6.815895564232869, + "learning_rate": 2.1369763082535276e-06, + "loss": 0.6806, + "step": 15502 + }, + { + "epoch": 1.1200173388480503, + "grad_norm": 6.424550782170342, + "learning_rate": 2.1366869086025993e-06, + "loss": 0.6378, + "step": 15503 + }, + { + "epoch": 1.1200895840482599, + "grad_norm": 6.325136736571639, + "learning_rate": 2.1363975139251452e-06, + "loss": 0.6223, + "step": 15504 + }, + { + "epoch": 1.1201618292484694, + "grad_norm": 5.430875755207981, + "learning_rate": 2.1361081242251265e-06, + "loss": 0.6621, + "step": 15505 + }, + { + "epoch": 1.1202340744486787, + "grad_norm": 6.1251195779143774, + "learning_rate": 2.135818739506506e-06, + "loss": 0.6015, + "step": 15506 + }, + { + "epoch": 1.1203063196488883, + "grad_norm": 7.0617313473100936, + "learning_rate": 2.1355293597732437e-06, + "loss": 0.6569, + "step": 15507 + }, + { + "epoch": 1.1203785648490978, + "grad_norm": 6.402170361631708, + "learning_rate": 2.135239985029301e-06, + "loss": 0.6413, + "step": 15508 + }, + { + "epoch": 1.1204508100493074, + "grad_norm": 5.948106467653139, + "learning_rate": 2.1349506152786404e-06, + "loss": 0.6399, + "step": 15509 + }, + { + "epoch": 1.120523055249517, + "grad_norm": 6.022429821069485, + "learning_rate": 2.1346612505252217e-06, + "loss": 0.609, + "step": 15510 + }, + { + "epoch": 1.1205953004497264, + "grad_norm": 6.525276043778513, + "learning_rate": 2.134371890773007e-06, + "loss": 0.6484, + "step": 15511 + }, + { + "epoch": 1.120667545649936, + "grad_norm": 6.452699661092229, + "learning_rate": 2.1340825360259575e-06, + "loss": 0.6636, + "step": 15512 + }, + { + "epoch": 1.1207397908501453, + "grad_norm": 6.414504376336723, + "learning_rate": 2.1337931862880347e-06, + "loss": 0.5878, + "step": 15513 + }, + { + "epoch": 1.1208120360503548, + "grad_norm": 6.051550031354151, + "learning_rate": 2.133503841563198e-06, + "loss": 0.6189, + "step": 15514 + }, + { + "epoch": 1.1208842812505644, + "grad_norm": 6.263807380335798, + "learning_rate": 2.1332145018554086e-06, + "loss": 0.664, + "step": 15515 + }, + { + "epoch": 1.120956526450774, + "grad_norm": 7.167257979491186, + "learning_rate": 2.132925167168629e-06, + "loss": 0.7028, + "step": 15516 + }, + { + "epoch": 1.1210287716509835, + "grad_norm": 8.21145549849773, + "learning_rate": 2.1326358375068184e-06, + "loss": 0.6938, + "step": 15517 + }, + { + "epoch": 1.121101016851193, + "grad_norm": 5.436079694791101, + "learning_rate": 2.1323465128739377e-06, + "loss": 0.7038, + "step": 15518 + }, + { + "epoch": 1.1211732620514026, + "grad_norm": 7.453931848769716, + "learning_rate": 2.1320571932739477e-06, + "loss": 0.7277, + "step": 15519 + }, + { + "epoch": 1.1212455072516119, + "grad_norm": 5.901610474698272, + "learning_rate": 2.1317678787108095e-06, + "loss": 0.5775, + "step": 15520 + }, + { + "epoch": 1.1213177524518214, + "grad_norm": 6.468867056704323, + "learning_rate": 2.1314785691884825e-06, + "loss": 0.7107, + "step": 15521 + }, + { + "epoch": 1.121389997652031, + "grad_norm": 5.578067554851265, + "learning_rate": 2.1311892647109283e-06, + "loss": 0.6558, + "step": 15522 + }, + { + "epoch": 1.1214622428522405, + "grad_norm": 6.39320765308456, + "learning_rate": 2.1308999652821064e-06, + "loss": 0.6048, + "step": 15523 + }, + { + "epoch": 1.12153448805245, + "grad_norm": 6.457863608065061, + "learning_rate": 2.1306106709059775e-06, + "loss": 0.6322, + "step": 15524 + }, + { + "epoch": 1.1216067332526596, + "grad_norm": 6.113950143118528, + "learning_rate": 2.130321381586502e-06, + "loss": 0.6873, + "step": 15525 + }, + { + "epoch": 1.1216789784528691, + "grad_norm": 5.90830804400643, + "learning_rate": 2.130032097327639e-06, + "loss": 0.6539, + "step": 15526 + }, + { + "epoch": 1.1217512236530784, + "grad_norm": 6.344873488587589, + "learning_rate": 2.1297428181333507e-06, + "loss": 0.6556, + "step": 15527 + }, + { + "epoch": 1.121823468853288, + "grad_norm": 6.861215540013551, + "learning_rate": 2.1294535440075946e-06, + "loss": 0.7036, + "step": 15528 + }, + { + "epoch": 1.1218957140534975, + "grad_norm": 7.37535818490959, + "learning_rate": 2.1291642749543314e-06, + "loss": 0.6858, + "step": 15529 + }, + { + "epoch": 1.121967959253707, + "grad_norm": 6.641951843546608, + "learning_rate": 2.128875010977522e-06, + "loss": 0.6412, + "step": 15530 + }, + { + "epoch": 1.1220402044539166, + "grad_norm": 5.960399278412938, + "learning_rate": 2.128585752081126e-06, + "loss": 0.6304, + "step": 15531 + }, + { + "epoch": 1.1221124496541262, + "grad_norm": 5.670109581805076, + "learning_rate": 2.128296498269102e-06, + "loss": 0.642, + "step": 15532 + }, + { + "epoch": 1.1221846948543357, + "grad_norm": 6.17591528653566, + "learning_rate": 2.1280072495454106e-06, + "loss": 0.6041, + "step": 15533 + }, + { + "epoch": 1.122256940054545, + "grad_norm": 6.492594094518144, + "learning_rate": 2.127718005914011e-06, + "loss": 0.6193, + "step": 15534 + }, + { + "epoch": 1.1223291852547546, + "grad_norm": 7.108058933389418, + "learning_rate": 2.127428767378863e-06, + "loss": 0.6924, + "step": 15535 + }, + { + "epoch": 1.122401430454964, + "grad_norm": 7.464685842619494, + "learning_rate": 2.1271395339439256e-06, + "loss": 0.6107, + "step": 15536 + }, + { + "epoch": 1.1224736756551736, + "grad_norm": 5.938646065379191, + "learning_rate": 2.1268503056131582e-06, + "loss": 0.6818, + "step": 15537 + }, + { + "epoch": 1.1225459208553832, + "grad_norm": 7.17264089047751, + "learning_rate": 2.1265610823905213e-06, + "loss": 0.657, + "step": 15538 + }, + { + "epoch": 1.1226181660555927, + "grad_norm": 6.714947711346179, + "learning_rate": 2.1262718642799726e-06, + "loss": 0.6778, + "step": 15539 + }, + { + "epoch": 1.1226904112558023, + "grad_norm": 6.465077707719042, + "learning_rate": 2.125982651285472e-06, + "loss": 0.6505, + "step": 15540 + }, + { + "epoch": 1.1227626564560116, + "grad_norm": 6.220576185910498, + "learning_rate": 2.1256934434109794e-06, + "loss": 0.7589, + "step": 15541 + }, + { + "epoch": 1.1228349016562211, + "grad_norm": 6.080370176992336, + "learning_rate": 2.125404240660452e-06, + "loss": 0.6898, + "step": 15542 + }, + { + "epoch": 1.1229071468564307, + "grad_norm": 6.91470192576426, + "learning_rate": 2.1251150430378486e-06, + "loss": 0.6779, + "step": 15543 + }, + { + "epoch": 1.1229793920566402, + "grad_norm": 5.947228101790494, + "learning_rate": 2.1248258505471303e-06, + "loss": 0.624, + "step": 15544 + }, + { + "epoch": 1.1230516372568498, + "grad_norm": 7.437672845450856, + "learning_rate": 2.1245366631922556e-06, + "loss": 0.6586, + "step": 15545 + }, + { + "epoch": 1.1231238824570593, + "grad_norm": 6.0084858332242606, + "learning_rate": 2.124247480977182e-06, + "loss": 0.5941, + "step": 15546 + }, + { + "epoch": 1.1231961276572688, + "grad_norm": 6.193298089317559, + "learning_rate": 2.123958303905868e-06, + "loss": 0.6544, + "step": 15547 + }, + { + "epoch": 1.1232683728574784, + "grad_norm": 7.113598451449645, + "learning_rate": 2.1236691319822732e-06, + "loss": 0.7925, + "step": 15548 + }, + { + "epoch": 1.1233406180576877, + "grad_norm": 5.291821975631582, + "learning_rate": 2.1233799652103555e-06, + "loss": 0.6512, + "step": 15549 + }, + { + "epoch": 1.1234128632578972, + "grad_norm": 5.534455658181123, + "learning_rate": 2.123090803594074e-06, + "loss": 0.5971, + "step": 15550 + }, + { + "epoch": 1.1234851084581068, + "grad_norm": 6.399357954245293, + "learning_rate": 2.122801647137386e-06, + "loss": 0.707, + "step": 15551 + }, + { + "epoch": 1.1235573536583163, + "grad_norm": 5.048324518524531, + "learning_rate": 2.122512495844252e-06, + "loss": 0.5638, + "step": 15552 + }, + { + "epoch": 1.1236295988585259, + "grad_norm": 6.4382098649118635, + "learning_rate": 2.122223349718628e-06, + "loss": 0.6583, + "step": 15553 + }, + { + "epoch": 1.1237018440587354, + "grad_norm": 7.663061731853736, + "learning_rate": 2.1219342087644726e-06, + "loss": 0.6262, + "step": 15554 + }, + { + "epoch": 1.123774089258945, + "grad_norm": 5.773826658273104, + "learning_rate": 2.1216450729857458e-06, + "loss": 0.6592, + "step": 15555 + }, + { + "epoch": 1.1238463344591543, + "grad_norm": 5.8744769573728925, + "learning_rate": 2.1213559423864026e-06, + "loss": 0.6899, + "step": 15556 + }, + { + "epoch": 1.1239185796593638, + "grad_norm": 6.113657823365744, + "learning_rate": 2.121066816970403e-06, + "loss": 0.6921, + "step": 15557 + }, + { + "epoch": 1.1239908248595734, + "grad_norm": 6.205162962710877, + "learning_rate": 2.120777696741704e-06, + "loss": 0.7072, + "step": 15558 + }, + { + "epoch": 1.124063070059783, + "grad_norm": 8.008029722663528, + "learning_rate": 2.1204885817042654e-06, + "loss": 0.6868, + "step": 15559 + }, + { + "epoch": 1.1241353152599924, + "grad_norm": 8.057344429486125, + "learning_rate": 2.1201994718620423e-06, + "loss": 0.6752, + "step": 15560 + }, + { + "epoch": 1.124207560460202, + "grad_norm": 6.454574015868, + "learning_rate": 2.1199103672189934e-06, + "loss": 0.6568, + "step": 15561 + }, + { + "epoch": 1.1242798056604115, + "grad_norm": 6.248362517425659, + "learning_rate": 2.119621267779077e-06, + "loss": 0.6361, + "step": 15562 + }, + { + "epoch": 1.1243520508606208, + "grad_norm": 8.220305669071482, + "learning_rate": 2.11933217354625e-06, + "loss": 0.6567, + "step": 15563 + }, + { + "epoch": 1.1244242960608304, + "grad_norm": 7.859914469168636, + "learning_rate": 2.1190430845244696e-06, + "loss": 0.6867, + "step": 15564 + }, + { + "epoch": 1.12449654126104, + "grad_norm": 6.398708463527675, + "learning_rate": 2.118754000717694e-06, + "loss": 0.7001, + "step": 15565 + }, + { + "epoch": 1.1245687864612495, + "grad_norm": 6.366799259635881, + "learning_rate": 2.11846492212988e-06, + "loss": 0.7214, + "step": 15566 + }, + { + "epoch": 1.124641031661459, + "grad_norm": 6.435944008042504, + "learning_rate": 2.1181758487649846e-06, + "loss": 0.6668, + "step": 15567 + }, + { + "epoch": 1.1247132768616686, + "grad_norm": 7.792386706804207, + "learning_rate": 2.1178867806269657e-06, + "loss": 0.615, + "step": 15568 + }, + { + "epoch": 1.124785522061878, + "grad_norm": 7.607239096408228, + "learning_rate": 2.1175977177197797e-06, + "loss": 0.6922, + "step": 15569 + }, + { + "epoch": 1.1248577672620876, + "grad_norm": 5.752884182922435, + "learning_rate": 2.1173086600473857e-06, + "loss": 0.6995, + "step": 15570 + }, + { + "epoch": 1.124930012462297, + "grad_norm": 7.133725094361559, + "learning_rate": 2.117019607613737e-06, + "loss": 0.6528, + "step": 15571 + }, + { + "epoch": 1.1250022576625065, + "grad_norm": 6.558090617799477, + "learning_rate": 2.116730560422793e-06, + "loss": 0.6301, + "step": 15572 + }, + { + "epoch": 1.125074502862716, + "grad_norm": 6.26319773550408, + "learning_rate": 2.1164415184785107e-06, + "loss": 0.615, + "step": 15573 + }, + { + "epoch": 1.1251467480629256, + "grad_norm": 6.559288183530998, + "learning_rate": 2.116152481784846e-06, + "loss": 0.5809, + "step": 15574 + }, + { + "epoch": 1.1252189932631351, + "grad_norm": 7.760714599898273, + "learning_rate": 2.115863450345755e-06, + "loss": 0.7582, + "step": 15575 + }, + { + "epoch": 1.1252912384633447, + "grad_norm": 5.670567049066521, + "learning_rate": 2.1155744241651957e-06, + "loss": 0.6588, + "step": 15576 + }, + { + "epoch": 1.1253634836635542, + "grad_norm": 7.037139189501094, + "learning_rate": 2.1152854032471244e-06, + "loss": 0.6432, + "step": 15577 + }, + { + "epoch": 1.1254357288637635, + "grad_norm": 5.826979455532284, + "learning_rate": 2.1149963875954966e-06, + "loss": 0.6292, + "step": 15578 + }, + { + "epoch": 1.125507974063973, + "grad_norm": 8.129176078985159, + "learning_rate": 2.114707377214269e-06, + "loss": 0.6534, + "step": 15579 + }, + { + "epoch": 1.1255802192641826, + "grad_norm": 7.169951669272065, + "learning_rate": 2.1144183721073993e-06, + "loss": 0.6301, + "step": 15580 + }, + { + "epoch": 1.1256524644643922, + "grad_norm": 7.265213361995125, + "learning_rate": 2.114129372278842e-06, + "loss": 0.6808, + "step": 15581 + }, + { + "epoch": 1.1257247096646017, + "grad_norm": 7.834107813864819, + "learning_rate": 2.1138403777325537e-06, + "loss": 0.6675, + "step": 15582 + }, + { + "epoch": 1.1257969548648112, + "grad_norm": 6.016656326468222, + "learning_rate": 2.1135513884724913e-06, + "loss": 0.6418, + "step": 15583 + }, + { + "epoch": 1.1258692000650208, + "grad_norm": 6.702030830624302, + "learning_rate": 2.1132624045026113e-06, + "loss": 0.6132, + "step": 15584 + }, + { + "epoch": 1.12594144526523, + "grad_norm": 5.945027288144867, + "learning_rate": 2.1129734258268676e-06, + "loss": 0.7105, + "step": 15585 + }, + { + "epoch": 1.1260136904654396, + "grad_norm": 7.131911021295332, + "learning_rate": 2.1126844524492163e-06, + "loss": 0.7602, + "step": 15586 + }, + { + "epoch": 1.1260859356656492, + "grad_norm": 5.534376564650311, + "learning_rate": 2.112395484373615e-06, + "loss": 0.6602, + "step": 15587 + }, + { + "epoch": 1.1261581808658587, + "grad_norm": 6.512161908004472, + "learning_rate": 2.1121065216040195e-06, + "loss": 0.6602, + "step": 15588 + }, + { + "epoch": 1.1262304260660683, + "grad_norm": 6.940480459770143, + "learning_rate": 2.1118175641443836e-06, + "loss": 0.6345, + "step": 15589 + }, + { + "epoch": 1.1263026712662778, + "grad_norm": 5.768640902759525, + "learning_rate": 2.1115286119986634e-06, + "loss": 0.6809, + "step": 15590 + }, + { + "epoch": 1.1263749164664874, + "grad_norm": 6.279947477959329, + "learning_rate": 2.111239665170816e-06, + "loss": 0.7372, + "step": 15591 + }, + { + "epoch": 1.1264471616666967, + "grad_norm": 8.70477903801044, + "learning_rate": 2.110950723664795e-06, + "loss": 0.7341, + "step": 15592 + }, + { + "epoch": 1.1265194068669062, + "grad_norm": 7.164402744612541, + "learning_rate": 2.1106617874845566e-06, + "loss": 0.5243, + "step": 15593 + }, + { + "epoch": 1.1265916520671158, + "grad_norm": 8.372907234737731, + "learning_rate": 2.1103728566340558e-06, + "loss": 0.6989, + "step": 15594 + }, + { + "epoch": 1.1266638972673253, + "grad_norm": 7.252749743732707, + "learning_rate": 2.110083931117249e-06, + "loss": 0.657, + "step": 15595 + }, + { + "epoch": 1.1267361424675348, + "grad_norm": 5.9765625, + "learning_rate": 2.1097950109380894e-06, + "loss": 0.6629, + "step": 15596 + }, + { + "epoch": 1.1268083876677444, + "grad_norm": 7.781389472661302, + "learning_rate": 2.109506096100533e-06, + "loss": 0.6421, + "step": 15597 + }, + { + "epoch": 1.126880632867954, + "grad_norm": 6.277608090848389, + "learning_rate": 2.1092171866085364e-06, + "loss": 0.5725, + "step": 15598 + }, + { + "epoch": 1.1269528780681632, + "grad_norm": 6.7456577604119135, + "learning_rate": 2.108928282466052e-06, + "loss": 0.6255, + "step": 15599 + }, + { + "epoch": 1.1270251232683728, + "grad_norm": 8.751615974523068, + "learning_rate": 2.1086393836770354e-06, + "loss": 0.6161, + "step": 15600 + }, + { + "epoch": 1.1270973684685823, + "grad_norm": 7.497673691139712, + "learning_rate": 2.108350490245442e-06, + "loss": 0.766, + "step": 15601 + }, + { + "epoch": 1.1271696136687919, + "grad_norm": 7.047900478409471, + "learning_rate": 2.1080616021752274e-06, + "loss": 0.6066, + "step": 15602 + }, + { + "epoch": 1.1272418588690014, + "grad_norm": 7.0527668278469635, + "learning_rate": 2.107772719470344e-06, + "loss": 0.6548, + "step": 15603 + }, + { + "epoch": 1.127314104069211, + "grad_norm": 6.137296206320915, + "learning_rate": 2.1074838421347475e-06, + "loss": 0.6657, + "step": 15604 + }, + { + "epoch": 1.1273863492694205, + "grad_norm": 7.197264564959216, + "learning_rate": 2.1071949701723933e-06, + "loss": 0.6499, + "step": 15605 + }, + { + "epoch": 1.1274585944696298, + "grad_norm": 6.336163574399949, + "learning_rate": 2.1069061035872343e-06, + "loss": 0.6416, + "step": 15606 + }, + { + "epoch": 1.1275308396698394, + "grad_norm": 7.78130147526272, + "learning_rate": 2.1066172423832256e-06, + "loss": 0.6695, + "step": 15607 + }, + { + "epoch": 1.127603084870049, + "grad_norm": 6.313735085701797, + "learning_rate": 2.106328386564321e-06, + "loss": 0.6883, + "step": 15608 + }, + { + "epoch": 1.1276753300702584, + "grad_norm": 6.525226936924811, + "learning_rate": 2.1060395361344763e-06, + "loss": 0.6036, + "step": 15609 + }, + { + "epoch": 1.127747575270468, + "grad_norm": 8.145213175888282, + "learning_rate": 2.105750691097643e-06, + "loss": 0.6809, + "step": 15610 + }, + { + "epoch": 1.1278198204706775, + "grad_norm": 8.034764571001803, + "learning_rate": 2.1054618514577775e-06, + "loss": 0.6559, + "step": 15611 + }, + { + "epoch": 1.127892065670887, + "grad_norm": 6.4792418971298975, + "learning_rate": 2.1051730172188327e-06, + "loss": 0.634, + "step": 15612 + }, + { + "epoch": 1.1279643108710964, + "grad_norm": 6.35814458017757, + "learning_rate": 2.1048841883847636e-06, + "loss": 0.6869, + "step": 15613 + }, + { + "epoch": 1.128036556071306, + "grad_norm": 6.478247116270179, + "learning_rate": 2.1045953649595217e-06, + "loss": 0.6907, + "step": 15614 + }, + { + "epoch": 1.1281088012715155, + "grad_norm": 6.220744517621314, + "learning_rate": 2.104306546947063e-06, + "loss": 0.7105, + "step": 15615 + }, + { + "epoch": 1.128181046471725, + "grad_norm": 7.1400757960295405, + "learning_rate": 2.104017734351341e-06, + "loss": 0.7804, + "step": 15616 + }, + { + "epoch": 1.1282532916719346, + "grad_norm": 6.537917534822435, + "learning_rate": 2.103728927176308e-06, + "loss": 0.7033, + "step": 15617 + }, + { + "epoch": 1.128325536872144, + "grad_norm": 6.758322212533076, + "learning_rate": 2.1034401254259185e-06, + "loss": 0.6703, + "step": 15618 + }, + { + "epoch": 1.1283977820723536, + "grad_norm": 6.975802423951514, + "learning_rate": 2.1031513291041253e-06, + "loss": 0.6889, + "step": 15619 + }, + { + "epoch": 1.128470027272563, + "grad_norm": 6.0090343487894655, + "learning_rate": 2.102862538214883e-06, + "loss": 0.6787, + "step": 15620 + }, + { + "epoch": 1.1285422724727725, + "grad_norm": 6.000204082832894, + "learning_rate": 2.102573752762144e-06, + "loss": 0.6532, + "step": 15621 + }, + { + "epoch": 1.128614517672982, + "grad_norm": 6.938442131885896, + "learning_rate": 2.102284972749861e-06, + "loss": 0.6979, + "step": 15622 + }, + { + "epoch": 1.1286867628731916, + "grad_norm": 7.121818769232134, + "learning_rate": 2.101996198181989e-06, + "loss": 0.6219, + "step": 15623 + }, + { + "epoch": 1.1287590080734011, + "grad_norm": 6.018438617578042, + "learning_rate": 2.1017074290624796e-06, + "loss": 0.616, + "step": 15624 + }, + { + "epoch": 1.1288312532736107, + "grad_norm": 6.676927426839478, + "learning_rate": 2.101418665395286e-06, + "loss": 0.7029, + "step": 15625 + }, + { + "epoch": 1.1289034984738202, + "grad_norm": 7.421872430098239, + "learning_rate": 2.1011299071843612e-06, + "loss": 0.7085, + "step": 15626 + }, + { + "epoch": 1.1289757436740298, + "grad_norm": 6.329575209985832, + "learning_rate": 2.1008411544336595e-06, + "loss": 0.6594, + "step": 15627 + }, + { + "epoch": 1.129047988874239, + "grad_norm": 7.890945760420387, + "learning_rate": 2.100552407147131e-06, + "loss": 0.7025, + "step": 15628 + }, + { + "epoch": 1.1291202340744486, + "grad_norm": 6.200505316046137, + "learning_rate": 2.1002636653287304e-06, + "loss": 0.6432, + "step": 15629 + }, + { + "epoch": 1.1291924792746582, + "grad_norm": 6.19237503510886, + "learning_rate": 2.0999749289824107e-06, + "loss": 0.6472, + "step": 15630 + }, + { + "epoch": 1.1292647244748677, + "grad_norm": 6.153524192310579, + "learning_rate": 2.0996861981121227e-06, + "loss": 0.6473, + "step": 15631 + }, + { + "epoch": 1.1293369696750772, + "grad_norm": 6.221035178049023, + "learning_rate": 2.0993974727218198e-06, + "loss": 0.6961, + "step": 15632 + }, + { + "epoch": 1.1294092148752868, + "grad_norm": 7.485412143529312, + "learning_rate": 2.0991087528154546e-06, + "loss": 0.659, + "step": 15633 + }, + { + "epoch": 1.1294814600754963, + "grad_norm": 7.054816193109122, + "learning_rate": 2.09882003839698e-06, + "loss": 0.6928, + "step": 15634 + }, + { + "epoch": 1.1295537052757059, + "grad_norm": 6.319680237708611, + "learning_rate": 2.0985313294703463e-06, + "loss": 0.6579, + "step": 15635 + }, + { + "epoch": 1.1296259504759152, + "grad_norm": 5.698859974142444, + "learning_rate": 2.0982426260395077e-06, + "loss": 0.619, + "step": 15636 + }, + { + "epoch": 1.1296981956761247, + "grad_norm": 6.2565575340208985, + "learning_rate": 2.0979539281084154e-06, + "loss": 0.6448, + "step": 15637 + }, + { + "epoch": 1.1297704408763343, + "grad_norm": 6.80814131143406, + "learning_rate": 2.0976652356810217e-06, + "loss": 0.6906, + "step": 15638 + }, + { + "epoch": 1.1298426860765438, + "grad_norm": 7.257900274462509, + "learning_rate": 2.0973765487612786e-06, + "loss": 0.6832, + "step": 15639 + }, + { + "epoch": 1.1299149312767534, + "grad_norm": 6.32931966925245, + "learning_rate": 2.0970878673531377e-06, + "loss": 0.7015, + "step": 15640 + }, + { + "epoch": 1.129987176476963, + "grad_norm": 5.849537966488184, + "learning_rate": 2.0967991914605523e-06, + "loss": 0.6787, + "step": 15641 + }, + { + "epoch": 1.1300594216771724, + "grad_norm": 6.646230687745974, + "learning_rate": 2.096510521087471e-06, + "loss": 0.6307, + "step": 15642 + }, + { + "epoch": 1.1301316668773818, + "grad_norm": 6.288009726033927, + "learning_rate": 2.096221856237848e-06, + "loss": 0.6403, + "step": 15643 + }, + { + "epoch": 1.1302039120775913, + "grad_norm": 6.175448925672833, + "learning_rate": 2.095933196915635e-06, + "loss": 0.6515, + "step": 15644 + }, + { + "epoch": 1.1302761572778008, + "grad_norm": 6.375878067880111, + "learning_rate": 2.095644543124783e-06, + "loss": 0.6597, + "step": 15645 + }, + { + "epoch": 1.1303484024780104, + "grad_norm": 6.139778214714185, + "learning_rate": 2.0953558948692423e-06, + "loss": 0.619, + "step": 15646 + }, + { + "epoch": 1.13042064767822, + "grad_norm": 6.532668256319093, + "learning_rate": 2.095067252152965e-06, + "loss": 0.6606, + "step": 15647 + }, + { + "epoch": 1.1304928928784295, + "grad_norm": 7.281370497594679, + "learning_rate": 2.0947786149799036e-06, + "loss": 0.6717, + "step": 15648 + }, + { + "epoch": 1.130565138078639, + "grad_norm": 5.851405713332608, + "learning_rate": 2.094489983354008e-06, + "loss": 0.6381, + "step": 15649 + }, + { + "epoch": 1.1306373832788483, + "grad_norm": 6.071122163365663, + "learning_rate": 2.094201357279229e-06, + "loss": 0.6774, + "step": 15650 + }, + { + "epoch": 1.1307096284790579, + "grad_norm": 5.2496790106601585, + "learning_rate": 2.0939127367595188e-06, + "loss": 0.6622, + "step": 15651 + }, + { + "epoch": 1.1307818736792674, + "grad_norm": 6.929340134508266, + "learning_rate": 2.0936241217988286e-06, + "loss": 0.6892, + "step": 15652 + }, + { + "epoch": 1.130854118879477, + "grad_norm": 7.36514218236519, + "learning_rate": 2.093335512401108e-06, + "loss": 0.6356, + "step": 15653 + }, + { + "epoch": 1.1309263640796865, + "grad_norm": 6.862371880563487, + "learning_rate": 2.0930469085703077e-06, + "loss": 0.6314, + "step": 15654 + }, + { + "epoch": 1.130998609279896, + "grad_norm": 7.9932938124204895, + "learning_rate": 2.092758310310381e-06, + "loss": 0.6863, + "step": 15655 + }, + { + "epoch": 1.1310708544801056, + "grad_norm": 5.851409950865341, + "learning_rate": 2.092469717625275e-06, + "loss": 0.6021, + "step": 15656 + }, + { + "epoch": 1.131143099680315, + "grad_norm": 7.69526763748027, + "learning_rate": 2.092181130518943e-06, + "loss": 0.6601, + "step": 15657 + }, + { + "epoch": 1.1312153448805244, + "grad_norm": 7.197000609944466, + "learning_rate": 2.0918925489953348e-06, + "loss": 0.6274, + "step": 15658 + }, + { + "epoch": 1.131287590080734, + "grad_norm": 7.677782817911916, + "learning_rate": 2.091603973058401e-06, + "loss": 0.633, + "step": 15659 + }, + { + "epoch": 1.1313598352809435, + "grad_norm": 7.376487711192173, + "learning_rate": 2.0913154027120912e-06, + "loss": 0.7233, + "step": 15660 + }, + { + "epoch": 1.131432080481153, + "grad_norm": 6.1595103349144535, + "learning_rate": 2.0910268379603564e-06, + "loss": 0.6039, + "step": 15661 + }, + { + "epoch": 1.1315043256813626, + "grad_norm": 7.308383924366419, + "learning_rate": 2.090738278807146e-06, + "loss": 0.6549, + "step": 15662 + }, + { + "epoch": 1.1315765708815722, + "grad_norm": 6.578483952313014, + "learning_rate": 2.090449725256412e-06, + "loss": 0.6219, + "step": 15663 + }, + { + "epoch": 1.1316488160817815, + "grad_norm": 6.933206415237402, + "learning_rate": 2.0901611773121024e-06, + "loss": 0.6689, + "step": 15664 + }, + { + "epoch": 1.131721061281991, + "grad_norm": 7.054548800806398, + "learning_rate": 2.089872634978168e-06, + "loss": 0.6275, + "step": 15665 + }, + { + "epoch": 1.1317933064822006, + "grad_norm": 7.303672222224071, + "learning_rate": 2.0895840982585598e-06, + "loss": 0.6086, + "step": 15666 + }, + { + "epoch": 1.13186555168241, + "grad_norm": 6.362614586778207, + "learning_rate": 2.0892955671572256e-06, + "loss": 0.6385, + "step": 15667 + }, + { + "epoch": 1.1319377968826196, + "grad_norm": 7.118303866720277, + "learning_rate": 2.089007041678116e-06, + "loss": 0.612, + "step": 15668 + }, + { + "epoch": 1.1320100420828292, + "grad_norm": 8.116455750071646, + "learning_rate": 2.0887185218251814e-06, + "loss": 0.7192, + "step": 15669 + }, + { + "epoch": 1.1320822872830387, + "grad_norm": 6.210869324507905, + "learning_rate": 2.0884300076023716e-06, + "loss": 0.6135, + "step": 15670 + }, + { + "epoch": 1.132154532483248, + "grad_norm": 6.714942314485302, + "learning_rate": 2.088141499013634e-06, + "loss": 0.6714, + "step": 15671 + }, + { + "epoch": 1.1322267776834576, + "grad_norm": 7.273439597878323, + "learning_rate": 2.0878529960629203e-06, + "loss": 0.6222, + "step": 15672 + }, + { + "epoch": 1.1322990228836671, + "grad_norm": 7.107947573046095, + "learning_rate": 2.08756449875418e-06, + "loss": 0.7135, + "step": 15673 + }, + { + "epoch": 1.1323712680838767, + "grad_norm": 7.091261641534683, + "learning_rate": 2.08727600709136e-06, + "loss": 0.7242, + "step": 15674 + }, + { + "epoch": 1.1324435132840862, + "grad_norm": 6.548330359195698, + "learning_rate": 2.086987521078411e-06, + "loss": 0.624, + "step": 15675 + }, + { + "epoch": 1.1325157584842958, + "grad_norm": 6.4798191473884, + "learning_rate": 2.0866990407192828e-06, + "loss": 0.6347, + "step": 15676 + }, + { + "epoch": 1.1325880036845053, + "grad_norm": 6.329740040339057, + "learning_rate": 2.086410566017924e-06, + "loss": 0.5957, + "step": 15677 + }, + { + "epoch": 1.1326602488847146, + "grad_norm": 7.131171782563623, + "learning_rate": 2.0861220969782827e-06, + "loss": 0.631, + "step": 15678 + }, + { + "epoch": 1.1327324940849242, + "grad_norm": 7.201719862778168, + "learning_rate": 2.085833633604309e-06, + "loss": 0.6805, + "step": 15679 + }, + { + "epoch": 1.1328047392851337, + "grad_norm": 6.590220297574023, + "learning_rate": 2.085545175899951e-06, + "loss": 0.6912, + "step": 15680 + }, + { + "epoch": 1.1328769844853432, + "grad_norm": 6.756166255407213, + "learning_rate": 2.085256723869158e-06, + "loss": 0.6472, + "step": 15681 + }, + { + "epoch": 1.1329492296855528, + "grad_norm": 6.808716169384955, + "learning_rate": 2.084968277515878e-06, + "loss": 0.6836, + "step": 15682 + }, + { + "epoch": 1.1330214748857623, + "grad_norm": 6.240482866237884, + "learning_rate": 2.08467983684406e-06, + "loss": 0.6906, + "step": 15683 + }, + { + "epoch": 1.1330937200859719, + "grad_norm": 5.725255715601985, + "learning_rate": 2.0843914018576543e-06, + "loss": 0.709, + "step": 15684 + }, + { + "epoch": 1.1331659652861812, + "grad_norm": 6.4553280965215265, + "learning_rate": 2.0841029725606056e-06, + "loss": 0.6171, + "step": 15685 + }, + { + "epoch": 1.1332382104863907, + "grad_norm": 6.602382414874656, + "learning_rate": 2.0838145489568644e-06, + "loss": 0.5969, + "step": 15686 + }, + { + "epoch": 1.1333104556866003, + "grad_norm": 6.073528522882042, + "learning_rate": 2.0835261310503803e-06, + "loss": 0.6291, + "step": 15687 + }, + { + "epoch": 1.1333827008868098, + "grad_norm": 9.789784958126873, + "learning_rate": 2.0832377188450992e-06, + "loss": 0.5838, + "step": 15688 + }, + { + "epoch": 1.1334549460870194, + "grad_norm": 7.044277211372652, + "learning_rate": 2.08294931234497e-06, + "loss": 0.6993, + "step": 15689 + }, + { + "epoch": 1.133527191287229, + "grad_norm": 6.56014216617891, + "learning_rate": 2.0826609115539407e-06, + "loss": 0.689, + "step": 15690 + }, + { + "epoch": 1.1335994364874384, + "grad_norm": 7.334951771250597, + "learning_rate": 2.08237251647596e-06, + "loss": 0.7113, + "step": 15691 + }, + { + "epoch": 1.1336716816876478, + "grad_norm": 6.075078127009362, + "learning_rate": 2.082084127114975e-06, + "loss": 0.6723, + "step": 15692 + }, + { + "epoch": 1.1337439268878573, + "grad_norm": 8.252963834183442, + "learning_rate": 2.0817957434749335e-06, + "loss": 0.7107, + "step": 15693 + }, + { + "epoch": 1.1338161720880668, + "grad_norm": 7.769587712684481, + "learning_rate": 2.081507365559784e-06, + "loss": 0.6533, + "step": 15694 + }, + { + "epoch": 1.1338884172882764, + "grad_norm": 7.141668531431699, + "learning_rate": 2.081218993373474e-06, + "loss": 0.7571, + "step": 15695 + }, + { + "epoch": 1.133960662488486, + "grad_norm": 7.953369616510724, + "learning_rate": 2.0809306269199504e-06, + "loss": 0.703, + "step": 15696 + }, + { + "epoch": 1.1340329076886955, + "grad_norm": 6.180422964590711, + "learning_rate": 2.0806422662031607e-06, + "loss": 0.6774, + "step": 15697 + }, + { + "epoch": 1.134105152888905, + "grad_norm": 6.021993699064478, + "learning_rate": 2.080353911227054e-06, + "loss": 0.6548, + "step": 15698 + }, + { + "epoch": 1.1341773980891143, + "grad_norm": 6.911424450382831, + "learning_rate": 2.080065561995575e-06, + "loss": 0.6563, + "step": 15699 + }, + { + "epoch": 1.1342496432893239, + "grad_norm": 6.543350035532258, + "learning_rate": 2.079777218512673e-06, + "loss": 0.6354, + "step": 15700 + }, + { + "epoch": 1.1343218884895334, + "grad_norm": 6.47515934928499, + "learning_rate": 2.0794888807822947e-06, + "loss": 0.611, + "step": 15701 + }, + { + "epoch": 1.134394133689743, + "grad_norm": 6.470315200226586, + "learning_rate": 2.079200548808388e-06, + "loss": 0.6899, + "step": 15702 + }, + { + "epoch": 1.1344663788899525, + "grad_norm": 7.290007077228057, + "learning_rate": 2.078912222594898e-06, + "loss": 0.6574, + "step": 15703 + }, + { + "epoch": 1.134538624090162, + "grad_norm": 6.2014640679192325, + "learning_rate": 2.0786239021457727e-06, + "loss": 0.6657, + "step": 15704 + }, + { + "epoch": 1.1346108692903716, + "grad_norm": 9.629950105056563, + "learning_rate": 2.0783355874649598e-06, + "loss": 0.7134, + "step": 15705 + }, + { + "epoch": 1.1346831144905811, + "grad_norm": 6.716444462712628, + "learning_rate": 2.078047278556404e-06, + "loss": 0.6803, + "step": 15706 + }, + { + "epoch": 1.1347553596907907, + "grad_norm": 8.429855839954909, + "learning_rate": 2.077758975424054e-06, + "loss": 0.6651, + "step": 15707 + }, + { + "epoch": 1.134827604891, + "grad_norm": 5.875234071654381, + "learning_rate": 2.0774706780718557e-06, + "loss": 0.6093, + "step": 15708 + }, + { + "epoch": 1.1348998500912095, + "grad_norm": 6.411290711712259, + "learning_rate": 2.077182386503756e-06, + "loss": 0.6234, + "step": 15709 + }, + { + "epoch": 1.134972095291419, + "grad_norm": 8.284061746275995, + "learning_rate": 2.076894100723701e-06, + "loss": 0.6718, + "step": 15710 + }, + { + "epoch": 1.1350443404916286, + "grad_norm": 8.443218638588244, + "learning_rate": 2.076605820735637e-06, + "loss": 0.6588, + "step": 15711 + }, + { + "epoch": 1.1351165856918382, + "grad_norm": 7.808565415468852, + "learning_rate": 2.0763175465435117e-06, + "loss": 0.6431, + "step": 15712 + }, + { + "epoch": 1.1351888308920477, + "grad_norm": 6.515448185925245, + "learning_rate": 2.0760292781512685e-06, + "loss": 0.5892, + "step": 15713 + }, + { + "epoch": 1.1352610760922572, + "grad_norm": 5.7758348001964315, + "learning_rate": 2.075741015562856e-06, + "loss": 0.6896, + "step": 15714 + }, + { + "epoch": 1.1353333212924666, + "grad_norm": 5.072185528944999, + "learning_rate": 2.0754527587822195e-06, + "loss": 0.6415, + "step": 15715 + }, + { + "epoch": 1.135405566492676, + "grad_norm": 6.003832546717424, + "learning_rate": 2.0751645078133064e-06, + "loss": 0.6944, + "step": 15716 + }, + { + "epoch": 1.1354778116928856, + "grad_norm": 8.481637305775267, + "learning_rate": 2.07487626266006e-06, + "loss": 0.7404, + "step": 15717 + }, + { + "epoch": 1.1355500568930952, + "grad_norm": 5.197449484135149, + "learning_rate": 2.0745880233264273e-06, + "loss": 0.6408, + "step": 15718 + }, + { + "epoch": 1.1356223020933047, + "grad_norm": 6.998723594914133, + "learning_rate": 2.074299789816354e-06, + "loss": 0.6922, + "step": 15719 + }, + { + "epoch": 1.1356945472935143, + "grad_norm": 5.298144163146783, + "learning_rate": 2.0740115621337873e-06, + "loss": 0.66, + "step": 15720 + }, + { + "epoch": 1.1357667924937238, + "grad_norm": 6.328164182941964, + "learning_rate": 2.073723340282671e-06, + "loss": 0.6932, + "step": 15721 + }, + { + "epoch": 1.1358390376939331, + "grad_norm": 5.910310510007662, + "learning_rate": 2.0734351242669508e-06, + "loss": 0.6841, + "step": 15722 + }, + { + "epoch": 1.1359112828941427, + "grad_norm": 5.627425963407498, + "learning_rate": 2.0731469140905735e-06, + "loss": 0.6413, + "step": 15723 + }, + { + "epoch": 1.1359835280943522, + "grad_norm": 6.469853256424598, + "learning_rate": 2.072858709757482e-06, + "loss": 0.6437, + "step": 15724 + }, + { + "epoch": 1.1360557732945618, + "grad_norm": 6.869158395804423, + "learning_rate": 2.072570511271624e-06, + "loss": 0.6702, + "step": 15725 + }, + { + "epoch": 1.1361280184947713, + "grad_norm": 6.822938254130528, + "learning_rate": 2.072282318636944e-06, + "loss": 0.6335, + "step": 15726 + }, + { + "epoch": 1.1362002636949808, + "grad_norm": 7.502249825312821, + "learning_rate": 2.071994131857387e-06, + "loss": 0.7014, + "step": 15727 + }, + { + "epoch": 1.1362725088951904, + "grad_norm": 6.064843737420295, + "learning_rate": 2.0717059509368974e-06, + "loss": 0.6876, + "step": 15728 + }, + { + "epoch": 1.1363447540953997, + "grad_norm": 6.473495438206206, + "learning_rate": 2.071417775879421e-06, + "loss": 0.6153, + "step": 15729 + }, + { + "epoch": 1.1364169992956092, + "grad_norm": 6.299300491008934, + "learning_rate": 2.0711296066889037e-06, + "loss": 0.7204, + "step": 15730 + }, + { + "epoch": 1.1364892444958188, + "grad_norm": 6.30176439555386, + "learning_rate": 2.070841443369288e-06, + "loss": 0.6049, + "step": 15731 + }, + { + "epoch": 1.1365614896960283, + "grad_norm": 6.4993051010761915, + "learning_rate": 2.070553285924519e-06, + "loss": 0.6749, + "step": 15732 + }, + { + "epoch": 1.1366337348962379, + "grad_norm": 6.403134067393074, + "learning_rate": 2.070265134358542e-06, + "loss": 0.6858, + "step": 15733 + }, + { + "epoch": 1.1367059800964474, + "grad_norm": 6.826786564777413, + "learning_rate": 2.069976988675304e-06, + "loss": 0.7306, + "step": 15734 + }, + { + "epoch": 1.136778225296657, + "grad_norm": 7.910884177540396, + "learning_rate": 2.069688848878745e-06, + "loss": 0.6388, + "step": 15735 + }, + { + "epoch": 1.1368504704968663, + "grad_norm": 6.161198984353994, + "learning_rate": 2.069400714972812e-06, + "loss": 0.6693, + "step": 15736 + }, + { + "epoch": 1.1369227156970758, + "grad_norm": 7.413927179514547, + "learning_rate": 2.0691125869614492e-06, + "loss": 0.639, + "step": 15737 + }, + { + "epoch": 1.1369949608972854, + "grad_norm": 6.352511364394843, + "learning_rate": 2.0688244648486e-06, + "loss": 0.6057, + "step": 15738 + }, + { + "epoch": 1.137067206097495, + "grad_norm": 6.68149189809993, + "learning_rate": 2.0685363486382096e-06, + "loss": 0.618, + "step": 15739 + }, + { + "epoch": 1.1371394512977044, + "grad_norm": 6.70511254497064, + "learning_rate": 2.0682482383342212e-06, + "loss": 0.6136, + "step": 15740 + }, + { + "epoch": 1.137211696497914, + "grad_norm": 6.3523798530443125, + "learning_rate": 2.0679601339405797e-06, + "loss": 0.6896, + "step": 15741 + }, + { + "epoch": 1.1372839416981235, + "grad_norm": 6.984092348255074, + "learning_rate": 2.0676720354612277e-06, + "loss": 0.6699, + "step": 15742 + }, + { + "epoch": 1.1373561868983328, + "grad_norm": 6.11385904817387, + "learning_rate": 2.0673839429001104e-06, + "loss": 0.6742, + "step": 15743 + }, + { + "epoch": 1.1374284320985424, + "grad_norm": 6.761733982326864, + "learning_rate": 2.067095856261171e-06, + "loss": 0.637, + "step": 15744 + }, + { + "epoch": 1.137500677298752, + "grad_norm": 8.38558928191576, + "learning_rate": 2.066807775548354e-06, + "loss": 0.6498, + "step": 15745 + }, + { + "epoch": 1.1375729224989615, + "grad_norm": 6.843198989215925, + "learning_rate": 2.066519700765601e-06, + "loss": 0.6729, + "step": 15746 + }, + { + "epoch": 1.137645167699171, + "grad_norm": 8.890415182470345, + "learning_rate": 2.0662316319168567e-06, + "loss": 0.7615, + "step": 15747 + }, + { + "epoch": 1.1377174128993806, + "grad_norm": 6.39924379888044, + "learning_rate": 2.0659435690060656e-06, + "loss": 0.6505, + "step": 15748 + }, + { + "epoch": 1.13778965809959, + "grad_norm": 6.197013246318651, + "learning_rate": 2.065655512037169e-06, + "loss": 0.5943, + "step": 15749 + }, + { + "epoch": 1.1378619032997994, + "grad_norm": 6.664226784866847, + "learning_rate": 2.0653674610141113e-06, + "loss": 0.6648, + "step": 15750 + }, + { + "epoch": 1.137934148500009, + "grad_norm": 6.073111459564444, + "learning_rate": 2.065079415940836e-06, + "loss": 0.7024, + "step": 15751 + }, + { + "epoch": 1.1380063937002185, + "grad_norm": 6.793357623015989, + "learning_rate": 2.064791376821286e-06, + "loss": 0.6296, + "step": 15752 + }, + { + "epoch": 1.138078638900428, + "grad_norm": 6.232853983571056, + "learning_rate": 2.0645033436594035e-06, + "loss": 0.6943, + "step": 15753 + }, + { + "epoch": 1.1381508841006376, + "grad_norm": 6.1192737203248075, + "learning_rate": 2.064215316459132e-06, + "loss": 0.6127, + "step": 15754 + }, + { + "epoch": 1.1382231293008471, + "grad_norm": 6.166590647185926, + "learning_rate": 2.0639272952244153e-06, + "loss": 0.6186, + "step": 15755 + }, + { + "epoch": 1.1382953745010567, + "grad_norm": 6.275513112036014, + "learning_rate": 2.0636392799591944e-06, + "loss": 0.6243, + "step": 15756 + }, + { + "epoch": 1.138367619701266, + "grad_norm": 6.41734610919538, + "learning_rate": 2.0633512706674133e-06, + "loss": 0.5934, + "step": 15757 + }, + { + "epoch": 1.1384398649014755, + "grad_norm": 6.504551101196467, + "learning_rate": 2.063063267353014e-06, + "loss": 0.6523, + "step": 15758 + }, + { + "epoch": 1.138512110101685, + "grad_norm": 6.370109028264934, + "learning_rate": 2.0627752700199407e-06, + "loss": 0.7143, + "step": 15759 + }, + { + "epoch": 1.1385843553018946, + "grad_norm": 6.718751135537695, + "learning_rate": 2.0624872786721333e-06, + "loss": 0.659, + "step": 15760 + }, + { + "epoch": 1.1386566005021042, + "grad_norm": 6.011801874628368, + "learning_rate": 2.0621992933135354e-06, + "loss": 0.6822, + "step": 15761 + }, + { + "epoch": 1.1387288457023137, + "grad_norm": 6.036854092184295, + "learning_rate": 2.0619113139480894e-06, + "loss": 0.6898, + "step": 15762 + }, + { + "epoch": 1.1388010909025232, + "grad_norm": 7.392113952980409, + "learning_rate": 2.061623340579737e-06, + "loss": 0.5987, + "step": 15763 + }, + { + "epoch": 1.1388733361027326, + "grad_norm": 6.119767114035411, + "learning_rate": 2.061335373212421e-06, + "loss": 0.6068, + "step": 15764 + }, + { + "epoch": 1.138945581302942, + "grad_norm": 6.0821152952606665, + "learning_rate": 2.061047411850083e-06, + "loss": 0.6005, + "step": 15765 + }, + { + "epoch": 1.1390178265031516, + "grad_norm": 5.816172434332452, + "learning_rate": 2.0607594564966655e-06, + "loss": 0.6624, + "step": 15766 + }, + { + "epoch": 1.1390900717033612, + "grad_norm": 7.089378589682018, + "learning_rate": 2.0604715071561095e-06, + "loss": 0.6131, + "step": 15767 + }, + { + "epoch": 1.1391623169035707, + "grad_norm": 8.0867187122621, + "learning_rate": 2.0601835638323576e-06, + "loss": 0.6736, + "step": 15768 + }, + { + "epoch": 1.1392345621037803, + "grad_norm": 6.526910389657509, + "learning_rate": 2.0598956265293507e-06, + "loss": 0.6341, + "step": 15769 + }, + { + "epoch": 1.1393068073039898, + "grad_norm": 7.51099987825473, + "learning_rate": 2.059607695251032e-06, + "loss": 0.6819, + "step": 15770 + }, + { + "epoch": 1.1393790525041991, + "grad_norm": 7.145926012452092, + "learning_rate": 2.059319770001341e-06, + "loss": 0.6395, + "step": 15771 + }, + { + "epoch": 1.1394512977044087, + "grad_norm": 5.5086653678111075, + "learning_rate": 2.0590318507842206e-06, + "loss": 0.6587, + "step": 15772 + }, + { + "epoch": 1.1395235429046182, + "grad_norm": 6.590304518568238, + "learning_rate": 2.0587439376036125e-06, + "loss": 0.6987, + "step": 15773 + }, + { + "epoch": 1.1395957881048278, + "grad_norm": 9.09910651211687, + "learning_rate": 2.0584560304634564e-06, + "loss": 0.6514, + "step": 15774 + }, + { + "epoch": 1.1396680333050373, + "grad_norm": 6.34075195426055, + "learning_rate": 2.0581681293676944e-06, + "loss": 0.6823, + "step": 15775 + }, + { + "epoch": 1.1397402785052468, + "grad_norm": 6.990514322768116, + "learning_rate": 2.057880234320267e-06, + "loss": 0.6543, + "step": 15776 + }, + { + "epoch": 1.1398125237054564, + "grad_norm": 7.268439947388905, + "learning_rate": 2.0575923453251176e-06, + "loss": 0.6509, + "step": 15777 + }, + { + "epoch": 1.139884768905666, + "grad_norm": 6.534608406411702, + "learning_rate": 2.0573044623861844e-06, + "loss": 0.6312, + "step": 15778 + }, + { + "epoch": 1.1399570141058755, + "grad_norm": 6.5049437282512175, + "learning_rate": 2.057016585507409e-06, + "loss": 0.6549, + "step": 15779 + }, + { + "epoch": 1.1400292593060848, + "grad_norm": 6.357795687934733, + "learning_rate": 2.056728714692734e-06, + "loss": 0.6313, + "step": 15780 + }, + { + "epoch": 1.1401015045062943, + "grad_norm": 7.987603357926861, + "learning_rate": 2.0564408499460974e-06, + "loss": 0.7, + "step": 15781 + }, + { + "epoch": 1.1401737497065039, + "grad_norm": 6.521927161892357, + "learning_rate": 2.0561529912714416e-06, + "loss": 0.7176, + "step": 15782 + }, + { + "epoch": 1.1402459949067134, + "grad_norm": 7.491748339498134, + "learning_rate": 2.0558651386727064e-06, + "loss": 0.7172, + "step": 15783 + }, + { + "epoch": 1.140318240106923, + "grad_norm": 7.361118766092923, + "learning_rate": 2.0555772921538335e-06, + "loss": 0.7957, + "step": 15784 + }, + { + "epoch": 1.1403904853071325, + "grad_norm": 7.400813558035413, + "learning_rate": 2.0552894517187617e-06, + "loss": 0.7465, + "step": 15785 + }, + { + "epoch": 1.140462730507342, + "grad_norm": 5.64119525256514, + "learning_rate": 2.055001617371432e-06, + "loss": 0.6407, + "step": 15786 + }, + { + "epoch": 1.1405349757075514, + "grad_norm": 8.45060559873383, + "learning_rate": 2.0547137891157855e-06, + "loss": 0.7474, + "step": 15787 + }, + { + "epoch": 1.140607220907761, + "grad_norm": 5.949814404069349, + "learning_rate": 2.0544259669557608e-06, + "loss": 0.6138, + "step": 15788 + }, + { + "epoch": 1.1406794661079704, + "grad_norm": 5.570465043000278, + "learning_rate": 2.0541381508952986e-06, + "loss": 0.6233, + "step": 15789 + }, + { + "epoch": 1.14075171130818, + "grad_norm": 6.555384384047012, + "learning_rate": 2.0538503409383382e-06, + "loss": 0.66, + "step": 15790 + }, + { + "epoch": 1.1408239565083895, + "grad_norm": 7.527580918681957, + "learning_rate": 2.0535625370888216e-06, + "loss": 0.5767, + "step": 15791 + }, + { + "epoch": 1.140896201708599, + "grad_norm": 6.595154526803893, + "learning_rate": 2.0532747393506867e-06, + "loss": 0.628, + "step": 15792 + }, + { + "epoch": 1.1409684469088086, + "grad_norm": 5.919877734597069, + "learning_rate": 2.0529869477278735e-06, + "loss": 0.578, + "step": 15793 + }, + { + "epoch": 1.141040692109018, + "grad_norm": 6.960557167289236, + "learning_rate": 2.052699162224322e-06, + "loss": 0.6321, + "step": 15794 + }, + { + "epoch": 1.1411129373092275, + "grad_norm": 6.756407345784294, + "learning_rate": 2.052411382843972e-06, + "loss": 0.6306, + "step": 15795 + }, + { + "epoch": 1.141185182509437, + "grad_norm": 6.906603773903442, + "learning_rate": 2.052123609590762e-06, + "loss": 0.6839, + "step": 15796 + }, + { + "epoch": 1.1412574277096466, + "grad_norm": 6.767919138233695, + "learning_rate": 2.051835842468632e-06, + "loss": 0.599, + "step": 15797 + }, + { + "epoch": 1.141329672909856, + "grad_norm": 5.933148204758491, + "learning_rate": 2.051548081481522e-06, + "loss": 0.623, + "step": 15798 + }, + { + "epoch": 1.1414019181100656, + "grad_norm": 6.476277417579726, + "learning_rate": 2.0512603266333706e-06, + "loss": 0.6168, + "step": 15799 + }, + { + "epoch": 1.1414741633102752, + "grad_norm": 7.246303800021569, + "learning_rate": 2.0509725779281163e-06, + "loss": 0.6774, + "step": 15800 + }, + { + "epoch": 1.1415464085104845, + "grad_norm": 7.94541020366119, + "learning_rate": 2.050684835369699e-06, + "loss": 0.7139, + "step": 15801 + }, + { + "epoch": 1.141618653710694, + "grad_norm": 7.06136481690279, + "learning_rate": 2.0503970989620585e-06, + "loss": 0.6335, + "step": 15802 + }, + { + "epoch": 1.1416908989109036, + "grad_norm": 5.989773300033233, + "learning_rate": 2.050109368709131e-06, + "loss": 0.5343, + "step": 15803 + }, + { + "epoch": 1.1417631441111131, + "grad_norm": 5.344686682629173, + "learning_rate": 2.049821644614857e-06, + "loss": 0.6656, + "step": 15804 + }, + { + "epoch": 1.1418353893113227, + "grad_norm": 7.942504026359712, + "learning_rate": 2.0495339266831766e-06, + "loss": 0.612, + "step": 15805 + }, + { + "epoch": 1.1419076345115322, + "grad_norm": 7.855186605294867, + "learning_rate": 2.0492462149180257e-06, + "loss": 0.5424, + "step": 15806 + }, + { + "epoch": 1.1419798797117418, + "grad_norm": 6.335382665383813, + "learning_rate": 2.0489585093233446e-06, + "loss": 0.6261, + "step": 15807 + }, + { + "epoch": 1.142052124911951, + "grad_norm": 7.788595019117772, + "learning_rate": 2.0486708099030712e-06, + "loss": 0.6297, + "step": 15808 + }, + { + "epoch": 1.1421243701121606, + "grad_norm": 7.091235820205366, + "learning_rate": 2.048383116661144e-06, + "loss": 0.7061, + "step": 15809 + }, + { + "epoch": 1.1421966153123702, + "grad_norm": 6.199124908217735, + "learning_rate": 2.048095429601501e-06, + "loss": 0.5847, + "step": 15810 + }, + { + "epoch": 1.1422688605125797, + "grad_norm": 5.648473969021273, + "learning_rate": 2.047807748728081e-06, + "loss": 0.5899, + "step": 15811 + }, + { + "epoch": 1.1423411057127892, + "grad_norm": 6.596956314171255, + "learning_rate": 2.047520074044822e-06, + "loss": 0.6971, + "step": 15812 + }, + { + "epoch": 1.1424133509129988, + "grad_norm": 5.629444485728189, + "learning_rate": 2.0472324055556614e-06, + "loss": 0.6246, + "step": 15813 + }, + { + "epoch": 1.1424855961132083, + "grad_norm": 6.919895965587962, + "learning_rate": 2.0469447432645377e-06, + "loss": 0.6203, + "step": 15814 + }, + { + "epoch": 1.1425578413134176, + "grad_norm": 6.332551138242551, + "learning_rate": 2.046657087175389e-06, + "loss": 0.686, + "step": 15815 + }, + { + "epoch": 1.1426300865136272, + "grad_norm": 6.6669929742428495, + "learning_rate": 2.0463694372921535e-06, + "loss": 0.7485, + "step": 15816 + }, + { + "epoch": 1.1427023317138367, + "grad_norm": 6.886266181775834, + "learning_rate": 2.046081793618767e-06, + "loss": 0.5864, + "step": 15817 + }, + { + "epoch": 1.1427745769140463, + "grad_norm": 7.763366002229336, + "learning_rate": 2.0457941561591676e-06, + "loss": 0.6677, + "step": 15818 + }, + { + "epoch": 1.1428468221142558, + "grad_norm": 6.758956618285924, + "learning_rate": 2.0455065249172957e-06, + "loss": 0.6212, + "step": 15819 + }, + { + "epoch": 1.1429190673144654, + "grad_norm": 5.969925529999536, + "learning_rate": 2.045218899897085e-06, + "loss": 0.6106, + "step": 15820 + }, + { + "epoch": 1.142991312514675, + "grad_norm": 8.464278613749189, + "learning_rate": 2.0449312811024744e-06, + "loss": 0.6591, + "step": 15821 + }, + { + "epoch": 1.1430635577148842, + "grad_norm": 6.886838949780577, + "learning_rate": 2.0446436685374016e-06, + "loss": 0.7045, + "step": 15822 + }, + { + "epoch": 1.1431358029150938, + "grad_norm": 8.761823676506141, + "learning_rate": 2.044356062205804e-06, + "loss": 0.6378, + "step": 15823 + }, + { + "epoch": 1.1432080481153033, + "grad_norm": 7.395472930580123, + "learning_rate": 2.0440684621116175e-06, + "loss": 0.689, + "step": 15824 + }, + { + "epoch": 1.1432802933155128, + "grad_norm": 7.884414857815797, + "learning_rate": 2.0437808682587794e-06, + "loss": 0.7313, + "step": 15825 + }, + { + "epoch": 1.1433525385157224, + "grad_norm": 6.02653390186775, + "learning_rate": 2.0434932806512275e-06, + "loss": 0.691, + "step": 15826 + }, + { + "epoch": 1.143424783715932, + "grad_norm": 5.426190420586873, + "learning_rate": 2.0432056992928983e-06, + "loss": 0.7035, + "step": 15827 + }, + { + "epoch": 1.1434970289161415, + "grad_norm": 8.67147336837391, + "learning_rate": 2.042918124187728e-06, + "loss": 0.7234, + "step": 15828 + }, + { + "epoch": 1.1435692741163508, + "grad_norm": 6.026055030731004, + "learning_rate": 2.0426305553396536e-06, + "loss": 0.5902, + "step": 15829 + }, + { + "epoch": 1.1436415193165603, + "grad_norm": 6.646751826642146, + "learning_rate": 2.042342992752613e-06, + "loss": 0.6715, + "step": 15830 + }, + { + "epoch": 1.1437137645167699, + "grad_norm": 7.073907062777437, + "learning_rate": 2.0420554364305405e-06, + "loss": 0.6614, + "step": 15831 + }, + { + "epoch": 1.1437860097169794, + "grad_norm": 7.656451288808062, + "learning_rate": 2.0417678863773725e-06, + "loss": 0.7453, + "step": 15832 + }, + { + "epoch": 1.143858254917189, + "grad_norm": 5.838053001986528, + "learning_rate": 2.041480342597047e-06, + "loss": 0.5757, + "step": 15833 + }, + { + "epoch": 1.1439305001173985, + "grad_norm": 6.993399914526693, + "learning_rate": 2.041192805093501e-06, + "loss": 0.7168, + "step": 15834 + }, + { + "epoch": 1.144002745317608, + "grad_norm": 6.444551328160019, + "learning_rate": 2.040905273870668e-06, + "loss": 0.6685, + "step": 15835 + }, + { + "epoch": 1.1440749905178174, + "grad_norm": 7.572037664627088, + "learning_rate": 2.0406177489324854e-06, + "loss": 0.6361, + "step": 15836 + }, + { + "epoch": 1.144147235718027, + "grad_norm": 6.929595292949817, + "learning_rate": 2.04033023028289e-06, + "loss": 0.6591, + "step": 15837 + }, + { + "epoch": 1.1442194809182364, + "grad_norm": 4.906848457914757, + "learning_rate": 2.0400427179258157e-06, + "loss": 0.621, + "step": 15838 + }, + { + "epoch": 1.144291726118446, + "grad_norm": 5.744066577111129, + "learning_rate": 2.0397552118652e-06, + "loss": 0.6455, + "step": 15839 + }, + { + "epoch": 1.1443639713186555, + "grad_norm": 7.719991607266148, + "learning_rate": 2.0394677121049773e-06, + "loss": 0.6311, + "step": 15840 + }, + { + "epoch": 1.144436216518865, + "grad_norm": 6.062214952826841, + "learning_rate": 2.039180218649085e-06, + "loss": 0.6545, + "step": 15841 + }, + { + "epoch": 1.1445084617190746, + "grad_norm": 6.106128815587567, + "learning_rate": 2.038892731501457e-06, + "loss": 0.6439, + "step": 15842 + }, + { + "epoch": 1.144580706919284, + "grad_norm": 5.730776954689, + "learning_rate": 2.03860525066603e-06, + "loss": 0.64, + "step": 15843 + }, + { + "epoch": 1.1446529521194935, + "grad_norm": 6.002081509975733, + "learning_rate": 2.0383177761467397e-06, + "loss": 0.7373, + "step": 15844 + }, + { + "epoch": 1.144725197319703, + "grad_norm": 6.75830499695632, + "learning_rate": 2.0380303079475196e-06, + "loss": 0.6512, + "step": 15845 + }, + { + "epoch": 1.1447974425199126, + "grad_norm": 6.196221572352109, + "learning_rate": 2.0377428460723055e-06, + "loss": 0.7246, + "step": 15846 + }, + { + "epoch": 1.144869687720122, + "grad_norm": 6.672575652521524, + "learning_rate": 2.0374553905250327e-06, + "loss": 0.653, + "step": 15847 + }, + { + "epoch": 1.1449419329203316, + "grad_norm": 7.455969181832077, + "learning_rate": 2.0371679413096378e-06, + "loss": 0.6135, + "step": 15848 + }, + { + "epoch": 1.1450141781205412, + "grad_norm": 7.854137336278195, + "learning_rate": 2.0368804984300536e-06, + "loss": 0.5675, + "step": 15849 + }, + { + "epoch": 1.1450864233207507, + "grad_norm": 7.4823097134963374, + "learning_rate": 2.0365930618902158e-06, + "loss": 0.7047, + "step": 15850 + }, + { + "epoch": 1.14515866852096, + "grad_norm": 5.624081345882847, + "learning_rate": 2.036305631694059e-06, + "loss": 0.6619, + "step": 15851 + }, + { + "epoch": 1.1452309137211696, + "grad_norm": 7.756368911901352, + "learning_rate": 2.0360182078455186e-06, + "loss": 0.659, + "step": 15852 + }, + { + "epoch": 1.1453031589213791, + "grad_norm": 6.751271163491957, + "learning_rate": 2.035730790348528e-06, + "loss": 0.599, + "step": 15853 + }, + { + "epoch": 1.1453754041215887, + "grad_norm": 7.500037638251912, + "learning_rate": 2.035443379207023e-06, + "loss": 0.6554, + "step": 15854 + }, + { + "epoch": 1.1454476493217982, + "grad_norm": 7.111157539666456, + "learning_rate": 2.0351559744249376e-06, + "loss": 0.6579, + "step": 15855 + }, + { + "epoch": 1.1455198945220078, + "grad_norm": 6.283819024348308, + "learning_rate": 2.0348685760062055e-06, + "loss": 0.7155, + "step": 15856 + }, + { + "epoch": 1.1455921397222173, + "grad_norm": 6.725371464676924, + "learning_rate": 2.034581183954761e-06, + "loss": 0.6586, + "step": 15857 + }, + { + "epoch": 1.1456643849224268, + "grad_norm": 6.849099494049895, + "learning_rate": 2.0342937982745394e-06, + "loss": 0.6807, + "step": 15858 + }, + { + "epoch": 1.1457366301226362, + "grad_norm": 6.339218851577327, + "learning_rate": 2.0340064189694746e-06, + "loss": 0.6479, + "step": 15859 + }, + { + "epoch": 1.1458088753228457, + "grad_norm": 6.275190324670627, + "learning_rate": 2.0337190460434993e-06, + "loss": 0.6491, + "step": 15860 + }, + { + "epoch": 1.1458811205230552, + "grad_norm": 6.338668516880697, + "learning_rate": 2.033431679500548e-06, + "loss": 0.6552, + "step": 15861 + }, + { + "epoch": 1.1459533657232648, + "grad_norm": 6.426101815712107, + "learning_rate": 2.033144319344556e-06, + "loss": 0.5776, + "step": 15862 + }, + { + "epoch": 1.1460256109234743, + "grad_norm": 6.515875283243772, + "learning_rate": 2.032856965579455e-06, + "loss": 0.6405, + "step": 15863 + }, + { + "epoch": 1.1460978561236839, + "grad_norm": 7.531243921807714, + "learning_rate": 2.0325696182091785e-06, + "loss": 0.6909, + "step": 15864 + }, + { + "epoch": 1.1461701013238934, + "grad_norm": 6.324915093014126, + "learning_rate": 2.0322822772376618e-06, + "loss": 0.647, + "step": 15865 + }, + { + "epoch": 1.1462423465241027, + "grad_norm": 7.958686487906969, + "learning_rate": 2.0319949426688382e-06, + "loss": 0.6929, + "step": 15866 + }, + { + "epoch": 1.1463145917243123, + "grad_norm": 7.589570578991036, + "learning_rate": 2.0317076145066395e-06, + "loss": 0.6171, + "step": 15867 + }, + { + "epoch": 1.1463868369245218, + "grad_norm": 6.310299026972847, + "learning_rate": 2.0314202927550003e-06, + "loss": 0.6329, + "step": 15868 + }, + { + "epoch": 1.1464590821247314, + "grad_norm": 5.9479907065179765, + "learning_rate": 2.0311329774178536e-06, + "loss": 0.6038, + "step": 15869 + }, + { + "epoch": 1.146531327324941, + "grad_norm": 6.323205311789192, + "learning_rate": 2.0308456684991325e-06, + "loss": 0.5826, + "step": 15870 + }, + { + "epoch": 1.1466035725251504, + "grad_norm": 14.69100741448792, + "learning_rate": 2.030558366002769e-06, + "loss": 0.7269, + "step": 15871 + }, + { + "epoch": 1.14667581772536, + "grad_norm": 5.778721601881404, + "learning_rate": 2.030271069932698e-06, + "loss": 0.6433, + "step": 15872 + }, + { + "epoch": 1.1467480629255693, + "grad_norm": 6.309529427031478, + "learning_rate": 2.0299837802928516e-06, + "loss": 0.6449, + "step": 15873 + }, + { + "epoch": 1.1468203081257788, + "grad_norm": 5.440311976350659, + "learning_rate": 2.0296964970871615e-06, + "loss": 0.5678, + "step": 15874 + }, + { + "epoch": 1.1468925533259884, + "grad_norm": 10.0417383806087, + "learning_rate": 2.0294092203195605e-06, + "loss": 0.6759, + "step": 15875 + }, + { + "epoch": 1.146964798526198, + "grad_norm": 7.107570544955124, + "learning_rate": 2.0291219499939827e-06, + "loss": 0.6093, + "step": 15876 + }, + { + "epoch": 1.1470370437264075, + "grad_norm": 5.380665122401117, + "learning_rate": 2.0288346861143607e-06, + "loss": 0.6402, + "step": 15877 + }, + { + "epoch": 1.147109288926617, + "grad_norm": 6.79376275729023, + "learning_rate": 2.028547428684625e-06, + "loss": 0.5896, + "step": 15878 + }, + { + "epoch": 1.1471815341268266, + "grad_norm": 6.411801496092418, + "learning_rate": 2.028260177708709e-06, + "loss": 0.6939, + "step": 15879 + }, + { + "epoch": 1.1472537793270359, + "grad_norm": 8.171408645422666, + "learning_rate": 2.0279729331905454e-06, + "loss": 0.6867, + "step": 15880 + }, + { + "epoch": 1.1473260245272454, + "grad_norm": 6.603402399391621, + "learning_rate": 2.027685695134065e-06, + "loss": 0.6617, + "step": 15881 + }, + { + "epoch": 1.147398269727455, + "grad_norm": 7.0706229968941905, + "learning_rate": 2.027398463543201e-06, + "loss": 0.6914, + "step": 15882 + }, + { + "epoch": 1.1474705149276645, + "grad_norm": 7.942151726569542, + "learning_rate": 2.027111238421885e-06, + "loss": 0.66, + "step": 15883 + }, + { + "epoch": 1.147542760127874, + "grad_norm": 7.307874210889083, + "learning_rate": 2.02682401977405e-06, + "loss": 0.6523, + "step": 15884 + }, + { + "epoch": 1.1476150053280836, + "grad_norm": 6.585134654134715, + "learning_rate": 2.0265368076036255e-06, + "loss": 0.7165, + "step": 15885 + }, + { + "epoch": 1.1476872505282931, + "grad_norm": 6.50475460167242, + "learning_rate": 2.026249601914545e-06, + "loss": 0.6586, + "step": 15886 + }, + { + "epoch": 1.1477594957285024, + "grad_norm": 7.07432120473848, + "learning_rate": 2.02596240271074e-06, + "loss": 0.7138, + "step": 15887 + }, + { + "epoch": 1.147831740928712, + "grad_norm": 5.385102115891953, + "learning_rate": 2.0256752099961413e-06, + "loss": 0.5897, + "step": 15888 + }, + { + "epoch": 1.1479039861289215, + "grad_norm": 5.868788336972927, + "learning_rate": 2.02538802377468e-06, + "loss": 0.6109, + "step": 15889 + }, + { + "epoch": 1.147976231329131, + "grad_norm": 6.395742144436797, + "learning_rate": 2.0251008440502883e-06, + "loss": 0.7039, + "step": 15890 + }, + { + "epoch": 1.1480484765293406, + "grad_norm": 6.034840518136805, + "learning_rate": 2.0248136708268985e-06, + "loss": 0.6416, + "step": 15891 + }, + { + "epoch": 1.1481207217295502, + "grad_norm": 6.199097832291903, + "learning_rate": 2.0245265041084395e-06, + "loss": 0.6653, + "step": 15892 + }, + { + "epoch": 1.1481929669297597, + "grad_norm": 6.636959293849322, + "learning_rate": 2.024239343898844e-06, + "loss": 0.6409, + "step": 15893 + }, + { + "epoch": 1.148265212129969, + "grad_norm": 5.940692244243958, + "learning_rate": 2.0239521902020428e-06, + "loss": 0.7099, + "step": 15894 + }, + { + "epoch": 1.1483374573301786, + "grad_norm": 6.956572563047924, + "learning_rate": 2.0236650430219654e-06, + "loss": 0.6882, + "step": 15895 + }, + { + "epoch": 1.148409702530388, + "grad_norm": 6.169749992659656, + "learning_rate": 2.023377902362544e-06, + "loss": 0.635, + "step": 15896 + }, + { + "epoch": 1.1484819477305976, + "grad_norm": 6.189870669195535, + "learning_rate": 2.0230907682277093e-06, + "loss": 0.6434, + "step": 15897 + }, + { + "epoch": 1.1485541929308072, + "grad_norm": 6.534814765248766, + "learning_rate": 2.022803640621392e-06, + "loss": 0.6208, + "step": 15898 + }, + { + "epoch": 1.1486264381310167, + "grad_norm": 6.733149370506839, + "learning_rate": 2.0225165195475217e-06, + "loss": 0.6868, + "step": 15899 + }, + { + "epoch": 1.1486986833312263, + "grad_norm": 8.348667481596193, + "learning_rate": 2.02222940501003e-06, + "loss": 0.7701, + "step": 15900 + }, + { + "epoch": 1.1487709285314356, + "grad_norm": 7.229522197969759, + "learning_rate": 2.021942297012846e-06, + "loss": 0.6145, + "step": 15901 + }, + { + "epoch": 1.1488431737316451, + "grad_norm": 8.44693211950027, + "learning_rate": 2.0216551955599026e-06, + "loss": 0.6709, + "step": 15902 + }, + { + "epoch": 1.1489154189318547, + "grad_norm": 5.692297955304421, + "learning_rate": 2.021368100655126e-06, + "loss": 0.6198, + "step": 15903 + }, + { + "epoch": 1.1489876641320642, + "grad_norm": 6.309329303577863, + "learning_rate": 2.0210810123024494e-06, + "loss": 0.7119, + "step": 15904 + }, + { + "epoch": 1.1490599093322738, + "grad_norm": 7.132446145951776, + "learning_rate": 2.0207939305058028e-06, + "loss": 0.7164, + "step": 15905 + }, + { + "epoch": 1.1491321545324833, + "grad_norm": 7.136209069871483, + "learning_rate": 2.020506855269114e-06, + "loss": 0.728, + "step": 15906 + }, + { + "epoch": 1.1492043997326928, + "grad_norm": 5.61602575945041, + "learning_rate": 2.0202197865963143e-06, + "loss": 0.6576, + "step": 15907 + }, + { + "epoch": 1.1492766449329022, + "grad_norm": 7.313599071885014, + "learning_rate": 2.019932724491333e-06, + "loss": 0.701, + "step": 15908 + }, + { + "epoch": 1.1493488901331117, + "grad_norm": 9.330543191844878, + "learning_rate": 2.0196456689581007e-06, + "loss": 0.7219, + "step": 15909 + }, + { + "epoch": 1.1494211353333212, + "grad_norm": 6.697137750765948, + "learning_rate": 2.0193586200005454e-06, + "loss": 0.6206, + "step": 15910 + }, + { + "epoch": 1.1494933805335308, + "grad_norm": 6.102386770608342, + "learning_rate": 2.0190715776225976e-06, + "loss": 0.6052, + "step": 15911 + }, + { + "epoch": 1.1495656257337403, + "grad_norm": 7.065889878434982, + "learning_rate": 2.018784541828187e-06, + "loss": 0.6803, + "step": 15912 + }, + { + "epoch": 1.1496378709339499, + "grad_norm": 5.354586672605855, + "learning_rate": 2.018497512621242e-06, + "loss": 0.577, + "step": 15913 + }, + { + "epoch": 1.1497101161341594, + "grad_norm": 6.425532503734066, + "learning_rate": 2.0182104900056922e-06, + "loss": 0.6122, + "step": 15914 + }, + { + "epoch": 1.1497823613343687, + "grad_norm": 8.678332424741312, + "learning_rate": 2.017923473985466e-06, + "loss": 0.7322, + "step": 15915 + }, + { + "epoch": 1.1498546065345783, + "grad_norm": 6.959014524211985, + "learning_rate": 2.017636464564495e-06, + "loss": 0.6616, + "step": 15916 + }, + { + "epoch": 1.1499268517347878, + "grad_norm": 8.37246255724449, + "learning_rate": 2.0173494617467043e-06, + "loss": 0.7078, + "step": 15917 + }, + { + "epoch": 1.1499990969349974, + "grad_norm": 7.035359855725971, + "learning_rate": 2.0170624655360256e-06, + "loss": 0.6675, + "step": 15918 + }, + { + "epoch": 1.150071342135207, + "grad_norm": 7.875051104667388, + "learning_rate": 2.0167754759363873e-06, + "loss": 0.6506, + "step": 15919 + }, + { + "epoch": 1.1501435873354164, + "grad_norm": 7.064104016204589, + "learning_rate": 2.0164884929517168e-06, + "loss": 0.7061, + "step": 15920 + }, + { + "epoch": 1.150215832535626, + "grad_norm": 6.096520992073029, + "learning_rate": 2.0162015165859434e-06, + "loss": 0.6371, + "step": 15921 + }, + { + "epoch": 1.1502880777358353, + "grad_norm": 6.949929589662728, + "learning_rate": 2.0159145468429956e-06, + "loss": 0.6881, + "step": 15922 + }, + { + "epoch": 1.1503603229360448, + "grad_norm": 9.231769226389728, + "learning_rate": 2.015627583726802e-06, + "loss": 0.6602, + "step": 15923 + }, + { + "epoch": 1.1504325681362544, + "grad_norm": 6.899866804965842, + "learning_rate": 2.0153406272412906e-06, + "loss": 0.6002, + "step": 15924 + }, + { + "epoch": 1.150504813336464, + "grad_norm": 9.25839429870182, + "learning_rate": 2.0150536773903894e-06, + "loss": 0.6432, + "step": 15925 + }, + { + "epoch": 1.1505770585366735, + "grad_norm": 10.7223719936138, + "learning_rate": 2.0147667341780277e-06, + "loss": 0.6766, + "step": 15926 + }, + { + "epoch": 1.150649303736883, + "grad_norm": 6.962573132225042, + "learning_rate": 2.0144797976081318e-06, + "loss": 0.6941, + "step": 15927 + }, + { + "epoch": 1.1507215489370926, + "grad_norm": 6.171583569563286, + "learning_rate": 2.0141928676846307e-06, + "loss": 0.6456, + "step": 15928 + }, + { + "epoch": 1.150793794137302, + "grad_norm": 6.826457992036647, + "learning_rate": 2.0139059444114516e-06, + "loss": 0.6849, + "step": 15929 + }, + { + "epoch": 1.1508660393375116, + "grad_norm": 7.841152536183895, + "learning_rate": 2.0136190277925244e-06, + "loss": 0.6819, + "step": 15930 + }, + { + "epoch": 1.150938284537721, + "grad_norm": 8.952634574946055, + "learning_rate": 2.013332117831773e-06, + "loss": 0.6871, + "step": 15931 + }, + { + "epoch": 1.1510105297379305, + "grad_norm": 7.898749974500267, + "learning_rate": 2.0130452145331276e-06, + "loss": 0.6821, + "step": 15932 + }, + { + "epoch": 1.15108277493814, + "grad_norm": 5.704848439295477, + "learning_rate": 2.0127583179005154e-06, + "loss": 0.6769, + "step": 15933 + }, + { + "epoch": 1.1511550201383496, + "grad_norm": 7.657428012450992, + "learning_rate": 2.012471427937864e-06, + "loss": 0.6381, + "step": 15934 + }, + { + "epoch": 1.1512272653385591, + "grad_norm": 6.321698728735381, + "learning_rate": 2.0121845446491e-06, + "loss": 0.6347, + "step": 15935 + }, + { + "epoch": 1.1512995105387687, + "grad_norm": 5.94344056744985, + "learning_rate": 2.01189766803815e-06, + "loss": 0.6646, + "step": 15936 + }, + { + "epoch": 1.1513717557389782, + "grad_norm": 6.874547492654318, + "learning_rate": 2.011610798108943e-06, + "loss": 0.6827, + "step": 15937 + }, + { + "epoch": 1.1514440009391875, + "grad_norm": 6.1080420715641806, + "learning_rate": 2.011323934865404e-06, + "loss": 0.5908, + "step": 15938 + }, + { + "epoch": 1.151516246139397, + "grad_norm": 6.411275836775752, + "learning_rate": 2.011037078311461e-06, + "loss": 0.6459, + "step": 15939 + }, + { + "epoch": 1.1515884913396066, + "grad_norm": 7.234320424082072, + "learning_rate": 2.0107502284510414e-06, + "loss": 0.6646, + "step": 15940 + }, + { + "epoch": 1.1516607365398162, + "grad_norm": 8.308916050695181, + "learning_rate": 2.0104633852880714e-06, + "loss": 0.6926, + "step": 15941 + }, + { + "epoch": 1.1517329817400257, + "grad_norm": 7.288421112004177, + "learning_rate": 2.010176548826477e-06, + "loss": 0.6553, + "step": 15942 + }, + { + "epoch": 1.1518052269402352, + "grad_norm": 7.357042594998951, + "learning_rate": 2.009889719070185e-06, + "loss": 0.6193, + "step": 15943 + }, + { + "epoch": 1.1518774721404448, + "grad_norm": 6.5204012629127766, + "learning_rate": 2.0096028960231233e-06, + "loss": 0.6081, + "step": 15944 + }, + { + "epoch": 1.151949717340654, + "grad_norm": 7.642903555715124, + "learning_rate": 2.0093160796892163e-06, + "loss": 0.7451, + "step": 15945 + }, + { + "epoch": 1.1520219625408636, + "grad_norm": 6.430576193519836, + "learning_rate": 2.009029270072391e-06, + "loss": 0.6291, + "step": 15946 + }, + { + "epoch": 1.1520942077410732, + "grad_norm": 8.065056447294518, + "learning_rate": 2.008742467176574e-06, + "loss": 0.6241, + "step": 15947 + }, + { + "epoch": 1.1521664529412827, + "grad_norm": 7.294414881562158, + "learning_rate": 2.0084556710056922e-06, + "loss": 0.6101, + "step": 15948 + }, + { + "epoch": 1.1522386981414923, + "grad_norm": 6.734321470114253, + "learning_rate": 2.0081688815636697e-06, + "loss": 0.622, + "step": 15949 + }, + { + "epoch": 1.1523109433417018, + "grad_norm": 7.133090863013717, + "learning_rate": 2.007882098854433e-06, + "loss": 0.6383, + "step": 15950 + }, + { + "epoch": 1.1523831885419114, + "grad_norm": 6.083844864471886, + "learning_rate": 2.0075953228819093e-06, + "loss": 0.6284, + "step": 15951 + }, + { + "epoch": 1.1524554337421207, + "grad_norm": 8.067291019122035, + "learning_rate": 2.0073085536500227e-06, + "loss": 0.6787, + "step": 15952 + }, + { + "epoch": 1.1525276789423302, + "grad_norm": 8.266943900269029, + "learning_rate": 2.007021791162699e-06, + "loss": 0.6819, + "step": 15953 + }, + { + "epoch": 1.1525999241425398, + "grad_norm": 8.1887043074508, + "learning_rate": 2.0067350354238645e-06, + "loss": 0.6259, + "step": 15954 + }, + { + "epoch": 1.1526721693427493, + "grad_norm": 6.83985434661515, + "learning_rate": 2.006448286437445e-06, + "loss": 0.7, + "step": 15955 + }, + { + "epoch": 1.1527444145429588, + "grad_norm": 5.7261175006158735, + "learning_rate": 2.0061615442073645e-06, + "loss": 0.6097, + "step": 15956 + }, + { + "epoch": 1.1528166597431684, + "grad_norm": 7.19461636069893, + "learning_rate": 2.005874808737549e-06, + "loss": 0.6875, + "step": 15957 + }, + { + "epoch": 1.152888904943378, + "grad_norm": 5.939938335676611, + "learning_rate": 2.0055880800319237e-06, + "loss": 0.7516, + "step": 15958 + }, + { + "epoch": 1.1529611501435872, + "grad_norm": 8.568870235297942, + "learning_rate": 2.005301358094415e-06, + "loss": 0.7234, + "step": 15959 + }, + { + "epoch": 1.1530333953437968, + "grad_norm": 7.45928815416362, + "learning_rate": 2.0050146429289447e-06, + "loss": 0.6454, + "step": 15960 + }, + { + "epoch": 1.1531056405440063, + "grad_norm": 6.236260227081055, + "learning_rate": 2.0047279345394405e-06, + "loss": 0.6325, + "step": 15961 + }, + { + "epoch": 1.1531778857442159, + "grad_norm": 7.368065111268136, + "learning_rate": 2.004441232929827e-06, + "loss": 0.7, + "step": 15962 + }, + { + "epoch": 1.1532501309444254, + "grad_norm": 6.482135284791717, + "learning_rate": 2.0041545381040274e-06, + "loss": 0.6696, + "step": 15963 + }, + { + "epoch": 1.153322376144635, + "grad_norm": 6.788242092978905, + "learning_rate": 2.003867850065967e-06, + "loss": 0.7121, + "step": 15964 + }, + { + "epoch": 1.1533946213448445, + "grad_norm": 5.769215528027869, + "learning_rate": 2.003581168819571e-06, + "loss": 0.6421, + "step": 15965 + }, + { + "epoch": 1.1534668665450538, + "grad_norm": 6.683345181398362, + "learning_rate": 2.003294494368763e-06, + "loss": 0.6928, + "step": 15966 + }, + { + "epoch": 1.1535391117452634, + "grad_norm": 6.265410992779905, + "learning_rate": 2.0030078267174678e-06, + "loss": 0.6023, + "step": 15967 + }, + { + "epoch": 1.153611356945473, + "grad_norm": 6.705948241769322, + "learning_rate": 2.0027211658696097e-06, + "loss": 0.6495, + "step": 15968 + }, + { + "epoch": 1.1536836021456824, + "grad_norm": 7.621247916175191, + "learning_rate": 2.002434511829113e-06, + "loss": 0.6441, + "step": 15969 + }, + { + "epoch": 1.153755847345892, + "grad_norm": 7.871149968781861, + "learning_rate": 2.0021478645999008e-06, + "loss": 0.6412, + "step": 15970 + }, + { + "epoch": 1.1538280925461015, + "grad_norm": 5.8255684898540565, + "learning_rate": 2.001861224185898e-06, + "loss": 0.6277, + "step": 15971 + }, + { + "epoch": 1.153900337746311, + "grad_norm": 6.767291491494908, + "learning_rate": 2.0015745905910282e-06, + "loss": 0.7104, + "step": 15972 + }, + { + "epoch": 1.1539725829465204, + "grad_norm": 6.334245448770788, + "learning_rate": 2.0012879638192167e-06, + "loss": 0.6449, + "step": 15973 + }, + { + "epoch": 1.15404482814673, + "grad_norm": 5.732229886584823, + "learning_rate": 2.0010013438743835e-06, + "loss": 0.6272, + "step": 15974 + }, + { + "epoch": 1.1541170733469395, + "grad_norm": 6.858764899369581, + "learning_rate": 2.0007147307604556e-06, + "loss": 0.6801, + "step": 15975 + }, + { + "epoch": 1.154189318547149, + "grad_norm": 6.012815775678321, + "learning_rate": 2.000428124481356e-06, + "loss": 0.6429, + "step": 15976 + }, + { + "epoch": 1.1542615637473586, + "grad_norm": 5.903598412209835, + "learning_rate": 2.0001415250410064e-06, + "loss": 0.6317, + "step": 15977 + }, + { + "epoch": 1.154333808947568, + "grad_norm": 6.580819273617208, + "learning_rate": 1.9998549324433312e-06, + "loss": 0.6665, + "step": 15978 + }, + { + "epoch": 1.1544060541477776, + "grad_norm": 7.4808514609182195, + "learning_rate": 1.999568346692254e-06, + "loss": 0.6591, + "step": 15979 + }, + { + "epoch": 1.154478299347987, + "grad_norm": 6.576244384438387, + "learning_rate": 1.9992817677916977e-06, + "loss": 0.5402, + "step": 15980 + }, + { + "epoch": 1.1545505445481965, + "grad_norm": 8.090626942563256, + "learning_rate": 1.998995195745585e-06, + "loss": 0.6519, + "step": 15981 + }, + { + "epoch": 1.154622789748406, + "grad_norm": 6.645165899199271, + "learning_rate": 1.998708630557839e-06, + "loss": 0.6353, + "step": 15982 + }, + { + "epoch": 1.1546950349486156, + "grad_norm": 7.953989278001971, + "learning_rate": 1.998422072232383e-06, + "loss": 0.6194, + "step": 15983 + }, + { + "epoch": 1.1547672801488251, + "grad_norm": 6.096448095598307, + "learning_rate": 1.998135520773139e-06, + "loss": 0.6004, + "step": 15984 + }, + { + "epoch": 1.1548395253490347, + "grad_norm": 6.872862604982213, + "learning_rate": 1.99784897618403e-06, + "loss": 0.6557, + "step": 15985 + }, + { + "epoch": 1.1549117705492442, + "grad_norm": 6.240226733661205, + "learning_rate": 1.9975624384689785e-06, + "loss": 0.6709, + "step": 15986 + }, + { + "epoch": 1.1549840157494535, + "grad_norm": 5.9028688471133, + "learning_rate": 1.9972759076319085e-06, + "loss": 0.5708, + "step": 15987 + }, + { + "epoch": 1.155056260949663, + "grad_norm": 8.182507106406124, + "learning_rate": 1.996989383676739e-06, + "loss": 0.7274, + "step": 15988 + }, + { + "epoch": 1.1551285061498726, + "grad_norm": 6.256161513641222, + "learning_rate": 1.996702866607395e-06, + "loss": 0.6035, + "step": 15989 + }, + { + "epoch": 1.1552007513500822, + "grad_norm": 5.8638501790307656, + "learning_rate": 1.996416356427798e-06, + "loss": 0.709, + "step": 15990 + }, + { + "epoch": 1.1552729965502917, + "grad_norm": 5.599652293173511, + "learning_rate": 1.996129853141871e-06, + "loss": 0.6067, + "step": 15991 + }, + { + "epoch": 1.1553452417505012, + "grad_norm": 6.4074268283602756, + "learning_rate": 1.9958433567535342e-06, + "loss": 0.602, + "step": 15992 + }, + { + "epoch": 1.1554174869507108, + "grad_norm": 5.486896163633685, + "learning_rate": 1.9955568672667103e-06, + "loss": 0.6244, + "step": 15993 + }, + { + "epoch": 1.15548973215092, + "grad_norm": 9.403732013474757, + "learning_rate": 1.9952703846853216e-06, + "loss": 0.8005, + "step": 15994 + }, + { + "epoch": 1.1555619773511296, + "grad_norm": 7.035490800103418, + "learning_rate": 1.994983909013289e-06, + "loss": 0.6234, + "step": 15995 + }, + { + "epoch": 1.1556342225513392, + "grad_norm": 6.6841205364798295, + "learning_rate": 1.994697440254535e-06, + "loss": 0.6608, + "step": 15996 + }, + { + "epoch": 1.1557064677515487, + "grad_norm": 6.286194330933443, + "learning_rate": 1.99441097841298e-06, + "loss": 0.6549, + "step": 15997 + }, + { + "epoch": 1.1557787129517583, + "grad_norm": 7.525190899730888, + "learning_rate": 1.994124523492547e-06, + "loss": 0.7648, + "step": 15998 + }, + { + "epoch": 1.1558509581519678, + "grad_norm": 6.5681860132675, + "learning_rate": 1.993838075497156e-06, + "loss": 0.6131, + "step": 15999 + }, + { + "epoch": 1.1559232033521774, + "grad_norm": 7.175505340538034, + "learning_rate": 1.9935516344307285e-06, + "loss": 0.6907, + "step": 16000 + }, + { + "epoch": 1.155995448552387, + "grad_norm": 5.238881462921636, + "learning_rate": 1.993265200297187e-06, + "loss": 0.6594, + "step": 16001 + }, + { + "epoch": 1.1560676937525962, + "grad_norm": 7.88913245862351, + "learning_rate": 1.9929787731004503e-06, + "loss": 0.606, + "step": 16002 + }, + { + "epoch": 1.1561399389528058, + "grad_norm": 7.36577378237249, + "learning_rate": 1.9926923528444404e-06, + "loss": 0.6517, + "step": 16003 + }, + { + "epoch": 1.1562121841530153, + "grad_norm": 6.06572582508938, + "learning_rate": 1.9924059395330787e-06, + "loss": 0.6686, + "step": 16004 + }, + { + "epoch": 1.1562844293532248, + "grad_norm": 6.544302277818988, + "learning_rate": 1.9921195331702866e-06, + "loss": 0.6605, + "step": 16005 + }, + { + "epoch": 1.1563566745534344, + "grad_norm": 6.8557441851719885, + "learning_rate": 1.991833133759983e-06, + "loss": 0.6558, + "step": 16006 + }, + { + "epoch": 1.156428919753644, + "grad_norm": 6.3734667561077725, + "learning_rate": 1.9915467413060884e-06, + "loss": 0.6415, + "step": 16007 + }, + { + "epoch": 1.1565011649538535, + "grad_norm": 5.193589462571845, + "learning_rate": 1.991260355812524e-06, + "loss": 0.6775, + "step": 16008 + }, + { + "epoch": 1.156573410154063, + "grad_norm": 6.479318435052602, + "learning_rate": 1.9909739772832123e-06, + "loss": 0.6627, + "step": 16009 + }, + { + "epoch": 1.1566456553542723, + "grad_norm": 6.065176461491719, + "learning_rate": 1.99068760572207e-06, + "loss": 0.6351, + "step": 16010 + }, + { + "epoch": 1.1567179005544819, + "grad_norm": 6.2545912091716795, + "learning_rate": 1.990401241133019e-06, + "loss": 0.665, + "step": 16011 + }, + { + "epoch": 1.1567901457546914, + "grad_norm": 7.38691912076203, + "learning_rate": 1.99011488351998e-06, + "loss": 0.6573, + "step": 16012 + }, + { + "epoch": 1.156862390954901, + "grad_norm": 7.057631443379299, + "learning_rate": 1.989828532886872e-06, + "loss": 0.6088, + "step": 16013 + }, + { + "epoch": 1.1569346361551105, + "grad_norm": 6.264543321044259, + "learning_rate": 1.989542189237615e-06, + "loss": 0.6635, + "step": 16014 + }, + { + "epoch": 1.15700688135532, + "grad_norm": 7.36108974557651, + "learning_rate": 1.9892558525761295e-06, + "loss": 0.6631, + "step": 16015 + }, + { + "epoch": 1.1570791265555296, + "grad_norm": 6.042895523150383, + "learning_rate": 1.988969522906335e-06, + "loss": 0.6616, + "step": 16016 + }, + { + "epoch": 1.157151371755739, + "grad_norm": 7.120370532049008, + "learning_rate": 1.9886832002321503e-06, + "loss": 0.6181, + "step": 16017 + }, + { + "epoch": 1.1572236169559484, + "grad_norm": 7.811213028762747, + "learning_rate": 1.988396884557496e-06, + "loss": 0.6366, + "step": 16018 + }, + { + "epoch": 1.157295862156158, + "grad_norm": 6.067679797802626, + "learning_rate": 1.9881105758862917e-06, + "loss": 0.6187, + "step": 16019 + }, + { + "epoch": 1.1573681073563675, + "grad_norm": 6.037182040164433, + "learning_rate": 1.987824274222455e-06, + "loss": 0.6472, + "step": 16020 + }, + { + "epoch": 1.157440352556577, + "grad_norm": 6.603874641323858, + "learning_rate": 1.987537979569907e-06, + "loss": 0.6866, + "step": 16021 + }, + { + "epoch": 1.1575125977567866, + "grad_norm": 5.588336733437693, + "learning_rate": 1.987251691932565e-06, + "loss": 0.6953, + "step": 16022 + }, + { + "epoch": 1.1575848429569962, + "grad_norm": 6.087939813223649, + "learning_rate": 1.986965411314351e-06, + "loss": 0.6551, + "step": 16023 + }, + { + "epoch": 1.1576570881572055, + "grad_norm": 6.747238724672104, + "learning_rate": 1.986679137719181e-06, + "loss": 0.7055, + "step": 16024 + }, + { + "epoch": 1.157729333357415, + "grad_norm": 6.9028093562010415, + "learning_rate": 1.9863928711509754e-06, + "loss": 0.6567, + "step": 16025 + }, + { + "epoch": 1.1578015785576246, + "grad_norm": 8.63603013777322, + "learning_rate": 1.986106611613653e-06, + "loss": 0.7276, + "step": 16026 + }, + { + "epoch": 1.157873823757834, + "grad_norm": 6.772799099907296, + "learning_rate": 1.9858203591111315e-06, + "loss": 0.6396, + "step": 16027 + }, + { + "epoch": 1.1579460689580436, + "grad_norm": 5.89540582012261, + "learning_rate": 1.9855341136473295e-06, + "loss": 0.68, + "step": 16028 + }, + { + "epoch": 1.1580183141582532, + "grad_norm": 4.9008039632374585, + "learning_rate": 1.985247875226167e-06, + "loss": 0.6788, + "step": 16029 + }, + { + "epoch": 1.1580905593584627, + "grad_norm": 7.124099406830202, + "learning_rate": 1.984961643851561e-06, + "loss": 0.7136, + "step": 16030 + }, + { + "epoch": 1.158162804558672, + "grad_norm": 7.284208163425537, + "learning_rate": 1.98467541952743e-06, + "loss": 0.5957, + "step": 16031 + }, + { + "epoch": 1.1582350497588816, + "grad_norm": 6.809357364486399, + "learning_rate": 1.984389202257693e-06, + "loss": 0.6842, + "step": 16032 + }, + { + "epoch": 1.1583072949590911, + "grad_norm": 7.685347340961286, + "learning_rate": 1.9841029920462667e-06, + "loss": 0.7301, + "step": 16033 + }, + { + "epoch": 1.1583795401593007, + "grad_norm": 6.9721556411678, + "learning_rate": 1.9838167888970713e-06, + "loss": 0.662, + "step": 16034 + }, + { + "epoch": 1.1584517853595102, + "grad_norm": 6.442654222175335, + "learning_rate": 1.983530592814022e-06, + "loss": 0.6699, + "step": 16035 + }, + { + "epoch": 1.1585240305597198, + "grad_norm": 7.164397153869204, + "learning_rate": 1.9832444038010384e-06, + "loss": 0.6449, + "step": 16036 + }, + { + "epoch": 1.1585962757599293, + "grad_norm": 6.4946992240961094, + "learning_rate": 1.9829582218620376e-06, + "loss": 0.6742, + "step": 16037 + }, + { + "epoch": 1.1586685209601386, + "grad_norm": 5.795242449845411, + "learning_rate": 1.982672047000937e-06, + "loss": 0.6963, + "step": 16038 + }, + { + "epoch": 1.1587407661603482, + "grad_norm": 6.987674488489005, + "learning_rate": 1.9823858792216545e-06, + "loss": 0.6689, + "step": 16039 + }, + { + "epoch": 1.1588130113605577, + "grad_norm": 5.915999827170224, + "learning_rate": 1.982099718528107e-06, + "loss": 0.6803, + "step": 16040 + }, + { + "epoch": 1.1588852565607672, + "grad_norm": 7.0924587334880185, + "learning_rate": 1.981813564924213e-06, + "loss": 0.585, + "step": 16041 + }, + { + "epoch": 1.1589575017609768, + "grad_norm": 5.522514557514889, + "learning_rate": 1.9815274184138884e-06, + "loss": 0.6673, + "step": 16042 + }, + { + "epoch": 1.1590297469611863, + "grad_norm": 6.579063511676554, + "learning_rate": 1.981241279001051e-06, + "loss": 0.6032, + "step": 16043 + }, + { + "epoch": 1.1591019921613959, + "grad_norm": 6.148180324890695, + "learning_rate": 1.980955146689618e-06, + "loss": 0.6603, + "step": 16044 + }, + { + "epoch": 1.1591742373616052, + "grad_norm": 8.703528677380703, + "learning_rate": 1.980669021483506e-06, + "loss": 0.6625, + "step": 16045 + }, + { + "epoch": 1.1592464825618147, + "grad_norm": 5.576861583057011, + "learning_rate": 1.980382903386631e-06, + "loss": 0.6228, + "step": 16046 + }, + { + "epoch": 1.1593187277620243, + "grad_norm": 5.560062002755662, + "learning_rate": 1.9800967924029112e-06, + "loss": 0.6484, + "step": 16047 + }, + { + "epoch": 1.1593909729622338, + "grad_norm": 7.411136608597822, + "learning_rate": 1.979810688536264e-06, + "loss": 0.6665, + "step": 16048 + }, + { + "epoch": 1.1594632181624434, + "grad_norm": 7.879412913261115, + "learning_rate": 1.979524591790603e-06, + "loss": 0.7633, + "step": 16049 + }, + { + "epoch": 1.159535463362653, + "grad_norm": 7.225397921365322, + "learning_rate": 1.9792385021698464e-06, + "loss": 0.7578, + "step": 16050 + }, + { + "epoch": 1.1596077085628624, + "grad_norm": 6.314175525041662, + "learning_rate": 1.978952419677911e-06, + "loss": 0.6389, + "step": 16051 + }, + { + "epoch": 1.1596799537630718, + "grad_norm": 7.040018946015267, + "learning_rate": 1.9786663443187115e-06, + "loss": 0.7292, + "step": 16052 + }, + { + "epoch": 1.1597521989632813, + "grad_norm": 4.947081720945283, + "learning_rate": 1.978380276096165e-06, + "loss": 0.5909, + "step": 16053 + }, + { + "epoch": 1.1598244441634908, + "grad_norm": 5.166984158677139, + "learning_rate": 1.978094215014188e-06, + "loss": 0.6129, + "step": 16054 + }, + { + "epoch": 1.1598966893637004, + "grad_norm": 5.564267681294778, + "learning_rate": 1.9778081610766957e-06, + "loss": 0.6346, + "step": 16055 + }, + { + "epoch": 1.15996893456391, + "grad_norm": 7.648550217823657, + "learning_rate": 1.9775221142876046e-06, + "loss": 0.665, + "step": 16056 + }, + { + "epoch": 1.1600411797641195, + "grad_norm": 6.698750097492244, + "learning_rate": 1.9772360746508293e-06, + "loss": 0.7082, + "step": 16057 + }, + { + "epoch": 1.160113424964329, + "grad_norm": 6.971539542054235, + "learning_rate": 1.9769500421702876e-06, + "loss": 0.6533, + "step": 16058 + }, + { + "epoch": 1.1601856701645383, + "grad_norm": 7.075484231613863, + "learning_rate": 1.976664016849892e-06, + "loss": 0.6673, + "step": 16059 + }, + { + "epoch": 1.1602579153647479, + "grad_norm": 5.810752031567281, + "learning_rate": 1.9763779986935606e-06, + "loss": 0.6389, + "step": 16060 + }, + { + "epoch": 1.1603301605649574, + "grad_norm": 5.807320532848194, + "learning_rate": 1.9760919877052077e-06, + "loss": 0.5918, + "step": 16061 + }, + { + "epoch": 1.160402405765167, + "grad_norm": 5.109742877560501, + "learning_rate": 1.9758059838887493e-06, + "loss": 0.5923, + "step": 16062 + }, + { + "epoch": 1.1604746509653765, + "grad_norm": 5.765590595256429, + "learning_rate": 1.9755199872480995e-06, + "loss": 0.5989, + "step": 16063 + }, + { + "epoch": 1.160546896165586, + "grad_norm": 6.71549106782091, + "learning_rate": 1.9752339977871733e-06, + "loss": 0.6791, + "step": 16064 + }, + { + "epoch": 1.1606191413657956, + "grad_norm": 6.665333455609171, + "learning_rate": 1.974948015509886e-06, + "loss": 0.6344, + "step": 16065 + }, + { + "epoch": 1.160691386566005, + "grad_norm": 7.096753261904458, + "learning_rate": 1.9746620404201545e-06, + "loss": 0.6473, + "step": 16066 + }, + { + "epoch": 1.1607636317662144, + "grad_norm": 7.25696045439281, + "learning_rate": 1.97437607252189e-06, + "loss": 0.65, + "step": 16067 + }, + { + "epoch": 1.160835876966424, + "grad_norm": 6.487067635595382, + "learning_rate": 1.97409011181901e-06, + "loss": 0.7572, + "step": 16068 + }, + { + "epoch": 1.1609081221666335, + "grad_norm": 7.394675953170131, + "learning_rate": 1.9738041583154276e-06, + "loss": 0.6628, + "step": 16069 + }, + { + "epoch": 1.160980367366843, + "grad_norm": 6.166477441032418, + "learning_rate": 1.9735182120150575e-06, + "loss": 0.6794, + "step": 16070 + }, + { + "epoch": 1.1610526125670526, + "grad_norm": 8.087546073393202, + "learning_rate": 1.9732322729218143e-06, + "loss": 0.6975, + "step": 16071 + }, + { + "epoch": 1.1611248577672622, + "grad_norm": 6.129581528485894, + "learning_rate": 1.972946341039612e-06, + "loss": 0.6495, + "step": 16072 + }, + { + "epoch": 1.1611971029674717, + "grad_norm": 7.518455686064812, + "learning_rate": 1.972660416372366e-06, + "loss": 0.637, + "step": 16073 + }, + { + "epoch": 1.161269348167681, + "grad_norm": 6.3997799835533895, + "learning_rate": 1.972374498923989e-06, + "loss": 0.5813, + "step": 16074 + }, + { + "epoch": 1.1613415933678906, + "grad_norm": 5.6424070803082005, + "learning_rate": 1.9720885886983954e-06, + "loss": 0.5887, + "step": 16075 + }, + { + "epoch": 1.1614138385681, + "grad_norm": 7.043017496711504, + "learning_rate": 1.9718026856995e-06, + "loss": 0.6825, + "step": 16076 + }, + { + "epoch": 1.1614860837683096, + "grad_norm": 6.73715258889863, + "learning_rate": 1.971516789931215e-06, + "loss": 0.6982, + "step": 16077 + }, + { + "epoch": 1.1615583289685192, + "grad_norm": 6.942303893053506, + "learning_rate": 1.971230901397454e-06, + "loss": 0.6852, + "step": 16078 + }, + { + "epoch": 1.1616305741687287, + "grad_norm": 5.928206066488246, + "learning_rate": 1.9709450201021313e-06, + "loss": 0.6326, + "step": 16079 + }, + { + "epoch": 1.1617028193689383, + "grad_norm": 8.873021724664444, + "learning_rate": 1.970659146049162e-06, + "loss": 0.706, + "step": 16080 + }, + { + "epoch": 1.1617750645691478, + "grad_norm": 7.511510535660837, + "learning_rate": 1.9703732792424572e-06, + "loss": 0.6634, + "step": 16081 + }, + { + "epoch": 1.1618473097693571, + "grad_norm": 7.367905129748657, + "learning_rate": 1.970087419685931e-06, + "loss": 0.6871, + "step": 16082 + }, + { + "epoch": 1.1619195549695667, + "grad_norm": 7.855485503540687, + "learning_rate": 1.9698015673834968e-06, + "loss": 0.6605, + "step": 16083 + }, + { + "epoch": 1.1619918001697762, + "grad_norm": 5.435071205437764, + "learning_rate": 1.969515722339067e-06, + "loss": 0.5958, + "step": 16084 + }, + { + "epoch": 1.1620640453699858, + "grad_norm": 6.509376804976046, + "learning_rate": 1.9692298845565554e-06, + "loss": 0.6667, + "step": 16085 + }, + { + "epoch": 1.1621362905701953, + "grad_norm": 6.621101239855646, + "learning_rate": 1.968944054039874e-06, + "loss": 0.6405, + "step": 16086 + }, + { + "epoch": 1.1622085357704048, + "grad_norm": 6.223375296850514, + "learning_rate": 1.968658230792937e-06, + "loss": 0.676, + "step": 16087 + }, + { + "epoch": 1.1622807809706144, + "grad_norm": 6.162402182029032, + "learning_rate": 1.9683724148196557e-06, + "loss": 0.7026, + "step": 16088 + }, + { + "epoch": 1.1623530261708237, + "grad_norm": 6.262863654201889, + "learning_rate": 1.9680866061239435e-06, + "loss": 0.629, + "step": 16089 + }, + { + "epoch": 1.1624252713710332, + "grad_norm": 5.3230122397764195, + "learning_rate": 1.9678008047097124e-06, + "loss": 0.6048, + "step": 16090 + }, + { + "epoch": 1.1624975165712428, + "grad_norm": 7.515037466071282, + "learning_rate": 1.967515010580876e-06, + "loss": 0.6302, + "step": 16091 + }, + { + "epoch": 1.1625697617714523, + "grad_norm": 7.0332074640711415, + "learning_rate": 1.9672292237413446e-06, + "loss": 0.5999, + "step": 16092 + }, + { + "epoch": 1.1626420069716619, + "grad_norm": 6.173002888436432, + "learning_rate": 1.9669434441950313e-06, + "loss": 0.6912, + "step": 16093 + }, + { + "epoch": 1.1627142521718714, + "grad_norm": 6.589750550430155, + "learning_rate": 1.9666576719458495e-06, + "loss": 0.735, + "step": 16094 + }, + { + "epoch": 1.162786497372081, + "grad_norm": 7.091073089872785, + "learning_rate": 1.9663719069977094e-06, + "loss": 0.6406, + "step": 16095 + }, + { + "epoch": 1.1628587425722903, + "grad_norm": 6.96202385471096, + "learning_rate": 1.9660861493545237e-06, + "loss": 0.6547, + "step": 16096 + }, + { + "epoch": 1.1629309877724998, + "grad_norm": 6.04770893191755, + "learning_rate": 1.9658003990202036e-06, + "loss": 0.6843, + "step": 16097 + }, + { + "epoch": 1.1630032329727094, + "grad_norm": 5.8787628460499795, + "learning_rate": 1.965514655998662e-06, + "loss": 0.6703, + "step": 16098 + }, + { + "epoch": 1.163075478172919, + "grad_norm": 5.900257764455017, + "learning_rate": 1.965228920293809e-06, + "loss": 0.6623, + "step": 16099 + }, + { + "epoch": 1.1631477233731284, + "grad_norm": 6.820747632525452, + "learning_rate": 1.9649431919095572e-06, + "loss": 0.7075, + "step": 16100 + }, + { + "epoch": 1.163219968573338, + "grad_norm": 6.073817749524871, + "learning_rate": 1.9646574708498177e-06, + "loss": 0.6925, + "step": 16101 + }, + { + "epoch": 1.1632922137735475, + "grad_norm": 6.989856182141481, + "learning_rate": 1.9643717571185017e-06, + "loss": 0.6775, + "step": 16102 + }, + { + "epoch": 1.1633644589737568, + "grad_norm": 6.870707055900291, + "learning_rate": 1.96408605071952e-06, + "loss": 0.6447, + "step": 16103 + }, + { + "epoch": 1.1634367041739664, + "grad_norm": 7.028749284033593, + "learning_rate": 1.9638003516567845e-06, + "loss": 0.6918, + "step": 16104 + }, + { + "epoch": 1.163508949374176, + "grad_norm": 6.117935777988663, + "learning_rate": 1.963514659934207e-06, + "loss": 0.6051, + "step": 16105 + }, + { + "epoch": 1.1635811945743855, + "grad_norm": 6.530673791026078, + "learning_rate": 1.963228975555696e-06, + "loss": 0.7126, + "step": 16106 + }, + { + "epoch": 1.163653439774595, + "grad_norm": 7.120585094371223, + "learning_rate": 1.962943298525163e-06, + "loss": 0.6504, + "step": 16107 + }, + { + "epoch": 1.1637256849748046, + "grad_norm": 5.9017995385279525, + "learning_rate": 1.962657628846521e-06, + "loss": 0.6143, + "step": 16108 + }, + { + "epoch": 1.163797930175014, + "grad_norm": 6.220870533461679, + "learning_rate": 1.962371966523678e-06, + "loss": 0.6577, + "step": 16109 + }, + { + "epoch": 1.1638701753752234, + "grad_norm": 7.01440826943808, + "learning_rate": 1.962086311560545e-06, + "loss": 0.6908, + "step": 16110 + }, + { + "epoch": 1.163942420575433, + "grad_norm": 8.309512870824054, + "learning_rate": 1.9618006639610325e-06, + "loss": 0.7209, + "step": 16111 + }, + { + "epoch": 1.1640146657756425, + "grad_norm": 6.793747316024797, + "learning_rate": 1.961515023729052e-06, + "loss": 0.634, + "step": 16112 + }, + { + "epoch": 1.164086910975852, + "grad_norm": 6.799638839555279, + "learning_rate": 1.961229390868512e-06, + "loss": 0.5973, + "step": 16113 + }, + { + "epoch": 1.1641591561760616, + "grad_norm": 7.061466107605926, + "learning_rate": 1.9609437653833235e-06, + "loss": 0.6439, + "step": 16114 + }, + { + "epoch": 1.1642314013762711, + "grad_norm": 6.930168883452541, + "learning_rate": 1.9606581472773957e-06, + "loss": 0.7037, + "step": 16115 + }, + { + "epoch": 1.1643036465764807, + "grad_norm": 6.670414284532478, + "learning_rate": 1.9603725365546404e-06, + "loss": 0.6325, + "step": 16116 + }, + { + "epoch": 1.16437589177669, + "grad_norm": 6.064990289213327, + "learning_rate": 1.960086933218965e-06, + "loss": 0.6304, + "step": 16117 + }, + { + "epoch": 1.1644481369768995, + "grad_norm": 6.234022546428188, + "learning_rate": 1.9598013372742806e-06, + "loss": 0.5925, + "step": 16118 + }, + { + "epoch": 1.164520382177109, + "grad_norm": 6.8065229320886615, + "learning_rate": 1.9595157487244973e-06, + "loss": 0.6331, + "step": 16119 + }, + { + "epoch": 1.1645926273773186, + "grad_norm": 7.271814085242556, + "learning_rate": 1.9592301675735227e-06, + "loss": 0.7466, + "step": 16120 + }, + { + "epoch": 1.1646648725775282, + "grad_norm": 8.185314556548724, + "learning_rate": 1.9589445938252666e-06, + "loss": 0.6523, + "step": 16121 + }, + { + "epoch": 1.1647371177777377, + "grad_norm": 6.91799632087956, + "learning_rate": 1.95865902748364e-06, + "loss": 0.6746, + "step": 16122 + }, + { + "epoch": 1.1648093629779472, + "grad_norm": 7.214419168103935, + "learning_rate": 1.9583734685525514e-06, + "loss": 0.712, + "step": 16123 + }, + { + "epoch": 1.1648816081781566, + "grad_norm": 5.944714790606055, + "learning_rate": 1.9580879170359083e-06, + "loss": 0.6919, + "step": 16124 + }, + { + "epoch": 1.164953853378366, + "grad_norm": 6.18559436159543, + "learning_rate": 1.9578023729376213e-06, + "loss": 0.6207, + "step": 16125 + }, + { + "epoch": 1.1650260985785756, + "grad_norm": 7.870233697971958, + "learning_rate": 1.9575168362615993e-06, + "loss": 0.6151, + "step": 16126 + }, + { + "epoch": 1.1650983437787852, + "grad_norm": 7.488523476373775, + "learning_rate": 1.9572313070117503e-06, + "loss": 0.6822, + "step": 16127 + }, + { + "epoch": 1.1651705889789947, + "grad_norm": 6.2794451041991275, + "learning_rate": 1.956945785191983e-06, + "loss": 0.6417, + "step": 16128 + }, + { + "epoch": 1.1652428341792043, + "grad_norm": 6.659704896729542, + "learning_rate": 1.956660270806206e-06, + "loss": 0.5882, + "step": 16129 + }, + { + "epoch": 1.1653150793794138, + "grad_norm": 6.950146943878519, + "learning_rate": 1.956374763858329e-06, + "loss": 0.615, + "step": 16130 + }, + { + "epoch": 1.1653873245796231, + "grad_norm": 6.408647488688806, + "learning_rate": 1.9560892643522584e-06, + "loss": 0.665, + "step": 16131 + }, + { + "epoch": 1.1654595697798327, + "grad_norm": 5.820421624920802, + "learning_rate": 1.9558037722919038e-06, + "loss": 0.5865, + "step": 16132 + }, + { + "epoch": 1.1655318149800422, + "grad_norm": 6.42630275476866, + "learning_rate": 1.955518287681174e-06, + "loss": 0.6865, + "step": 16133 + }, + { + "epoch": 1.1656040601802518, + "grad_norm": 7.420849795557947, + "learning_rate": 1.9552328105239747e-06, + "loss": 0.6421, + "step": 16134 + }, + { + "epoch": 1.1656763053804613, + "grad_norm": 6.983752877948496, + "learning_rate": 1.954947340824216e-06, + "loss": 0.6651, + "step": 16135 + }, + { + "epoch": 1.1657485505806708, + "grad_norm": 6.016329478839172, + "learning_rate": 1.954661878585804e-06, + "loss": 0.6335, + "step": 16136 + }, + { + "epoch": 1.1658207957808804, + "grad_norm": 6.325085774318814, + "learning_rate": 1.954376423812649e-06, + "loss": 0.6335, + "step": 16137 + }, + { + "epoch": 1.1658930409810897, + "grad_norm": 6.890009361670436, + "learning_rate": 1.954090976508656e-06, + "loss": 0.628, + "step": 16138 + }, + { + "epoch": 1.1659652861812992, + "grad_norm": 8.051305762116954, + "learning_rate": 1.9538055366777335e-06, + "loss": 0.6403, + "step": 16139 + }, + { + "epoch": 1.1660375313815088, + "grad_norm": 7.148961515971109, + "learning_rate": 1.9535201043237895e-06, + "loss": 0.7226, + "step": 16140 + }, + { + "epoch": 1.1661097765817183, + "grad_norm": 6.43599438884104, + "learning_rate": 1.953234679450731e-06, + "loss": 0.6787, + "step": 16141 + }, + { + "epoch": 1.1661820217819279, + "grad_norm": 6.631737503986217, + "learning_rate": 1.9529492620624654e-06, + "loss": 0.6574, + "step": 16142 + }, + { + "epoch": 1.1662542669821374, + "grad_norm": 7.59640921039972, + "learning_rate": 1.9526638521628994e-06, + "loss": 0.6996, + "step": 16143 + }, + { + "epoch": 1.166326512182347, + "grad_norm": 5.997731415711257, + "learning_rate": 1.9523784497559407e-06, + "loss": 0.6629, + "step": 16144 + }, + { + "epoch": 1.1663987573825563, + "grad_norm": 6.28170729153962, + "learning_rate": 1.952093054845495e-06, + "loss": 0.6479, + "step": 16145 + }, + { + "epoch": 1.1664710025827658, + "grad_norm": 7.217095255587434, + "learning_rate": 1.9518076674354703e-06, + "loss": 0.6291, + "step": 16146 + }, + { + "epoch": 1.1665432477829754, + "grad_norm": 6.990282943740183, + "learning_rate": 1.951522287529773e-06, + "loss": 0.6151, + "step": 16147 + }, + { + "epoch": 1.166615492983185, + "grad_norm": 7.171338296060434, + "learning_rate": 1.9512369151323106e-06, + "loss": 0.6451, + "step": 16148 + }, + { + "epoch": 1.1666877381833944, + "grad_norm": 6.849658662408216, + "learning_rate": 1.950951550246988e-06, + "loss": 0.7172, + "step": 16149 + }, + { + "epoch": 1.166759983383604, + "grad_norm": 7.238032757219331, + "learning_rate": 1.9506661928777116e-06, + "loss": 0.6636, + "step": 16150 + }, + { + "epoch": 1.1668322285838135, + "grad_norm": 7.704496868918426, + "learning_rate": 1.95038084302839e-06, + "loss": 0.5911, + "step": 16151 + }, + { + "epoch": 1.166904473784023, + "grad_norm": 5.862070994347769, + "learning_rate": 1.9500955007029268e-06, + "loss": 0.5449, + "step": 16152 + }, + { + "epoch": 1.1669767189842326, + "grad_norm": 5.888406269922126, + "learning_rate": 1.9498101659052295e-06, + "loss": 0.5775, + "step": 16153 + }, + { + "epoch": 1.167048964184442, + "grad_norm": 7.9949680238505385, + "learning_rate": 1.9495248386392035e-06, + "loss": 0.7077, + "step": 16154 + }, + { + "epoch": 1.1671212093846515, + "grad_norm": 6.553268770124291, + "learning_rate": 1.949239518908756e-06, + "loss": 0.6674, + "step": 16155 + }, + { + "epoch": 1.167193454584861, + "grad_norm": 5.620007821617249, + "learning_rate": 1.948954206717791e-06, + "loss": 0.6424, + "step": 16156 + }, + { + "epoch": 1.1672656997850706, + "grad_norm": 6.295597221277628, + "learning_rate": 1.948668902070215e-06, + "loss": 0.6294, + "step": 16157 + }, + { + "epoch": 1.16733794498528, + "grad_norm": 5.949922115553475, + "learning_rate": 1.9483836049699345e-06, + "loss": 0.5591, + "step": 16158 + }, + { + "epoch": 1.1674101901854896, + "grad_norm": 6.437292891939229, + "learning_rate": 1.9480983154208534e-06, + "loss": 0.6256, + "step": 16159 + }, + { + "epoch": 1.1674824353856992, + "grad_norm": 6.031942219121007, + "learning_rate": 1.947813033426878e-06, + "loss": 0.5829, + "step": 16160 + }, + { + "epoch": 1.1675546805859085, + "grad_norm": 7.4558898787654675, + "learning_rate": 1.947527758991913e-06, + "loss": 0.6433, + "step": 16161 + }, + { + "epoch": 1.167626925786118, + "grad_norm": 5.993217767627427, + "learning_rate": 1.947242492119866e-06, + "loss": 0.6719, + "step": 16162 + }, + { + "epoch": 1.1676991709863276, + "grad_norm": 6.443166368547626, + "learning_rate": 1.946957232814638e-06, + "loss": 0.6588, + "step": 16163 + }, + { + "epoch": 1.1677714161865371, + "grad_norm": 7.222506251822645, + "learning_rate": 1.946671981080136e-06, + "loss": 0.6869, + "step": 16164 + }, + { + "epoch": 1.1678436613867467, + "grad_norm": 6.789474351899244, + "learning_rate": 1.9463867369202656e-06, + "loss": 0.7121, + "step": 16165 + }, + { + "epoch": 1.1679159065869562, + "grad_norm": 5.979217458192461, + "learning_rate": 1.9461015003389316e-06, + "loss": 0.6718, + "step": 16166 + }, + { + "epoch": 1.1679881517871658, + "grad_norm": 6.0710335675139255, + "learning_rate": 1.945816271340037e-06, + "loss": 0.6172, + "step": 16167 + }, + { + "epoch": 1.168060396987375, + "grad_norm": 7.896526886634268, + "learning_rate": 1.9455310499274877e-06, + "loss": 0.672, + "step": 16168 + }, + { + "epoch": 1.1681326421875846, + "grad_norm": 7.920010740484797, + "learning_rate": 1.945245836105188e-06, + "loss": 0.6375, + "step": 16169 + }, + { + "epoch": 1.1682048873877942, + "grad_norm": 6.4565922841186, + "learning_rate": 1.944960629877042e-06, + "loss": 0.7154, + "step": 16170 + }, + { + "epoch": 1.1682771325880037, + "grad_norm": 5.9345998005325065, + "learning_rate": 1.9446754312469534e-06, + "loss": 0.5935, + "step": 16171 + }, + { + "epoch": 1.1683493777882132, + "grad_norm": 7.055123857455868, + "learning_rate": 1.9443902402188273e-06, + "loss": 0.6795, + "step": 16172 + }, + { + "epoch": 1.1684216229884228, + "grad_norm": 7.247482915189041, + "learning_rate": 1.944105056796568e-06, + "loss": 0.7044, + "step": 16173 + }, + { + "epoch": 1.1684938681886323, + "grad_norm": 6.72298904472978, + "learning_rate": 1.943819880984078e-06, + "loss": 0.7123, + "step": 16174 + }, + { + "epoch": 1.1685661133888416, + "grad_norm": 6.155194651967823, + "learning_rate": 1.943534712785262e-06, + "loss": 0.5876, + "step": 16175 + }, + { + "epoch": 1.1686383585890512, + "grad_norm": 6.219570393337915, + "learning_rate": 1.9432495522040253e-06, + "loss": 0.6511, + "step": 16176 + }, + { + "epoch": 1.1687106037892607, + "grad_norm": 7.449415391347939, + "learning_rate": 1.9429643992442686e-06, + "loss": 0.6958, + "step": 16177 + }, + { + "epoch": 1.1687828489894703, + "grad_norm": 6.895421441291848, + "learning_rate": 1.942679253909896e-06, + "loss": 0.6906, + "step": 16178 + }, + { + "epoch": 1.1688550941896798, + "grad_norm": 7.986641937158135, + "learning_rate": 1.9423941162048122e-06, + "loss": 0.647, + "step": 16179 + }, + { + "epoch": 1.1689273393898894, + "grad_norm": 6.361634249139463, + "learning_rate": 1.9421089861329213e-06, + "loss": 0.626, + "step": 16180 + }, + { + "epoch": 1.168999584590099, + "grad_norm": 6.741117779329695, + "learning_rate": 1.941823863698124e-06, + "loss": 0.6916, + "step": 16181 + }, + { + "epoch": 1.1690718297903082, + "grad_norm": 6.820372066696495, + "learning_rate": 1.941538748904325e-06, + "loss": 0.6495, + "step": 16182 + }, + { + "epoch": 1.1691440749905178, + "grad_norm": 6.139330545517088, + "learning_rate": 1.941253641755427e-06, + "loss": 0.6406, + "step": 16183 + }, + { + "epoch": 1.1692163201907273, + "grad_norm": 7.094299832852302, + "learning_rate": 1.9409685422553323e-06, + "loss": 0.6889, + "step": 16184 + }, + { + "epoch": 1.1692885653909368, + "grad_norm": 6.534549153660225, + "learning_rate": 1.9406834504079444e-06, + "loss": 0.6706, + "step": 16185 + }, + { + "epoch": 1.1693608105911464, + "grad_norm": 6.578381313667354, + "learning_rate": 1.9403983662171656e-06, + "loss": 0.6205, + "step": 16186 + }, + { + "epoch": 1.169433055791356, + "grad_norm": 6.118071393576925, + "learning_rate": 1.9401132896868993e-06, + "loss": 0.6386, + "step": 16187 + }, + { + "epoch": 1.1695053009915655, + "grad_norm": 7.264038962894084, + "learning_rate": 1.9398282208210467e-06, + "loss": 0.6854, + "step": 16188 + }, + { + "epoch": 1.1695775461917748, + "grad_norm": 6.6631064763396255, + "learning_rate": 1.939543159623511e-06, + "loss": 0.6415, + "step": 16189 + }, + { + "epoch": 1.1696497913919843, + "grad_norm": 6.703734470216431, + "learning_rate": 1.9392581060981953e-06, + "loss": 0.6839, + "step": 16190 + }, + { + "epoch": 1.1697220365921939, + "grad_norm": 5.753076476680386, + "learning_rate": 1.9389730602489994e-06, + "loss": 0.6619, + "step": 16191 + }, + { + "epoch": 1.1697942817924034, + "grad_norm": 8.154414061657945, + "learning_rate": 1.9386880220798266e-06, + "loss": 0.7792, + "step": 16192 + }, + { + "epoch": 1.169866526992613, + "grad_norm": 6.086079156326371, + "learning_rate": 1.9384029915945793e-06, + "loss": 0.6268, + "step": 16193 + }, + { + "epoch": 1.1699387721928225, + "grad_norm": 7.376675948757578, + "learning_rate": 1.9381179687971597e-06, + "loss": 0.6338, + "step": 16194 + }, + { + "epoch": 1.170011017393032, + "grad_norm": 6.89568504642733, + "learning_rate": 1.9378329536914685e-06, + "loss": 0.6413, + "step": 16195 + }, + { + "epoch": 1.1700832625932414, + "grad_norm": 7.362757980412299, + "learning_rate": 1.937547946281407e-06, + "loss": 0.6408, + "step": 16196 + }, + { + "epoch": 1.170155507793451, + "grad_norm": 6.584841527284702, + "learning_rate": 1.937262946570878e-06, + "loss": 0.6192, + "step": 16197 + }, + { + "epoch": 1.1702277529936604, + "grad_norm": 7.136088526668459, + "learning_rate": 1.9369779545637823e-06, + "loss": 0.6718, + "step": 16198 + }, + { + "epoch": 1.17029999819387, + "grad_norm": 7.667749356101064, + "learning_rate": 1.9366929702640207e-06, + "loss": 0.6797, + "step": 16199 + }, + { + "epoch": 1.1703722433940795, + "grad_norm": 7.161302939838712, + "learning_rate": 1.9364079936754955e-06, + "loss": 0.6047, + "step": 16200 + }, + { + "epoch": 1.170444488594289, + "grad_norm": 7.965133504568864, + "learning_rate": 1.9361230248021072e-06, + "loss": 0.6345, + "step": 16201 + }, + { + "epoch": 1.1705167337944986, + "grad_norm": 6.756428800717148, + "learning_rate": 1.935838063647757e-06, + "loss": 0.6265, + "step": 16202 + }, + { + "epoch": 1.170588978994708, + "grad_norm": 8.636382622218823, + "learning_rate": 1.935553110216345e-06, + "loss": 0.725, + "step": 16203 + }, + { + "epoch": 1.1706612241949175, + "grad_norm": 7.063670643698394, + "learning_rate": 1.935268164511773e-06, + "loss": 0.6789, + "step": 16204 + }, + { + "epoch": 1.170733469395127, + "grad_norm": 9.328032577798007, + "learning_rate": 1.9349832265379426e-06, + "loss": 0.6718, + "step": 16205 + }, + { + "epoch": 1.1708057145953366, + "grad_norm": 6.646055052435976, + "learning_rate": 1.934698296298751e-06, + "loss": 0.61, + "step": 16206 + }, + { + "epoch": 1.170877959795546, + "grad_norm": 7.5107492345931375, + "learning_rate": 1.9344133737981017e-06, + "loss": 0.7028, + "step": 16207 + }, + { + "epoch": 1.1709502049957556, + "grad_norm": 6.205924298517166, + "learning_rate": 1.934128459039895e-06, + "loss": 0.6977, + "step": 16208 + }, + { + "epoch": 1.1710224501959652, + "grad_norm": 5.549542712368853, + "learning_rate": 1.9338435520280296e-06, + "loss": 0.6089, + "step": 16209 + }, + { + "epoch": 1.1710946953961745, + "grad_norm": 5.797997543350231, + "learning_rate": 1.933558652766406e-06, + "loss": 0.6209, + "step": 16210 + }, + { + "epoch": 1.171166940596384, + "grad_norm": 6.940734659056048, + "learning_rate": 1.9332737612589246e-06, + "loss": 0.6334, + "step": 16211 + }, + { + "epoch": 1.1712391857965936, + "grad_norm": 8.75231336257891, + "learning_rate": 1.9329888775094862e-06, + "loss": 0.6037, + "step": 16212 + }, + { + "epoch": 1.1713114309968031, + "grad_norm": 7.5446506168070835, + "learning_rate": 1.9327040015219893e-06, + "loss": 0.6039, + "step": 16213 + }, + { + "epoch": 1.1713836761970127, + "grad_norm": 7.461776478515826, + "learning_rate": 1.932419133300334e-06, + "loss": 0.706, + "step": 16214 + }, + { + "epoch": 1.1714559213972222, + "grad_norm": 6.5006584420860065, + "learning_rate": 1.9321342728484207e-06, + "loss": 0.6343, + "step": 16215 + }, + { + "epoch": 1.1715281665974318, + "grad_norm": 6.505464164482142, + "learning_rate": 1.9318494201701477e-06, + "loss": 0.6751, + "step": 16216 + }, + { + "epoch": 1.171600411797641, + "grad_norm": 6.82402114276361, + "learning_rate": 1.931564575269415e-06, + "loss": 0.6554, + "step": 16217 + }, + { + "epoch": 1.1716726569978506, + "grad_norm": 6.875990640186732, + "learning_rate": 1.931279738150122e-06, + "loss": 0.6423, + "step": 16218 + }, + { + "epoch": 1.1717449021980602, + "grad_norm": 7.34241541646543, + "learning_rate": 1.9309949088161687e-06, + "loss": 0.6761, + "step": 16219 + }, + { + "epoch": 1.1718171473982697, + "grad_norm": 7.388861350993737, + "learning_rate": 1.9307100872714515e-06, + "loss": 0.6689, + "step": 16220 + }, + { + "epoch": 1.1718893925984792, + "grad_norm": 6.858585251375872, + "learning_rate": 1.930425273519872e-06, + "loss": 0.6567, + "step": 16221 + }, + { + "epoch": 1.1719616377986888, + "grad_norm": 7.228483169594121, + "learning_rate": 1.9301404675653283e-06, + "loss": 0.5839, + "step": 16222 + }, + { + "epoch": 1.1720338829988983, + "grad_norm": 7.822784490438036, + "learning_rate": 1.9298556694117202e-06, + "loss": 0.6845, + "step": 16223 + }, + { + "epoch": 1.1721061281991079, + "grad_norm": 7.038790444590005, + "learning_rate": 1.9295708790629443e-06, + "loss": 0.6454, + "step": 16224 + }, + { + "epoch": 1.1721783733993172, + "grad_norm": 7.807619569359392, + "learning_rate": 1.9292860965229e-06, + "loss": 0.6499, + "step": 16225 + }, + { + "epoch": 1.1722506185995267, + "grad_norm": 6.9828215029870355, + "learning_rate": 1.929001321795486e-06, + "loss": 0.6906, + "step": 16226 + }, + { + "epoch": 1.1723228637997363, + "grad_norm": 6.103680310278228, + "learning_rate": 1.9287165548846005e-06, + "loss": 0.6441, + "step": 16227 + }, + { + "epoch": 1.1723951089999458, + "grad_norm": 7.278445772118249, + "learning_rate": 1.928431795794142e-06, + "loss": 0.6593, + "step": 16228 + }, + { + "epoch": 1.1724673542001554, + "grad_norm": 6.964309988573409, + "learning_rate": 1.9281470445280084e-06, + "loss": 0.6501, + "step": 16229 + }, + { + "epoch": 1.172539599400365, + "grad_norm": 9.69333674658602, + "learning_rate": 1.9278623010900978e-06, + "loss": 0.761, + "step": 16230 + }, + { + "epoch": 1.1726118446005744, + "grad_norm": 7.730230403886289, + "learning_rate": 1.927577565484308e-06, + "loss": 0.6973, + "step": 16231 + }, + { + "epoch": 1.172684089800784, + "grad_norm": 5.8135089613839765, + "learning_rate": 1.9272928377145366e-06, + "loss": 0.6358, + "step": 16232 + }, + { + "epoch": 1.1727563350009933, + "grad_norm": 7.035588125711894, + "learning_rate": 1.9270081177846825e-06, + "loss": 0.6367, + "step": 16233 + }, + { + "epoch": 1.1728285802012028, + "grad_norm": 6.046096791110005, + "learning_rate": 1.9267234056986415e-06, + "loss": 0.6355, + "step": 16234 + }, + { + "epoch": 1.1729008254014124, + "grad_norm": 5.53113085947121, + "learning_rate": 1.9264387014603116e-06, + "loss": 0.6083, + "step": 16235 + }, + { + "epoch": 1.172973070601622, + "grad_norm": 8.143931239139487, + "learning_rate": 1.9261540050735904e-06, + "loss": 0.6953, + "step": 16236 + }, + { + "epoch": 1.1730453158018315, + "grad_norm": 6.937770803853832, + "learning_rate": 1.925869316542377e-06, + "loss": 0.5998, + "step": 16237 + }, + { + "epoch": 1.173117561002041, + "grad_norm": 6.403821531421595, + "learning_rate": 1.9255846358705657e-06, + "loss": 0.6569, + "step": 16238 + }, + { + "epoch": 1.1731898062022506, + "grad_norm": 7.216630368512193, + "learning_rate": 1.925299963062055e-06, + "loss": 0.6694, + "step": 16239 + }, + { + "epoch": 1.1732620514024599, + "grad_norm": 6.750195818285946, + "learning_rate": 1.9250152981207416e-06, + "loss": 0.6702, + "step": 16240 + }, + { + "epoch": 1.1733342966026694, + "grad_norm": 7.461520346998451, + "learning_rate": 1.924730641050522e-06, + "loss": 0.6761, + "step": 16241 + }, + { + "epoch": 1.173406541802879, + "grad_norm": 6.583035844608206, + "learning_rate": 1.9244459918552934e-06, + "loss": 0.6002, + "step": 16242 + }, + { + "epoch": 1.1734787870030885, + "grad_norm": 5.892595280365316, + "learning_rate": 1.924161350538952e-06, + "loss": 0.6819, + "step": 16243 + }, + { + "epoch": 1.173551032203298, + "grad_norm": 6.2099783678645535, + "learning_rate": 1.923876717105395e-06, + "loss": 0.6585, + "step": 16244 + }, + { + "epoch": 1.1736232774035076, + "grad_norm": 7.3771759961038095, + "learning_rate": 1.923592091558518e-06, + "loss": 0.7318, + "step": 16245 + }, + { + "epoch": 1.1736955226037171, + "grad_norm": 6.000316611519476, + "learning_rate": 1.9233074739022186e-06, + "loss": 0.6743, + "step": 16246 + }, + { + "epoch": 1.1737677678039264, + "grad_norm": 7.551022481839147, + "learning_rate": 1.923022864140391e-06, + "loss": 0.6181, + "step": 16247 + }, + { + "epoch": 1.173840013004136, + "grad_norm": 6.604559981931548, + "learning_rate": 1.922738262276934e-06, + "loss": 0.6975, + "step": 16248 + }, + { + "epoch": 1.1739122582043455, + "grad_norm": 6.615452346386787, + "learning_rate": 1.9224536683157403e-06, + "loss": 0.677, + "step": 16249 + }, + { + "epoch": 1.173984503404555, + "grad_norm": 5.872657471190175, + "learning_rate": 1.9221690822607077e-06, + "loss": 0.641, + "step": 16250 + }, + { + "epoch": 1.1740567486047646, + "grad_norm": 7.295363733271455, + "learning_rate": 1.921884504115733e-06, + "loss": 0.6696, + "step": 16251 + }, + { + "epoch": 1.1741289938049742, + "grad_norm": 6.72731104284524, + "learning_rate": 1.9215999338847094e-06, + "loss": 0.5643, + "step": 16252 + }, + { + "epoch": 1.1742012390051837, + "grad_norm": 7.48949153951774, + "learning_rate": 1.921315371571534e-06, + "loss": 0.6179, + "step": 16253 + }, + { + "epoch": 1.174273484205393, + "grad_norm": 6.584650640235599, + "learning_rate": 1.921030817180101e-06, + "loss": 0.6246, + "step": 16254 + }, + { + "epoch": 1.1743457294056026, + "grad_norm": 8.760369396346778, + "learning_rate": 1.9207462707143076e-06, + "loss": 0.7043, + "step": 16255 + }, + { + "epoch": 1.174417974605812, + "grad_norm": 6.993579372202959, + "learning_rate": 1.9204617321780478e-06, + "loss": 0.5886, + "step": 16256 + }, + { + "epoch": 1.1744902198060216, + "grad_norm": 4.697128173339772, + "learning_rate": 1.9201772015752163e-06, + "loss": 0.6131, + "step": 16257 + }, + { + "epoch": 1.1745624650062312, + "grad_norm": 6.312943546839773, + "learning_rate": 1.9198926789097095e-06, + "loss": 0.7589, + "step": 16258 + }, + { + "epoch": 1.1746347102064407, + "grad_norm": 5.510411205421246, + "learning_rate": 1.919608164185421e-06, + "loss": 0.6547, + "step": 16259 + }, + { + "epoch": 1.1747069554066503, + "grad_norm": 7.119597946988145, + "learning_rate": 1.9193236574062465e-06, + "loss": 0.7401, + "step": 16260 + }, + { + "epoch": 1.1747792006068596, + "grad_norm": 6.4052423940826095, + "learning_rate": 1.9190391585760795e-06, + "loss": 0.6209, + "step": 16261 + }, + { + "epoch": 1.1748514458070691, + "grad_norm": 6.702490147402337, + "learning_rate": 1.9187546676988167e-06, + "loss": 0.6651, + "step": 16262 + }, + { + "epoch": 1.1749236910072787, + "grad_norm": 6.908079155393885, + "learning_rate": 1.9184701847783498e-06, + "loss": 0.618, + "step": 16263 + }, + { + "epoch": 1.1749959362074882, + "grad_norm": 6.334706140942123, + "learning_rate": 1.9181857098185746e-06, + "loss": 0.5673, + "step": 16264 + }, + { + "epoch": 1.1750681814076978, + "grad_norm": 6.314085506330271, + "learning_rate": 1.9179012428233867e-06, + "loss": 0.6443, + "step": 16265 + }, + { + "epoch": 1.1751404266079073, + "grad_norm": 7.474322040139613, + "learning_rate": 1.9176167837966778e-06, + "loss": 0.6286, + "step": 16266 + }, + { + "epoch": 1.1752126718081168, + "grad_norm": 7.5310823413975605, + "learning_rate": 1.917332332742343e-06, + "loss": 0.6638, + "step": 16267 + }, + { + "epoch": 1.1752849170083262, + "grad_norm": 6.668954393137946, + "learning_rate": 1.917047889664276e-06, + "loss": 0.6618, + "step": 16268 + }, + { + "epoch": 1.1753571622085357, + "grad_norm": 6.0734010201181245, + "learning_rate": 1.916763454566371e-06, + "loss": 0.701, + "step": 16269 + }, + { + "epoch": 1.1754294074087452, + "grad_norm": 7.9860709044692895, + "learning_rate": 1.9164790274525215e-06, + "loss": 0.767, + "step": 16270 + }, + { + "epoch": 1.1755016526089548, + "grad_norm": 9.203630261888978, + "learning_rate": 1.9161946083266205e-06, + "loss": 0.5687, + "step": 16271 + }, + { + "epoch": 1.1755738978091643, + "grad_norm": 6.7644010850772975, + "learning_rate": 1.915910197192562e-06, + "loss": 0.6617, + "step": 16272 + }, + { + "epoch": 1.1756461430093739, + "grad_norm": 6.6790824036053325, + "learning_rate": 1.9156257940542407e-06, + "loss": 0.6005, + "step": 16273 + }, + { + "epoch": 1.1757183882095834, + "grad_norm": 6.5284315534884785, + "learning_rate": 1.9153413989155475e-06, + "loss": 0.6333, + "step": 16274 + }, + { + "epoch": 1.1757906334097927, + "grad_norm": 6.929156535728635, + "learning_rate": 1.9150570117803762e-06, + "loss": 0.6112, + "step": 16275 + }, + { + "epoch": 1.1758628786100023, + "grad_norm": 6.444237007807283, + "learning_rate": 1.9147726326526216e-06, + "loss": 0.6267, + "step": 16276 + }, + { + "epoch": 1.1759351238102118, + "grad_norm": 6.812203497077557, + "learning_rate": 1.914488261536174e-06, + "loss": 0.6374, + "step": 16277 + }, + { + "epoch": 1.1760073690104214, + "grad_norm": 6.362093257830939, + "learning_rate": 1.9142038984349275e-06, + "loss": 0.5733, + "step": 16278 + }, + { + "epoch": 1.176079614210631, + "grad_norm": 7.294006437289904, + "learning_rate": 1.913919543352775e-06, + "loss": 0.6795, + "step": 16279 + }, + { + "epoch": 1.1761518594108404, + "grad_norm": 7.584137535071401, + "learning_rate": 1.9136351962936104e-06, + "loss": 0.5446, + "step": 16280 + }, + { + "epoch": 1.17622410461105, + "grad_norm": 8.311186034102295, + "learning_rate": 1.913350857261323e-06, + "loss": 0.6571, + "step": 16281 + }, + { + "epoch": 1.1762963498112593, + "grad_norm": 7.554861703814937, + "learning_rate": 1.913066526259807e-06, + "loss": 0.6028, + "step": 16282 + }, + { + "epoch": 1.1763685950114688, + "grad_norm": 6.820563907075282, + "learning_rate": 1.9127822032929553e-06, + "loss": 0.6738, + "step": 16283 + }, + { + "epoch": 1.1764408402116784, + "grad_norm": 6.967066266343327, + "learning_rate": 1.9124978883646586e-06, + "loss": 0.5738, + "step": 16284 + }, + { + "epoch": 1.176513085411888, + "grad_norm": 6.699531641290045, + "learning_rate": 1.9122135814788096e-06, + "loss": 0.6472, + "step": 16285 + }, + { + "epoch": 1.1765853306120975, + "grad_norm": 8.419093421099634, + "learning_rate": 1.9119292826393002e-06, + "loss": 0.6975, + "step": 16286 + }, + { + "epoch": 1.176657575812307, + "grad_norm": 7.092083033506474, + "learning_rate": 1.911644991850023e-06, + "loss": 0.5925, + "step": 16287 + }, + { + "epoch": 1.1767298210125166, + "grad_norm": 7.066547415584854, + "learning_rate": 1.9113607091148684e-06, + "loss": 0.6447, + "step": 16288 + }, + { + "epoch": 1.1768020662127259, + "grad_norm": 7.507888968214248, + "learning_rate": 1.9110764344377285e-06, + "loss": 0.6685, + "step": 16289 + }, + { + "epoch": 1.1768743114129354, + "grad_norm": 6.088183555372345, + "learning_rate": 1.9107921678224966e-06, + "loss": 0.6657, + "step": 16290 + }, + { + "epoch": 1.176946556613145, + "grad_norm": 7.1413656639428975, + "learning_rate": 1.91050790927306e-06, + "loss": 0.6832, + "step": 16291 + }, + { + "epoch": 1.1770188018133545, + "grad_norm": 6.428543526725163, + "learning_rate": 1.9102236587933137e-06, + "loss": 0.6347, + "step": 16292 + }, + { + "epoch": 1.177091047013564, + "grad_norm": 8.04288195451589, + "learning_rate": 1.909939416387147e-06, + "loss": 0.7274, + "step": 16293 + }, + { + "epoch": 1.1771632922137736, + "grad_norm": 6.835178808344447, + "learning_rate": 1.909655182058453e-06, + "loss": 0.6339, + "step": 16294 + }, + { + "epoch": 1.1772355374139831, + "grad_norm": 7.980712050408972, + "learning_rate": 1.90937095581112e-06, + "loss": 0.6872, + "step": 16295 + }, + { + "epoch": 1.1773077826141927, + "grad_norm": 7.036085576806228, + "learning_rate": 1.90908673764904e-06, + "loss": 0.7006, + "step": 16296 + }, + { + "epoch": 1.177380027814402, + "grad_norm": 5.948493818172293, + "learning_rate": 1.908802527576104e-06, + "loss": 0.6503, + "step": 16297 + }, + { + "epoch": 1.1774522730146115, + "grad_norm": 6.197998080533652, + "learning_rate": 1.9085183255962027e-06, + "loss": 0.6691, + "step": 16298 + }, + { + "epoch": 1.177524518214821, + "grad_norm": 6.031346769989542, + "learning_rate": 1.9082341317132255e-06, + "loss": 0.5998, + "step": 16299 + }, + { + "epoch": 1.1775967634150306, + "grad_norm": 6.408388255478829, + "learning_rate": 1.907949945931064e-06, + "loss": 0.6537, + "step": 16300 + }, + { + "epoch": 1.1776690086152402, + "grad_norm": 9.862125551104194, + "learning_rate": 1.9076657682536083e-06, + "loss": 0.6764, + "step": 16301 + }, + { + "epoch": 1.1777412538154497, + "grad_norm": 6.327696985911052, + "learning_rate": 1.907381598684748e-06, + "loss": 0.5897, + "step": 16302 + }, + { + "epoch": 1.1778134990156592, + "grad_norm": 6.164140166553639, + "learning_rate": 1.9070974372283728e-06, + "loss": 0.6999, + "step": 16303 + }, + { + "epoch": 1.1778857442158688, + "grad_norm": 6.023598674987773, + "learning_rate": 1.9068132838883738e-06, + "loss": 0.5747, + "step": 16304 + }, + { + "epoch": 1.177957989416078, + "grad_norm": 6.076448734049296, + "learning_rate": 1.906529138668641e-06, + "loss": 0.6554, + "step": 16305 + }, + { + "epoch": 1.1780302346162876, + "grad_norm": 6.250871521267304, + "learning_rate": 1.9062450015730626e-06, + "loss": 0.6038, + "step": 16306 + }, + { + "epoch": 1.1781024798164972, + "grad_norm": 7.309972603429552, + "learning_rate": 1.9059608726055294e-06, + "loss": 0.6439, + "step": 16307 + }, + { + "epoch": 1.1781747250167067, + "grad_norm": 6.11549854956778, + "learning_rate": 1.9056767517699318e-06, + "loss": 0.6997, + "step": 16308 + }, + { + "epoch": 1.1782469702169163, + "grad_norm": 7.274318817863018, + "learning_rate": 1.9053926390701569e-06, + "loss": 0.6747, + "step": 16309 + }, + { + "epoch": 1.1783192154171258, + "grad_norm": 8.185966056807414, + "learning_rate": 1.9051085345100951e-06, + "loss": 0.6573, + "step": 16310 + }, + { + "epoch": 1.1783914606173354, + "grad_norm": 7.150210834942621, + "learning_rate": 1.904824438093635e-06, + "loss": 0.6558, + "step": 16311 + }, + { + "epoch": 1.1784637058175447, + "grad_norm": 6.714161144132956, + "learning_rate": 1.9045403498246673e-06, + "loss": 0.6437, + "step": 16312 + }, + { + "epoch": 1.1785359510177542, + "grad_norm": 7.011078650709572, + "learning_rate": 1.9042562697070794e-06, + "loss": 0.6586, + "step": 16313 + }, + { + "epoch": 1.1786081962179638, + "grad_norm": 7.78910167994158, + "learning_rate": 1.9039721977447602e-06, + "loss": 0.6502, + "step": 16314 + }, + { + "epoch": 1.1786804414181733, + "grad_norm": 7.174397608760193, + "learning_rate": 1.9036881339415996e-06, + "loss": 0.6276, + "step": 16315 + }, + { + "epoch": 1.1787526866183828, + "grad_norm": 6.218395433909051, + "learning_rate": 1.903404078301485e-06, + "loss": 0.6353, + "step": 16316 + }, + { + "epoch": 1.1788249318185924, + "grad_norm": 6.24864273592752, + "learning_rate": 1.9031200308283051e-06, + "loss": 0.7045, + "step": 16317 + }, + { + "epoch": 1.178897177018802, + "grad_norm": 6.411693809352004, + "learning_rate": 1.9028359915259486e-06, + "loss": 0.6654, + "step": 16318 + }, + { + "epoch": 1.1789694222190112, + "grad_norm": 7.921533821779694, + "learning_rate": 1.9025519603983046e-06, + "loss": 0.6655, + "step": 16319 + }, + { + "epoch": 1.1790416674192208, + "grad_norm": 6.421841735011168, + "learning_rate": 1.9022679374492595e-06, + "loss": 0.7112, + "step": 16320 + }, + { + "epoch": 1.1791139126194303, + "grad_norm": 6.704673321508254, + "learning_rate": 1.9019839226827023e-06, + "loss": 0.7134, + "step": 16321 + }, + { + "epoch": 1.1791861578196399, + "grad_norm": 6.768776950095673, + "learning_rate": 1.9016999161025217e-06, + "loss": 0.7061, + "step": 16322 + }, + { + "epoch": 1.1792584030198494, + "grad_norm": 6.664166108697803, + "learning_rate": 1.9014159177126041e-06, + "loss": 0.648, + "step": 16323 + }, + { + "epoch": 1.179330648220059, + "grad_norm": 6.441170544125938, + "learning_rate": 1.9011319275168372e-06, + "loss": 0.6697, + "step": 16324 + }, + { + "epoch": 1.1794028934202685, + "grad_norm": 5.307322773141893, + "learning_rate": 1.9008479455191097e-06, + "loss": 0.6215, + "step": 16325 + }, + { + "epoch": 1.1794751386204778, + "grad_norm": 6.79451877487141, + "learning_rate": 1.900563971723309e-06, + "loss": 0.682, + "step": 16326 + }, + { + "epoch": 1.1795473838206874, + "grad_norm": 6.0738576309456, + "learning_rate": 1.9002800061333214e-06, + "loss": 0.6314, + "step": 16327 + }, + { + "epoch": 1.179619629020897, + "grad_norm": 8.052551282253365, + "learning_rate": 1.8999960487530352e-06, + "loss": 0.6637, + "step": 16328 + }, + { + "epoch": 1.1796918742211064, + "grad_norm": 6.397480838414513, + "learning_rate": 1.8997120995863371e-06, + "loss": 0.64, + "step": 16329 + }, + { + "epoch": 1.179764119421316, + "grad_norm": 6.8326897977170145, + "learning_rate": 1.899428158637115e-06, + "loss": 0.5913, + "step": 16330 + }, + { + "epoch": 1.1798363646215255, + "grad_norm": 6.053957239227457, + "learning_rate": 1.899144225909254e-06, + "loss": 0.6407, + "step": 16331 + }, + { + "epoch": 1.179908609821735, + "grad_norm": 6.2981192067149845, + "learning_rate": 1.8988603014066425e-06, + "loss": 0.6878, + "step": 16332 + }, + { + "epoch": 1.1799808550219444, + "grad_norm": 7.736136557603584, + "learning_rate": 1.898576385133167e-06, + "loss": 0.7363, + "step": 16333 + }, + { + "epoch": 1.180053100222154, + "grad_norm": 7.793869142455293, + "learning_rate": 1.8982924770927133e-06, + "loss": 0.7265, + "step": 16334 + }, + { + "epoch": 1.1801253454223635, + "grad_norm": 7.223581786038009, + "learning_rate": 1.8980085772891685e-06, + "loss": 0.637, + "step": 16335 + }, + { + "epoch": 1.180197590622573, + "grad_norm": 7.369468295458206, + "learning_rate": 1.897724685726419e-06, + "loss": 0.6615, + "step": 16336 + }, + { + "epoch": 1.1802698358227826, + "grad_norm": 7.852381603729275, + "learning_rate": 1.8974408024083519e-06, + "loss": 0.6337, + "step": 16337 + }, + { + "epoch": 1.180342081022992, + "grad_norm": 7.494749647442003, + "learning_rate": 1.897156927338851e-06, + "loss": 0.7047, + "step": 16338 + }, + { + "epoch": 1.1804143262232016, + "grad_norm": 7.748462893734226, + "learning_rate": 1.896873060521804e-06, + "loss": 0.648, + "step": 16339 + }, + { + "epoch": 1.180486571423411, + "grad_norm": 7.0280228054000125, + "learning_rate": 1.8965892019610968e-06, + "loss": 0.6492, + "step": 16340 + }, + { + "epoch": 1.1805588166236205, + "grad_norm": 8.530247444239876, + "learning_rate": 1.8963053516606145e-06, + "loss": 0.6825, + "step": 16341 + }, + { + "epoch": 1.18063106182383, + "grad_norm": 7.172381347866184, + "learning_rate": 1.8960215096242433e-06, + "loss": 0.6266, + "step": 16342 + }, + { + "epoch": 1.1807033070240396, + "grad_norm": 6.358641036134688, + "learning_rate": 1.8957376758558684e-06, + "loss": 0.6723, + "step": 16343 + }, + { + "epoch": 1.1807755522242491, + "grad_norm": 7.1152531395580185, + "learning_rate": 1.8954538503593762e-06, + "loss": 0.7002, + "step": 16344 + }, + { + "epoch": 1.1808477974244587, + "grad_norm": 6.883767381088178, + "learning_rate": 1.8951700331386507e-06, + "loss": 0.6337, + "step": 16345 + }, + { + "epoch": 1.1809200426246682, + "grad_norm": 6.429475097114388, + "learning_rate": 1.8948862241975783e-06, + "loss": 0.6236, + "step": 16346 + }, + { + "epoch": 1.1809922878248775, + "grad_norm": 6.623591723202286, + "learning_rate": 1.8946024235400445e-06, + "loss": 0.6284, + "step": 16347 + }, + { + "epoch": 1.181064533025087, + "grad_norm": 5.819928416437864, + "learning_rate": 1.8943186311699317e-06, + "loss": 0.6172, + "step": 16348 + }, + { + "epoch": 1.1811367782252966, + "grad_norm": 6.4528246941322625, + "learning_rate": 1.894034847091127e-06, + "loss": 0.6092, + "step": 16349 + }, + { + "epoch": 1.1812090234255062, + "grad_norm": 7.300227634590972, + "learning_rate": 1.8937510713075151e-06, + "loss": 0.5914, + "step": 16350 + }, + { + "epoch": 1.1812812686257157, + "grad_norm": 6.621237784200313, + "learning_rate": 1.893467303822981e-06, + "loss": 0.6586, + "step": 16351 + }, + { + "epoch": 1.1813535138259252, + "grad_norm": 5.680849651671181, + "learning_rate": 1.8931835446414078e-06, + "loss": 0.7406, + "step": 16352 + }, + { + "epoch": 1.1814257590261348, + "grad_norm": 9.008639533396583, + "learning_rate": 1.8928997937666808e-06, + "loss": 0.6729, + "step": 16353 + }, + { + "epoch": 1.181498004226344, + "grad_norm": 5.80619650693957, + "learning_rate": 1.8926160512026833e-06, + "loss": 0.6127, + "step": 16354 + }, + { + "epoch": 1.1815702494265536, + "grad_norm": 6.7747217212690565, + "learning_rate": 1.8923323169533025e-06, + "loss": 0.6241, + "step": 16355 + }, + { + "epoch": 1.1816424946267632, + "grad_norm": 7.852972801667561, + "learning_rate": 1.892048591022419e-06, + "loss": 0.6512, + "step": 16356 + }, + { + "epoch": 1.1817147398269727, + "grad_norm": 5.954308099403906, + "learning_rate": 1.8917648734139185e-06, + "loss": 0.6032, + "step": 16357 + }, + { + "epoch": 1.1817869850271823, + "grad_norm": 5.845423820799942, + "learning_rate": 1.891481164131685e-06, + "loss": 0.6075, + "step": 16358 + }, + { + "epoch": 1.1818592302273918, + "grad_norm": 6.102110775822589, + "learning_rate": 1.8911974631796015e-06, + "loss": 0.6261, + "step": 16359 + }, + { + "epoch": 1.1819314754276014, + "grad_norm": 5.951472175615787, + "learning_rate": 1.8909137705615519e-06, + "loss": 0.6682, + "step": 16360 + }, + { + "epoch": 1.1820037206278107, + "grad_norm": 6.917573095775743, + "learning_rate": 1.8906300862814197e-06, + "loss": 0.6183, + "step": 16361 + }, + { + "epoch": 1.1820759658280202, + "grad_norm": 5.728881865849681, + "learning_rate": 1.8903464103430894e-06, + "loss": 0.6716, + "step": 16362 + }, + { + "epoch": 1.1821482110282298, + "grad_norm": 7.005120992766008, + "learning_rate": 1.8900627427504425e-06, + "loss": 0.6132, + "step": 16363 + }, + { + "epoch": 1.1822204562284393, + "grad_norm": 8.118088350065713, + "learning_rate": 1.8897790835073633e-06, + "loss": 0.6455, + "step": 16364 + }, + { + "epoch": 1.1822927014286488, + "grad_norm": 8.723959839474372, + "learning_rate": 1.8894954326177356e-06, + "loss": 0.7122, + "step": 16365 + }, + { + "epoch": 1.1823649466288584, + "grad_norm": 6.393608907595244, + "learning_rate": 1.8892117900854403e-06, + "loss": 0.6823, + "step": 16366 + }, + { + "epoch": 1.182437191829068, + "grad_norm": 7.311945706739797, + "learning_rate": 1.8889281559143615e-06, + "loss": 0.6352, + "step": 16367 + }, + { + "epoch": 1.1825094370292772, + "grad_norm": 6.86172368673681, + "learning_rate": 1.888644530108381e-06, + "loss": 0.633, + "step": 16368 + }, + { + "epoch": 1.1825816822294868, + "grad_norm": 6.1992138271877115, + "learning_rate": 1.8883609126713842e-06, + "loss": 0.6684, + "step": 16369 + }, + { + "epoch": 1.1826539274296963, + "grad_norm": 6.706871141889936, + "learning_rate": 1.8880773036072503e-06, + "loss": 0.652, + "step": 16370 + }, + { + "epoch": 1.1827261726299059, + "grad_norm": 5.927129744184324, + "learning_rate": 1.887793702919863e-06, + "loss": 0.532, + "step": 16371 + }, + { + "epoch": 1.1827984178301154, + "grad_norm": 6.5747354432464915, + "learning_rate": 1.8875101106131049e-06, + "loss": 0.5694, + "step": 16372 + }, + { + "epoch": 1.182870663030325, + "grad_norm": 9.645448834944855, + "learning_rate": 1.8872265266908574e-06, + "loss": 0.6804, + "step": 16373 + }, + { + "epoch": 1.1829429082305345, + "grad_norm": 7.704995692287184, + "learning_rate": 1.886942951157003e-06, + "loss": 0.5972, + "step": 16374 + }, + { + "epoch": 1.183015153430744, + "grad_norm": 8.602459187764184, + "learning_rate": 1.8866593840154235e-06, + "loss": 0.63, + "step": 16375 + }, + { + "epoch": 1.1830873986309536, + "grad_norm": 7.099561554171822, + "learning_rate": 1.8863758252700014e-06, + "loss": 0.623, + "step": 16376 + }, + { + "epoch": 1.183159643831163, + "grad_norm": 8.525513449501837, + "learning_rate": 1.8860922749246175e-06, + "loss": 0.6538, + "step": 16377 + }, + { + "epoch": 1.1832318890313724, + "grad_norm": 7.532431355494993, + "learning_rate": 1.8858087329831534e-06, + "loss": 0.6328, + "step": 16378 + }, + { + "epoch": 1.183304134231582, + "grad_norm": 6.427988080396443, + "learning_rate": 1.8855251994494912e-06, + "loss": 0.6101, + "step": 16379 + }, + { + "epoch": 1.1833763794317915, + "grad_norm": 6.517239816474414, + "learning_rate": 1.8852416743275126e-06, + "loss": 0.6098, + "step": 16380 + }, + { + "epoch": 1.183448624632001, + "grad_norm": 11.013892677395203, + "learning_rate": 1.8849581576210972e-06, + "loss": 0.6144, + "step": 16381 + }, + { + "epoch": 1.1835208698322106, + "grad_norm": 8.930130060664885, + "learning_rate": 1.8846746493341263e-06, + "loss": 0.6814, + "step": 16382 + }, + { + "epoch": 1.1835931150324202, + "grad_norm": 6.166954377482074, + "learning_rate": 1.8843911494704837e-06, + "loss": 0.6141, + "step": 16383 + }, + { + "epoch": 1.1836653602326295, + "grad_norm": 7.603539706646941, + "learning_rate": 1.884107658034047e-06, + "loss": 0.6575, + "step": 16384 + }, + { + "epoch": 1.183737605432839, + "grad_norm": 6.239498623977193, + "learning_rate": 1.8838241750286983e-06, + "loss": 0.6699, + "step": 16385 + }, + { + "epoch": 1.1838098506330486, + "grad_norm": 6.482105271531827, + "learning_rate": 1.8835407004583184e-06, + "loss": 0.6347, + "step": 16386 + }, + { + "epoch": 1.183882095833258, + "grad_norm": 7.333995818066971, + "learning_rate": 1.883257234326788e-06, + "loss": 0.6094, + "step": 16387 + }, + { + "epoch": 1.1839543410334676, + "grad_norm": 6.625842922496258, + "learning_rate": 1.8829737766379862e-06, + "loss": 0.6097, + "step": 16388 + }, + { + "epoch": 1.1840265862336772, + "grad_norm": 7.587083189826358, + "learning_rate": 1.882690327395795e-06, + "loss": 0.6691, + "step": 16389 + }, + { + "epoch": 1.1840988314338867, + "grad_norm": 5.837505084082536, + "learning_rate": 1.8824068866040943e-06, + "loss": 0.6619, + "step": 16390 + }, + { + "epoch": 1.184171076634096, + "grad_norm": 7.653017942241321, + "learning_rate": 1.8821234542667632e-06, + "loss": 0.6885, + "step": 16391 + }, + { + "epoch": 1.1842433218343056, + "grad_norm": 7.577058005183543, + "learning_rate": 1.8818400303876822e-06, + "loss": 0.5879, + "step": 16392 + }, + { + "epoch": 1.1843155670345151, + "grad_norm": 7.218798880803797, + "learning_rate": 1.8815566149707312e-06, + "loss": 0.6312, + "step": 16393 + }, + { + "epoch": 1.1843878122347247, + "grad_norm": 6.112682803176204, + "learning_rate": 1.881273208019791e-06, + "loss": 0.6079, + "step": 16394 + }, + { + "epoch": 1.1844600574349342, + "grad_norm": 6.164647605211148, + "learning_rate": 1.880989809538739e-06, + "loss": 0.5929, + "step": 16395 + }, + { + "epoch": 1.1845323026351438, + "grad_norm": 7.210543372727223, + "learning_rate": 1.8807064195314555e-06, + "loss": 0.5885, + "step": 16396 + }, + { + "epoch": 1.1846045478353533, + "grad_norm": 6.364849311860833, + "learning_rate": 1.8804230380018221e-06, + "loss": 0.625, + "step": 16397 + }, + { + "epoch": 1.1846767930355626, + "grad_norm": 7.452622114765919, + "learning_rate": 1.880139664953715e-06, + "loss": 0.6539, + "step": 16398 + }, + { + "epoch": 1.1847490382357722, + "grad_norm": 7.245556752339132, + "learning_rate": 1.8798563003910144e-06, + "loss": 0.7231, + "step": 16399 + }, + { + "epoch": 1.1848212834359817, + "grad_norm": 6.789492612148563, + "learning_rate": 1.8795729443175997e-06, + "loss": 0.6112, + "step": 16400 + }, + { + "epoch": 1.1848935286361912, + "grad_norm": 7.8506454008080455, + "learning_rate": 1.8792895967373501e-06, + "loss": 0.6353, + "step": 16401 + }, + { + "epoch": 1.1849657738364008, + "grad_norm": 5.96565334019591, + "learning_rate": 1.8790062576541435e-06, + "loss": 0.6886, + "step": 16402 + }, + { + "epoch": 1.1850380190366103, + "grad_norm": 6.195177343753498, + "learning_rate": 1.878722927071859e-06, + "loss": 0.5872, + "step": 16403 + }, + { + "epoch": 1.1851102642368199, + "grad_norm": 5.872456426089372, + "learning_rate": 1.8784396049943754e-06, + "loss": 0.6198, + "step": 16404 + }, + { + "epoch": 1.1851825094370292, + "grad_norm": 7.200221206657798, + "learning_rate": 1.8781562914255714e-06, + "loss": 0.6419, + "step": 16405 + }, + { + "epoch": 1.1852547546372387, + "grad_norm": 7.007365302847343, + "learning_rate": 1.8778729863693247e-06, + "loss": 0.5838, + "step": 16406 + }, + { + "epoch": 1.1853269998374483, + "grad_norm": 6.302948325408788, + "learning_rate": 1.8775896898295131e-06, + "loss": 0.6111, + "step": 16407 + }, + { + "epoch": 1.1853992450376578, + "grad_norm": 6.257817986303775, + "learning_rate": 1.877306401810017e-06, + "loss": 0.6422, + "step": 16408 + }, + { + "epoch": 1.1854714902378674, + "grad_norm": 10.150660646482995, + "learning_rate": 1.8770231223147117e-06, + "loss": 0.6746, + "step": 16409 + }, + { + "epoch": 1.185543735438077, + "grad_norm": 6.686845693933176, + "learning_rate": 1.8767398513474756e-06, + "loss": 0.6271, + "step": 16410 + }, + { + "epoch": 1.1856159806382864, + "grad_norm": 6.592297724028137, + "learning_rate": 1.8764565889121872e-06, + "loss": 0.6514, + "step": 16411 + }, + { + "epoch": 1.1856882258384958, + "grad_norm": 7.800695182925305, + "learning_rate": 1.8761733350127253e-06, + "loss": 0.6457, + "step": 16412 + }, + { + "epoch": 1.1857604710387053, + "grad_norm": 6.7430135103803455, + "learning_rate": 1.8758900896529646e-06, + "loss": 0.6874, + "step": 16413 + }, + { + "epoch": 1.1858327162389148, + "grad_norm": 6.290652700743158, + "learning_rate": 1.8756068528367847e-06, + "loss": 0.6266, + "step": 16414 + }, + { + "epoch": 1.1859049614391244, + "grad_norm": 6.277057216255288, + "learning_rate": 1.8753236245680622e-06, + "loss": 0.6653, + "step": 16415 + }, + { + "epoch": 1.185977206639334, + "grad_norm": 6.001073741203593, + "learning_rate": 1.8750404048506738e-06, + "loss": 0.6047, + "step": 16416 + }, + { + "epoch": 1.1860494518395435, + "grad_norm": 7.4325878687545845, + "learning_rate": 1.874757193688497e-06, + "loss": 0.7165, + "step": 16417 + }, + { + "epoch": 1.186121697039753, + "grad_norm": 5.881825479532667, + "learning_rate": 1.8744739910854087e-06, + "loss": 0.6125, + "step": 16418 + }, + { + "epoch": 1.1861939422399623, + "grad_norm": 6.665582409743651, + "learning_rate": 1.8741907970452866e-06, + "loss": 0.6251, + "step": 16419 + }, + { + "epoch": 1.1862661874401719, + "grad_norm": 6.5616748109353065, + "learning_rate": 1.8739076115720055e-06, + "loss": 0.6396, + "step": 16420 + }, + { + "epoch": 1.1863384326403814, + "grad_norm": 5.546646156425947, + "learning_rate": 1.8736244346694439e-06, + "loss": 0.6144, + "step": 16421 + }, + { + "epoch": 1.186410677840591, + "grad_norm": 7.360090281720607, + "learning_rate": 1.8733412663414778e-06, + "loss": 0.6945, + "step": 16422 + }, + { + "epoch": 1.1864829230408005, + "grad_norm": 8.511560544508393, + "learning_rate": 1.8730581065919822e-06, + "loss": 0.6674, + "step": 16423 + }, + { + "epoch": 1.18655516824101, + "grad_norm": 8.316432854290467, + "learning_rate": 1.8727749554248344e-06, + "loss": 0.6836, + "step": 16424 + }, + { + "epoch": 1.1866274134412196, + "grad_norm": 6.082017137808772, + "learning_rate": 1.8724918128439102e-06, + "loss": 0.5716, + "step": 16425 + }, + { + "epoch": 1.186699658641429, + "grad_norm": 6.825511301192393, + "learning_rate": 1.872208678853087e-06, + "loss": 0.6321, + "step": 16426 + }, + { + "epoch": 1.1867719038416384, + "grad_norm": 6.42535499156102, + "learning_rate": 1.871925553456239e-06, + "loss": 0.6613, + "step": 16427 + }, + { + "epoch": 1.186844149041848, + "grad_norm": 7.296801286933618, + "learning_rate": 1.8716424366572423e-06, + "loss": 0.6798, + "step": 16428 + }, + { + "epoch": 1.1869163942420575, + "grad_norm": 7.428769156161545, + "learning_rate": 1.8713593284599728e-06, + "loss": 0.6723, + "step": 16429 + }, + { + "epoch": 1.186988639442267, + "grad_norm": 6.677574135709062, + "learning_rate": 1.8710762288683059e-06, + "loss": 0.6901, + "step": 16430 + }, + { + "epoch": 1.1870608846424766, + "grad_norm": 6.6389729677601785, + "learning_rate": 1.870793137886117e-06, + "loss": 0.6679, + "step": 16431 + }, + { + "epoch": 1.1871331298426862, + "grad_norm": 6.7393618925133945, + "learning_rate": 1.8705100555172816e-06, + "loss": 0.702, + "step": 16432 + }, + { + "epoch": 1.1872053750428955, + "grad_norm": 7.595473105767636, + "learning_rate": 1.8702269817656755e-06, + "loss": 0.7719, + "step": 16433 + }, + { + "epoch": 1.187277620243105, + "grad_norm": 6.447599322059511, + "learning_rate": 1.8699439166351724e-06, + "loss": 0.63, + "step": 16434 + }, + { + "epoch": 1.1873498654433146, + "grad_norm": 7.195854837561811, + "learning_rate": 1.8696608601296477e-06, + "loss": 0.6787, + "step": 16435 + }, + { + "epoch": 1.187422110643524, + "grad_norm": 6.695227320873715, + "learning_rate": 1.8693778122529762e-06, + "loss": 0.6642, + "step": 16436 + }, + { + "epoch": 1.1874943558437336, + "grad_norm": 6.09581420366993, + "learning_rate": 1.8690947730090343e-06, + "loss": 0.6544, + "step": 16437 + }, + { + "epoch": 1.1875666010439432, + "grad_norm": 7.886737130055353, + "learning_rate": 1.8688117424016938e-06, + "loss": 0.6326, + "step": 16438 + }, + { + "epoch": 1.1876388462441527, + "grad_norm": 6.317931906907692, + "learning_rate": 1.8685287204348298e-06, + "loss": 0.7106, + "step": 16439 + }, + { + "epoch": 1.187711091444362, + "grad_norm": 6.864872356653438, + "learning_rate": 1.8682457071123192e-06, + "loss": 0.6559, + "step": 16440 + }, + { + "epoch": 1.1877833366445716, + "grad_norm": 7.345192625507553, + "learning_rate": 1.867962702438033e-06, + "loss": 0.6533, + "step": 16441 + }, + { + "epoch": 1.1878555818447811, + "grad_norm": 7.305370017412438, + "learning_rate": 1.867679706415847e-06, + "loss": 0.5932, + "step": 16442 + }, + { + "epoch": 1.1879278270449907, + "grad_norm": 7.059669129828696, + "learning_rate": 1.8673967190496344e-06, + "loss": 0.6527, + "step": 16443 + }, + { + "epoch": 1.1880000722452002, + "grad_norm": 6.2934260992443845, + "learning_rate": 1.8671137403432704e-06, + "loss": 0.621, + "step": 16444 + }, + { + "epoch": 1.1880723174454098, + "grad_norm": 5.737388047931639, + "learning_rate": 1.8668307703006271e-06, + "loss": 0.6314, + "step": 16445 + }, + { + "epoch": 1.1881445626456193, + "grad_norm": 5.934838270572465, + "learning_rate": 1.8665478089255796e-06, + "loss": 0.594, + "step": 16446 + }, + { + "epoch": 1.1882168078458288, + "grad_norm": 7.272132502424065, + "learning_rate": 1.8662648562220006e-06, + "loss": 0.6511, + "step": 16447 + }, + { + "epoch": 1.1882890530460382, + "grad_norm": 6.673720950906354, + "learning_rate": 1.8659819121937633e-06, + "loss": 0.6667, + "step": 16448 + }, + { + "epoch": 1.1883612982462477, + "grad_norm": 6.030690221553404, + "learning_rate": 1.8656989768447414e-06, + "loss": 0.6448, + "step": 16449 + }, + { + "epoch": 1.1884335434464572, + "grad_norm": 6.6712946404858915, + "learning_rate": 1.865416050178808e-06, + "loss": 0.6571, + "step": 16450 + }, + { + "epoch": 1.1885057886466668, + "grad_norm": 6.700845682093015, + "learning_rate": 1.8651331321998374e-06, + "loss": 0.6631, + "step": 16451 + }, + { + "epoch": 1.1885780338468763, + "grad_norm": 6.416300412248838, + "learning_rate": 1.8648502229116997e-06, + "loss": 0.6052, + "step": 16452 + }, + { + "epoch": 1.1886502790470859, + "grad_norm": 7.2978185660342305, + "learning_rate": 1.8645673223182692e-06, + "loss": 0.7384, + "step": 16453 + }, + { + "epoch": 1.1887225242472954, + "grad_norm": 6.1880181606152, + "learning_rate": 1.8642844304234199e-06, + "loss": 0.679, + "step": 16454 + }, + { + "epoch": 1.188794769447505, + "grad_norm": 7.511302061619455, + "learning_rate": 1.8640015472310224e-06, + "loss": 0.639, + "step": 16455 + }, + { + "epoch": 1.1888670146477143, + "grad_norm": 6.94830251038906, + "learning_rate": 1.8637186727449496e-06, + "loss": 0.6053, + "step": 16456 + }, + { + "epoch": 1.1889392598479238, + "grad_norm": 5.800334710296022, + "learning_rate": 1.8634358069690744e-06, + "loss": 0.6257, + "step": 16457 + }, + { + "epoch": 1.1890115050481334, + "grad_norm": 6.283295103946754, + "learning_rate": 1.8631529499072692e-06, + "loss": 0.5755, + "step": 16458 + }, + { + "epoch": 1.189083750248343, + "grad_norm": 6.893789243889035, + "learning_rate": 1.862870101563405e-06, + "loss": 0.6406, + "step": 16459 + }, + { + "epoch": 1.1891559954485524, + "grad_norm": 6.201479446112568, + "learning_rate": 1.862587261941355e-06, + "loss": 0.6792, + "step": 16460 + }, + { + "epoch": 1.189228240648762, + "grad_norm": 7.804397409601971, + "learning_rate": 1.8623044310449895e-06, + "loss": 0.6397, + "step": 16461 + }, + { + "epoch": 1.1893004858489715, + "grad_norm": 6.046574074997642, + "learning_rate": 1.8620216088781823e-06, + "loss": 0.611, + "step": 16462 + }, + { + "epoch": 1.1893727310491808, + "grad_norm": 7.069875731070752, + "learning_rate": 1.8617387954448036e-06, + "loss": 0.6315, + "step": 16463 + }, + { + "epoch": 1.1894449762493904, + "grad_norm": 6.892156659949988, + "learning_rate": 1.8614559907487245e-06, + "loss": 0.7052, + "step": 16464 + }, + { + "epoch": 1.1895172214496, + "grad_norm": 6.517353953666553, + "learning_rate": 1.8611731947938188e-06, + "loss": 0.6139, + "step": 16465 + }, + { + "epoch": 1.1895894666498095, + "grad_norm": 6.083294628838863, + "learning_rate": 1.8608904075839545e-06, + "loss": 0.6007, + "step": 16466 + }, + { + "epoch": 1.189661711850019, + "grad_norm": 7.773353093134705, + "learning_rate": 1.860607629123004e-06, + "loss": 0.6793, + "step": 16467 + }, + { + "epoch": 1.1897339570502286, + "grad_norm": 6.6624972975047285, + "learning_rate": 1.8603248594148393e-06, + "loss": 0.6919, + "step": 16468 + }, + { + "epoch": 1.189806202250438, + "grad_norm": 6.423310873444029, + "learning_rate": 1.8600420984633317e-06, + "loss": 0.6542, + "step": 16469 + }, + { + "epoch": 1.1898784474506474, + "grad_norm": 6.919780198944832, + "learning_rate": 1.8597593462723496e-06, + "loss": 0.6891, + "step": 16470 + }, + { + "epoch": 1.189950692650857, + "grad_norm": 7.316648692092309, + "learning_rate": 1.8594766028457647e-06, + "loss": 0.5775, + "step": 16471 + }, + { + "epoch": 1.1900229378510665, + "grad_norm": 6.460355610227327, + "learning_rate": 1.8591938681874485e-06, + "loss": 0.7218, + "step": 16472 + }, + { + "epoch": 1.190095183051276, + "grad_norm": 7.496800566761328, + "learning_rate": 1.8589111423012702e-06, + "loss": 0.6436, + "step": 16473 + }, + { + "epoch": 1.1901674282514856, + "grad_norm": 7.145606509094417, + "learning_rate": 1.8586284251911008e-06, + "loss": 0.651, + "step": 16474 + }, + { + "epoch": 1.1902396734516951, + "grad_norm": 7.588937749499036, + "learning_rate": 1.8583457168608098e-06, + "loss": 0.6814, + "step": 16475 + }, + { + "epoch": 1.1903119186519047, + "grad_norm": 7.131100368685167, + "learning_rate": 1.8580630173142685e-06, + "loss": 0.6722, + "step": 16476 + }, + { + "epoch": 1.190384163852114, + "grad_norm": 6.529430373205243, + "learning_rate": 1.8577803265553457e-06, + "loss": 0.6654, + "step": 16477 + }, + { + "epoch": 1.1904564090523235, + "grad_norm": 6.342054619704596, + "learning_rate": 1.857497644587911e-06, + "loss": 0.6452, + "step": 16478 + }, + { + "epoch": 1.190528654252533, + "grad_norm": 7.983732611985307, + "learning_rate": 1.8572149714158366e-06, + "loss": 0.7103, + "step": 16479 + }, + { + "epoch": 1.1906008994527426, + "grad_norm": 8.328028350899084, + "learning_rate": 1.8569323070429883e-06, + "loss": 0.6459, + "step": 16480 + }, + { + "epoch": 1.1906731446529522, + "grad_norm": 7.190911586310784, + "learning_rate": 1.8566496514732375e-06, + "loss": 0.6562, + "step": 16481 + }, + { + "epoch": 1.1907453898531617, + "grad_norm": 7.643831606013685, + "learning_rate": 1.8563670047104536e-06, + "loss": 0.5965, + "step": 16482 + }, + { + "epoch": 1.1908176350533712, + "grad_norm": 6.414385732853457, + "learning_rate": 1.856084366758507e-06, + "loss": 0.6047, + "step": 16483 + }, + { + "epoch": 1.1908898802535806, + "grad_norm": 8.640171584085666, + "learning_rate": 1.8558017376212639e-06, + "loss": 0.704, + "step": 16484 + }, + { + "epoch": 1.19096212545379, + "grad_norm": 7.960686407350303, + "learning_rate": 1.8555191173025954e-06, + "loss": 0.6088, + "step": 16485 + }, + { + "epoch": 1.1910343706539996, + "grad_norm": 8.777071681642317, + "learning_rate": 1.8552365058063692e-06, + "loss": 0.6618, + "step": 16486 + }, + { + "epoch": 1.1911066158542092, + "grad_norm": 6.381210891947528, + "learning_rate": 1.8549539031364555e-06, + "loss": 0.6176, + "step": 16487 + }, + { + "epoch": 1.1911788610544187, + "grad_norm": 7.108349267092784, + "learning_rate": 1.8546713092967216e-06, + "loss": 0.6702, + "step": 16488 + }, + { + "epoch": 1.1912511062546283, + "grad_norm": 7.411270178530782, + "learning_rate": 1.8543887242910362e-06, + "loss": 0.6845, + "step": 16489 + }, + { + "epoch": 1.1913233514548378, + "grad_norm": 6.1469116644257955, + "learning_rate": 1.8541061481232685e-06, + "loss": 0.6728, + "step": 16490 + }, + { + "epoch": 1.1913955966550471, + "grad_norm": 8.246696966923048, + "learning_rate": 1.8538235807972854e-06, + "loss": 0.6253, + "step": 16491 + }, + { + "epoch": 1.1914678418552567, + "grad_norm": 6.294226456215958, + "learning_rate": 1.8535410223169558e-06, + "loss": 0.6964, + "step": 16492 + }, + { + "epoch": 1.1915400870554662, + "grad_norm": 7.189654483478109, + "learning_rate": 1.853258472686148e-06, + "loss": 0.791, + "step": 16493 + }, + { + "epoch": 1.1916123322556758, + "grad_norm": 6.988874040327419, + "learning_rate": 1.8529759319087303e-06, + "loss": 0.6745, + "step": 16494 + }, + { + "epoch": 1.1916845774558853, + "grad_norm": 8.018009894357553, + "learning_rate": 1.8526933999885683e-06, + "loss": 0.6586, + "step": 16495 + }, + { + "epoch": 1.1917568226560948, + "grad_norm": 6.688697778978777, + "learning_rate": 1.8524108769295313e-06, + "loss": 0.703, + "step": 16496 + }, + { + "epoch": 1.1918290678563044, + "grad_norm": 6.110202899067546, + "learning_rate": 1.852128362735488e-06, + "loss": 0.6062, + "step": 16497 + }, + { + "epoch": 1.1919013130565137, + "grad_norm": 5.80542184695005, + "learning_rate": 1.8518458574103032e-06, + "loss": 0.6032, + "step": 16498 + }, + { + "epoch": 1.1919735582567232, + "grad_norm": 5.820299719470029, + "learning_rate": 1.851563360957845e-06, + "loss": 0.6773, + "step": 16499 + }, + { + "epoch": 1.1920458034569328, + "grad_norm": 6.07356181134035, + "learning_rate": 1.8512808733819815e-06, + "loss": 0.6764, + "step": 16500 + }, + { + "epoch": 1.1921180486571423, + "grad_norm": 6.027493428654295, + "learning_rate": 1.8509983946865791e-06, + "loss": 0.6306, + "step": 16501 + }, + { + "epoch": 1.1921902938573519, + "grad_norm": 6.15795100379871, + "learning_rate": 1.8507159248755048e-06, + "loss": 0.6467, + "step": 16502 + }, + { + "epoch": 1.1922625390575614, + "grad_norm": 8.028129713651788, + "learning_rate": 1.850433463952625e-06, + "loss": 0.7188, + "step": 16503 + }, + { + "epoch": 1.192334784257771, + "grad_norm": 6.586125164141232, + "learning_rate": 1.8501510119218073e-06, + "loss": 0.6628, + "step": 16504 + }, + { + "epoch": 1.1924070294579803, + "grad_norm": 7.303775636600887, + "learning_rate": 1.8498685687869169e-06, + "loss": 0.6768, + "step": 16505 + }, + { + "epoch": 1.1924792746581898, + "grad_norm": 8.192671911340952, + "learning_rate": 1.8495861345518211e-06, + "loss": 0.7312, + "step": 16506 + }, + { + "epoch": 1.1925515198583994, + "grad_norm": 8.539284077637525, + "learning_rate": 1.8493037092203864e-06, + "loss": 0.7052, + "step": 16507 + }, + { + "epoch": 1.192623765058609, + "grad_norm": 7.035890124739876, + "learning_rate": 1.8490212927964796e-06, + "loss": 0.7053, + "step": 16508 + }, + { + "epoch": 1.1926960102588184, + "grad_norm": 6.307135086783038, + "learning_rate": 1.8487388852839643e-06, + "loss": 0.573, + "step": 16509 + }, + { + "epoch": 1.192768255459028, + "grad_norm": 9.29144857062274, + "learning_rate": 1.8484564866867083e-06, + "loss": 0.715, + "step": 16510 + }, + { + "epoch": 1.1928405006592375, + "grad_norm": 8.06057598015402, + "learning_rate": 1.8481740970085774e-06, + "loss": 0.6757, + "step": 16511 + }, + { + "epoch": 1.1929127458594468, + "grad_norm": 6.5288659814257155, + "learning_rate": 1.847891716253438e-06, + "loss": 0.6534, + "step": 16512 + }, + { + "epoch": 1.1929849910596564, + "grad_norm": 7.115875022626978, + "learning_rate": 1.8476093444251537e-06, + "loss": 0.6455, + "step": 16513 + }, + { + "epoch": 1.193057236259866, + "grad_norm": 7.8883539236339875, + "learning_rate": 1.8473269815275908e-06, + "loss": 0.6572, + "step": 16514 + }, + { + "epoch": 1.1931294814600755, + "grad_norm": 8.241284014462265, + "learning_rate": 1.847044627564615e-06, + "loss": 0.6408, + "step": 16515 + }, + { + "epoch": 1.193201726660285, + "grad_norm": 6.6805072482189445, + "learning_rate": 1.846762282540091e-06, + "loss": 0.6433, + "step": 16516 + }, + { + "epoch": 1.1932739718604946, + "grad_norm": 7.294205171034637, + "learning_rate": 1.8464799464578841e-06, + "loss": 0.6356, + "step": 16517 + }, + { + "epoch": 1.193346217060704, + "grad_norm": 7.543824900434657, + "learning_rate": 1.8461976193218595e-06, + "loss": 0.6696, + "step": 16518 + }, + { + "epoch": 1.1934184622609136, + "grad_norm": 6.318787417881143, + "learning_rate": 1.8459153011358822e-06, + "loss": 0.6658, + "step": 16519 + }, + { + "epoch": 1.193490707461123, + "grad_norm": 6.409844706494608, + "learning_rate": 1.845632991903816e-06, + "loss": 0.6092, + "step": 16520 + }, + { + "epoch": 1.1935629526613325, + "grad_norm": 5.995066521842636, + "learning_rate": 1.845350691629526e-06, + "loss": 0.6811, + "step": 16521 + }, + { + "epoch": 1.193635197861542, + "grad_norm": 6.320970046044994, + "learning_rate": 1.8450684003168778e-06, + "loss": 0.7549, + "step": 16522 + }, + { + "epoch": 1.1937074430617516, + "grad_norm": 6.342379116519764, + "learning_rate": 1.844786117969734e-06, + "loss": 0.6385, + "step": 16523 + }, + { + "epoch": 1.1937796882619611, + "grad_norm": 6.381558205151186, + "learning_rate": 1.8445038445919586e-06, + "loss": 0.5641, + "step": 16524 + }, + { + "epoch": 1.1938519334621707, + "grad_norm": 6.402509388110552, + "learning_rate": 1.8442215801874175e-06, + "loss": 0.6683, + "step": 16525 + }, + { + "epoch": 1.1939241786623802, + "grad_norm": 7.194082148727973, + "learning_rate": 1.8439393247599744e-06, + "loss": 0.6516, + "step": 16526 + }, + { + "epoch": 1.1939964238625898, + "grad_norm": 6.218433774642867, + "learning_rate": 1.8436570783134915e-06, + "loss": 0.6325, + "step": 16527 + }, + { + "epoch": 1.194068669062799, + "grad_norm": 6.657508695269416, + "learning_rate": 1.8433748408518337e-06, + "loss": 0.6907, + "step": 16528 + }, + { + "epoch": 1.1941409142630086, + "grad_norm": 6.134151550669581, + "learning_rate": 1.843092612378865e-06, + "loss": 0.6316, + "step": 16529 + }, + { + "epoch": 1.1942131594632182, + "grad_norm": 7.8218701132844055, + "learning_rate": 1.842810392898448e-06, + "loss": 0.6311, + "step": 16530 + }, + { + "epoch": 1.1942854046634277, + "grad_norm": 5.3297429914222185, + "learning_rate": 1.8425281824144463e-06, + "loss": 0.6505, + "step": 16531 + }, + { + "epoch": 1.1943576498636372, + "grad_norm": 6.634992153735146, + "learning_rate": 1.8422459809307237e-06, + "loss": 0.599, + "step": 16532 + }, + { + "epoch": 1.1944298950638468, + "grad_norm": 7.848885485018468, + "learning_rate": 1.841963788451143e-06, + "loss": 0.6549, + "step": 16533 + }, + { + "epoch": 1.1945021402640563, + "grad_norm": 6.1674312770495385, + "learning_rate": 1.8416816049795666e-06, + "loss": 0.7018, + "step": 16534 + }, + { + "epoch": 1.1945743854642656, + "grad_norm": 6.024782814765018, + "learning_rate": 1.841399430519858e-06, + "loss": 0.6145, + "step": 16535 + }, + { + "epoch": 1.1946466306644752, + "grad_norm": 7.946203071248036, + "learning_rate": 1.8411172650758807e-06, + "loss": 0.6579, + "step": 16536 + }, + { + "epoch": 1.1947188758646847, + "grad_norm": 5.947637637174508, + "learning_rate": 1.840835108651496e-06, + "loss": 0.6426, + "step": 16537 + }, + { + "epoch": 1.1947911210648943, + "grad_norm": 6.06412948260803, + "learning_rate": 1.8405529612505656e-06, + "loss": 0.6125, + "step": 16538 + }, + { + "epoch": 1.1948633662651038, + "grad_norm": 6.705127905892251, + "learning_rate": 1.840270822876954e-06, + "loss": 0.686, + "step": 16539 + }, + { + "epoch": 1.1949356114653134, + "grad_norm": 6.411874971918712, + "learning_rate": 1.8399886935345234e-06, + "loss": 0.7178, + "step": 16540 + }, + { + "epoch": 1.195007856665523, + "grad_norm": 9.39755019060611, + "learning_rate": 1.8397065732271342e-06, + "loss": 0.6446, + "step": 16541 + }, + { + "epoch": 1.1950801018657322, + "grad_norm": 7.038010259258211, + "learning_rate": 1.8394244619586493e-06, + "loss": 0.6851, + "step": 16542 + }, + { + "epoch": 1.1951523470659418, + "grad_norm": 6.526847268016141, + "learning_rate": 1.8391423597329305e-06, + "loss": 0.6498, + "step": 16543 + }, + { + "epoch": 1.1952245922661513, + "grad_norm": 6.279182966987835, + "learning_rate": 1.8388602665538407e-06, + "loss": 0.7005, + "step": 16544 + }, + { + "epoch": 1.1952968374663608, + "grad_norm": 6.9443957450006835, + "learning_rate": 1.8385781824252397e-06, + "loss": 0.541, + "step": 16545 + }, + { + "epoch": 1.1953690826665704, + "grad_norm": 6.559766218160069, + "learning_rate": 1.8382961073509897e-06, + "loss": 0.5938, + "step": 16546 + }, + { + "epoch": 1.19544132786678, + "grad_norm": 6.325365006239418, + "learning_rate": 1.8380140413349529e-06, + "loss": 0.697, + "step": 16547 + }, + { + "epoch": 1.1955135730669895, + "grad_norm": 8.46550618225957, + "learning_rate": 1.837731984380989e-06, + "loss": 0.7274, + "step": 16548 + }, + { + "epoch": 1.1955858182671988, + "grad_norm": 5.9239466121104405, + "learning_rate": 1.8374499364929605e-06, + "loss": 0.623, + "step": 16549 + }, + { + "epoch": 1.1956580634674083, + "grad_norm": 6.918247762230648, + "learning_rate": 1.8371678976747277e-06, + "loss": 0.598, + "step": 16550 + }, + { + "epoch": 1.1957303086676179, + "grad_norm": 6.392410140914612, + "learning_rate": 1.8368858679301527e-06, + "loss": 0.623, + "step": 16551 + }, + { + "epoch": 1.1958025538678274, + "grad_norm": 6.872090781841228, + "learning_rate": 1.8366038472630937e-06, + "loss": 0.6146, + "step": 16552 + }, + { + "epoch": 1.195874799068037, + "grad_norm": 7.140011950998429, + "learning_rate": 1.8363218356774135e-06, + "loss": 0.6449, + "step": 16553 + }, + { + "epoch": 1.1959470442682465, + "grad_norm": 7.710487793212395, + "learning_rate": 1.836039833176973e-06, + "loss": 0.7487, + "step": 16554 + }, + { + "epoch": 1.196019289468456, + "grad_norm": 6.84109292205303, + "learning_rate": 1.8357578397656309e-06, + "loss": 0.6019, + "step": 16555 + }, + { + "epoch": 1.1960915346686654, + "grad_norm": 7.246745464290763, + "learning_rate": 1.835475855447248e-06, + "loss": 0.6466, + "step": 16556 + }, + { + "epoch": 1.196163779868875, + "grad_norm": 7.620215243606096, + "learning_rate": 1.8351938802256846e-06, + "loss": 0.6053, + "step": 16557 + }, + { + "epoch": 1.1962360250690844, + "grad_norm": 6.344398202336033, + "learning_rate": 1.8349119141048017e-06, + "loss": 0.6625, + "step": 16558 + }, + { + "epoch": 1.196308270269294, + "grad_norm": 7.322505007840128, + "learning_rate": 1.8346299570884575e-06, + "loss": 0.6577, + "step": 16559 + }, + { + "epoch": 1.1963805154695035, + "grad_norm": 6.922895300349386, + "learning_rate": 1.8343480091805126e-06, + "loss": 0.6844, + "step": 16560 + }, + { + "epoch": 1.196452760669713, + "grad_norm": 6.380449247418002, + "learning_rate": 1.8340660703848273e-06, + "loss": 0.715, + "step": 16561 + }, + { + "epoch": 1.1965250058699226, + "grad_norm": 7.587095256737881, + "learning_rate": 1.8337841407052597e-06, + "loss": 0.7096, + "step": 16562 + }, + { + "epoch": 1.196597251070132, + "grad_norm": 7.906326956996386, + "learning_rate": 1.8335022201456699e-06, + "loss": 0.7375, + "step": 16563 + }, + { + "epoch": 1.1966694962703415, + "grad_norm": 6.746922109138162, + "learning_rate": 1.833220308709917e-06, + "loss": 0.684, + "step": 16564 + }, + { + "epoch": 1.196741741470551, + "grad_norm": 6.393626508517675, + "learning_rate": 1.832938406401862e-06, + "loss": 0.6678, + "step": 16565 + }, + { + "epoch": 1.1968139866707606, + "grad_norm": 6.401920400032291, + "learning_rate": 1.8326565132253605e-06, + "loss": 0.5975, + "step": 16566 + }, + { + "epoch": 1.19688623187097, + "grad_norm": 7.193273729720258, + "learning_rate": 1.8323746291842733e-06, + "loss": 0.6389, + "step": 16567 + }, + { + "epoch": 1.1969584770711796, + "grad_norm": 6.180933078089449, + "learning_rate": 1.8320927542824596e-06, + "loss": 0.6542, + "step": 16568 + }, + { + "epoch": 1.1970307222713892, + "grad_norm": 6.073921691892173, + "learning_rate": 1.8318108885237782e-06, + "loss": 0.6231, + "step": 16569 + }, + { + "epoch": 1.1971029674715985, + "grad_norm": 6.978802328895752, + "learning_rate": 1.831529031912086e-06, + "loss": 0.7213, + "step": 16570 + }, + { + "epoch": 1.197175212671808, + "grad_norm": 8.103516331118913, + "learning_rate": 1.831247184451242e-06, + "loss": 0.7041, + "step": 16571 + }, + { + "epoch": 1.1972474578720176, + "grad_norm": 7.819556139669025, + "learning_rate": 1.8309653461451054e-06, + "loss": 0.693, + "step": 16572 + }, + { + "epoch": 1.1973197030722271, + "grad_norm": 7.372614070492356, + "learning_rate": 1.8306835169975338e-06, + "loss": 0.6731, + "step": 16573 + }, + { + "epoch": 1.1973919482724367, + "grad_norm": 6.256316693204862, + "learning_rate": 1.8304016970123845e-06, + "loss": 0.6247, + "step": 16574 + }, + { + "epoch": 1.1974641934726462, + "grad_norm": 6.866543649602362, + "learning_rate": 1.8301198861935165e-06, + "loss": 0.6031, + "step": 16575 + }, + { + "epoch": 1.1975364386728558, + "grad_norm": 6.5700503556689265, + "learning_rate": 1.8298380845447872e-06, + "loss": 0.5949, + "step": 16576 + }, + { + "epoch": 1.197608683873065, + "grad_norm": 6.408551058731082, + "learning_rate": 1.8295562920700542e-06, + "loss": 0.6652, + "step": 16577 + }, + { + "epoch": 1.1976809290732746, + "grad_norm": 6.352228521545915, + "learning_rate": 1.8292745087731745e-06, + "loss": 0.6386, + "step": 16578 + }, + { + "epoch": 1.1977531742734842, + "grad_norm": 6.4935258755085, + "learning_rate": 1.8289927346580073e-06, + "loss": 0.6338, + "step": 16579 + }, + { + "epoch": 1.1978254194736937, + "grad_norm": 6.874589110146847, + "learning_rate": 1.8287109697284069e-06, + "loss": 0.659, + "step": 16580 + }, + { + "epoch": 1.1978976646739032, + "grad_norm": 7.479020084513783, + "learning_rate": 1.8284292139882326e-06, + "loss": 0.7318, + "step": 16581 + }, + { + "epoch": 1.1979699098741128, + "grad_norm": 6.399433957340698, + "learning_rate": 1.8281474674413413e-06, + "loss": 0.5875, + "step": 16582 + }, + { + "epoch": 1.1980421550743223, + "grad_norm": 6.927962887406882, + "learning_rate": 1.8278657300915902e-06, + "loss": 0.6669, + "step": 16583 + }, + { + "epoch": 1.1981144002745316, + "grad_norm": 6.390022678484245, + "learning_rate": 1.8275840019428342e-06, + "loss": 0.578, + "step": 16584 + }, + { + "epoch": 1.1981866454747412, + "grad_norm": 5.823184130627285, + "learning_rate": 1.8273022829989312e-06, + "loss": 0.6259, + "step": 16585 + }, + { + "epoch": 1.1982588906749507, + "grad_norm": 5.2588890753314645, + "learning_rate": 1.8270205732637383e-06, + "loss": 0.6547, + "step": 16586 + }, + { + "epoch": 1.1983311358751603, + "grad_norm": 7.895668880256466, + "learning_rate": 1.8267388727411106e-06, + "loss": 0.7156, + "step": 16587 + }, + { + "epoch": 1.1984033810753698, + "grad_norm": 6.196135380900523, + "learning_rate": 1.8264571814349053e-06, + "loss": 0.6744, + "step": 16588 + }, + { + "epoch": 1.1984756262755794, + "grad_norm": 8.557560965041976, + "learning_rate": 1.8261754993489777e-06, + "loss": 0.614, + "step": 16589 + }, + { + "epoch": 1.198547871475789, + "grad_norm": 7.262524014825559, + "learning_rate": 1.825893826487185e-06, + "loss": 0.6942, + "step": 16590 + }, + { + "epoch": 1.1986201166759982, + "grad_norm": 7.647263090935372, + "learning_rate": 1.825612162853382e-06, + "loss": 0.6781, + "step": 16591 + }, + { + "epoch": 1.1986923618762078, + "grad_norm": 9.197675195608195, + "learning_rate": 1.825330508451425e-06, + "loss": 0.641, + "step": 16592 + }, + { + "epoch": 1.1987646070764173, + "grad_norm": 7.090568467608761, + "learning_rate": 1.8250488632851693e-06, + "loss": 0.5833, + "step": 16593 + }, + { + "epoch": 1.1988368522766268, + "grad_norm": 6.1350155900603225, + "learning_rate": 1.824767227358471e-06, + "loss": 0.6464, + "step": 16594 + }, + { + "epoch": 1.1989090974768364, + "grad_norm": 6.981837825223715, + "learning_rate": 1.824485600675185e-06, + "loss": 0.6664, + "step": 16595 + }, + { + "epoch": 1.198981342677046, + "grad_norm": 6.617010517298213, + "learning_rate": 1.824203983239166e-06, + "loss": 0.6614, + "step": 16596 + }, + { + "epoch": 1.1990535878772555, + "grad_norm": 5.784188838129053, + "learning_rate": 1.823922375054271e-06, + "loss": 0.6528, + "step": 16597 + }, + { + "epoch": 1.199125833077465, + "grad_norm": 6.548932974294869, + "learning_rate": 1.823640776124353e-06, + "loss": 0.6503, + "step": 16598 + }, + { + "epoch": 1.1991980782776746, + "grad_norm": 6.549314495061648, + "learning_rate": 1.8233591864532674e-06, + "loss": 0.6472, + "step": 16599 + }, + { + "epoch": 1.1992703234778839, + "grad_norm": 6.160720673551861, + "learning_rate": 1.8230776060448684e-06, + "loss": 0.6453, + "step": 16600 + }, + { + "epoch": 1.1993425686780934, + "grad_norm": 7.079585112145161, + "learning_rate": 1.8227960349030133e-06, + "loss": 0.7066, + "step": 16601 + }, + { + "epoch": 1.199414813878303, + "grad_norm": 5.963273816708161, + "learning_rate": 1.8225144730315537e-06, + "loss": 0.6581, + "step": 16602 + }, + { + "epoch": 1.1994870590785125, + "grad_norm": 6.4352869470459755, + "learning_rate": 1.8222329204343448e-06, + "loss": 0.61, + "step": 16603 + }, + { + "epoch": 1.199559304278722, + "grad_norm": 6.6058285263863405, + "learning_rate": 1.8219513771152414e-06, + "loss": 0.6515, + "step": 16604 + }, + { + "epoch": 1.1996315494789316, + "grad_norm": 6.543899187575282, + "learning_rate": 1.8216698430780965e-06, + "loss": 0.6731, + "step": 16605 + }, + { + "epoch": 1.1997037946791411, + "grad_norm": 6.937272626570281, + "learning_rate": 1.821388318326765e-06, + "loss": 0.66, + "step": 16606 + }, + { + "epoch": 1.1997760398793504, + "grad_norm": 5.419635233129686, + "learning_rate": 1.8211068028651005e-06, + "loss": 0.6269, + "step": 16607 + }, + { + "epoch": 1.19984828507956, + "grad_norm": 7.443336568861021, + "learning_rate": 1.8208252966969572e-06, + "loss": 0.6844, + "step": 16608 + }, + { + "epoch": 1.1999205302797695, + "grad_norm": 6.159887178552074, + "learning_rate": 1.8205437998261876e-06, + "loss": 0.6796, + "step": 16609 + }, + { + "epoch": 1.199992775479979, + "grad_norm": 5.407952365868611, + "learning_rate": 1.820262312256646e-06, + "loss": 0.6696, + "step": 16610 + }, + { + "epoch": 1.2000650206801886, + "grad_norm": 6.341957778698694, + "learning_rate": 1.8199808339921868e-06, + "loss": 0.6129, + "step": 16611 + }, + { + "epoch": 1.2001372658803982, + "grad_norm": 6.853688170206158, + "learning_rate": 1.8196993650366607e-06, + "loss": 0.6086, + "step": 16612 + }, + { + "epoch": 1.2002095110806077, + "grad_norm": 7.3238330627616826, + "learning_rate": 1.8194179053939221e-06, + "loss": 0.6104, + "step": 16613 + }, + { + "epoch": 1.200281756280817, + "grad_norm": 6.290172711110221, + "learning_rate": 1.8191364550678237e-06, + "loss": 0.6738, + "step": 16614 + }, + { + "epoch": 1.2003540014810266, + "grad_norm": 8.059069472900886, + "learning_rate": 1.8188550140622194e-06, + "loss": 0.6294, + "step": 16615 + }, + { + "epoch": 1.200426246681236, + "grad_norm": 6.349391289318453, + "learning_rate": 1.8185735823809606e-06, + "loss": 0.5913, + "step": 16616 + }, + { + "epoch": 1.2004984918814456, + "grad_norm": 6.757462791258575, + "learning_rate": 1.8182921600279002e-06, + "loss": 0.7, + "step": 16617 + }, + { + "epoch": 1.2005707370816552, + "grad_norm": 6.293331843769307, + "learning_rate": 1.8180107470068907e-06, + "loss": 0.6565, + "step": 16618 + }, + { + "epoch": 1.2006429822818647, + "grad_norm": 6.003047804805267, + "learning_rate": 1.8177293433217853e-06, + "loss": 0.5605, + "step": 16619 + }, + { + "epoch": 1.2007152274820743, + "grad_norm": 7.803345710954242, + "learning_rate": 1.8174479489764348e-06, + "loss": 0.6618, + "step": 16620 + }, + { + "epoch": 1.2007874726822836, + "grad_norm": 7.7249496211412225, + "learning_rate": 1.8171665639746921e-06, + "loss": 0.6805, + "step": 16621 + }, + { + "epoch": 1.2008597178824931, + "grad_norm": 7.571955546833503, + "learning_rate": 1.8168851883204092e-06, + "loss": 0.6014, + "step": 16622 + }, + { + "epoch": 1.2009319630827027, + "grad_norm": 6.413673231088839, + "learning_rate": 1.8166038220174372e-06, + "loss": 0.699, + "step": 16623 + }, + { + "epoch": 1.2010042082829122, + "grad_norm": 6.2366260107371145, + "learning_rate": 1.8163224650696284e-06, + "loss": 0.6891, + "step": 16624 + }, + { + "epoch": 1.2010764534831218, + "grad_norm": 6.5340841614458425, + "learning_rate": 1.816041117480834e-06, + "loss": 0.6306, + "step": 16625 + }, + { + "epoch": 1.2011486986833313, + "grad_norm": 6.133752291647664, + "learning_rate": 1.8157597792549069e-06, + "loss": 0.6286, + "step": 16626 + }, + { + "epoch": 1.2012209438835408, + "grad_norm": 6.007176240284276, + "learning_rate": 1.8154784503956963e-06, + "loss": 0.6639, + "step": 16627 + }, + { + "epoch": 1.2012931890837502, + "grad_norm": 7.000591525561513, + "learning_rate": 1.8151971309070537e-06, + "loss": 0.5907, + "step": 16628 + }, + { + "epoch": 1.2013654342839597, + "grad_norm": 10.999228710530684, + "learning_rate": 1.8149158207928313e-06, + "loss": 0.7088, + "step": 16629 + }, + { + "epoch": 1.2014376794841692, + "grad_norm": 6.038159459588975, + "learning_rate": 1.8146345200568793e-06, + "loss": 0.5767, + "step": 16630 + }, + { + "epoch": 1.2015099246843788, + "grad_norm": 6.423830797259249, + "learning_rate": 1.814353228703048e-06, + "loss": 0.6724, + "step": 16631 + }, + { + "epoch": 1.2015821698845883, + "grad_norm": 5.381471685612753, + "learning_rate": 1.8140719467351892e-06, + "loss": 0.5884, + "step": 16632 + }, + { + "epoch": 1.2016544150847979, + "grad_norm": 6.403998746968325, + "learning_rate": 1.813790674157153e-06, + "loss": 0.7206, + "step": 16633 + }, + { + "epoch": 1.2017266602850074, + "grad_norm": 6.438699258742503, + "learning_rate": 1.813509410972789e-06, + "loss": 0.7042, + "step": 16634 + }, + { + "epoch": 1.2017989054852167, + "grad_norm": 7.038003755092026, + "learning_rate": 1.8132281571859483e-06, + "loss": 0.6549, + "step": 16635 + }, + { + "epoch": 1.2018711506854263, + "grad_norm": 5.8770108941162045, + "learning_rate": 1.8129469128004823e-06, + "loss": 0.6367, + "step": 16636 + }, + { + "epoch": 1.2019433958856358, + "grad_norm": 6.676563482733077, + "learning_rate": 1.8126656778202376e-06, + "loss": 0.6377, + "step": 16637 + }, + { + "epoch": 1.2020156410858454, + "grad_norm": 6.011279947131326, + "learning_rate": 1.8123844522490666e-06, + "loss": 0.6337, + "step": 16638 + }, + { + "epoch": 1.202087886286055, + "grad_norm": 7.365804597059343, + "learning_rate": 1.8121032360908185e-06, + "loss": 0.7007, + "step": 16639 + }, + { + "epoch": 1.2021601314862644, + "grad_norm": 6.202295050842476, + "learning_rate": 1.8118220293493443e-06, + "loss": 0.6226, + "step": 16640 + }, + { + "epoch": 1.202232376686474, + "grad_norm": 5.617850273834178, + "learning_rate": 1.811540832028491e-06, + "loss": 0.623, + "step": 16641 + }, + { + "epoch": 1.2023046218866833, + "grad_norm": 7.805290130168979, + "learning_rate": 1.8112596441321095e-06, + "loss": 0.7192, + "step": 16642 + }, + { + "epoch": 1.2023768670868928, + "grad_norm": 6.9274045329569445, + "learning_rate": 1.8109784656640478e-06, + "loss": 0.6634, + "step": 16643 + }, + { + "epoch": 1.2024491122871024, + "grad_norm": 6.775774876949903, + "learning_rate": 1.8106972966281575e-06, + "loss": 0.7116, + "step": 16644 + }, + { + "epoch": 1.202521357487312, + "grad_norm": 6.798753500739555, + "learning_rate": 1.810416137028285e-06, + "loss": 0.6355, + "step": 16645 + }, + { + "epoch": 1.2025936026875215, + "grad_norm": 6.6832607059226765, + "learning_rate": 1.81013498686828e-06, + "loss": 0.7011, + "step": 16646 + }, + { + "epoch": 1.202665847887731, + "grad_norm": 7.229468640736508, + "learning_rate": 1.8098538461519921e-06, + "loss": 0.6375, + "step": 16647 + }, + { + "epoch": 1.2027380930879406, + "grad_norm": 6.497061431984198, + "learning_rate": 1.8095727148832687e-06, + "loss": 0.6835, + "step": 16648 + }, + { + "epoch": 1.2028103382881499, + "grad_norm": 7.1400063411566945, + "learning_rate": 1.8092915930659588e-06, + "loss": 0.6839, + "step": 16649 + }, + { + "epoch": 1.2028825834883594, + "grad_norm": 6.310234947606479, + "learning_rate": 1.8090104807039105e-06, + "loss": 0.7378, + "step": 16650 + }, + { + "epoch": 1.202954828688569, + "grad_norm": 5.796620982894771, + "learning_rate": 1.8087293778009729e-06, + "loss": 0.6031, + "step": 16651 + }, + { + "epoch": 1.2030270738887785, + "grad_norm": 6.517642506986475, + "learning_rate": 1.8084482843609927e-06, + "loss": 0.6214, + "step": 16652 + }, + { + "epoch": 1.203099319088988, + "grad_norm": 6.2353524037049155, + "learning_rate": 1.8081672003878186e-06, + "loss": 0.6425, + "step": 16653 + }, + { + "epoch": 1.2031715642891976, + "grad_norm": 7.921765448962065, + "learning_rate": 1.8078861258852992e-06, + "loss": 0.5797, + "step": 16654 + }, + { + "epoch": 1.2032438094894071, + "grad_norm": 6.933913945071445, + "learning_rate": 1.8076050608572804e-06, + "loss": 0.6675, + "step": 16655 + }, + { + "epoch": 1.2033160546896164, + "grad_norm": 7.964413409092206, + "learning_rate": 1.8073240053076108e-06, + "loss": 0.6699, + "step": 16656 + }, + { + "epoch": 1.203388299889826, + "grad_norm": 6.281008587653246, + "learning_rate": 1.8070429592401373e-06, + "loss": 0.6584, + "step": 16657 + }, + { + "epoch": 1.2034605450900355, + "grad_norm": 6.413212858140478, + "learning_rate": 1.806761922658709e-06, + "loss": 0.6001, + "step": 16658 + }, + { + "epoch": 1.203532790290245, + "grad_norm": 6.629229077473571, + "learning_rate": 1.806480895567171e-06, + "loss": 0.6539, + "step": 16659 + }, + { + "epoch": 1.2036050354904546, + "grad_norm": 6.051338854734087, + "learning_rate": 1.8061998779693706e-06, + "loss": 0.6204, + "step": 16660 + }, + { + "epoch": 1.2036772806906642, + "grad_norm": 6.07604034683664, + "learning_rate": 1.805918869869156e-06, + "loss": 0.6434, + "step": 16661 + }, + { + "epoch": 1.2037495258908737, + "grad_norm": 6.89489890352952, + "learning_rate": 1.8056378712703727e-06, + "loss": 0.7696, + "step": 16662 + }, + { + "epoch": 1.203821771091083, + "grad_norm": 7.202287405968595, + "learning_rate": 1.8053568821768674e-06, + "loss": 0.7057, + "step": 16663 + }, + { + "epoch": 1.2038940162912926, + "grad_norm": 6.2250218034845, + "learning_rate": 1.8050759025924874e-06, + "loss": 0.7108, + "step": 16664 + }, + { + "epoch": 1.203966261491502, + "grad_norm": 7.287109113257165, + "learning_rate": 1.8047949325210793e-06, + "loss": 0.621, + "step": 16665 + }, + { + "epoch": 1.2040385066917116, + "grad_norm": 6.955724474582687, + "learning_rate": 1.8045139719664881e-06, + "loss": 0.6676, + "step": 16666 + }, + { + "epoch": 1.2041107518919212, + "grad_norm": 6.272380278970151, + "learning_rate": 1.8042330209325604e-06, + "loss": 0.6894, + "step": 16667 + }, + { + "epoch": 1.2041829970921307, + "grad_norm": 6.593866283964271, + "learning_rate": 1.803952079423144e-06, + "loss": 0.6411, + "step": 16668 + }, + { + "epoch": 1.2042552422923403, + "grad_norm": 6.825708585852066, + "learning_rate": 1.8036711474420816e-06, + "loss": 0.6269, + "step": 16669 + }, + { + "epoch": 1.2043274874925498, + "grad_norm": 6.66903247181384, + "learning_rate": 1.8033902249932205e-06, + "loss": 0.7063, + "step": 16670 + }, + { + "epoch": 1.2043997326927591, + "grad_norm": 6.555642459674111, + "learning_rate": 1.8031093120804063e-06, + "loss": 0.615, + "step": 16671 + }, + { + "epoch": 1.2044719778929687, + "grad_norm": 6.714402890559801, + "learning_rate": 1.802828408707486e-06, + "loss": 0.6498, + "step": 16672 + }, + { + "epoch": 1.2045442230931782, + "grad_norm": 5.328853971221809, + "learning_rate": 1.8025475148783023e-06, + "loss": 0.6002, + "step": 16673 + }, + { + "epoch": 1.2046164682933878, + "grad_norm": 6.9949267940767035, + "learning_rate": 1.8022666305967012e-06, + "loss": 0.6776, + "step": 16674 + }, + { + "epoch": 1.2046887134935973, + "grad_norm": 6.126219958823444, + "learning_rate": 1.8019857558665289e-06, + "loss": 0.6304, + "step": 16675 + }, + { + "epoch": 1.2047609586938068, + "grad_norm": 7.926675699975866, + "learning_rate": 1.8017048906916295e-06, + "loss": 0.7045, + "step": 16676 + }, + { + "epoch": 1.2048332038940164, + "grad_norm": 5.831760775815302, + "learning_rate": 1.8014240350758476e-06, + "loss": 0.6083, + "step": 16677 + }, + { + "epoch": 1.204905449094226, + "grad_norm": 6.712771288399548, + "learning_rate": 1.8011431890230286e-06, + "loss": 0.719, + "step": 16678 + }, + { + "epoch": 1.2049776942944352, + "grad_norm": 6.604409808026264, + "learning_rate": 1.8008623525370168e-06, + "loss": 0.666, + "step": 16679 + }, + { + "epoch": 1.2050499394946448, + "grad_norm": 6.238706838175305, + "learning_rate": 1.8005815256216563e-06, + "loss": 0.675, + "step": 16680 + }, + { + "epoch": 1.2051221846948543, + "grad_norm": 6.431855921844428, + "learning_rate": 1.8003007082807916e-06, + "loss": 0.5756, + "step": 16681 + }, + { + "epoch": 1.2051944298950639, + "grad_norm": 6.618395410248355, + "learning_rate": 1.8000199005182667e-06, + "loss": 0.6201, + "step": 16682 + }, + { + "epoch": 1.2052666750952734, + "grad_norm": 6.974571635994231, + "learning_rate": 1.7997391023379275e-06, + "loss": 0.5858, + "step": 16683 + }, + { + "epoch": 1.205338920295483, + "grad_norm": 8.44729881677005, + "learning_rate": 1.7994583137436144e-06, + "loss": 0.6918, + "step": 16684 + }, + { + "epoch": 1.2054111654956925, + "grad_norm": 5.954358711464646, + "learning_rate": 1.7991775347391731e-06, + "loss": 0.5916, + "step": 16685 + }, + { + "epoch": 1.2054834106959018, + "grad_norm": 6.0571917074346615, + "learning_rate": 1.7988967653284487e-06, + "loss": 0.6454, + "step": 16686 + }, + { + "epoch": 1.2055556558961114, + "grad_norm": 7.220491273603046, + "learning_rate": 1.7986160055152819e-06, + "loss": 0.71, + "step": 16687 + }, + { + "epoch": 1.205627901096321, + "grad_norm": 6.6373966762166, + "learning_rate": 1.7983352553035176e-06, + "loss": 0.7372, + "step": 16688 + }, + { + "epoch": 1.2057001462965304, + "grad_norm": 6.947332888608169, + "learning_rate": 1.7980545146969988e-06, + "loss": 0.68, + "step": 16689 + }, + { + "epoch": 1.20577239149674, + "grad_norm": 6.665555511699007, + "learning_rate": 1.7977737836995692e-06, + "loss": 0.5591, + "step": 16690 + }, + { + "epoch": 1.2058446366969495, + "grad_norm": 7.022656967022157, + "learning_rate": 1.797493062315071e-06, + "loss": 0.7301, + "step": 16691 + }, + { + "epoch": 1.205916881897159, + "grad_norm": 6.369232860226042, + "learning_rate": 1.7972123505473468e-06, + "loss": 0.6762, + "step": 16692 + }, + { + "epoch": 1.2059891270973684, + "grad_norm": 6.636427326775621, + "learning_rate": 1.7969316484002408e-06, + "loss": 0.6085, + "step": 16693 + }, + { + "epoch": 1.206061372297578, + "grad_norm": 6.93822770984508, + "learning_rate": 1.7966509558775939e-06, + "loss": 0.6672, + "step": 16694 + }, + { + "epoch": 1.2061336174977875, + "grad_norm": 7.048523974284578, + "learning_rate": 1.7963702729832494e-06, + "loss": 0.6847, + "step": 16695 + }, + { + "epoch": 1.206205862697997, + "grad_norm": 7.209961929538229, + "learning_rate": 1.7960895997210492e-06, + "loss": 0.6736, + "step": 16696 + }, + { + "epoch": 1.2062781078982066, + "grad_norm": 6.5124700212429305, + "learning_rate": 1.7958089360948372e-06, + "loss": 0.653, + "step": 16697 + }, + { + "epoch": 1.206350353098416, + "grad_norm": 6.233087774869572, + "learning_rate": 1.795528282108453e-06, + "loss": 0.6459, + "step": 16698 + }, + { + "epoch": 1.2064225982986256, + "grad_norm": 6.535484000205598, + "learning_rate": 1.795247637765739e-06, + "loss": 0.6168, + "step": 16699 + }, + { + "epoch": 1.206494843498835, + "grad_norm": 7.422353757173721, + "learning_rate": 1.7949670030705386e-06, + "loss": 0.5726, + "step": 16700 + }, + { + "epoch": 1.2065670886990445, + "grad_norm": 5.9766703676948785, + "learning_rate": 1.7946863780266933e-06, + "loss": 0.6287, + "step": 16701 + }, + { + "epoch": 1.206639333899254, + "grad_norm": 6.8096338244788415, + "learning_rate": 1.7944057626380422e-06, + "loss": 0.6805, + "step": 16702 + }, + { + "epoch": 1.2067115790994636, + "grad_norm": 5.674389158471363, + "learning_rate": 1.794125156908429e-06, + "loss": 0.6056, + "step": 16703 + }, + { + "epoch": 1.2067838242996731, + "grad_norm": 7.4705478946290995, + "learning_rate": 1.7938445608416949e-06, + "loss": 0.6936, + "step": 16704 + }, + { + "epoch": 1.2068560694998827, + "grad_norm": 5.993727266111409, + "learning_rate": 1.7935639744416797e-06, + "loss": 0.6699, + "step": 16705 + }, + { + "epoch": 1.2069283147000922, + "grad_norm": 7.301144382187684, + "learning_rate": 1.7932833977122252e-06, + "loss": 0.6214, + "step": 16706 + }, + { + "epoch": 1.2070005599003015, + "grad_norm": 6.3030061240610555, + "learning_rate": 1.7930028306571723e-06, + "loss": 0.6351, + "step": 16707 + }, + { + "epoch": 1.207072805100511, + "grad_norm": 6.531279495391682, + "learning_rate": 1.7927222732803622e-06, + "loss": 0.6917, + "step": 16708 + }, + { + "epoch": 1.2071450503007206, + "grad_norm": 6.236778312321276, + "learning_rate": 1.7924417255856346e-06, + "loss": 0.6481, + "step": 16709 + }, + { + "epoch": 1.2072172955009302, + "grad_norm": 6.834677897344484, + "learning_rate": 1.7921611875768302e-06, + "loss": 0.7049, + "step": 16710 + }, + { + "epoch": 1.2072895407011397, + "grad_norm": 7.136080775483733, + "learning_rate": 1.7918806592577906e-06, + "loss": 0.6541, + "step": 16711 + }, + { + "epoch": 1.2073617859013492, + "grad_norm": 6.717937189660865, + "learning_rate": 1.7916001406323541e-06, + "loss": 0.5953, + "step": 16712 + }, + { + "epoch": 1.2074340311015588, + "grad_norm": 5.971366268988045, + "learning_rate": 1.7913196317043613e-06, + "loss": 0.6624, + "step": 16713 + }, + { + "epoch": 1.207506276301768, + "grad_norm": 6.203967135747541, + "learning_rate": 1.7910391324776522e-06, + "loss": 0.6118, + "step": 16714 + }, + { + "epoch": 1.2075785215019776, + "grad_norm": 8.065631583564729, + "learning_rate": 1.7907586429560685e-06, + "loss": 0.6231, + "step": 16715 + }, + { + "epoch": 1.2076507667021872, + "grad_norm": 7.600013692743113, + "learning_rate": 1.7904781631434473e-06, + "loss": 0.6983, + "step": 16716 + }, + { + "epoch": 1.2077230119023967, + "grad_norm": 6.038330033705285, + "learning_rate": 1.790197693043629e-06, + "loss": 0.6619, + "step": 16717 + }, + { + "epoch": 1.2077952571026063, + "grad_norm": 6.775967135497182, + "learning_rate": 1.7899172326604537e-06, + "loss": 0.7086, + "step": 16718 + }, + { + "epoch": 1.2078675023028158, + "grad_norm": 6.6938372381030335, + "learning_rate": 1.7896367819977598e-06, + "loss": 0.6265, + "step": 16719 + }, + { + "epoch": 1.2079397475030254, + "grad_norm": 6.144042328470174, + "learning_rate": 1.7893563410593866e-06, + "loss": 0.6354, + "step": 16720 + }, + { + "epoch": 1.2080119927032347, + "grad_norm": 5.558151774417548, + "learning_rate": 1.7890759098491733e-06, + "loss": 0.6078, + "step": 16721 + }, + { + "epoch": 1.2080842379034442, + "grad_norm": 5.683195728116535, + "learning_rate": 1.7887954883709594e-06, + "loss": 0.6215, + "step": 16722 + }, + { + "epoch": 1.2081564831036538, + "grad_norm": 6.885239068384524, + "learning_rate": 1.7885150766285825e-06, + "loss": 0.6427, + "step": 16723 + }, + { + "epoch": 1.2082287283038633, + "grad_norm": 6.089029684705166, + "learning_rate": 1.7882346746258816e-06, + "loss": 0.5992, + "step": 16724 + }, + { + "epoch": 1.2083009735040728, + "grad_norm": 7.928944700431451, + "learning_rate": 1.7879542823666956e-06, + "loss": 0.7575, + "step": 16725 + }, + { + "epoch": 1.2083732187042824, + "grad_norm": 6.936161496368935, + "learning_rate": 1.7876738998548637e-06, + "loss": 0.5836, + "step": 16726 + }, + { + "epoch": 1.208445463904492, + "grad_norm": 7.155747575072383, + "learning_rate": 1.7873935270942216e-06, + "loss": 0.6776, + "step": 16727 + }, + { + "epoch": 1.2085177091047012, + "grad_norm": 7.395495368506796, + "learning_rate": 1.7871131640886087e-06, + "loss": 0.6704, + "step": 16728 + }, + { + "epoch": 1.2085899543049108, + "grad_norm": 7.034347733419778, + "learning_rate": 1.7868328108418641e-06, + "loss": 0.6738, + "step": 16729 + }, + { + "epoch": 1.2086621995051203, + "grad_norm": 6.797682861023508, + "learning_rate": 1.7865524673578244e-06, + "loss": 0.6808, + "step": 16730 + }, + { + "epoch": 1.2087344447053299, + "grad_norm": 7.611474219553126, + "learning_rate": 1.7862721336403266e-06, + "loss": 0.7053, + "step": 16731 + }, + { + "epoch": 1.2088066899055394, + "grad_norm": 7.815417912121736, + "learning_rate": 1.7859918096932096e-06, + "loss": 0.6635, + "step": 16732 + }, + { + "epoch": 1.208878935105749, + "grad_norm": 6.885414419784603, + "learning_rate": 1.785711495520311e-06, + "loss": 0.6692, + "step": 16733 + }, + { + "epoch": 1.2089511803059585, + "grad_norm": 5.8816341521972415, + "learning_rate": 1.7854311911254662e-06, + "loss": 0.5526, + "step": 16734 + }, + { + "epoch": 1.2090234255061678, + "grad_norm": 6.329799703650194, + "learning_rate": 1.7851508965125143e-06, + "loss": 0.685, + "step": 16735 + }, + { + "epoch": 1.2090956707063774, + "grad_norm": 7.761544274114718, + "learning_rate": 1.7848706116852914e-06, + "loss": 0.7117, + "step": 16736 + }, + { + "epoch": 1.209167915906587, + "grad_norm": 5.782397347086228, + "learning_rate": 1.7845903366476347e-06, + "loss": 0.6514, + "step": 16737 + }, + { + "epoch": 1.2092401611067964, + "grad_norm": 7.410463575104409, + "learning_rate": 1.7843100714033806e-06, + "loss": 0.6003, + "step": 16738 + }, + { + "epoch": 1.209312406307006, + "grad_norm": 6.759385542171333, + "learning_rate": 1.7840298159563657e-06, + "loss": 0.6793, + "step": 16739 + }, + { + "epoch": 1.2093846515072155, + "grad_norm": 6.75796265187235, + "learning_rate": 1.783749570310428e-06, + "loss": 0.6547, + "step": 16740 + }, + { + "epoch": 1.209456896707425, + "grad_norm": 8.538782392621044, + "learning_rate": 1.7834693344694016e-06, + "loss": 0.6217, + "step": 16741 + }, + { + "epoch": 1.2095291419076346, + "grad_norm": 6.09431869591931, + "learning_rate": 1.7831891084371228e-06, + "loss": 0.6188, + "step": 16742 + }, + { + "epoch": 1.209601387107844, + "grad_norm": 8.234514441567208, + "learning_rate": 1.7829088922174304e-06, + "loss": 0.6285, + "step": 16743 + }, + { + "epoch": 1.2096736323080535, + "grad_norm": 7.91955123015261, + "learning_rate": 1.7826286858141573e-06, + "loss": 0.7203, + "step": 16744 + }, + { + "epoch": 1.209745877508263, + "grad_norm": 7.118018226422273, + "learning_rate": 1.7823484892311404e-06, + "loss": 0.6831, + "step": 16745 + }, + { + "epoch": 1.2098181227084726, + "grad_norm": 6.579075687964, + "learning_rate": 1.7820683024722158e-06, + "loss": 0.7039, + "step": 16746 + }, + { + "epoch": 1.209890367908682, + "grad_norm": 6.985872180746827, + "learning_rate": 1.781788125541219e-06, + "loss": 0.6006, + "step": 16747 + }, + { + "epoch": 1.2099626131088916, + "grad_norm": 6.35371675826733, + "learning_rate": 1.7815079584419844e-06, + "loss": 0.6225, + "step": 16748 + }, + { + "epoch": 1.2100348583091012, + "grad_norm": 6.633552644768711, + "learning_rate": 1.7812278011783482e-06, + "loss": 0.6371, + "step": 16749 + }, + { + "epoch": 1.2101071035093107, + "grad_norm": 6.215662817746472, + "learning_rate": 1.7809476537541453e-06, + "loss": 0.6304, + "step": 16750 + }, + { + "epoch": 1.21017934870952, + "grad_norm": 8.171252721138591, + "learning_rate": 1.780667516173211e-06, + "loss": 0.6584, + "step": 16751 + }, + { + "epoch": 1.2102515939097296, + "grad_norm": 6.558780742476521, + "learning_rate": 1.7803873884393796e-06, + "loss": 0.6941, + "step": 16752 + }, + { + "epoch": 1.2103238391099391, + "grad_norm": 6.577163444790785, + "learning_rate": 1.7801072705564863e-06, + "loss": 0.6435, + "step": 16753 + }, + { + "epoch": 1.2103960843101487, + "grad_norm": 5.647076159104684, + "learning_rate": 1.7798271625283664e-06, + "loss": 0.6612, + "step": 16754 + }, + { + "epoch": 1.2104683295103582, + "grad_norm": 8.033331575301151, + "learning_rate": 1.779547064358852e-06, + "loss": 0.6664, + "step": 16755 + }, + { + "epoch": 1.2105405747105678, + "grad_norm": 7.375182585961484, + "learning_rate": 1.7792669760517788e-06, + "loss": 0.644, + "step": 16756 + }, + { + "epoch": 1.2106128199107773, + "grad_norm": 7.903380344379228, + "learning_rate": 1.7789868976109814e-06, + "loss": 0.7582, + "step": 16757 + }, + { + "epoch": 1.2106850651109866, + "grad_norm": 6.444182547732186, + "learning_rate": 1.7787068290402947e-06, + "loss": 0.6438, + "step": 16758 + }, + { + "epoch": 1.2107573103111962, + "grad_norm": 6.106468035925456, + "learning_rate": 1.7784267703435503e-06, + "loss": 0.5638, + "step": 16759 + }, + { + "epoch": 1.2108295555114057, + "grad_norm": 7.096102555506895, + "learning_rate": 1.7781467215245835e-06, + "loss": 0.6235, + "step": 16760 + }, + { + "epoch": 1.2109018007116152, + "grad_norm": 7.248390545274598, + "learning_rate": 1.7778666825872278e-06, + "loss": 0.6699, + "step": 16761 + }, + { + "epoch": 1.2109740459118248, + "grad_norm": 6.374078422205195, + "learning_rate": 1.7775866535353163e-06, + "loss": 0.6698, + "step": 16762 + }, + { + "epoch": 1.2110462911120343, + "grad_norm": 6.960142559038476, + "learning_rate": 1.7773066343726823e-06, + "loss": 0.6354, + "step": 16763 + }, + { + "epoch": 1.2111185363122439, + "grad_norm": 6.2631496191497344, + "learning_rate": 1.7770266251031593e-06, + "loss": 0.7321, + "step": 16764 + }, + { + "epoch": 1.2111907815124532, + "grad_norm": 6.8092649285504745, + "learning_rate": 1.776746625730581e-06, + "loss": 0.5865, + "step": 16765 + }, + { + "epoch": 1.2112630267126627, + "grad_norm": 5.869951392107359, + "learning_rate": 1.7764666362587795e-06, + "loss": 0.604, + "step": 16766 + }, + { + "epoch": 1.2113352719128723, + "grad_norm": 6.772158386469249, + "learning_rate": 1.7761866566915881e-06, + "loss": 0.6886, + "step": 16767 + }, + { + "epoch": 1.2114075171130818, + "grad_norm": 6.937806818595372, + "learning_rate": 1.7759066870328401e-06, + "loss": 0.5613, + "step": 16768 + }, + { + "epoch": 1.2114797623132914, + "grad_norm": 6.835333120604211, + "learning_rate": 1.7756267272863662e-06, + "loss": 0.5996, + "step": 16769 + }, + { + "epoch": 1.211552007513501, + "grad_norm": 5.8311419412591325, + "learning_rate": 1.7753467774559997e-06, + "loss": 0.598, + "step": 16770 + }, + { + "epoch": 1.2116242527137104, + "grad_norm": 6.264789934708537, + "learning_rate": 1.7750668375455735e-06, + "loss": 0.6456, + "step": 16771 + }, + { + "epoch": 1.2116964979139198, + "grad_norm": 7.744972567229575, + "learning_rate": 1.7747869075589206e-06, + "loss": 0.5855, + "step": 16772 + }, + { + "epoch": 1.2117687431141293, + "grad_norm": 7.137586485544189, + "learning_rate": 1.7745069874998705e-06, + "loss": 0.6973, + "step": 16773 + }, + { + "epoch": 1.2118409883143388, + "grad_norm": 7.530616068771807, + "learning_rate": 1.774227077372257e-06, + "loss": 0.6802, + "step": 16774 + }, + { + "epoch": 1.2119132335145484, + "grad_norm": 8.26514870437724, + "learning_rate": 1.7739471771799109e-06, + "loss": 0.6316, + "step": 16775 + }, + { + "epoch": 1.211985478714758, + "grad_norm": 5.870582705531899, + "learning_rate": 1.7736672869266646e-06, + "loss": 0.6268, + "step": 16776 + }, + { + "epoch": 1.2120577239149675, + "grad_norm": 8.661913619899114, + "learning_rate": 1.7733874066163487e-06, + "loss": 0.6985, + "step": 16777 + }, + { + "epoch": 1.212129969115177, + "grad_norm": 7.864566733909083, + "learning_rate": 1.7731075362527953e-06, + "loss": 0.6422, + "step": 16778 + }, + { + "epoch": 1.2122022143153863, + "grad_norm": 5.670861355919963, + "learning_rate": 1.7728276758398354e-06, + "loss": 0.6134, + "step": 16779 + }, + { + "epoch": 1.2122744595155959, + "grad_norm": 7.613342629386362, + "learning_rate": 1.7725478253813e-06, + "loss": 0.6668, + "step": 16780 + }, + { + "epoch": 1.2123467047158054, + "grad_norm": 7.538902352168856, + "learning_rate": 1.7722679848810198e-06, + "loss": 0.623, + "step": 16781 + }, + { + "epoch": 1.212418949916015, + "grad_norm": 6.860900570225565, + "learning_rate": 1.7719881543428257e-06, + "loss": 0.5963, + "step": 16782 + }, + { + "epoch": 1.2124911951162245, + "grad_norm": 5.676007407422168, + "learning_rate": 1.7717083337705499e-06, + "loss": 0.6718, + "step": 16783 + }, + { + "epoch": 1.212563440316434, + "grad_norm": 5.8299557671414775, + "learning_rate": 1.7714285231680201e-06, + "loss": 0.6046, + "step": 16784 + }, + { + "epoch": 1.2126356855166436, + "grad_norm": 8.042123046996194, + "learning_rate": 1.771148722539068e-06, + "loss": 0.6826, + "step": 16785 + }, + { + "epoch": 1.212707930716853, + "grad_norm": 6.187849555111955, + "learning_rate": 1.7708689318875255e-06, + "loss": 0.613, + "step": 16786 + }, + { + "epoch": 1.2127801759170624, + "grad_norm": 5.921385088058704, + "learning_rate": 1.7705891512172199e-06, + "loss": 0.6305, + "step": 16787 + }, + { + "epoch": 1.212852421117272, + "grad_norm": 6.467604973482655, + "learning_rate": 1.7703093805319826e-06, + "loss": 0.7332, + "step": 16788 + }, + { + "epoch": 1.2129246663174815, + "grad_norm": 6.572440910552569, + "learning_rate": 1.770029619835643e-06, + "loss": 0.7036, + "step": 16789 + }, + { + "epoch": 1.212996911517691, + "grad_norm": 6.285069457332678, + "learning_rate": 1.7697498691320322e-06, + "loss": 0.6905, + "step": 16790 + }, + { + "epoch": 1.2130691567179006, + "grad_norm": 6.677895752930838, + "learning_rate": 1.7694701284249776e-06, + "loss": 0.6409, + "step": 16791 + }, + { + "epoch": 1.2131414019181102, + "grad_norm": 7.624125946477662, + "learning_rate": 1.7691903977183103e-06, + "loss": 0.5828, + "step": 16792 + }, + { + "epoch": 1.2132136471183195, + "grad_norm": 5.595783567266469, + "learning_rate": 1.7689106770158592e-06, + "loss": 0.627, + "step": 16793 + }, + { + "epoch": 1.213285892318529, + "grad_norm": 6.82445100805034, + "learning_rate": 1.768630966321453e-06, + "loss": 0.6611, + "step": 16794 + }, + { + "epoch": 1.2133581375187386, + "grad_norm": 6.314367641220781, + "learning_rate": 1.7683512656389208e-06, + "loss": 0.6769, + "step": 16795 + }, + { + "epoch": 1.213430382718948, + "grad_norm": 5.525535778459686, + "learning_rate": 1.7680715749720915e-06, + "loss": 0.6457, + "step": 16796 + }, + { + "epoch": 1.2135026279191576, + "grad_norm": 8.008890933479803, + "learning_rate": 1.767791894324795e-06, + "loss": 0.6747, + "step": 16797 + }, + { + "epoch": 1.2135748731193672, + "grad_norm": 6.386281223067727, + "learning_rate": 1.767512223700858e-06, + "loss": 0.6128, + "step": 16798 + }, + { + "epoch": 1.2136471183195767, + "grad_norm": 7.308068652635977, + "learning_rate": 1.76723256310411e-06, + "loss": 0.5996, + "step": 16799 + }, + { + "epoch": 1.213719363519786, + "grad_norm": 6.822996959262368, + "learning_rate": 1.7669529125383803e-06, + "loss": 0.5949, + "step": 16800 + }, + { + "epoch": 1.2137916087199956, + "grad_norm": 6.076846108325237, + "learning_rate": 1.766673272007495e-06, + "loss": 0.6404, + "step": 16801 + }, + { + "epoch": 1.2138638539202051, + "grad_norm": 7.045962526237461, + "learning_rate": 1.7663936415152832e-06, + "loss": 0.563, + "step": 16802 + }, + { + "epoch": 1.2139360991204147, + "grad_norm": 6.771226638717027, + "learning_rate": 1.7661140210655731e-06, + "loss": 0.6301, + "step": 16803 + }, + { + "epoch": 1.2140083443206242, + "grad_norm": 8.177784054288436, + "learning_rate": 1.7658344106621928e-06, + "loss": 0.7302, + "step": 16804 + }, + { + "epoch": 1.2140805895208338, + "grad_norm": 8.601741667638995, + "learning_rate": 1.7655548103089685e-06, + "loss": 0.6254, + "step": 16805 + }, + { + "epoch": 1.2141528347210433, + "grad_norm": 6.070370000029311, + "learning_rate": 1.7652752200097288e-06, + "loss": 0.6293, + "step": 16806 + }, + { + "epoch": 1.2142250799212526, + "grad_norm": 6.431171454570013, + "learning_rate": 1.7649956397683004e-06, + "loss": 0.6093, + "step": 16807 + }, + { + "epoch": 1.2142973251214622, + "grad_norm": 5.851361382036949, + "learning_rate": 1.7647160695885121e-06, + "loss": 0.68, + "step": 16808 + }, + { + "epoch": 1.2143695703216717, + "grad_norm": 7.2365619163481645, + "learning_rate": 1.7644365094741891e-06, + "loss": 0.6722, + "step": 16809 + }, + { + "epoch": 1.2144418155218812, + "grad_norm": 5.871580386726641, + "learning_rate": 1.764156959429159e-06, + "loss": 0.5605, + "step": 16810 + }, + { + "epoch": 1.2145140607220908, + "grad_norm": 6.8386155468527585, + "learning_rate": 1.7638774194572505e-06, + "loss": 0.6442, + "step": 16811 + }, + { + "epoch": 1.2145863059223003, + "grad_norm": 7.529119799717368, + "learning_rate": 1.7635978895622867e-06, + "loss": 0.6245, + "step": 16812 + }, + { + "epoch": 1.2146585511225099, + "grad_norm": 6.440665938734519, + "learning_rate": 1.763318369748096e-06, + "loss": 0.6618, + "step": 16813 + }, + { + "epoch": 1.2147307963227192, + "grad_norm": 5.775595709391356, + "learning_rate": 1.7630388600185054e-06, + "loss": 0.6015, + "step": 16814 + }, + { + "epoch": 1.2148030415229287, + "grad_norm": 6.292395576880658, + "learning_rate": 1.7627593603773414e-06, + "loss": 0.5981, + "step": 16815 + }, + { + "epoch": 1.2148752867231383, + "grad_norm": 6.131237046347651, + "learning_rate": 1.7624798708284282e-06, + "loss": 0.649, + "step": 16816 + }, + { + "epoch": 1.2149475319233478, + "grad_norm": 7.108098110348357, + "learning_rate": 1.7622003913755931e-06, + "loss": 0.7114, + "step": 16817 + }, + { + "epoch": 1.2150197771235574, + "grad_norm": 6.631672791603381, + "learning_rate": 1.7619209220226625e-06, + "loss": 0.7261, + "step": 16818 + }, + { + "epoch": 1.215092022323767, + "grad_norm": 7.1340154529657624, + "learning_rate": 1.7616414627734604e-06, + "loss": 0.6681, + "step": 16819 + }, + { + "epoch": 1.2151642675239764, + "grad_norm": 6.608615984971165, + "learning_rate": 1.7613620136318139e-06, + "loss": 0.587, + "step": 16820 + }, + { + "epoch": 1.215236512724186, + "grad_norm": 6.380229226421998, + "learning_rate": 1.7610825746015476e-06, + "loss": 0.6403, + "step": 16821 + }, + { + "epoch": 1.2153087579243955, + "grad_norm": 6.022689198918335, + "learning_rate": 1.7608031456864874e-06, + "loss": 0.6237, + "step": 16822 + }, + { + "epoch": 1.2153810031246048, + "grad_norm": 6.680868408039997, + "learning_rate": 1.760523726890458e-06, + "loss": 0.6216, + "step": 16823 + }, + { + "epoch": 1.2154532483248144, + "grad_norm": 8.822674011953032, + "learning_rate": 1.7602443182172846e-06, + "loss": 0.7101, + "step": 16824 + }, + { + "epoch": 1.215525493525024, + "grad_norm": 6.077916629622409, + "learning_rate": 1.759964919670793e-06, + "loss": 0.6516, + "step": 16825 + }, + { + "epoch": 1.2155977387252335, + "grad_norm": 7.164564341832095, + "learning_rate": 1.7596855312548061e-06, + "loss": 0.6712, + "step": 16826 + }, + { + "epoch": 1.215669983925443, + "grad_norm": 7.711801714330598, + "learning_rate": 1.7594061529731489e-06, + "loss": 0.7055, + "step": 16827 + }, + { + "epoch": 1.2157422291256526, + "grad_norm": 6.328718444878159, + "learning_rate": 1.759126784829647e-06, + "loss": 0.591, + "step": 16828 + }, + { + "epoch": 1.215814474325862, + "grad_norm": 8.220580850762662, + "learning_rate": 1.7588474268281254e-06, + "loss": 0.6284, + "step": 16829 + }, + { + "epoch": 1.2158867195260714, + "grad_norm": 6.726704559421079, + "learning_rate": 1.7585680789724057e-06, + "loss": 0.6244, + "step": 16830 + }, + { + "epoch": 1.215958964726281, + "grad_norm": 7.0083775117126565, + "learning_rate": 1.7582887412663132e-06, + "loss": 0.7091, + "step": 16831 + }, + { + "epoch": 1.2160312099264905, + "grad_norm": 5.956566964488975, + "learning_rate": 1.7580094137136722e-06, + "loss": 0.605, + "step": 16832 + }, + { + "epoch": 1.2161034551267, + "grad_norm": 5.852745598646884, + "learning_rate": 1.7577300963183068e-06, + "loss": 0.6197, + "step": 16833 + }, + { + "epoch": 1.2161757003269096, + "grad_norm": 5.719613432192835, + "learning_rate": 1.7574507890840392e-06, + "loss": 0.5827, + "step": 16834 + }, + { + "epoch": 1.2162479455271191, + "grad_norm": 5.4543105970741586, + "learning_rate": 1.7571714920146943e-06, + "loss": 0.6248, + "step": 16835 + }, + { + "epoch": 1.2163201907273287, + "grad_norm": 7.484163986130252, + "learning_rate": 1.756892205114095e-06, + "loss": 0.6599, + "step": 16836 + }, + { + "epoch": 1.216392435927538, + "grad_norm": 6.756114309749817, + "learning_rate": 1.756612928386064e-06, + "loss": 0.6304, + "step": 16837 + }, + { + "epoch": 1.2164646811277475, + "grad_norm": 6.273410136732043, + "learning_rate": 1.7563336618344245e-06, + "loss": 0.6691, + "step": 16838 + }, + { + "epoch": 1.216536926327957, + "grad_norm": 6.013580212540301, + "learning_rate": 1.7560544054630002e-06, + "loss": 0.5611, + "step": 16839 + }, + { + "epoch": 1.2166091715281666, + "grad_norm": 6.444384994817254, + "learning_rate": 1.7557751592756145e-06, + "loss": 0.7164, + "step": 16840 + }, + { + "epoch": 1.2166814167283762, + "grad_norm": 7.176329581917942, + "learning_rate": 1.755495923276087e-06, + "loss": 0.593, + "step": 16841 + }, + { + "epoch": 1.2167536619285857, + "grad_norm": 6.87559645406281, + "learning_rate": 1.7552166974682433e-06, + "loss": 0.5696, + "step": 16842 + }, + { + "epoch": 1.2168259071287952, + "grad_norm": 6.22575865262871, + "learning_rate": 1.7549374818559054e-06, + "loss": 0.669, + "step": 16843 + }, + { + "epoch": 1.2168981523290046, + "grad_norm": 6.211993817406775, + "learning_rate": 1.7546582764428938e-06, + "loss": 0.676, + "step": 16844 + }, + { + "epoch": 1.216970397529214, + "grad_norm": 6.16393686995709, + "learning_rate": 1.7543790812330318e-06, + "loss": 0.5909, + "step": 16845 + }, + { + "epoch": 1.2170426427294236, + "grad_norm": 6.519561090966757, + "learning_rate": 1.7540998962301415e-06, + "loss": 0.6426, + "step": 16846 + }, + { + "epoch": 1.2171148879296332, + "grad_norm": 6.240639658200581, + "learning_rate": 1.7538207214380445e-06, + "loss": 0.5962, + "step": 16847 + }, + { + "epoch": 1.2171871331298427, + "grad_norm": 7.701714253028525, + "learning_rate": 1.7535415568605624e-06, + "loss": 0.6391, + "step": 16848 + }, + { + "epoch": 1.2172593783300523, + "grad_norm": 6.315658232513228, + "learning_rate": 1.7532624025015166e-06, + "loss": 0.6276, + "step": 16849 + }, + { + "epoch": 1.2173316235302618, + "grad_norm": 7.072573069643004, + "learning_rate": 1.752983258364729e-06, + "loss": 0.7428, + "step": 16850 + }, + { + "epoch": 1.2174038687304711, + "grad_norm": 6.17409566210899, + "learning_rate": 1.7527041244540206e-06, + "loss": 0.7225, + "step": 16851 + }, + { + "epoch": 1.2174761139306807, + "grad_norm": 6.877444578257389, + "learning_rate": 1.752425000773212e-06, + "loss": 0.7327, + "step": 16852 + }, + { + "epoch": 1.2175483591308902, + "grad_norm": 7.6693215403583315, + "learning_rate": 1.7521458873261249e-06, + "loss": 0.6192, + "step": 16853 + }, + { + "epoch": 1.2176206043310998, + "grad_norm": 6.877512524731019, + "learning_rate": 1.7518667841165806e-06, + "loss": 0.5696, + "step": 16854 + }, + { + "epoch": 1.2176928495313093, + "grad_norm": 6.698496681202876, + "learning_rate": 1.751587691148398e-06, + "loss": 0.6007, + "step": 16855 + }, + { + "epoch": 1.2177650947315188, + "grad_norm": 6.3763192999520815, + "learning_rate": 1.7513086084253994e-06, + "loss": 0.6612, + "step": 16856 + }, + { + "epoch": 1.2178373399317284, + "grad_norm": 6.676151236364448, + "learning_rate": 1.7510295359514043e-06, + "loss": 0.6579, + "step": 16857 + }, + { + "epoch": 1.2179095851319377, + "grad_norm": 6.1447060106067815, + "learning_rate": 1.7507504737302344e-06, + "loss": 0.6319, + "step": 16858 + }, + { + "epoch": 1.2179818303321472, + "grad_norm": 5.745386013943602, + "learning_rate": 1.7504714217657078e-06, + "loss": 0.6271, + "step": 16859 + }, + { + "epoch": 1.2180540755323568, + "grad_norm": 6.021561663264934, + "learning_rate": 1.7501923800616455e-06, + "loss": 0.5927, + "step": 16860 + }, + { + "epoch": 1.2181263207325663, + "grad_norm": 6.453013273662049, + "learning_rate": 1.7499133486218678e-06, + "loss": 0.6544, + "step": 16861 + }, + { + "epoch": 1.2181985659327759, + "grad_norm": 5.621039797886493, + "learning_rate": 1.7496343274501936e-06, + "loss": 0.6174, + "step": 16862 + }, + { + "epoch": 1.2182708111329854, + "grad_norm": 7.443725032428215, + "learning_rate": 1.7493553165504427e-06, + "loss": 0.6412, + "step": 16863 + }, + { + "epoch": 1.218343056333195, + "grad_norm": 6.764624118554706, + "learning_rate": 1.7490763159264345e-06, + "loss": 0.6946, + "step": 16864 + }, + { + "epoch": 1.2184153015334043, + "grad_norm": 6.985846788964579, + "learning_rate": 1.7487973255819894e-06, + "loss": 0.6334, + "step": 16865 + }, + { + "epoch": 1.2184875467336138, + "grad_norm": 5.656200429794011, + "learning_rate": 1.7485183455209249e-06, + "loss": 0.5973, + "step": 16866 + }, + { + "epoch": 1.2185597919338234, + "grad_norm": 6.403125726821732, + "learning_rate": 1.7482393757470607e-06, + "loss": 0.6288, + "step": 16867 + }, + { + "epoch": 1.218632037134033, + "grad_norm": 8.492472737706233, + "learning_rate": 1.7479604162642167e-06, + "loss": 0.6548, + "step": 16868 + }, + { + "epoch": 1.2187042823342424, + "grad_norm": 7.156262260326561, + "learning_rate": 1.7476814670762093e-06, + "loss": 0.6495, + "step": 16869 + }, + { + "epoch": 1.218776527534452, + "grad_norm": 7.807685772545072, + "learning_rate": 1.747402528186859e-06, + "loss": 0.6175, + "step": 16870 + }, + { + "epoch": 1.2188487727346615, + "grad_norm": 7.568703873207915, + "learning_rate": 1.7471235995999836e-06, + "loss": 0.7374, + "step": 16871 + }, + { + "epoch": 1.2189210179348708, + "grad_norm": 5.924208048111955, + "learning_rate": 1.746844681319403e-06, + "loss": 0.6484, + "step": 16872 + }, + { + "epoch": 1.2189932631350804, + "grad_norm": 6.983000413095155, + "learning_rate": 1.7465657733489322e-06, + "loss": 0.6284, + "step": 16873 + }, + { + "epoch": 1.21906550833529, + "grad_norm": 7.675929348125989, + "learning_rate": 1.7462868756923915e-06, + "loss": 0.6408, + "step": 16874 + }, + { + "epoch": 1.2191377535354995, + "grad_norm": 7.736193510568294, + "learning_rate": 1.7460079883535986e-06, + "loss": 0.721, + "step": 16875 + }, + { + "epoch": 1.219209998735709, + "grad_norm": 7.020577838297482, + "learning_rate": 1.7457291113363707e-06, + "loss": 0.6609, + "step": 16876 + }, + { + "epoch": 1.2192822439359186, + "grad_norm": 9.22084606070372, + "learning_rate": 1.745450244644525e-06, + "loss": 0.7157, + "step": 16877 + }, + { + "epoch": 1.219354489136128, + "grad_norm": 7.111925946016388, + "learning_rate": 1.7451713882818799e-06, + "loss": 0.6659, + "step": 16878 + }, + { + "epoch": 1.2194267343363374, + "grad_norm": 6.8525537450186516, + "learning_rate": 1.7448925422522528e-06, + "loss": 0.6599, + "step": 16879 + }, + { + "epoch": 1.219498979536547, + "grad_norm": 7.4239478499290925, + "learning_rate": 1.7446137065594601e-06, + "loss": 0.7038, + "step": 16880 + }, + { + "epoch": 1.2195712247367565, + "grad_norm": 6.951224558138373, + "learning_rate": 1.7443348812073191e-06, + "loss": 0.6602, + "step": 16881 + }, + { + "epoch": 1.219643469936966, + "grad_norm": 8.508892568753117, + "learning_rate": 1.744056066199647e-06, + "loss": 0.6236, + "step": 16882 + }, + { + "epoch": 1.2197157151371756, + "grad_norm": 6.830546702542574, + "learning_rate": 1.743777261540261e-06, + "loss": 0.6612, + "step": 16883 + }, + { + "epoch": 1.2197879603373851, + "grad_norm": 6.758037444323385, + "learning_rate": 1.7434984672329764e-06, + "loss": 0.6739, + "step": 16884 + }, + { + "epoch": 1.2198602055375947, + "grad_norm": 7.4583634814571225, + "learning_rate": 1.7432196832816105e-06, + "loss": 0.697, + "step": 16885 + }, + { + "epoch": 1.219932450737804, + "grad_norm": 6.856638855804335, + "learning_rate": 1.7429409096899807e-06, + "loss": 0.6947, + "step": 16886 + }, + { + "epoch": 1.2200046959380135, + "grad_norm": 8.649196574213962, + "learning_rate": 1.742662146461901e-06, + "loss": 0.6463, + "step": 16887 + }, + { + "epoch": 1.220076941138223, + "grad_norm": 6.658831315243185, + "learning_rate": 1.742383393601188e-06, + "loss": 0.6795, + "step": 16888 + }, + { + "epoch": 1.2201491863384326, + "grad_norm": 6.184561600069854, + "learning_rate": 1.7421046511116578e-06, + "loss": 0.5774, + "step": 16889 + }, + { + "epoch": 1.2202214315386422, + "grad_norm": 6.4585860008118505, + "learning_rate": 1.7418259189971282e-06, + "loss": 0.69, + "step": 16890 + }, + { + "epoch": 1.2202936767388517, + "grad_norm": 6.46864856880018, + "learning_rate": 1.7415471972614121e-06, + "loss": 0.6551, + "step": 16891 + }, + { + "epoch": 1.2203659219390612, + "grad_norm": 7.965183552017742, + "learning_rate": 1.7412684859083256e-06, + "loss": 0.6459, + "step": 16892 + }, + { + "epoch": 1.2204381671392708, + "grad_norm": 7.063608538295097, + "learning_rate": 1.7409897849416852e-06, + "loss": 0.7448, + "step": 16893 + }, + { + "epoch": 1.22051041233948, + "grad_norm": 8.02496495699733, + "learning_rate": 1.740711094365305e-06, + "loss": 0.6801, + "step": 16894 + }, + { + "epoch": 1.2205826575396896, + "grad_norm": 7.041184659032022, + "learning_rate": 1.7404324141829997e-06, + "loss": 0.6739, + "step": 16895 + }, + { + "epoch": 1.2206549027398992, + "grad_norm": 6.844439467446195, + "learning_rate": 1.7401537443985855e-06, + "loss": 0.6992, + "step": 16896 + }, + { + "epoch": 1.2207271479401087, + "grad_norm": 5.692062392471217, + "learning_rate": 1.7398750850158768e-06, + "loss": 0.6195, + "step": 16897 + }, + { + "epoch": 1.2207993931403183, + "grad_norm": 7.950379982750333, + "learning_rate": 1.7395964360386874e-06, + "loss": 0.6799, + "step": 16898 + }, + { + "epoch": 1.2208716383405278, + "grad_norm": 7.032861957151556, + "learning_rate": 1.7393177974708325e-06, + "loss": 0.6801, + "step": 16899 + }, + { + "epoch": 1.2209438835407374, + "grad_norm": 8.390155800428783, + "learning_rate": 1.7390391693161276e-06, + "loss": 0.7232, + "step": 16900 + }, + { + "epoch": 1.221016128740947, + "grad_norm": 5.61905932906303, + "learning_rate": 1.7387605515783845e-06, + "loss": 0.6175, + "step": 16901 + }, + { + "epoch": 1.2210883739411562, + "grad_norm": 6.547720114848571, + "learning_rate": 1.7384819442614182e-06, + "loss": 0.6627, + "step": 16902 + }, + { + "epoch": 1.2211606191413658, + "grad_norm": 5.609420053625214, + "learning_rate": 1.7382033473690426e-06, + "loss": 0.5698, + "step": 16903 + }, + { + "epoch": 1.2212328643415753, + "grad_norm": 7.092213199532777, + "learning_rate": 1.7379247609050726e-06, + "loss": 0.6462, + "step": 16904 + }, + { + "epoch": 1.2213051095417848, + "grad_norm": 5.955553095296728, + "learning_rate": 1.7376461848733201e-06, + "loss": 0.6212, + "step": 16905 + }, + { + "epoch": 1.2213773547419944, + "grad_norm": 6.585228208673009, + "learning_rate": 1.7373676192775996e-06, + "loss": 0.5713, + "step": 16906 + }, + { + "epoch": 1.221449599942204, + "grad_norm": 6.758560403881152, + "learning_rate": 1.7370890641217248e-06, + "loss": 0.6268, + "step": 16907 + }, + { + "epoch": 1.2215218451424135, + "grad_norm": 6.570508738523054, + "learning_rate": 1.7368105194095076e-06, + "loss": 0.6474, + "step": 16908 + }, + { + "epoch": 1.2215940903426228, + "grad_norm": 6.626875126180384, + "learning_rate": 1.736531985144762e-06, + "loss": 0.673, + "step": 16909 + }, + { + "epoch": 1.2216663355428323, + "grad_norm": 7.840258060412133, + "learning_rate": 1.7362534613313007e-06, + "loss": 0.6773, + "step": 16910 + }, + { + "epoch": 1.2217385807430419, + "grad_norm": 6.832142361696533, + "learning_rate": 1.735974947972937e-06, + "loss": 0.6675, + "step": 16911 + }, + { + "epoch": 1.2218108259432514, + "grad_norm": 6.918598441485909, + "learning_rate": 1.7356964450734825e-06, + "loss": 0.6406, + "step": 16912 + }, + { + "epoch": 1.221883071143461, + "grad_norm": 5.808841989904001, + "learning_rate": 1.7354179526367503e-06, + "loss": 0.6566, + "step": 16913 + }, + { + "epoch": 1.2219553163436705, + "grad_norm": 6.9908084468449365, + "learning_rate": 1.7351394706665526e-06, + "loss": 0.6049, + "step": 16914 + }, + { + "epoch": 1.22202756154388, + "grad_norm": 6.906962776513464, + "learning_rate": 1.7348609991667026e-06, + "loss": 0.6226, + "step": 16915 + }, + { + "epoch": 1.2220998067440894, + "grad_norm": 6.618255348921299, + "learning_rate": 1.7345825381410105e-06, + "loss": 0.696, + "step": 16916 + }, + { + "epoch": 1.222172051944299, + "grad_norm": 6.30819071613121, + "learning_rate": 1.7343040875932894e-06, + "loss": 0.6911, + "step": 16917 + }, + { + "epoch": 1.2222442971445084, + "grad_norm": 5.460788359126714, + "learning_rate": 1.7340256475273509e-06, + "loss": 0.5865, + "step": 16918 + }, + { + "epoch": 1.222316542344718, + "grad_norm": 7.0164103928594, + "learning_rate": 1.733747217947006e-06, + "loss": 0.63, + "step": 16919 + }, + { + "epoch": 1.2223887875449275, + "grad_norm": 6.951013274391834, + "learning_rate": 1.7334687988560673e-06, + "loss": 0.7279, + "step": 16920 + }, + { + "epoch": 1.222461032745137, + "grad_norm": 6.207905116663753, + "learning_rate": 1.733190390258345e-06, + "loss": 0.6084, + "step": 16921 + }, + { + "epoch": 1.2225332779453466, + "grad_norm": 7.281881804334968, + "learning_rate": 1.7329119921576515e-06, + "loss": 0.6561, + "step": 16922 + }, + { + "epoch": 1.222605523145556, + "grad_norm": 5.932272446902396, + "learning_rate": 1.7326336045577969e-06, + "loss": 0.5851, + "step": 16923 + }, + { + "epoch": 1.2226777683457655, + "grad_norm": 8.671644932899707, + "learning_rate": 1.732355227462592e-06, + "loss": 0.7314, + "step": 16924 + }, + { + "epoch": 1.222750013545975, + "grad_norm": 8.648583056871313, + "learning_rate": 1.7320768608758499e-06, + "loss": 0.6261, + "step": 16925 + }, + { + "epoch": 1.2228222587461846, + "grad_norm": 7.746278423006643, + "learning_rate": 1.731798504801377e-06, + "loss": 0.6203, + "step": 16926 + }, + { + "epoch": 1.222894503946394, + "grad_norm": 5.954236665193677, + "learning_rate": 1.731520159242987e-06, + "loss": 0.6136, + "step": 16927 + }, + { + "epoch": 1.2229667491466036, + "grad_norm": 7.072544483248536, + "learning_rate": 1.731241824204489e-06, + "loss": 0.6223, + "step": 16928 + }, + { + "epoch": 1.2230389943468132, + "grad_norm": 7.768149995842649, + "learning_rate": 1.7309634996896945e-06, + "loss": 0.6774, + "step": 16929 + }, + { + "epoch": 1.2231112395470225, + "grad_norm": 7.6920415597508045, + "learning_rate": 1.730685185702412e-06, + "loss": 0.6797, + "step": 16930 + }, + { + "epoch": 1.223183484747232, + "grad_norm": 8.8067933130892, + "learning_rate": 1.7304068822464518e-06, + "loss": 0.6068, + "step": 16931 + }, + { + "epoch": 1.2232557299474416, + "grad_norm": 6.713623077948573, + "learning_rate": 1.7301285893256243e-06, + "loss": 0.6675, + "step": 16932 + }, + { + "epoch": 1.2233279751476511, + "grad_norm": 7.655164244600922, + "learning_rate": 1.7298503069437382e-06, + "loss": 0.6549, + "step": 16933 + }, + { + "epoch": 1.2234002203478607, + "grad_norm": 8.219624868033298, + "learning_rate": 1.7295720351046033e-06, + "loss": 0.7568, + "step": 16934 + }, + { + "epoch": 1.2234724655480702, + "grad_norm": 5.972588867446094, + "learning_rate": 1.7292937738120292e-06, + "loss": 0.5817, + "step": 16935 + }, + { + "epoch": 1.2235447107482798, + "grad_norm": 6.9494026421354755, + "learning_rate": 1.7290155230698251e-06, + "loss": 0.6401, + "step": 16936 + }, + { + "epoch": 1.223616955948489, + "grad_norm": 7.633106112811701, + "learning_rate": 1.7287372828817994e-06, + "loss": 0.7013, + "step": 16937 + }, + { + "epoch": 1.2236892011486986, + "grad_norm": 6.011210776439244, + "learning_rate": 1.728459053251762e-06, + "loss": 0.639, + "step": 16938 + }, + { + "epoch": 1.2237614463489082, + "grad_norm": 6.4978736920865545, + "learning_rate": 1.7281808341835204e-06, + "loss": 0.6435, + "step": 16939 + }, + { + "epoch": 1.2238336915491177, + "grad_norm": 5.773503903225202, + "learning_rate": 1.727902625680885e-06, + "loss": 0.5839, + "step": 16940 + }, + { + "epoch": 1.2239059367493272, + "grad_norm": 6.9918841589228915, + "learning_rate": 1.7276244277476622e-06, + "loss": 0.6661, + "step": 16941 + }, + { + "epoch": 1.2239781819495368, + "grad_norm": 6.072358914428933, + "learning_rate": 1.7273462403876616e-06, + "loss": 0.6565, + "step": 16942 + }, + { + "epoch": 1.2240504271497463, + "grad_norm": 8.040939482652004, + "learning_rate": 1.7270680636046918e-06, + "loss": 0.6467, + "step": 16943 + }, + { + "epoch": 1.2241226723499556, + "grad_norm": 6.61345660207573, + "learning_rate": 1.7267898974025594e-06, + "loss": 0.647, + "step": 16944 + }, + { + "epoch": 1.2241949175501652, + "grad_norm": 8.29052445357542, + "learning_rate": 1.7265117417850723e-06, + "loss": 0.6501, + "step": 16945 + }, + { + "epoch": 1.2242671627503747, + "grad_norm": 7.953423575022312, + "learning_rate": 1.726233596756039e-06, + "loss": 0.7039, + "step": 16946 + }, + { + "epoch": 1.2243394079505843, + "grad_norm": 5.89541779077059, + "learning_rate": 1.7259554623192682e-06, + "loss": 0.6019, + "step": 16947 + }, + { + "epoch": 1.2244116531507938, + "grad_norm": 7.245130284666859, + "learning_rate": 1.7256773384785652e-06, + "loss": 0.706, + "step": 16948 + }, + { + "epoch": 1.2244838983510034, + "grad_norm": 6.399622619946771, + "learning_rate": 1.7253992252377383e-06, + "loss": 0.5852, + "step": 16949 + }, + { + "epoch": 1.224556143551213, + "grad_norm": 7.466576679005366, + "learning_rate": 1.7251211226005948e-06, + "loss": 0.6621, + "step": 16950 + }, + { + "epoch": 1.2246283887514222, + "grad_norm": 6.8864215650390985, + "learning_rate": 1.7248430305709413e-06, + "loss": 0.709, + "step": 16951 + }, + { + "epoch": 1.2247006339516318, + "grad_norm": 8.207945942007298, + "learning_rate": 1.7245649491525845e-06, + "loss": 0.6435, + "step": 16952 + }, + { + "epoch": 1.2247728791518413, + "grad_norm": 6.159438183913779, + "learning_rate": 1.7242868783493316e-06, + "loss": 0.6901, + "step": 16953 + }, + { + "epoch": 1.2248451243520508, + "grad_norm": 6.077209560620826, + "learning_rate": 1.7240088181649895e-06, + "loss": 0.6084, + "step": 16954 + }, + { + "epoch": 1.2249173695522604, + "grad_norm": 6.7472734949369375, + "learning_rate": 1.723730768603364e-06, + "loss": 0.6523, + "step": 16955 + }, + { + "epoch": 1.22498961475247, + "grad_norm": 6.693745201529413, + "learning_rate": 1.7234527296682612e-06, + "loss": 0.7064, + "step": 16956 + }, + { + "epoch": 1.2250618599526795, + "grad_norm": 6.672557644014423, + "learning_rate": 1.7231747013634886e-06, + "loss": 0.5935, + "step": 16957 + }, + { + "epoch": 1.2251341051528888, + "grad_norm": 7.1873237919976605, + "learning_rate": 1.72289668369285e-06, + "loss": 0.6415, + "step": 16958 + }, + { + "epoch": 1.2252063503530983, + "grad_norm": 6.120435298323707, + "learning_rate": 1.7226186766601527e-06, + "loss": 0.5958, + "step": 16959 + }, + { + "epoch": 1.2252785955533079, + "grad_norm": 6.236108218699546, + "learning_rate": 1.7223406802692017e-06, + "loss": 0.6427, + "step": 16960 + }, + { + "epoch": 1.2253508407535174, + "grad_norm": 7.491046392995555, + "learning_rate": 1.7220626945238045e-06, + "loss": 0.6726, + "step": 16961 + }, + { + "epoch": 1.225423085953727, + "grad_norm": 5.703273823507706, + "learning_rate": 1.7217847194277636e-06, + "loss": 0.5548, + "step": 16962 + }, + { + "epoch": 1.2254953311539365, + "grad_norm": 6.369793130578668, + "learning_rate": 1.7215067549848858e-06, + "loss": 0.5964, + "step": 16963 + }, + { + "epoch": 1.225567576354146, + "grad_norm": 5.818614082101147, + "learning_rate": 1.7212288011989762e-06, + "loss": 0.6101, + "step": 16964 + }, + { + "epoch": 1.2256398215543556, + "grad_norm": 8.565284888700981, + "learning_rate": 1.7209508580738398e-06, + "loss": 0.7416, + "step": 16965 + }, + { + "epoch": 1.225712066754565, + "grad_norm": 7.058840993784823, + "learning_rate": 1.7206729256132811e-06, + "loss": 0.7106, + "step": 16966 + }, + { + "epoch": 1.2257843119547744, + "grad_norm": 7.215961132257841, + "learning_rate": 1.720395003821105e-06, + "loss": 0.6758, + "step": 16967 + }, + { + "epoch": 1.225856557154984, + "grad_norm": 6.879104498758104, + "learning_rate": 1.7201170927011163e-06, + "loss": 0.5649, + "step": 16968 + }, + { + "epoch": 1.2259288023551935, + "grad_norm": 6.257985011472031, + "learning_rate": 1.7198391922571187e-06, + "loss": 0.6466, + "step": 16969 + }, + { + "epoch": 1.226001047555403, + "grad_norm": 5.862915596942788, + "learning_rate": 1.7195613024929165e-06, + "loss": 0.6424, + "step": 16970 + }, + { + "epoch": 1.2260732927556126, + "grad_norm": 6.492782400259471, + "learning_rate": 1.7192834234123146e-06, + "loss": 0.5846, + "step": 16971 + }, + { + "epoch": 1.2261455379558222, + "grad_norm": 6.271221905030064, + "learning_rate": 1.719005555019117e-06, + "loss": 0.6843, + "step": 16972 + }, + { + "epoch": 1.2262177831560317, + "grad_norm": 6.813863075450396, + "learning_rate": 1.7187276973171261e-06, + "loss": 0.5989, + "step": 16973 + }, + { + "epoch": 1.226290028356241, + "grad_norm": 6.883196574308074, + "learning_rate": 1.718449850310146e-06, + "loss": 0.6929, + "step": 16974 + }, + { + "epoch": 1.2263622735564506, + "grad_norm": 6.590070665180329, + "learning_rate": 1.7181720140019819e-06, + "loss": 0.6693, + "step": 16975 + }, + { + "epoch": 1.22643451875666, + "grad_norm": 6.113166432627842, + "learning_rate": 1.7178941883964351e-06, + "loss": 0.6616, + "step": 16976 + }, + { + "epoch": 1.2265067639568696, + "grad_norm": 7.213808953099354, + "learning_rate": 1.7176163734973094e-06, + "loss": 0.7019, + "step": 16977 + }, + { + "epoch": 1.2265790091570792, + "grad_norm": 6.704525105543952, + "learning_rate": 1.7173385693084082e-06, + "loss": 0.6461, + "step": 16978 + }, + { + "epoch": 1.2266512543572887, + "grad_norm": 7.023732960971057, + "learning_rate": 1.7170607758335346e-06, + "loss": 0.649, + "step": 16979 + }, + { + "epoch": 1.2267234995574983, + "grad_norm": 6.587152298902429, + "learning_rate": 1.7167829930764907e-06, + "loss": 0.6352, + "step": 16980 + }, + { + "epoch": 1.2267957447577076, + "grad_norm": 5.8503576446498124, + "learning_rate": 1.7165052210410794e-06, + "loss": 0.6627, + "step": 16981 + }, + { + "epoch": 1.2268679899579171, + "grad_norm": 8.283544143460585, + "learning_rate": 1.716227459731104e-06, + "loss": 0.7281, + "step": 16982 + }, + { + "epoch": 1.2269402351581267, + "grad_norm": 8.021570688679155, + "learning_rate": 1.7159497091503655e-06, + "loss": 0.6497, + "step": 16983 + }, + { + "epoch": 1.2270124803583362, + "grad_norm": 8.187669592112849, + "learning_rate": 1.7156719693026667e-06, + "loss": 0.6637, + "step": 16984 + }, + { + "epoch": 1.2270847255585458, + "grad_norm": 7.1654249786395585, + "learning_rate": 1.7153942401918095e-06, + "loss": 0.6609, + "step": 16985 + }, + { + "epoch": 1.2271569707587553, + "grad_norm": 7.590133245250973, + "learning_rate": 1.715116521821597e-06, + "loss": 0.5896, + "step": 16986 + }, + { + "epoch": 1.2272292159589648, + "grad_norm": 6.54602103479201, + "learning_rate": 1.7148388141958292e-06, + "loss": 0.6706, + "step": 16987 + }, + { + "epoch": 1.2273014611591742, + "grad_norm": 7.799825901142587, + "learning_rate": 1.7145611173183075e-06, + "loss": 0.7177, + "step": 16988 + }, + { + "epoch": 1.2273737063593837, + "grad_norm": 6.541597046046309, + "learning_rate": 1.7142834311928353e-06, + "loss": 0.6561, + "step": 16989 + }, + { + "epoch": 1.2274459515595932, + "grad_norm": 6.737509013720199, + "learning_rate": 1.7140057558232132e-06, + "loss": 0.6894, + "step": 16990 + }, + { + "epoch": 1.2275181967598028, + "grad_norm": 8.019183047725603, + "learning_rate": 1.7137280912132415e-06, + "loss": 0.6913, + "step": 16991 + }, + { + "epoch": 1.2275904419600123, + "grad_norm": 7.386976184121307, + "learning_rate": 1.7134504373667215e-06, + "loss": 0.5951, + "step": 16992 + }, + { + "epoch": 1.2276626871602219, + "grad_norm": 6.765424835556627, + "learning_rate": 1.7131727942874548e-06, + "loss": 0.7469, + "step": 16993 + }, + { + "epoch": 1.2277349323604314, + "grad_norm": 6.327341591650607, + "learning_rate": 1.712895161979241e-06, + "loss": 0.6353, + "step": 16994 + }, + { + "epoch": 1.2278071775606407, + "grad_norm": 6.7253862121181145, + "learning_rate": 1.7126175404458818e-06, + "loss": 0.6826, + "step": 16995 + }, + { + "epoch": 1.2278794227608503, + "grad_norm": 7.712408140058956, + "learning_rate": 1.7123399296911763e-06, + "loss": 0.6609, + "step": 16996 + }, + { + "epoch": 1.2279516679610598, + "grad_norm": 6.453493564970468, + "learning_rate": 1.7120623297189265e-06, + "loss": 0.6734, + "step": 16997 + }, + { + "epoch": 1.2280239131612694, + "grad_norm": 6.472525411698136, + "learning_rate": 1.7117847405329313e-06, + "loss": 0.6807, + "step": 16998 + }, + { + "epoch": 1.228096158361479, + "grad_norm": 6.919464587187353, + "learning_rate": 1.7115071621369908e-06, + "loss": 0.5548, + "step": 16999 + }, + { + "epoch": 1.2281684035616884, + "grad_norm": 9.674807003964721, + "learning_rate": 1.7112295945349062e-06, + "loss": 0.6577, + "step": 17000 + }, + { + "epoch": 1.228240648761898, + "grad_norm": 6.482157647522466, + "learning_rate": 1.7109520377304748e-06, + "loss": 0.6276, + "step": 17001 + }, + { + "epoch": 1.2283128939621073, + "grad_norm": 6.974446931740331, + "learning_rate": 1.7106744917274976e-06, + "loss": 0.6576, + "step": 17002 + }, + { + "epoch": 1.2283851391623168, + "grad_norm": 6.3443464928923134, + "learning_rate": 1.7103969565297729e-06, + "loss": 0.6524, + "step": 17003 + }, + { + "epoch": 1.2284573843625264, + "grad_norm": 5.99185263283676, + "learning_rate": 1.7101194321411025e-06, + "loss": 0.6839, + "step": 17004 + }, + { + "epoch": 1.228529629562736, + "grad_norm": 8.295893321897333, + "learning_rate": 1.7098419185652826e-06, + "loss": 0.6504, + "step": 17005 + }, + { + "epoch": 1.2286018747629455, + "grad_norm": 8.123691394768846, + "learning_rate": 1.7095644158061133e-06, + "loss": 0.7303, + "step": 17006 + }, + { + "epoch": 1.228674119963155, + "grad_norm": 7.0164182762465135, + "learning_rate": 1.7092869238673943e-06, + "loss": 0.6228, + "step": 17007 + }, + { + "epoch": 1.2287463651633646, + "grad_norm": 6.301565236214692, + "learning_rate": 1.7090094427529225e-06, + "loss": 0.6574, + "step": 17008 + }, + { + "epoch": 1.2288186103635739, + "grad_norm": 6.426524166305408, + "learning_rate": 1.7087319724664972e-06, + "loss": 0.6312, + "step": 17009 + }, + { + "epoch": 1.2288908555637834, + "grad_norm": 6.290666344890988, + "learning_rate": 1.708454513011917e-06, + "loss": 0.6488, + "step": 17010 + }, + { + "epoch": 1.228963100763993, + "grad_norm": 5.955969423808668, + "learning_rate": 1.7081770643929803e-06, + "loss": 0.6834, + "step": 17011 + }, + { + "epoch": 1.2290353459642025, + "grad_norm": 8.026802939507578, + "learning_rate": 1.7078996266134845e-06, + "loss": 0.6701, + "step": 17012 + }, + { + "epoch": 1.229107591164412, + "grad_norm": 6.811416119786308, + "learning_rate": 1.7076221996772273e-06, + "loss": 0.613, + "step": 17013 + }, + { + "epoch": 1.2291798363646216, + "grad_norm": 6.503846717539101, + "learning_rate": 1.7073447835880074e-06, + "loss": 0.7023, + "step": 17014 + }, + { + "epoch": 1.2292520815648311, + "grad_norm": 5.416199101539707, + "learning_rate": 1.707067378349623e-06, + "loss": 0.5986, + "step": 17015 + }, + { + "epoch": 1.2293243267650404, + "grad_norm": 6.102612433173311, + "learning_rate": 1.7067899839658694e-06, + "loss": 0.5596, + "step": 17016 + }, + { + "epoch": 1.22939657196525, + "grad_norm": 6.150980131620476, + "learning_rate": 1.7065126004405446e-06, + "loss": 0.6641, + "step": 17017 + }, + { + "epoch": 1.2294688171654595, + "grad_norm": 6.651362814402652, + "learning_rate": 1.7062352277774473e-06, + "loss": 0.6255, + "step": 17018 + }, + { + "epoch": 1.229541062365669, + "grad_norm": 6.736646370961784, + "learning_rate": 1.705957865980373e-06, + "loss": 0.6867, + "step": 17019 + }, + { + "epoch": 1.2296133075658786, + "grad_norm": 6.621567034614728, + "learning_rate": 1.7056805150531187e-06, + "loss": 0.6652, + "step": 17020 + }, + { + "epoch": 1.2296855527660882, + "grad_norm": 7.787673198800676, + "learning_rate": 1.7054031749994814e-06, + "loss": 0.5959, + "step": 17021 + }, + { + "epoch": 1.2297577979662977, + "grad_norm": 6.349706099309434, + "learning_rate": 1.7051258458232583e-06, + "loss": 0.6219, + "step": 17022 + }, + { + "epoch": 1.229830043166507, + "grad_norm": 7.809615678501627, + "learning_rate": 1.7048485275282444e-06, + "loss": 0.6506, + "step": 17023 + }, + { + "epoch": 1.2299022883667166, + "grad_norm": 6.421916283987586, + "learning_rate": 1.7045712201182369e-06, + "loss": 0.5594, + "step": 17024 + }, + { + "epoch": 1.229974533566926, + "grad_norm": 7.041537883717041, + "learning_rate": 1.7042939235970325e-06, + "loss": 0.6475, + "step": 17025 + }, + { + "epoch": 1.2300467787671356, + "grad_norm": 7.220364476776102, + "learning_rate": 1.7040166379684255e-06, + "loss": 0.7024, + "step": 17026 + }, + { + "epoch": 1.2301190239673452, + "grad_norm": 7.387138076410982, + "learning_rate": 1.7037393632362132e-06, + "loss": 0.6888, + "step": 17027 + }, + { + "epoch": 1.2301912691675547, + "grad_norm": 7.19547684903238, + "learning_rate": 1.7034620994041903e-06, + "loss": 0.6565, + "step": 17028 + }, + { + "epoch": 1.2302635143677643, + "grad_norm": 6.069622457159995, + "learning_rate": 1.7031848464761542e-06, + "loss": 0.6244, + "step": 17029 + }, + { + "epoch": 1.2303357595679736, + "grad_norm": 7.1652194787195915, + "learning_rate": 1.7029076044558976e-06, + "loss": 0.5792, + "step": 17030 + }, + { + "epoch": 1.2304080047681831, + "grad_norm": 7.325299399444633, + "learning_rate": 1.7026303733472166e-06, + "loss": 0.682, + "step": 17031 + }, + { + "epoch": 1.2304802499683927, + "grad_norm": 5.834662077205138, + "learning_rate": 1.702353153153908e-06, + "loss": 0.6088, + "step": 17032 + }, + { + "epoch": 1.2305524951686022, + "grad_norm": 6.601398967094316, + "learning_rate": 1.7020759438797647e-06, + "loss": 0.6548, + "step": 17033 + }, + { + "epoch": 1.2306247403688118, + "grad_norm": 6.84316888719701, + "learning_rate": 1.7017987455285818e-06, + "loss": 0.6436, + "step": 17034 + }, + { + "epoch": 1.2306969855690213, + "grad_norm": 6.166921902434992, + "learning_rate": 1.7015215581041543e-06, + "loss": 0.6396, + "step": 17035 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 6.708514414489549, + "learning_rate": 1.7012443816102778e-06, + "loss": 0.6501, + "step": 17036 + }, + { + "epoch": 1.2308414759694402, + "grad_norm": 8.939947046724727, + "learning_rate": 1.7009672160507445e-06, + "loss": 0.6107, + "step": 17037 + }, + { + "epoch": 1.2309137211696497, + "grad_norm": 5.979671053817696, + "learning_rate": 1.7006900614293497e-06, + "loss": 0.5865, + "step": 17038 + }, + { + "epoch": 1.2309859663698592, + "grad_norm": 6.699472423677497, + "learning_rate": 1.700412917749888e-06, + "loss": 0.5985, + "step": 17039 + }, + { + "epoch": 1.2310582115700688, + "grad_norm": 7.406341423403953, + "learning_rate": 1.700135785016152e-06, + "loss": 0.7252, + "step": 17040 + }, + { + "epoch": 1.2311304567702783, + "grad_norm": 6.9505942550293405, + "learning_rate": 1.6998586632319357e-06, + "loss": 0.664, + "step": 17041 + }, + { + "epoch": 1.2312027019704879, + "grad_norm": 7.262624338369253, + "learning_rate": 1.6995815524010337e-06, + "loss": 0.7226, + "step": 17042 + }, + { + "epoch": 1.2312749471706974, + "grad_norm": 6.921857640091803, + "learning_rate": 1.6993044525272395e-06, + "loss": 0.643, + "step": 17043 + }, + { + "epoch": 1.231347192370907, + "grad_norm": 7.13832105199859, + "learning_rate": 1.6990273636143445e-06, + "loss": 0.678, + "step": 17044 + }, + { + "epoch": 1.2314194375711165, + "grad_norm": 5.577677219914726, + "learning_rate": 1.6987502856661426e-06, + "loss": 0.6084, + "step": 17045 + }, + { + "epoch": 1.2314916827713258, + "grad_norm": 7.8810188664933065, + "learning_rate": 1.6984732186864274e-06, + "loss": 0.614, + "step": 17046 + }, + { + "epoch": 1.2315639279715354, + "grad_norm": 7.860057156575814, + "learning_rate": 1.6981961626789928e-06, + "loss": 0.6517, + "step": 17047 + }, + { + "epoch": 1.231636173171745, + "grad_norm": 6.595796818811873, + "learning_rate": 1.697919117647629e-06, + "loss": 0.6969, + "step": 17048 + }, + { + "epoch": 1.2317084183719544, + "grad_norm": 6.5924678477956, + "learning_rate": 1.6976420835961299e-06, + "loss": 0.6468, + "step": 17049 + }, + { + "epoch": 1.231780663572164, + "grad_norm": 6.46559958289319, + "learning_rate": 1.6973650605282882e-06, + "loss": 0.6493, + "step": 17050 + }, + { + "epoch": 1.2318529087723735, + "grad_norm": 8.680137093863307, + "learning_rate": 1.6970880484478946e-06, + "loss": 0.683, + "step": 17051 + }, + { + "epoch": 1.231925153972583, + "grad_norm": 7.707517783521539, + "learning_rate": 1.696811047358743e-06, + "loss": 0.664, + "step": 17052 + }, + { + "epoch": 1.2319973991727924, + "grad_norm": 7.157316303354196, + "learning_rate": 1.6965340572646239e-06, + "loss": 0.6864, + "step": 17053 + }, + { + "epoch": 1.232069644373002, + "grad_norm": 6.711175099475525, + "learning_rate": 1.6962570781693305e-06, + "loss": 0.5378, + "step": 17054 + }, + { + "epoch": 1.2321418895732115, + "grad_norm": 6.798714505029494, + "learning_rate": 1.6959801100766529e-06, + "loss": 0.6303, + "step": 17055 + }, + { + "epoch": 1.232214134773421, + "grad_norm": 7.19209103118689, + "learning_rate": 1.6957031529903834e-06, + "loss": 0.6294, + "step": 17056 + }, + { + "epoch": 1.2322863799736306, + "grad_norm": 7.961184991016202, + "learning_rate": 1.6954262069143146e-06, + "loss": 0.7207, + "step": 17057 + }, + { + "epoch": 1.23235862517384, + "grad_norm": 6.74137129103159, + "learning_rate": 1.695149271852235e-06, + "loss": 0.7273, + "step": 17058 + }, + { + "epoch": 1.2324308703740496, + "grad_norm": 7.6110938165596576, + "learning_rate": 1.6948723478079363e-06, + "loss": 0.6165, + "step": 17059 + }, + { + "epoch": 1.232503115574259, + "grad_norm": 6.411650079689687, + "learning_rate": 1.6945954347852106e-06, + "loss": 0.5874, + "step": 17060 + }, + { + "epoch": 1.2325753607744685, + "grad_norm": 6.92266496753133, + "learning_rate": 1.6943185327878492e-06, + "loss": 0.7039, + "step": 17061 + }, + { + "epoch": 1.232647605974678, + "grad_norm": 7.766038112282593, + "learning_rate": 1.6940416418196403e-06, + "loss": 0.6428, + "step": 17062 + }, + { + "epoch": 1.2327198511748876, + "grad_norm": 6.3317535170044055, + "learning_rate": 1.6937647618843756e-06, + "loss": 0.6655, + "step": 17063 + }, + { + "epoch": 1.2327920963750971, + "grad_norm": 7.321971530253566, + "learning_rate": 1.693487892985846e-06, + "loss": 0.6379, + "step": 17064 + }, + { + "epoch": 1.2328643415753067, + "grad_norm": 6.42595548538476, + "learning_rate": 1.6932110351278402e-06, + "loss": 0.632, + "step": 17065 + }, + { + "epoch": 1.2329365867755162, + "grad_norm": 5.987737841252438, + "learning_rate": 1.6929341883141493e-06, + "loss": 0.6498, + "step": 17066 + }, + { + "epoch": 1.2330088319757255, + "grad_norm": 7.433599651452597, + "learning_rate": 1.6926573525485622e-06, + "loss": 0.6776, + "step": 17067 + }, + { + "epoch": 1.233081077175935, + "grad_norm": 7.142886548662964, + "learning_rate": 1.6923805278348698e-06, + "loss": 0.6534, + "step": 17068 + }, + { + "epoch": 1.2331533223761446, + "grad_norm": 6.814239279927828, + "learning_rate": 1.6921037141768605e-06, + "loss": 0.6373, + "step": 17069 + }, + { + "epoch": 1.2332255675763542, + "grad_norm": 7.231273010797577, + "learning_rate": 1.6918269115783238e-06, + "loss": 0.6315, + "step": 17070 + }, + { + "epoch": 1.2332978127765637, + "grad_norm": 6.058705509023223, + "learning_rate": 1.6915501200430493e-06, + "loss": 0.5779, + "step": 17071 + }, + { + "epoch": 1.2333700579767732, + "grad_norm": 6.611335048925473, + "learning_rate": 1.691273339574827e-06, + "loss": 0.6677, + "step": 17072 + }, + { + "epoch": 1.2334423031769828, + "grad_norm": 7.3364294047672, + "learning_rate": 1.6909965701774433e-06, + "loss": 0.6652, + "step": 17073 + }, + { + "epoch": 1.233514548377192, + "grad_norm": 8.077916273596935, + "learning_rate": 1.6907198118546886e-06, + "loss": 0.6799, + "step": 17074 + }, + { + "epoch": 1.2335867935774016, + "grad_norm": 6.191623431226364, + "learning_rate": 1.6904430646103526e-06, + "loss": 0.6172, + "step": 17075 + }, + { + "epoch": 1.2336590387776112, + "grad_norm": 6.946492733135471, + "learning_rate": 1.6901663284482214e-06, + "loss": 0.6551, + "step": 17076 + }, + { + "epoch": 1.2337312839778207, + "grad_norm": 5.587090988038244, + "learning_rate": 1.6898896033720844e-06, + "loss": 0.691, + "step": 17077 + }, + { + "epoch": 1.2338035291780303, + "grad_norm": 5.838031112394581, + "learning_rate": 1.6896128893857296e-06, + "loss": 0.5924, + "step": 17078 + }, + { + "epoch": 1.2338757743782398, + "grad_norm": 8.700487706337213, + "learning_rate": 1.689336186492946e-06, + "loss": 0.6454, + "step": 17079 + }, + { + "epoch": 1.2339480195784494, + "grad_norm": 6.650042150837103, + "learning_rate": 1.6890594946975194e-06, + "loss": 0.6041, + "step": 17080 + }, + { + "epoch": 1.2340202647786587, + "grad_norm": 6.987142198167438, + "learning_rate": 1.6887828140032392e-06, + "loss": 0.626, + "step": 17081 + }, + { + "epoch": 1.2340925099788682, + "grad_norm": 7.0978491950590765, + "learning_rate": 1.6885061444138928e-06, + "loss": 0.6615, + "step": 17082 + }, + { + "epoch": 1.2341647551790778, + "grad_norm": 7.9244503372140995, + "learning_rate": 1.6882294859332665e-06, + "loss": 0.5947, + "step": 17083 + }, + { + "epoch": 1.2342370003792873, + "grad_norm": 6.048996821691025, + "learning_rate": 1.6879528385651484e-06, + "loss": 0.7309, + "step": 17084 + }, + { + "epoch": 1.2343092455794968, + "grad_norm": 7.925847669448163, + "learning_rate": 1.6876762023133253e-06, + "loss": 0.6501, + "step": 17085 + }, + { + "epoch": 1.2343814907797064, + "grad_norm": 6.934015997266072, + "learning_rate": 1.6873995771815854e-06, + "loss": 0.6627, + "step": 17086 + }, + { + "epoch": 1.234453735979916, + "grad_norm": 6.177606861729381, + "learning_rate": 1.687122963173713e-06, + "loss": 0.6202, + "step": 17087 + }, + { + "epoch": 1.2345259811801252, + "grad_norm": 7.017206842042898, + "learning_rate": 1.6868463602934962e-06, + "loss": 0.6825, + "step": 17088 + }, + { + "epoch": 1.2345982263803348, + "grad_norm": 6.896557388632191, + "learning_rate": 1.6865697685447223e-06, + "loss": 0.7316, + "step": 17089 + }, + { + "epoch": 1.2346704715805443, + "grad_norm": 6.265412819330492, + "learning_rate": 1.6862931879311762e-06, + "loss": 0.6542, + "step": 17090 + }, + { + "epoch": 1.2347427167807539, + "grad_norm": 6.0510488688598505, + "learning_rate": 1.6860166184566443e-06, + "loss": 0.6117, + "step": 17091 + }, + { + "epoch": 1.2348149619809634, + "grad_norm": 6.325088488295292, + "learning_rate": 1.685740060124913e-06, + "loss": 0.6076, + "step": 17092 + }, + { + "epoch": 1.234887207181173, + "grad_norm": 5.463401402314542, + "learning_rate": 1.6854635129397682e-06, + "loss": 0.667, + "step": 17093 + }, + { + "epoch": 1.2349594523813825, + "grad_norm": 6.391069392116902, + "learning_rate": 1.685186976904995e-06, + "loss": 0.6479, + "step": 17094 + }, + { + "epoch": 1.2350316975815918, + "grad_norm": 6.36349076686611, + "learning_rate": 1.6849104520243797e-06, + "loss": 0.6368, + "step": 17095 + }, + { + "epoch": 1.2351039427818014, + "grad_norm": 7.352169323968984, + "learning_rate": 1.6846339383017074e-06, + "loss": 0.6542, + "step": 17096 + }, + { + "epoch": 1.235176187982011, + "grad_norm": 7.018610329737173, + "learning_rate": 1.6843574357407638e-06, + "loss": 0.6447, + "step": 17097 + }, + { + "epoch": 1.2352484331822204, + "grad_norm": 6.682856293330322, + "learning_rate": 1.6840809443453332e-06, + "loss": 0.7204, + "step": 17098 + }, + { + "epoch": 1.23532067838243, + "grad_norm": 7.011129523466545, + "learning_rate": 1.6838044641192008e-06, + "loss": 0.677, + "step": 17099 + }, + { + "epoch": 1.2353929235826395, + "grad_norm": 6.761742162636388, + "learning_rate": 1.6835279950661527e-06, + "loss": 0.6221, + "step": 17100 + }, + { + "epoch": 1.235465168782849, + "grad_norm": 5.477825372904839, + "learning_rate": 1.683251537189971e-06, + "loss": 0.6294, + "step": 17101 + }, + { + "epoch": 1.2355374139830584, + "grad_norm": 7.32804561482521, + "learning_rate": 1.6829750904944414e-06, + "loss": 0.6649, + "step": 17102 + }, + { + "epoch": 1.235609659183268, + "grad_norm": 6.471537261358114, + "learning_rate": 1.6826986549833491e-06, + "loss": 0.6976, + "step": 17103 + }, + { + "epoch": 1.2356819043834775, + "grad_norm": 7.086265346088571, + "learning_rate": 1.6824222306604785e-06, + "loss": 0.6895, + "step": 17104 + }, + { + "epoch": 1.235754149583687, + "grad_norm": 7.236486007471314, + "learning_rate": 1.6821458175296116e-06, + "loss": 0.6738, + "step": 17105 + }, + { + "epoch": 1.2358263947838966, + "grad_norm": 6.130896085426888, + "learning_rate": 1.6818694155945335e-06, + "loss": 0.5399, + "step": 17106 + }, + { + "epoch": 1.235898639984106, + "grad_norm": 6.299084599914102, + "learning_rate": 1.6815930248590284e-06, + "loss": 0.6966, + "step": 17107 + }, + { + "epoch": 1.2359708851843156, + "grad_norm": 7.391309645960633, + "learning_rate": 1.6813166453268787e-06, + "loss": 0.6744, + "step": 17108 + }, + { + "epoch": 1.236043130384525, + "grad_norm": 6.722722923750576, + "learning_rate": 1.6810402770018685e-06, + "loss": 0.5842, + "step": 17109 + }, + { + "epoch": 1.2361153755847345, + "grad_norm": 8.011353542535499, + "learning_rate": 1.6807639198877811e-06, + "loss": 0.6514, + "step": 17110 + }, + { + "epoch": 1.236187620784944, + "grad_norm": 7.873600668937718, + "learning_rate": 1.6804875739884002e-06, + "loss": 0.6736, + "step": 17111 + }, + { + "epoch": 1.2362598659851536, + "grad_norm": 7.143319120453784, + "learning_rate": 1.6802112393075077e-06, + "loss": 0.6452, + "step": 17112 + }, + { + "epoch": 1.2363321111853631, + "grad_norm": 7.161630531783897, + "learning_rate": 1.6799349158488864e-06, + "loss": 0.6891, + "step": 17113 + }, + { + "epoch": 1.2364043563855727, + "grad_norm": 7.123596053085763, + "learning_rate": 1.6796586036163207e-06, + "loss": 0.678, + "step": 17114 + }, + { + "epoch": 1.2364766015857822, + "grad_norm": 6.441627142399598, + "learning_rate": 1.6793823026135906e-06, + "loss": 0.7031, + "step": 17115 + }, + { + "epoch": 1.2365488467859918, + "grad_norm": 6.301773778285408, + "learning_rate": 1.679106012844479e-06, + "loss": 0.6405, + "step": 17116 + }, + { + "epoch": 1.236621091986201, + "grad_norm": 5.780616849303132, + "learning_rate": 1.6788297343127695e-06, + "loss": 0.6572, + "step": 17117 + }, + { + "epoch": 1.2366933371864106, + "grad_norm": 6.774154114791548, + "learning_rate": 1.6785534670222441e-06, + "loss": 0.6698, + "step": 17118 + }, + { + "epoch": 1.2367655823866202, + "grad_norm": 7.053843366406215, + "learning_rate": 1.678277210976683e-06, + "loss": 0.6548, + "step": 17119 + }, + { + "epoch": 1.2368378275868297, + "grad_norm": 6.985065059154083, + "learning_rate": 1.6780009661798685e-06, + "loss": 0.6019, + "step": 17120 + }, + { + "epoch": 1.2369100727870392, + "grad_norm": 6.443555335759152, + "learning_rate": 1.677724732635583e-06, + "loss": 0.6173, + "step": 17121 + }, + { + "epoch": 1.2369823179872488, + "grad_norm": 6.911498686024187, + "learning_rate": 1.677448510347608e-06, + "loss": 0.6171, + "step": 17122 + }, + { + "epoch": 1.2370545631874583, + "grad_norm": 6.1998611434646005, + "learning_rate": 1.6771722993197234e-06, + "loss": 0.631, + "step": 17123 + }, + { + "epoch": 1.2371268083876679, + "grad_norm": 7.199389368230518, + "learning_rate": 1.6768960995557113e-06, + "loss": 0.5651, + "step": 17124 + }, + { + "epoch": 1.2371990535878772, + "grad_norm": 6.004715020550712, + "learning_rate": 1.6766199110593527e-06, + "loss": 0.686, + "step": 17125 + }, + { + "epoch": 1.2372712987880867, + "grad_norm": 5.780265765564473, + "learning_rate": 1.6763437338344281e-06, + "loss": 0.6681, + "step": 17126 + }, + { + "epoch": 1.2373435439882963, + "grad_norm": 6.5260611186507615, + "learning_rate": 1.6760675678847182e-06, + "loss": 0.6565, + "step": 17127 + }, + { + "epoch": 1.2374157891885058, + "grad_norm": 5.79856629903182, + "learning_rate": 1.6757914132140035e-06, + "loss": 0.642, + "step": 17128 + }, + { + "epoch": 1.2374880343887154, + "grad_norm": 7.342093552797942, + "learning_rate": 1.6755152698260658e-06, + "loss": 0.7371, + "step": 17129 + }, + { + "epoch": 1.237560279588925, + "grad_norm": 6.729349265755742, + "learning_rate": 1.6752391377246818e-06, + "loss": 0.8148, + "step": 17130 + }, + { + "epoch": 1.2376325247891344, + "grad_norm": 6.240467278517535, + "learning_rate": 1.674963016913634e-06, + "loss": 0.6457, + "step": 17131 + }, + { + "epoch": 1.2377047699893438, + "grad_norm": 5.495868865335211, + "learning_rate": 1.6746869073967035e-06, + "loss": 0.676, + "step": 17132 + }, + { + "epoch": 1.2377770151895533, + "grad_norm": 5.814889396212584, + "learning_rate": 1.674410809177667e-06, + "loss": 0.6526, + "step": 17133 + }, + { + "epoch": 1.2378492603897628, + "grad_norm": 6.9641812662529485, + "learning_rate": 1.6741347222603055e-06, + "loss": 0.6337, + "step": 17134 + }, + { + "epoch": 1.2379215055899724, + "grad_norm": 5.980744298844757, + "learning_rate": 1.6738586466483985e-06, + "loss": 0.6299, + "step": 17135 + }, + { + "epoch": 1.237993750790182, + "grad_norm": 5.878456559836001, + "learning_rate": 1.673582582345726e-06, + "loss": 0.6204, + "step": 17136 + }, + { + "epoch": 1.2380659959903915, + "grad_norm": 7.035085759294267, + "learning_rate": 1.6733065293560651e-06, + "loss": 0.6735, + "step": 17137 + }, + { + "epoch": 1.238138241190601, + "grad_norm": 8.158176706100987, + "learning_rate": 1.6730304876831965e-06, + "loss": 0.6569, + "step": 17138 + }, + { + "epoch": 1.2382104863908103, + "grad_norm": 7.18092908283293, + "learning_rate": 1.6727544573308985e-06, + "loss": 0.6615, + "step": 17139 + }, + { + "epoch": 1.2382827315910199, + "grad_norm": 8.847126346576337, + "learning_rate": 1.6724784383029496e-06, + "loss": 0.6303, + "step": 17140 + }, + { + "epoch": 1.2383549767912294, + "grad_norm": 6.392062522171816, + "learning_rate": 1.672202430603128e-06, + "loss": 0.6301, + "step": 17141 + }, + { + "epoch": 1.238427221991439, + "grad_norm": 8.148181885671931, + "learning_rate": 1.6719264342352126e-06, + "loss": 0.6945, + "step": 17142 + }, + { + "epoch": 1.2384994671916485, + "grad_norm": 7.596131504698291, + "learning_rate": 1.6716504492029823e-06, + "loss": 0.6327, + "step": 17143 + }, + { + "epoch": 1.238571712391858, + "grad_norm": 6.802528853574304, + "learning_rate": 1.6713744755102125e-06, + "loss": 0.6502, + "step": 17144 + }, + { + "epoch": 1.2386439575920676, + "grad_norm": 6.448588182756072, + "learning_rate": 1.6710985131606832e-06, + "loss": 0.6109, + "step": 17145 + }, + { + "epoch": 1.238716202792277, + "grad_norm": 7.829287140064671, + "learning_rate": 1.670822562158172e-06, + "loss": 0.6914, + "step": 17146 + }, + { + "epoch": 1.2387884479924864, + "grad_norm": 8.316214513113824, + "learning_rate": 1.670546622506457e-06, + "loss": 0.7089, + "step": 17147 + }, + { + "epoch": 1.238860693192696, + "grad_norm": 7.523270174346455, + "learning_rate": 1.6702706942093138e-06, + "loss": 0.6553, + "step": 17148 + }, + { + "epoch": 1.2389329383929055, + "grad_norm": 6.264746092988856, + "learning_rate": 1.6699947772705204e-06, + "loss": 0.6815, + "step": 17149 + }, + { + "epoch": 1.239005183593115, + "grad_norm": 7.943819669092356, + "learning_rate": 1.6697188716938545e-06, + "loss": 0.6959, + "step": 17150 + }, + { + "epoch": 1.2390774287933246, + "grad_norm": 6.382243211588324, + "learning_rate": 1.6694429774830923e-06, + "loss": 0.6201, + "step": 17151 + }, + { + "epoch": 1.2391496739935342, + "grad_norm": 5.643422962400031, + "learning_rate": 1.6691670946420104e-06, + "loss": 0.6355, + "step": 17152 + }, + { + "epoch": 1.2392219191937435, + "grad_norm": 7.720405184929248, + "learning_rate": 1.6688912231743863e-06, + "loss": 0.5891, + "step": 17153 + }, + { + "epoch": 1.239294164393953, + "grad_norm": 5.963603572163224, + "learning_rate": 1.6686153630839964e-06, + "loss": 0.6525, + "step": 17154 + }, + { + "epoch": 1.2393664095941626, + "grad_norm": 6.05688877575325, + "learning_rate": 1.6683395143746162e-06, + "loss": 0.6059, + "step": 17155 + }, + { + "epoch": 1.239438654794372, + "grad_norm": 6.823334643735451, + "learning_rate": 1.6680636770500222e-06, + "loss": 0.6033, + "step": 17156 + }, + { + "epoch": 1.2395108999945816, + "grad_norm": 5.448312991759629, + "learning_rate": 1.6677878511139916e-06, + "loss": 0.7245, + "step": 17157 + }, + { + "epoch": 1.2395831451947912, + "grad_norm": 6.306888011948815, + "learning_rate": 1.6675120365702973e-06, + "loss": 0.6074, + "step": 17158 + }, + { + "epoch": 1.2396553903950007, + "grad_norm": 7.8481775683147506, + "learning_rate": 1.6672362334227177e-06, + "loss": 0.683, + "step": 17159 + }, + { + "epoch": 1.23972763559521, + "grad_norm": 6.448502998213519, + "learning_rate": 1.6669604416750277e-06, + "loss": 0.637, + "step": 17160 + }, + { + "epoch": 1.2397998807954196, + "grad_norm": 6.665722048911851, + "learning_rate": 1.6666846613310031e-06, + "loss": 0.6439, + "step": 17161 + }, + { + "epoch": 1.2398721259956291, + "grad_norm": 7.911888554527773, + "learning_rate": 1.6664088923944174e-06, + "loss": 0.6092, + "step": 17162 + }, + { + "epoch": 1.2399443711958387, + "grad_norm": 6.937771078776231, + "learning_rate": 1.6661331348690468e-06, + "loss": 0.7002, + "step": 17163 + }, + { + "epoch": 1.2400166163960482, + "grad_norm": 5.663817680592671, + "learning_rate": 1.6658573887586669e-06, + "loss": 0.6137, + "step": 17164 + }, + { + "epoch": 1.2400888615962578, + "grad_norm": 6.3878758866704555, + "learning_rate": 1.6655816540670508e-06, + "loss": 0.6132, + "step": 17165 + }, + { + "epoch": 1.2401611067964673, + "grad_norm": 6.398428855210125, + "learning_rate": 1.6653059307979742e-06, + "loss": 0.7003, + "step": 17166 + }, + { + "epoch": 1.2402333519966766, + "grad_norm": 5.951552936909815, + "learning_rate": 1.665030218955211e-06, + "loss": 0.5775, + "step": 17167 + }, + { + "epoch": 1.2403055971968862, + "grad_norm": 7.564358569558139, + "learning_rate": 1.6647545185425368e-06, + "loss": 0.6592, + "step": 17168 + }, + { + "epoch": 1.2403778423970957, + "grad_norm": 5.794487392079034, + "learning_rate": 1.6644788295637238e-06, + "loss": 0.558, + "step": 17169 + }, + { + "epoch": 1.2404500875973052, + "grad_norm": 6.3670506000428375, + "learning_rate": 1.664203152022547e-06, + "loss": 0.6285, + "step": 17170 + }, + { + "epoch": 1.2405223327975148, + "grad_norm": 7.362897090857419, + "learning_rate": 1.6639274859227811e-06, + "loss": 0.6763, + "step": 17171 + }, + { + "epoch": 1.2405945779977243, + "grad_norm": 7.683379270646579, + "learning_rate": 1.663651831268197e-06, + "loss": 0.6334, + "step": 17172 + }, + { + "epoch": 1.2406668231979339, + "grad_norm": 6.37661303888328, + "learning_rate": 1.663376188062571e-06, + "loss": 0.7154, + "step": 17173 + }, + { + "epoch": 1.2407390683981432, + "grad_norm": 6.552315502167261, + "learning_rate": 1.663100556309675e-06, + "loss": 0.6258, + "step": 17174 + }, + { + "epoch": 1.2408113135983527, + "grad_norm": 7.319546947529597, + "learning_rate": 1.6628249360132836e-06, + "loss": 0.6598, + "step": 17175 + }, + { + "epoch": 1.2408835587985623, + "grad_norm": 7.117711136918475, + "learning_rate": 1.6625493271771678e-06, + "loss": 0.6365, + "step": 17176 + }, + { + "epoch": 1.2409558039987718, + "grad_norm": 7.019360335895749, + "learning_rate": 1.6622737298051017e-06, + "loss": 0.6141, + "step": 17177 + }, + { + "epoch": 1.2410280491989814, + "grad_norm": 7.033448007139928, + "learning_rate": 1.661998143900857e-06, + "loss": 0.6216, + "step": 17178 + }, + { + "epoch": 1.241100294399191, + "grad_norm": 6.998741309216256, + "learning_rate": 1.6617225694682088e-06, + "loss": 0.6445, + "step": 17179 + }, + { + "epoch": 1.2411725395994004, + "grad_norm": 7.268078855196972, + "learning_rate": 1.6614470065109264e-06, + "loss": 0.6388, + "step": 17180 + }, + { + "epoch": 1.2412447847996098, + "grad_norm": 6.12508158726752, + "learning_rate": 1.6611714550327838e-06, + "loss": 0.5695, + "step": 17181 + }, + { + "epoch": 1.2413170299998193, + "grad_norm": 7.217106355424438, + "learning_rate": 1.6608959150375527e-06, + "loss": 0.6653, + "step": 17182 + }, + { + "epoch": 1.2413892752000288, + "grad_norm": 6.098226145493625, + "learning_rate": 1.660620386529005e-06, + "loss": 0.6027, + "step": 17183 + }, + { + "epoch": 1.2414615204002384, + "grad_norm": 6.768052720552185, + "learning_rate": 1.660344869510912e-06, + "loss": 0.6308, + "step": 17184 + }, + { + "epoch": 1.241533765600448, + "grad_norm": 7.0380015870352945, + "learning_rate": 1.6600693639870464e-06, + "loss": 0.6099, + "step": 17185 + }, + { + "epoch": 1.2416060108006575, + "grad_norm": 8.575859840813477, + "learning_rate": 1.6597938699611788e-06, + "loss": 0.694, + "step": 17186 + }, + { + "epoch": 1.241678256000867, + "grad_norm": 6.189061748039433, + "learning_rate": 1.6595183874370807e-06, + "loss": 0.6288, + "step": 17187 + }, + { + "epoch": 1.2417505012010763, + "grad_norm": 6.508394542736771, + "learning_rate": 1.6592429164185232e-06, + "loss": 0.6356, + "step": 17188 + }, + { + "epoch": 1.2418227464012859, + "grad_norm": 6.24177284439019, + "learning_rate": 1.6589674569092785e-06, + "loss": 0.6864, + "step": 17189 + }, + { + "epoch": 1.2418949916014954, + "grad_norm": 7.147295951372373, + "learning_rate": 1.658692008913115e-06, + "loss": 0.6507, + "step": 17190 + }, + { + "epoch": 1.241967236801705, + "grad_norm": 7.390502412517287, + "learning_rate": 1.6584165724338048e-06, + "loss": 0.6456, + "step": 17191 + }, + { + "epoch": 1.2420394820019145, + "grad_norm": 6.487118501416173, + "learning_rate": 1.6581411474751185e-06, + "loss": 0.6207, + "step": 17192 + }, + { + "epoch": 1.242111727202124, + "grad_norm": 6.9168917454632055, + "learning_rate": 1.6578657340408264e-06, + "loss": 0.6448, + "step": 17193 + }, + { + "epoch": 1.2421839724023336, + "grad_norm": 6.8339542323927995, + "learning_rate": 1.657590332134698e-06, + "loss": 0.6902, + "step": 17194 + }, + { + "epoch": 1.2422562176025431, + "grad_norm": 7.4051288770218395, + "learning_rate": 1.6573149417605042e-06, + "loss": 0.6648, + "step": 17195 + }, + { + "epoch": 1.2423284628027527, + "grad_norm": 7.533198820730557, + "learning_rate": 1.6570395629220148e-06, + "loss": 0.7101, + "step": 17196 + }, + { + "epoch": 1.242400708002962, + "grad_norm": 6.853903845605414, + "learning_rate": 1.6567641956229989e-06, + "loss": 0.653, + "step": 17197 + }, + { + "epoch": 1.2424729532031715, + "grad_norm": 6.696659553540511, + "learning_rate": 1.6564888398672262e-06, + "loss": 0.6754, + "step": 17198 + }, + { + "epoch": 1.242545198403381, + "grad_norm": 6.15590794576387, + "learning_rate": 1.6562134956584665e-06, + "loss": 0.6533, + "step": 17199 + }, + { + "epoch": 1.2426174436035906, + "grad_norm": 6.957438093452126, + "learning_rate": 1.6559381630004895e-06, + "loss": 0.6582, + "step": 17200 + }, + { + "epoch": 1.2426896888038002, + "grad_norm": 6.0008623774988905, + "learning_rate": 1.655662841897063e-06, + "loss": 0.6628, + "step": 17201 + }, + { + "epoch": 1.2427619340040097, + "grad_norm": 6.31633740106262, + "learning_rate": 1.6553875323519567e-06, + "loss": 0.6997, + "step": 17202 + }, + { + "epoch": 1.2428341792042192, + "grad_norm": 6.315750040801239, + "learning_rate": 1.6551122343689391e-06, + "loss": 0.588, + "step": 17203 + }, + { + "epoch": 1.2429064244044286, + "grad_norm": 6.3230861618057315, + "learning_rate": 1.6548369479517803e-06, + "loss": 0.554, + "step": 17204 + }, + { + "epoch": 1.242978669604638, + "grad_norm": 7.125514497083318, + "learning_rate": 1.6545616731042462e-06, + "loss": 0.6538, + "step": 17205 + }, + { + "epoch": 1.2430509148048476, + "grad_norm": 6.318513027015027, + "learning_rate": 1.6542864098301064e-06, + "loss": 0.6956, + "step": 17206 + }, + { + "epoch": 1.2431231600050572, + "grad_norm": 5.923196046376364, + "learning_rate": 1.6540111581331296e-06, + "loss": 0.6574, + "step": 17207 + }, + { + "epoch": 1.2431954052052667, + "grad_norm": 6.321768424417561, + "learning_rate": 1.6537359180170827e-06, + "loss": 0.6174, + "step": 17208 + }, + { + "epoch": 1.2432676504054763, + "grad_norm": 6.923883495459135, + "learning_rate": 1.6534606894857342e-06, + "loss": 0.6796, + "step": 17209 + }, + { + "epoch": 1.2433398956056858, + "grad_norm": 6.527816234863124, + "learning_rate": 1.653185472542851e-06, + "loss": 0.7307, + "step": 17210 + }, + { + "epoch": 1.2434121408058951, + "grad_norm": 6.650043871742032, + "learning_rate": 1.6529102671922021e-06, + "loss": 0.6287, + "step": 17211 + }, + { + "epoch": 1.2434843860061047, + "grad_norm": 7.003024401574149, + "learning_rate": 1.6526350734375534e-06, + "loss": 0.6166, + "step": 17212 + }, + { + "epoch": 1.2435566312063142, + "grad_norm": 6.32284755344315, + "learning_rate": 1.6523598912826724e-06, + "loss": 0.6341, + "step": 17213 + }, + { + "epoch": 1.2436288764065238, + "grad_norm": 7.159178821634956, + "learning_rate": 1.6520847207313273e-06, + "loss": 0.666, + "step": 17214 + }, + { + "epoch": 1.2437011216067333, + "grad_norm": 7.58820382236507, + "learning_rate": 1.6518095617872827e-06, + "loss": 0.762, + "step": 17215 + }, + { + "epoch": 1.2437733668069428, + "grad_norm": 8.345652616934105, + "learning_rate": 1.6515344144543073e-06, + "loss": 0.7377, + "step": 17216 + }, + { + "epoch": 1.2438456120071524, + "grad_norm": 6.327778069675979, + "learning_rate": 1.6512592787361663e-06, + "loss": 0.6704, + "step": 17217 + }, + { + "epoch": 1.2439178572073617, + "grad_norm": 8.050615407983766, + "learning_rate": 1.6509841546366283e-06, + "loss": 0.6864, + "step": 17218 + }, + { + "epoch": 1.2439901024075712, + "grad_norm": 7.271451062155092, + "learning_rate": 1.650709042159457e-06, + "loss": 0.6565, + "step": 17219 + }, + { + "epoch": 1.2440623476077808, + "grad_norm": 6.275087284529738, + "learning_rate": 1.6504339413084196e-06, + "loss": 0.6729, + "step": 17220 + }, + { + "epoch": 1.2441345928079903, + "grad_norm": 5.257245831043793, + "learning_rate": 1.6501588520872822e-06, + "loss": 0.6349, + "step": 17221 + }, + { + "epoch": 1.2442068380081999, + "grad_norm": 7.536872759226437, + "learning_rate": 1.6498837744998097e-06, + "loss": 0.714, + "step": 17222 + }, + { + "epoch": 1.2442790832084094, + "grad_norm": 6.795848970689525, + "learning_rate": 1.6496087085497683e-06, + "loss": 0.6637, + "step": 17223 + }, + { + "epoch": 1.244351328408619, + "grad_norm": 6.690158208735437, + "learning_rate": 1.6493336542409232e-06, + "loss": 0.644, + "step": 17224 + }, + { + "epoch": 1.2444235736088283, + "grad_norm": 6.845149763654937, + "learning_rate": 1.6490586115770407e-06, + "loss": 0.6505, + "step": 17225 + }, + { + "epoch": 1.2444958188090378, + "grad_norm": 6.238657310009765, + "learning_rate": 1.6487835805618845e-06, + "loss": 0.5782, + "step": 17226 + }, + { + "epoch": 1.2445680640092474, + "grad_norm": 6.482279169960569, + "learning_rate": 1.6485085611992202e-06, + "loss": 0.5977, + "step": 17227 + }, + { + "epoch": 1.244640309209457, + "grad_norm": 7.090596443344049, + "learning_rate": 1.6482335534928122e-06, + "loss": 0.7099, + "step": 17228 + }, + { + "epoch": 1.2447125544096664, + "grad_norm": 7.1222226267692355, + "learning_rate": 1.6479585574464267e-06, + "loss": 0.7054, + "step": 17229 + }, + { + "epoch": 1.244784799609876, + "grad_norm": 6.050971011590245, + "learning_rate": 1.6476835730638258e-06, + "loss": 0.6467, + "step": 17230 + }, + { + "epoch": 1.2448570448100855, + "grad_norm": 6.102331135064896, + "learning_rate": 1.6474086003487755e-06, + "loss": 0.6273, + "step": 17231 + }, + { + "epoch": 1.2449292900102948, + "grad_norm": 6.7586430915686755, + "learning_rate": 1.6471336393050403e-06, + "loss": 0.6985, + "step": 17232 + }, + { + "epoch": 1.2450015352105044, + "grad_norm": 5.949276137998816, + "learning_rate": 1.646858689936382e-06, + "loss": 0.6167, + "step": 17233 + }, + { + "epoch": 1.245073780410714, + "grad_norm": 6.1391835935499195, + "learning_rate": 1.6465837522465662e-06, + "loss": 0.6356, + "step": 17234 + }, + { + "epoch": 1.2451460256109235, + "grad_norm": 7.303657859000372, + "learning_rate": 1.6463088262393557e-06, + "loss": 0.6784, + "step": 17235 + }, + { + "epoch": 1.245218270811133, + "grad_norm": 6.084667146680143, + "learning_rate": 1.6460339119185158e-06, + "loss": 0.6123, + "step": 17236 + }, + { + "epoch": 1.2452905160113426, + "grad_norm": 7.228190800507078, + "learning_rate": 1.6457590092878079e-06, + "loss": 0.636, + "step": 17237 + }, + { + "epoch": 1.245362761211552, + "grad_norm": 5.569179853124712, + "learning_rate": 1.6454841183509956e-06, + "loss": 0.6486, + "step": 17238 + }, + { + "epoch": 1.2454350064117614, + "grad_norm": 6.8017440018031, + "learning_rate": 1.645209239111843e-06, + "loss": 0.6575, + "step": 17239 + }, + { + "epoch": 1.245507251611971, + "grad_norm": 7.375952966188425, + "learning_rate": 1.6449343715741115e-06, + "loss": 0.65, + "step": 17240 + }, + { + "epoch": 1.2455794968121805, + "grad_norm": 7.942777786368132, + "learning_rate": 1.6446595157415645e-06, + "loss": 0.6629, + "step": 17241 + }, + { + "epoch": 1.24565174201239, + "grad_norm": 6.3466757055159455, + "learning_rate": 1.6443846716179646e-06, + "loss": 0.6668, + "step": 17242 + }, + { + "epoch": 1.2457239872125996, + "grad_norm": 6.531119459389965, + "learning_rate": 1.6441098392070749e-06, + "loss": 0.6017, + "step": 17243 + }, + { + "epoch": 1.2457962324128091, + "grad_norm": 7.102301060845472, + "learning_rate": 1.6438350185126561e-06, + "loss": 0.6665, + "step": 17244 + }, + { + "epoch": 1.2458684776130187, + "grad_norm": 6.60535122650109, + "learning_rate": 1.6435602095384718e-06, + "loss": 0.6943, + "step": 17245 + }, + { + "epoch": 1.245940722813228, + "grad_norm": 7.456018298147121, + "learning_rate": 1.6432854122882835e-06, + "loss": 0.602, + "step": 17246 + }, + { + "epoch": 1.2460129680134375, + "grad_norm": 7.08913752293548, + "learning_rate": 1.6430106267658522e-06, + "loss": 0.7516, + "step": 17247 + }, + { + "epoch": 1.246085213213647, + "grad_norm": 5.66513762694918, + "learning_rate": 1.6427358529749398e-06, + "loss": 0.5787, + "step": 17248 + }, + { + "epoch": 1.2461574584138566, + "grad_norm": 7.144208213548591, + "learning_rate": 1.6424610909193073e-06, + "loss": 0.7288, + "step": 17249 + }, + { + "epoch": 1.2462297036140662, + "grad_norm": 5.698330805464514, + "learning_rate": 1.6421863406027183e-06, + "loss": 0.5909, + "step": 17250 + }, + { + "epoch": 1.2463019488142757, + "grad_norm": 6.942064587872077, + "learning_rate": 1.6419116020289313e-06, + "loss": 0.588, + "step": 17251 + }, + { + "epoch": 1.2463741940144852, + "grad_norm": 7.529225690730297, + "learning_rate": 1.6416368752017081e-06, + "loss": 0.7742, + "step": 17252 + }, + { + "epoch": 1.2464464392146946, + "grad_norm": 6.73049878929019, + "learning_rate": 1.6413621601248097e-06, + "loss": 0.6078, + "step": 17253 + }, + { + "epoch": 1.246518684414904, + "grad_norm": 7.068879073876139, + "learning_rate": 1.6410874568019973e-06, + "loss": 0.6161, + "step": 17254 + }, + { + "epoch": 1.2465909296151136, + "grad_norm": 6.391099236014105, + "learning_rate": 1.6408127652370297e-06, + "loss": 0.6163, + "step": 17255 + }, + { + "epoch": 1.2466631748153232, + "grad_norm": 6.42648231832882, + "learning_rate": 1.6405380854336688e-06, + "loss": 0.6391, + "step": 17256 + }, + { + "epoch": 1.2467354200155327, + "grad_norm": 6.006295080740192, + "learning_rate": 1.6402634173956745e-06, + "loss": 0.6692, + "step": 17257 + }, + { + "epoch": 1.2468076652157423, + "grad_norm": 8.00403826834144, + "learning_rate": 1.6399887611268059e-06, + "loss": 0.6758, + "step": 17258 + }, + { + "epoch": 1.2468799104159518, + "grad_norm": 7.499198107766022, + "learning_rate": 1.6397141166308235e-06, + "loss": 0.6254, + "step": 17259 + }, + { + "epoch": 1.2469521556161611, + "grad_norm": 5.691448475950084, + "learning_rate": 1.6394394839114865e-06, + "loss": 0.693, + "step": 17260 + }, + { + "epoch": 1.2470244008163707, + "grad_norm": 8.583529522576445, + "learning_rate": 1.6391648629725562e-06, + "loss": 0.594, + "step": 17261 + }, + { + "epoch": 1.2470966460165802, + "grad_norm": 6.46934970895362, + "learning_rate": 1.6388902538177896e-06, + "loss": 0.6391, + "step": 17262 + }, + { + "epoch": 1.2471688912167898, + "grad_norm": 7.004650206235765, + "learning_rate": 1.6386156564509458e-06, + "loss": 0.6826, + "step": 17263 + }, + { + "epoch": 1.2472411364169993, + "grad_norm": 6.861250288663258, + "learning_rate": 1.638341070875787e-06, + "loss": 0.6077, + "step": 17264 + }, + { + "epoch": 1.2473133816172088, + "grad_norm": 7.348148408557062, + "learning_rate": 1.6380664970960685e-06, + "loss": 0.6076, + "step": 17265 + }, + { + "epoch": 1.2473856268174184, + "grad_norm": 6.437195705888412, + "learning_rate": 1.6377919351155502e-06, + "loss": 0.5446, + "step": 17266 + }, + { + "epoch": 1.247457872017628, + "grad_norm": 6.4002898985055445, + "learning_rate": 1.637517384937991e-06, + "loss": 0.574, + "step": 17267 + }, + { + "epoch": 1.2475301172178375, + "grad_norm": 6.588492227200459, + "learning_rate": 1.63724284656715e-06, + "loss": 0.594, + "step": 17268 + }, + { + "epoch": 1.2476023624180468, + "grad_norm": 7.225420623467893, + "learning_rate": 1.6369683200067837e-06, + "loss": 0.6946, + "step": 17269 + }, + { + "epoch": 1.2476746076182563, + "grad_norm": 7.648880631075849, + "learning_rate": 1.6366938052606513e-06, + "loss": 0.7039, + "step": 17270 + }, + { + "epoch": 1.2477468528184659, + "grad_norm": 6.438607129937472, + "learning_rate": 1.6364193023325102e-06, + "loss": 0.6914, + "step": 17271 + }, + { + "epoch": 1.2478190980186754, + "grad_norm": 7.191729553348475, + "learning_rate": 1.6361448112261185e-06, + "loss": 0.6581, + "step": 17272 + }, + { + "epoch": 1.247891343218885, + "grad_norm": 7.298038888122536, + "learning_rate": 1.6358703319452334e-06, + "loss": 0.7058, + "step": 17273 + }, + { + "epoch": 1.2479635884190945, + "grad_norm": 6.941724436116192, + "learning_rate": 1.6355958644936126e-06, + "loss": 0.5903, + "step": 17274 + }, + { + "epoch": 1.248035833619304, + "grad_norm": 6.627124949419231, + "learning_rate": 1.635321408875014e-06, + "loss": 0.6639, + "step": 17275 + }, + { + "epoch": 1.2481080788195134, + "grad_norm": 5.811652008875413, + "learning_rate": 1.6350469650931933e-06, + "loss": 0.6141, + "step": 17276 + }, + { + "epoch": 1.248180324019723, + "grad_norm": 7.012981095020243, + "learning_rate": 1.634772533151907e-06, + "loss": 0.6472, + "step": 17277 + }, + { + "epoch": 1.2482525692199324, + "grad_norm": 7.633713043408271, + "learning_rate": 1.634498113054915e-06, + "loss": 0.6584, + "step": 17278 + }, + { + "epoch": 1.248324814420142, + "grad_norm": 5.733193091222398, + "learning_rate": 1.6342237048059707e-06, + "loss": 0.6333, + "step": 17279 + }, + { + "epoch": 1.2483970596203515, + "grad_norm": 7.2588200028980365, + "learning_rate": 1.6339493084088315e-06, + "loss": 0.6049, + "step": 17280 + }, + { + "epoch": 1.248469304820561, + "grad_norm": 6.362978502428189, + "learning_rate": 1.6336749238672539e-06, + "loss": 0.5925, + "step": 17281 + }, + { + "epoch": 1.2485415500207706, + "grad_norm": 7.96039361565201, + "learning_rate": 1.6334005511849942e-06, + "loss": 0.7064, + "step": 17282 + }, + { + "epoch": 1.24861379522098, + "grad_norm": 6.932304292510181, + "learning_rate": 1.6331261903658077e-06, + "loss": 0.6965, + "step": 17283 + }, + { + "epoch": 1.2486860404211895, + "grad_norm": 7.824070777198487, + "learning_rate": 1.6328518414134505e-06, + "loss": 0.693, + "step": 17284 + }, + { + "epoch": 1.248758285621399, + "grad_norm": 6.954732027684769, + "learning_rate": 1.6325775043316786e-06, + "loss": 0.6469, + "step": 17285 + }, + { + "epoch": 1.2488305308216086, + "grad_norm": 6.52538390217777, + "learning_rate": 1.6323031791242475e-06, + "loss": 0.6194, + "step": 17286 + }, + { + "epoch": 1.248902776021818, + "grad_norm": 7.178718841926168, + "learning_rate": 1.6320288657949116e-06, + "loss": 0.629, + "step": 17287 + }, + { + "epoch": 1.2489750212220276, + "grad_norm": 6.631370216883472, + "learning_rate": 1.6317545643474264e-06, + "loss": 0.6667, + "step": 17288 + }, + { + "epoch": 1.2490472664222372, + "grad_norm": 5.98045790638305, + "learning_rate": 1.6314802747855485e-06, + "loss": 0.663, + "step": 17289 + }, + { + "epoch": 1.2491195116224465, + "grad_norm": 6.135175388257269, + "learning_rate": 1.63120599711303e-06, + "loss": 0.6544, + "step": 17290 + }, + { + "epoch": 1.249191756822656, + "grad_norm": 8.994867344758603, + "learning_rate": 1.630931731333627e-06, + "loss": 0.6683, + "step": 17291 + }, + { + "epoch": 1.2492640020228656, + "grad_norm": 6.122764646605699, + "learning_rate": 1.630657477451093e-06, + "loss": 0.7172, + "step": 17292 + }, + { + "epoch": 1.2493362472230751, + "grad_norm": 6.886229897478572, + "learning_rate": 1.630383235469185e-06, + "loss": 0.6175, + "step": 17293 + }, + { + "epoch": 1.2494084924232847, + "grad_norm": 8.373966437943766, + "learning_rate": 1.630109005391654e-06, + "loss": 0.6249, + "step": 17294 + }, + { + "epoch": 1.2494807376234942, + "grad_norm": 7.1744574258593765, + "learning_rate": 1.6298347872222552e-06, + "loss": 0.6615, + "step": 17295 + }, + { + "epoch": 1.2495529828237038, + "grad_norm": 6.800238818294535, + "learning_rate": 1.6295605809647436e-06, + "loss": 0.6707, + "step": 17296 + }, + { + "epoch": 1.249625228023913, + "grad_norm": 6.389786569675536, + "learning_rate": 1.629286386622871e-06, + "loss": 0.6021, + "step": 17297 + }, + { + "epoch": 1.2496974732241226, + "grad_norm": 6.321906003249022, + "learning_rate": 1.6290122042003915e-06, + "loss": 0.5758, + "step": 17298 + }, + { + "epoch": 1.2497697184243322, + "grad_norm": 6.047680231935559, + "learning_rate": 1.6287380337010586e-06, + "loss": 0.6369, + "step": 17299 + }, + { + "epoch": 1.2498419636245417, + "grad_norm": 6.924595558667889, + "learning_rate": 1.6284638751286263e-06, + "loss": 0.6271, + "step": 17300 + }, + { + "epoch": 1.2499142088247512, + "grad_norm": 6.815413105927135, + "learning_rate": 1.6281897284868465e-06, + "loss": 0.6486, + "step": 17301 + }, + { + "epoch": 1.2499864540249608, + "grad_norm": 7.048045262099383, + "learning_rate": 1.627915593779472e-06, + "loss": 0.6777, + "step": 17302 + }, + { + "epoch": 1.2500586992251703, + "grad_norm": 6.774305312235263, + "learning_rate": 1.627641471010257e-06, + "loss": 0.6356, + "step": 17303 + }, + { + "epoch": 1.2501309444253796, + "grad_norm": 9.70943404303439, + "learning_rate": 1.627367360182952e-06, + "loss": 0.7046, + "step": 17304 + }, + { + "epoch": 1.2502031896255892, + "grad_norm": 6.297429895680906, + "learning_rate": 1.6270932613013102e-06, + "loss": 0.6897, + "step": 17305 + }, + { + "epoch": 1.2502754348257987, + "grad_norm": 6.702434370835818, + "learning_rate": 1.626819174369083e-06, + "loss": 0.6225, + "step": 17306 + }, + { + "epoch": 1.2503476800260083, + "grad_norm": 7.162803614174716, + "learning_rate": 1.6265450993900257e-06, + "loss": 0.6711, + "step": 17307 + }, + { + "epoch": 1.2504199252262178, + "grad_norm": 6.528123609251438, + "learning_rate": 1.6262710363678863e-06, + "loss": 0.6702, + "step": 17308 + }, + { + "epoch": 1.2504921704264274, + "grad_norm": 7.310172207704743, + "learning_rate": 1.6259969853064177e-06, + "loss": 0.6106, + "step": 17309 + }, + { + "epoch": 1.250564415626637, + "grad_norm": 7.627265468556344, + "learning_rate": 1.6257229462093723e-06, + "loss": 0.6315, + "step": 17310 + }, + { + "epoch": 1.2506366608268462, + "grad_norm": 6.248045653915167, + "learning_rate": 1.6254489190805012e-06, + "loss": 0.6287, + "step": 17311 + }, + { + "epoch": 1.2507089060270558, + "grad_norm": 7.014098547224425, + "learning_rate": 1.6251749039235548e-06, + "loss": 0.6152, + "step": 17312 + }, + { + "epoch": 1.2507811512272653, + "grad_norm": 7.023730516951505, + "learning_rate": 1.624900900742285e-06, + "loss": 0.6395, + "step": 17313 + }, + { + "epoch": 1.2508533964274748, + "grad_norm": 6.7818777611216845, + "learning_rate": 1.6246269095404426e-06, + "loss": 0.6157, + "step": 17314 + }, + { + "epoch": 1.2509256416276844, + "grad_norm": 8.605366793256659, + "learning_rate": 1.6243529303217774e-06, + "loss": 0.5962, + "step": 17315 + }, + { + "epoch": 1.250997886827894, + "grad_norm": 6.463998422358103, + "learning_rate": 1.6240789630900414e-06, + "loss": 0.5661, + "step": 17316 + }, + { + "epoch": 1.2510701320281035, + "grad_norm": 5.956629725197418, + "learning_rate": 1.6238050078489837e-06, + "loss": 0.5716, + "step": 17317 + }, + { + "epoch": 1.2511423772283128, + "grad_norm": 7.90880293207255, + "learning_rate": 1.6235310646023567e-06, + "loss": 0.7175, + "step": 17318 + }, + { + "epoch": 1.2512146224285223, + "grad_norm": 7.983182157334112, + "learning_rate": 1.6232571333539071e-06, + "loss": 0.5819, + "step": 17319 + }, + { + "epoch": 1.2512868676287319, + "grad_norm": 6.87407775241807, + "learning_rate": 1.6229832141073865e-06, + "loss": 0.6723, + "step": 17320 + }, + { + "epoch": 1.2513591128289414, + "grad_norm": 6.318606303134708, + "learning_rate": 1.6227093068665465e-06, + "loss": 0.6097, + "step": 17321 + }, + { + "epoch": 1.251431358029151, + "grad_norm": 5.937410695257669, + "learning_rate": 1.6224354116351338e-06, + "loss": 0.6492, + "step": 17322 + }, + { + "epoch": 1.2515036032293605, + "grad_norm": 6.0696859343221865, + "learning_rate": 1.6221615284168985e-06, + "loss": 0.6197, + "step": 17323 + }, + { + "epoch": 1.25157584842957, + "grad_norm": 6.632715015571165, + "learning_rate": 1.6218876572155905e-06, + "loss": 0.7417, + "step": 17324 + }, + { + "epoch": 1.2516480936297794, + "grad_norm": 7.353272063944508, + "learning_rate": 1.6216137980349595e-06, + "loss": 0.6958, + "step": 17325 + }, + { + "epoch": 1.2517203388299891, + "grad_norm": 6.271552803951658, + "learning_rate": 1.6213399508787525e-06, + "loss": 0.6648, + "step": 17326 + }, + { + "epoch": 1.2517925840301984, + "grad_norm": 8.356289131797633, + "learning_rate": 1.6210661157507196e-06, + "loss": 0.5988, + "step": 17327 + }, + { + "epoch": 1.251864829230408, + "grad_norm": 6.44311426765539, + "learning_rate": 1.62079229265461e-06, + "loss": 0.7039, + "step": 17328 + }, + { + "epoch": 1.2519370744306175, + "grad_norm": 7.021830170118633, + "learning_rate": 1.6205184815941705e-06, + "loss": 0.6088, + "step": 17329 + }, + { + "epoch": 1.252009319630827, + "grad_norm": 7.426811937364411, + "learning_rate": 1.62024468257315e-06, + "loss": 0.6108, + "step": 17330 + }, + { + "epoch": 1.2520815648310366, + "grad_norm": 5.224008069953702, + "learning_rate": 1.6199708955952964e-06, + "loss": 0.6649, + "step": 17331 + }, + { + "epoch": 1.252153810031246, + "grad_norm": 7.405894596891954, + "learning_rate": 1.6196971206643593e-06, + "loss": 0.5967, + "step": 17332 + }, + { + "epoch": 1.2522260552314557, + "grad_norm": 6.059208556511224, + "learning_rate": 1.6194233577840842e-06, + "loss": 0.6322, + "step": 17333 + }, + { + "epoch": 1.252298300431665, + "grad_norm": 6.708126594090466, + "learning_rate": 1.6191496069582192e-06, + "loss": 0.6784, + "step": 17334 + }, + { + "epoch": 1.2523705456318746, + "grad_norm": 7.516052868375451, + "learning_rate": 1.6188758681905123e-06, + "loss": 0.6965, + "step": 17335 + }, + { + "epoch": 1.252442790832084, + "grad_norm": 6.438645936765233, + "learning_rate": 1.618602141484712e-06, + "loss": 0.6578, + "step": 17336 + }, + { + "epoch": 1.2525150360322936, + "grad_norm": 7.501241199785107, + "learning_rate": 1.6183284268445626e-06, + "loss": 0.6326, + "step": 17337 + }, + { + "epoch": 1.2525872812325032, + "grad_norm": 8.170563630887818, + "learning_rate": 1.6180547242738126e-06, + "loss": 0.6718, + "step": 17338 + }, + { + "epoch": 1.2526595264327125, + "grad_norm": 7.924289553594014, + "learning_rate": 1.6177810337762093e-06, + "loss": 0.62, + "step": 17339 + }, + { + "epoch": 1.2527317716329223, + "grad_norm": 6.86281574127425, + "learning_rate": 1.617507355355498e-06, + "loss": 0.6095, + "step": 17340 + }, + { + "epoch": 1.2528040168331316, + "grad_norm": 6.853280177275144, + "learning_rate": 1.6172336890154258e-06, + "loss": 0.585, + "step": 17341 + }, + { + "epoch": 1.2528762620333411, + "grad_norm": 7.998640898651842, + "learning_rate": 1.6169600347597388e-06, + "loss": 0.6588, + "step": 17342 + }, + { + "epoch": 1.2529485072335507, + "grad_norm": 7.526087087386368, + "learning_rate": 1.6166863925921841e-06, + "loss": 0.6246, + "step": 17343 + }, + { + "epoch": 1.2530207524337602, + "grad_norm": 7.490148813885502, + "learning_rate": 1.6164127625165063e-06, + "loss": 0.6327, + "step": 17344 + }, + { + "epoch": 1.2530929976339698, + "grad_norm": 8.053785242112818, + "learning_rate": 1.6161391445364516e-06, + "loss": 0.6953, + "step": 17345 + }, + { + "epoch": 1.253165242834179, + "grad_norm": 6.782517275199445, + "learning_rate": 1.615865538655767e-06, + "loss": 0.5918, + "step": 17346 + }, + { + "epoch": 1.2532374880343888, + "grad_norm": 7.218367661428492, + "learning_rate": 1.6155919448781953e-06, + "loss": 0.6949, + "step": 17347 + }, + { + "epoch": 1.2533097332345982, + "grad_norm": 6.605547001587798, + "learning_rate": 1.6153183632074829e-06, + "loss": 0.6585, + "step": 17348 + }, + { + "epoch": 1.2533819784348077, + "grad_norm": 7.405164164205148, + "learning_rate": 1.6150447936473756e-06, + "loss": 0.617, + "step": 17349 + }, + { + "epoch": 1.2534542236350172, + "grad_norm": 5.799960484863372, + "learning_rate": 1.614771236201619e-06, + "loss": 0.6358, + "step": 17350 + }, + { + "epoch": 1.2535264688352268, + "grad_norm": 6.52659477534589, + "learning_rate": 1.6144976908739557e-06, + "loss": 0.6828, + "step": 17351 + }, + { + "epoch": 1.2535987140354363, + "grad_norm": 6.278805992671179, + "learning_rate": 1.6142241576681318e-06, + "loss": 0.6426, + "step": 17352 + }, + { + "epoch": 1.2536709592356456, + "grad_norm": 6.900261815950327, + "learning_rate": 1.6139506365878915e-06, + "loss": 0.6344, + "step": 17353 + }, + { + "epoch": 1.2537432044358554, + "grad_norm": 8.028391050902474, + "learning_rate": 1.613677127636979e-06, + "loss": 0.6192, + "step": 17354 + }, + { + "epoch": 1.2538154496360647, + "grad_norm": 6.584160217634772, + "learning_rate": 1.6134036308191382e-06, + "loss": 0.6519, + "step": 17355 + }, + { + "epoch": 1.2538876948362743, + "grad_norm": 6.819979699963201, + "learning_rate": 1.6131301461381133e-06, + "loss": 0.6356, + "step": 17356 + }, + { + "epoch": 1.2539599400364838, + "grad_norm": 5.0773468829930515, + "learning_rate": 1.6128566735976486e-06, + "loss": 0.5931, + "step": 17357 + }, + { + "epoch": 1.2540321852366934, + "grad_norm": 7.200957065127426, + "learning_rate": 1.6125832132014865e-06, + "loss": 0.6501, + "step": 17358 + }, + { + "epoch": 1.254104430436903, + "grad_norm": 6.305992473565137, + "learning_rate": 1.6123097649533714e-06, + "loss": 0.6577, + "step": 17359 + }, + { + "epoch": 1.2541766756371124, + "grad_norm": 6.426364786383962, + "learning_rate": 1.6120363288570462e-06, + "loss": 0.6892, + "step": 17360 + }, + { + "epoch": 1.254248920837322, + "grad_norm": 6.716327461152388, + "learning_rate": 1.6117629049162553e-06, + "loss": 0.7248, + "step": 17361 + }, + { + "epoch": 1.2543211660375313, + "grad_norm": 6.501640699631853, + "learning_rate": 1.611489493134739e-06, + "loss": 0.6059, + "step": 17362 + }, + { + "epoch": 1.2543934112377408, + "grad_norm": 6.564916256721305, + "learning_rate": 1.6112160935162424e-06, + "loss": 0.6495, + "step": 17363 + }, + { + "epoch": 1.2544656564379504, + "grad_norm": 6.407976020339503, + "learning_rate": 1.6109427060645084e-06, + "loss": 0.686, + "step": 17364 + }, + { + "epoch": 1.25453790163816, + "grad_norm": 7.055202258351634, + "learning_rate": 1.6106693307832772e-06, + "loss": 0.6522, + "step": 17365 + }, + { + "epoch": 1.2546101468383695, + "grad_norm": 6.961327401660924, + "learning_rate": 1.6103959676762922e-06, + "loss": 0.6498, + "step": 17366 + }, + { + "epoch": 1.254682392038579, + "grad_norm": 6.930647481420387, + "learning_rate": 1.610122616747296e-06, + "loss": 0.6283, + "step": 17367 + }, + { + "epoch": 1.2547546372387886, + "grad_norm": 6.544568367896413, + "learning_rate": 1.6098492780000308e-06, + "loss": 0.6693, + "step": 17368 + }, + { + "epoch": 1.2548268824389979, + "grad_norm": 5.849745994335326, + "learning_rate": 1.609575951438237e-06, + "loss": 0.5172, + "step": 17369 + }, + { + "epoch": 1.2548991276392074, + "grad_norm": 6.181859692010891, + "learning_rate": 1.6093026370656573e-06, + "loss": 0.5537, + "step": 17370 + }, + { + "epoch": 1.254971372839417, + "grad_norm": 8.138042135388787, + "learning_rate": 1.6090293348860332e-06, + "loss": 0.678, + "step": 17371 + }, + { + "epoch": 1.2550436180396265, + "grad_norm": 7.195320452458961, + "learning_rate": 1.6087560449031054e-06, + "loss": 0.6323, + "step": 17372 + }, + { + "epoch": 1.255115863239836, + "grad_norm": 6.477749521329455, + "learning_rate": 1.6084827671206152e-06, + "loss": 0.6314, + "step": 17373 + }, + { + "epoch": 1.2551881084400456, + "grad_norm": 5.766716913140338, + "learning_rate": 1.608209501542304e-06, + "loss": 0.6362, + "step": 17374 + }, + { + "epoch": 1.2552603536402551, + "grad_norm": 5.8093548540953135, + "learning_rate": 1.607936248171913e-06, + "loss": 0.6989, + "step": 17375 + }, + { + "epoch": 1.2553325988404644, + "grad_norm": 7.10203062220519, + "learning_rate": 1.6076630070131805e-06, + "loss": 0.7208, + "step": 17376 + }, + { + "epoch": 1.255404844040674, + "grad_norm": 7.49709925186954, + "learning_rate": 1.6073897780698491e-06, + "loss": 0.5791, + "step": 17377 + }, + { + "epoch": 1.2554770892408835, + "grad_norm": 6.8967363240759125, + "learning_rate": 1.6071165613456596e-06, + "loss": 0.5967, + "step": 17378 + }, + { + "epoch": 1.255549334441093, + "grad_norm": 6.472657134033585, + "learning_rate": 1.6068433568443503e-06, + "loss": 0.6187, + "step": 17379 + }, + { + "epoch": 1.2556215796413026, + "grad_norm": 6.4604952567432745, + "learning_rate": 1.6065701645696618e-06, + "loss": 0.6066, + "step": 17380 + }, + { + "epoch": 1.2556938248415122, + "grad_norm": 8.465226795382721, + "learning_rate": 1.6062969845253343e-06, + "loss": 0.6212, + "step": 17381 + }, + { + "epoch": 1.2557660700417217, + "grad_norm": 6.786930078566997, + "learning_rate": 1.6060238167151077e-06, + "loss": 0.6078, + "step": 17382 + }, + { + "epoch": 1.255838315241931, + "grad_norm": 7.241307802896453, + "learning_rate": 1.6057506611427198e-06, + "loss": 0.6569, + "step": 17383 + }, + { + "epoch": 1.2559105604421406, + "grad_norm": 7.331695402620505, + "learning_rate": 1.6054775178119114e-06, + "loss": 0.6565, + "step": 17384 + }, + { + "epoch": 1.25598280564235, + "grad_norm": 6.777275643569898, + "learning_rate": 1.6052043867264217e-06, + "loss": 0.7011, + "step": 17385 + }, + { + "epoch": 1.2560550508425596, + "grad_norm": 6.0320655389964095, + "learning_rate": 1.6049312678899888e-06, + "loss": 0.6432, + "step": 17386 + }, + { + "epoch": 1.2561272960427692, + "grad_norm": 8.491512324384614, + "learning_rate": 1.604658161306352e-06, + "loss": 0.7126, + "step": 17387 + }, + { + "epoch": 1.2561995412429787, + "grad_norm": 8.086875794744774, + "learning_rate": 1.6043850669792494e-06, + "loss": 0.6345, + "step": 17388 + }, + { + "epoch": 1.2562717864431883, + "grad_norm": 6.682634526706462, + "learning_rate": 1.6041119849124212e-06, + "loss": 0.6138, + "step": 17389 + }, + { + "epoch": 1.2563440316433976, + "grad_norm": 6.645321753571913, + "learning_rate": 1.6038389151096031e-06, + "loss": 0.718, + "step": 17390 + }, + { + "epoch": 1.2564162768436071, + "grad_norm": 6.3406532884834395, + "learning_rate": 1.6035658575745339e-06, + "loss": 0.6084, + "step": 17391 + }, + { + "epoch": 1.2564885220438167, + "grad_norm": 9.283165936242048, + "learning_rate": 1.6032928123109525e-06, + "loss": 0.6218, + "step": 17392 + }, + { + "epoch": 1.2565607672440262, + "grad_norm": 6.854976277717262, + "learning_rate": 1.6030197793225976e-06, + "loss": 0.6834, + "step": 17393 + }, + { + "epoch": 1.2566330124442358, + "grad_norm": 7.238410104509446, + "learning_rate": 1.6027467586132041e-06, + "loss": 0.6231, + "step": 17394 + }, + { + "epoch": 1.2567052576444453, + "grad_norm": 6.615378825094729, + "learning_rate": 1.6024737501865111e-06, + "loss": 0.6347, + "step": 17395 + }, + { + "epoch": 1.2567775028446548, + "grad_norm": 5.989482244108436, + "learning_rate": 1.602200754046256e-06, + "loss": 0.6433, + "step": 17396 + }, + { + "epoch": 1.2568497480448642, + "grad_norm": 7.933574036010711, + "learning_rate": 1.6019277701961747e-06, + "loss": 0.6562, + "step": 17397 + }, + { + "epoch": 1.2569219932450737, + "grad_norm": 6.023693351407994, + "learning_rate": 1.601654798640005e-06, + "loss": 0.6673, + "step": 17398 + }, + { + "epoch": 1.2569942384452832, + "grad_norm": 9.000002543131151, + "learning_rate": 1.6013818393814839e-06, + "loss": 0.6608, + "step": 17399 + }, + { + "epoch": 1.2570664836454928, + "grad_norm": 6.266645315147441, + "learning_rate": 1.601108892424348e-06, + "loss": 0.5871, + "step": 17400 + }, + { + "epoch": 1.2571387288457023, + "grad_norm": 6.407467014735376, + "learning_rate": 1.6008359577723328e-06, + "loss": 0.6391, + "step": 17401 + }, + { + "epoch": 1.2572109740459119, + "grad_norm": 8.174865292247128, + "learning_rate": 1.6005630354291751e-06, + "loss": 0.5952, + "step": 17402 + }, + { + "epoch": 1.2572832192461214, + "grad_norm": 6.7569276087012, + "learning_rate": 1.6002901253986125e-06, + "loss": 0.6184, + "step": 17403 + }, + { + "epoch": 1.2573554644463307, + "grad_norm": 7.063275319980781, + "learning_rate": 1.6000172276843783e-06, + "loss": 0.7081, + "step": 17404 + }, + { + "epoch": 1.2574277096465405, + "grad_norm": 6.064877702528787, + "learning_rate": 1.5997443422902088e-06, + "loss": 0.684, + "step": 17405 + }, + { + "epoch": 1.2574999548467498, + "grad_norm": 7.900126019209109, + "learning_rate": 1.5994714692198409e-06, + "loss": 0.628, + "step": 17406 + }, + { + "epoch": 1.2575722000469594, + "grad_norm": 5.822803512138014, + "learning_rate": 1.5991986084770105e-06, + "loss": 0.6517, + "step": 17407 + }, + { + "epoch": 1.257644445247169, + "grad_norm": 6.52927671857867, + "learning_rate": 1.5989257600654506e-06, + "loss": 0.5919, + "step": 17408 + }, + { + "epoch": 1.2577166904473784, + "grad_norm": 6.515884943106107, + "learning_rate": 1.5986529239888973e-06, + "loss": 0.6286, + "step": 17409 + }, + { + "epoch": 1.257788935647588, + "grad_norm": 6.7170779964433, + "learning_rate": 1.5983801002510862e-06, + "loss": 0.5895, + "step": 17410 + }, + { + "epoch": 1.2578611808477973, + "grad_norm": 6.4323754514933755, + "learning_rate": 1.5981072888557508e-06, + "loss": 0.6497, + "step": 17411 + }, + { + "epoch": 1.257933426048007, + "grad_norm": 7.433910882291997, + "learning_rate": 1.5978344898066268e-06, + "loss": 0.7114, + "step": 17412 + }, + { + "epoch": 1.2580056712482164, + "grad_norm": 7.122569422492325, + "learning_rate": 1.5975617031074475e-06, + "loss": 0.601, + "step": 17413 + }, + { + "epoch": 1.258077916448426, + "grad_norm": 5.942924511181802, + "learning_rate": 1.5972889287619487e-06, + "loss": 0.5537, + "step": 17414 + }, + { + "epoch": 1.2581501616486355, + "grad_norm": 5.854992827452608, + "learning_rate": 1.5970161667738632e-06, + "loss": 0.6081, + "step": 17415 + }, + { + "epoch": 1.258222406848845, + "grad_norm": 7.449550579263679, + "learning_rate": 1.5967434171469248e-06, + "loss": 0.6371, + "step": 17416 + }, + { + "epoch": 1.2582946520490546, + "grad_norm": 6.988269786708038, + "learning_rate": 1.5964706798848681e-06, + "loss": 0.6328, + "step": 17417 + }, + { + "epoch": 1.2583668972492639, + "grad_norm": 7.631009844626024, + "learning_rate": 1.596197954991427e-06, + "loss": 0.6859, + "step": 17418 + }, + { + "epoch": 1.2584391424494736, + "grad_norm": 5.57034913809392, + "learning_rate": 1.5959252424703327e-06, + "loss": 0.5958, + "step": 17419 + }, + { + "epoch": 1.258511387649683, + "grad_norm": 7.195530659270785, + "learning_rate": 1.5956525423253206e-06, + "loss": 0.6678, + "step": 17420 + }, + { + "epoch": 1.2585836328498925, + "grad_norm": 6.604512619760831, + "learning_rate": 1.5953798545601238e-06, + "loss": 0.6453, + "step": 17421 + }, + { + "epoch": 1.258655878050102, + "grad_norm": 5.934333679794601, + "learning_rate": 1.5951071791784733e-06, + "loss": 0.6162, + "step": 17422 + }, + { + "epoch": 1.2587281232503116, + "grad_norm": 6.9749129204458695, + "learning_rate": 1.594834516184103e-06, + "loss": 0.6282, + "step": 17423 + }, + { + "epoch": 1.2588003684505211, + "grad_norm": 7.1518538519244865, + "learning_rate": 1.5945618655807455e-06, + "loss": 0.5808, + "step": 17424 + }, + { + "epoch": 1.2588726136507304, + "grad_norm": 7.0878039743945624, + "learning_rate": 1.5942892273721333e-06, + "loss": 0.6404, + "step": 17425 + }, + { + "epoch": 1.2589448588509402, + "grad_norm": 6.806371049181117, + "learning_rate": 1.594016601561998e-06, + "loss": 0.6236, + "step": 17426 + }, + { + "epoch": 1.2590171040511495, + "grad_norm": 7.066520424295286, + "learning_rate": 1.5937439881540717e-06, + "loss": 0.6359, + "step": 17427 + }, + { + "epoch": 1.259089349251359, + "grad_norm": 7.10395005717206, + "learning_rate": 1.5934713871520875e-06, + "loss": 0.6891, + "step": 17428 + }, + { + "epoch": 1.2591615944515686, + "grad_norm": 5.965917744155488, + "learning_rate": 1.5931987985597752e-06, + "loss": 0.5961, + "step": 17429 + }, + { + "epoch": 1.2592338396517782, + "grad_norm": 8.062991105137314, + "learning_rate": 1.5929262223808676e-06, + "loss": 0.6436, + "step": 17430 + }, + { + "epoch": 1.2593060848519877, + "grad_norm": 8.229706282166008, + "learning_rate": 1.5926536586190956e-06, + "loss": 0.5821, + "step": 17431 + }, + { + "epoch": 1.2593783300521972, + "grad_norm": 6.148361496413185, + "learning_rate": 1.5923811072781914e-06, + "loss": 0.577, + "step": 17432 + }, + { + "epoch": 1.2594505752524068, + "grad_norm": 6.924814258935178, + "learning_rate": 1.5921085683618837e-06, + "loss": 0.7321, + "step": 17433 + }, + { + "epoch": 1.259522820452616, + "grad_norm": 7.176023924601637, + "learning_rate": 1.5918360418739054e-06, + "loss": 0.669, + "step": 17434 + }, + { + "epoch": 1.2595950656528256, + "grad_norm": 6.073471680847507, + "learning_rate": 1.5915635278179872e-06, + "loss": 0.6193, + "step": 17435 + }, + { + "epoch": 1.2596673108530352, + "grad_norm": 7.938297982541291, + "learning_rate": 1.5912910261978582e-06, + "loss": 0.6316, + "step": 17436 + }, + { + "epoch": 1.2597395560532447, + "grad_norm": 7.144438612114698, + "learning_rate": 1.5910185370172493e-06, + "loss": 0.6522, + "step": 17437 + }, + { + "epoch": 1.2598118012534543, + "grad_norm": 6.70422781028707, + "learning_rate": 1.5907460602798913e-06, + "loss": 0.6362, + "step": 17438 + }, + { + "epoch": 1.2598840464536638, + "grad_norm": 6.64722844929591, + "learning_rate": 1.590473595989514e-06, + "loss": 0.6671, + "step": 17439 + }, + { + "epoch": 1.2599562916538734, + "grad_norm": 8.014640762550394, + "learning_rate": 1.590201144149846e-06, + "loss": 0.7188, + "step": 17440 + }, + { + "epoch": 1.2600285368540827, + "grad_norm": 6.299488519159897, + "learning_rate": 1.5899287047646183e-06, + "loss": 0.6867, + "step": 17441 + }, + { + "epoch": 1.2601007820542922, + "grad_norm": 5.303706711901051, + "learning_rate": 1.5896562778375603e-06, + "loss": 0.5977, + "step": 17442 + }, + { + "epoch": 1.2601730272545018, + "grad_norm": 6.371883153980345, + "learning_rate": 1.5893838633724012e-06, + "loss": 0.685, + "step": 17443 + }, + { + "epoch": 1.2602452724547113, + "grad_norm": 9.04629267002662, + "learning_rate": 1.5891114613728695e-06, + "loss": 0.6747, + "step": 17444 + }, + { + "epoch": 1.2603175176549208, + "grad_norm": 7.829175562614136, + "learning_rate": 1.5888390718426945e-06, + "loss": 0.6992, + "step": 17445 + }, + { + "epoch": 1.2603897628551304, + "grad_norm": 6.5655254837621095, + "learning_rate": 1.5885666947856066e-06, + "loss": 0.675, + "step": 17446 + }, + { + "epoch": 1.26046200805534, + "grad_norm": 6.207177210268588, + "learning_rate": 1.588294330205331e-06, + "loss": 0.5381, + "step": 17447 + }, + { + "epoch": 1.2605342532555492, + "grad_norm": 8.278165422731856, + "learning_rate": 1.588021978105599e-06, + "loss": 0.6735, + "step": 17448 + }, + { + "epoch": 1.2606064984557588, + "grad_norm": 7.099819729087977, + "learning_rate": 1.5877496384901374e-06, + "loss": 0.6856, + "step": 17449 + }, + { + "epoch": 1.2606787436559683, + "grad_norm": 7.042395950865001, + "learning_rate": 1.5874773113626764e-06, + "loss": 0.7032, + "step": 17450 + }, + { + "epoch": 1.2607509888561779, + "grad_norm": 6.332830040994705, + "learning_rate": 1.5872049967269414e-06, + "loss": 0.7138, + "step": 17451 + }, + { + "epoch": 1.2608232340563874, + "grad_norm": 5.7955376658199285, + "learning_rate": 1.5869326945866614e-06, + "loss": 0.6685, + "step": 17452 + }, + { + "epoch": 1.260895479256597, + "grad_norm": 6.822130308384623, + "learning_rate": 1.5866604049455642e-06, + "loss": 0.6074, + "step": 17453 + }, + { + "epoch": 1.2609677244568065, + "grad_norm": 6.256569423390667, + "learning_rate": 1.5863881278073765e-06, + "loss": 0.6147, + "step": 17454 + }, + { + "epoch": 1.2610399696570158, + "grad_norm": 6.082821794018233, + "learning_rate": 1.5861158631758258e-06, + "loss": 0.6011, + "step": 17455 + }, + { + "epoch": 1.2611122148572254, + "grad_norm": 6.133116035482086, + "learning_rate": 1.5858436110546394e-06, + "loss": 0.6553, + "step": 17456 + }, + { + "epoch": 1.261184460057435, + "grad_norm": 6.885238791364562, + "learning_rate": 1.5855713714475446e-06, + "loss": 0.7171, + "step": 17457 + }, + { + "epoch": 1.2612567052576444, + "grad_norm": 7.23704449895356, + "learning_rate": 1.5852991443582674e-06, + "loss": 0.6869, + "step": 17458 + }, + { + "epoch": 1.261328950457854, + "grad_norm": 7.290256153101346, + "learning_rate": 1.5850269297905346e-06, + "loss": 0.6464, + "step": 17459 + }, + { + "epoch": 1.2614011956580635, + "grad_norm": 6.6882455000285645, + "learning_rate": 1.5847547277480737e-06, + "loss": 0.6234, + "step": 17460 + }, + { + "epoch": 1.261473440858273, + "grad_norm": 5.469212278173666, + "learning_rate": 1.5844825382346084e-06, + "loss": 0.6041, + "step": 17461 + }, + { + "epoch": 1.2615456860584824, + "grad_norm": 6.522944521979043, + "learning_rate": 1.5842103612538668e-06, + "loss": 0.5849, + "step": 17462 + }, + { + "epoch": 1.261617931258692, + "grad_norm": 7.549026214250899, + "learning_rate": 1.5839381968095747e-06, + "loss": 0.6349, + "step": 17463 + }, + { + "epoch": 1.2616901764589015, + "grad_norm": 5.826333923848469, + "learning_rate": 1.5836660449054577e-06, + "loss": 0.6469, + "step": 17464 + }, + { + "epoch": 1.261762421659111, + "grad_norm": 6.379046782553323, + "learning_rate": 1.5833939055452404e-06, + "loss": 0.6917, + "step": 17465 + }, + { + "epoch": 1.2618346668593206, + "grad_norm": 6.571029496958779, + "learning_rate": 1.583121778732649e-06, + "loss": 0.5924, + "step": 17466 + }, + { + "epoch": 1.26190691205953, + "grad_norm": 6.959144164335152, + "learning_rate": 1.582849664471408e-06, + "loss": 0.6543, + "step": 17467 + }, + { + "epoch": 1.2619791572597396, + "grad_norm": 7.382464236164114, + "learning_rate": 1.5825775627652439e-06, + "loss": 0.6012, + "step": 17468 + }, + { + "epoch": 1.262051402459949, + "grad_norm": 6.242289860999747, + "learning_rate": 1.5823054736178804e-06, + "loss": 0.656, + "step": 17469 + }, + { + "epoch": 1.2621236476601585, + "grad_norm": 6.82101999569993, + "learning_rate": 1.582033397033042e-06, + "loss": 0.611, + "step": 17470 + }, + { + "epoch": 1.262195892860368, + "grad_norm": 7.764719121585106, + "learning_rate": 1.5817613330144543e-06, + "loss": 0.6392, + "step": 17471 + }, + { + "epoch": 1.2622681380605776, + "grad_norm": 5.971754985343963, + "learning_rate": 1.5814892815658402e-06, + "loss": 0.579, + "step": 17472 + }, + { + "epoch": 1.2623403832607871, + "grad_norm": 6.574502196522569, + "learning_rate": 1.581217242690925e-06, + "loss": 0.6518, + "step": 17473 + }, + { + "epoch": 1.2624126284609967, + "grad_norm": 7.597816163564519, + "learning_rate": 1.580945216393432e-06, + "loss": 0.6328, + "step": 17474 + }, + { + "epoch": 1.2624848736612062, + "grad_norm": 5.6424753635935145, + "learning_rate": 1.5806732026770863e-06, + "loss": 0.589, + "step": 17475 + }, + { + "epoch": 1.2625571188614155, + "grad_norm": 5.5500412191973885, + "learning_rate": 1.5804012015456099e-06, + "loss": 0.5906, + "step": 17476 + }, + { + "epoch": 1.2626293640616253, + "grad_norm": 7.311630066860546, + "learning_rate": 1.5801292130027271e-06, + "loss": 0.6527, + "step": 17477 + }, + { + "epoch": 1.2627016092618346, + "grad_norm": 7.169292442066584, + "learning_rate": 1.5798572370521623e-06, + "loss": 0.6264, + "step": 17478 + }, + { + "epoch": 1.2627738544620442, + "grad_norm": 6.226158751221346, + "learning_rate": 1.5795852736976364e-06, + "loss": 0.6168, + "step": 17479 + }, + { + "epoch": 1.2628460996622537, + "grad_norm": 6.624524081428032, + "learning_rate": 1.5793133229428731e-06, + "loss": 0.6137, + "step": 17480 + }, + { + "epoch": 1.2629183448624632, + "grad_norm": 6.218275502567418, + "learning_rate": 1.579041384791596e-06, + "loss": 0.6904, + "step": 17481 + }, + { + "epoch": 1.2629905900626728, + "grad_norm": 6.247250371726048, + "learning_rate": 1.5787694592475275e-06, + "loss": 0.6554, + "step": 17482 + }, + { + "epoch": 1.263062835262882, + "grad_norm": 7.887455853370413, + "learning_rate": 1.5784975463143892e-06, + "loss": 0.6567, + "step": 17483 + }, + { + "epoch": 1.2631350804630919, + "grad_norm": 5.898363125565868, + "learning_rate": 1.5782256459959044e-06, + "loss": 0.5455, + "step": 17484 + }, + { + "epoch": 1.2632073256633012, + "grad_norm": 7.230878673043639, + "learning_rate": 1.5779537582957949e-06, + "loss": 0.6671, + "step": 17485 + }, + { + "epoch": 1.2632795708635107, + "grad_norm": 6.78742467792792, + "learning_rate": 1.5776818832177823e-06, + "loss": 0.7247, + "step": 17486 + }, + { + "epoch": 1.2633518160637203, + "grad_norm": 5.9701664227623095, + "learning_rate": 1.5774100207655885e-06, + "loss": 0.6645, + "step": 17487 + }, + { + "epoch": 1.2634240612639298, + "grad_norm": 6.562212256526531, + "learning_rate": 1.5771381709429352e-06, + "loss": 0.5865, + "step": 17488 + }, + { + "epoch": 1.2634963064641394, + "grad_norm": 6.850562202048154, + "learning_rate": 1.5768663337535439e-06, + "loss": 0.6056, + "step": 17489 + }, + { + "epoch": 1.2635685516643487, + "grad_norm": 7.206318509387455, + "learning_rate": 1.576594509201136e-06, + "loss": 0.6945, + "step": 17490 + }, + { + "epoch": 1.2636407968645584, + "grad_norm": 6.340510701690516, + "learning_rate": 1.5763226972894317e-06, + "loss": 0.6379, + "step": 17491 + }, + { + "epoch": 1.2637130420647678, + "grad_norm": 6.339161683985737, + "learning_rate": 1.5760508980221525e-06, + "loss": 0.6838, + "step": 17492 + }, + { + "epoch": 1.2637852872649773, + "grad_norm": 7.338917542346376, + "learning_rate": 1.5757791114030201e-06, + "loss": 0.6268, + "step": 17493 + }, + { + "epoch": 1.2638575324651868, + "grad_norm": 5.905713576727354, + "learning_rate": 1.5755073374357532e-06, + "loss": 0.6804, + "step": 17494 + }, + { + "epoch": 1.2639297776653964, + "grad_norm": 6.120068803288278, + "learning_rate": 1.575235576124073e-06, + "loss": 0.6649, + "step": 17495 + }, + { + "epoch": 1.264002022865606, + "grad_norm": 6.565369187700017, + "learning_rate": 1.5749638274717005e-06, + "loss": 0.587, + "step": 17496 + }, + { + "epoch": 1.2640742680658152, + "grad_norm": 7.0247930421583415, + "learning_rate": 1.574692091482354e-06, + "loss": 0.6533, + "step": 17497 + }, + { + "epoch": 1.264146513266025, + "grad_norm": 7.205255222393181, + "learning_rate": 1.5744203681597548e-06, + "loss": 0.6833, + "step": 17498 + }, + { + "epoch": 1.2642187584662343, + "grad_norm": 7.381340017262411, + "learning_rate": 1.5741486575076214e-06, + "loss": 0.6286, + "step": 17499 + }, + { + "epoch": 1.2642910036664439, + "grad_norm": 6.978700111698699, + "learning_rate": 1.5738769595296752e-06, + "loss": 0.6421, + "step": 17500 + }, + { + "epoch": 1.2643632488666534, + "grad_norm": 7.364965303938821, + "learning_rate": 1.5736052742296337e-06, + "loss": 0.6644, + "step": 17501 + }, + { + "epoch": 1.264435494066863, + "grad_norm": 5.889216004089361, + "learning_rate": 1.5733336016112163e-06, + "loss": 0.6209, + "step": 17502 + }, + { + "epoch": 1.2645077392670725, + "grad_norm": 6.972902984955237, + "learning_rate": 1.5730619416781436e-06, + "loss": 0.6664, + "step": 17503 + }, + { + "epoch": 1.264579984467282, + "grad_norm": 6.804604250620925, + "learning_rate": 1.5727902944341316e-06, + "loss": 0.6297, + "step": 17504 + }, + { + "epoch": 1.2646522296674916, + "grad_norm": 7.044101211470588, + "learning_rate": 1.5725186598829014e-06, + "loss": 0.6383, + "step": 17505 + }, + { + "epoch": 1.264724474867701, + "grad_norm": 6.786181085348705, + "learning_rate": 1.5722470380281703e-06, + "loss": 0.6533, + "step": 17506 + }, + { + "epoch": 1.2647967200679104, + "grad_norm": 5.572472366257645, + "learning_rate": 1.571975428873658e-06, + "loss": 0.6646, + "step": 17507 + }, + { + "epoch": 1.26486896526812, + "grad_norm": 5.768600233785191, + "learning_rate": 1.5717038324230805e-06, + "loss": 0.6666, + "step": 17508 + }, + { + "epoch": 1.2649412104683295, + "grad_norm": 7.495541073620244, + "learning_rate": 1.5714322486801566e-06, + "loss": 0.6474, + "step": 17509 + }, + { + "epoch": 1.265013455668539, + "grad_norm": 5.339948941961786, + "learning_rate": 1.571160677648605e-06, + "loss": 0.5658, + "step": 17510 + }, + { + "epoch": 1.2650857008687486, + "grad_norm": 7.982056043962912, + "learning_rate": 1.5708891193321418e-06, + "loss": 0.6346, + "step": 17511 + }, + { + "epoch": 1.2651579460689582, + "grad_norm": 7.592358622611665, + "learning_rate": 1.5706175737344853e-06, + "loss": 0.6525, + "step": 17512 + }, + { + "epoch": 1.2652301912691675, + "grad_norm": 7.756117344535669, + "learning_rate": 1.5703460408593526e-06, + "loss": 0.677, + "step": 17513 + }, + { + "epoch": 1.265302436469377, + "grad_norm": 7.245750234023902, + "learning_rate": 1.570074520710461e-06, + "loss": 0.6065, + "step": 17514 + }, + { + "epoch": 1.2653746816695866, + "grad_norm": 6.276470738434134, + "learning_rate": 1.5698030132915272e-06, + "loss": 0.6407, + "step": 17515 + }, + { + "epoch": 1.265446926869796, + "grad_norm": 5.5522158417864285, + "learning_rate": 1.5695315186062676e-06, + "loss": 0.6151, + "step": 17516 + }, + { + "epoch": 1.2655191720700056, + "grad_norm": 6.300124927069279, + "learning_rate": 1.5692600366583987e-06, + "loss": 0.5735, + "step": 17517 + }, + { + "epoch": 1.2655914172702152, + "grad_norm": 5.796682513966468, + "learning_rate": 1.568988567451638e-06, + "loss": 0.5676, + "step": 17518 + }, + { + "epoch": 1.2656636624704247, + "grad_norm": 6.835790456007795, + "learning_rate": 1.568717110989701e-06, + "loss": 0.6758, + "step": 17519 + }, + { + "epoch": 1.265735907670634, + "grad_norm": 6.924567187724059, + "learning_rate": 1.5684456672763026e-06, + "loss": 0.658, + "step": 17520 + }, + { + "epoch": 1.2658081528708436, + "grad_norm": 9.288749567636833, + "learning_rate": 1.5681742363151615e-06, + "loss": 0.6378, + "step": 17521 + }, + { + "epoch": 1.2658803980710531, + "grad_norm": 6.518243277322754, + "learning_rate": 1.5679028181099903e-06, + "loss": 0.6518, + "step": 17522 + }, + { + "epoch": 1.2659526432712627, + "grad_norm": 6.47193071239148, + "learning_rate": 1.5676314126645059e-06, + "loss": 0.6178, + "step": 17523 + }, + { + "epoch": 1.2660248884714722, + "grad_norm": 6.807321522614886, + "learning_rate": 1.5673600199824228e-06, + "loss": 0.5806, + "step": 17524 + }, + { + "epoch": 1.2660971336716818, + "grad_norm": 6.489311526889241, + "learning_rate": 1.5670886400674586e-06, + "loss": 0.6184, + "step": 17525 + }, + { + "epoch": 1.2661693788718913, + "grad_norm": 6.130201349980977, + "learning_rate": 1.5668172729233256e-06, + "loss": 0.5787, + "step": 17526 + }, + { + "epoch": 1.2662416240721006, + "grad_norm": 6.277884877209373, + "learning_rate": 1.5665459185537394e-06, + "loss": 0.6113, + "step": 17527 + }, + { + "epoch": 1.2663138692723102, + "grad_norm": 6.6475773578803805, + "learning_rate": 1.5662745769624154e-06, + "loss": 0.6516, + "step": 17528 + }, + { + "epoch": 1.2663861144725197, + "grad_norm": 9.049904535116742, + "learning_rate": 1.5660032481530667e-06, + "loss": 0.7581, + "step": 17529 + }, + { + "epoch": 1.2664583596727292, + "grad_norm": 6.410233017707593, + "learning_rate": 1.5657319321294087e-06, + "loss": 0.5838, + "step": 17530 + }, + { + "epoch": 1.2665306048729388, + "grad_norm": 6.274400914991089, + "learning_rate": 1.5654606288951548e-06, + "loss": 0.7056, + "step": 17531 + }, + { + "epoch": 1.2666028500731483, + "grad_norm": 6.128890067298585, + "learning_rate": 1.5651893384540202e-06, + "loss": 0.6705, + "step": 17532 + }, + { + "epoch": 1.2666750952733579, + "grad_norm": 7.557263287298828, + "learning_rate": 1.564918060809717e-06, + "loss": 0.5873, + "step": 17533 + }, + { + "epoch": 1.2667473404735672, + "grad_norm": 6.997334926630242, + "learning_rate": 1.5646467959659592e-06, + "loss": 0.637, + "step": 17534 + }, + { + "epoch": 1.2668195856737767, + "grad_norm": 7.536906417324409, + "learning_rate": 1.5643755439264619e-06, + "loss": 0.6627, + "step": 17535 + }, + { + "epoch": 1.2668918308739863, + "grad_norm": 6.9276027704584555, + "learning_rate": 1.5641043046949356e-06, + "loss": 0.6346, + "step": 17536 + }, + { + "epoch": 1.2669640760741958, + "grad_norm": 6.007477233818181, + "learning_rate": 1.5638330782750948e-06, + "loss": 0.6305, + "step": 17537 + }, + { + "epoch": 1.2670363212744054, + "grad_norm": 6.990487856451179, + "learning_rate": 1.5635618646706522e-06, + "loss": 0.6242, + "step": 17538 + }, + { + "epoch": 1.267108566474615, + "grad_norm": 5.882406881870461, + "learning_rate": 1.5632906638853218e-06, + "loss": 0.6191, + "step": 17539 + }, + { + "epoch": 1.2671808116748244, + "grad_norm": 7.154361658951155, + "learning_rate": 1.5630194759228137e-06, + "loss": 0.6133, + "step": 17540 + }, + { + "epoch": 1.2672530568750338, + "grad_norm": 7.233561591606208, + "learning_rate": 1.5627483007868417e-06, + "loss": 0.6592, + "step": 17541 + }, + { + "epoch": 1.2673253020752433, + "grad_norm": 6.212624451524782, + "learning_rate": 1.562477138481118e-06, + "loss": 0.5909, + "step": 17542 + }, + { + "epoch": 1.2673975472754528, + "grad_norm": 7.338451015832127, + "learning_rate": 1.5622059890093539e-06, + "loss": 0.6479, + "step": 17543 + }, + { + "epoch": 1.2674697924756624, + "grad_norm": 7.320831247878352, + "learning_rate": 1.5619348523752619e-06, + "loss": 0.7124, + "step": 17544 + }, + { + "epoch": 1.267542037675872, + "grad_norm": 7.5023858408137984, + "learning_rate": 1.5616637285825532e-06, + "loss": 0.6828, + "step": 17545 + }, + { + "epoch": 1.2676142828760815, + "grad_norm": 6.937301770376685, + "learning_rate": 1.5613926176349401e-06, + "loss": 0.6974, + "step": 17546 + }, + { + "epoch": 1.267686528076291, + "grad_norm": 6.051813834181761, + "learning_rate": 1.5611215195361327e-06, + "loss": 0.5721, + "step": 17547 + }, + { + "epoch": 1.2677587732765003, + "grad_norm": 6.359188442633681, + "learning_rate": 1.5608504342898425e-06, + "loss": 0.6484, + "step": 17548 + }, + { + "epoch": 1.26783101847671, + "grad_norm": 6.641611542068116, + "learning_rate": 1.560579361899781e-06, + "loss": 0.747, + "step": 17549 + }, + { + "epoch": 1.2679032636769194, + "grad_norm": 8.996796779683416, + "learning_rate": 1.5603083023696597e-06, + "loss": 0.6908, + "step": 17550 + }, + { + "epoch": 1.267975508877129, + "grad_norm": 6.4218114399607025, + "learning_rate": 1.5600372557031867e-06, + "loss": 0.5582, + "step": 17551 + }, + { + "epoch": 1.2680477540773385, + "grad_norm": 5.553935283048939, + "learning_rate": 1.5597662219040735e-06, + "loss": 0.639, + "step": 17552 + }, + { + "epoch": 1.268119999277548, + "grad_norm": 6.883222067659509, + "learning_rate": 1.5594952009760323e-06, + "loss": 0.7237, + "step": 17553 + }, + { + "epoch": 1.2681922444777576, + "grad_norm": 6.701728871033193, + "learning_rate": 1.5592241929227703e-06, + "loss": 0.6941, + "step": 17554 + }, + { + "epoch": 1.268264489677967, + "grad_norm": 6.669284719750365, + "learning_rate": 1.5589531977479989e-06, + "loss": 0.6126, + "step": 17555 + }, + { + "epoch": 1.2683367348781767, + "grad_norm": 6.463405004179057, + "learning_rate": 1.5586822154554276e-06, + "loss": 0.5632, + "step": 17556 + }, + { + "epoch": 1.268408980078386, + "grad_norm": 8.70537588711352, + "learning_rate": 1.5584112460487661e-06, + "loss": 0.6309, + "step": 17557 + }, + { + "epoch": 1.2684812252785955, + "grad_norm": 6.971656090733459, + "learning_rate": 1.5581402895317233e-06, + "loss": 0.5861, + "step": 17558 + }, + { + "epoch": 1.268553470478805, + "grad_norm": 5.977448361635224, + "learning_rate": 1.5578693459080085e-06, + "loss": 0.581, + "step": 17559 + }, + { + "epoch": 1.2686257156790146, + "grad_norm": 6.410815589186347, + "learning_rate": 1.5575984151813311e-06, + "loss": 0.6032, + "step": 17560 + }, + { + "epoch": 1.2686979608792242, + "grad_norm": 7.040481678350111, + "learning_rate": 1.5573274973553996e-06, + "loss": 0.6208, + "step": 17561 + }, + { + "epoch": 1.2687702060794335, + "grad_norm": 7.358320610176512, + "learning_rate": 1.5570565924339227e-06, + "loss": 0.6842, + "step": 17562 + }, + { + "epoch": 1.2688424512796432, + "grad_norm": 6.6266258691853315, + "learning_rate": 1.5567857004206085e-06, + "loss": 0.5833, + "step": 17563 + }, + { + "epoch": 1.2689146964798526, + "grad_norm": 6.450228948743675, + "learning_rate": 1.556514821319167e-06, + "loss": 0.6771, + "step": 17564 + }, + { + "epoch": 1.268986941680062, + "grad_norm": 8.698852147641288, + "learning_rate": 1.5562439551333038e-06, + "loss": 0.7078, + "step": 17565 + }, + { + "epoch": 1.2690591868802716, + "grad_norm": 5.713322633238439, + "learning_rate": 1.555973101866728e-06, + "loss": 0.6453, + "step": 17566 + }, + { + "epoch": 1.2691314320804812, + "grad_norm": 7.352665071576778, + "learning_rate": 1.5557022615231488e-06, + "loss": 0.694, + "step": 17567 + }, + { + "epoch": 1.2692036772806907, + "grad_norm": 7.167082072202571, + "learning_rate": 1.555431434106271e-06, + "loss": 0.6669, + "step": 17568 + }, + { + "epoch": 1.2692759224809, + "grad_norm": 5.721240225832131, + "learning_rate": 1.5551606196198038e-06, + "loss": 0.6022, + "step": 17569 + }, + { + "epoch": 1.2693481676811098, + "grad_norm": 8.180350640203615, + "learning_rate": 1.5548898180674544e-06, + "loss": 0.6392, + "step": 17570 + }, + { + "epoch": 1.2694204128813191, + "grad_norm": 10.260617223544862, + "learning_rate": 1.5546190294529295e-06, + "loss": 0.7057, + "step": 17571 + }, + { + "epoch": 1.2694926580815287, + "grad_norm": 6.557622641123012, + "learning_rate": 1.554348253779936e-06, + "loss": 0.6023, + "step": 17572 + }, + { + "epoch": 1.2695649032817382, + "grad_norm": 6.3978585713749885, + "learning_rate": 1.55407749105218e-06, + "loss": 0.6094, + "step": 17573 + }, + { + "epoch": 1.2696371484819478, + "grad_norm": 6.659534199323037, + "learning_rate": 1.5538067412733693e-06, + "loss": 0.6302, + "step": 17574 + }, + { + "epoch": 1.2697093936821573, + "grad_norm": 6.4207946908205935, + "learning_rate": 1.55353600444721e-06, + "loss": 0.5877, + "step": 17575 + }, + { + "epoch": 1.2697816388823666, + "grad_norm": 6.758926705473358, + "learning_rate": 1.5532652805774074e-06, + "loss": 0.7489, + "step": 17576 + }, + { + "epoch": 1.2698538840825764, + "grad_norm": 9.523313726508022, + "learning_rate": 1.5529945696676679e-06, + "loss": 0.6055, + "step": 17577 + }, + { + "epoch": 1.2699261292827857, + "grad_norm": 6.326708524436212, + "learning_rate": 1.5527238717216986e-06, + "loss": 0.6462, + "step": 17578 + }, + { + "epoch": 1.2699983744829952, + "grad_norm": 5.812080942450113, + "learning_rate": 1.552453186743203e-06, + "loss": 0.7507, + "step": 17579 + }, + { + "epoch": 1.2700706196832048, + "grad_norm": 5.653769439184796, + "learning_rate": 1.5521825147358874e-06, + "loss": 0.594, + "step": 17580 + }, + { + "epoch": 1.2701428648834143, + "grad_norm": 6.0075058083698645, + "learning_rate": 1.5519118557034569e-06, + "loss": 0.6453, + "step": 17581 + }, + { + "epoch": 1.2702151100836239, + "grad_norm": 7.0828103564309846, + "learning_rate": 1.5516412096496183e-06, + "loss": 0.6719, + "step": 17582 + }, + { + "epoch": 1.2702873552838334, + "grad_norm": 6.261910647987474, + "learning_rate": 1.5513705765780742e-06, + "loss": 0.6567, + "step": 17583 + }, + { + "epoch": 1.270359600484043, + "grad_norm": 7.4605867473180565, + "learning_rate": 1.5510999564925305e-06, + "loss": 0.6226, + "step": 17584 + }, + { + "epoch": 1.2704318456842523, + "grad_norm": 6.377500455071984, + "learning_rate": 1.5508293493966919e-06, + "loss": 0.5883, + "step": 17585 + }, + { + "epoch": 1.2705040908844618, + "grad_norm": 6.5406767790704565, + "learning_rate": 1.5505587552942622e-06, + "loss": 0.6019, + "step": 17586 + }, + { + "epoch": 1.2705763360846714, + "grad_norm": 7.952211698542582, + "learning_rate": 1.5502881741889458e-06, + "loss": 0.6016, + "step": 17587 + }, + { + "epoch": 1.270648581284881, + "grad_norm": 7.783349424205324, + "learning_rate": 1.5500176060844472e-06, + "loss": 0.6354, + "step": 17588 + }, + { + "epoch": 1.2707208264850904, + "grad_norm": 5.68385684528519, + "learning_rate": 1.5497470509844704e-06, + "loss": 0.6219, + "step": 17589 + }, + { + "epoch": 1.2707930716853, + "grad_norm": 6.828224443283302, + "learning_rate": 1.5494765088927177e-06, + "loss": 0.6757, + "step": 17590 + }, + { + "epoch": 1.2708653168855095, + "grad_norm": 9.008948222255974, + "learning_rate": 1.5492059798128938e-06, + "loss": 0.6863, + "step": 17591 + }, + { + "epoch": 1.2709375620857188, + "grad_norm": 6.967565872105474, + "learning_rate": 1.5489354637487031e-06, + "loss": 0.647, + "step": 17592 + }, + { + "epoch": 1.2710098072859284, + "grad_norm": 6.45843952050641, + "learning_rate": 1.5486649607038459e-06, + "loss": 0.6396, + "step": 17593 + }, + { + "epoch": 1.271082052486138, + "grad_norm": 6.848899262805706, + "learning_rate": 1.548394470682027e-06, + "loss": 0.7048, + "step": 17594 + }, + { + "epoch": 1.2711542976863475, + "grad_norm": 6.952157696935996, + "learning_rate": 1.5481239936869485e-06, + "loss": 0.6643, + "step": 17595 + }, + { + "epoch": 1.271226542886557, + "grad_norm": 7.6278723637784935, + "learning_rate": 1.5478535297223149e-06, + "loss": 0.6831, + "step": 17596 + }, + { + "epoch": 1.2712987880867666, + "grad_norm": 6.488830653383629, + "learning_rate": 1.5475830787918261e-06, + "loss": 0.6229, + "step": 17597 + }, + { + "epoch": 1.271371033286976, + "grad_norm": 6.163282685939115, + "learning_rate": 1.5473126408991857e-06, + "loss": 0.6334, + "step": 17598 + }, + { + "epoch": 1.2714432784871854, + "grad_norm": 7.00077461316433, + "learning_rate": 1.5470422160480952e-06, + "loss": 0.7198, + "step": 17599 + }, + { + "epoch": 1.271515523687395, + "grad_norm": 6.514897221232477, + "learning_rate": 1.5467718042422575e-06, + "loss": 0.628, + "step": 17600 + }, + { + "epoch": 1.2715877688876045, + "grad_norm": 6.760447293894495, + "learning_rate": 1.5465014054853732e-06, + "loss": 0.6931, + "step": 17601 + }, + { + "epoch": 1.271660014087814, + "grad_norm": 9.477528949714465, + "learning_rate": 1.5462310197811443e-06, + "loss": 0.672, + "step": 17602 + }, + { + "epoch": 1.2717322592880236, + "grad_norm": 6.417348486936268, + "learning_rate": 1.5459606471332726e-06, + "loss": 0.6952, + "step": 17603 + }, + { + "epoch": 1.2718045044882331, + "grad_norm": 5.4970969428039, + "learning_rate": 1.5456902875454582e-06, + "loss": 0.5931, + "step": 17604 + }, + { + "epoch": 1.2718767496884427, + "grad_norm": 8.95457012769876, + "learning_rate": 1.5454199410214033e-06, + "loss": 0.6912, + "step": 17605 + }, + { + "epoch": 1.271948994888652, + "grad_norm": 5.7714433955820965, + "learning_rate": 1.5451496075648078e-06, + "loss": 0.5804, + "step": 17606 + }, + { + "epoch": 1.2720212400888615, + "grad_norm": 6.554887116112715, + "learning_rate": 1.5448792871793738e-06, + "loss": 0.5735, + "step": 17607 + }, + { + "epoch": 1.272093485289071, + "grad_norm": 6.75062049556893, + "learning_rate": 1.5446089798688003e-06, + "loss": 0.6884, + "step": 17608 + }, + { + "epoch": 1.2721657304892806, + "grad_norm": 6.3963319996163746, + "learning_rate": 1.544338685636787e-06, + "loss": 0.727, + "step": 17609 + }, + { + "epoch": 1.2722379756894902, + "grad_norm": 6.510405794261755, + "learning_rate": 1.5440684044870367e-06, + "loss": 0.6282, + "step": 17610 + }, + { + "epoch": 1.2723102208896997, + "grad_norm": 6.965585580323211, + "learning_rate": 1.5437981364232466e-06, + "loss": 0.6701, + "step": 17611 + }, + { + "epoch": 1.2723824660899092, + "grad_norm": 8.231613014459816, + "learning_rate": 1.5435278814491178e-06, + "loss": 0.6815, + "step": 17612 + }, + { + "epoch": 1.2724547112901186, + "grad_norm": 6.486799480859788, + "learning_rate": 1.5432576395683492e-06, + "loss": 0.6786, + "step": 17613 + }, + { + "epoch": 1.272526956490328, + "grad_norm": 7.8699559603359255, + "learning_rate": 1.5429874107846415e-06, + "loss": 0.6027, + "step": 17614 + }, + { + "epoch": 1.2725992016905376, + "grad_norm": 7.128045485277542, + "learning_rate": 1.5427171951016927e-06, + "loss": 0.5946, + "step": 17615 + }, + { + "epoch": 1.2726714468907472, + "grad_norm": 6.83947564686237, + "learning_rate": 1.5424469925232021e-06, + "loss": 0.6438, + "step": 17616 + }, + { + "epoch": 1.2727436920909567, + "grad_norm": 6.317835903740441, + "learning_rate": 1.5421768030528689e-06, + "loss": 0.6067, + "step": 17617 + }, + { + "epoch": 1.2728159372911663, + "grad_norm": 6.172991147097851, + "learning_rate": 1.5419066266943916e-06, + "loss": 0.6439, + "step": 17618 + }, + { + "epoch": 1.2728881824913758, + "grad_norm": 6.3358369530015155, + "learning_rate": 1.5416364634514684e-06, + "loss": 0.6531, + "step": 17619 + }, + { + "epoch": 1.2729604276915851, + "grad_norm": 5.932776891497531, + "learning_rate": 1.5413663133277978e-06, + "loss": 0.5832, + "step": 17620 + }, + { + "epoch": 1.2730326728917947, + "grad_norm": 7.190784002649079, + "learning_rate": 1.541096176327079e-06, + "loss": 0.6726, + "step": 17621 + }, + { + "epoch": 1.2731049180920042, + "grad_norm": 6.2027049655098185, + "learning_rate": 1.5408260524530082e-06, + "loss": 0.6515, + "step": 17622 + }, + { + "epoch": 1.2731771632922138, + "grad_norm": 7.450420024731617, + "learning_rate": 1.5405559417092832e-06, + "loss": 0.6084, + "step": 17623 + }, + { + "epoch": 1.2732494084924233, + "grad_norm": 8.1030954733361, + "learning_rate": 1.5402858440996032e-06, + "loss": 0.5925, + "step": 17624 + }, + { + "epoch": 1.2733216536926328, + "grad_norm": 6.960932019849134, + "learning_rate": 1.5400157596276658e-06, + "loss": 0.651, + "step": 17625 + }, + { + "epoch": 1.2733938988928424, + "grad_norm": 6.450829196828897, + "learning_rate": 1.5397456882971662e-06, + "loss": 0.6454, + "step": 17626 + }, + { + "epoch": 1.2734661440930517, + "grad_norm": 6.532782415919728, + "learning_rate": 1.5394756301118024e-06, + "loss": 0.6205, + "step": 17627 + }, + { + "epoch": 1.2735383892932615, + "grad_norm": 8.386171547850692, + "learning_rate": 1.5392055850752725e-06, + "loss": 0.5979, + "step": 17628 + }, + { + "epoch": 1.2736106344934708, + "grad_norm": 7.860624968553088, + "learning_rate": 1.538935553191271e-06, + "loss": 0.6581, + "step": 17629 + }, + { + "epoch": 1.2736828796936803, + "grad_norm": 6.537757077824365, + "learning_rate": 1.5386655344634957e-06, + "loss": 0.6422, + "step": 17630 + }, + { + "epoch": 1.2737551248938899, + "grad_norm": 6.876925112215738, + "learning_rate": 1.538395528895643e-06, + "loss": 0.5981, + "step": 17631 + }, + { + "epoch": 1.2738273700940994, + "grad_norm": 8.14273764348124, + "learning_rate": 1.5381255364914092e-06, + "loss": 0.6487, + "step": 17632 + }, + { + "epoch": 1.273899615294309, + "grad_norm": 6.709797701337283, + "learning_rate": 1.5378555572544895e-06, + "loss": 0.6903, + "step": 17633 + }, + { + "epoch": 1.2739718604945183, + "grad_norm": 7.32817211013975, + "learning_rate": 1.53758559118858e-06, + "loss": 0.5873, + "step": 17634 + }, + { + "epoch": 1.274044105694728, + "grad_norm": 7.1199100450126664, + "learning_rate": 1.5373156382973774e-06, + "loss": 0.5757, + "step": 17635 + }, + { + "epoch": 1.2741163508949374, + "grad_norm": 6.648208273534382, + "learning_rate": 1.5370456985845758e-06, + "loss": 0.7004, + "step": 17636 + }, + { + "epoch": 1.274188596095147, + "grad_norm": 5.73759050152445, + "learning_rate": 1.53677577205387e-06, + "loss": 0.6003, + "step": 17637 + }, + { + "epoch": 1.2742608412953564, + "grad_norm": 7.195557166629449, + "learning_rate": 1.5365058587089565e-06, + "loss": 0.6864, + "step": 17638 + }, + { + "epoch": 1.274333086495566, + "grad_norm": 7.679659683316865, + "learning_rate": 1.5362359585535307e-06, + "loss": 0.6556, + "step": 17639 + }, + { + "epoch": 1.2744053316957755, + "grad_norm": 6.391454666116583, + "learning_rate": 1.5359660715912856e-06, + "loss": 0.5907, + "step": 17640 + }, + { + "epoch": 1.2744775768959848, + "grad_norm": 6.73492698442943, + "learning_rate": 1.535696197825916e-06, + "loss": 0.615, + "step": 17641 + }, + { + "epoch": 1.2745498220961946, + "grad_norm": 7.19859585315637, + "learning_rate": 1.5354263372611177e-06, + "loss": 0.6887, + "step": 17642 + }, + { + "epoch": 1.274622067296404, + "grad_norm": 6.992536653293728, + "learning_rate": 1.535156489900583e-06, + "loss": 0.7085, + "step": 17643 + }, + { + "epoch": 1.2746943124966135, + "grad_norm": 9.082891844306586, + "learning_rate": 1.5348866557480068e-06, + "loss": 0.6251, + "step": 17644 + }, + { + "epoch": 1.274766557696823, + "grad_norm": 7.018618754164426, + "learning_rate": 1.5346168348070834e-06, + "loss": 0.5991, + "step": 17645 + }, + { + "epoch": 1.2748388028970326, + "grad_norm": 10.02306757160311, + "learning_rate": 1.5343470270815058e-06, + "loss": 0.7075, + "step": 17646 + }, + { + "epoch": 1.274911048097242, + "grad_norm": 6.731402739170645, + "learning_rate": 1.5340772325749675e-06, + "loss": 0.7025, + "step": 17647 + }, + { + "epoch": 1.2749832932974514, + "grad_norm": 7.1343346733728055, + "learning_rate": 1.5338074512911621e-06, + "loss": 0.6703, + "step": 17648 + }, + { + "epoch": 1.2750555384976612, + "grad_norm": 7.630748145545245, + "learning_rate": 1.533537683233783e-06, + "loss": 0.6962, + "step": 17649 + }, + { + "epoch": 1.2751277836978705, + "grad_norm": 7.125804119577379, + "learning_rate": 1.533267928406522e-06, + "loss": 0.646, + "step": 17650 + }, + { + "epoch": 1.27520002889808, + "grad_norm": 8.309549596816492, + "learning_rate": 1.5329981868130718e-06, + "loss": 0.5999, + "step": 17651 + }, + { + "epoch": 1.2752722740982896, + "grad_norm": 9.723091750240442, + "learning_rate": 1.5327284584571262e-06, + "loss": 0.6761, + "step": 17652 + }, + { + "epoch": 1.2753445192984991, + "grad_norm": 8.851379338765822, + "learning_rate": 1.5324587433423776e-06, + "loss": 0.6796, + "step": 17653 + }, + { + "epoch": 1.2754167644987087, + "grad_norm": 6.480553514364268, + "learning_rate": 1.5321890414725168e-06, + "loss": 0.6316, + "step": 17654 + }, + { + "epoch": 1.2754890096989182, + "grad_norm": 7.15497001295117, + "learning_rate": 1.5319193528512366e-06, + "loss": 0.6495, + "step": 17655 + }, + { + "epoch": 1.2755612548991277, + "grad_norm": 5.849480252122466, + "learning_rate": 1.5316496774822287e-06, + "loss": 0.6196, + "step": 17656 + }, + { + "epoch": 1.275633500099337, + "grad_norm": 6.869339433254669, + "learning_rate": 1.5313800153691856e-06, + "loss": 0.7625, + "step": 17657 + }, + { + "epoch": 1.2757057452995466, + "grad_norm": 6.024233200220155, + "learning_rate": 1.5311103665157973e-06, + "loss": 0.6309, + "step": 17658 + }, + { + "epoch": 1.2757779904997562, + "grad_norm": 6.527263392028547, + "learning_rate": 1.5308407309257555e-06, + "loss": 0.705, + "step": 17659 + }, + { + "epoch": 1.2758502356999657, + "grad_norm": 6.642437425002087, + "learning_rate": 1.5305711086027525e-06, + "loss": 0.6303, + "step": 17660 + }, + { + "epoch": 1.2759224809001752, + "grad_norm": 6.887283172240626, + "learning_rate": 1.5303014995504778e-06, + "loss": 0.6614, + "step": 17661 + }, + { + "epoch": 1.2759947261003848, + "grad_norm": 7.34502071993134, + "learning_rate": 1.5300319037726225e-06, + "loss": 0.6433, + "step": 17662 + }, + { + "epoch": 1.2760669713005943, + "grad_norm": 6.552642103020234, + "learning_rate": 1.5297623212728773e-06, + "loss": 0.6495, + "step": 17663 + }, + { + "epoch": 1.2761392165008036, + "grad_norm": 7.3350453834715905, + "learning_rate": 1.5294927520549336e-06, + "loss": 0.6576, + "step": 17664 + }, + { + "epoch": 1.2762114617010132, + "grad_norm": 6.559544651696309, + "learning_rate": 1.529223196122479e-06, + "loss": 0.6158, + "step": 17665 + }, + { + "epoch": 1.2762837069012227, + "grad_norm": 6.967549721027488, + "learning_rate": 1.5289536534792057e-06, + "loss": 0.6876, + "step": 17666 + }, + { + "epoch": 1.2763559521014323, + "grad_norm": 7.62747827497431, + "learning_rate": 1.528684124128804e-06, + "loss": 0.6308, + "step": 17667 + }, + { + "epoch": 1.2764281973016418, + "grad_norm": 6.714738366774131, + "learning_rate": 1.5284146080749613e-06, + "loss": 0.604, + "step": 17668 + }, + { + "epoch": 1.2765004425018514, + "grad_norm": 7.1994516269816025, + "learning_rate": 1.5281451053213684e-06, + "loss": 0.66, + "step": 17669 + }, + { + "epoch": 1.276572687702061, + "grad_norm": 6.507071243329943, + "learning_rate": 1.5278756158717142e-06, + "loss": 0.592, + "step": 17670 + }, + { + "epoch": 1.2766449329022702, + "grad_norm": 6.806673410520037, + "learning_rate": 1.5276061397296887e-06, + "loss": 0.6476, + "step": 17671 + }, + { + "epoch": 1.2767171781024798, + "grad_norm": 6.602504324458423, + "learning_rate": 1.5273366768989794e-06, + "loss": 0.6923, + "step": 17672 + }, + { + "epoch": 1.2767894233026893, + "grad_norm": 6.85381117558802, + "learning_rate": 1.527067227383276e-06, + "loss": 0.7139, + "step": 17673 + }, + { + "epoch": 1.2768616685028988, + "grad_norm": 7.361782060990762, + "learning_rate": 1.5267977911862673e-06, + "loss": 0.6408, + "step": 17674 + }, + { + "epoch": 1.2769339137031084, + "grad_norm": 6.478325726885984, + "learning_rate": 1.5265283683116405e-06, + "loss": 0.5914, + "step": 17675 + }, + { + "epoch": 1.277006158903318, + "grad_norm": 8.609786174391905, + "learning_rate": 1.5262589587630847e-06, + "loss": 0.6534, + "step": 17676 + }, + { + "epoch": 1.2770784041035275, + "grad_norm": 7.860933122840289, + "learning_rate": 1.5259895625442878e-06, + "loss": 0.6306, + "step": 17677 + }, + { + "epoch": 1.2771506493037368, + "grad_norm": 6.476902638742383, + "learning_rate": 1.5257201796589382e-06, + "loss": 0.5851, + "step": 17678 + }, + { + "epoch": 1.2772228945039463, + "grad_norm": 9.418513114193056, + "learning_rate": 1.5254508101107217e-06, + "loss": 0.6161, + "step": 17679 + }, + { + "epoch": 1.2772951397041559, + "grad_norm": 6.728486710750646, + "learning_rate": 1.525181453903327e-06, + "loss": 0.5991, + "step": 17680 + }, + { + "epoch": 1.2773673849043654, + "grad_norm": 7.271770806707483, + "learning_rate": 1.5249121110404414e-06, + "loss": 0.6506, + "step": 17681 + }, + { + "epoch": 1.277439630104575, + "grad_norm": 9.42907905037821, + "learning_rate": 1.524642781525753e-06, + "loss": 0.6189, + "step": 17682 + }, + { + "epoch": 1.2775118753047845, + "grad_norm": 6.474321260557906, + "learning_rate": 1.5243734653629466e-06, + "loss": 0.6883, + "step": 17683 + }, + { + "epoch": 1.277584120504994, + "grad_norm": 8.129726033116686, + "learning_rate": 1.5241041625557102e-06, + "loss": 0.6354, + "step": 17684 + }, + { + "epoch": 1.2776563657052034, + "grad_norm": 6.8801483297992245, + "learning_rate": 1.5238348731077302e-06, + "loss": 0.6306, + "step": 17685 + }, + { + "epoch": 1.277728610905413, + "grad_norm": 6.325719606980899, + "learning_rate": 1.5235655970226926e-06, + "loss": 0.5788, + "step": 17686 + }, + { + "epoch": 1.2778008561056224, + "grad_norm": 7.0099101079599935, + "learning_rate": 1.5232963343042834e-06, + "loss": 0.5734, + "step": 17687 + }, + { + "epoch": 1.277873101305832, + "grad_norm": 6.659755589579805, + "learning_rate": 1.5230270849561895e-06, + "loss": 0.595, + "step": 17688 + }, + { + "epoch": 1.2779453465060415, + "grad_norm": 7.515066653506183, + "learning_rate": 1.5227578489820966e-06, + "loss": 0.6935, + "step": 17689 + }, + { + "epoch": 1.278017591706251, + "grad_norm": 6.546805370024262, + "learning_rate": 1.5224886263856891e-06, + "loss": 0.6263, + "step": 17690 + }, + { + "epoch": 1.2780898369064606, + "grad_norm": 6.228967295574439, + "learning_rate": 1.5222194171706539e-06, + "loss": 0.6698, + "step": 17691 + }, + { + "epoch": 1.27816208210667, + "grad_norm": 6.873747555485506, + "learning_rate": 1.5219502213406762e-06, + "loss": 0.6996, + "step": 17692 + }, + { + "epoch": 1.2782343273068795, + "grad_norm": 7.934332508455663, + "learning_rate": 1.52168103889944e-06, + "loss": 0.7008, + "step": 17693 + }, + { + "epoch": 1.278306572507089, + "grad_norm": 6.128987785268678, + "learning_rate": 1.52141186985063e-06, + "loss": 0.6515, + "step": 17694 + }, + { + "epoch": 1.2783788177072986, + "grad_norm": 7.017688473248749, + "learning_rate": 1.521142714197932e-06, + "loss": 0.5781, + "step": 17695 + }, + { + "epoch": 1.278451062907508, + "grad_norm": 6.431529711691694, + "learning_rate": 1.5208735719450316e-06, + "loss": 0.6048, + "step": 17696 + }, + { + "epoch": 1.2785233081077176, + "grad_norm": 8.349091495269812, + "learning_rate": 1.5206044430956108e-06, + "loss": 0.6092, + "step": 17697 + }, + { + "epoch": 1.2785955533079272, + "grad_norm": 6.602241147162318, + "learning_rate": 1.5203353276533544e-06, + "loss": 0.6179, + "step": 17698 + }, + { + "epoch": 1.2786677985081365, + "grad_norm": 6.729039461648001, + "learning_rate": 1.5200662256219473e-06, + "loss": 0.6226, + "step": 17699 + }, + { + "epoch": 1.2787400437083463, + "grad_norm": 6.02431963486237, + "learning_rate": 1.5197971370050722e-06, + "loss": 0.6695, + "step": 17700 + }, + { + "epoch": 1.2788122889085556, + "grad_norm": 7.007609999164524, + "learning_rate": 1.5195280618064131e-06, + "loss": 0.6639, + "step": 17701 + }, + { + "epoch": 1.2788845341087651, + "grad_norm": 7.701652834968664, + "learning_rate": 1.5192590000296537e-06, + "loss": 0.6656, + "step": 17702 + }, + { + "epoch": 1.2789567793089747, + "grad_norm": 7.040628240036844, + "learning_rate": 1.5189899516784775e-06, + "loss": 0.7064, + "step": 17703 + }, + { + "epoch": 1.2790290245091842, + "grad_norm": 5.807454205848493, + "learning_rate": 1.5187209167565669e-06, + "loss": 0.5653, + "step": 17704 + }, + { + "epoch": 1.2791012697093938, + "grad_norm": 6.240665025730744, + "learning_rate": 1.5184518952676046e-06, + "loss": 0.6633, + "step": 17705 + }, + { + "epoch": 1.279173514909603, + "grad_norm": 7.7270075686242325, + "learning_rate": 1.5181828872152743e-06, + "loss": 0.5782, + "step": 17706 + }, + { + "epoch": 1.2792457601098128, + "grad_norm": 6.3728170210235335, + "learning_rate": 1.5179138926032585e-06, + "loss": 0.6267, + "step": 17707 + }, + { + "epoch": 1.2793180053100222, + "grad_norm": 8.58595127316343, + "learning_rate": 1.5176449114352376e-06, + "loss": 0.6679, + "step": 17708 + }, + { + "epoch": 1.2793902505102317, + "grad_norm": 7.048406802402117, + "learning_rate": 1.5173759437148955e-06, + "loss": 0.7122, + "step": 17709 + }, + { + "epoch": 1.2794624957104412, + "grad_norm": 5.605709142547866, + "learning_rate": 1.517106989445915e-06, + "loss": 0.6027, + "step": 17710 + }, + { + "epoch": 1.2795347409106508, + "grad_norm": 6.906850106934289, + "learning_rate": 1.5168380486319755e-06, + "loss": 0.687, + "step": 17711 + }, + { + "epoch": 1.2796069861108603, + "grad_norm": 5.8763260663726005, + "learning_rate": 1.5165691212767597e-06, + "loss": 0.6743, + "step": 17712 + }, + { + "epoch": 1.2796792313110696, + "grad_norm": 7.269937658659186, + "learning_rate": 1.5163002073839492e-06, + "loss": 0.7357, + "step": 17713 + }, + { + "epoch": 1.2797514765112794, + "grad_norm": 6.081592815965174, + "learning_rate": 1.5160313069572254e-06, + "loss": 0.6727, + "step": 17714 + }, + { + "epoch": 1.2798237217114887, + "grad_norm": 6.720555262558993, + "learning_rate": 1.5157624200002685e-06, + "loss": 0.7035, + "step": 17715 + }, + { + "epoch": 1.2798959669116983, + "grad_norm": 7.025974312005345, + "learning_rate": 1.5154935465167602e-06, + "loss": 0.6049, + "step": 17716 + }, + { + "epoch": 1.2799682121119078, + "grad_norm": 9.010991060915682, + "learning_rate": 1.5152246865103809e-06, + "loss": 0.7433, + "step": 17717 + }, + { + "epoch": 1.2800404573121174, + "grad_norm": 6.857244417028829, + "learning_rate": 1.5149558399848108e-06, + "loss": 0.6407, + "step": 17718 + }, + { + "epoch": 1.280112702512327, + "grad_norm": 6.243270608179889, + "learning_rate": 1.5146870069437302e-06, + "loss": 0.5652, + "step": 17719 + }, + { + "epoch": 1.2801849477125362, + "grad_norm": 7.0236876106919715, + "learning_rate": 1.5144181873908198e-06, + "loss": 0.6463, + "step": 17720 + }, + { + "epoch": 1.280257192912746, + "grad_norm": 5.987612971313216, + "learning_rate": 1.5141493813297598e-06, + "loss": 0.609, + "step": 17721 + }, + { + "epoch": 1.2803294381129553, + "grad_norm": 6.918240594073067, + "learning_rate": 1.5138805887642278e-06, + "loss": 0.6393, + "step": 17722 + }, + { + "epoch": 1.2804016833131648, + "grad_norm": 6.526823889475533, + "learning_rate": 1.5136118096979056e-06, + "loss": 0.63, + "step": 17723 + }, + { + "epoch": 1.2804739285133744, + "grad_norm": 6.462320126773235, + "learning_rate": 1.5133430441344727e-06, + "loss": 0.5436, + "step": 17724 + }, + { + "epoch": 1.280546173713584, + "grad_norm": 8.066305519195508, + "learning_rate": 1.5130742920776065e-06, + "loss": 0.686, + "step": 17725 + }, + { + "epoch": 1.2806184189137935, + "grad_norm": 8.526766203099756, + "learning_rate": 1.5128055535309868e-06, + "loss": 0.6809, + "step": 17726 + }, + { + "epoch": 1.280690664114003, + "grad_norm": 7.694979123092796, + "learning_rate": 1.5125368284982929e-06, + "loss": 0.599, + "step": 17727 + }, + { + "epoch": 1.2807629093142125, + "grad_norm": 7.859210943553649, + "learning_rate": 1.512268116983203e-06, + "loss": 0.6483, + "step": 17728 + }, + { + "epoch": 1.2808351545144219, + "grad_norm": 6.828062148863339, + "learning_rate": 1.5119994189893956e-06, + "loss": 0.6082, + "step": 17729 + }, + { + "epoch": 1.2809073997146314, + "grad_norm": 9.135409335949406, + "learning_rate": 1.511730734520549e-06, + "loss": 0.6615, + "step": 17730 + }, + { + "epoch": 1.280979644914841, + "grad_norm": 7.010959220701254, + "learning_rate": 1.5114620635803412e-06, + "loss": 0.6798, + "step": 17731 + }, + { + "epoch": 1.2810518901150505, + "grad_norm": 6.61393850792929, + "learning_rate": 1.5111934061724509e-06, + "loss": 0.6199, + "step": 17732 + }, + { + "epoch": 1.28112413531526, + "grad_norm": 7.1136933678152605, + "learning_rate": 1.5109247623005544e-06, + "loss": 0.6208, + "step": 17733 + }, + { + "epoch": 1.2811963805154696, + "grad_norm": 7.1625053685352915, + "learning_rate": 1.51065613196833e-06, + "loss": 0.6301, + "step": 17734 + }, + { + "epoch": 1.2812686257156791, + "grad_norm": 8.048874808391068, + "learning_rate": 1.5103875151794559e-06, + "loss": 0.7505, + "step": 17735 + }, + { + "epoch": 1.2813408709158884, + "grad_norm": 6.408920996631528, + "learning_rate": 1.5101189119376069e-06, + "loss": 0.5375, + "step": 17736 + }, + { + "epoch": 1.281413116116098, + "grad_norm": 7.052882575118361, + "learning_rate": 1.5098503222464617e-06, + "loss": 0.6674, + "step": 17737 + }, + { + "epoch": 1.2814853613163075, + "grad_norm": 6.64297551461272, + "learning_rate": 1.5095817461096973e-06, + "loss": 0.7058, + "step": 17738 + }, + { + "epoch": 1.281557606516517, + "grad_norm": 7.034102611445176, + "learning_rate": 1.5093131835309905e-06, + "loss": 0.6747, + "step": 17739 + }, + { + "epoch": 1.2816298517167266, + "grad_norm": 8.066097905728933, + "learning_rate": 1.5090446345140158e-06, + "loss": 0.6214, + "step": 17740 + }, + { + "epoch": 1.2817020969169362, + "grad_norm": 6.305794355190405, + "learning_rate": 1.5087760990624505e-06, + "loss": 0.6604, + "step": 17741 + }, + { + "epoch": 1.2817743421171457, + "grad_norm": 7.130720285037639, + "learning_rate": 1.5085075771799717e-06, + "loss": 0.6747, + "step": 17742 + }, + { + "epoch": 1.281846587317355, + "grad_norm": 6.575785241177356, + "learning_rate": 1.5082390688702538e-06, + "loss": 0.6375, + "step": 17743 + }, + { + "epoch": 1.2819188325175646, + "grad_norm": 6.346848806818335, + "learning_rate": 1.507970574136973e-06, + "loss": 0.6431, + "step": 17744 + }, + { + "epoch": 1.281991077717774, + "grad_norm": 5.975837693488337, + "learning_rate": 1.5077020929838046e-06, + "loss": 0.5723, + "step": 17745 + }, + { + "epoch": 1.2820633229179836, + "grad_norm": 8.054630194237223, + "learning_rate": 1.5074336254144245e-06, + "loss": 0.5747, + "step": 17746 + }, + { + "epoch": 1.2821355681181932, + "grad_norm": 6.411046758395237, + "learning_rate": 1.507165171432507e-06, + "loss": 0.6358, + "step": 17747 + }, + { + "epoch": 1.2822078133184027, + "grad_norm": 7.727240090122759, + "learning_rate": 1.5068967310417276e-06, + "loss": 0.6893, + "step": 17748 + }, + { + "epoch": 1.2822800585186123, + "grad_norm": 6.964414607940773, + "learning_rate": 1.5066283042457618e-06, + "loss": 0.631, + "step": 17749 + }, + { + "epoch": 1.2823523037188216, + "grad_norm": 6.317910170469278, + "learning_rate": 1.5063598910482815e-06, + "loss": 0.6335, + "step": 17750 + }, + { + "epoch": 1.2824245489190311, + "grad_norm": 5.910457343722471, + "learning_rate": 1.5060914914529634e-06, + "loss": 0.6889, + "step": 17751 + }, + { + "epoch": 1.2824967941192407, + "grad_norm": 6.781641232503861, + "learning_rate": 1.5058231054634809e-06, + "loss": 0.5955, + "step": 17752 + }, + { + "epoch": 1.2825690393194502, + "grad_norm": 6.479398504527157, + "learning_rate": 1.5055547330835097e-06, + "loss": 0.8278, + "step": 17753 + }, + { + "epoch": 1.2826412845196598, + "grad_norm": 6.365324269655186, + "learning_rate": 1.5052863743167206e-06, + "loss": 0.625, + "step": 17754 + }, + { + "epoch": 1.2827135297198693, + "grad_norm": 5.5582250391044035, + "learning_rate": 1.5050180291667887e-06, + "loss": 0.566, + "step": 17755 + }, + { + "epoch": 1.2827857749200788, + "grad_norm": 6.676280655325379, + "learning_rate": 1.5047496976373874e-06, + "loss": 0.6383, + "step": 17756 + }, + { + "epoch": 1.2828580201202882, + "grad_norm": 7.413823243472239, + "learning_rate": 1.5044813797321906e-06, + "loss": 0.6573, + "step": 17757 + }, + { + "epoch": 1.2829302653204977, + "grad_norm": 6.1354110363695336, + "learning_rate": 1.50421307545487e-06, + "loss": 0.6445, + "step": 17758 + }, + { + "epoch": 1.2830025105207072, + "grad_norm": 6.870476639161768, + "learning_rate": 1.5039447848090996e-06, + "loss": 0.5785, + "step": 17759 + }, + { + "epoch": 1.2830747557209168, + "grad_norm": 7.231726406694951, + "learning_rate": 1.503676507798552e-06, + "loss": 0.6882, + "step": 17760 + }, + { + "epoch": 1.2831470009211263, + "grad_norm": 6.108129818359274, + "learning_rate": 1.503408244426899e-06, + "loss": 0.6288, + "step": 17761 + }, + { + "epoch": 1.2832192461213359, + "grad_norm": 6.424472701109772, + "learning_rate": 1.5031399946978134e-06, + "loss": 0.6756, + "step": 17762 + }, + { + "epoch": 1.2832914913215454, + "grad_norm": 5.919957315961216, + "learning_rate": 1.5028717586149672e-06, + "loss": 0.6181, + "step": 17763 + }, + { + "epoch": 1.2833637365217547, + "grad_norm": 5.96351050045304, + "learning_rate": 1.5026035361820328e-06, + "loss": 0.5912, + "step": 17764 + }, + { + "epoch": 1.2834359817219643, + "grad_norm": 6.258428459951434, + "learning_rate": 1.502335327402681e-06, + "loss": 0.5984, + "step": 17765 + }, + { + "epoch": 1.2835082269221738, + "grad_norm": 6.68031110027702, + "learning_rate": 1.502067132280584e-06, + "loss": 0.6275, + "step": 17766 + }, + { + "epoch": 1.2835804721223834, + "grad_norm": 6.700512926196105, + "learning_rate": 1.5017989508194141e-06, + "loss": 0.6161, + "step": 17767 + }, + { + "epoch": 1.283652717322593, + "grad_norm": 6.962846522037745, + "learning_rate": 1.501530783022841e-06, + "loss": 0.6629, + "step": 17768 + }, + { + "epoch": 1.2837249625228024, + "grad_norm": 6.234178888969881, + "learning_rate": 1.5012626288945353e-06, + "loss": 0.6042, + "step": 17769 + }, + { + "epoch": 1.283797207723012, + "grad_norm": 5.486515160126228, + "learning_rate": 1.5009944884381693e-06, + "loss": 0.613, + "step": 17770 + }, + { + "epoch": 1.2838694529232213, + "grad_norm": 7.235875808232076, + "learning_rate": 1.5007263616574135e-06, + "loss": 0.6035, + "step": 17771 + }, + { + "epoch": 1.283941698123431, + "grad_norm": 6.667542527202266, + "learning_rate": 1.5004582485559376e-06, + "loss": 0.5818, + "step": 17772 + }, + { + "epoch": 1.2840139433236404, + "grad_norm": 6.0862800394015375, + "learning_rate": 1.500190149137412e-06, + "loss": 0.6716, + "step": 17773 + }, + { + "epoch": 1.28408618852385, + "grad_norm": 6.667498187024471, + "learning_rate": 1.4999220634055072e-06, + "loss": 0.5848, + "step": 17774 + }, + { + "epoch": 1.2841584337240595, + "grad_norm": 7.031508513042855, + "learning_rate": 1.499653991363893e-06, + "loss": 0.6357, + "step": 17775 + }, + { + "epoch": 1.284230678924269, + "grad_norm": 6.135145232106813, + "learning_rate": 1.4993859330162386e-06, + "loss": 0.6058, + "step": 17776 + }, + { + "epoch": 1.2843029241244786, + "grad_norm": 7.220061740347089, + "learning_rate": 1.4991178883662135e-06, + "loss": 0.6408, + "step": 17777 + }, + { + "epoch": 1.2843751693246879, + "grad_norm": 6.692179531081089, + "learning_rate": 1.4988498574174882e-06, + "loss": 0.6019, + "step": 17778 + }, + { + "epoch": 1.2844474145248976, + "grad_norm": 7.2713574181436424, + "learning_rate": 1.4985818401737306e-06, + "loss": 0.6468, + "step": 17779 + }, + { + "epoch": 1.284519659725107, + "grad_norm": 7.688851090462644, + "learning_rate": 1.4983138366386098e-06, + "loss": 0.5914, + "step": 17780 + }, + { + "epoch": 1.2845919049253165, + "grad_norm": 7.940919635085026, + "learning_rate": 1.498045846815796e-06, + "loss": 0.672, + "step": 17781 + }, + { + "epoch": 1.284664150125526, + "grad_norm": 9.010444938012608, + "learning_rate": 1.4977778707089558e-06, + "loss": 0.6919, + "step": 17782 + }, + { + "epoch": 1.2847363953257356, + "grad_norm": 7.2519407634388315, + "learning_rate": 1.4975099083217582e-06, + "loss": 0.6204, + "step": 17783 + }, + { + "epoch": 1.2848086405259451, + "grad_norm": 6.627068826355137, + "learning_rate": 1.4972419596578713e-06, + "loss": 0.6906, + "step": 17784 + }, + { + "epoch": 1.2848808857261544, + "grad_norm": 6.388202534540874, + "learning_rate": 1.4969740247209642e-06, + "loss": 0.6196, + "step": 17785 + }, + { + "epoch": 1.2849531309263642, + "grad_norm": 5.90140589111605, + "learning_rate": 1.496706103514703e-06, + "loss": 0.6412, + "step": 17786 + }, + { + "epoch": 1.2850253761265735, + "grad_norm": 6.452567826941403, + "learning_rate": 1.4964381960427566e-06, + "loss": 0.6067, + "step": 17787 + }, + { + "epoch": 1.285097621326783, + "grad_norm": 6.955641936042398, + "learning_rate": 1.496170302308792e-06, + "loss": 0.6314, + "step": 17788 + }, + { + "epoch": 1.2851698665269926, + "grad_norm": 7.523242032848236, + "learning_rate": 1.495902422316477e-06, + "loss": 0.7007, + "step": 17789 + }, + { + "epoch": 1.2852421117272022, + "grad_norm": 6.327586059226225, + "learning_rate": 1.495634556069478e-06, + "loss": 0.654, + "step": 17790 + }, + { + "epoch": 1.2853143569274117, + "grad_norm": 6.4938328169534065, + "learning_rate": 1.4953667035714615e-06, + "loss": 0.6067, + "step": 17791 + }, + { + "epoch": 1.285386602127621, + "grad_norm": 6.353282662690289, + "learning_rate": 1.4950988648260962e-06, + "loss": 0.6247, + "step": 17792 + }, + { + "epoch": 1.2854588473278308, + "grad_norm": 5.396988537626197, + "learning_rate": 1.494831039837046e-06, + "loss": 0.6131, + "step": 17793 + }, + { + "epoch": 1.28553109252804, + "grad_norm": 5.8308010962954775, + "learning_rate": 1.494563228607978e-06, + "loss": 0.6076, + "step": 17794 + }, + { + "epoch": 1.2856033377282496, + "grad_norm": 6.944723956099049, + "learning_rate": 1.4942954311425595e-06, + "loss": 0.618, + "step": 17795 + }, + { + "epoch": 1.2856755829284592, + "grad_norm": 7.609328878838667, + "learning_rate": 1.4940276474444562e-06, + "loss": 0.6617, + "step": 17796 + }, + { + "epoch": 1.2857478281286687, + "grad_norm": 6.222152929071569, + "learning_rate": 1.4937598775173326e-06, + "loss": 0.6654, + "step": 17797 + }, + { + "epoch": 1.2858200733288783, + "grad_norm": 7.865795993575528, + "learning_rate": 1.4934921213648549e-06, + "loss": 0.6623, + "step": 17798 + }, + { + "epoch": 1.2858923185290876, + "grad_norm": 6.605635358354927, + "learning_rate": 1.4932243789906892e-06, + "loss": 0.6235, + "step": 17799 + }, + { + "epoch": 1.2859645637292973, + "grad_norm": 6.002864471771379, + "learning_rate": 1.4929566503984994e-06, + "loss": 0.616, + "step": 17800 + }, + { + "epoch": 1.2860368089295067, + "grad_norm": 6.698384491604172, + "learning_rate": 1.492688935591951e-06, + "loss": 0.7133, + "step": 17801 + }, + { + "epoch": 1.2861090541297162, + "grad_norm": 6.112206312741863, + "learning_rate": 1.4924212345747092e-06, + "loss": 0.625, + "step": 17802 + }, + { + "epoch": 1.2861812993299258, + "grad_norm": 6.696633634788021, + "learning_rate": 1.4921535473504383e-06, + "loss": 0.6065, + "step": 17803 + }, + { + "epoch": 1.2862535445301353, + "grad_norm": 6.827713244700268, + "learning_rate": 1.491885873922803e-06, + "loss": 0.5919, + "step": 17804 + }, + { + "epoch": 1.2863257897303448, + "grad_norm": 5.650854076547509, + "learning_rate": 1.4916182142954672e-06, + "loss": 0.6062, + "step": 17805 + }, + { + "epoch": 1.2863980349305544, + "grad_norm": 8.535609881394725, + "learning_rate": 1.4913505684720958e-06, + "loss": 0.6221, + "step": 17806 + }, + { + "epoch": 1.286470280130764, + "grad_norm": 10.285390281115408, + "learning_rate": 1.4910829364563506e-06, + "loss": 0.5907, + "step": 17807 + }, + { + "epoch": 1.2865425253309732, + "grad_norm": 6.182709968725126, + "learning_rate": 1.4908153182518969e-06, + "loss": 0.6158, + "step": 17808 + }, + { + "epoch": 1.2866147705311828, + "grad_norm": 6.929975123064932, + "learning_rate": 1.4905477138623983e-06, + "loss": 0.6411, + "step": 17809 + }, + { + "epoch": 1.2866870157313923, + "grad_norm": 8.926630412971356, + "learning_rate": 1.4902801232915186e-06, + "loss": 0.7297, + "step": 17810 + }, + { + "epoch": 1.2867592609316019, + "grad_norm": 7.605835892525123, + "learning_rate": 1.4900125465429188e-06, + "loss": 0.7367, + "step": 17811 + }, + { + "epoch": 1.2868315061318114, + "grad_norm": 6.047830038033916, + "learning_rate": 1.4897449836202633e-06, + "loss": 0.6335, + "step": 17812 + }, + { + "epoch": 1.286903751332021, + "grad_norm": 6.428381229796169, + "learning_rate": 1.489477434527214e-06, + "loss": 0.6573, + "step": 17813 + }, + { + "epoch": 1.2869759965322305, + "grad_norm": 7.543998596595803, + "learning_rate": 1.4892098992674353e-06, + "loss": 0.6609, + "step": 17814 + }, + { + "epoch": 1.2870482417324398, + "grad_norm": 7.5742371329231535, + "learning_rate": 1.4889423778445877e-06, + "loss": 0.6696, + "step": 17815 + }, + { + "epoch": 1.2871204869326494, + "grad_norm": 7.241762150821598, + "learning_rate": 1.4886748702623334e-06, + "loss": 0.6622, + "step": 17816 + }, + { + "epoch": 1.287192732132859, + "grad_norm": 5.9224779129099545, + "learning_rate": 1.4884073765243357e-06, + "loss": 0.6519, + "step": 17817 + }, + { + "epoch": 1.2872649773330684, + "grad_norm": 8.592579820898706, + "learning_rate": 1.488139896634255e-06, + "loss": 0.6965, + "step": 17818 + }, + { + "epoch": 1.287337222533278, + "grad_norm": 6.142361383298536, + "learning_rate": 1.4878724305957533e-06, + "loss": 0.6255, + "step": 17819 + }, + { + "epoch": 1.2874094677334875, + "grad_norm": 7.8958988505709256, + "learning_rate": 1.4876049784124924e-06, + "loss": 0.6422, + "step": 17820 + }, + { + "epoch": 1.287481712933697, + "grad_norm": 6.951688536660873, + "learning_rate": 1.4873375400881337e-06, + "loss": 0.6882, + "step": 17821 + }, + { + "epoch": 1.2875539581339064, + "grad_norm": 6.36497277433052, + "learning_rate": 1.4870701156263373e-06, + "loss": 0.653, + "step": 17822 + }, + { + "epoch": 1.287626203334116, + "grad_norm": 7.155742510662765, + "learning_rate": 1.4868027050307643e-06, + "loss": 0.6187, + "step": 17823 + }, + { + "epoch": 1.2876984485343255, + "grad_norm": 7.702853617810861, + "learning_rate": 1.4865353083050772e-06, + "loss": 0.6348, + "step": 17824 + }, + { + "epoch": 1.287770693734535, + "grad_norm": 6.749632083614643, + "learning_rate": 1.486267925452933e-06, + "loss": 0.5525, + "step": 17825 + }, + { + "epoch": 1.2878429389347446, + "grad_norm": 8.525176069393533, + "learning_rate": 1.4860005564779944e-06, + "loss": 0.6405, + "step": 17826 + }, + { + "epoch": 1.287915184134954, + "grad_norm": 8.811417316997154, + "learning_rate": 1.4857332013839199e-06, + "loss": 0.6794, + "step": 17827 + }, + { + "epoch": 1.2879874293351636, + "grad_norm": 7.008727763443273, + "learning_rate": 1.4854658601743722e-06, + "loss": 0.5214, + "step": 17828 + }, + { + "epoch": 1.288059674535373, + "grad_norm": 8.000461565053996, + "learning_rate": 1.485198532853008e-06, + "loss": 0.6629, + "step": 17829 + }, + { + "epoch": 1.2881319197355825, + "grad_norm": 6.059103417372253, + "learning_rate": 1.484931219423488e-06, + "loss": 0.6468, + "step": 17830 + }, + { + "epoch": 1.288204164935792, + "grad_norm": 7.1452328028301935, + "learning_rate": 1.4846639198894719e-06, + "loss": 0.6376, + "step": 17831 + }, + { + "epoch": 1.2882764101360016, + "grad_norm": 7.064024634070986, + "learning_rate": 1.4843966342546179e-06, + "loss": 0.5861, + "step": 17832 + }, + { + "epoch": 1.2883486553362111, + "grad_norm": 6.685172843667322, + "learning_rate": 1.4841293625225856e-06, + "loss": 0.6352, + "step": 17833 + }, + { + "epoch": 1.2884209005364207, + "grad_norm": 8.045227948854118, + "learning_rate": 1.483862104697033e-06, + "loss": 0.6847, + "step": 17834 + }, + { + "epoch": 1.2884931457366302, + "grad_norm": 7.645854509897728, + "learning_rate": 1.4835948607816203e-06, + "loss": 0.7052, + "step": 17835 + }, + { + "epoch": 1.2885653909368395, + "grad_norm": 6.531642774243113, + "learning_rate": 1.4833276307800038e-06, + "loss": 0.5743, + "step": 17836 + }, + { + "epoch": 1.288637636137049, + "grad_norm": 6.3332805965971835, + "learning_rate": 1.483060414695843e-06, + "loss": 0.6354, + "step": 17837 + }, + { + "epoch": 1.2887098813372586, + "grad_norm": 6.877971215199245, + "learning_rate": 1.4827932125327953e-06, + "loss": 0.5711, + "step": 17838 + }, + { + "epoch": 1.2887821265374682, + "grad_norm": 8.93152295837373, + "learning_rate": 1.4825260242945199e-06, + "loss": 0.6673, + "step": 17839 + }, + { + "epoch": 1.2888543717376777, + "grad_norm": 7.746430590255646, + "learning_rate": 1.482258849984672e-06, + "loss": 0.6199, + "step": 17840 + }, + { + "epoch": 1.2889266169378872, + "grad_norm": 6.1453087879192, + "learning_rate": 1.4819916896069097e-06, + "loss": 0.6576, + "step": 17841 + }, + { + "epoch": 1.2889988621380968, + "grad_norm": 6.703886117828267, + "learning_rate": 1.4817245431648925e-06, + "loss": 0.6437, + "step": 17842 + }, + { + "epoch": 1.289071107338306, + "grad_norm": 7.273116779734146, + "learning_rate": 1.4814574106622747e-06, + "loss": 0.7343, + "step": 17843 + }, + { + "epoch": 1.2891433525385156, + "grad_norm": 6.055494212890762, + "learning_rate": 1.4811902921027138e-06, + "loss": 0.6191, + "step": 17844 + }, + { + "epoch": 1.2892155977387252, + "grad_norm": 5.886516241021556, + "learning_rate": 1.480923187489867e-06, + "loss": 0.655, + "step": 17845 + }, + { + "epoch": 1.2892878429389347, + "grad_norm": 6.413441264353978, + "learning_rate": 1.480656096827391e-06, + "loss": 0.6467, + "step": 17846 + }, + { + "epoch": 1.2893600881391443, + "grad_norm": 6.122959653485632, + "learning_rate": 1.4803890201189408e-06, + "loss": 0.6764, + "step": 17847 + }, + { + "epoch": 1.2894323333393538, + "grad_norm": 8.680801114148304, + "learning_rate": 1.4801219573681736e-06, + "loss": 0.6695, + "step": 17848 + }, + { + "epoch": 1.2895045785395634, + "grad_norm": 6.374388123113606, + "learning_rate": 1.4798549085787451e-06, + "loss": 0.5887, + "step": 17849 + }, + { + "epoch": 1.2895768237397727, + "grad_norm": 6.4540837570916825, + "learning_rate": 1.47958787375431e-06, + "loss": 0.6263, + "step": 17850 + }, + { + "epoch": 1.2896490689399824, + "grad_norm": 7.147769617450635, + "learning_rate": 1.4793208528985254e-06, + "loss": 0.6273, + "step": 17851 + }, + { + "epoch": 1.2897213141401918, + "grad_norm": 5.961196033875826, + "learning_rate": 1.4790538460150455e-06, + "loss": 0.6589, + "step": 17852 + }, + { + "epoch": 1.2897935593404013, + "grad_norm": 7.399006452103653, + "learning_rate": 1.4787868531075266e-06, + "loss": 0.6137, + "step": 17853 + }, + { + "epoch": 1.2898658045406108, + "grad_norm": 6.51519905728931, + "learning_rate": 1.478519874179622e-06, + "loss": 0.6367, + "step": 17854 + }, + { + "epoch": 1.2899380497408204, + "grad_norm": 6.181167907478283, + "learning_rate": 1.4782529092349863e-06, + "loss": 0.5911, + "step": 17855 + }, + { + "epoch": 1.29001029494103, + "grad_norm": 7.278897801896192, + "learning_rate": 1.4779859582772766e-06, + "loss": 0.6248, + "step": 17856 + }, + { + "epoch": 1.2900825401412392, + "grad_norm": 6.451916597663296, + "learning_rate": 1.4777190213101446e-06, + "loss": 0.6307, + "step": 17857 + }, + { + "epoch": 1.290154785341449, + "grad_norm": 6.942959398641306, + "learning_rate": 1.4774520983372457e-06, + "loss": 0.6498, + "step": 17858 + }, + { + "epoch": 1.2902270305416583, + "grad_norm": 6.359078665162632, + "learning_rate": 1.4771851893622336e-06, + "loss": 0.6353, + "step": 17859 + }, + { + "epoch": 1.2902992757418679, + "grad_norm": 6.395503264451925, + "learning_rate": 1.4769182943887622e-06, + "loss": 0.5937, + "step": 17860 + }, + { + "epoch": 1.2903715209420774, + "grad_norm": 7.047624163337334, + "learning_rate": 1.4766514134204845e-06, + "loss": 0.5968, + "step": 17861 + }, + { + "epoch": 1.290443766142287, + "grad_norm": 7.4239622373442655, + "learning_rate": 1.4763845464610549e-06, + "loss": 0.6958, + "step": 17862 + }, + { + "epoch": 1.2905160113424965, + "grad_norm": 5.957339899790898, + "learning_rate": 1.4761176935141256e-06, + "loss": 0.6454, + "step": 17863 + }, + { + "epoch": 1.2905882565427058, + "grad_norm": 6.387709869102241, + "learning_rate": 1.4758508545833505e-06, + "loss": 0.6462, + "step": 17864 + }, + { + "epoch": 1.2906605017429156, + "grad_norm": 6.414966440566447, + "learning_rate": 1.475584029672382e-06, + "loss": 0.6191, + "step": 17865 + }, + { + "epoch": 1.290732746943125, + "grad_norm": 6.710982121595688, + "learning_rate": 1.4753172187848725e-06, + "loss": 0.6108, + "step": 17866 + }, + { + "epoch": 1.2908049921433344, + "grad_norm": 7.737367240000996, + "learning_rate": 1.4750504219244754e-06, + "loss": 0.685, + "step": 17867 + }, + { + "epoch": 1.290877237343544, + "grad_norm": 7.207891579323811, + "learning_rate": 1.4747836390948417e-06, + "loss": 0.6551, + "step": 17868 + }, + { + "epoch": 1.2909494825437535, + "grad_norm": 5.8215429042384175, + "learning_rate": 1.4745168702996235e-06, + "loss": 0.6033, + "step": 17869 + }, + { + "epoch": 1.291021727743963, + "grad_norm": 7.46624305181916, + "learning_rate": 1.4742501155424727e-06, + "loss": 0.6438, + "step": 17870 + }, + { + "epoch": 1.2910939729441724, + "grad_norm": 7.279008381139241, + "learning_rate": 1.473983374827043e-06, + "loss": 0.7489, + "step": 17871 + }, + { + "epoch": 1.2911662181443821, + "grad_norm": 7.194221074168744, + "learning_rate": 1.4737166481569832e-06, + "loss": 0.6748, + "step": 17872 + }, + { + "epoch": 1.2912384633445915, + "grad_norm": 7.30462352704273, + "learning_rate": 1.4734499355359454e-06, + "loss": 0.6667, + "step": 17873 + }, + { + "epoch": 1.291310708544801, + "grad_norm": 6.264756140076716, + "learning_rate": 1.473183236967582e-06, + "loss": 0.6678, + "step": 17874 + }, + { + "epoch": 1.2913829537450106, + "grad_norm": 7.416411434739576, + "learning_rate": 1.4729165524555418e-06, + "loss": 0.7038, + "step": 17875 + }, + { + "epoch": 1.29145519894522, + "grad_norm": 7.487555033333463, + "learning_rate": 1.4726498820034768e-06, + "loss": 0.6433, + "step": 17876 + }, + { + "epoch": 1.2915274441454296, + "grad_norm": 7.634960235512138, + "learning_rate": 1.4723832256150369e-06, + "loss": 0.7061, + "step": 17877 + }, + { + "epoch": 1.2915996893456392, + "grad_norm": 6.683626853731578, + "learning_rate": 1.4721165832938736e-06, + "loss": 0.5683, + "step": 17878 + }, + { + "epoch": 1.2916719345458487, + "grad_norm": 9.009580387416731, + "learning_rate": 1.4718499550436354e-06, + "loss": 0.6598, + "step": 17879 + }, + { + "epoch": 1.291744179746058, + "grad_norm": 5.945079262528309, + "learning_rate": 1.471583340867973e-06, + "loss": 0.5991, + "step": 17880 + }, + { + "epoch": 1.2918164249462676, + "grad_norm": 7.787313649502788, + "learning_rate": 1.4713167407705376e-06, + "loss": 0.5659, + "step": 17881 + }, + { + "epoch": 1.2918886701464771, + "grad_norm": 6.2236992392353745, + "learning_rate": 1.471050154754976e-06, + "loss": 0.6418, + "step": 17882 + }, + { + "epoch": 1.2919609153466867, + "grad_norm": 7.108935801595143, + "learning_rate": 1.4707835828249386e-06, + "loss": 0.6698, + "step": 17883 + }, + { + "epoch": 1.2920331605468962, + "grad_norm": 7.092498534467651, + "learning_rate": 1.4705170249840745e-06, + "loss": 0.6219, + "step": 17884 + }, + { + "epoch": 1.2921054057471058, + "grad_norm": 6.4162810899229115, + "learning_rate": 1.4702504812360346e-06, + "loss": 0.6499, + "step": 17885 + }, + { + "epoch": 1.2921776509473153, + "grad_norm": 6.93934455612721, + "learning_rate": 1.469983951584465e-06, + "loss": 0.6627, + "step": 17886 + }, + { + "epoch": 1.2922498961475246, + "grad_norm": 7.076438989317385, + "learning_rate": 1.4697174360330155e-06, + "loss": 0.6109, + "step": 17887 + }, + { + "epoch": 1.2923221413477342, + "grad_norm": 6.712621546378311, + "learning_rate": 1.4694509345853342e-06, + "loss": 0.6137, + "step": 17888 + }, + { + "epoch": 1.2923943865479437, + "grad_norm": 7.063375773186106, + "learning_rate": 1.46918444724507e-06, + "loss": 0.666, + "step": 17889 + }, + { + "epoch": 1.2924666317481532, + "grad_norm": 6.956814933382517, + "learning_rate": 1.4689179740158696e-06, + "loss": 0.6785, + "step": 17890 + }, + { + "epoch": 1.2925388769483628, + "grad_norm": 6.353213012505987, + "learning_rate": 1.4686515149013819e-06, + "loss": 0.5935, + "step": 17891 + }, + { + "epoch": 1.2926111221485723, + "grad_norm": 7.811197401172748, + "learning_rate": 1.4683850699052543e-06, + "loss": 0.7188, + "step": 17892 + }, + { + "epoch": 1.2926833673487819, + "grad_norm": 7.877384127651773, + "learning_rate": 1.4681186390311337e-06, + "loss": 0.5951, + "step": 17893 + }, + { + "epoch": 1.2927556125489912, + "grad_norm": 8.603472837932765, + "learning_rate": 1.467852222282668e-06, + "loss": 0.6316, + "step": 17894 + }, + { + "epoch": 1.2928278577492007, + "grad_norm": 7.603926256812527, + "learning_rate": 1.4675858196635036e-06, + "loss": 0.6355, + "step": 17895 + }, + { + "epoch": 1.2929001029494103, + "grad_norm": 7.865866556804351, + "learning_rate": 1.467319431177289e-06, + "loss": 0.6526, + "step": 17896 + }, + { + "epoch": 1.2929723481496198, + "grad_norm": 7.131224740708305, + "learning_rate": 1.4670530568276684e-06, + "loss": 0.6123, + "step": 17897 + }, + { + "epoch": 1.2930445933498294, + "grad_norm": 7.235436643804828, + "learning_rate": 1.4667866966182888e-06, + "loss": 0.6084, + "step": 17898 + }, + { + "epoch": 1.293116838550039, + "grad_norm": 6.882961587665963, + "learning_rate": 1.4665203505527989e-06, + "loss": 0.5896, + "step": 17899 + }, + { + "epoch": 1.2931890837502484, + "grad_norm": 7.662779970017179, + "learning_rate": 1.4662540186348418e-06, + "loss": 0.6158, + "step": 17900 + }, + { + "epoch": 1.2932613289504578, + "grad_norm": 6.883889848997077, + "learning_rate": 1.4659877008680644e-06, + "loss": 0.7984, + "step": 17901 + }, + { + "epoch": 1.2933335741506673, + "grad_norm": 6.254623533975312, + "learning_rate": 1.465721397256113e-06, + "loss": 0.6071, + "step": 17902 + }, + { + "epoch": 1.2934058193508768, + "grad_norm": 8.422679402678966, + "learning_rate": 1.4654551078026324e-06, + "loss": 0.697, + "step": 17903 + }, + { + "epoch": 1.2934780645510864, + "grad_norm": 6.93106330365602, + "learning_rate": 1.4651888325112681e-06, + "loss": 0.6408, + "step": 17904 + }, + { + "epoch": 1.293550309751296, + "grad_norm": 8.089814985915787, + "learning_rate": 1.464922571385665e-06, + "loss": 0.6068, + "step": 17905 + }, + { + "epoch": 1.2936225549515055, + "grad_norm": 6.326086248069779, + "learning_rate": 1.4646563244294689e-06, + "loss": 0.595, + "step": 17906 + }, + { + "epoch": 1.293694800151715, + "grad_norm": 5.812481951941906, + "learning_rate": 1.4643900916463233e-06, + "loss": 0.7286, + "step": 17907 + }, + { + "epoch": 1.2937670453519243, + "grad_norm": 6.246262310108317, + "learning_rate": 1.4641238730398732e-06, + "loss": 0.5959, + "step": 17908 + }, + { + "epoch": 1.2938392905521339, + "grad_norm": 6.676328651123497, + "learning_rate": 1.4638576686137629e-06, + "loss": 0.6302, + "step": 17909 + }, + { + "epoch": 1.2939115357523434, + "grad_norm": 6.157005922023149, + "learning_rate": 1.4635914783716377e-06, + "loss": 0.5513, + "step": 17910 + }, + { + "epoch": 1.293983780952553, + "grad_norm": 7.138763786173023, + "learning_rate": 1.4633253023171392e-06, + "loss": 0.6187, + "step": 17911 + }, + { + "epoch": 1.2940560261527625, + "grad_norm": 5.684983252078762, + "learning_rate": 1.463059140453912e-06, + "loss": 0.7063, + "step": 17912 + }, + { + "epoch": 1.294128271352972, + "grad_norm": 8.173832558615882, + "learning_rate": 1.4627929927856017e-06, + "loss": 0.6679, + "step": 17913 + }, + { + "epoch": 1.2942005165531816, + "grad_norm": 9.873353458704281, + "learning_rate": 1.4625268593158487e-06, + "loss": 0.6558, + "step": 17914 + }, + { + "epoch": 1.294272761753391, + "grad_norm": 6.513114610251345, + "learning_rate": 1.462260740048298e-06, + "loss": 0.6397, + "step": 17915 + }, + { + "epoch": 1.2943450069536004, + "grad_norm": 7.198527227877122, + "learning_rate": 1.4619946349865915e-06, + "loss": 0.6748, + "step": 17916 + }, + { + "epoch": 1.29441725215381, + "grad_norm": 7.100204959113912, + "learning_rate": 1.4617285441343732e-06, + "loss": 0.7126, + "step": 17917 + }, + { + "epoch": 1.2944894973540195, + "grad_norm": 7.585827371915598, + "learning_rate": 1.4614624674952843e-06, + "loss": 0.6369, + "step": 17918 + }, + { + "epoch": 1.294561742554229, + "grad_norm": 6.603844314875043, + "learning_rate": 1.461196405072968e-06, + "loss": 0.6831, + "step": 17919 + }, + { + "epoch": 1.2946339877544386, + "grad_norm": 7.269147384159091, + "learning_rate": 1.460930356871066e-06, + "loss": 0.6754, + "step": 17920 + }, + { + "epoch": 1.2947062329546482, + "grad_norm": 5.967919097106576, + "learning_rate": 1.4606643228932217e-06, + "loss": 0.5376, + "step": 17921 + }, + { + "epoch": 1.2947784781548575, + "grad_norm": 5.913819321907626, + "learning_rate": 1.460398303143075e-06, + "loss": 0.7143, + "step": 17922 + }, + { + "epoch": 1.2948507233550672, + "grad_norm": 6.281046242516828, + "learning_rate": 1.4601322976242683e-06, + "loss": 0.6262, + "step": 17923 + }, + { + "epoch": 1.2949229685552766, + "grad_norm": 6.421984595038828, + "learning_rate": 1.459866306340444e-06, + "loss": 0.6775, + "step": 17924 + }, + { + "epoch": 1.294995213755486, + "grad_norm": 7.046071888565848, + "learning_rate": 1.4596003292952416e-06, + "loss": 0.7134, + "step": 17925 + }, + { + "epoch": 1.2950674589556956, + "grad_norm": 5.805827587189269, + "learning_rate": 1.4593343664923026e-06, + "loss": 0.5868, + "step": 17926 + }, + { + "epoch": 1.2951397041559052, + "grad_norm": 6.328341105974333, + "learning_rate": 1.4590684179352682e-06, + "loss": 0.6168, + "step": 17927 + }, + { + "epoch": 1.2952119493561147, + "grad_norm": 8.117072830638556, + "learning_rate": 1.4588024836277792e-06, + "loss": 0.673, + "step": 17928 + }, + { + "epoch": 1.295284194556324, + "grad_norm": 7.158164755183658, + "learning_rate": 1.4585365635734755e-06, + "loss": 0.6071, + "step": 17929 + }, + { + "epoch": 1.2953564397565338, + "grad_norm": 8.035168593512639, + "learning_rate": 1.4582706577759974e-06, + "loss": 0.6893, + "step": 17930 + }, + { + "epoch": 1.2954286849567431, + "grad_norm": 7.166054218944178, + "learning_rate": 1.4580047662389867e-06, + "loss": 0.6725, + "step": 17931 + }, + { + "epoch": 1.2955009301569527, + "grad_norm": 6.109867320017096, + "learning_rate": 1.4577388889660803e-06, + "loss": 0.6601, + "step": 17932 + }, + { + "epoch": 1.2955731753571622, + "grad_norm": 8.427944169245658, + "learning_rate": 1.4574730259609194e-06, + "loss": 0.6251, + "step": 17933 + }, + { + "epoch": 1.2956454205573718, + "grad_norm": 7.1229866261676005, + "learning_rate": 1.4572071772271432e-06, + "loss": 0.6483, + "step": 17934 + }, + { + "epoch": 1.2957176657575813, + "grad_norm": 6.6578244062136855, + "learning_rate": 1.456941342768392e-06, + "loss": 0.6227, + "step": 17935 + }, + { + "epoch": 1.2957899109577906, + "grad_norm": 6.790483086863446, + "learning_rate": 1.456675522588303e-06, + "loss": 0.669, + "step": 17936 + }, + { + "epoch": 1.2958621561580004, + "grad_norm": 6.689733684810005, + "learning_rate": 1.4564097166905161e-06, + "loss": 0.718, + "step": 17937 + }, + { + "epoch": 1.2959344013582097, + "grad_norm": 10.330795058372125, + "learning_rate": 1.4561439250786696e-06, + "loss": 0.6655, + "step": 17938 + }, + { + "epoch": 1.2960066465584192, + "grad_norm": 6.5856449881289505, + "learning_rate": 1.4558781477564027e-06, + "loss": 0.6718, + "step": 17939 + }, + { + "epoch": 1.2960788917586288, + "grad_norm": 4.964271878637811, + "learning_rate": 1.4556123847273528e-06, + "loss": 0.5824, + "step": 17940 + }, + { + "epoch": 1.2961511369588383, + "grad_norm": 7.513463460928717, + "learning_rate": 1.455346635995159e-06, + "loss": 0.6173, + "step": 17941 + }, + { + "epoch": 1.2962233821590479, + "grad_norm": 5.408930119688445, + "learning_rate": 1.4550809015634594e-06, + "loss": 0.6172, + "step": 17942 + }, + { + "epoch": 1.2962956273592572, + "grad_norm": 7.716921199723007, + "learning_rate": 1.4548151814358897e-06, + "loss": 0.6923, + "step": 17943 + }, + { + "epoch": 1.296367872559467, + "grad_norm": 8.042510098779164, + "learning_rate": 1.4545494756160886e-06, + "loss": 0.6497, + "step": 17944 + }, + { + "epoch": 1.2964401177596763, + "grad_norm": 5.667770876542591, + "learning_rate": 1.4542837841076935e-06, + "loss": 0.5964, + "step": 17945 + }, + { + "epoch": 1.2965123629598858, + "grad_norm": 6.7789907603388695, + "learning_rate": 1.4540181069143428e-06, + "loss": 0.6555, + "step": 17946 + }, + { + "epoch": 1.2965846081600954, + "grad_norm": 6.686021017367335, + "learning_rate": 1.4537524440396698e-06, + "loss": 0.6206, + "step": 17947 + }, + { + "epoch": 1.296656853360305, + "grad_norm": 6.71116486809248, + "learning_rate": 1.4534867954873144e-06, + "loss": 0.6263, + "step": 17948 + }, + { + "epoch": 1.2967290985605144, + "grad_norm": 6.131787956816779, + "learning_rate": 1.4532211612609132e-06, + "loss": 0.5811, + "step": 17949 + }, + { + "epoch": 1.296801343760724, + "grad_norm": 7.265179232797218, + "learning_rate": 1.4529555413641006e-06, + "loss": 0.6845, + "step": 17950 + }, + { + "epoch": 1.2968735889609335, + "grad_norm": 5.965410666514588, + "learning_rate": 1.4526899358005136e-06, + "loss": 0.6836, + "step": 17951 + }, + { + "epoch": 1.2969458341611428, + "grad_norm": 7.298814274889856, + "learning_rate": 1.4524243445737884e-06, + "loss": 0.6008, + "step": 17952 + }, + { + "epoch": 1.2970180793613524, + "grad_norm": 7.28194492936782, + "learning_rate": 1.4521587676875615e-06, + "loss": 0.6125, + "step": 17953 + }, + { + "epoch": 1.297090324561562, + "grad_norm": 6.763204586532694, + "learning_rate": 1.4518932051454665e-06, + "loss": 0.6067, + "step": 17954 + }, + { + "epoch": 1.2971625697617715, + "grad_norm": 5.6524006090026315, + "learning_rate": 1.4516276569511396e-06, + "loss": 0.6101, + "step": 17955 + }, + { + "epoch": 1.297234814961981, + "grad_norm": 6.035073131001248, + "learning_rate": 1.451362123108216e-06, + "loss": 0.6182, + "step": 17956 + }, + { + "epoch": 1.2973070601621906, + "grad_norm": 6.218349424716425, + "learning_rate": 1.4510966036203305e-06, + "loss": 0.5994, + "step": 17957 + }, + { + "epoch": 1.2973793053624, + "grad_norm": 6.517142651956244, + "learning_rate": 1.4508310984911187e-06, + "loss": 0.644, + "step": 17958 + }, + { + "epoch": 1.2974515505626094, + "grad_norm": 6.294112212318729, + "learning_rate": 1.4505656077242142e-06, + "loss": 0.6642, + "step": 17959 + }, + { + "epoch": 1.297523795762819, + "grad_norm": 7.390283556545214, + "learning_rate": 1.4503001313232526e-06, + "loss": 0.7653, + "step": 17960 + }, + { + "epoch": 1.2975960409630285, + "grad_norm": 8.597823075959857, + "learning_rate": 1.4500346692918663e-06, + "loss": 0.6062, + "step": 17961 + }, + { + "epoch": 1.297668286163238, + "grad_norm": 8.114355068562809, + "learning_rate": 1.44976922163369e-06, + "loss": 0.6737, + "step": 17962 + }, + { + "epoch": 1.2977405313634476, + "grad_norm": 8.491529395354407, + "learning_rate": 1.4495037883523588e-06, + "loss": 0.6163, + "step": 17963 + }, + { + "epoch": 1.2978127765636571, + "grad_norm": 6.441025444512347, + "learning_rate": 1.449238369451504e-06, + "loss": 0.654, + "step": 17964 + }, + { + "epoch": 1.2978850217638667, + "grad_norm": 6.552520430095308, + "learning_rate": 1.44897296493476e-06, + "loss": 0.6023, + "step": 17965 + }, + { + "epoch": 1.297957266964076, + "grad_norm": 5.77130393118939, + "learning_rate": 1.44870757480576e-06, + "loss": 0.5926, + "step": 17966 + }, + { + "epoch": 1.2980295121642855, + "grad_norm": 7.67277543325237, + "learning_rate": 1.448442199068137e-06, + "loss": 0.619, + "step": 17967 + }, + { + "epoch": 1.298101757364495, + "grad_norm": 8.47757005460655, + "learning_rate": 1.448176837725524e-06, + "loss": 0.7059, + "step": 17968 + }, + { + "epoch": 1.2981740025647046, + "grad_norm": 6.326798363545261, + "learning_rate": 1.4479114907815534e-06, + "loss": 0.6193, + "step": 17969 + }, + { + "epoch": 1.2982462477649142, + "grad_norm": 6.2074872492191835, + "learning_rate": 1.4476461582398576e-06, + "loss": 0.6078, + "step": 17970 + }, + { + "epoch": 1.2983184929651237, + "grad_norm": 7.337942351178479, + "learning_rate": 1.4473808401040695e-06, + "loss": 0.6089, + "step": 17971 + }, + { + "epoch": 1.2983907381653332, + "grad_norm": 7.343630006499734, + "learning_rate": 1.44711553637782e-06, + "loss": 0.6305, + "step": 17972 + }, + { + "epoch": 1.2984629833655426, + "grad_norm": 8.492638485610088, + "learning_rate": 1.4468502470647405e-06, + "loss": 0.6609, + "step": 17973 + }, + { + "epoch": 1.298535228565752, + "grad_norm": 7.559282611536244, + "learning_rate": 1.4465849721684649e-06, + "loss": 0.6038, + "step": 17974 + }, + { + "epoch": 1.2986074737659616, + "grad_norm": 6.100655579906038, + "learning_rate": 1.446319711692622e-06, + "loss": 0.6337, + "step": 17975 + }, + { + "epoch": 1.2986797189661712, + "grad_norm": 7.083826863168223, + "learning_rate": 1.4460544656408432e-06, + "loss": 0.6244, + "step": 17976 + }, + { + "epoch": 1.2987519641663807, + "grad_norm": 7.452489541912451, + "learning_rate": 1.4457892340167612e-06, + "loss": 0.6478, + "step": 17977 + }, + { + "epoch": 1.2988242093665903, + "grad_norm": 6.506723594468606, + "learning_rate": 1.4455240168240075e-06, + "loss": 0.6806, + "step": 17978 + }, + { + "epoch": 1.2988964545667998, + "grad_norm": 6.891102746157937, + "learning_rate": 1.4452588140662099e-06, + "loss": 0.5598, + "step": 17979 + }, + { + "epoch": 1.2989686997670091, + "grad_norm": 6.414754442256021, + "learning_rate": 1.444993625747e-06, + "loss": 0.5845, + "step": 17980 + }, + { + "epoch": 1.2990409449672187, + "grad_norm": 7.101753996060173, + "learning_rate": 1.4447284518700097e-06, + "loss": 0.7067, + "step": 17981 + }, + { + "epoch": 1.2991131901674282, + "grad_norm": 7.857729426695988, + "learning_rate": 1.4444632924388663e-06, + "loss": 0.6177, + "step": 17982 + }, + { + "epoch": 1.2991854353676378, + "grad_norm": 7.449897499147403, + "learning_rate": 1.4441981474572009e-06, + "loss": 0.6048, + "step": 17983 + }, + { + "epoch": 1.2992576805678473, + "grad_norm": 6.404699817207295, + "learning_rate": 1.4439330169286428e-06, + "loss": 0.6665, + "step": 17984 + }, + { + "epoch": 1.2993299257680568, + "grad_norm": 6.469913396476815, + "learning_rate": 1.4436679008568222e-06, + "loss": 0.5962, + "step": 17985 + }, + { + "epoch": 1.2994021709682664, + "grad_norm": 6.943213781877208, + "learning_rate": 1.4434027992453676e-06, + "loss": 0.6158, + "step": 17986 + }, + { + "epoch": 1.2994744161684757, + "grad_norm": 7.608433972710142, + "learning_rate": 1.443137712097908e-06, + "loss": 0.61, + "step": 17987 + }, + { + "epoch": 1.2995466613686852, + "grad_norm": 6.804039977981659, + "learning_rate": 1.4428726394180736e-06, + "loss": 0.6366, + "step": 17988 + }, + { + "epoch": 1.2996189065688948, + "grad_norm": 6.574151440534812, + "learning_rate": 1.4426075812094914e-06, + "loss": 0.6964, + "step": 17989 + }, + { + "epoch": 1.2996911517691043, + "grad_norm": 6.373131571705815, + "learning_rate": 1.4423425374757902e-06, + "loss": 0.5533, + "step": 17990 + }, + { + "epoch": 1.2997633969693139, + "grad_norm": 6.350726721322215, + "learning_rate": 1.4420775082205984e-06, + "loss": 0.6084, + "step": 17991 + }, + { + "epoch": 1.2998356421695234, + "grad_norm": 6.503761083684008, + "learning_rate": 1.441812493447545e-06, + "loss": 0.7379, + "step": 17992 + }, + { + "epoch": 1.299907887369733, + "grad_norm": 8.496868678739833, + "learning_rate": 1.441547493160256e-06, + "loss": 0.6252, + "step": 17993 + }, + { + "epoch": 1.2999801325699423, + "grad_norm": 6.9284832068151205, + "learning_rate": 1.4412825073623599e-06, + "loss": 0.6101, + "step": 17994 + }, + { + "epoch": 1.3000523777701518, + "grad_norm": 5.849926952981944, + "learning_rate": 1.4410175360574847e-06, + "loss": 0.5636, + "step": 17995 + }, + { + "epoch": 1.3001246229703614, + "grad_norm": 6.4951564642522035, + "learning_rate": 1.4407525792492567e-06, + "loss": 0.6131, + "step": 17996 + }, + { + "epoch": 1.300196868170571, + "grad_norm": 6.787894513939727, + "learning_rate": 1.4404876369413035e-06, + "loss": 0.6182, + "step": 17997 + }, + { + "epoch": 1.3002691133707804, + "grad_norm": 7.734033935425073, + "learning_rate": 1.4402227091372516e-06, + "loss": 0.6845, + "step": 17998 + }, + { + "epoch": 1.30034135857099, + "grad_norm": 5.751902058347166, + "learning_rate": 1.4399577958407296e-06, + "loss": 0.721, + "step": 17999 + }, + { + "epoch": 1.3004136037711995, + "grad_norm": 7.596825749917486, + "learning_rate": 1.439692897055361e-06, + "loss": 0.6205, + "step": 18000 + }, + { + "epoch": 1.3004858489714088, + "grad_norm": 5.638420540705319, + "learning_rate": 1.4394280127847732e-06, + "loss": 0.6478, + "step": 18001 + }, + { + "epoch": 1.3005580941716186, + "grad_norm": 7.660813073414648, + "learning_rate": 1.4391631430325925e-06, + "loss": 0.7312, + "step": 18002 + }, + { + "epoch": 1.300630339371828, + "grad_norm": 6.865675272560026, + "learning_rate": 1.438898287802446e-06, + "loss": 0.6648, + "step": 18003 + }, + { + "epoch": 1.3007025845720375, + "grad_norm": 6.760551964611391, + "learning_rate": 1.4386334470979557e-06, + "loss": 0.6066, + "step": 18004 + }, + { + "epoch": 1.300774829772247, + "grad_norm": 7.645892677457948, + "learning_rate": 1.4383686209227504e-06, + "loss": 0.6311, + "step": 18005 + }, + { + "epoch": 1.3008470749724566, + "grad_norm": 6.219925505753195, + "learning_rate": 1.4381038092804556e-06, + "loss": 0.6545, + "step": 18006 + }, + { + "epoch": 1.300919320172666, + "grad_norm": 6.8856146972003005, + "learning_rate": 1.4378390121746943e-06, + "loss": 0.7095, + "step": 18007 + }, + { + "epoch": 1.3009915653728754, + "grad_norm": 7.382581789963322, + "learning_rate": 1.4375742296090917e-06, + "loss": 0.6812, + "step": 18008 + }, + { + "epoch": 1.3010638105730852, + "grad_norm": 7.617699802483799, + "learning_rate": 1.4373094615872735e-06, + "loss": 0.6225, + "step": 18009 + }, + { + "epoch": 1.3011360557732945, + "grad_norm": 6.479043777461584, + "learning_rate": 1.4370447081128641e-06, + "loss": 0.7337, + "step": 18010 + }, + { + "epoch": 1.301208300973504, + "grad_norm": 6.07352004371728, + "learning_rate": 1.4367799691894868e-06, + "loss": 0.6095, + "step": 18011 + }, + { + "epoch": 1.3012805461737136, + "grad_norm": 6.274597896747579, + "learning_rate": 1.4365152448207658e-06, + "loss": 0.6436, + "step": 18012 + }, + { + "epoch": 1.3013527913739231, + "grad_norm": 6.0854714524901246, + "learning_rate": 1.4362505350103257e-06, + "loss": 0.6407, + "step": 18013 + }, + { + "epoch": 1.3014250365741327, + "grad_norm": 7.877828181000474, + "learning_rate": 1.4359858397617895e-06, + "loss": 0.6221, + "step": 18014 + }, + { + "epoch": 1.301497281774342, + "grad_norm": 7.329889471409916, + "learning_rate": 1.4357211590787811e-06, + "loss": 0.5941, + "step": 18015 + }, + { + "epoch": 1.3015695269745517, + "grad_norm": 6.876078573065374, + "learning_rate": 1.435456492964924e-06, + "loss": 0.6341, + "step": 18016 + }, + { + "epoch": 1.301641772174761, + "grad_norm": 7.1986594435647415, + "learning_rate": 1.4351918414238413e-06, + "loss": 0.639, + "step": 18017 + }, + { + "epoch": 1.3017140173749706, + "grad_norm": 6.687209934111599, + "learning_rate": 1.434927204459155e-06, + "loss": 0.6243, + "step": 18018 + }, + { + "epoch": 1.3017862625751802, + "grad_norm": 8.240901669547359, + "learning_rate": 1.4346625820744883e-06, + "loss": 0.6691, + "step": 18019 + }, + { + "epoch": 1.3018585077753897, + "grad_norm": 6.672348398749639, + "learning_rate": 1.4343979742734646e-06, + "loss": 0.5938, + "step": 18020 + }, + { + "epoch": 1.3019307529755992, + "grad_norm": 6.567302871841533, + "learning_rate": 1.4341333810597042e-06, + "loss": 0.6879, + "step": 18021 + }, + { + "epoch": 1.3020029981758086, + "grad_norm": 6.185755319940632, + "learning_rate": 1.4338688024368302e-06, + "loss": 0.6215, + "step": 18022 + }, + { + "epoch": 1.3020752433760183, + "grad_norm": 7.132077098761327, + "learning_rate": 1.4336042384084647e-06, + "loss": 0.7186, + "step": 18023 + }, + { + "epoch": 1.3021474885762276, + "grad_norm": 7.685117771137278, + "learning_rate": 1.433339688978229e-06, + "loss": 0.5707, + "step": 18024 + }, + { + "epoch": 1.3022197337764372, + "grad_norm": 7.390753263042003, + "learning_rate": 1.4330751541497446e-06, + "loss": 0.5798, + "step": 18025 + }, + { + "epoch": 1.3022919789766467, + "grad_norm": 6.127895994055778, + "learning_rate": 1.432810633926633e-06, + "loss": 0.6186, + "step": 18026 + }, + { + "epoch": 1.3023642241768563, + "grad_norm": 5.851559240444686, + "learning_rate": 1.4325461283125152e-06, + "loss": 0.6316, + "step": 18027 + }, + { + "epoch": 1.3024364693770658, + "grad_norm": 7.454735539924595, + "learning_rate": 1.4322816373110132e-06, + "loss": 0.5502, + "step": 18028 + }, + { + "epoch": 1.3025087145772754, + "grad_norm": 6.600272618067567, + "learning_rate": 1.4320171609257454e-06, + "loss": 0.7079, + "step": 18029 + }, + { + "epoch": 1.302580959777485, + "grad_norm": 6.7031537391028735, + "learning_rate": 1.4317526991603336e-06, + "loss": 0.6702, + "step": 18030 + }, + { + "epoch": 1.3026532049776942, + "grad_norm": 8.055326834168396, + "learning_rate": 1.4314882520183992e-06, + "loss": 0.5682, + "step": 18031 + }, + { + "epoch": 1.3027254501779038, + "grad_norm": 6.694089406132467, + "learning_rate": 1.4312238195035586e-06, + "loss": 0.6314, + "step": 18032 + }, + { + "epoch": 1.3027976953781133, + "grad_norm": 6.713328458499496, + "learning_rate": 1.430959401619435e-06, + "loss": 0.6564, + "step": 18033 + }, + { + "epoch": 1.3028699405783228, + "grad_norm": 8.209178848740322, + "learning_rate": 1.4306949983696473e-06, + "loss": 0.6964, + "step": 18034 + }, + { + "epoch": 1.3029421857785324, + "grad_norm": 6.8435214627224035, + "learning_rate": 1.4304306097578159e-06, + "loss": 0.6984, + "step": 18035 + }, + { + "epoch": 1.303014430978742, + "grad_norm": 6.16700355365639, + "learning_rate": 1.4301662357875576e-06, + "loss": 0.6813, + "step": 18036 + }, + { + "epoch": 1.3030866761789515, + "grad_norm": 6.842824934358279, + "learning_rate": 1.429901876462493e-06, + "loss": 0.6815, + "step": 18037 + }, + { + "epoch": 1.3031589213791608, + "grad_norm": 7.647228422053476, + "learning_rate": 1.4296375317862422e-06, + "loss": 0.5738, + "step": 18038 + }, + { + "epoch": 1.3032311665793703, + "grad_norm": 6.553775764864331, + "learning_rate": 1.4293732017624212e-06, + "loss": 0.6357, + "step": 18039 + }, + { + "epoch": 1.3033034117795799, + "grad_norm": 6.9100905566761455, + "learning_rate": 1.4291088863946494e-06, + "loss": 0.6511, + "step": 18040 + }, + { + "epoch": 1.3033756569797894, + "grad_norm": 7.045718349764557, + "learning_rate": 1.4288445856865463e-06, + "loss": 0.7196, + "step": 18041 + }, + { + "epoch": 1.303447902179999, + "grad_norm": 6.383746859682475, + "learning_rate": 1.4285802996417286e-06, + "loss": 0.5939, + "step": 18042 + }, + { + "epoch": 1.3035201473802085, + "grad_norm": 7.523588596938071, + "learning_rate": 1.4283160282638148e-06, + "loss": 0.5485, + "step": 18043 + }, + { + "epoch": 1.303592392580418, + "grad_norm": 6.2844655171253425, + "learning_rate": 1.4280517715564223e-06, + "loss": 0.6228, + "step": 18044 + }, + { + "epoch": 1.3036646377806274, + "grad_norm": 6.738315500365852, + "learning_rate": 1.42778752952317e-06, + "loss": 0.6309, + "step": 18045 + }, + { + "epoch": 1.303736882980837, + "grad_norm": 6.767550504994143, + "learning_rate": 1.4275233021676726e-06, + "loss": 0.6503, + "step": 18046 + }, + { + "epoch": 1.3038091281810464, + "grad_norm": 6.091363840712637, + "learning_rate": 1.427259089493549e-06, + "loss": 0.649, + "step": 18047 + }, + { + "epoch": 1.303881373381256, + "grad_norm": 6.370158432647594, + "learning_rate": 1.4269948915044152e-06, + "loss": 0.6831, + "step": 18048 + }, + { + "epoch": 1.3039536185814655, + "grad_norm": 7.200814826210777, + "learning_rate": 1.426730708203889e-06, + "loss": 0.6472, + "step": 18049 + }, + { + "epoch": 1.304025863781675, + "grad_norm": 9.5936283265391, + "learning_rate": 1.4264665395955852e-06, + "loss": 0.6805, + "step": 18050 + }, + { + "epoch": 1.3040981089818846, + "grad_norm": 5.9570949663600645, + "learning_rate": 1.426202385683121e-06, + "loss": 0.5911, + "step": 18051 + }, + { + "epoch": 1.304170354182094, + "grad_norm": 7.88685587385437, + "learning_rate": 1.4259382464701124e-06, + "loss": 0.6458, + "step": 18052 + }, + { + "epoch": 1.3042425993823035, + "grad_norm": 6.183358087344459, + "learning_rate": 1.4256741219601752e-06, + "loss": 0.6039, + "step": 18053 + }, + { + "epoch": 1.304314844582513, + "grad_norm": 6.275393968350339, + "learning_rate": 1.4254100121569254e-06, + "loss": 0.5855, + "step": 18054 + }, + { + "epoch": 1.3043870897827226, + "grad_norm": 6.185218467462113, + "learning_rate": 1.4251459170639776e-06, + "loss": 0.6243, + "step": 18055 + }, + { + "epoch": 1.304459334982932, + "grad_norm": 6.50547120108111, + "learning_rate": 1.424881836684949e-06, + "loss": 0.6166, + "step": 18056 + }, + { + "epoch": 1.3045315801831416, + "grad_norm": 7.559277817478742, + "learning_rate": 1.424617771023452e-06, + "loss": 0.6337, + "step": 18057 + }, + { + "epoch": 1.3046038253833512, + "grad_norm": 7.906325992023012, + "learning_rate": 1.4243537200831032e-06, + "loss": 0.6507, + "step": 18058 + }, + { + "epoch": 1.3046760705835605, + "grad_norm": 8.755237428766968, + "learning_rate": 1.4240896838675163e-06, + "loss": 0.6559, + "step": 18059 + }, + { + "epoch": 1.30474831578377, + "grad_norm": 8.200797195760671, + "learning_rate": 1.4238256623803065e-06, + "loss": 0.6483, + "step": 18060 + }, + { + "epoch": 1.3048205609839796, + "grad_norm": 7.682684654055009, + "learning_rate": 1.4235616556250878e-06, + "loss": 0.6683, + "step": 18061 + }, + { + "epoch": 1.3048928061841891, + "grad_norm": 6.163076885218554, + "learning_rate": 1.423297663605474e-06, + "loss": 0.68, + "step": 18062 + }, + { + "epoch": 1.3049650513843987, + "grad_norm": 7.2738421170855165, + "learning_rate": 1.4230336863250804e-06, + "loss": 0.6325, + "step": 18063 + }, + { + "epoch": 1.3050372965846082, + "grad_norm": 5.948936290385076, + "learning_rate": 1.4227697237875189e-06, + "loss": 0.6428, + "step": 18064 + }, + { + "epoch": 1.3051095417848178, + "grad_norm": 7.7357240675560055, + "learning_rate": 1.4225057759964026e-06, + "loss": 0.6327, + "step": 18065 + }, + { + "epoch": 1.305181786985027, + "grad_norm": 7.580092216071675, + "learning_rate": 1.4222418429553459e-06, + "loss": 0.6696, + "step": 18066 + }, + { + "epoch": 1.3052540321852366, + "grad_norm": 6.071832140164126, + "learning_rate": 1.4219779246679626e-06, + "loss": 0.6284, + "step": 18067 + }, + { + "epoch": 1.3053262773854462, + "grad_norm": 5.861767415227647, + "learning_rate": 1.4217140211378635e-06, + "loss": 0.6406, + "step": 18068 + }, + { + "epoch": 1.3053985225856557, + "grad_norm": 8.08716495061629, + "learning_rate": 1.4214501323686624e-06, + "loss": 0.6502, + "step": 18069 + }, + { + "epoch": 1.3054707677858652, + "grad_norm": 6.294629778167216, + "learning_rate": 1.4211862583639713e-06, + "loss": 0.6133, + "step": 18070 + }, + { + "epoch": 1.3055430129860748, + "grad_norm": 6.459398079338607, + "learning_rate": 1.4209223991274027e-06, + "loss": 0.6267, + "step": 18071 + }, + { + "epoch": 1.3056152581862843, + "grad_norm": 7.1381925283660905, + "learning_rate": 1.4206585546625685e-06, + "loss": 0.5607, + "step": 18072 + }, + { + "epoch": 1.3056875033864936, + "grad_norm": 6.506647378913767, + "learning_rate": 1.4203947249730809e-06, + "loss": 0.6051, + "step": 18073 + }, + { + "epoch": 1.3057597485867034, + "grad_norm": 7.046892051929896, + "learning_rate": 1.4201309100625522e-06, + "loss": 0.6332, + "step": 18074 + }, + { + "epoch": 1.3058319937869127, + "grad_norm": 7.37130929442737, + "learning_rate": 1.4198671099345918e-06, + "loss": 0.5792, + "step": 18075 + }, + { + "epoch": 1.3059042389871223, + "grad_norm": 6.9846043901088715, + "learning_rate": 1.419603324592812e-06, + "loss": 0.7051, + "step": 18076 + }, + { + "epoch": 1.3059764841873318, + "grad_norm": 6.777866626984009, + "learning_rate": 1.4193395540408236e-06, + "loss": 0.6524, + "step": 18077 + }, + { + "epoch": 1.3060487293875414, + "grad_norm": 8.278481074530955, + "learning_rate": 1.419075798282239e-06, + "loss": 0.6407, + "step": 18078 + }, + { + "epoch": 1.306120974587751, + "grad_norm": 6.493116401845123, + "learning_rate": 1.4188120573206662e-06, + "loss": 0.6834, + "step": 18079 + }, + { + "epoch": 1.3061932197879602, + "grad_norm": 8.061273525899924, + "learning_rate": 1.418548331159717e-06, + "loss": 0.622, + "step": 18080 + }, + { + "epoch": 1.30626546498817, + "grad_norm": 6.693135391641918, + "learning_rate": 1.4182846198030013e-06, + "loss": 0.6211, + "step": 18081 + }, + { + "epoch": 1.3063377101883793, + "grad_norm": 8.591171932190452, + "learning_rate": 1.4180209232541292e-06, + "loss": 0.6138, + "step": 18082 + }, + { + "epoch": 1.3064099553885888, + "grad_norm": 5.946461870974957, + "learning_rate": 1.4177572415167106e-06, + "loss": 0.7131, + "step": 18083 + }, + { + "epoch": 1.3064822005887984, + "grad_norm": 7.1347065444617925, + "learning_rate": 1.4174935745943551e-06, + "loss": 0.6611, + "step": 18084 + }, + { + "epoch": 1.306554445789008, + "grad_norm": 5.976899819422776, + "learning_rate": 1.4172299224906733e-06, + "loss": 0.5879, + "step": 18085 + }, + { + "epoch": 1.3066266909892175, + "grad_norm": 7.183198653490505, + "learning_rate": 1.416966285209272e-06, + "loss": 0.6157, + "step": 18086 + }, + { + "epoch": 1.3066989361894268, + "grad_norm": 6.70059860743785, + "learning_rate": 1.4167026627537611e-06, + "loss": 0.631, + "step": 18087 + }, + { + "epoch": 1.3067711813896365, + "grad_norm": 7.1470261478161925, + "learning_rate": 1.416439055127751e-06, + "loss": 0.6412, + "step": 18088 + }, + { + "epoch": 1.3068434265898459, + "grad_norm": 5.688621326618274, + "learning_rate": 1.4161754623348468e-06, + "loss": 0.5976, + "step": 18089 + }, + { + "epoch": 1.3069156717900554, + "grad_norm": 6.789195385368791, + "learning_rate": 1.4159118843786595e-06, + "loss": 0.6189, + "step": 18090 + }, + { + "epoch": 1.306987916990265, + "grad_norm": 6.751407335295366, + "learning_rate": 1.4156483212627976e-06, + "loss": 0.602, + "step": 18091 + }, + { + "epoch": 1.3070601621904745, + "grad_norm": 7.965686882559867, + "learning_rate": 1.4153847729908687e-06, + "loss": 0.6776, + "step": 18092 + }, + { + "epoch": 1.307132407390684, + "grad_norm": 8.728351515505242, + "learning_rate": 1.415121239566479e-06, + "loss": 0.6484, + "step": 18093 + }, + { + "epoch": 1.3072046525908934, + "grad_norm": 6.768568706814402, + "learning_rate": 1.4148577209932373e-06, + "loss": 0.6587, + "step": 18094 + }, + { + "epoch": 1.3072768977911031, + "grad_norm": 6.7451159857073275, + "learning_rate": 1.4145942172747521e-06, + "loss": 0.6053, + "step": 18095 + }, + { + "epoch": 1.3073491429913124, + "grad_norm": 8.711241411458584, + "learning_rate": 1.414330728414628e-06, + "loss": 0.6837, + "step": 18096 + }, + { + "epoch": 1.307421388191522, + "grad_norm": 6.978514531730952, + "learning_rate": 1.4140672544164736e-06, + "loss": 0.6101, + "step": 18097 + }, + { + "epoch": 1.3074936333917315, + "grad_norm": 8.406651597197314, + "learning_rate": 1.4138037952838948e-06, + "loss": 0.6717, + "step": 18098 + }, + { + "epoch": 1.307565878591941, + "grad_norm": 8.193426651615926, + "learning_rate": 1.4135403510204993e-06, + "loss": 0.7038, + "step": 18099 + }, + { + "epoch": 1.3076381237921506, + "grad_norm": 6.167265201424151, + "learning_rate": 1.4132769216298921e-06, + "loss": 0.5066, + "step": 18100 + }, + { + "epoch": 1.3077103689923602, + "grad_norm": 7.143096963398693, + "learning_rate": 1.4130135071156806e-06, + "loss": 0.6388, + "step": 18101 + }, + { + "epoch": 1.3077826141925697, + "grad_norm": 6.454505163206198, + "learning_rate": 1.4127501074814698e-06, + "loss": 0.6861, + "step": 18102 + }, + { + "epoch": 1.307854859392779, + "grad_norm": 7.497389784228919, + "learning_rate": 1.4124867227308672e-06, + "loss": 0.5667, + "step": 18103 + }, + { + "epoch": 1.3079271045929886, + "grad_norm": 6.528034203364956, + "learning_rate": 1.4122233528674759e-06, + "loss": 0.6429, + "step": 18104 + }, + { + "epoch": 1.307999349793198, + "grad_norm": 7.123572758726969, + "learning_rate": 1.4119599978949022e-06, + "loss": 0.6822, + "step": 18105 + }, + { + "epoch": 1.3080715949934076, + "grad_norm": 6.23573200496103, + "learning_rate": 1.4116966578167524e-06, + "loss": 0.5828, + "step": 18106 + }, + { + "epoch": 1.3081438401936172, + "grad_norm": 6.357825688020084, + "learning_rate": 1.4114333326366293e-06, + "loss": 0.7251, + "step": 18107 + }, + { + "epoch": 1.3082160853938267, + "grad_norm": 6.302643890251473, + "learning_rate": 1.4111700223581387e-06, + "loss": 0.6078, + "step": 18108 + }, + { + "epoch": 1.3082883305940363, + "grad_norm": 6.32765780007605, + "learning_rate": 1.4109067269848837e-06, + "loss": 0.6688, + "step": 18109 + }, + { + "epoch": 1.3083605757942456, + "grad_norm": 6.217285597248521, + "learning_rate": 1.4106434465204725e-06, + "loss": 0.6295, + "step": 18110 + }, + { + "epoch": 1.3084328209944551, + "grad_norm": 8.161444518940394, + "learning_rate": 1.4103801809685053e-06, + "loss": 0.7072, + "step": 18111 + }, + { + "epoch": 1.3085050661946647, + "grad_norm": 6.218185322490315, + "learning_rate": 1.4101169303325876e-06, + "loss": 0.6804, + "step": 18112 + }, + { + "epoch": 1.3085773113948742, + "grad_norm": 7.622410318829833, + "learning_rate": 1.4098536946163238e-06, + "loss": 0.6364, + "step": 18113 + }, + { + "epoch": 1.3086495565950838, + "grad_norm": 5.791225096338, + "learning_rate": 1.4095904738233157e-06, + "loss": 0.576, + "step": 18114 + }, + { + "epoch": 1.3087218017952933, + "grad_norm": 6.681751097541137, + "learning_rate": 1.409327267957167e-06, + "loss": 0.7234, + "step": 18115 + }, + { + "epoch": 1.3087940469955028, + "grad_norm": 7.292698843425419, + "learning_rate": 1.4090640770214814e-06, + "loss": 0.755, + "step": 18116 + }, + { + "epoch": 1.3088662921957122, + "grad_norm": 5.973087033291528, + "learning_rate": 1.4088009010198616e-06, + "loss": 0.6873, + "step": 18117 + }, + { + "epoch": 1.3089385373959217, + "grad_norm": 7.662246037766062, + "learning_rate": 1.40853773995591e-06, + "loss": 0.6541, + "step": 18118 + }, + { + "epoch": 1.3090107825961312, + "grad_norm": 7.634375639582454, + "learning_rate": 1.4082745938332293e-06, + "loss": 0.708, + "step": 18119 + }, + { + "epoch": 1.3090830277963408, + "grad_norm": 6.988890960860169, + "learning_rate": 1.4080114626554228e-06, + "loss": 0.6576, + "step": 18120 + }, + { + "epoch": 1.3091552729965503, + "grad_norm": 7.2843325397263685, + "learning_rate": 1.4077483464260905e-06, + "loss": 0.6571, + "step": 18121 + }, + { + "epoch": 1.3092275181967599, + "grad_norm": 6.947287314152958, + "learning_rate": 1.4074852451488351e-06, + "loss": 0.6265, + "step": 18122 + }, + { + "epoch": 1.3092997633969694, + "grad_norm": 6.1093618875738445, + "learning_rate": 1.407222158827259e-06, + "loss": 0.6858, + "step": 18123 + }, + { + "epoch": 1.3093720085971787, + "grad_norm": 6.783378583427547, + "learning_rate": 1.4069590874649632e-06, + "loss": 0.67, + "step": 18124 + }, + { + "epoch": 1.3094442537973883, + "grad_norm": 6.488810665204907, + "learning_rate": 1.4066960310655484e-06, + "loss": 0.6541, + "step": 18125 + }, + { + "epoch": 1.3095164989975978, + "grad_norm": 7.876023725544623, + "learning_rate": 1.4064329896326156e-06, + "loss": 0.6393, + "step": 18126 + }, + { + "epoch": 1.3095887441978074, + "grad_norm": 7.348854141704333, + "learning_rate": 1.4061699631697668e-06, + "loss": 0.6555, + "step": 18127 + }, + { + "epoch": 1.309660989398017, + "grad_norm": 6.783027099645223, + "learning_rate": 1.4059069516806012e-06, + "loss": 0.6548, + "step": 18128 + }, + { + "epoch": 1.3097332345982264, + "grad_norm": 5.3805394570845, + "learning_rate": 1.40564395516872e-06, + "loss": 0.6175, + "step": 18129 + }, + { + "epoch": 1.309805479798436, + "grad_norm": 6.219663926640375, + "learning_rate": 1.4053809736377235e-06, + "loss": 0.5646, + "step": 18130 + }, + { + "epoch": 1.3098777249986453, + "grad_norm": 6.64824499620321, + "learning_rate": 1.4051180070912126e-06, + "loss": 0.6782, + "step": 18131 + }, + { + "epoch": 1.3099499701988548, + "grad_norm": 6.1547158745418855, + "learning_rate": 1.404855055532785e-06, + "loss": 0.637, + "step": 18132 + }, + { + "epoch": 1.3100222153990644, + "grad_norm": 7.9117684988900745, + "learning_rate": 1.4045921189660416e-06, + "loss": 0.6402, + "step": 18133 + }, + { + "epoch": 1.310094460599274, + "grad_norm": 7.169996360448023, + "learning_rate": 1.4043291973945812e-06, + "loss": 0.7363, + "step": 18134 + }, + { + "epoch": 1.3101667057994835, + "grad_norm": 6.3381539453195925, + "learning_rate": 1.4040662908220043e-06, + "loss": 0.6684, + "step": 18135 + }, + { + "epoch": 1.310238950999693, + "grad_norm": 5.47640333610992, + "learning_rate": 1.4038033992519084e-06, + "loss": 0.6198, + "step": 18136 + }, + { + "epoch": 1.3103111961999026, + "grad_norm": 6.806702553071942, + "learning_rate": 1.4035405226878916e-06, + "loss": 0.564, + "step": 18137 + }, + { + "epoch": 1.3103834414001119, + "grad_norm": 8.01810980461525, + "learning_rate": 1.4032776611335554e-06, + "loss": 0.6196, + "step": 18138 + }, + { + "epoch": 1.3104556866003214, + "grad_norm": 6.570960122939102, + "learning_rate": 1.403014814592496e-06, + "loss": 0.6315, + "step": 18139 + }, + { + "epoch": 1.310527931800531, + "grad_norm": 7.90733794314926, + "learning_rate": 1.4027519830683116e-06, + "loss": 0.6626, + "step": 18140 + }, + { + "epoch": 1.3106001770007405, + "grad_norm": 7.817041648649363, + "learning_rate": 1.4024891665646006e-06, + "loss": 0.6428, + "step": 18141 + }, + { + "epoch": 1.31067242220095, + "grad_norm": 6.688185612124305, + "learning_rate": 1.4022263650849622e-06, + "loss": 0.6372, + "step": 18142 + }, + { + "epoch": 1.3107446674011596, + "grad_norm": 5.984817369012164, + "learning_rate": 1.401963578632991e-06, + "loss": 0.6692, + "step": 18143 + }, + { + "epoch": 1.3108169126013691, + "grad_norm": 7.0048745757697635, + "learning_rate": 1.4017008072122863e-06, + "loss": 0.6674, + "step": 18144 + }, + { + "epoch": 1.3108891578015784, + "grad_norm": 6.076448106265269, + "learning_rate": 1.4014380508264441e-06, + "loss": 0.5924, + "step": 18145 + }, + { + "epoch": 1.3109614030017882, + "grad_norm": 7.156051433090536, + "learning_rate": 1.4011753094790625e-06, + "loss": 0.6162, + "step": 18146 + }, + { + "epoch": 1.3110336482019975, + "grad_norm": 5.6305904158797135, + "learning_rate": 1.4009125831737377e-06, + "loss": 0.6514, + "step": 18147 + }, + { + "epoch": 1.311105893402207, + "grad_norm": 6.567671708910654, + "learning_rate": 1.4006498719140662e-06, + "loss": 0.6, + "step": 18148 + }, + { + "epoch": 1.3111781386024166, + "grad_norm": 7.997974139244966, + "learning_rate": 1.4003871757036452e-06, + "loss": 0.6689, + "step": 18149 + }, + { + "epoch": 1.3112503838026262, + "grad_norm": 8.041752579251519, + "learning_rate": 1.4001244945460693e-06, + "loss": 0.6717, + "step": 18150 + }, + { + "epoch": 1.3113226290028357, + "grad_norm": 6.866839750509668, + "learning_rate": 1.399861828444935e-06, + "loss": 0.7301, + "step": 18151 + }, + { + "epoch": 1.311394874203045, + "grad_norm": 6.904793287935642, + "learning_rate": 1.3995991774038386e-06, + "loss": 0.5774, + "step": 18152 + }, + { + "epoch": 1.3114671194032548, + "grad_norm": 6.271030291973685, + "learning_rate": 1.3993365414263748e-06, + "loss": 0.6009, + "step": 18153 + }, + { + "epoch": 1.311539364603464, + "grad_norm": 7.596971119092541, + "learning_rate": 1.3990739205161386e-06, + "loss": 0.6472, + "step": 18154 + }, + { + "epoch": 1.3116116098036736, + "grad_norm": 6.522521104233836, + "learning_rate": 1.3988113146767258e-06, + "loss": 0.5768, + "step": 18155 + }, + { + "epoch": 1.3116838550038832, + "grad_norm": 5.7400712246081245, + "learning_rate": 1.3985487239117313e-06, + "loss": 0.6462, + "step": 18156 + }, + { + "epoch": 1.3117561002040927, + "grad_norm": 6.27090650074329, + "learning_rate": 1.398286148224749e-06, + "loss": 0.5936, + "step": 18157 + }, + { + "epoch": 1.3118283454043023, + "grad_norm": 9.247899822286511, + "learning_rate": 1.3980235876193743e-06, + "loss": 0.6102, + "step": 18158 + }, + { + "epoch": 1.3119005906045116, + "grad_norm": 6.702878863032677, + "learning_rate": 1.397761042099201e-06, + "loss": 0.6507, + "step": 18159 + }, + { + "epoch": 1.3119728358047213, + "grad_norm": 6.716033244338692, + "learning_rate": 1.397498511667824e-06, + "loss": 0.5891, + "step": 18160 + }, + { + "epoch": 1.3120450810049307, + "grad_norm": 5.602603831522729, + "learning_rate": 1.3972359963288356e-06, + "loss": 0.571, + "step": 18161 + }, + { + "epoch": 1.3121173262051402, + "grad_norm": 6.610538749958074, + "learning_rate": 1.39697349608583e-06, + "loss": 0.6339, + "step": 18162 + }, + { + "epoch": 1.3121895714053498, + "grad_norm": 5.719239927891048, + "learning_rate": 1.3967110109424021e-06, + "loss": 0.6002, + "step": 18163 + }, + { + "epoch": 1.3122618166055593, + "grad_norm": 6.465295431256621, + "learning_rate": 1.3964485409021426e-06, + "loss": 0.5548, + "step": 18164 + }, + { + "epoch": 1.3123340618057688, + "grad_norm": 7.553583863366329, + "learning_rate": 1.3961860859686454e-06, + "loss": 0.5769, + "step": 18165 + }, + { + "epoch": 1.3124063070059782, + "grad_norm": 6.472261369593175, + "learning_rate": 1.3959236461455032e-06, + "loss": 0.6276, + "step": 18166 + }, + { + "epoch": 1.312478552206188, + "grad_norm": 7.942100813514725, + "learning_rate": 1.3956612214363108e-06, + "loss": 0.6846, + "step": 18167 + }, + { + "epoch": 1.3125507974063972, + "grad_norm": 6.304552268240665, + "learning_rate": 1.3953988118446577e-06, + "loss": 0.6526, + "step": 18168 + }, + { + "epoch": 1.3126230426066068, + "grad_norm": 6.476994517312402, + "learning_rate": 1.3951364173741373e-06, + "loss": 0.6495, + "step": 18169 + }, + { + "epoch": 1.3126952878068163, + "grad_norm": 6.284360201005853, + "learning_rate": 1.3948740380283422e-06, + "loss": 0.5967, + "step": 18170 + }, + { + "epoch": 1.3127675330070259, + "grad_norm": 10.817387000159995, + "learning_rate": 1.3946116738108628e-06, + "loss": 0.6547, + "step": 18171 + }, + { + "epoch": 1.3128397782072354, + "grad_norm": 7.917840837105057, + "learning_rate": 1.3943493247252911e-06, + "loss": 0.698, + "step": 18172 + }, + { + "epoch": 1.312912023407445, + "grad_norm": 6.5394956383765335, + "learning_rate": 1.394086990775219e-06, + "loss": 0.6198, + "step": 18173 + }, + { + "epoch": 1.3129842686076545, + "grad_norm": 6.931646128308607, + "learning_rate": 1.3938246719642368e-06, + "loss": 0.6074, + "step": 18174 + }, + { + "epoch": 1.3130565138078638, + "grad_norm": 6.9061853742380706, + "learning_rate": 1.3935623682959365e-06, + "loss": 0.6432, + "step": 18175 + }, + { + "epoch": 1.3131287590080734, + "grad_norm": 6.9279471946272055, + "learning_rate": 1.3933000797739078e-06, + "loss": 0.612, + "step": 18176 + }, + { + "epoch": 1.313201004208283, + "grad_norm": 7.2715921816645706, + "learning_rate": 1.393037806401743e-06, + "loss": 0.62, + "step": 18177 + }, + { + "epoch": 1.3132732494084924, + "grad_norm": 6.953060261285696, + "learning_rate": 1.39277554818303e-06, + "loss": 0.6687, + "step": 18178 + }, + { + "epoch": 1.313345494608702, + "grad_norm": 7.093163285380254, + "learning_rate": 1.3925133051213602e-06, + "loss": 0.5862, + "step": 18179 + }, + { + "epoch": 1.3134177398089115, + "grad_norm": 6.6357142343276205, + "learning_rate": 1.3922510772203229e-06, + "loss": 0.628, + "step": 18180 + }, + { + "epoch": 1.313489985009121, + "grad_norm": 7.482450424932481, + "learning_rate": 1.3919888644835097e-06, + "loss": 0.5667, + "step": 18181 + }, + { + "epoch": 1.3135622302093304, + "grad_norm": 6.672926094111241, + "learning_rate": 1.3917266669145075e-06, + "loss": 0.6631, + "step": 18182 + }, + { + "epoch": 1.31363447540954, + "grad_norm": 6.456303070234296, + "learning_rate": 1.3914644845169068e-06, + "loss": 0.589, + "step": 18183 + }, + { + "epoch": 1.3137067206097495, + "grad_norm": 6.249682609128543, + "learning_rate": 1.3912023172942965e-06, + "loss": 0.5806, + "step": 18184 + }, + { + "epoch": 1.313778965809959, + "grad_norm": 7.796587554586807, + "learning_rate": 1.3909401652502654e-06, + "loss": 0.5903, + "step": 18185 + }, + { + "epoch": 1.3138512110101686, + "grad_norm": 7.034427721596216, + "learning_rate": 1.3906780283884025e-06, + "loss": 0.6886, + "step": 18186 + }, + { + "epoch": 1.313923456210378, + "grad_norm": 7.997194036490261, + "learning_rate": 1.390415906712296e-06, + "loss": 0.6113, + "step": 18187 + }, + { + "epoch": 1.3139957014105876, + "grad_norm": 7.0140811436389, + "learning_rate": 1.3901538002255352e-06, + "loss": 0.6813, + "step": 18188 + }, + { + "epoch": 1.314067946610797, + "grad_norm": 7.3099118078419885, + "learning_rate": 1.3898917089317062e-06, + "loss": 0.6045, + "step": 18189 + }, + { + "epoch": 1.3141401918110065, + "grad_norm": 7.107824403791005, + "learning_rate": 1.3896296328343977e-06, + "loss": 0.627, + "step": 18190 + }, + { + "epoch": 1.314212437011216, + "grad_norm": 7.654911593793168, + "learning_rate": 1.3893675719371975e-06, + "loss": 0.6033, + "step": 18191 + }, + { + "epoch": 1.3142846822114256, + "grad_norm": 7.292858382433494, + "learning_rate": 1.3891055262436936e-06, + "loss": 0.6081, + "step": 18192 + }, + { + "epoch": 1.3143569274116351, + "grad_norm": 7.310643408726796, + "learning_rate": 1.3888434957574718e-06, + "loss": 0.695, + "step": 18193 + }, + { + "epoch": 1.3144291726118447, + "grad_norm": 6.3417677013688385, + "learning_rate": 1.3885814804821185e-06, + "loss": 0.619, + "step": 18194 + }, + { + "epoch": 1.3145014178120542, + "grad_norm": 6.658009757660728, + "learning_rate": 1.388319480421224e-06, + "loss": 0.5999, + "step": 18195 + }, + { + "epoch": 1.3145736630122635, + "grad_norm": 6.34654196950616, + "learning_rate": 1.3880574955783716e-06, + "loss": 0.6295, + "step": 18196 + }, + { + "epoch": 1.314645908212473, + "grad_norm": 6.691650813722498, + "learning_rate": 1.3877955259571488e-06, + "loss": 0.5854, + "step": 18197 + }, + { + "epoch": 1.3147181534126826, + "grad_norm": 6.437919528957171, + "learning_rate": 1.3875335715611415e-06, + "loss": 0.6327, + "step": 18198 + }, + { + "epoch": 1.3147903986128922, + "grad_norm": 9.318148340135476, + "learning_rate": 1.3872716323939367e-06, + "loss": 0.6403, + "step": 18199 + }, + { + "epoch": 1.3148626438131017, + "grad_norm": 6.471113132667458, + "learning_rate": 1.3870097084591188e-06, + "loss": 0.5789, + "step": 18200 + }, + { + "epoch": 1.3149348890133112, + "grad_norm": 6.85494817512449, + "learning_rate": 1.3867477997602735e-06, + "loss": 0.6569, + "step": 18201 + }, + { + "epoch": 1.3150071342135208, + "grad_norm": 6.369370911104381, + "learning_rate": 1.386485906300986e-06, + "loss": 0.6088, + "step": 18202 + }, + { + "epoch": 1.31507937941373, + "grad_norm": 6.765089617330844, + "learning_rate": 1.3862240280848427e-06, + "loss": 0.6479, + "step": 18203 + }, + { + "epoch": 1.3151516246139396, + "grad_norm": 6.8055659010758776, + "learning_rate": 1.3859621651154271e-06, + "loss": 0.6388, + "step": 18204 + }, + { + "epoch": 1.3152238698141492, + "grad_norm": 9.210783435416372, + "learning_rate": 1.3857003173963245e-06, + "loss": 0.7218, + "step": 18205 + }, + { + "epoch": 1.3152961150143587, + "grad_norm": 6.967035330671729, + "learning_rate": 1.3854384849311206e-06, + "loss": 0.6074, + "step": 18206 + }, + { + "epoch": 1.3153683602145683, + "grad_norm": 6.3367156316452995, + "learning_rate": 1.3851766677233975e-06, + "loss": 0.6488, + "step": 18207 + }, + { + "epoch": 1.3154406054147778, + "grad_norm": 6.313443859278216, + "learning_rate": 1.3849148657767397e-06, + "loss": 0.6284, + "step": 18208 + }, + { + "epoch": 1.3155128506149874, + "grad_norm": 7.046031554637356, + "learning_rate": 1.3846530790947322e-06, + "loss": 0.6667, + "step": 18209 + }, + { + "epoch": 1.3155850958151967, + "grad_norm": 5.90090232026928, + "learning_rate": 1.3843913076809583e-06, + "loss": 0.6111, + "step": 18210 + }, + { + "epoch": 1.3156573410154062, + "grad_norm": 6.188317447186145, + "learning_rate": 1.3841295515390007e-06, + "loss": 0.6529, + "step": 18211 + }, + { + "epoch": 1.3157295862156158, + "grad_norm": 7.39674894727499, + "learning_rate": 1.3838678106724433e-06, + "loss": 0.7067, + "step": 18212 + }, + { + "epoch": 1.3158018314158253, + "grad_norm": 6.1675187974140036, + "learning_rate": 1.3836060850848688e-06, + "loss": 0.7335, + "step": 18213 + }, + { + "epoch": 1.3158740766160348, + "grad_norm": 7.380826297392142, + "learning_rate": 1.3833443747798596e-06, + "loss": 0.5868, + "step": 18214 + }, + { + "epoch": 1.3159463218162444, + "grad_norm": 5.9949163517274355, + "learning_rate": 1.3830826797609997e-06, + "loss": 0.5958, + "step": 18215 + }, + { + "epoch": 1.316018567016454, + "grad_norm": 6.636358348991597, + "learning_rate": 1.3828210000318703e-06, + "loss": 0.6243, + "step": 18216 + }, + { + "epoch": 1.3160908122166632, + "grad_norm": 7.282210519984509, + "learning_rate": 1.3825593355960548e-06, + "loss": 0.6301, + "step": 18217 + }, + { + "epoch": 1.3161630574168728, + "grad_norm": 5.893400555280497, + "learning_rate": 1.3822976864571337e-06, + "loss": 0.6135, + "step": 18218 + }, + { + "epoch": 1.3162353026170823, + "grad_norm": 6.300937752337972, + "learning_rate": 1.3820360526186894e-06, + "loss": 0.6785, + "step": 18219 + }, + { + "epoch": 1.3163075478172919, + "grad_norm": 7.203197553356133, + "learning_rate": 1.3817744340843042e-06, + "loss": 0.6272, + "step": 18220 + }, + { + "epoch": 1.3163797930175014, + "grad_norm": 6.323357941146102, + "learning_rate": 1.3815128308575582e-06, + "loss": 0.5985, + "step": 18221 + }, + { + "epoch": 1.316452038217711, + "grad_norm": 7.188856444694237, + "learning_rate": 1.3812512429420317e-06, + "loss": 0.6111, + "step": 18222 + }, + { + "epoch": 1.3165242834179205, + "grad_norm": 6.3505077729744865, + "learning_rate": 1.3809896703413083e-06, + "loss": 0.6486, + "step": 18223 + }, + { + "epoch": 1.3165965286181298, + "grad_norm": 8.614660727827092, + "learning_rate": 1.3807281130589683e-06, + "loss": 0.6894, + "step": 18224 + }, + { + "epoch": 1.3166687738183396, + "grad_norm": 7.128526584496206, + "learning_rate": 1.3804665710985903e-06, + "loss": 0.6395, + "step": 18225 + }, + { + "epoch": 1.316741019018549, + "grad_norm": 6.470192863502445, + "learning_rate": 1.380205044463756e-06, + "loss": 0.6225, + "step": 18226 + }, + { + "epoch": 1.3168132642187584, + "grad_norm": 6.188216659195327, + "learning_rate": 1.3799435331580457e-06, + "loss": 0.6018, + "step": 18227 + }, + { + "epoch": 1.316885509418968, + "grad_norm": 7.814009619772081, + "learning_rate": 1.3796820371850384e-06, + "loss": 0.6758, + "step": 18228 + }, + { + "epoch": 1.3169577546191775, + "grad_norm": 6.990849917890019, + "learning_rate": 1.3794205565483135e-06, + "loss": 0.6468, + "step": 18229 + }, + { + "epoch": 1.317029999819387, + "grad_norm": 6.650567579786013, + "learning_rate": 1.3791590912514513e-06, + "loss": 0.5549, + "step": 18230 + }, + { + "epoch": 1.3171022450195964, + "grad_norm": 6.561234125026191, + "learning_rate": 1.378897641298031e-06, + "loss": 0.6191, + "step": 18231 + }, + { + "epoch": 1.3171744902198061, + "grad_norm": 5.242779443921652, + "learning_rate": 1.3786362066916315e-06, + "loss": 0.5513, + "step": 18232 + }, + { + "epoch": 1.3172467354200155, + "grad_norm": 6.524144153810312, + "learning_rate": 1.3783747874358316e-06, + "loss": 0.6297, + "step": 18233 + }, + { + "epoch": 1.317318980620225, + "grad_norm": 7.559250566988865, + "learning_rate": 1.3781133835342098e-06, + "loss": 0.682, + "step": 18234 + }, + { + "epoch": 1.3173912258204346, + "grad_norm": 6.178940526520177, + "learning_rate": 1.377851994990346e-06, + "loss": 0.6379, + "step": 18235 + }, + { + "epoch": 1.317463471020644, + "grad_norm": 6.1144271215428025, + "learning_rate": 1.377590621807816e-06, + "loss": 0.5933, + "step": 18236 + }, + { + "epoch": 1.3175357162208536, + "grad_norm": 7.1059798297504555, + "learning_rate": 1.377329263990199e-06, + "loss": 0.6458, + "step": 18237 + }, + { + "epoch": 1.317607961421063, + "grad_norm": 7.028281437144987, + "learning_rate": 1.3770679215410731e-06, + "loss": 0.6373, + "step": 18238 + }, + { + "epoch": 1.3176802066212727, + "grad_norm": 7.538434033066711, + "learning_rate": 1.3768065944640151e-06, + "loss": 0.6179, + "step": 18239 + }, + { + "epoch": 1.317752451821482, + "grad_norm": 8.435600010324084, + "learning_rate": 1.3765452827626024e-06, + "loss": 0.6754, + "step": 18240 + }, + { + "epoch": 1.3178246970216916, + "grad_norm": 6.9886718094991895, + "learning_rate": 1.3762839864404127e-06, + "loss": 0.655, + "step": 18241 + }, + { + "epoch": 1.3178969422219011, + "grad_norm": 6.878701618016806, + "learning_rate": 1.3760227055010229e-06, + "loss": 0.6111, + "step": 18242 + }, + { + "epoch": 1.3179691874221107, + "grad_norm": 6.304535628785748, + "learning_rate": 1.375761439948009e-06, + "loss": 0.6471, + "step": 18243 + }, + { + "epoch": 1.3180414326223202, + "grad_norm": 7.142945828568441, + "learning_rate": 1.3755001897849486e-06, + "loss": 0.6508, + "step": 18244 + }, + { + "epoch": 1.3181136778225295, + "grad_norm": 6.806778491250841, + "learning_rate": 1.3752389550154176e-06, + "loss": 0.6033, + "step": 18245 + }, + { + "epoch": 1.3181859230227393, + "grad_norm": 8.096567495698553, + "learning_rate": 1.374977735642992e-06, + "loss": 0.6042, + "step": 18246 + }, + { + "epoch": 1.3182581682229486, + "grad_norm": 8.717220267907583, + "learning_rate": 1.3747165316712472e-06, + "loss": 0.6742, + "step": 18247 + }, + { + "epoch": 1.3183304134231582, + "grad_norm": 6.835753066708428, + "learning_rate": 1.3744553431037595e-06, + "loss": 0.6238, + "step": 18248 + }, + { + "epoch": 1.3184026586233677, + "grad_norm": 6.886060383572103, + "learning_rate": 1.374194169944105e-06, + "loss": 0.6806, + "step": 18249 + }, + { + "epoch": 1.3184749038235772, + "grad_norm": 7.083470631185064, + "learning_rate": 1.3739330121958562e-06, + "loss": 0.5823, + "step": 18250 + }, + { + "epoch": 1.3185471490237868, + "grad_norm": 7.424761979626119, + "learning_rate": 1.3736718698625912e-06, + "loss": 0.5532, + "step": 18251 + }, + { + "epoch": 1.3186193942239963, + "grad_norm": 5.891219779426155, + "learning_rate": 1.373410742947885e-06, + "loss": 0.6774, + "step": 18252 + }, + { + "epoch": 1.3186916394242059, + "grad_norm": 7.127858977013463, + "learning_rate": 1.3731496314553096e-06, + "loss": 0.671, + "step": 18253 + }, + { + "epoch": 1.3187638846244152, + "grad_norm": 6.924126459105126, + "learning_rate": 1.3728885353884413e-06, + "loss": 0.5965, + "step": 18254 + }, + { + "epoch": 1.3188361298246247, + "grad_norm": 7.1509963828531005, + "learning_rate": 1.3726274547508533e-06, + "loss": 0.6372, + "step": 18255 + }, + { + "epoch": 1.3189083750248343, + "grad_norm": 6.95174149025368, + "learning_rate": 1.3723663895461215e-06, + "loss": 0.6169, + "step": 18256 + }, + { + "epoch": 1.3189806202250438, + "grad_norm": 6.45676007540831, + "learning_rate": 1.372105339777817e-06, + "loss": 0.6049, + "step": 18257 + }, + { + "epoch": 1.3190528654252534, + "grad_norm": 7.555726099271137, + "learning_rate": 1.3718443054495145e-06, + "loss": 0.6237, + "step": 18258 + }, + { + "epoch": 1.319125110625463, + "grad_norm": 6.353544144283518, + "learning_rate": 1.3715832865647879e-06, + "loss": 0.6398, + "step": 18259 + }, + { + "epoch": 1.3191973558256724, + "grad_norm": 6.950994615258843, + "learning_rate": 1.3713222831272099e-06, + "loss": 0.6687, + "step": 18260 + }, + { + "epoch": 1.3192696010258818, + "grad_norm": 7.6849388260103595, + "learning_rate": 1.3710612951403536e-06, + "loss": 0.6396, + "step": 18261 + }, + { + "epoch": 1.3193418462260913, + "grad_norm": 7.330918290096693, + "learning_rate": 1.370800322607791e-06, + "loss": 0.6559, + "step": 18262 + }, + { + "epoch": 1.3194140914263008, + "grad_norm": 5.716485580972013, + "learning_rate": 1.3705393655330967e-06, + "loss": 0.6546, + "step": 18263 + }, + { + "epoch": 1.3194863366265104, + "grad_norm": 6.6483594663458145, + "learning_rate": 1.3702784239198403e-06, + "loss": 0.6171, + "step": 18264 + }, + { + "epoch": 1.31955858182672, + "grad_norm": 6.972967539444906, + "learning_rate": 1.3700174977715952e-06, + "loss": 0.6804, + "step": 18265 + }, + { + "epoch": 1.3196308270269295, + "grad_norm": 6.753430660308415, + "learning_rate": 1.3697565870919333e-06, + "loss": 0.6415, + "step": 18266 + }, + { + "epoch": 1.319703072227139, + "grad_norm": 5.63386396734201, + "learning_rate": 1.369495691884427e-06, + "loss": 0.588, + "step": 18267 + }, + { + "epoch": 1.3197753174273483, + "grad_norm": 6.602570767122703, + "learning_rate": 1.3692348121526457e-06, + "loss": 0.6453, + "step": 18268 + }, + { + "epoch": 1.3198475626275579, + "grad_norm": 6.6578393032749315, + "learning_rate": 1.3689739479001624e-06, + "loss": 0.6215, + "step": 18269 + }, + { + "epoch": 1.3199198078277674, + "grad_norm": 6.951107941280162, + "learning_rate": 1.368713099130547e-06, + "loss": 0.6403, + "step": 18270 + }, + { + "epoch": 1.319992053027977, + "grad_norm": 6.509658093827598, + "learning_rate": 1.3684522658473712e-06, + "loss": 0.6455, + "step": 18271 + }, + { + "epoch": 1.3200642982281865, + "grad_norm": 6.165242245409346, + "learning_rate": 1.3681914480542052e-06, + "loss": 0.6162, + "step": 18272 + }, + { + "epoch": 1.320136543428396, + "grad_norm": 6.7890984607793285, + "learning_rate": 1.3679306457546193e-06, + "loss": 0.5705, + "step": 18273 + }, + { + "epoch": 1.3202087886286056, + "grad_norm": 6.083426000159291, + "learning_rate": 1.3676698589521853e-06, + "loss": 0.588, + "step": 18274 + }, + { + "epoch": 1.320281033828815, + "grad_norm": 7.042794883981219, + "learning_rate": 1.3674090876504703e-06, + "loss": 0.6778, + "step": 18275 + }, + { + "epoch": 1.3203532790290244, + "grad_norm": 5.424318845677787, + "learning_rate": 1.367148331853046e-06, + "loss": 0.6133, + "step": 18276 + }, + { + "epoch": 1.320425524229234, + "grad_norm": 8.274897143136716, + "learning_rate": 1.3668875915634819e-06, + "loss": 0.6947, + "step": 18277 + }, + { + "epoch": 1.3204977694294435, + "grad_norm": 6.646383360421037, + "learning_rate": 1.366626866785346e-06, + "loss": 0.6231, + "step": 18278 + }, + { + "epoch": 1.320570014629653, + "grad_norm": 6.1388412097826555, + "learning_rate": 1.3663661575222076e-06, + "loss": 0.63, + "step": 18279 + }, + { + "epoch": 1.3206422598298626, + "grad_norm": 6.408925163147575, + "learning_rate": 1.366105463777637e-06, + "loss": 0.5984, + "step": 18280 + }, + { + "epoch": 1.3207145050300722, + "grad_norm": 5.126857886020176, + "learning_rate": 1.3658447855552033e-06, + "loss": 0.6209, + "step": 18281 + }, + { + "epoch": 1.3207867502302815, + "grad_norm": 5.961825043484679, + "learning_rate": 1.3655841228584727e-06, + "loss": 0.6369, + "step": 18282 + }, + { + "epoch": 1.320858995430491, + "grad_norm": 6.308802362069392, + "learning_rate": 1.3653234756910148e-06, + "loss": 0.6154, + "step": 18283 + }, + { + "epoch": 1.3209312406307006, + "grad_norm": 6.48247483689573, + "learning_rate": 1.3650628440563984e-06, + "loss": 0.635, + "step": 18284 + }, + { + "epoch": 1.32100348583091, + "grad_norm": 6.142281888723206, + "learning_rate": 1.3648022279581897e-06, + "loss": 0.659, + "step": 18285 + }, + { + "epoch": 1.3210757310311196, + "grad_norm": 6.6817665121691086, + "learning_rate": 1.3645416273999568e-06, + "loss": 0.6762, + "step": 18286 + }, + { + "epoch": 1.3211479762313292, + "grad_norm": 6.197860213164204, + "learning_rate": 1.3642810423852673e-06, + "loss": 0.6938, + "step": 18287 + }, + { + "epoch": 1.3212202214315387, + "grad_norm": 8.166453559845742, + "learning_rate": 1.3640204729176888e-06, + "loss": 0.6644, + "step": 18288 + }, + { + "epoch": 1.321292466631748, + "grad_norm": 6.764616787609119, + "learning_rate": 1.3637599190007877e-06, + "loss": 0.6225, + "step": 18289 + }, + { + "epoch": 1.3213647118319576, + "grad_norm": 6.15634170696288, + "learning_rate": 1.3634993806381314e-06, + "loss": 0.606, + "step": 18290 + }, + { + "epoch": 1.3214369570321671, + "grad_norm": 6.828139804850226, + "learning_rate": 1.3632388578332862e-06, + "loss": 0.7147, + "step": 18291 + }, + { + "epoch": 1.3215092022323767, + "grad_norm": 7.264859460677711, + "learning_rate": 1.3629783505898188e-06, + "loss": 0.6343, + "step": 18292 + }, + { + "epoch": 1.3215814474325862, + "grad_norm": 7.42362129949004, + "learning_rate": 1.3627178589112945e-06, + "loss": 0.6367, + "step": 18293 + }, + { + "epoch": 1.3216536926327958, + "grad_norm": 6.302471390643738, + "learning_rate": 1.3624573828012795e-06, + "loss": 0.5733, + "step": 18294 + }, + { + "epoch": 1.3217259378330053, + "grad_norm": 6.163602050292913, + "learning_rate": 1.3621969222633408e-06, + "loss": 0.6099, + "step": 18295 + }, + { + "epoch": 1.3217981830332146, + "grad_norm": 6.725144009661324, + "learning_rate": 1.3619364773010416e-06, + "loss": 0.6189, + "step": 18296 + }, + { + "epoch": 1.3218704282334244, + "grad_norm": 6.953203728047973, + "learning_rate": 1.3616760479179483e-06, + "loss": 0.6202, + "step": 18297 + }, + { + "epoch": 1.3219426734336337, + "grad_norm": 7.374821288966225, + "learning_rate": 1.3614156341176264e-06, + "loss": 0.6077, + "step": 18298 + }, + { + "epoch": 1.3220149186338432, + "grad_norm": 6.814137673176549, + "learning_rate": 1.36115523590364e-06, + "loss": 0.5869, + "step": 18299 + }, + { + "epoch": 1.3220871638340528, + "grad_norm": 7.460198394755419, + "learning_rate": 1.3608948532795546e-06, + "loss": 0.6215, + "step": 18300 + }, + { + "epoch": 1.3221594090342623, + "grad_norm": 7.425669003540791, + "learning_rate": 1.360634486248934e-06, + "loss": 0.7022, + "step": 18301 + }, + { + "epoch": 1.3222316542344719, + "grad_norm": 7.089576602663187, + "learning_rate": 1.3603741348153434e-06, + "loss": 0.625, + "step": 18302 + }, + { + "epoch": 1.3223038994346812, + "grad_norm": 7.803695233782379, + "learning_rate": 1.3601137989823454e-06, + "loss": 0.6709, + "step": 18303 + }, + { + "epoch": 1.322376144634891, + "grad_norm": 6.002023037951096, + "learning_rate": 1.359853478753504e-06, + "loss": 0.618, + "step": 18304 + }, + { + "epoch": 1.3224483898351003, + "grad_norm": 7.503587500550518, + "learning_rate": 1.3595931741323837e-06, + "loss": 0.6266, + "step": 18305 + }, + { + "epoch": 1.3225206350353098, + "grad_norm": 6.712985240368278, + "learning_rate": 1.359332885122548e-06, + "loss": 0.6409, + "step": 18306 + }, + { + "epoch": 1.3225928802355194, + "grad_norm": 6.606258153414139, + "learning_rate": 1.3590726117275576e-06, + "loss": 0.6945, + "step": 18307 + }, + { + "epoch": 1.322665125435729, + "grad_norm": 6.117243000426706, + "learning_rate": 1.3588123539509783e-06, + "loss": 0.6022, + "step": 18308 + }, + { + "epoch": 1.3227373706359384, + "grad_norm": 7.837006485533674, + "learning_rate": 1.3585521117963726e-06, + "loss": 0.6226, + "step": 18309 + }, + { + "epoch": 1.3228096158361478, + "grad_norm": 6.50517976223668, + "learning_rate": 1.3582918852673017e-06, + "loss": 0.6241, + "step": 18310 + }, + { + "epoch": 1.3228818610363575, + "grad_norm": 6.791534921815413, + "learning_rate": 1.3580316743673278e-06, + "loss": 0.6086, + "step": 18311 + }, + { + "epoch": 1.3229541062365668, + "grad_norm": 6.634275169537851, + "learning_rate": 1.357771479100014e-06, + "loss": 0.6656, + "step": 18312 + }, + { + "epoch": 1.3230263514367764, + "grad_norm": 7.881989783008674, + "learning_rate": 1.3575112994689227e-06, + "loss": 0.715, + "step": 18313 + }, + { + "epoch": 1.323098596636986, + "grad_norm": 6.251306931225556, + "learning_rate": 1.3572511354776135e-06, + "loss": 0.5777, + "step": 18314 + }, + { + "epoch": 1.3231708418371955, + "grad_norm": 5.975097795907135, + "learning_rate": 1.3569909871296489e-06, + "loss": 0.6495, + "step": 18315 + }, + { + "epoch": 1.323243087037405, + "grad_norm": 8.045514334293383, + "learning_rate": 1.3567308544285904e-06, + "loss": 0.6283, + "step": 18316 + }, + { + "epoch": 1.3233153322376143, + "grad_norm": 6.415135023282312, + "learning_rate": 1.3564707373779987e-06, + "loss": 0.6158, + "step": 18317 + }, + { + "epoch": 1.323387577437824, + "grad_norm": 6.762035800986869, + "learning_rate": 1.3562106359814348e-06, + "loss": 0.6401, + "step": 18318 + }, + { + "epoch": 1.3234598226380334, + "grad_norm": 8.023757467603678, + "learning_rate": 1.3559505502424592e-06, + "loss": 0.646, + "step": 18319 + }, + { + "epoch": 1.323532067838243, + "grad_norm": 7.1672154002107105, + "learning_rate": 1.3556904801646331e-06, + "loss": 0.6271, + "step": 18320 + }, + { + "epoch": 1.3236043130384525, + "grad_norm": 7.68338125659398, + "learning_rate": 1.3554304257515149e-06, + "loss": 0.6662, + "step": 18321 + }, + { + "epoch": 1.323676558238662, + "grad_norm": 7.8506169750949155, + "learning_rate": 1.3551703870066657e-06, + "loss": 0.7356, + "step": 18322 + }, + { + "epoch": 1.3237488034388716, + "grad_norm": 7.454350464717236, + "learning_rate": 1.3549103639336448e-06, + "loss": 0.6585, + "step": 18323 + }, + { + "epoch": 1.3238210486390811, + "grad_norm": 7.578355797468048, + "learning_rate": 1.354650356536013e-06, + "loss": 0.5594, + "step": 18324 + }, + { + "epoch": 1.3238932938392907, + "grad_norm": 6.259115057359908, + "learning_rate": 1.3543903648173274e-06, + "loss": 0.5706, + "step": 18325 + }, + { + "epoch": 1.3239655390395, + "grad_norm": 8.023154605680995, + "learning_rate": 1.354130388781148e-06, + "loss": 0.6844, + "step": 18326 + }, + { + "epoch": 1.3240377842397095, + "grad_norm": 6.763835150444074, + "learning_rate": 1.3538704284310345e-06, + "loss": 0.6764, + "step": 18327 + }, + { + "epoch": 1.324110029439919, + "grad_norm": 6.676094668348552, + "learning_rate": 1.3536104837705445e-06, + "loss": 0.577, + "step": 18328 + }, + { + "epoch": 1.3241822746401286, + "grad_norm": 7.777645394545883, + "learning_rate": 1.3533505548032367e-06, + "loss": 0.7032, + "step": 18329 + }, + { + "epoch": 1.3242545198403382, + "grad_norm": 6.378591087016873, + "learning_rate": 1.3530906415326696e-06, + "loss": 0.6299, + "step": 18330 + }, + { + "epoch": 1.3243267650405477, + "grad_norm": 6.858705944199963, + "learning_rate": 1.3528307439624022e-06, + "loss": 0.6457, + "step": 18331 + }, + { + "epoch": 1.3243990102407572, + "grad_norm": 7.184227505265413, + "learning_rate": 1.3525708620959904e-06, + "loss": 0.6029, + "step": 18332 + }, + { + "epoch": 1.3244712554409666, + "grad_norm": 6.693277020667128, + "learning_rate": 1.3523109959369924e-06, + "loss": 0.5759, + "step": 18333 + }, + { + "epoch": 1.324543500641176, + "grad_norm": 7.725794492685713, + "learning_rate": 1.3520511454889666e-06, + "loss": 0.611, + "step": 18334 + }, + { + "epoch": 1.3246157458413856, + "grad_norm": 6.252268265155667, + "learning_rate": 1.3517913107554674e-06, + "loss": 0.6514, + "step": 18335 + }, + { + "epoch": 1.3246879910415952, + "grad_norm": 6.663818641127318, + "learning_rate": 1.3515314917400547e-06, + "loss": 0.6716, + "step": 18336 + }, + { + "epoch": 1.3247602362418047, + "grad_norm": 5.105617996754573, + "learning_rate": 1.351271688446284e-06, + "loss": 0.6131, + "step": 18337 + }, + { + "epoch": 1.3248324814420143, + "grad_norm": 6.9554752106879185, + "learning_rate": 1.351011900877713e-06, + "loss": 0.6518, + "step": 18338 + }, + { + "epoch": 1.3249047266422238, + "grad_norm": 6.587114656531417, + "learning_rate": 1.3507521290378958e-06, + "loss": 0.6467, + "step": 18339 + }, + { + "epoch": 1.3249769718424331, + "grad_norm": 6.479541273010195, + "learning_rate": 1.3504923729303898e-06, + "loss": 0.5925, + "step": 18340 + }, + { + "epoch": 1.3250492170426427, + "grad_norm": 6.59960879698133, + "learning_rate": 1.3502326325587506e-06, + "loss": 0.6253, + "step": 18341 + }, + { + "epoch": 1.3251214622428522, + "grad_norm": 6.636347427443381, + "learning_rate": 1.3499729079265351e-06, + "loss": 0.6108, + "step": 18342 + }, + { + "epoch": 1.3251937074430618, + "grad_norm": 6.340699312649789, + "learning_rate": 1.3497131990372964e-06, + "loss": 0.6078, + "step": 18343 + }, + { + "epoch": 1.3252659526432713, + "grad_norm": 7.346542801317623, + "learning_rate": 1.349453505894591e-06, + "loss": 0.6469, + "step": 18344 + }, + { + "epoch": 1.3253381978434808, + "grad_norm": 5.829427046978538, + "learning_rate": 1.3491938285019737e-06, + "loss": 0.6002, + "step": 18345 + }, + { + "epoch": 1.3254104430436904, + "grad_norm": 6.496055873592976, + "learning_rate": 1.3489341668629993e-06, + "loss": 0.6167, + "step": 18346 + }, + { + "epoch": 1.3254826882438997, + "grad_norm": 6.682068802964306, + "learning_rate": 1.3486745209812225e-06, + "loss": 0.6333, + "step": 18347 + }, + { + "epoch": 1.3255549334441092, + "grad_norm": 5.9801711812604275, + "learning_rate": 1.3484148908601974e-06, + "loss": 0.6541, + "step": 18348 + }, + { + "epoch": 1.3256271786443188, + "grad_norm": 7.343893885724736, + "learning_rate": 1.3481552765034794e-06, + "loss": 0.6807, + "step": 18349 + }, + { + "epoch": 1.3256994238445283, + "grad_norm": 8.169000824023133, + "learning_rate": 1.3478956779146202e-06, + "loss": 0.6483, + "step": 18350 + }, + { + "epoch": 1.3257716690447379, + "grad_norm": 6.458172835129779, + "learning_rate": 1.3476360950971748e-06, + "loss": 0.6558, + "step": 18351 + }, + { + "epoch": 1.3258439142449474, + "grad_norm": 8.221118194187028, + "learning_rate": 1.3473765280546974e-06, + "loss": 0.5763, + "step": 18352 + }, + { + "epoch": 1.325916159445157, + "grad_norm": 7.306938208154166, + "learning_rate": 1.3471169767907393e-06, + "loss": 0.67, + "step": 18353 + }, + { + "epoch": 1.3259884046453663, + "grad_norm": 6.407806058637186, + "learning_rate": 1.3468574413088547e-06, + "loss": 0.6265, + "step": 18354 + }, + { + "epoch": 1.3260606498455758, + "grad_norm": 7.128455411686074, + "learning_rate": 1.3465979216125966e-06, + "loss": 0.6251, + "step": 18355 + }, + { + "epoch": 1.3261328950457854, + "grad_norm": 12.910348310787722, + "learning_rate": 1.3463384177055173e-06, + "loss": 0.6025, + "step": 18356 + }, + { + "epoch": 1.326205140245995, + "grad_norm": 6.020868250691714, + "learning_rate": 1.346078929591169e-06, + "loss": 0.5695, + "step": 18357 + }, + { + "epoch": 1.3262773854462044, + "grad_norm": 6.8758522199069985, + "learning_rate": 1.3458194572731044e-06, + "loss": 0.6175, + "step": 18358 + }, + { + "epoch": 1.326349630646414, + "grad_norm": 5.510542216129649, + "learning_rate": 1.3455600007548764e-06, + "loss": 0.6838, + "step": 18359 + }, + { + "epoch": 1.3264218758466235, + "grad_norm": 8.176561342979566, + "learning_rate": 1.3453005600400344e-06, + "loss": 0.7139, + "step": 18360 + }, + { + "epoch": 1.3264941210468328, + "grad_norm": 6.892246323747218, + "learning_rate": 1.3450411351321313e-06, + "loss": 0.7386, + "step": 18361 + }, + { + "epoch": 1.3265663662470424, + "grad_norm": 6.793452521499303, + "learning_rate": 1.3447817260347182e-06, + "loss": 0.5909, + "step": 18362 + }, + { + "epoch": 1.326638611447252, + "grad_norm": 7.217666874995562, + "learning_rate": 1.3445223327513474e-06, + "loss": 0.6548, + "step": 18363 + }, + { + "epoch": 1.3267108566474615, + "grad_norm": 7.114315119128321, + "learning_rate": 1.3442629552855667e-06, + "loss": 0.6624, + "step": 18364 + }, + { + "epoch": 1.326783101847671, + "grad_norm": 7.955102580983768, + "learning_rate": 1.3440035936409295e-06, + "loss": 0.6328, + "step": 18365 + }, + { + "epoch": 1.3268553470478806, + "grad_norm": 6.700853652091504, + "learning_rate": 1.3437442478209859e-06, + "loss": 0.6222, + "step": 18366 + }, + { + "epoch": 1.32692759224809, + "grad_norm": 7.891617212137217, + "learning_rate": 1.3434849178292867e-06, + "loss": 0.6412, + "step": 18367 + }, + { + "epoch": 1.3269998374482994, + "grad_norm": 9.27922431597868, + "learning_rate": 1.3432256036693798e-06, + "loss": 0.64, + "step": 18368 + }, + { + "epoch": 1.3270720826485092, + "grad_norm": 7.302794969158882, + "learning_rate": 1.3429663053448163e-06, + "loss": 0.6017, + "step": 18369 + }, + { + "epoch": 1.3271443278487185, + "grad_norm": 6.697398623005031, + "learning_rate": 1.3427070228591465e-06, + "loss": 0.6163, + "step": 18370 + }, + { + "epoch": 1.327216573048928, + "grad_norm": 10.377853185074311, + "learning_rate": 1.3424477562159183e-06, + "loss": 0.6576, + "step": 18371 + }, + { + "epoch": 1.3272888182491376, + "grad_norm": 6.907631300107343, + "learning_rate": 1.3421885054186818e-06, + "loss": 0.6583, + "step": 18372 + }, + { + "epoch": 1.3273610634493471, + "grad_norm": 7.279190754609484, + "learning_rate": 1.3419292704709852e-06, + "loss": 0.7032, + "step": 18373 + }, + { + "epoch": 1.3274333086495567, + "grad_norm": 8.411618018430715, + "learning_rate": 1.3416700513763775e-06, + "loss": 0.6362, + "step": 18374 + }, + { + "epoch": 1.327505553849766, + "grad_norm": 6.834156855139798, + "learning_rate": 1.341410848138408e-06, + "loss": 0.6682, + "step": 18375 + }, + { + "epoch": 1.3275777990499757, + "grad_norm": 7.467741958074916, + "learning_rate": 1.3411516607606239e-06, + "loss": 0.7042, + "step": 18376 + }, + { + "epoch": 1.327650044250185, + "grad_norm": 7.305221456503865, + "learning_rate": 1.3408924892465752e-06, + "loss": 0.662, + "step": 18377 + }, + { + "epoch": 1.3277222894503946, + "grad_norm": 6.441766898825865, + "learning_rate": 1.3406333335998074e-06, + "loss": 0.6314, + "step": 18378 + }, + { + "epoch": 1.3277945346506042, + "grad_norm": 6.246492240747658, + "learning_rate": 1.3403741938238685e-06, + "loss": 0.6399, + "step": 18379 + }, + { + "epoch": 1.3278667798508137, + "grad_norm": 6.7275092226303315, + "learning_rate": 1.340115069922307e-06, + "loss": 0.6224, + "step": 18380 + }, + { + "epoch": 1.3279390250510232, + "grad_norm": 6.618361115670697, + "learning_rate": 1.33985596189867e-06, + "loss": 0.5631, + "step": 18381 + }, + { + "epoch": 1.3280112702512326, + "grad_norm": 7.149070636664431, + "learning_rate": 1.3395968697565038e-06, + "loss": 0.5738, + "step": 18382 + }, + { + "epoch": 1.3280835154514423, + "grad_norm": 6.801933843850091, + "learning_rate": 1.3393377934993552e-06, + "loss": 0.7012, + "step": 18383 + }, + { + "epoch": 1.3281557606516516, + "grad_norm": 6.428125759601317, + "learning_rate": 1.339078733130771e-06, + "loss": 0.6259, + "step": 18384 + }, + { + "epoch": 1.3282280058518612, + "grad_norm": 8.01861790037576, + "learning_rate": 1.3388196886542976e-06, + "loss": 0.5269, + "step": 18385 + }, + { + "epoch": 1.3283002510520707, + "grad_norm": 6.795320179562802, + "learning_rate": 1.3385606600734807e-06, + "loss": 0.6425, + "step": 18386 + }, + { + "epoch": 1.3283724962522803, + "grad_norm": 7.448522270141177, + "learning_rate": 1.338301647391867e-06, + "loss": 0.6846, + "step": 18387 + }, + { + "epoch": 1.3284447414524898, + "grad_norm": 6.358214176339692, + "learning_rate": 1.3380426506130023e-06, + "loss": 0.5136, + "step": 18388 + }, + { + "epoch": 1.3285169866526991, + "grad_norm": 7.288682279656543, + "learning_rate": 1.3377836697404312e-06, + "loss": 0.6707, + "step": 18389 + }, + { + "epoch": 1.328589231852909, + "grad_norm": 6.6021706566212295, + "learning_rate": 1.3375247047776984e-06, + "loss": 0.6464, + "step": 18390 + }, + { + "epoch": 1.3286614770531182, + "grad_norm": 6.827636701290182, + "learning_rate": 1.3372657557283514e-06, + "loss": 0.6789, + "step": 18391 + }, + { + "epoch": 1.3287337222533278, + "grad_norm": 6.204516814397918, + "learning_rate": 1.3370068225959316e-06, + "loss": 0.6111, + "step": 18392 + }, + { + "epoch": 1.3288059674535373, + "grad_norm": 6.999662118659059, + "learning_rate": 1.3367479053839857e-06, + "loss": 0.7316, + "step": 18393 + }, + { + "epoch": 1.3288782126537468, + "grad_norm": 6.524681767562978, + "learning_rate": 1.3364890040960582e-06, + "loss": 0.6053, + "step": 18394 + }, + { + "epoch": 1.3289504578539564, + "grad_norm": 8.977570878006416, + "learning_rate": 1.3362301187356934e-06, + "loss": 0.6859, + "step": 18395 + }, + { + "epoch": 1.329022703054166, + "grad_norm": 6.046586061814525, + "learning_rate": 1.3359712493064343e-06, + "loss": 0.649, + "step": 18396 + }, + { + "epoch": 1.3290949482543755, + "grad_norm": 5.952331009428221, + "learning_rate": 1.3357123958118245e-06, + "loss": 0.5604, + "step": 18397 + }, + { + "epoch": 1.3291671934545848, + "grad_norm": 7.6107900819420005, + "learning_rate": 1.3354535582554084e-06, + "loss": 0.746, + "step": 18398 + }, + { + "epoch": 1.3292394386547943, + "grad_norm": 6.326826701791504, + "learning_rate": 1.3351947366407297e-06, + "loss": 0.6246, + "step": 18399 + }, + { + "epoch": 1.3293116838550039, + "grad_norm": 5.8466097928060785, + "learning_rate": 1.3349359309713295e-06, + "loss": 0.5775, + "step": 18400 + }, + { + "epoch": 1.3293839290552134, + "grad_norm": 6.727689535645734, + "learning_rate": 1.334677141250752e-06, + "loss": 0.6172, + "step": 18401 + }, + { + "epoch": 1.329456174255423, + "grad_norm": 6.489395587913835, + "learning_rate": 1.3344183674825395e-06, + "loss": 0.5905, + "step": 18402 + }, + { + "epoch": 1.3295284194556325, + "grad_norm": 6.53784664248888, + "learning_rate": 1.3341596096702345e-06, + "loss": 0.6348, + "step": 18403 + }, + { + "epoch": 1.329600664655842, + "grad_norm": 7.275000776048337, + "learning_rate": 1.3339008678173793e-06, + "loss": 0.6281, + "step": 18404 + }, + { + "epoch": 1.3296729098560514, + "grad_norm": 6.5839443969418125, + "learning_rate": 1.3336421419275158e-06, + "loss": 0.6442, + "step": 18405 + }, + { + "epoch": 1.329745155056261, + "grad_norm": 7.541333045394727, + "learning_rate": 1.3333834320041867e-06, + "loss": 0.6584, + "step": 18406 + }, + { + "epoch": 1.3298174002564704, + "grad_norm": 5.9714394147543715, + "learning_rate": 1.3331247380509316e-06, + "loss": 0.6131, + "step": 18407 + }, + { + "epoch": 1.32988964545668, + "grad_norm": 8.341530036633015, + "learning_rate": 1.3328660600712927e-06, + "loss": 0.6585, + "step": 18408 + }, + { + "epoch": 1.3299618906568895, + "grad_norm": 6.3220100905070264, + "learning_rate": 1.3326073980688126e-06, + "loss": 0.6856, + "step": 18409 + }, + { + "epoch": 1.330034135857099, + "grad_norm": 7.126288581908308, + "learning_rate": 1.3323487520470293e-06, + "loss": 0.5571, + "step": 18410 + }, + { + "epoch": 1.3301063810573086, + "grad_norm": 7.137133255871054, + "learning_rate": 1.3320901220094854e-06, + "loss": 0.6085, + "step": 18411 + }, + { + "epoch": 1.330178626257518, + "grad_norm": 7.680934428350951, + "learning_rate": 1.3318315079597196e-06, + "loss": 0.6851, + "step": 18412 + }, + { + "epoch": 1.3302508714577275, + "grad_norm": 5.769231727795644, + "learning_rate": 1.3315729099012756e-06, + "loss": 0.6587, + "step": 18413 + }, + { + "epoch": 1.330323116657937, + "grad_norm": 7.202168233553302, + "learning_rate": 1.33131432783769e-06, + "loss": 0.6161, + "step": 18414 + }, + { + "epoch": 1.3303953618581466, + "grad_norm": 6.868467244236974, + "learning_rate": 1.3310557617725042e-06, + "loss": 0.7091, + "step": 18415 + }, + { + "epoch": 1.330467607058356, + "grad_norm": 7.455213977754197, + "learning_rate": 1.3307972117092581e-06, + "loss": 0.6116, + "step": 18416 + }, + { + "epoch": 1.3305398522585656, + "grad_norm": 5.408074749063208, + "learning_rate": 1.3305386776514895e-06, + "loss": 0.6723, + "step": 18417 + }, + { + "epoch": 1.3306120974587752, + "grad_norm": 8.205750137991556, + "learning_rate": 1.3302801596027384e-06, + "loss": 0.6593, + "step": 18418 + }, + { + "epoch": 1.3306843426589845, + "grad_norm": 7.92268250091179, + "learning_rate": 1.3300216575665436e-06, + "loss": 0.7223, + "step": 18419 + }, + { + "epoch": 1.330756587859194, + "grad_norm": 6.7629397660015025, + "learning_rate": 1.3297631715464437e-06, + "loss": 0.6395, + "step": 18420 + }, + { + "epoch": 1.3308288330594036, + "grad_norm": 7.104592259282323, + "learning_rate": 1.3295047015459777e-06, + "loss": 0.632, + "step": 18421 + }, + { + "epoch": 1.3309010782596131, + "grad_norm": 5.644530067311727, + "learning_rate": 1.3292462475686834e-06, + "loss": 0.5866, + "step": 18422 + }, + { + "epoch": 1.3309733234598227, + "grad_norm": 6.47934257374838, + "learning_rate": 1.3289878096180988e-06, + "loss": 0.6953, + "step": 18423 + }, + { + "epoch": 1.3310455686600322, + "grad_norm": 7.686669529663153, + "learning_rate": 1.3287293876977632e-06, + "loss": 0.6494, + "step": 18424 + }, + { + "epoch": 1.3311178138602417, + "grad_norm": 6.698637057927373, + "learning_rate": 1.3284709818112112e-06, + "loss": 0.6538, + "step": 18425 + }, + { + "epoch": 1.331190059060451, + "grad_norm": 6.499381402730883, + "learning_rate": 1.3282125919619826e-06, + "loss": 0.5729, + "step": 18426 + }, + { + "epoch": 1.3312623042606606, + "grad_norm": 6.852641143625613, + "learning_rate": 1.3279542181536143e-06, + "loss": 0.5338, + "step": 18427 + }, + { + "epoch": 1.3313345494608702, + "grad_norm": 6.944108169890968, + "learning_rate": 1.3276958603896417e-06, + "loss": 0.6433, + "step": 18428 + }, + { + "epoch": 1.3314067946610797, + "grad_norm": 6.79806837808327, + "learning_rate": 1.3274375186736027e-06, + "loss": 0.6527, + "step": 18429 + }, + { + "epoch": 1.3314790398612892, + "grad_norm": 7.100220002540249, + "learning_rate": 1.3271791930090336e-06, + "loss": 0.6187, + "step": 18430 + }, + { + "epoch": 1.3315512850614988, + "grad_norm": 7.53614565700036, + "learning_rate": 1.3269208833994707e-06, + "loss": 0.6265, + "step": 18431 + }, + { + "epoch": 1.3316235302617083, + "grad_norm": 7.009941942788098, + "learning_rate": 1.3266625898484503e-06, + "loss": 0.6414, + "step": 18432 + }, + { + "epoch": 1.3316957754619176, + "grad_norm": 6.733786714353544, + "learning_rate": 1.3264043123595078e-06, + "loss": 0.5774, + "step": 18433 + }, + { + "epoch": 1.3317680206621272, + "grad_norm": 6.592146402770242, + "learning_rate": 1.3261460509361798e-06, + "loss": 0.5891, + "step": 18434 + }, + { + "epoch": 1.3318402658623367, + "grad_norm": 7.111480468191057, + "learning_rate": 1.3258878055820002e-06, + "loss": 0.6342, + "step": 18435 + }, + { + "epoch": 1.3319125110625463, + "grad_norm": 7.604999764848112, + "learning_rate": 1.3256295763005048e-06, + "loss": 0.5912, + "step": 18436 + }, + { + "epoch": 1.3319847562627558, + "grad_norm": 6.473008085828352, + "learning_rate": 1.3253713630952286e-06, + "loss": 0.5685, + "step": 18437 + }, + { + "epoch": 1.3320570014629654, + "grad_norm": 7.55592325038916, + "learning_rate": 1.3251131659697075e-06, + "loss": 0.5801, + "step": 18438 + }, + { + "epoch": 1.332129246663175, + "grad_norm": 5.996594416190339, + "learning_rate": 1.324854984927474e-06, + "loss": 0.6145, + "step": 18439 + }, + { + "epoch": 1.3322014918633842, + "grad_norm": 6.2776837449734835, + "learning_rate": 1.3245968199720633e-06, + "loss": 0.5866, + "step": 18440 + }, + { + "epoch": 1.3322737370635938, + "grad_norm": 6.451406328524243, + "learning_rate": 1.3243386711070094e-06, + "loss": 0.6219, + "step": 18441 + }, + { + "epoch": 1.3323459822638033, + "grad_norm": 7.31578524877216, + "learning_rate": 1.3240805383358463e-06, + "loss": 0.6203, + "step": 18442 + }, + { + "epoch": 1.3324182274640128, + "grad_norm": 7.531491604170545, + "learning_rate": 1.3238224216621077e-06, + "loss": 0.6456, + "step": 18443 + }, + { + "epoch": 1.3324904726642224, + "grad_norm": 5.710436505470296, + "learning_rate": 1.3235643210893264e-06, + "loss": 0.6338, + "step": 18444 + }, + { + "epoch": 1.332562717864432, + "grad_norm": 7.857126206219357, + "learning_rate": 1.3233062366210374e-06, + "loss": 0.7055, + "step": 18445 + }, + { + "epoch": 1.3326349630646415, + "grad_norm": 6.151488655854408, + "learning_rate": 1.3230481682607715e-06, + "loss": 0.584, + "step": 18446 + }, + { + "epoch": 1.3327072082648508, + "grad_norm": 6.231578993458157, + "learning_rate": 1.3227901160120624e-06, + "loss": 0.7433, + "step": 18447 + }, + { + "epoch": 1.3327794534650605, + "grad_norm": 6.284397228776419, + "learning_rate": 1.3225320798784425e-06, + "loss": 0.6425, + "step": 18448 + }, + { + "epoch": 1.3328516986652699, + "grad_norm": 8.878121270431063, + "learning_rate": 1.322274059863444e-06, + "loss": 0.677, + "step": 18449 + }, + { + "epoch": 1.3329239438654794, + "grad_norm": 7.906061826132706, + "learning_rate": 1.322016055970599e-06, + "loss": 0.6313, + "step": 18450 + }, + { + "epoch": 1.332996189065689, + "grad_norm": 6.0518270712966435, + "learning_rate": 1.3217580682034403e-06, + "loss": 0.6045, + "step": 18451 + }, + { + "epoch": 1.3330684342658985, + "grad_norm": 8.040691837132336, + "learning_rate": 1.3215000965654992e-06, + "loss": 0.7514, + "step": 18452 + }, + { + "epoch": 1.333140679466108, + "grad_norm": 8.082333794633273, + "learning_rate": 1.321242141060306e-06, + "loss": 0.7038, + "step": 18453 + }, + { + "epoch": 1.3332129246663174, + "grad_norm": 6.857610453829716, + "learning_rate": 1.3209842016913924e-06, + "loss": 0.5631, + "step": 18454 + }, + { + "epoch": 1.3332851698665271, + "grad_norm": 8.641319426350506, + "learning_rate": 1.3207262784622898e-06, + "loss": 0.6868, + "step": 18455 + }, + { + "epoch": 1.3333574150667364, + "grad_norm": 5.402717627043668, + "learning_rate": 1.32046837137653e-06, + "loss": 0.6061, + "step": 18456 + }, + { + "epoch": 1.333429660266946, + "grad_norm": 7.718769027130521, + "learning_rate": 1.320210480437641e-06, + "loss": 0.669, + "step": 18457 + }, + { + "epoch": 1.3335019054671555, + "grad_norm": 6.538223267438414, + "learning_rate": 1.3199526056491544e-06, + "loss": 0.5249, + "step": 18458 + }, + { + "epoch": 1.333574150667365, + "grad_norm": 6.288087984959482, + "learning_rate": 1.3196947470146005e-06, + "loss": 0.6454, + "step": 18459 + }, + { + "epoch": 1.3336463958675746, + "grad_norm": 7.347481027366438, + "learning_rate": 1.3194369045375092e-06, + "loss": 0.6479, + "step": 18460 + }, + { + "epoch": 1.333718641067784, + "grad_norm": 9.001599911368837, + "learning_rate": 1.3191790782214097e-06, + "loss": 0.5938, + "step": 18461 + }, + { + "epoch": 1.3337908862679937, + "grad_norm": 6.925465360904485, + "learning_rate": 1.3189212680698322e-06, + "loss": 0.6749, + "step": 18462 + }, + { + "epoch": 1.333863131468203, + "grad_norm": 6.803868977033628, + "learning_rate": 1.3186634740863063e-06, + "loss": 0.6809, + "step": 18463 + }, + { + "epoch": 1.3339353766684126, + "grad_norm": 6.581120984442182, + "learning_rate": 1.3184056962743591e-06, + "loss": 0.6549, + "step": 18464 + }, + { + "epoch": 1.334007621868622, + "grad_norm": 7.136713404828321, + "learning_rate": 1.318147934637521e-06, + "loss": 0.6396, + "step": 18465 + }, + { + "epoch": 1.3340798670688316, + "grad_norm": 7.625855945009909, + "learning_rate": 1.3178901891793203e-06, + "loss": 0.5892, + "step": 18466 + }, + { + "epoch": 1.3341521122690412, + "grad_norm": 7.323502308709531, + "learning_rate": 1.3176324599032844e-06, + "loss": 0.5603, + "step": 18467 + }, + { + "epoch": 1.3342243574692505, + "grad_norm": 7.071050009014253, + "learning_rate": 1.3173747468129422e-06, + "loss": 0.6483, + "step": 18468 + }, + { + "epoch": 1.3342966026694603, + "grad_norm": 7.29629886811468, + "learning_rate": 1.3171170499118201e-06, + "loss": 0.6545, + "step": 18469 + }, + { + "epoch": 1.3343688478696696, + "grad_norm": 6.413989049233062, + "learning_rate": 1.3168593692034492e-06, + "loss": 0.5815, + "step": 18470 + }, + { + "epoch": 1.3344410930698791, + "grad_norm": 6.405383539918358, + "learning_rate": 1.3166017046913543e-06, + "loss": 0.612, + "step": 18471 + }, + { + "epoch": 1.3345133382700887, + "grad_norm": 8.005868190026252, + "learning_rate": 1.3163440563790629e-06, + "loss": 0.7089, + "step": 18472 + }, + { + "epoch": 1.3345855834702982, + "grad_norm": 7.516916398032492, + "learning_rate": 1.3160864242701022e-06, + "loss": 0.5987, + "step": 18473 + }, + { + "epoch": 1.3346578286705078, + "grad_norm": 6.614049246144144, + "learning_rate": 1.3158288083680005e-06, + "loss": 0.6259, + "step": 18474 + }, + { + "epoch": 1.3347300738707173, + "grad_norm": 6.689631612582306, + "learning_rate": 1.3155712086762818e-06, + "loss": 0.632, + "step": 18475 + }, + { + "epoch": 1.3348023190709268, + "grad_norm": 6.442272010117529, + "learning_rate": 1.3153136251984733e-06, + "loss": 0.5976, + "step": 18476 + }, + { + "epoch": 1.3348745642711362, + "grad_norm": 6.633352808069111, + "learning_rate": 1.3150560579381016e-06, + "loss": 0.66, + "step": 18477 + }, + { + "epoch": 1.3349468094713457, + "grad_norm": 6.343150744767904, + "learning_rate": 1.3147985068986924e-06, + "loss": 0.6152, + "step": 18478 + }, + { + "epoch": 1.3350190546715552, + "grad_norm": 7.006208935905669, + "learning_rate": 1.3145409720837716e-06, + "loss": 0.6015, + "step": 18479 + }, + { + "epoch": 1.3350912998717648, + "grad_norm": 8.29122473786645, + "learning_rate": 1.3142834534968643e-06, + "loss": 0.7143, + "step": 18480 + }, + { + "epoch": 1.3351635450719743, + "grad_norm": 6.054006703156616, + "learning_rate": 1.314025951141497e-06, + "loss": 0.5946, + "step": 18481 + }, + { + "epoch": 1.3352357902721839, + "grad_norm": 8.150511144043648, + "learning_rate": 1.3137684650211924e-06, + "loss": 0.6947, + "step": 18482 + }, + { + "epoch": 1.3353080354723934, + "grad_norm": 7.557231991350653, + "learning_rate": 1.3135109951394764e-06, + "loss": 0.7017, + "step": 18483 + }, + { + "epoch": 1.3353802806726027, + "grad_norm": 6.1583483842486, + "learning_rate": 1.313253541499875e-06, + "loss": 0.5888, + "step": 18484 + }, + { + "epoch": 1.3354525258728123, + "grad_norm": 5.577152797915692, + "learning_rate": 1.3129961041059096e-06, + "loss": 0.6028, + "step": 18485 + }, + { + "epoch": 1.3355247710730218, + "grad_norm": 7.340390739746638, + "learning_rate": 1.3127386829611061e-06, + "loss": 0.6627, + "step": 18486 + }, + { + "epoch": 1.3355970162732314, + "grad_norm": 5.810539325100137, + "learning_rate": 1.3124812780689883e-06, + "loss": 0.6298, + "step": 18487 + }, + { + "epoch": 1.335669261473441, + "grad_norm": 5.445750849633518, + "learning_rate": 1.31222388943308e-06, + "loss": 0.6066, + "step": 18488 + }, + { + "epoch": 1.3357415066736504, + "grad_norm": 6.834052195380109, + "learning_rate": 1.311966517056904e-06, + "loss": 0.6165, + "step": 18489 + }, + { + "epoch": 1.33581375187386, + "grad_norm": 6.320441962614893, + "learning_rate": 1.3117091609439842e-06, + "loss": 0.6115, + "step": 18490 + }, + { + "epoch": 1.3358859970740693, + "grad_norm": 6.525252951911196, + "learning_rate": 1.311451821097844e-06, + "loss": 0.64, + "step": 18491 + }, + { + "epoch": 1.3359582422742788, + "grad_norm": 5.790596000816992, + "learning_rate": 1.3111944975220047e-06, + "loss": 0.571, + "step": 18492 + }, + { + "epoch": 1.3360304874744884, + "grad_norm": 5.91872835729362, + "learning_rate": 1.3109371902199897e-06, + "loss": 0.5788, + "step": 18493 + }, + { + "epoch": 1.336102732674698, + "grad_norm": 6.598082362090791, + "learning_rate": 1.3106798991953216e-06, + "loss": 0.608, + "step": 18494 + }, + { + "epoch": 1.3361749778749075, + "grad_norm": 5.674520584895979, + "learning_rate": 1.3104226244515229e-06, + "loss": 0.6154, + "step": 18495 + }, + { + "epoch": 1.336247223075117, + "grad_norm": 7.425092589960237, + "learning_rate": 1.3101653659921138e-06, + "loss": 0.5988, + "step": 18496 + }, + { + "epoch": 1.3363194682753265, + "grad_norm": 7.550206051471558, + "learning_rate": 1.3099081238206162e-06, + "loss": 0.6408, + "step": 18497 + }, + { + "epoch": 1.3363917134755359, + "grad_norm": 7.380596042123679, + "learning_rate": 1.309650897940553e-06, + "loss": 0.6686, + "step": 18498 + }, + { + "epoch": 1.3364639586757454, + "grad_norm": 8.230725516580957, + "learning_rate": 1.3093936883554462e-06, + "loss": 0.6221, + "step": 18499 + }, + { + "epoch": 1.336536203875955, + "grad_norm": 6.152051084308819, + "learning_rate": 1.309136495068814e-06, + "loss": 0.57, + "step": 18500 + }, + { + "epoch": 1.3366084490761645, + "grad_norm": 6.826859206275932, + "learning_rate": 1.3088793180841786e-06, + "loss": 0.6791, + "step": 18501 + }, + { + "epoch": 1.336680694276374, + "grad_norm": 5.773919814511981, + "learning_rate": 1.3086221574050618e-06, + "loss": 0.635, + "step": 18502 + }, + { + "epoch": 1.3367529394765836, + "grad_norm": 6.454594110068612, + "learning_rate": 1.3083650130349813e-06, + "loss": 0.667, + "step": 18503 + }, + { + "epoch": 1.3368251846767931, + "grad_norm": 7.43175124137453, + "learning_rate": 1.3081078849774587e-06, + "loss": 0.6646, + "step": 18504 + }, + { + "epoch": 1.3368974298770024, + "grad_norm": 6.478848301301651, + "learning_rate": 1.3078507732360135e-06, + "loss": 0.535, + "step": 18505 + }, + { + "epoch": 1.336969675077212, + "grad_norm": 6.290933157834209, + "learning_rate": 1.3075936778141658e-06, + "loss": 0.6577, + "step": 18506 + }, + { + "epoch": 1.3370419202774215, + "grad_norm": 7.912418416884778, + "learning_rate": 1.3073365987154347e-06, + "loss": 0.6421, + "step": 18507 + }, + { + "epoch": 1.337114165477631, + "grad_norm": 8.937899920712683, + "learning_rate": 1.3070795359433395e-06, + "loss": 0.6744, + "step": 18508 + }, + { + "epoch": 1.3371864106778406, + "grad_norm": 5.74663943066674, + "learning_rate": 1.3068224895014004e-06, + "loss": 0.6102, + "step": 18509 + }, + { + "epoch": 1.3372586558780502, + "grad_norm": 6.83922772381168, + "learning_rate": 1.3065654593931337e-06, + "loss": 0.6111, + "step": 18510 + }, + { + "epoch": 1.3373309010782597, + "grad_norm": 7.5542572751075685, + "learning_rate": 1.3063084456220592e-06, + "loss": 0.633, + "step": 18511 + }, + { + "epoch": 1.337403146278469, + "grad_norm": 7.159296045488227, + "learning_rate": 1.3060514481916953e-06, + "loss": 0.6465, + "step": 18512 + }, + { + "epoch": 1.3374753914786786, + "grad_norm": 9.174226683579342, + "learning_rate": 1.305794467105561e-06, + "loss": 0.5897, + "step": 18513 + }, + { + "epoch": 1.337547636678888, + "grad_norm": 6.701379081793567, + "learning_rate": 1.3055375023671724e-06, + "loss": 0.5748, + "step": 18514 + }, + { + "epoch": 1.3376198818790976, + "grad_norm": 7.066249695957892, + "learning_rate": 1.305280553980048e-06, + "loss": 0.633, + "step": 18515 + }, + { + "epoch": 1.3376921270793072, + "grad_norm": 6.065456023655793, + "learning_rate": 1.305023621947705e-06, + "loss": 0.6457, + "step": 18516 + }, + { + "epoch": 1.3377643722795167, + "grad_norm": 6.007298163532916, + "learning_rate": 1.3047667062736608e-06, + "loss": 0.6485, + "step": 18517 + }, + { + "epoch": 1.3378366174797263, + "grad_norm": 6.652664293326925, + "learning_rate": 1.3045098069614323e-06, + "loss": 0.5928, + "step": 18518 + }, + { + "epoch": 1.3379088626799356, + "grad_norm": 6.789910337055557, + "learning_rate": 1.3042529240145362e-06, + "loss": 0.6068, + "step": 18519 + }, + { + "epoch": 1.3379811078801453, + "grad_norm": 6.471251662815788, + "learning_rate": 1.3039960574364904e-06, + "loss": 0.616, + "step": 18520 + }, + { + "epoch": 1.3380533530803547, + "grad_norm": 6.601138346211228, + "learning_rate": 1.3037392072308088e-06, + "loss": 0.5904, + "step": 18521 + }, + { + "epoch": 1.3381255982805642, + "grad_norm": 6.515934120365054, + "learning_rate": 1.303482373401009e-06, + "loss": 0.6165, + "step": 18522 + }, + { + "epoch": 1.3381978434807738, + "grad_norm": 8.175584815336082, + "learning_rate": 1.3032255559506068e-06, + "loss": 0.6432, + "step": 18523 + }, + { + "epoch": 1.3382700886809833, + "grad_norm": 6.295881396484034, + "learning_rate": 1.3029687548831168e-06, + "loss": 0.5967, + "step": 18524 + }, + { + "epoch": 1.3383423338811928, + "grad_norm": 8.398433412062959, + "learning_rate": 1.302711970202054e-06, + "loss": 0.6728, + "step": 18525 + }, + { + "epoch": 1.3384145790814022, + "grad_norm": 5.92321633318726, + "learning_rate": 1.302455201910936e-06, + "loss": 0.5562, + "step": 18526 + }, + { + "epoch": 1.338486824281612, + "grad_norm": 8.419427349705717, + "learning_rate": 1.3021984500132773e-06, + "loss": 0.6676, + "step": 18527 + }, + { + "epoch": 1.3385590694818212, + "grad_norm": 7.561749886126361, + "learning_rate": 1.3019417145125906e-06, + "loss": 0.6327, + "step": 18528 + }, + { + "epoch": 1.3386313146820308, + "grad_norm": 6.841704598560569, + "learning_rate": 1.301684995412392e-06, + "loss": 0.6061, + "step": 18529 + }, + { + "epoch": 1.3387035598822403, + "grad_norm": 6.1182543919159835, + "learning_rate": 1.3014282927161952e-06, + "loss": 0.7084, + "step": 18530 + }, + { + "epoch": 1.3387758050824499, + "grad_norm": 6.980960292832743, + "learning_rate": 1.3011716064275154e-06, + "loss": 0.6677, + "step": 18531 + }, + { + "epoch": 1.3388480502826594, + "grad_norm": 7.110552948000483, + "learning_rate": 1.3009149365498644e-06, + "loss": 0.5938, + "step": 18532 + }, + { + "epoch": 1.3389202954828687, + "grad_norm": 6.416990032552713, + "learning_rate": 1.300658283086757e-06, + "loss": 0.6392, + "step": 18533 + }, + { + "epoch": 1.3389925406830785, + "grad_norm": 7.392029578393325, + "learning_rate": 1.3004016460417063e-06, + "loss": 0.6636, + "step": 18534 + }, + { + "epoch": 1.3390647858832878, + "grad_norm": 6.960528394876977, + "learning_rate": 1.3001450254182257e-06, + "loss": 0.6443, + "step": 18535 + }, + { + "epoch": 1.3391370310834974, + "grad_norm": 7.372250577860445, + "learning_rate": 1.2998884212198282e-06, + "loss": 0.6844, + "step": 18536 + }, + { + "epoch": 1.339209276283707, + "grad_norm": 6.120785255667453, + "learning_rate": 1.2996318334500263e-06, + "loss": 0.6222, + "step": 18537 + }, + { + "epoch": 1.3392815214839164, + "grad_norm": 7.90952471437288, + "learning_rate": 1.2993752621123338e-06, + "loss": 0.7206, + "step": 18538 + }, + { + "epoch": 1.339353766684126, + "grad_norm": 6.5130536977088, + "learning_rate": 1.2991187072102604e-06, + "loss": 0.5939, + "step": 18539 + }, + { + "epoch": 1.3394260118843353, + "grad_norm": 6.297104899940099, + "learning_rate": 1.2988621687473195e-06, + "loss": 0.6058, + "step": 18540 + }, + { + "epoch": 1.339498257084545, + "grad_norm": 7.273556553637625, + "learning_rate": 1.2986056467270241e-06, + "loss": 0.6292, + "step": 18541 + }, + { + "epoch": 1.3395705022847544, + "grad_norm": 8.296272672639748, + "learning_rate": 1.2983491411528837e-06, + "loss": 0.738, + "step": 18542 + }, + { + "epoch": 1.339642747484964, + "grad_norm": 6.953625333122105, + "learning_rate": 1.29809265202841e-06, + "loss": 0.5735, + "step": 18543 + }, + { + "epoch": 1.3397149926851735, + "grad_norm": 5.860948681903323, + "learning_rate": 1.2978361793571152e-06, + "loss": 0.5922, + "step": 18544 + }, + { + "epoch": 1.339787237885383, + "grad_norm": 6.995785670736269, + "learning_rate": 1.2975797231425094e-06, + "loss": 0.5836, + "step": 18545 + }, + { + "epoch": 1.3398594830855926, + "grad_norm": 6.472930883959416, + "learning_rate": 1.2973232833881036e-06, + "loss": 0.6671, + "step": 18546 + }, + { + "epoch": 1.339931728285802, + "grad_norm": 8.016537263054504, + "learning_rate": 1.2970668600974085e-06, + "loss": 0.6073, + "step": 18547 + }, + { + "epoch": 1.3400039734860116, + "grad_norm": 6.414054173639541, + "learning_rate": 1.2968104532739346e-06, + "loss": 0.6573, + "step": 18548 + }, + { + "epoch": 1.340076218686221, + "grad_norm": 7.959061061747117, + "learning_rate": 1.2965540629211907e-06, + "loss": 0.5595, + "step": 18549 + }, + { + "epoch": 1.3401484638864305, + "grad_norm": 8.255195600016467, + "learning_rate": 1.296297689042687e-06, + "loss": 0.7061, + "step": 18550 + }, + { + "epoch": 1.34022070908664, + "grad_norm": 6.582813322880764, + "learning_rate": 1.2960413316419337e-06, + "loss": 0.624, + "step": 18551 + }, + { + "epoch": 1.3402929542868496, + "grad_norm": 6.704062229605119, + "learning_rate": 1.2957849907224407e-06, + "loss": 0.6022, + "step": 18552 + }, + { + "epoch": 1.3403651994870591, + "grad_norm": 7.734022591008667, + "learning_rate": 1.2955286662877148e-06, + "loss": 0.6457, + "step": 18553 + }, + { + "epoch": 1.3404374446872687, + "grad_norm": 6.395592435427515, + "learning_rate": 1.2952723583412658e-06, + "loss": 0.5756, + "step": 18554 + }, + { + "epoch": 1.3405096898874782, + "grad_norm": 6.924183479950085, + "learning_rate": 1.2950160668866033e-06, + "loss": 0.617, + "step": 18555 + }, + { + "epoch": 1.3405819350876875, + "grad_norm": 7.932012628634697, + "learning_rate": 1.2947597919272364e-06, + "loss": 0.6404, + "step": 18556 + }, + { + "epoch": 1.340654180287897, + "grad_norm": 6.465725841445667, + "learning_rate": 1.2945035334666712e-06, + "loss": 0.626, + "step": 18557 + }, + { + "epoch": 1.3407264254881066, + "grad_norm": 7.000086647587334, + "learning_rate": 1.2942472915084164e-06, + "loss": 0.6745, + "step": 18558 + }, + { + "epoch": 1.3407986706883162, + "grad_norm": 6.529800181113771, + "learning_rate": 1.2939910660559813e-06, + "loss": 0.6726, + "step": 18559 + }, + { + "epoch": 1.3408709158885257, + "grad_norm": 7.270750932490705, + "learning_rate": 1.2937348571128709e-06, + "loss": 0.6792, + "step": 18560 + }, + { + "epoch": 1.3409431610887352, + "grad_norm": 6.576085152661771, + "learning_rate": 1.2934786646825937e-06, + "loss": 0.6522, + "step": 18561 + }, + { + "epoch": 1.3410154062889448, + "grad_norm": 5.523959770492587, + "learning_rate": 1.2932224887686568e-06, + "loss": 0.577, + "step": 18562 + }, + { + "epoch": 1.341087651489154, + "grad_norm": 6.858803275575622, + "learning_rate": 1.292966329374567e-06, + "loss": 0.6013, + "step": 18563 + }, + { + "epoch": 1.3411598966893636, + "grad_norm": 7.433409263097592, + "learning_rate": 1.292710186503831e-06, + "loss": 0.6353, + "step": 18564 + }, + { + "epoch": 1.3412321418895732, + "grad_norm": 7.161096522583795, + "learning_rate": 1.292454060159955e-06, + "loss": 0.6546, + "step": 18565 + }, + { + "epoch": 1.3413043870897827, + "grad_norm": 8.625433869437245, + "learning_rate": 1.2921979503464465e-06, + "loss": 0.7818, + "step": 18566 + }, + { + "epoch": 1.3413766322899923, + "grad_norm": 7.1829126729031545, + "learning_rate": 1.2919418570668093e-06, + "loss": 0.6495, + "step": 18567 + }, + { + "epoch": 1.3414488774902018, + "grad_norm": 6.87656592828604, + "learning_rate": 1.2916857803245503e-06, + "loss": 0.6393, + "step": 18568 + }, + { + "epoch": 1.3415211226904113, + "grad_norm": 6.54929032303131, + "learning_rate": 1.2914297201231743e-06, + "loss": 0.5683, + "step": 18569 + }, + { + "epoch": 1.3415933678906207, + "grad_norm": 6.25042479025176, + "learning_rate": 1.2911736764661881e-06, + "loss": 0.6671, + "step": 18570 + }, + { + "epoch": 1.3416656130908302, + "grad_norm": 6.443762242539595, + "learning_rate": 1.2909176493570949e-06, + "loss": 0.606, + "step": 18571 + }, + { + "epoch": 1.3417378582910398, + "grad_norm": 6.282015777519498, + "learning_rate": 1.2906616387994004e-06, + "loss": 0.7267, + "step": 18572 + }, + { + "epoch": 1.3418101034912493, + "grad_norm": 5.849364169526816, + "learning_rate": 1.2904056447966085e-06, + "loss": 0.6468, + "step": 18573 + }, + { + "epoch": 1.3418823486914588, + "grad_norm": 5.9100319999951605, + "learning_rate": 1.2901496673522247e-06, + "loss": 0.5939, + "step": 18574 + }, + { + "epoch": 1.3419545938916684, + "grad_norm": 7.50081909793141, + "learning_rate": 1.2898937064697524e-06, + "loss": 0.6685, + "step": 18575 + }, + { + "epoch": 1.342026839091878, + "grad_norm": 8.605556963772786, + "learning_rate": 1.2896377621526956e-06, + "loss": 0.7219, + "step": 18576 + }, + { + "epoch": 1.3420990842920872, + "grad_norm": 6.483225083938832, + "learning_rate": 1.2893818344045589e-06, + "loss": 0.6281, + "step": 18577 + }, + { + "epoch": 1.3421713294922968, + "grad_norm": 7.144015719594033, + "learning_rate": 1.2891259232288441e-06, + "loss": 0.6672, + "step": 18578 + }, + { + "epoch": 1.3422435746925063, + "grad_norm": 6.291294853221854, + "learning_rate": 1.2888700286290551e-06, + "loss": 0.567, + "step": 18579 + }, + { + "epoch": 1.3423158198927159, + "grad_norm": 8.258287371043368, + "learning_rate": 1.2886141506086951e-06, + "loss": 0.6888, + "step": 18580 + }, + { + "epoch": 1.3423880650929254, + "grad_norm": 6.247573992049459, + "learning_rate": 1.2883582891712682e-06, + "loss": 0.6453, + "step": 18581 + }, + { + "epoch": 1.342460310293135, + "grad_norm": 7.362179750171014, + "learning_rate": 1.288102444320273e-06, + "loss": 0.6475, + "step": 18582 + }, + { + "epoch": 1.3425325554933445, + "grad_norm": 7.7460294827610054, + "learning_rate": 1.2878466160592157e-06, + "loss": 0.6558, + "step": 18583 + }, + { + "epoch": 1.3426048006935538, + "grad_norm": 6.059446844154086, + "learning_rate": 1.2875908043915978e-06, + "loss": 0.6045, + "step": 18584 + }, + { + "epoch": 1.3426770458937634, + "grad_norm": 6.651077768282095, + "learning_rate": 1.2873350093209194e-06, + "loss": 0.6204, + "step": 18585 + }, + { + "epoch": 1.342749291093973, + "grad_norm": 6.859728688916704, + "learning_rate": 1.2870792308506832e-06, + "loss": 0.6999, + "step": 18586 + }, + { + "epoch": 1.3428215362941824, + "grad_norm": 6.623998890217859, + "learning_rate": 1.2868234689843907e-06, + "loss": 0.6226, + "step": 18587 + }, + { + "epoch": 1.342893781494392, + "grad_norm": 7.27308111413741, + "learning_rate": 1.2865677237255437e-06, + "loss": 0.6603, + "step": 18588 + }, + { + "epoch": 1.3429660266946015, + "grad_norm": 6.50941782702396, + "learning_rate": 1.2863119950776414e-06, + "loss": 0.6399, + "step": 18589 + }, + { + "epoch": 1.343038271894811, + "grad_norm": 6.774459040254501, + "learning_rate": 1.2860562830441859e-06, + "loss": 0.6156, + "step": 18590 + }, + { + "epoch": 1.3431105170950204, + "grad_norm": 6.436169829485127, + "learning_rate": 1.2858005876286771e-06, + "loss": 0.6539, + "step": 18591 + }, + { + "epoch": 1.3431827622952301, + "grad_norm": 7.800727702698665, + "learning_rate": 1.2855449088346155e-06, + "loss": 0.6419, + "step": 18592 + }, + { + "epoch": 1.3432550074954395, + "grad_norm": 8.391229646800971, + "learning_rate": 1.2852892466655008e-06, + "loss": 0.6285, + "step": 18593 + }, + { + "epoch": 1.343327252695649, + "grad_norm": 7.904340068528511, + "learning_rate": 1.2850336011248338e-06, + "loss": 0.6008, + "step": 18594 + }, + { + "epoch": 1.3433994978958586, + "grad_norm": 6.3129187718843776, + "learning_rate": 1.2847779722161138e-06, + "loss": 0.6913, + "step": 18595 + }, + { + "epoch": 1.343471743096068, + "grad_norm": 7.08292642060381, + "learning_rate": 1.2845223599428392e-06, + "loss": 0.5871, + "step": 18596 + }, + { + "epoch": 1.3435439882962776, + "grad_norm": 7.302862353452022, + "learning_rate": 1.2842667643085096e-06, + "loss": 0.687, + "step": 18597 + }, + { + "epoch": 1.343616233496487, + "grad_norm": 7.435479923605197, + "learning_rate": 1.284011185316625e-06, + "loss": 0.6868, + "step": 18598 + }, + { + "epoch": 1.3436884786966967, + "grad_norm": 6.2116333385761076, + "learning_rate": 1.283755622970682e-06, + "loss": 0.6497, + "step": 18599 + }, + { + "epoch": 1.343760723896906, + "grad_norm": 5.698543349073475, + "learning_rate": 1.2835000772741808e-06, + "loss": 0.6204, + "step": 18600 + }, + { + "epoch": 1.3438329690971156, + "grad_norm": 8.099230468364727, + "learning_rate": 1.2832445482306184e-06, + "loss": 0.7095, + "step": 18601 + }, + { + "epoch": 1.3439052142973251, + "grad_norm": 6.396686244429062, + "learning_rate": 1.2829890358434936e-06, + "loss": 0.5949, + "step": 18602 + }, + { + "epoch": 1.3439774594975347, + "grad_norm": 7.047145119372532, + "learning_rate": 1.282733540116304e-06, + "loss": 0.5847, + "step": 18603 + }, + { + "epoch": 1.3440497046977442, + "grad_norm": 5.980539551956791, + "learning_rate": 1.2824780610525467e-06, + "loss": 0.6434, + "step": 18604 + }, + { + "epoch": 1.3441219498979535, + "grad_norm": 6.508174744194916, + "learning_rate": 1.2822225986557195e-06, + "loss": 0.6164, + "step": 18605 + }, + { + "epoch": 1.3441941950981633, + "grad_norm": 7.317481797446239, + "learning_rate": 1.281967152929321e-06, + "loss": 0.6943, + "step": 18606 + }, + { + "epoch": 1.3442664402983726, + "grad_norm": 6.4375112589024726, + "learning_rate": 1.2817117238768447e-06, + "loss": 0.617, + "step": 18607 + }, + { + "epoch": 1.3443386854985822, + "grad_norm": 6.962143302286464, + "learning_rate": 1.2814563115017893e-06, + "loss": 0.7545, + "step": 18608 + }, + { + "epoch": 1.3444109306987917, + "grad_norm": 7.1716776635773245, + "learning_rate": 1.2812009158076516e-06, + "loss": 0.6998, + "step": 18609 + }, + { + "epoch": 1.3444831758990012, + "grad_norm": 7.084072687793884, + "learning_rate": 1.2809455367979255e-06, + "loss": 0.6384, + "step": 18610 + }, + { + "epoch": 1.3445554210992108, + "grad_norm": 6.11073104755453, + "learning_rate": 1.2806901744761092e-06, + "loss": 0.6686, + "step": 18611 + }, + { + "epoch": 1.34462766629942, + "grad_norm": 7.223181219090943, + "learning_rate": 1.2804348288456976e-06, + "loss": 0.6375, + "step": 18612 + }, + { + "epoch": 1.3446999114996299, + "grad_norm": 7.601872625033029, + "learning_rate": 1.2801794999101874e-06, + "loss": 0.6124, + "step": 18613 + }, + { + "epoch": 1.3447721566998392, + "grad_norm": 9.121609410610942, + "learning_rate": 1.2799241876730716e-06, + "loss": 0.6725, + "step": 18614 + }, + { + "epoch": 1.3448444019000487, + "grad_norm": 6.629145638724047, + "learning_rate": 1.2796688921378464e-06, + "loss": 0.7125, + "step": 18615 + }, + { + "epoch": 1.3449166471002583, + "grad_norm": 7.402239135969212, + "learning_rate": 1.2794136133080076e-06, + "loss": 0.665, + "step": 18616 + }, + { + "epoch": 1.3449888923004678, + "grad_norm": 7.117696398432829, + "learning_rate": 1.2791583511870475e-06, + "loss": 0.7193, + "step": 18617 + }, + { + "epoch": 1.3450611375006774, + "grad_norm": 6.260423437063033, + "learning_rate": 1.2789031057784617e-06, + "loss": 0.5433, + "step": 18618 + }, + { + "epoch": 1.345133382700887, + "grad_norm": 7.002802696454381, + "learning_rate": 1.2786478770857446e-06, + "loss": 0.6334, + "step": 18619 + }, + { + "epoch": 1.3452056279010964, + "grad_norm": 8.679528840105924, + "learning_rate": 1.2783926651123895e-06, + "loss": 0.6647, + "step": 18620 + }, + { + "epoch": 1.3452778731013058, + "grad_norm": 6.953939120175656, + "learning_rate": 1.2781374698618903e-06, + "loss": 0.6351, + "step": 18621 + }, + { + "epoch": 1.3453501183015153, + "grad_norm": 6.512811506146912, + "learning_rate": 1.2778822913377403e-06, + "loss": 0.5747, + "step": 18622 + }, + { + "epoch": 1.3454223635017248, + "grad_norm": 6.513718726405717, + "learning_rate": 1.277627129543434e-06, + "loss": 0.5982, + "step": 18623 + }, + { + "epoch": 1.3454946087019344, + "grad_norm": 8.138320098664117, + "learning_rate": 1.2773719844824622e-06, + "loss": 0.6568, + "step": 18624 + }, + { + "epoch": 1.345566853902144, + "grad_norm": 6.602489302546001, + "learning_rate": 1.2771168561583187e-06, + "loss": 0.6255, + "step": 18625 + }, + { + "epoch": 1.3456390991023535, + "grad_norm": 6.199049526085065, + "learning_rate": 1.2768617445744954e-06, + "loss": 0.649, + "step": 18626 + }, + { + "epoch": 1.345711344302563, + "grad_norm": 6.219838722812109, + "learning_rate": 1.2766066497344865e-06, + "loss": 0.6616, + "step": 18627 + }, + { + "epoch": 1.3457835895027723, + "grad_norm": 6.160890331082908, + "learning_rate": 1.2763515716417818e-06, + "loss": 0.6646, + "step": 18628 + }, + { + "epoch": 1.3458558347029819, + "grad_norm": 6.395889166009442, + "learning_rate": 1.2760965102998738e-06, + "loss": 0.5992, + "step": 18629 + }, + { + "epoch": 1.3459280799031914, + "grad_norm": 7.042559264302356, + "learning_rate": 1.2758414657122542e-06, + "loss": 0.62, + "step": 18630 + }, + { + "epoch": 1.346000325103401, + "grad_norm": 7.5056957233980865, + "learning_rate": 1.2755864378824145e-06, + "loss": 0.6268, + "step": 18631 + }, + { + "epoch": 1.3460725703036105, + "grad_norm": 9.44538277346687, + "learning_rate": 1.2753314268138462e-06, + "loss": 0.6297, + "step": 18632 + }, + { + "epoch": 1.34614481550382, + "grad_norm": 6.772702222292495, + "learning_rate": 1.275076432510039e-06, + "loss": 0.6409, + "step": 18633 + }, + { + "epoch": 1.3462170607040296, + "grad_norm": 6.6193498233391646, + "learning_rate": 1.2748214549744857e-06, + "loss": 0.5889, + "step": 18634 + }, + { + "epoch": 1.346289305904239, + "grad_norm": 7.680227173385854, + "learning_rate": 1.2745664942106745e-06, + "loss": 0.6266, + "step": 18635 + }, + { + "epoch": 1.3463615511044484, + "grad_norm": 6.928445216561684, + "learning_rate": 1.2743115502220965e-06, + "loss": 0.6167, + "step": 18636 + }, + { + "epoch": 1.346433796304658, + "grad_norm": 7.111793458838064, + "learning_rate": 1.2740566230122416e-06, + "loss": 0.6342, + "step": 18637 + }, + { + "epoch": 1.3465060415048675, + "grad_norm": 7.86188880958621, + "learning_rate": 1.2738017125845997e-06, + "loss": 0.5887, + "step": 18638 + }, + { + "epoch": 1.346578286705077, + "grad_norm": 6.756963175945536, + "learning_rate": 1.2735468189426604e-06, + "loss": 0.616, + "step": 18639 + }, + { + "epoch": 1.3466505319052866, + "grad_norm": 6.938698330045835, + "learning_rate": 1.2732919420899125e-06, + "loss": 0.6704, + "step": 18640 + }, + { + "epoch": 1.3467227771054961, + "grad_norm": 7.629448515434009, + "learning_rate": 1.2730370820298469e-06, + "loss": 0.541, + "step": 18641 + }, + { + "epoch": 1.3467950223057055, + "grad_norm": 7.433276090986007, + "learning_rate": 1.2727822387659495e-06, + "loss": 0.7149, + "step": 18642 + }, + { + "epoch": 1.346867267505915, + "grad_norm": 7.240900840987513, + "learning_rate": 1.2725274123017106e-06, + "loss": 0.631, + "step": 18643 + }, + { + "epoch": 1.3469395127061246, + "grad_norm": 6.461362887012771, + "learning_rate": 1.2722726026406184e-06, + "loss": 0.6164, + "step": 18644 + }, + { + "epoch": 1.347011757906334, + "grad_norm": 7.036324123982236, + "learning_rate": 1.272017809786162e-06, + "loss": 0.6492, + "step": 18645 + }, + { + "epoch": 1.3470840031065436, + "grad_norm": 6.530854281616232, + "learning_rate": 1.2717630337418273e-06, + "loss": 0.6509, + "step": 18646 + }, + { + "epoch": 1.3471562483067532, + "grad_norm": 6.57396082335318, + "learning_rate": 1.2715082745111028e-06, + "loss": 0.6614, + "step": 18647 + }, + { + "epoch": 1.3472284935069627, + "grad_norm": 6.961810706927602, + "learning_rate": 1.2712535320974768e-06, + "loss": 0.6283, + "step": 18648 + }, + { + "epoch": 1.347300738707172, + "grad_norm": 6.206659727219015, + "learning_rate": 1.2709988065044351e-06, + "loss": 0.5839, + "step": 18649 + }, + { + "epoch": 1.3473729839073816, + "grad_norm": 7.592719365261641, + "learning_rate": 1.2707440977354657e-06, + "loss": 0.6194, + "step": 18650 + }, + { + "epoch": 1.3474452291075911, + "grad_norm": 6.621327948015112, + "learning_rate": 1.2704894057940554e-06, + "loss": 0.6509, + "step": 18651 + }, + { + "epoch": 1.3475174743078007, + "grad_norm": 7.3439162215230915, + "learning_rate": 1.2702347306836914e-06, + "loss": 0.6703, + "step": 18652 + }, + { + "epoch": 1.3475897195080102, + "grad_norm": 6.969913402659762, + "learning_rate": 1.269980072407858e-06, + "loss": 0.5899, + "step": 18653 + }, + { + "epoch": 1.3476619647082198, + "grad_norm": 8.946293273198577, + "learning_rate": 1.2697254309700425e-06, + "loss": 0.6454, + "step": 18654 + }, + { + "epoch": 1.3477342099084293, + "grad_norm": 6.433800087765856, + "learning_rate": 1.2694708063737313e-06, + "loss": 0.636, + "step": 18655 + }, + { + "epoch": 1.3478064551086386, + "grad_norm": 7.251931558007252, + "learning_rate": 1.2692161986224082e-06, + "loss": 0.6057, + "step": 18656 + }, + { + "epoch": 1.3478787003088482, + "grad_norm": 7.418528748115834, + "learning_rate": 1.2689616077195604e-06, + "loss": 0.6955, + "step": 18657 + }, + { + "epoch": 1.3479509455090577, + "grad_norm": 7.335099209957686, + "learning_rate": 1.268707033668672e-06, + "loss": 0.5967, + "step": 18658 + }, + { + "epoch": 1.3480231907092672, + "grad_norm": 7.513798545283309, + "learning_rate": 1.2684524764732282e-06, + "loss": 0.7004, + "step": 18659 + }, + { + "epoch": 1.3480954359094768, + "grad_norm": 7.57103959598448, + "learning_rate": 1.2681979361367136e-06, + "loss": 0.5856, + "step": 18660 + }, + { + "epoch": 1.3481676811096863, + "grad_norm": 7.683315471813213, + "learning_rate": 1.267943412662613e-06, + "loss": 0.665, + "step": 18661 + }, + { + "epoch": 1.3482399263098959, + "grad_norm": 6.982345934901727, + "learning_rate": 1.2676889060544106e-06, + "loss": 0.6188, + "step": 18662 + }, + { + "epoch": 1.3483121715101052, + "grad_norm": 6.209440845126948, + "learning_rate": 1.267434416315591e-06, + "loss": 0.6661, + "step": 18663 + }, + { + "epoch": 1.3483844167103147, + "grad_norm": 6.6070858573664175, + "learning_rate": 1.2671799434496363e-06, + "loss": 0.6706, + "step": 18664 + }, + { + "epoch": 1.3484566619105243, + "grad_norm": 6.514689938712248, + "learning_rate": 1.2669254874600311e-06, + "loss": 0.632, + "step": 18665 + }, + { + "epoch": 1.3485289071107338, + "grad_norm": 7.515535413561048, + "learning_rate": 1.2666710483502592e-06, + "loss": 0.6063, + "step": 18666 + }, + { + "epoch": 1.3486011523109434, + "grad_norm": 7.262667671408507, + "learning_rate": 1.2664166261238015e-06, + "loss": 0.5835, + "step": 18667 + }, + { + "epoch": 1.348673397511153, + "grad_norm": 6.425547642505152, + "learning_rate": 1.266162220784143e-06, + "loss": 0.518, + "step": 18668 + }, + { + "epoch": 1.3487456427113624, + "grad_norm": 7.041271612458233, + "learning_rate": 1.265907832334766e-06, + "loss": 0.6453, + "step": 18669 + }, + { + "epoch": 1.3488178879115718, + "grad_norm": 6.868145386567279, + "learning_rate": 1.2656534607791532e-06, + "loss": 0.5628, + "step": 18670 + }, + { + "epoch": 1.3488901331117815, + "grad_norm": 6.609898467560969, + "learning_rate": 1.2653991061207853e-06, + "loss": 0.5833, + "step": 18671 + }, + { + "epoch": 1.3489623783119908, + "grad_norm": 7.4352318647154565, + "learning_rate": 1.2651447683631452e-06, + "loss": 0.5332, + "step": 18672 + }, + { + "epoch": 1.3490346235122004, + "grad_norm": 5.915138297959261, + "learning_rate": 1.2648904475097151e-06, + "loss": 0.6073, + "step": 18673 + }, + { + "epoch": 1.34910686871241, + "grad_norm": 6.06469466608355, + "learning_rate": 1.2646361435639747e-06, + "loss": 0.6918, + "step": 18674 + }, + { + "epoch": 1.3491791139126195, + "grad_norm": 7.0805457819688336, + "learning_rate": 1.264381856529407e-06, + "loss": 0.6718, + "step": 18675 + }, + { + "epoch": 1.349251359112829, + "grad_norm": 6.543202246300074, + "learning_rate": 1.2641275864094915e-06, + "loss": 0.6677, + "step": 18676 + }, + { + "epoch": 1.3493236043130383, + "grad_norm": 7.5578485481328395, + "learning_rate": 1.2638733332077103e-06, + "loss": 0.6484, + "step": 18677 + }, + { + "epoch": 1.349395849513248, + "grad_norm": 6.743132594626974, + "learning_rate": 1.263619096927543e-06, + "loss": 0.7328, + "step": 18678 + }, + { + "epoch": 1.3494680947134574, + "grad_norm": 7.233912541745379, + "learning_rate": 1.2633648775724704e-06, + "loss": 0.5879, + "step": 18679 + }, + { + "epoch": 1.349540339913667, + "grad_norm": 8.739467712534758, + "learning_rate": 1.263110675145973e-06, + "loss": 0.5874, + "step": 18680 + }, + { + "epoch": 1.3496125851138765, + "grad_norm": 6.495105074061212, + "learning_rate": 1.26285648965153e-06, + "loss": 0.6343, + "step": 18681 + }, + { + "epoch": 1.349684830314086, + "grad_norm": 7.96993734452378, + "learning_rate": 1.2626023210926203e-06, + "loss": 0.5738, + "step": 18682 + }, + { + "epoch": 1.3497570755142956, + "grad_norm": 6.66460056078124, + "learning_rate": 1.2623481694727247e-06, + "loss": 0.6695, + "step": 18683 + }, + { + "epoch": 1.349829320714505, + "grad_norm": 6.961935363557152, + "learning_rate": 1.262094034795322e-06, + "loss": 0.6455, + "step": 18684 + }, + { + "epoch": 1.3499015659147147, + "grad_norm": 6.150998116731239, + "learning_rate": 1.26183991706389e-06, + "loss": 0.588, + "step": 18685 + }, + { + "epoch": 1.349973811114924, + "grad_norm": 8.58588374002164, + "learning_rate": 1.2615858162819084e-06, + "loss": 0.6306, + "step": 18686 + }, + { + "epoch": 1.3500460563151335, + "grad_norm": 6.260257390979891, + "learning_rate": 1.2613317324528545e-06, + "loss": 0.6145, + "step": 18687 + }, + { + "epoch": 1.350118301515343, + "grad_norm": 6.372462945877523, + "learning_rate": 1.2610776655802094e-06, + "loss": 0.6225, + "step": 18688 + }, + { + "epoch": 1.3501905467155526, + "grad_norm": 7.2208101049520295, + "learning_rate": 1.2608236156674474e-06, + "loss": 0.6719, + "step": 18689 + }, + { + "epoch": 1.3502627919157622, + "grad_norm": 8.560293539843201, + "learning_rate": 1.2605695827180487e-06, + "loss": 0.6519, + "step": 18690 + }, + { + "epoch": 1.3503350371159715, + "grad_norm": 7.098623878472332, + "learning_rate": 1.2603155667354907e-06, + "loss": 0.5971, + "step": 18691 + }, + { + "epoch": 1.3504072823161812, + "grad_norm": 6.55619901886384, + "learning_rate": 1.260061567723249e-06, + "loss": 0.5727, + "step": 18692 + }, + { + "epoch": 1.3504795275163906, + "grad_norm": 6.549735598287209, + "learning_rate": 1.259807585684802e-06, + "loss": 0.7077, + "step": 18693 + }, + { + "epoch": 1.3505517727166, + "grad_norm": 7.389052629050818, + "learning_rate": 1.2595536206236262e-06, + "loss": 0.59, + "step": 18694 + }, + { + "epoch": 1.3506240179168096, + "grad_norm": 6.745809031068296, + "learning_rate": 1.259299672543198e-06, + "loss": 0.6254, + "step": 18695 + }, + { + "epoch": 1.3506962631170192, + "grad_norm": 7.383304123429614, + "learning_rate": 1.2590457414469938e-06, + "loss": 0.6458, + "step": 18696 + }, + { + "epoch": 1.3507685083172287, + "grad_norm": 6.323975057071378, + "learning_rate": 1.2587918273384903e-06, + "loss": 0.6102, + "step": 18697 + }, + { + "epoch": 1.3508407535174383, + "grad_norm": 7.2984541628007715, + "learning_rate": 1.2585379302211637e-06, + "loss": 0.6406, + "step": 18698 + }, + { + "epoch": 1.3509129987176478, + "grad_norm": 8.813888115823897, + "learning_rate": 1.2582840500984877e-06, + "loss": 0.7165, + "step": 18699 + }, + { + "epoch": 1.3509852439178571, + "grad_norm": 7.752793823696972, + "learning_rate": 1.2580301869739391e-06, + "loss": 0.695, + "step": 18700 + }, + { + "epoch": 1.3510574891180667, + "grad_norm": 6.394217155446463, + "learning_rate": 1.257776340850993e-06, + "loss": 0.5783, + "step": 18701 + }, + { + "epoch": 1.3511297343182762, + "grad_norm": 8.307266308657091, + "learning_rate": 1.2575225117331253e-06, + "loss": 0.6875, + "step": 18702 + }, + { + "epoch": 1.3512019795184858, + "grad_norm": 6.7654113030988485, + "learning_rate": 1.2572686996238085e-06, + "loss": 0.6506, + "step": 18703 + }, + { + "epoch": 1.3512742247186953, + "grad_norm": 6.471259915582091, + "learning_rate": 1.2570149045265185e-06, + "loss": 0.6849, + "step": 18704 + }, + { + "epoch": 1.3513464699189048, + "grad_norm": 7.02293399283204, + "learning_rate": 1.2567611264447287e-06, + "loss": 0.68, + "step": 18705 + }, + { + "epoch": 1.3514187151191144, + "grad_norm": 5.3993800619803505, + "learning_rate": 1.2565073653819145e-06, + "loss": 0.6156, + "step": 18706 + }, + { + "epoch": 1.3514909603193237, + "grad_norm": 7.738465123541084, + "learning_rate": 1.2562536213415488e-06, + "loss": 0.7006, + "step": 18707 + }, + { + "epoch": 1.3515632055195332, + "grad_norm": 6.767053043753437, + "learning_rate": 1.255999894327105e-06, + "loss": 0.6653, + "step": 18708 + }, + { + "epoch": 1.3516354507197428, + "grad_norm": 5.7432772270616335, + "learning_rate": 1.255746184342058e-06, + "loss": 0.6445, + "step": 18709 + }, + { + "epoch": 1.3517076959199523, + "grad_norm": 9.469467488415605, + "learning_rate": 1.255492491389879e-06, + "loss": 0.6302, + "step": 18710 + }, + { + "epoch": 1.3517799411201619, + "grad_norm": 6.232546430614507, + "learning_rate": 1.2552388154740409e-06, + "loss": 0.6799, + "step": 18711 + }, + { + "epoch": 1.3518521863203714, + "grad_norm": 7.368249939960939, + "learning_rate": 1.2549851565980173e-06, + "loss": 0.7348, + "step": 18712 + }, + { + "epoch": 1.351924431520581, + "grad_norm": 7.202910513177284, + "learning_rate": 1.2547315147652811e-06, + "loss": 0.6586, + "step": 18713 + }, + { + "epoch": 1.3519966767207903, + "grad_norm": 5.897792674811365, + "learning_rate": 1.2544778899793026e-06, + "loss": 0.5964, + "step": 18714 + }, + { + "epoch": 1.3520689219209998, + "grad_norm": 7.0695136699105925, + "learning_rate": 1.2542242822435535e-06, + "loss": 0.6479, + "step": 18715 + }, + { + "epoch": 1.3521411671212094, + "grad_norm": 7.220264886873687, + "learning_rate": 1.2539706915615088e-06, + "loss": 0.6415, + "step": 18716 + }, + { + "epoch": 1.352213412321419, + "grad_norm": 6.192706451116286, + "learning_rate": 1.253717117936637e-06, + "loss": 0.6031, + "step": 18717 + }, + { + "epoch": 1.3522856575216284, + "grad_norm": 7.4249687733619725, + "learning_rate": 1.25346356137241e-06, + "loss": 0.6301, + "step": 18718 + }, + { + "epoch": 1.352357902721838, + "grad_norm": 7.299797567408735, + "learning_rate": 1.253210021872299e-06, + "loss": 0.682, + "step": 18719 + }, + { + "epoch": 1.3524301479220475, + "grad_norm": 7.245117384944902, + "learning_rate": 1.2529564994397759e-06, + "loss": 0.644, + "step": 18720 + }, + { + "epoch": 1.3525023931222568, + "grad_norm": 8.382868927466188, + "learning_rate": 1.2527029940783086e-06, + "loss": 0.5717, + "step": 18721 + }, + { + "epoch": 1.3525746383224664, + "grad_norm": 6.875210013216329, + "learning_rate": 1.2524495057913693e-06, + "loss": 0.6365, + "step": 18722 + }, + { + "epoch": 1.352646883522676, + "grad_norm": 8.260773069398624, + "learning_rate": 1.2521960345824275e-06, + "loss": 0.6203, + "step": 18723 + }, + { + "epoch": 1.3527191287228855, + "grad_norm": 6.769706781029962, + "learning_rate": 1.2519425804549532e-06, + "loss": 0.6525, + "step": 18724 + }, + { + "epoch": 1.352791373923095, + "grad_norm": 7.083706236354579, + "learning_rate": 1.2516891434124156e-06, + "loss": 0.6371, + "step": 18725 + }, + { + "epoch": 1.3528636191233046, + "grad_norm": 6.087023343140031, + "learning_rate": 1.2514357234582847e-06, + "loss": 0.6016, + "step": 18726 + }, + { + "epoch": 1.352935864323514, + "grad_norm": 6.1565189206016875, + "learning_rate": 1.2511823205960302e-06, + "loss": 0.609, + "step": 18727 + }, + { + "epoch": 1.3530081095237234, + "grad_norm": 6.2398351789372395, + "learning_rate": 1.2509289348291194e-06, + "loss": 0.6102, + "step": 18728 + }, + { + "epoch": 1.353080354723933, + "grad_norm": 6.73106157611162, + "learning_rate": 1.250675566161021e-06, + "loss": 0.6327, + "step": 18729 + }, + { + "epoch": 1.3531525999241425, + "grad_norm": 6.451562133404718, + "learning_rate": 1.2504222145952054e-06, + "loss": 0.6401, + "step": 18730 + }, + { + "epoch": 1.353224845124352, + "grad_norm": 6.91112584394961, + "learning_rate": 1.2501688801351387e-06, + "loss": 0.6701, + "step": 18731 + }, + { + "epoch": 1.3532970903245616, + "grad_norm": 7.259594061145374, + "learning_rate": 1.249915562784289e-06, + "loss": 0.6805, + "step": 18732 + }, + { + "epoch": 1.3533693355247711, + "grad_norm": 7.633467428481597, + "learning_rate": 1.249662262546125e-06, + "loss": 0.6308, + "step": 18733 + }, + { + "epoch": 1.3534415807249807, + "grad_norm": 6.591399872304162, + "learning_rate": 1.2494089794241138e-06, + "loss": 0.5768, + "step": 18734 + }, + { + "epoch": 1.35351382592519, + "grad_norm": 6.361399485322804, + "learning_rate": 1.2491557134217225e-06, + "loss": 0.5905, + "step": 18735 + }, + { + "epoch": 1.3535860711253995, + "grad_norm": 7.2694375812577645, + "learning_rate": 1.2489024645424184e-06, + "loss": 0.6861, + "step": 18736 + }, + { + "epoch": 1.353658316325609, + "grad_norm": 6.8244507285628515, + "learning_rate": 1.2486492327896681e-06, + "loss": 0.64, + "step": 18737 + }, + { + "epoch": 1.3537305615258186, + "grad_norm": 6.619513200915143, + "learning_rate": 1.2483960181669391e-06, + "loss": 0.5828, + "step": 18738 + }, + { + "epoch": 1.3538028067260282, + "grad_norm": 6.87074925186347, + "learning_rate": 1.248142820677696e-06, + "loss": 0.6538, + "step": 18739 + }, + { + "epoch": 1.3538750519262377, + "grad_norm": 7.367920144343176, + "learning_rate": 1.2478896403254058e-06, + "loss": 0.6179, + "step": 18740 + }, + { + "epoch": 1.3539472971264472, + "grad_norm": 7.227748794100085, + "learning_rate": 1.2476364771135348e-06, + "loss": 0.6692, + "step": 18741 + }, + { + "epoch": 1.3540195423266566, + "grad_norm": 6.202409448358589, + "learning_rate": 1.2473833310455472e-06, + "loss": 0.5699, + "step": 18742 + }, + { + "epoch": 1.3540917875268663, + "grad_norm": 6.722360324054046, + "learning_rate": 1.2471302021249094e-06, + "loss": 0.6529, + "step": 18743 + }, + { + "epoch": 1.3541640327270756, + "grad_norm": 6.754976486741148, + "learning_rate": 1.246877090355085e-06, + "loss": 0.5929, + "step": 18744 + }, + { + "epoch": 1.3542362779272852, + "grad_norm": 5.980226996460855, + "learning_rate": 1.2466239957395425e-06, + "loss": 0.6255, + "step": 18745 + }, + { + "epoch": 1.3543085231274947, + "grad_norm": 7.459738174630072, + "learning_rate": 1.2463709182817433e-06, + "loss": 0.675, + "step": 18746 + }, + { + "epoch": 1.3543807683277043, + "grad_norm": 6.601872219065967, + "learning_rate": 1.246117857985153e-06, + "loss": 0.662, + "step": 18747 + }, + { + "epoch": 1.3544530135279138, + "grad_norm": 6.771106076738231, + "learning_rate": 1.2458648148532363e-06, + "loss": 0.6509, + "step": 18748 + }, + { + "epoch": 1.3545252587281231, + "grad_norm": 7.218089416136097, + "learning_rate": 1.2456117888894557e-06, + "loss": 0.6287, + "step": 18749 + }, + { + "epoch": 1.354597503928333, + "grad_norm": 5.968014656826154, + "learning_rate": 1.2453587800972755e-06, + "loss": 0.5914, + "step": 18750 + }, + { + "epoch": 1.3546697491285422, + "grad_norm": 7.847464240589072, + "learning_rate": 1.2451057884801599e-06, + "loss": 0.6682, + "step": 18751 + }, + { + "epoch": 1.3547419943287518, + "grad_norm": 6.0428440744451315, + "learning_rate": 1.2448528140415714e-06, + "loss": 0.6174, + "step": 18752 + }, + { + "epoch": 1.3548142395289613, + "grad_norm": 6.010165186689053, + "learning_rate": 1.2445998567849732e-06, + "loss": 0.6739, + "step": 18753 + }, + { + "epoch": 1.3548864847291708, + "grad_norm": 6.923514901677471, + "learning_rate": 1.2443469167138284e-06, + "loss": 0.6559, + "step": 18754 + }, + { + "epoch": 1.3549587299293804, + "grad_norm": 7.466551133806038, + "learning_rate": 1.2440939938316002e-06, + "loss": 0.6749, + "step": 18755 + }, + { + "epoch": 1.3550309751295897, + "grad_norm": 6.321977205152728, + "learning_rate": 1.2438410881417493e-06, + "loss": 0.6288, + "step": 18756 + }, + { + "epoch": 1.3551032203297995, + "grad_norm": 6.962280828837602, + "learning_rate": 1.2435881996477383e-06, + "loss": 0.6848, + "step": 18757 + }, + { + "epoch": 1.3551754655300088, + "grad_norm": 7.994293561861652, + "learning_rate": 1.2433353283530294e-06, + "loss": 0.6297, + "step": 18758 + }, + { + "epoch": 1.3552477107302183, + "grad_norm": 6.679449922675613, + "learning_rate": 1.2430824742610854e-06, + "loss": 0.6636, + "step": 18759 + }, + { + "epoch": 1.3553199559304279, + "grad_norm": 6.74163582711854, + "learning_rate": 1.242829637375365e-06, + "loss": 0.6873, + "step": 18760 + }, + { + "epoch": 1.3553922011306374, + "grad_norm": 6.927079631450687, + "learning_rate": 1.242576817699331e-06, + "loss": 0.6791, + "step": 18761 + }, + { + "epoch": 1.355464446330847, + "grad_norm": 6.322245713592371, + "learning_rate": 1.2423240152364439e-06, + "loss": 0.7018, + "step": 18762 + }, + { + "epoch": 1.3555366915310563, + "grad_norm": 7.201037851343, + "learning_rate": 1.2420712299901646e-06, + "loss": 0.6416, + "step": 18763 + }, + { + "epoch": 1.355608936731266, + "grad_norm": 7.958466959791795, + "learning_rate": 1.2418184619639534e-06, + "loss": 0.6324, + "step": 18764 + }, + { + "epoch": 1.3556811819314754, + "grad_norm": 7.338609040390766, + "learning_rate": 1.2415657111612705e-06, + "loss": 0.6328, + "step": 18765 + }, + { + "epoch": 1.355753427131685, + "grad_norm": 8.23940637320508, + "learning_rate": 1.2413129775855768e-06, + "loss": 0.6765, + "step": 18766 + }, + { + "epoch": 1.3558256723318944, + "grad_norm": 6.502696431678812, + "learning_rate": 1.2410602612403302e-06, + "loss": 0.5803, + "step": 18767 + }, + { + "epoch": 1.355897917532104, + "grad_norm": 5.554635478230818, + "learning_rate": 1.2408075621289914e-06, + "loss": 0.5566, + "step": 18768 + }, + { + "epoch": 1.3559701627323135, + "grad_norm": 6.914483181499275, + "learning_rate": 1.240554880255019e-06, + "loss": 0.6515, + "step": 18769 + }, + { + "epoch": 1.356042407932523, + "grad_norm": 8.72469354217013, + "learning_rate": 1.2403022156218735e-06, + "loss": 0.7008, + "step": 18770 + }, + { + "epoch": 1.3561146531327326, + "grad_norm": 6.086895182993543, + "learning_rate": 1.2400495682330116e-06, + "loss": 0.6404, + "step": 18771 + }, + { + "epoch": 1.356186898332942, + "grad_norm": 7.197120132942607, + "learning_rate": 1.239796938091892e-06, + "loss": 0.6388, + "step": 18772 + }, + { + "epoch": 1.3562591435331515, + "grad_norm": 6.758688527034572, + "learning_rate": 1.2395443252019755e-06, + "loss": 0.6131, + "step": 18773 + }, + { + "epoch": 1.356331388733361, + "grad_norm": 8.610172961687574, + "learning_rate": 1.2392917295667179e-06, + "loss": 0.6425, + "step": 18774 + }, + { + "epoch": 1.3564036339335706, + "grad_norm": 8.475019660859385, + "learning_rate": 1.2390391511895774e-06, + "loss": 0.6047, + "step": 18775 + }, + { + "epoch": 1.35647587913378, + "grad_norm": 5.743332687676044, + "learning_rate": 1.2387865900740118e-06, + "loss": 0.5869, + "step": 18776 + }, + { + "epoch": 1.3565481243339896, + "grad_norm": 6.5968461564163965, + "learning_rate": 1.2385340462234796e-06, + "loss": 0.6884, + "step": 18777 + }, + { + "epoch": 1.3566203695341992, + "grad_norm": 9.727812341260359, + "learning_rate": 1.2382815196414359e-06, + "loss": 0.6415, + "step": 18778 + }, + { + "epoch": 1.3566926147344085, + "grad_norm": 6.477165313754852, + "learning_rate": 1.2380290103313384e-06, + "loss": 0.5652, + "step": 18779 + }, + { + "epoch": 1.356764859934618, + "grad_norm": 7.071181910893211, + "learning_rate": 1.2377765182966438e-06, + "loss": 0.6247, + "step": 18780 + }, + { + "epoch": 1.3568371051348276, + "grad_norm": 7.454960946812925, + "learning_rate": 1.237524043540809e-06, + "loss": 0.6573, + "step": 18781 + }, + { + "epoch": 1.3569093503350371, + "grad_norm": 6.67205624479376, + "learning_rate": 1.2372715860672896e-06, + "loss": 0.6199, + "step": 18782 + }, + { + "epoch": 1.3569815955352467, + "grad_norm": 6.343194645969048, + "learning_rate": 1.2370191458795417e-06, + "loss": 0.6482, + "step": 18783 + }, + { + "epoch": 1.3570538407354562, + "grad_norm": 6.733483912732497, + "learning_rate": 1.236766722981022e-06, + "loss": 0.5973, + "step": 18784 + }, + { + "epoch": 1.3571260859356657, + "grad_norm": 8.021228757416695, + "learning_rate": 1.2365143173751842e-06, + "loss": 0.6703, + "step": 18785 + }, + { + "epoch": 1.357198331135875, + "grad_norm": 7.911139261836188, + "learning_rate": 1.2362619290654843e-06, + "loss": 0.6493, + "step": 18786 + }, + { + "epoch": 1.3572705763360846, + "grad_norm": 7.328604676381825, + "learning_rate": 1.2360095580553782e-06, + "loss": 0.6194, + "step": 18787 + }, + { + "epoch": 1.3573428215362942, + "grad_norm": 7.056558996552511, + "learning_rate": 1.2357572043483187e-06, + "loss": 0.6375, + "step": 18788 + }, + { + "epoch": 1.3574150667365037, + "grad_norm": 6.060519494681142, + "learning_rate": 1.235504867947762e-06, + "loss": 0.5817, + "step": 18789 + }, + { + "epoch": 1.3574873119367132, + "grad_norm": 6.86299055392485, + "learning_rate": 1.2352525488571614e-06, + "loss": 0.6397, + "step": 18790 + }, + { + "epoch": 1.3575595571369228, + "grad_norm": 7.1883677083228426, + "learning_rate": 1.2350002470799715e-06, + "loss": 0.6285, + "step": 18791 + }, + { + "epoch": 1.3576318023371323, + "grad_norm": 5.605173902034253, + "learning_rate": 1.234747962619646e-06, + "loss": 0.5711, + "step": 18792 + }, + { + "epoch": 1.3577040475373416, + "grad_norm": 7.666698068747963, + "learning_rate": 1.2344956954796387e-06, + "loss": 0.6523, + "step": 18793 + }, + { + "epoch": 1.3577762927375512, + "grad_norm": 5.888211917152679, + "learning_rate": 1.2342434456634025e-06, + "loss": 0.644, + "step": 18794 + }, + { + "epoch": 1.3578485379377607, + "grad_norm": 7.754726322345145, + "learning_rate": 1.2339912131743918e-06, + "loss": 0.6879, + "step": 18795 + }, + { + "epoch": 1.3579207831379703, + "grad_norm": 7.056072990420563, + "learning_rate": 1.2337389980160572e-06, + "loss": 0.5749, + "step": 18796 + }, + { + "epoch": 1.3579930283381798, + "grad_norm": 6.6151354782059855, + "learning_rate": 1.2334868001918526e-06, + "loss": 0.5861, + "step": 18797 + }, + { + "epoch": 1.3580652735383894, + "grad_norm": 6.271107546196049, + "learning_rate": 1.2332346197052316e-06, + "loss": 0.6172, + "step": 18798 + }, + { + "epoch": 1.358137518738599, + "grad_norm": 6.853351981339274, + "learning_rate": 1.2329824565596438e-06, + "loss": 0.6797, + "step": 18799 + }, + { + "epoch": 1.3582097639388082, + "grad_norm": 6.494231672294093, + "learning_rate": 1.2327303107585416e-06, + "loss": 0.7131, + "step": 18800 + }, + { + "epoch": 1.3582820091390178, + "grad_norm": 7.560376838697608, + "learning_rate": 1.2324781823053784e-06, + "loss": 0.6924, + "step": 18801 + }, + { + "epoch": 1.3583542543392273, + "grad_norm": 6.967767894290892, + "learning_rate": 1.2322260712036058e-06, + "loss": 0.6708, + "step": 18802 + }, + { + "epoch": 1.3584264995394368, + "grad_norm": 7.12086768531735, + "learning_rate": 1.2319739774566727e-06, + "loss": 0.6497, + "step": 18803 + }, + { + "epoch": 1.3584987447396464, + "grad_norm": 8.432721176345645, + "learning_rate": 1.2317219010680315e-06, + "loss": 0.5425, + "step": 18804 + }, + { + "epoch": 1.358570989939856, + "grad_norm": 6.168905350140666, + "learning_rate": 1.2314698420411333e-06, + "loss": 0.5952, + "step": 18805 + }, + { + "epoch": 1.3586432351400655, + "grad_norm": 6.598789404613949, + "learning_rate": 1.2312178003794275e-06, + "loss": 0.6128, + "step": 18806 + }, + { + "epoch": 1.3587154803402748, + "grad_norm": 7.1767192096738714, + "learning_rate": 1.2309657760863646e-06, + "loss": 0.5449, + "step": 18807 + }, + { + "epoch": 1.3587877255404843, + "grad_norm": 7.896436066046886, + "learning_rate": 1.230713769165395e-06, + "loss": 0.649, + "step": 18808 + }, + { + "epoch": 1.3588599707406939, + "grad_norm": 8.456069066944842, + "learning_rate": 1.2304617796199683e-06, + "loss": 0.6583, + "step": 18809 + }, + { + "epoch": 1.3589322159409034, + "grad_norm": 6.265324535442983, + "learning_rate": 1.2302098074535343e-06, + "loss": 0.6073, + "step": 18810 + }, + { + "epoch": 1.359004461141113, + "grad_norm": 8.20606299693513, + "learning_rate": 1.229957852669542e-06, + "loss": 0.652, + "step": 18811 + }, + { + "epoch": 1.3590767063413225, + "grad_norm": 6.19447134303861, + "learning_rate": 1.2297059152714413e-06, + "loss": 0.6267, + "step": 18812 + }, + { + "epoch": 1.359148951541532, + "grad_norm": 6.387433661248895, + "learning_rate": 1.2294539952626797e-06, + "loss": 0.5965, + "step": 18813 + }, + { + "epoch": 1.3592211967417414, + "grad_norm": 8.449629140085408, + "learning_rate": 1.2292020926467063e-06, + "loss": 0.6136, + "step": 18814 + }, + { + "epoch": 1.3592934419419511, + "grad_norm": 6.838036229169567, + "learning_rate": 1.2289502074269693e-06, + "loss": 0.6198, + "step": 18815 + }, + { + "epoch": 1.3593656871421604, + "grad_norm": 6.747681961461087, + "learning_rate": 1.2286983396069182e-06, + "loss": 0.6842, + "step": 18816 + }, + { + "epoch": 1.35943793234237, + "grad_norm": 5.903691782279648, + "learning_rate": 1.228446489189999e-06, + "loss": 0.5277, + "step": 18817 + }, + { + "epoch": 1.3595101775425795, + "grad_norm": 6.581280094378605, + "learning_rate": 1.2281946561796595e-06, + "loss": 0.6268, + "step": 18818 + }, + { + "epoch": 1.359582422742789, + "grad_norm": 7.081295962070884, + "learning_rate": 1.2279428405793482e-06, + "loss": 0.6447, + "step": 18819 + }, + { + "epoch": 1.3596546679429986, + "grad_norm": 5.7039650494332665, + "learning_rate": 1.2276910423925115e-06, + "loss": 0.6072, + "step": 18820 + }, + { + "epoch": 1.359726913143208, + "grad_norm": 6.75862531239018, + "learning_rate": 1.2274392616225964e-06, + "loss": 0.6167, + "step": 18821 + }, + { + "epoch": 1.3597991583434177, + "grad_norm": 7.769811595463812, + "learning_rate": 1.22718749827305e-06, + "loss": 0.6573, + "step": 18822 + }, + { + "epoch": 1.359871403543627, + "grad_norm": 7.073030705906714, + "learning_rate": 1.2269357523473195e-06, + "loss": 0.644, + "step": 18823 + }, + { + "epoch": 1.3599436487438366, + "grad_norm": 7.512882104889751, + "learning_rate": 1.2266840238488486e-06, + "loss": 0.663, + "step": 18824 + }, + { + "epoch": 1.360015893944046, + "grad_norm": 7.234147729455215, + "learning_rate": 1.226432312781085e-06, + "loss": 0.6532, + "step": 18825 + }, + { + "epoch": 1.3600881391442556, + "grad_norm": 10.384570751474651, + "learning_rate": 1.2261806191474745e-06, + "loss": 0.6169, + "step": 18826 + }, + { + "epoch": 1.3601603843444652, + "grad_norm": 7.493572532709516, + "learning_rate": 1.225928942951463e-06, + "loss": 0.6133, + "step": 18827 + }, + { + "epoch": 1.3602326295446745, + "grad_norm": 6.386599291696271, + "learning_rate": 1.225677284196494e-06, + "loss": 0.6357, + "step": 18828 + }, + { + "epoch": 1.3603048747448843, + "grad_norm": 6.6885181258062145, + "learning_rate": 1.2254256428860126e-06, + "loss": 0.6048, + "step": 18829 + }, + { + "epoch": 1.3603771199450936, + "grad_norm": 7.1989659944439355, + "learning_rate": 1.2251740190234664e-06, + "loss": 0.5692, + "step": 18830 + }, + { + "epoch": 1.3604493651453031, + "grad_norm": 6.75314512095351, + "learning_rate": 1.2249224126122971e-06, + "loss": 0.6173, + "step": 18831 + }, + { + "epoch": 1.3605216103455127, + "grad_norm": 5.842774661671483, + "learning_rate": 1.2246708236559498e-06, + "loss": 0.6781, + "step": 18832 + }, + { + "epoch": 1.3605938555457222, + "grad_norm": 7.841661152312407, + "learning_rate": 1.224419252157869e-06, + "loss": 0.6351, + "step": 18833 + }, + { + "epoch": 1.3606661007459318, + "grad_norm": 6.40975186538684, + "learning_rate": 1.224167698121499e-06, + "loss": 0.5761, + "step": 18834 + }, + { + "epoch": 1.360738345946141, + "grad_norm": 8.901003285982446, + "learning_rate": 1.2239161615502819e-06, + "loss": 0.6892, + "step": 18835 + }, + { + "epoch": 1.3608105911463508, + "grad_norm": 6.90879478002495, + "learning_rate": 1.2236646424476615e-06, + "loss": 0.6625, + "step": 18836 + }, + { + "epoch": 1.3608828363465602, + "grad_norm": 7.857651508265163, + "learning_rate": 1.223413140817081e-06, + "loss": 0.6155, + "step": 18837 + }, + { + "epoch": 1.3609550815467697, + "grad_norm": 7.346517357990775, + "learning_rate": 1.2231616566619834e-06, + "loss": 0.5893, + "step": 18838 + }, + { + "epoch": 1.3610273267469792, + "grad_norm": 6.331641155183826, + "learning_rate": 1.2229101899858114e-06, + "loss": 0.6424, + "step": 18839 + }, + { + "epoch": 1.3610995719471888, + "grad_norm": 6.8263741699280835, + "learning_rate": 1.2226587407920074e-06, + "loss": 0.6427, + "step": 18840 + }, + { + "epoch": 1.3611718171473983, + "grad_norm": 6.286555692165872, + "learning_rate": 1.2224073090840143e-06, + "loss": 0.6529, + "step": 18841 + }, + { + "epoch": 1.3612440623476079, + "grad_norm": 6.244207520852951, + "learning_rate": 1.2221558948652723e-06, + "loss": 0.5599, + "step": 18842 + }, + { + "epoch": 1.3613163075478174, + "grad_norm": 7.648845470797068, + "learning_rate": 1.221904498139224e-06, + "loss": 0.632, + "step": 18843 + }, + { + "epoch": 1.3613885527480267, + "grad_norm": 9.24975874302672, + "learning_rate": 1.2216531189093107e-06, + "loss": 0.729, + "step": 18844 + }, + { + "epoch": 1.3614607979482363, + "grad_norm": 7.204909759316391, + "learning_rate": 1.2214017571789744e-06, + "loss": 0.6348, + "step": 18845 + }, + { + "epoch": 1.3615330431484458, + "grad_norm": 7.06984497547824, + "learning_rate": 1.2211504129516544e-06, + "loss": 0.6347, + "step": 18846 + }, + { + "epoch": 1.3616052883486554, + "grad_norm": 6.552738158996672, + "learning_rate": 1.2208990862307924e-06, + "loss": 0.6343, + "step": 18847 + }, + { + "epoch": 1.361677533548865, + "grad_norm": 7.257014071562993, + "learning_rate": 1.2206477770198286e-06, + "loss": 0.6057, + "step": 18848 + }, + { + "epoch": 1.3617497787490744, + "grad_norm": 7.687670698054044, + "learning_rate": 1.2203964853222033e-06, + "loss": 0.6382, + "step": 18849 + }, + { + "epoch": 1.361822023949284, + "grad_norm": 5.9513731453903835, + "learning_rate": 1.2201452111413566e-06, + "loss": 0.5782, + "step": 18850 + }, + { + "epoch": 1.3618942691494933, + "grad_norm": 6.973777155176282, + "learning_rate": 1.2198939544807283e-06, + "loss": 0.6488, + "step": 18851 + }, + { + "epoch": 1.3619665143497028, + "grad_norm": 5.574878934221405, + "learning_rate": 1.219642715343759e-06, + "loss": 0.5272, + "step": 18852 + }, + { + "epoch": 1.3620387595499124, + "grad_norm": 6.339084958249901, + "learning_rate": 1.2193914937338855e-06, + "loss": 0.6957, + "step": 18853 + }, + { + "epoch": 1.362111004750122, + "grad_norm": 8.69082248741812, + "learning_rate": 1.2191402896545481e-06, + "loss": 0.6821, + "step": 18854 + }, + { + "epoch": 1.3621832499503315, + "grad_norm": 6.383639894831535, + "learning_rate": 1.2188891031091868e-06, + "loss": 0.6859, + "step": 18855 + }, + { + "epoch": 1.362255495150541, + "grad_norm": 6.806543948829225, + "learning_rate": 1.218637934101238e-06, + "loss": 0.6589, + "step": 18856 + }, + { + "epoch": 1.3623277403507505, + "grad_norm": 7.5275745841454835, + "learning_rate": 1.21838678263414e-06, + "loss": 0.5493, + "step": 18857 + }, + { + "epoch": 1.3623999855509599, + "grad_norm": 6.913397081327064, + "learning_rate": 1.2181356487113327e-06, + "loss": 0.5604, + "step": 18858 + }, + { + "epoch": 1.3624722307511694, + "grad_norm": 6.3180837579955265, + "learning_rate": 1.217884532336254e-06, + "loss": 0.6262, + "step": 18859 + }, + { + "epoch": 1.362544475951379, + "grad_norm": 5.739027085209272, + "learning_rate": 1.2176334335123396e-06, + "loss": 0.6349, + "step": 18860 + }, + { + "epoch": 1.3626167211515885, + "grad_norm": 6.099750713820357, + "learning_rate": 1.217382352243028e-06, + "loss": 0.5932, + "step": 18861 + }, + { + "epoch": 1.362688966351798, + "grad_norm": 7.09706985830987, + "learning_rate": 1.2171312885317569e-06, + "loss": 0.5981, + "step": 18862 + }, + { + "epoch": 1.3627612115520076, + "grad_norm": 7.81001742782309, + "learning_rate": 1.2168802423819615e-06, + "loss": 0.6254, + "step": 18863 + }, + { + "epoch": 1.3628334567522171, + "grad_norm": 6.9041918971863625, + "learning_rate": 1.2166292137970793e-06, + "loss": 0.6555, + "step": 18864 + }, + { + "epoch": 1.3629057019524264, + "grad_norm": 8.649659660567712, + "learning_rate": 1.2163782027805466e-06, + "loss": 0.6115, + "step": 18865 + }, + { + "epoch": 1.362977947152636, + "grad_norm": 6.556060829015393, + "learning_rate": 1.2161272093357994e-06, + "loss": 0.6243, + "step": 18866 + }, + { + "epoch": 1.3630501923528455, + "grad_norm": 5.991776552992136, + "learning_rate": 1.2158762334662744e-06, + "loss": 0.5558, + "step": 18867 + }, + { + "epoch": 1.363122437553055, + "grad_norm": 6.130551683203534, + "learning_rate": 1.2156252751754063e-06, + "loss": 0.6087, + "step": 18868 + }, + { + "epoch": 1.3631946827532646, + "grad_norm": 5.761703853264641, + "learning_rate": 1.215374334466631e-06, + "loss": 0.6106, + "step": 18869 + }, + { + "epoch": 1.3632669279534742, + "grad_norm": 5.861420866789615, + "learning_rate": 1.2151234113433845e-06, + "loss": 0.6196, + "step": 18870 + }, + { + "epoch": 1.3633391731536837, + "grad_norm": 6.451428797771455, + "learning_rate": 1.2148725058091002e-06, + "loss": 0.6197, + "step": 18871 + }, + { + "epoch": 1.363411418353893, + "grad_norm": 7.159528089773914, + "learning_rate": 1.2146216178672132e-06, + "loss": 0.691, + "step": 18872 + }, + { + "epoch": 1.3634836635541026, + "grad_norm": 6.704376886302988, + "learning_rate": 1.2143707475211593e-06, + "loss": 0.6381, + "step": 18873 + }, + { + "epoch": 1.363555908754312, + "grad_norm": 7.700381519724617, + "learning_rate": 1.2141198947743708e-06, + "loss": 0.6163, + "step": 18874 + }, + { + "epoch": 1.3636281539545216, + "grad_norm": 8.502091543138674, + "learning_rate": 1.2138690596302821e-06, + "loss": 0.698, + "step": 18875 + }, + { + "epoch": 1.3637003991547312, + "grad_norm": 6.126695359376972, + "learning_rate": 1.2136182420923277e-06, + "loss": 0.6409, + "step": 18876 + }, + { + "epoch": 1.3637726443549407, + "grad_norm": 6.460337895865801, + "learning_rate": 1.2133674421639408e-06, + "loss": 0.598, + "step": 18877 + }, + { + "epoch": 1.3638448895551503, + "grad_norm": 6.7587452503862835, + "learning_rate": 1.2131166598485545e-06, + "loss": 0.6438, + "step": 18878 + }, + { + "epoch": 1.3639171347553596, + "grad_norm": 8.381584198367062, + "learning_rate": 1.2128658951496016e-06, + "loss": 0.6223, + "step": 18879 + }, + { + "epoch": 1.3639893799555691, + "grad_norm": 7.213436400452224, + "learning_rate": 1.2126151480705167e-06, + "loss": 0.6569, + "step": 18880 + }, + { + "epoch": 1.3640616251557787, + "grad_norm": 8.942742790699365, + "learning_rate": 1.2123644186147296e-06, + "loss": 0.7125, + "step": 18881 + }, + { + "epoch": 1.3641338703559882, + "grad_norm": 7.853995027024239, + "learning_rate": 1.2121137067856738e-06, + "loss": 0.6272, + "step": 18882 + }, + { + "epoch": 1.3642061155561978, + "grad_norm": 5.834736282939879, + "learning_rate": 1.2118630125867813e-06, + "loss": 0.6297, + "step": 18883 + }, + { + "epoch": 1.3642783607564073, + "grad_norm": 6.139653330407029, + "learning_rate": 1.2116123360214852e-06, + "loss": 0.6187, + "step": 18884 + }, + { + "epoch": 1.3643506059566168, + "grad_norm": 6.52473204782376, + "learning_rate": 1.2113616770932138e-06, + "loss": 0.7255, + "step": 18885 + }, + { + "epoch": 1.3644228511568262, + "grad_norm": 6.933681777610223, + "learning_rate": 1.2111110358054015e-06, + "loss": 0.671, + "step": 18886 + }, + { + "epoch": 1.3644950963570357, + "grad_norm": 6.6126348874733205, + "learning_rate": 1.2108604121614793e-06, + "loss": 0.6399, + "step": 18887 + }, + { + "epoch": 1.3645673415572452, + "grad_norm": 5.665879288367479, + "learning_rate": 1.2106098061648761e-06, + "loss": 0.6081, + "step": 18888 + }, + { + "epoch": 1.3646395867574548, + "grad_norm": 6.478324549205894, + "learning_rate": 1.2103592178190235e-06, + "loss": 0.6402, + "step": 18889 + }, + { + "epoch": 1.3647118319576643, + "grad_norm": 7.23839851033257, + "learning_rate": 1.2101086471273518e-06, + "loss": 0.5884, + "step": 18890 + }, + { + "epoch": 1.3647840771578739, + "grad_norm": 6.9183131024015765, + "learning_rate": 1.2098580940932923e-06, + "loss": 0.6012, + "step": 18891 + }, + { + "epoch": 1.3648563223580834, + "grad_norm": 5.807189484340188, + "learning_rate": 1.2096075587202727e-06, + "loss": 0.527, + "step": 18892 + }, + { + "epoch": 1.3649285675582927, + "grad_norm": 6.489639241165008, + "learning_rate": 1.209357041011724e-06, + "loss": 0.6611, + "step": 18893 + }, + { + "epoch": 1.3650008127585025, + "grad_norm": 7.073274209185593, + "learning_rate": 1.2091065409710748e-06, + "loss": 0.591, + "step": 18894 + }, + { + "epoch": 1.3650730579587118, + "grad_norm": 6.63699119327496, + "learning_rate": 1.2088560586017551e-06, + "loss": 0.6589, + "step": 18895 + }, + { + "epoch": 1.3651453031589214, + "grad_norm": 6.197459210949816, + "learning_rate": 1.208605593907193e-06, + "loss": 0.6747, + "step": 18896 + }, + { + "epoch": 1.365217548359131, + "grad_norm": 6.928069707488961, + "learning_rate": 1.208355146890818e-06, + "loss": 0.6878, + "step": 18897 + }, + { + "epoch": 1.3652897935593404, + "grad_norm": 6.649543356454279, + "learning_rate": 1.2081047175560593e-06, + "loss": 0.5901, + "step": 18898 + }, + { + "epoch": 1.36536203875955, + "grad_norm": 8.57600529508285, + "learning_rate": 1.2078543059063425e-06, + "loss": 0.6367, + "step": 18899 + }, + { + "epoch": 1.3654342839597593, + "grad_norm": 7.419028288354763, + "learning_rate": 1.2076039119450971e-06, + "loss": 0.6361, + "step": 18900 + }, + { + "epoch": 1.365506529159969, + "grad_norm": 7.220431837867661, + "learning_rate": 1.2073535356757508e-06, + "loss": 0.644, + "step": 18901 + }, + { + "epoch": 1.3655787743601784, + "grad_norm": 8.477419311535424, + "learning_rate": 1.2071031771017319e-06, + "loss": 0.7471, + "step": 18902 + }, + { + "epoch": 1.365651019560388, + "grad_norm": 7.231066480729552, + "learning_rate": 1.2068528362264655e-06, + "loss": 0.6024, + "step": 18903 + }, + { + "epoch": 1.3657232647605975, + "grad_norm": 7.495868307742054, + "learning_rate": 1.2066025130533797e-06, + "loss": 0.632, + "step": 18904 + }, + { + "epoch": 1.365795509960807, + "grad_norm": 7.618662217020176, + "learning_rate": 1.2063522075859013e-06, + "loss": 0.6673, + "step": 18905 + }, + { + "epoch": 1.3658677551610166, + "grad_norm": 8.206905285950278, + "learning_rate": 1.2061019198274568e-06, + "loss": 0.6276, + "step": 18906 + }, + { + "epoch": 1.3659400003612259, + "grad_norm": 6.4619736123191185, + "learning_rate": 1.2058516497814724e-06, + "loss": 0.64, + "step": 18907 + }, + { + "epoch": 1.3660122455614356, + "grad_norm": 7.3721163009601804, + "learning_rate": 1.205601397451374e-06, + "loss": 0.5869, + "step": 18908 + }, + { + "epoch": 1.366084490761645, + "grad_norm": 7.072063621200299, + "learning_rate": 1.2053511628405883e-06, + "loss": 0.6194, + "step": 18909 + }, + { + "epoch": 1.3661567359618545, + "grad_norm": 6.768675506266411, + "learning_rate": 1.2051009459525392e-06, + "loss": 0.6192, + "step": 18910 + }, + { + "epoch": 1.366228981162064, + "grad_norm": 6.49665452987662, + "learning_rate": 1.2048507467906525e-06, + "loss": 0.5871, + "step": 18911 + }, + { + "epoch": 1.3663012263622736, + "grad_norm": 7.830163381925468, + "learning_rate": 1.2046005653583546e-06, + "loss": 0.6066, + "step": 18912 + }, + { + "epoch": 1.3663734715624831, + "grad_norm": 8.032067876946932, + "learning_rate": 1.204350401659067e-06, + "loss": 0.6688, + "step": 18913 + }, + { + "epoch": 1.3664457167626924, + "grad_norm": 7.417279107456464, + "learning_rate": 1.204100255696218e-06, + "loss": 0.6337, + "step": 18914 + }, + { + "epoch": 1.3665179619629022, + "grad_norm": 7.565044384893598, + "learning_rate": 1.2038501274732295e-06, + "loss": 0.6619, + "step": 18915 + }, + { + "epoch": 1.3665902071631115, + "grad_norm": 7.450892083950732, + "learning_rate": 1.2036000169935278e-06, + "loss": 0.6532, + "step": 18916 + }, + { + "epoch": 1.366662452363321, + "grad_norm": 7.322673013973796, + "learning_rate": 1.203349924260534e-06, + "loss": 0.6656, + "step": 18917 + }, + { + "epoch": 1.3667346975635306, + "grad_norm": 6.9188076826687315, + "learning_rate": 1.203099849277673e-06, + "loss": 0.573, + "step": 18918 + }, + { + "epoch": 1.3668069427637402, + "grad_norm": 6.927807885235167, + "learning_rate": 1.2028497920483691e-06, + "loss": 0.645, + "step": 18919 + }, + { + "epoch": 1.3668791879639497, + "grad_norm": 5.9652974793663045, + "learning_rate": 1.2025997525760435e-06, + "loss": 0.6425, + "step": 18920 + }, + { + "epoch": 1.3669514331641592, + "grad_norm": 5.518216224164369, + "learning_rate": 1.20234973086412e-06, + "loss": 0.5522, + "step": 18921 + }, + { + "epoch": 1.3670236783643688, + "grad_norm": 8.3377906577077, + "learning_rate": 1.202099726916021e-06, + "loss": 0.6578, + "step": 18922 + }, + { + "epoch": 1.367095923564578, + "grad_norm": 9.944215053230815, + "learning_rate": 1.201849740735169e-06, + "loss": 0.7088, + "step": 18923 + }, + { + "epoch": 1.3671681687647876, + "grad_norm": 8.60004208576639, + "learning_rate": 1.2015997723249859e-06, + "loss": 0.6369, + "step": 18924 + }, + { + "epoch": 1.3672404139649972, + "grad_norm": 5.949313007091463, + "learning_rate": 1.201349821688894e-06, + "loss": 0.5857, + "step": 18925 + }, + { + "epoch": 1.3673126591652067, + "grad_norm": 9.567550621885612, + "learning_rate": 1.2010998888303147e-06, + "loss": 0.7483, + "step": 18926 + }, + { + "epoch": 1.3673849043654163, + "grad_norm": 8.063127360221246, + "learning_rate": 1.2008499737526702e-06, + "loss": 0.6254, + "step": 18927 + }, + { + "epoch": 1.3674571495656258, + "grad_norm": 7.567055583263236, + "learning_rate": 1.2006000764593801e-06, + "loss": 0.6233, + "step": 18928 + }, + { + "epoch": 1.3675293947658353, + "grad_norm": 6.986374263808368, + "learning_rate": 1.2003501969538658e-06, + "loss": 0.6098, + "step": 18929 + }, + { + "epoch": 1.3676016399660447, + "grad_norm": 8.061006156272605, + "learning_rate": 1.2001003352395494e-06, + "loss": 0.6044, + "step": 18930 + }, + { + "epoch": 1.3676738851662542, + "grad_norm": 5.846470816509801, + "learning_rate": 1.1998504913198492e-06, + "loss": 0.6466, + "step": 18931 + }, + { + "epoch": 1.3677461303664638, + "grad_norm": 7.2056194628402155, + "learning_rate": 1.199600665198186e-06, + "loss": 0.6325, + "step": 18932 + }, + { + "epoch": 1.3678183755666733, + "grad_norm": 6.2232254256523865, + "learning_rate": 1.1993508568779799e-06, + "loss": 0.6836, + "step": 18933 + }, + { + "epoch": 1.3678906207668828, + "grad_norm": 6.133907458482604, + "learning_rate": 1.199101066362651e-06, + "loss": 0.5795, + "step": 18934 + }, + { + "epoch": 1.3679628659670924, + "grad_norm": 6.97037340080714, + "learning_rate": 1.1988512936556182e-06, + "loss": 0.6311, + "step": 18935 + }, + { + "epoch": 1.368035111167302, + "grad_norm": 8.050253860546155, + "learning_rate": 1.1986015387603011e-06, + "loss": 0.6535, + "step": 18936 + }, + { + "epoch": 1.3681073563675112, + "grad_norm": 8.044572637858474, + "learning_rate": 1.1983518016801194e-06, + "loss": 0.6036, + "step": 18937 + }, + { + "epoch": 1.3681796015677208, + "grad_norm": 6.889009385781297, + "learning_rate": 1.1981020824184897e-06, + "loss": 0.6878, + "step": 18938 + }, + { + "epoch": 1.3682518467679303, + "grad_norm": 7.120077206053739, + "learning_rate": 1.1978523809788318e-06, + "loss": 0.7241, + "step": 18939 + }, + { + "epoch": 1.3683240919681399, + "grad_norm": 7.01372626423495, + "learning_rate": 1.1976026973645637e-06, + "loss": 0.6466, + "step": 18940 + }, + { + "epoch": 1.3683963371683494, + "grad_norm": 5.90203351683037, + "learning_rate": 1.1973530315791043e-06, + "loss": 0.6457, + "step": 18941 + }, + { + "epoch": 1.368468582368559, + "grad_norm": 5.788770907699334, + "learning_rate": 1.1971033836258689e-06, + "loss": 0.6203, + "step": 18942 + }, + { + "epoch": 1.3685408275687685, + "grad_norm": 7.527767405047276, + "learning_rate": 1.1968537535082771e-06, + "loss": 0.69, + "step": 18943 + }, + { + "epoch": 1.3686130727689778, + "grad_norm": 6.348815398971279, + "learning_rate": 1.1966041412297469e-06, + "loss": 0.5615, + "step": 18944 + }, + { + "epoch": 1.3686853179691874, + "grad_norm": 6.7030737814913826, + "learning_rate": 1.1963545467936927e-06, + "loss": 0.5966, + "step": 18945 + }, + { + "epoch": 1.368757563169397, + "grad_norm": 6.6043858376445925, + "learning_rate": 1.1961049702035326e-06, + "loss": 0.5853, + "step": 18946 + }, + { + "epoch": 1.3688298083696064, + "grad_norm": 8.435639805071009, + "learning_rate": 1.195855411462683e-06, + "loss": 0.6393, + "step": 18947 + }, + { + "epoch": 1.368902053569816, + "grad_norm": 6.967296773836808, + "learning_rate": 1.1956058705745617e-06, + "loss": 0.6269, + "step": 18948 + }, + { + "epoch": 1.3689742987700255, + "grad_norm": 6.838825282897641, + "learning_rate": 1.1953563475425817e-06, + "loss": 0.5987, + "step": 18949 + }, + { + "epoch": 1.369046543970235, + "grad_norm": 5.4168549529387215, + "learning_rate": 1.1951068423701604e-06, + "loss": 0.6454, + "step": 18950 + }, + { + "epoch": 1.3691187891704444, + "grad_norm": 7.049472913585455, + "learning_rate": 1.1948573550607134e-06, + "loss": 0.5859, + "step": 18951 + }, + { + "epoch": 1.369191034370654, + "grad_norm": 7.215788791229873, + "learning_rate": 1.1946078856176557e-06, + "loss": 0.6924, + "step": 18952 + }, + { + "epoch": 1.3692632795708635, + "grad_norm": 6.42964151952347, + "learning_rate": 1.1943584340444025e-06, + "loss": 0.6073, + "step": 18953 + }, + { + "epoch": 1.369335524771073, + "grad_norm": 8.075275088133123, + "learning_rate": 1.1941090003443686e-06, + "loss": 0.6398, + "step": 18954 + }, + { + "epoch": 1.3694077699712826, + "grad_norm": 6.805766286204048, + "learning_rate": 1.1938595845209694e-06, + "loss": 0.6547, + "step": 18955 + }, + { + "epoch": 1.369480015171492, + "grad_norm": 8.21143970351086, + "learning_rate": 1.1936101865776176e-06, + "loss": 0.6626, + "step": 18956 + }, + { + "epoch": 1.3695522603717016, + "grad_norm": 6.092966508626495, + "learning_rate": 1.1933608065177277e-06, + "loss": 0.5754, + "step": 18957 + }, + { + "epoch": 1.369624505571911, + "grad_norm": 5.960343277537292, + "learning_rate": 1.1931114443447139e-06, + "loss": 0.6368, + "step": 18958 + }, + { + "epoch": 1.3696967507721205, + "grad_norm": 7.536180836899106, + "learning_rate": 1.1928621000619907e-06, + "loss": 0.611, + "step": 18959 + }, + { + "epoch": 1.36976899597233, + "grad_norm": 6.682961037683907, + "learning_rate": 1.1926127736729695e-06, + "loss": 0.5931, + "step": 18960 + }, + { + "epoch": 1.3698412411725396, + "grad_norm": 6.473985405453838, + "learning_rate": 1.1923634651810644e-06, + "loss": 0.5621, + "step": 18961 + }, + { + "epoch": 1.3699134863727491, + "grad_norm": 6.313139628117798, + "learning_rate": 1.192114174589688e-06, + "loss": 0.6242, + "step": 18962 + }, + { + "epoch": 1.3699857315729587, + "grad_norm": 6.677177376960168, + "learning_rate": 1.1918649019022532e-06, + "loss": 0.6687, + "step": 18963 + }, + { + "epoch": 1.3700579767731682, + "grad_norm": 6.287933286143097, + "learning_rate": 1.191615647122172e-06, + "loss": 0.6151, + "step": 18964 + }, + { + "epoch": 1.3701302219733775, + "grad_norm": 6.832033762596022, + "learning_rate": 1.191366410252857e-06, + "loss": 0.6218, + "step": 18965 + }, + { + "epoch": 1.3702024671735873, + "grad_norm": 7.367575059811939, + "learning_rate": 1.1911171912977203e-06, + "loss": 0.7387, + "step": 18966 + }, + { + "epoch": 1.3702747123737966, + "grad_norm": 6.6984266341103265, + "learning_rate": 1.1908679902601721e-06, + "loss": 0.6195, + "step": 18967 + }, + { + "epoch": 1.3703469575740062, + "grad_norm": 5.968264255431076, + "learning_rate": 1.1906188071436248e-06, + "loss": 0.6539, + "step": 18968 + }, + { + "epoch": 1.3704192027742157, + "grad_norm": 6.178678447070613, + "learning_rate": 1.1903696419514905e-06, + "loss": 0.6371, + "step": 18969 + }, + { + "epoch": 1.3704914479744252, + "grad_norm": 6.869828099483645, + "learning_rate": 1.1901204946871767e-06, + "loss": 0.5629, + "step": 18970 + }, + { + "epoch": 1.3705636931746348, + "grad_norm": 7.200419615068044, + "learning_rate": 1.1898713653540973e-06, + "loss": 0.6546, + "step": 18971 + }, + { + "epoch": 1.370635938374844, + "grad_norm": 6.495131797011272, + "learning_rate": 1.1896222539556615e-06, + "loss": 0.6416, + "step": 18972 + }, + { + "epoch": 1.3707081835750539, + "grad_norm": 6.392846651669942, + "learning_rate": 1.1893731604952807e-06, + "loss": 0.5355, + "step": 18973 + }, + { + "epoch": 1.3707804287752632, + "grad_norm": 7.723459390141951, + "learning_rate": 1.189124084976363e-06, + "loss": 0.7169, + "step": 18974 + }, + { + "epoch": 1.3708526739754727, + "grad_norm": 6.730060372386476, + "learning_rate": 1.1888750274023183e-06, + "loss": 0.6121, + "step": 18975 + }, + { + "epoch": 1.3709249191756823, + "grad_norm": 7.28353361411609, + "learning_rate": 1.188625987776557e-06, + "loss": 0.5761, + "step": 18976 + }, + { + "epoch": 1.3709971643758918, + "grad_norm": 6.307672751076103, + "learning_rate": 1.188376966102488e-06, + "loss": 0.6077, + "step": 18977 + }, + { + "epoch": 1.3710694095761014, + "grad_norm": 7.086172753984791, + "learning_rate": 1.1881279623835193e-06, + "loss": 0.5695, + "step": 18978 + }, + { + "epoch": 1.3711416547763107, + "grad_norm": 8.303898771194534, + "learning_rate": 1.1878789766230603e-06, + "loss": 0.7175, + "step": 18979 + }, + { + "epoch": 1.3712138999765204, + "grad_norm": 7.297012491176558, + "learning_rate": 1.1876300088245193e-06, + "loss": 0.6933, + "step": 18980 + }, + { + "epoch": 1.3712861451767298, + "grad_norm": 7.4766877581428846, + "learning_rate": 1.1873810589913042e-06, + "loss": 0.6101, + "step": 18981 + }, + { + "epoch": 1.3713583903769393, + "grad_norm": 8.0056189831102, + "learning_rate": 1.1871321271268235e-06, + "loss": 0.6191, + "step": 18982 + }, + { + "epoch": 1.3714306355771488, + "grad_norm": 7.613881243594014, + "learning_rate": 1.1868832132344846e-06, + "loss": 0.6719, + "step": 18983 + }, + { + "epoch": 1.3715028807773584, + "grad_norm": 8.107857328780964, + "learning_rate": 1.1866343173176959e-06, + "loss": 0.6458, + "step": 18984 + }, + { + "epoch": 1.371575125977568, + "grad_norm": 6.455553535348763, + "learning_rate": 1.1863854393798623e-06, + "loss": 0.7093, + "step": 18985 + }, + { + "epoch": 1.3716473711777772, + "grad_norm": 7.349249416095529, + "learning_rate": 1.1861365794243925e-06, + "loss": 0.6448, + "step": 18986 + }, + { + "epoch": 1.371719616377987, + "grad_norm": 6.98529524575806, + "learning_rate": 1.1858877374546937e-06, + "loss": 0.5818, + "step": 18987 + }, + { + "epoch": 1.3717918615781963, + "grad_norm": 7.133147550403684, + "learning_rate": 1.1856389134741703e-06, + "loss": 0.6102, + "step": 18988 + }, + { + "epoch": 1.3718641067784059, + "grad_norm": 7.3490879868376355, + "learning_rate": 1.1853901074862299e-06, + "loss": 0.648, + "step": 18989 + }, + { + "epoch": 1.3719363519786154, + "grad_norm": 6.125263130608305, + "learning_rate": 1.1851413194942771e-06, + "loss": 0.617, + "step": 18990 + }, + { + "epoch": 1.372008597178825, + "grad_norm": 8.180789439713847, + "learning_rate": 1.1848925495017205e-06, + "loss": 0.644, + "step": 18991 + }, + { + "epoch": 1.3720808423790345, + "grad_norm": 6.319213017102492, + "learning_rate": 1.1846437975119628e-06, + "loss": 0.5924, + "step": 18992 + }, + { + "epoch": 1.372153087579244, + "grad_norm": 6.2019257046734335, + "learning_rate": 1.1843950635284105e-06, + "loss": 0.6204, + "step": 18993 + }, + { + "epoch": 1.3722253327794536, + "grad_norm": 8.296646028796724, + "learning_rate": 1.184146347554469e-06, + "loss": 0.5843, + "step": 18994 + }, + { + "epoch": 1.372297577979663, + "grad_norm": 6.932576124706711, + "learning_rate": 1.1838976495935415e-06, + "loss": 0.5889, + "step": 18995 + }, + { + "epoch": 1.3723698231798724, + "grad_norm": 7.287278197169998, + "learning_rate": 1.1836489696490331e-06, + "loss": 0.6734, + "step": 18996 + }, + { + "epoch": 1.372442068380082, + "grad_norm": 6.8303199574716515, + "learning_rate": 1.1834003077243484e-06, + "loss": 0.601, + "step": 18997 + }, + { + "epoch": 1.3725143135802915, + "grad_norm": 7.233435287577426, + "learning_rate": 1.183151663822891e-06, + "loss": 0.6145, + "step": 18998 + }, + { + "epoch": 1.372586558780501, + "grad_norm": 7.273736966008848, + "learning_rate": 1.1829030379480652e-06, + "loss": 0.6655, + "step": 18999 + }, + { + "epoch": 1.3726588039807106, + "grad_norm": 6.469920176937832, + "learning_rate": 1.1826544301032739e-06, + "loss": 0.6507, + "step": 19000 + }, + { + "epoch": 1.3727310491809201, + "grad_norm": 6.421157091392017, + "learning_rate": 1.1824058402919213e-06, + "loss": 0.6615, + "step": 19001 + }, + { + "epoch": 1.3728032943811295, + "grad_norm": 6.879639603246426, + "learning_rate": 1.1821572685174093e-06, + "loss": 0.5916, + "step": 19002 + }, + { + "epoch": 1.372875539581339, + "grad_norm": 6.009103544534372, + "learning_rate": 1.1819087147831407e-06, + "loss": 0.6717, + "step": 19003 + }, + { + "epoch": 1.3729477847815486, + "grad_norm": 7.669432707774548, + "learning_rate": 1.1816601790925186e-06, + "loss": 0.6522, + "step": 19004 + }, + { + "epoch": 1.373020029981758, + "grad_norm": 5.9835038708935455, + "learning_rate": 1.1814116614489458e-06, + "loss": 0.6462, + "step": 19005 + }, + { + "epoch": 1.3730922751819676, + "grad_norm": 7.840447570325774, + "learning_rate": 1.1811631618558225e-06, + "loss": 0.6673, + "step": 19006 + }, + { + "epoch": 1.3731645203821772, + "grad_norm": 6.963359852264488, + "learning_rate": 1.1809146803165516e-06, + "loss": 0.6083, + "step": 19007 + }, + { + "epoch": 1.3732367655823867, + "grad_norm": 9.716680864571286, + "learning_rate": 1.1806662168345348e-06, + "loss": 0.65, + "step": 19008 + }, + { + "epoch": 1.373309010782596, + "grad_norm": 7.132981230462538, + "learning_rate": 1.1804177714131729e-06, + "loss": 0.656, + "step": 19009 + }, + { + "epoch": 1.3733812559828056, + "grad_norm": 7.080981623386155, + "learning_rate": 1.180169344055867e-06, + "loss": 0.5845, + "step": 19010 + }, + { + "epoch": 1.3734535011830151, + "grad_norm": 6.956731859433889, + "learning_rate": 1.1799209347660179e-06, + "loss": 0.6611, + "step": 19011 + }, + { + "epoch": 1.3735257463832247, + "grad_norm": 6.413942064090825, + "learning_rate": 1.1796725435470274e-06, + "loss": 0.6339, + "step": 19012 + }, + { + "epoch": 1.3735979915834342, + "grad_norm": 6.319040667963683, + "learning_rate": 1.1794241704022937e-06, + "loss": 0.6105, + "step": 19013 + }, + { + "epoch": 1.3736702367836437, + "grad_norm": 6.788995634919574, + "learning_rate": 1.1791758153352176e-06, + "loss": 0.6114, + "step": 19014 + }, + { + "epoch": 1.3737424819838533, + "grad_norm": 5.919378635170559, + "learning_rate": 1.178927478349199e-06, + "loss": 0.6501, + "step": 19015 + }, + { + "epoch": 1.3738147271840626, + "grad_norm": 6.936778537936395, + "learning_rate": 1.1786791594476384e-06, + "loss": 0.6043, + "step": 19016 + }, + { + "epoch": 1.3738869723842722, + "grad_norm": 6.799621448077694, + "learning_rate": 1.1784308586339336e-06, + "loss": 0.655, + "step": 19017 + }, + { + "epoch": 1.3739592175844817, + "grad_norm": 6.237162413855089, + "learning_rate": 1.178182575911484e-06, + "loss": 0.5699, + "step": 19018 + }, + { + "epoch": 1.3740314627846912, + "grad_norm": 5.3227988551689505, + "learning_rate": 1.1779343112836883e-06, + "loss": 0.6578, + "step": 19019 + }, + { + "epoch": 1.3741037079849008, + "grad_norm": 7.2232411603176745, + "learning_rate": 1.1776860647539457e-06, + "loss": 0.6395, + "step": 19020 + }, + { + "epoch": 1.3741759531851103, + "grad_norm": 9.682730208067689, + "learning_rate": 1.177437836325654e-06, + "loss": 0.6353, + "step": 19021 + }, + { + "epoch": 1.3742481983853199, + "grad_norm": 7.158561499811399, + "learning_rate": 1.1771896260022118e-06, + "loss": 0.6251, + "step": 19022 + }, + { + "epoch": 1.3743204435855292, + "grad_norm": 6.982017853258659, + "learning_rate": 1.176941433787017e-06, + "loss": 0.6776, + "step": 19023 + }, + { + "epoch": 1.3743926887857387, + "grad_norm": 6.892063950784916, + "learning_rate": 1.176693259683466e-06, + "loss": 0.6246, + "step": 19024 + }, + { + "epoch": 1.3744649339859483, + "grad_norm": 6.343610188864972, + "learning_rate": 1.1764451036949565e-06, + "loss": 0.552, + "step": 19025 + }, + { + "epoch": 1.3745371791861578, + "grad_norm": 6.389558213413323, + "learning_rate": 1.1761969658248862e-06, + "loss": 0.6142, + "step": 19026 + }, + { + "epoch": 1.3746094243863674, + "grad_norm": 6.875236923296887, + "learning_rate": 1.1759488460766514e-06, + "loss": 0.6098, + "step": 19027 + }, + { + "epoch": 1.374681669586577, + "grad_norm": 6.248011463442945, + "learning_rate": 1.1757007444536488e-06, + "loss": 0.6044, + "step": 19028 + }, + { + "epoch": 1.3747539147867864, + "grad_norm": 6.937148626553602, + "learning_rate": 1.1754526609592748e-06, + "loss": 0.6529, + "step": 19029 + }, + { + "epoch": 1.3748261599869958, + "grad_norm": 7.242023157520452, + "learning_rate": 1.1752045955969264e-06, + "loss": 0.6006, + "step": 19030 + }, + { + "epoch": 1.3748984051872053, + "grad_norm": 6.440664161885124, + "learning_rate": 1.1749565483699977e-06, + "loss": 0.6592, + "step": 19031 + }, + { + "epoch": 1.3749706503874148, + "grad_norm": 5.41035085390076, + "learning_rate": 1.1747085192818848e-06, + "loss": 0.549, + "step": 19032 + }, + { + "epoch": 1.3750428955876244, + "grad_norm": 7.96557075059989, + "learning_rate": 1.174460508335983e-06, + "loss": 0.6313, + "step": 19033 + }, + { + "epoch": 1.375115140787834, + "grad_norm": 8.54541717314526, + "learning_rate": 1.174212515535689e-06, + "loss": 0.6269, + "step": 19034 + }, + { + "epoch": 1.3751873859880435, + "grad_norm": 6.199892523049353, + "learning_rate": 1.173964540884395e-06, + "loss": 0.584, + "step": 19035 + }, + { + "epoch": 1.375259631188253, + "grad_norm": 8.189617785802891, + "learning_rate": 1.1737165843854969e-06, + "loss": 0.6348, + "step": 19036 + }, + { + "epoch": 1.3753318763884623, + "grad_norm": 7.819176834372497, + "learning_rate": 1.1734686460423892e-06, + "loss": 0.7232, + "step": 19037 + }, + { + "epoch": 1.375404121588672, + "grad_norm": 8.207493256569915, + "learning_rate": 1.1732207258584657e-06, + "loss": 0.6475, + "step": 19038 + }, + { + "epoch": 1.3754763667888814, + "grad_norm": 6.64436532990225, + "learning_rate": 1.1729728238371201e-06, + "loss": 0.6056, + "step": 19039 + }, + { + "epoch": 1.375548611989091, + "grad_norm": 6.803942704198903, + "learning_rate": 1.172724939981746e-06, + "loss": 0.6477, + "step": 19040 + }, + { + "epoch": 1.3756208571893005, + "grad_norm": 6.787961951846578, + "learning_rate": 1.1724770742957384e-06, + "loss": 0.6322, + "step": 19041 + }, + { + "epoch": 1.37569310238951, + "grad_norm": 9.53321292863407, + "learning_rate": 1.1722292267824875e-06, + "loss": 0.6248, + "step": 19042 + }, + { + "epoch": 1.3757653475897196, + "grad_norm": 7.039873726319211, + "learning_rate": 1.1719813974453878e-06, + "loss": 0.6448, + "step": 19043 + }, + { + "epoch": 1.375837592789929, + "grad_norm": 6.402035997312782, + "learning_rate": 1.1717335862878326e-06, + "loss": 0.6724, + "step": 19044 + }, + { + "epoch": 1.3759098379901387, + "grad_norm": 6.85006854169204, + "learning_rate": 1.171485793313212e-06, + "loss": 0.5797, + "step": 19045 + }, + { + "epoch": 1.375982083190348, + "grad_norm": 7.6131910589834755, + "learning_rate": 1.1712380185249198e-06, + "loss": 0.641, + "step": 19046 + }, + { + "epoch": 1.3760543283905575, + "grad_norm": 8.426254351141836, + "learning_rate": 1.1709902619263462e-06, + "loss": 0.6246, + "step": 19047 + }, + { + "epoch": 1.376126573590767, + "grad_norm": 7.071467285270462, + "learning_rate": 1.1707425235208857e-06, + "loss": 0.6344, + "step": 19048 + }, + { + "epoch": 1.3761988187909766, + "grad_norm": 6.115658233973317, + "learning_rate": 1.1704948033119272e-06, + "loss": 0.6872, + "step": 19049 + }, + { + "epoch": 1.3762710639911861, + "grad_norm": 7.127021636591772, + "learning_rate": 1.1702471013028627e-06, + "loss": 0.6049, + "step": 19050 + }, + { + "epoch": 1.3763433091913955, + "grad_norm": 8.057709919570772, + "learning_rate": 1.1699994174970837e-06, + "loss": 0.6035, + "step": 19051 + }, + { + "epoch": 1.3764155543916052, + "grad_norm": 7.313061032414427, + "learning_rate": 1.1697517518979792e-06, + "loss": 0.6874, + "step": 19052 + }, + { + "epoch": 1.3764877995918146, + "grad_norm": 6.439941239651649, + "learning_rate": 1.1695041045089402e-06, + "loss": 0.6364, + "step": 19053 + }, + { + "epoch": 1.376560044792024, + "grad_norm": 6.078823490753641, + "learning_rate": 1.1692564753333568e-06, + "loss": 0.6126, + "step": 19054 + }, + { + "epoch": 1.3766322899922336, + "grad_norm": 7.466251993017795, + "learning_rate": 1.1690088643746193e-06, + "loss": 0.6407, + "step": 19055 + }, + { + "epoch": 1.3767045351924432, + "grad_norm": 7.057568203809016, + "learning_rate": 1.1687612716361168e-06, + "loss": 0.5931, + "step": 19056 + }, + { + "epoch": 1.3767767803926527, + "grad_norm": 6.509207439784096, + "learning_rate": 1.1685136971212392e-06, + "loss": 0.5801, + "step": 19057 + }, + { + "epoch": 1.376849025592862, + "grad_norm": 5.318452798539072, + "learning_rate": 1.1682661408333749e-06, + "loss": 0.5083, + "step": 19058 + }, + { + "epoch": 1.3769212707930718, + "grad_norm": 5.97731178922314, + "learning_rate": 1.1680186027759137e-06, + "loss": 0.6385, + "step": 19059 + }, + { + "epoch": 1.3769935159932811, + "grad_norm": 7.3727066871022995, + "learning_rate": 1.1677710829522435e-06, + "loss": 0.6466, + "step": 19060 + }, + { + "epoch": 1.3770657611934907, + "grad_norm": 7.579621912401762, + "learning_rate": 1.1675235813657521e-06, + "loss": 0.6, + "step": 19061 + }, + { + "epoch": 1.3771380063937002, + "grad_norm": 6.896720283684609, + "learning_rate": 1.1672760980198294e-06, + "loss": 0.5707, + "step": 19062 + }, + { + "epoch": 1.3772102515939098, + "grad_norm": 6.191156713228466, + "learning_rate": 1.1670286329178615e-06, + "loss": 0.505, + "step": 19063 + }, + { + "epoch": 1.3772824967941193, + "grad_norm": 7.724125646595511, + "learning_rate": 1.1667811860632364e-06, + "loss": 0.6612, + "step": 19064 + }, + { + "epoch": 1.3773547419943288, + "grad_norm": 6.7397266905296025, + "learning_rate": 1.1665337574593417e-06, + "loss": 0.6655, + "step": 19065 + }, + { + "epoch": 1.3774269871945384, + "grad_norm": 5.6840779835601625, + "learning_rate": 1.1662863471095642e-06, + "loss": 0.5559, + "step": 19066 + }, + { + "epoch": 1.3774992323947477, + "grad_norm": 6.0589359464733406, + "learning_rate": 1.1660389550172912e-06, + "loss": 0.6971, + "step": 19067 + }, + { + "epoch": 1.3775714775949572, + "grad_norm": 6.713370507196406, + "learning_rate": 1.1657915811859093e-06, + "loss": 0.6379, + "step": 19068 + }, + { + "epoch": 1.3776437227951668, + "grad_norm": 7.008928871328332, + "learning_rate": 1.1655442256188058e-06, + "loss": 0.6156, + "step": 19069 + }, + { + "epoch": 1.3777159679953763, + "grad_norm": 6.128942349718414, + "learning_rate": 1.1652968883193645e-06, + "loss": 0.6292, + "step": 19070 + }, + { + "epoch": 1.3777882131955859, + "grad_norm": 7.457955832136029, + "learning_rate": 1.1650495692909725e-06, + "loss": 0.6845, + "step": 19071 + }, + { + "epoch": 1.3778604583957954, + "grad_norm": 6.867932657861456, + "learning_rate": 1.1648022685370155e-06, + "loss": 0.6847, + "step": 19072 + }, + { + "epoch": 1.377932703596005, + "grad_norm": 7.442004466604211, + "learning_rate": 1.1645549860608797e-06, + "loss": 0.6582, + "step": 19073 + }, + { + "epoch": 1.3780049487962143, + "grad_norm": 5.963362734135578, + "learning_rate": 1.164307721865948e-06, + "loss": 0.6301, + "step": 19074 + }, + { + "epoch": 1.3780771939964238, + "grad_norm": 6.5342283621944715, + "learning_rate": 1.1640604759556057e-06, + "loss": 0.636, + "step": 19075 + }, + { + "epoch": 1.3781494391966334, + "grad_norm": 6.444937548279324, + "learning_rate": 1.16381324833324e-06, + "loss": 0.6402, + "step": 19076 + }, + { + "epoch": 1.378221684396843, + "grad_norm": 6.858505715312385, + "learning_rate": 1.1635660390022327e-06, + "loss": 0.7035, + "step": 19077 + }, + { + "epoch": 1.3782939295970524, + "grad_norm": 7.089025596302308, + "learning_rate": 1.1633188479659685e-06, + "loss": 0.6792, + "step": 19078 + }, + { + "epoch": 1.378366174797262, + "grad_norm": 6.907934199246765, + "learning_rate": 1.1630716752278315e-06, + "loss": 0.5406, + "step": 19079 + }, + { + "epoch": 1.3784384199974715, + "grad_norm": 7.2911370075952755, + "learning_rate": 1.1628245207912062e-06, + "loss": 0.5829, + "step": 19080 + }, + { + "epoch": 1.3785106651976808, + "grad_norm": 6.30676522697443, + "learning_rate": 1.162577384659474e-06, + "loss": 0.5981, + "step": 19081 + }, + { + "epoch": 1.3785829103978904, + "grad_norm": 6.6108913260434, + "learning_rate": 1.162330266836019e-06, + "loss": 0.6697, + "step": 19082 + }, + { + "epoch": 1.3786551555981, + "grad_norm": 7.2906287044903255, + "learning_rate": 1.162083167324224e-06, + "loss": 0.6276, + "step": 19083 + }, + { + "epoch": 1.3787274007983095, + "grad_norm": 6.812579513225768, + "learning_rate": 1.1618360861274713e-06, + "loss": 0.6767, + "step": 19084 + }, + { + "epoch": 1.378799645998519, + "grad_norm": 6.984909412667699, + "learning_rate": 1.1615890232491439e-06, + "loss": 0.6217, + "step": 19085 + }, + { + "epoch": 1.3788718911987285, + "grad_norm": 6.597543791133768, + "learning_rate": 1.1613419786926233e-06, + "loss": 0.6182, + "step": 19086 + }, + { + "epoch": 1.378944136398938, + "grad_norm": 6.96080159098523, + "learning_rate": 1.1610949524612928e-06, + "loss": 0.6311, + "step": 19087 + }, + { + "epoch": 1.3790163815991474, + "grad_norm": 6.083183321562031, + "learning_rate": 1.1608479445585316e-06, + "loss": 0.6466, + "step": 19088 + }, + { + "epoch": 1.379088626799357, + "grad_norm": 6.645226748855239, + "learning_rate": 1.1606009549877226e-06, + "loss": 0.6155, + "step": 19089 + }, + { + "epoch": 1.3791608719995665, + "grad_norm": 7.093889008222946, + "learning_rate": 1.1603539837522461e-06, + "loss": 0.6253, + "step": 19090 + }, + { + "epoch": 1.379233117199776, + "grad_norm": 5.889564479405933, + "learning_rate": 1.1601070308554849e-06, + "loss": 0.5791, + "step": 19091 + }, + { + "epoch": 1.3793053623999856, + "grad_norm": 6.734853351358635, + "learning_rate": 1.159860096300817e-06, + "loss": 0.5984, + "step": 19092 + }, + { + "epoch": 1.3793776076001951, + "grad_norm": 6.364871787003273, + "learning_rate": 1.1596131800916236e-06, + "loss": 0.6818, + "step": 19093 + }, + { + "epoch": 1.3794498528004047, + "grad_norm": 6.9321584672022825, + "learning_rate": 1.159366282231285e-06, + "loss": 0.6247, + "step": 19094 + }, + { + "epoch": 1.379522098000614, + "grad_norm": 7.059407325052568, + "learning_rate": 1.1591194027231812e-06, + "loss": 0.6311, + "step": 19095 + }, + { + "epoch": 1.3795943432008235, + "grad_norm": 6.273855535087375, + "learning_rate": 1.1588725415706917e-06, + "loss": 0.6231, + "step": 19096 + }, + { + "epoch": 1.379666588401033, + "grad_norm": 6.252452521740657, + "learning_rate": 1.1586256987771958e-06, + "loss": 0.5909, + "step": 19097 + }, + { + "epoch": 1.3797388336012426, + "grad_norm": 6.619694438693521, + "learning_rate": 1.1583788743460733e-06, + "loss": 0.6295, + "step": 19098 + }, + { + "epoch": 1.3798110788014522, + "grad_norm": 6.793825644990549, + "learning_rate": 1.1581320682807015e-06, + "loss": 0.6474, + "step": 19099 + }, + { + "epoch": 1.3798833240016617, + "grad_norm": 6.269681535387247, + "learning_rate": 1.1578852805844599e-06, + "loss": 0.6451, + "step": 19100 + }, + { + "epoch": 1.3799555692018712, + "grad_norm": 6.606998963427675, + "learning_rate": 1.1576385112607275e-06, + "loss": 0.6596, + "step": 19101 + }, + { + "epoch": 1.3800278144020806, + "grad_norm": 5.798275184672378, + "learning_rate": 1.1573917603128807e-06, + "loss": 0.6659, + "step": 19102 + }, + { + "epoch": 1.38010005960229, + "grad_norm": 8.438926293655259, + "learning_rate": 1.1571450277442972e-06, + "loss": 0.6156, + "step": 19103 + }, + { + "epoch": 1.3801723048024996, + "grad_norm": 7.43029333426367, + "learning_rate": 1.1568983135583567e-06, + "loss": 0.6066, + "step": 19104 + }, + { + "epoch": 1.3802445500027092, + "grad_norm": 10.82900429330301, + "learning_rate": 1.1566516177584363e-06, + "loss": 0.5904, + "step": 19105 + }, + { + "epoch": 1.3803167952029187, + "grad_norm": 6.579956669534323, + "learning_rate": 1.1564049403479114e-06, + "loss": 0.5998, + "step": 19106 + }, + { + "epoch": 1.3803890404031283, + "grad_norm": 6.765136701128879, + "learning_rate": 1.1561582813301595e-06, + "loss": 0.6145, + "step": 19107 + }, + { + "epoch": 1.3804612856033378, + "grad_norm": 6.921989629412016, + "learning_rate": 1.1559116407085571e-06, + "loss": 0.611, + "step": 19108 + }, + { + "epoch": 1.3805335308035471, + "grad_norm": 6.7451120268593625, + "learning_rate": 1.155665018486482e-06, + "loss": 0.6244, + "step": 19109 + }, + { + "epoch": 1.3806057760037567, + "grad_norm": 6.808904136284294, + "learning_rate": 1.155418414667308e-06, + "loss": 0.6514, + "step": 19110 + }, + { + "epoch": 1.3806780212039662, + "grad_norm": 7.20951404346506, + "learning_rate": 1.1551718292544118e-06, + "loss": 0.64, + "step": 19111 + }, + { + "epoch": 1.3807502664041758, + "grad_norm": 7.405211556945892, + "learning_rate": 1.154925262251169e-06, + "loss": 0.6198, + "step": 19112 + }, + { + "epoch": 1.3808225116043853, + "grad_norm": 6.823568329496998, + "learning_rate": 1.1546787136609552e-06, + "loss": 0.5964, + "step": 19113 + }, + { + "epoch": 1.3808947568045948, + "grad_norm": 8.256942140154512, + "learning_rate": 1.1544321834871452e-06, + "loss": 0.6157, + "step": 19114 + }, + { + "epoch": 1.3809670020048044, + "grad_norm": 6.190405577651622, + "learning_rate": 1.1541856717331137e-06, + "loss": 0.589, + "step": 19115 + }, + { + "epoch": 1.3810392472050137, + "grad_norm": 5.542553402889047, + "learning_rate": 1.1539391784022366e-06, + "loss": 0.6551, + "step": 19116 + }, + { + "epoch": 1.3811114924052235, + "grad_norm": 5.732272477290857, + "learning_rate": 1.1536927034978857e-06, + "loss": 0.5573, + "step": 19117 + }, + { + "epoch": 1.3811837376054328, + "grad_norm": 6.567473933498899, + "learning_rate": 1.1534462470234368e-06, + "loss": 0.5962, + "step": 19118 + }, + { + "epoch": 1.3812559828056423, + "grad_norm": 5.757966535104625, + "learning_rate": 1.1531998089822638e-06, + "loss": 0.5606, + "step": 19119 + }, + { + "epoch": 1.3813282280058519, + "grad_norm": 6.6848846740589885, + "learning_rate": 1.1529533893777386e-06, + "loss": 0.6097, + "step": 19120 + }, + { + "epoch": 1.3814004732060614, + "grad_norm": 7.6474037601816365, + "learning_rate": 1.152706988213236e-06, + "loss": 0.689, + "step": 19121 + }, + { + "epoch": 1.381472718406271, + "grad_norm": 7.224471561945914, + "learning_rate": 1.1524606054921284e-06, + "loss": 0.6454, + "step": 19122 + }, + { + "epoch": 1.3815449636064803, + "grad_norm": 6.035007077444457, + "learning_rate": 1.152214241217789e-06, + "loss": 0.6252, + "step": 19123 + }, + { + "epoch": 1.38161720880669, + "grad_norm": 7.766673947877977, + "learning_rate": 1.15196789539359e-06, + "loss": 0.6273, + "step": 19124 + }, + { + "epoch": 1.3816894540068994, + "grad_norm": 7.307108660414026, + "learning_rate": 1.1517215680229038e-06, + "loss": 0.6275, + "step": 19125 + }, + { + "epoch": 1.381761699207109, + "grad_norm": 6.890081613442077, + "learning_rate": 1.1514752591091036e-06, + "loss": 0.6084, + "step": 19126 + }, + { + "epoch": 1.3818339444073184, + "grad_norm": 5.743012868509118, + "learning_rate": 1.151228968655559e-06, + "loss": 0.5687, + "step": 19127 + }, + { + "epoch": 1.381906189607528, + "grad_norm": 6.197988232966107, + "learning_rate": 1.1509826966656423e-06, + "loss": 0.5991, + "step": 19128 + }, + { + "epoch": 1.3819784348077375, + "grad_norm": 6.61480417758994, + "learning_rate": 1.1507364431427253e-06, + "loss": 0.6698, + "step": 19129 + }, + { + "epoch": 1.3820506800079468, + "grad_norm": 7.448839278975209, + "learning_rate": 1.1504902080901797e-06, + "loss": 0.6941, + "step": 19130 + }, + { + "epoch": 1.3821229252081566, + "grad_norm": 6.2906802922115155, + "learning_rate": 1.1502439915113744e-06, + "loss": 0.5908, + "step": 19131 + }, + { + "epoch": 1.382195170408366, + "grad_norm": 6.550692735687029, + "learning_rate": 1.14999779340968e-06, + "loss": 0.6207, + "step": 19132 + }, + { + "epoch": 1.3822674156085755, + "grad_norm": 7.343114687446212, + "learning_rate": 1.1497516137884692e-06, + "loss": 0.6206, + "step": 19133 + }, + { + "epoch": 1.382339660808785, + "grad_norm": 8.108917282211362, + "learning_rate": 1.1495054526511096e-06, + "loss": 0.6351, + "step": 19134 + }, + { + "epoch": 1.3824119060089946, + "grad_norm": 6.284615445386195, + "learning_rate": 1.1492593100009717e-06, + "loss": 0.6326, + "step": 19135 + }, + { + "epoch": 1.382484151209204, + "grad_norm": 7.068936276194905, + "learning_rate": 1.1490131858414252e-06, + "loss": 0.5587, + "step": 19136 + }, + { + "epoch": 1.3825563964094134, + "grad_norm": 8.539029441553117, + "learning_rate": 1.14876708017584e-06, + "loss": 0.6126, + "step": 19137 + }, + { + "epoch": 1.3826286416096232, + "grad_norm": 6.413059393146869, + "learning_rate": 1.1485209930075836e-06, + "loss": 0.6617, + "step": 19138 + }, + { + "epoch": 1.3827008868098325, + "grad_norm": 5.59077707065701, + "learning_rate": 1.148274924340025e-06, + "loss": 0.5495, + "step": 19139 + }, + { + "epoch": 1.382773132010042, + "grad_norm": 7.56746718531571, + "learning_rate": 1.1480288741765335e-06, + "loss": 0.57, + "step": 19140 + }, + { + "epoch": 1.3828453772102516, + "grad_norm": 6.675203801733527, + "learning_rate": 1.147782842520477e-06, + "loss": 0.5967, + "step": 19141 + }, + { + "epoch": 1.3829176224104611, + "grad_norm": 7.0795107531285035, + "learning_rate": 1.1475368293752234e-06, + "loss": 0.5892, + "step": 19142 + }, + { + "epoch": 1.3829898676106707, + "grad_norm": 7.154583199634906, + "learning_rate": 1.1472908347441405e-06, + "loss": 0.6485, + "step": 19143 + }, + { + "epoch": 1.3830621128108802, + "grad_norm": 6.517511986478927, + "learning_rate": 1.1470448586305966e-06, + "loss": 0.6257, + "step": 19144 + }, + { + "epoch": 1.3831343580110897, + "grad_norm": 8.02062047370421, + "learning_rate": 1.1467989010379571e-06, + "loss": 0.7072, + "step": 19145 + }, + { + "epoch": 1.383206603211299, + "grad_norm": 6.2054940031196475, + "learning_rate": 1.14655296196959e-06, + "loss": 0.5896, + "step": 19146 + }, + { + "epoch": 1.3832788484115086, + "grad_norm": 6.3990110110036, + "learning_rate": 1.146307041428862e-06, + "loss": 0.6221, + "step": 19147 + }, + { + "epoch": 1.3833510936117182, + "grad_norm": 7.5400374814829645, + "learning_rate": 1.1460611394191406e-06, + "loss": 0.6908, + "step": 19148 + }, + { + "epoch": 1.3834233388119277, + "grad_norm": 7.334602795164971, + "learning_rate": 1.1458152559437898e-06, + "loss": 0.5652, + "step": 19149 + }, + { + "epoch": 1.3834955840121372, + "grad_norm": 7.736264762821187, + "learning_rate": 1.145569391006177e-06, + "loss": 0.665, + "step": 19150 + }, + { + "epoch": 1.3835678292123468, + "grad_norm": 8.034096536221128, + "learning_rate": 1.1453235446096671e-06, + "loss": 0.6742, + "step": 19151 + }, + { + "epoch": 1.3836400744125563, + "grad_norm": 6.534383359855365, + "learning_rate": 1.1450777167576266e-06, + "loss": 0.6636, + "step": 19152 + }, + { + "epoch": 1.3837123196127656, + "grad_norm": 6.518069753146614, + "learning_rate": 1.1448319074534198e-06, + "loss": 0.6557, + "step": 19153 + }, + { + "epoch": 1.3837845648129752, + "grad_norm": 6.494182624394265, + "learning_rate": 1.1445861167004118e-06, + "loss": 0.5401, + "step": 19154 + }, + { + "epoch": 1.3838568100131847, + "grad_norm": 8.493581704780455, + "learning_rate": 1.144340344501969e-06, + "loss": 0.6912, + "step": 19155 + }, + { + "epoch": 1.3839290552133943, + "grad_norm": 6.735626458455204, + "learning_rate": 1.1440945908614527e-06, + "loss": 0.6299, + "step": 19156 + }, + { + "epoch": 1.3840013004136038, + "grad_norm": 7.361307396660919, + "learning_rate": 1.1438488557822288e-06, + "loss": 0.64, + "step": 19157 + }, + { + "epoch": 1.3840735456138133, + "grad_norm": 6.764054818786786, + "learning_rate": 1.1436031392676622e-06, + "loss": 0.5838, + "step": 19158 + }, + { + "epoch": 1.384145790814023, + "grad_norm": 6.74541091334639, + "learning_rate": 1.1433574413211142e-06, + "loss": 0.6443, + "step": 19159 + }, + { + "epoch": 1.3842180360142322, + "grad_norm": 7.364563103359842, + "learning_rate": 1.143111761945949e-06, + "loss": 0.6939, + "step": 19160 + }, + { + "epoch": 1.3842902812144418, + "grad_norm": 6.696369599582223, + "learning_rate": 1.142866101145531e-06, + "loss": 0.5522, + "step": 19161 + }, + { + "epoch": 1.3843625264146513, + "grad_norm": 7.4168905445841515, + "learning_rate": 1.1426204589232229e-06, + "loss": 0.6504, + "step": 19162 + }, + { + "epoch": 1.3844347716148608, + "grad_norm": 6.624121265794076, + "learning_rate": 1.1423748352823861e-06, + "loss": 0.6468, + "step": 19163 + }, + { + "epoch": 1.3845070168150704, + "grad_norm": 7.357031187778751, + "learning_rate": 1.1421292302263837e-06, + "loss": 0.6676, + "step": 19164 + }, + { + "epoch": 1.38457926201528, + "grad_norm": 6.093883649876524, + "learning_rate": 1.1418836437585778e-06, + "loss": 0.5629, + "step": 19165 + }, + { + "epoch": 1.3846515072154895, + "grad_norm": 8.204054076851179, + "learning_rate": 1.141638075882331e-06, + "loss": 0.66, + "step": 19166 + }, + { + "epoch": 1.3847237524156988, + "grad_norm": 6.190322078281657, + "learning_rate": 1.1413925266010037e-06, + "loss": 0.6475, + "step": 19167 + }, + { + "epoch": 1.3847959976159083, + "grad_norm": 6.7124070145117685, + "learning_rate": 1.1411469959179579e-06, + "loss": 0.6083, + "step": 19168 + }, + { + "epoch": 1.3848682428161179, + "grad_norm": 7.865128642665282, + "learning_rate": 1.1409014838365542e-06, + "loss": 0.6135, + "step": 19169 + }, + { + "epoch": 1.3849404880163274, + "grad_norm": 8.006678654484135, + "learning_rate": 1.1406559903601541e-06, + "loss": 0.6703, + "step": 19170 + }, + { + "epoch": 1.385012733216537, + "grad_norm": 6.530417357305405, + "learning_rate": 1.1404105154921184e-06, + "loss": 0.5827, + "step": 19171 + }, + { + "epoch": 1.3850849784167465, + "grad_norm": 8.680164341227801, + "learning_rate": 1.1401650592358067e-06, + "loss": 0.6935, + "step": 19172 + }, + { + "epoch": 1.385157223616956, + "grad_norm": 7.3194330719246805, + "learning_rate": 1.1399196215945805e-06, + "loss": 0.6182, + "step": 19173 + }, + { + "epoch": 1.3852294688171654, + "grad_norm": 7.712513740317101, + "learning_rate": 1.1396742025717975e-06, + "loss": 0.58, + "step": 19174 + }, + { + "epoch": 1.385301714017375, + "grad_norm": 6.436310889777858, + "learning_rate": 1.1394288021708185e-06, + "loss": 0.693, + "step": 19175 + }, + { + "epoch": 1.3853739592175844, + "grad_norm": 6.9119004829448185, + "learning_rate": 1.139183420395004e-06, + "loss": 0.6944, + "step": 19176 + }, + { + "epoch": 1.385446204417794, + "grad_norm": 6.576139970593828, + "learning_rate": 1.1389380572477107e-06, + "loss": 0.6536, + "step": 19177 + }, + { + "epoch": 1.3855184496180035, + "grad_norm": 6.423812982176588, + "learning_rate": 1.1386927127322988e-06, + "loss": 0.6951, + "step": 19178 + }, + { + "epoch": 1.385590694818213, + "grad_norm": 5.408075101748527, + "learning_rate": 1.1384473868521265e-06, + "loss": 0.5726, + "step": 19179 + }, + { + "epoch": 1.3856629400184226, + "grad_norm": 6.953409184852577, + "learning_rate": 1.1382020796105526e-06, + "loss": 0.7119, + "step": 19180 + }, + { + "epoch": 1.385735185218632, + "grad_norm": 6.431567968102359, + "learning_rate": 1.1379567910109346e-06, + "loss": 0.5976, + "step": 19181 + }, + { + "epoch": 1.3858074304188415, + "grad_norm": 5.543192067933598, + "learning_rate": 1.1377115210566306e-06, + "loss": 0.6107, + "step": 19182 + }, + { + "epoch": 1.385879675619051, + "grad_norm": 8.578557238085514, + "learning_rate": 1.137466269750999e-06, + "loss": 0.7718, + "step": 19183 + }, + { + "epoch": 1.3859519208192606, + "grad_norm": 7.162560757950199, + "learning_rate": 1.137221037097395e-06, + "loss": 0.645, + "step": 19184 + }, + { + "epoch": 1.38602416601947, + "grad_norm": 6.3848083438651955, + "learning_rate": 1.1369758230991775e-06, + "loss": 0.5997, + "step": 19185 + }, + { + "epoch": 1.3860964112196796, + "grad_norm": 7.81077251408884, + "learning_rate": 1.1367306277597022e-06, + "loss": 0.7069, + "step": 19186 + }, + { + "epoch": 1.3861686564198892, + "grad_norm": 7.519113726233122, + "learning_rate": 1.1364854510823275e-06, + "loss": 0.6406, + "step": 19187 + }, + { + "epoch": 1.3862409016200985, + "grad_norm": 6.998779735471358, + "learning_rate": 1.1362402930704062e-06, + "loss": 0.6534, + "step": 19188 + }, + { + "epoch": 1.3863131468203083, + "grad_norm": 6.1972296154997, + "learning_rate": 1.1359951537272971e-06, + "loss": 0.6023, + "step": 19189 + }, + { + "epoch": 1.3863853920205176, + "grad_norm": 6.501667102356386, + "learning_rate": 1.1357500330563553e-06, + "loss": 0.6134, + "step": 19190 + }, + { + "epoch": 1.3864576372207271, + "grad_norm": 6.827034660245784, + "learning_rate": 1.1355049310609376e-06, + "loss": 0.6569, + "step": 19191 + }, + { + "epoch": 1.3865298824209367, + "grad_norm": 5.729848962094723, + "learning_rate": 1.1352598477443966e-06, + "loss": 0.6219, + "step": 19192 + }, + { + "epoch": 1.3866021276211462, + "grad_norm": 6.679423651587225, + "learning_rate": 1.1350147831100887e-06, + "loss": 0.6314, + "step": 19193 + }, + { + "epoch": 1.3866743728213557, + "grad_norm": 8.350456599925904, + "learning_rate": 1.1347697371613697e-06, + "loss": 0.5734, + "step": 19194 + }, + { + "epoch": 1.386746618021565, + "grad_norm": 7.28941810451906, + "learning_rate": 1.1345247099015919e-06, + "loss": 0.642, + "step": 19195 + }, + { + "epoch": 1.3868188632217748, + "grad_norm": 7.088933309311577, + "learning_rate": 1.1342797013341105e-06, + "loss": 0.6211, + "step": 19196 + }, + { + "epoch": 1.3868911084219842, + "grad_norm": 6.131912379206795, + "learning_rate": 1.1340347114622795e-06, + "loss": 0.6635, + "step": 19197 + }, + { + "epoch": 1.3869633536221937, + "grad_norm": 6.576765268743175, + "learning_rate": 1.1337897402894529e-06, + "loss": 0.6275, + "step": 19198 + }, + { + "epoch": 1.3870355988224032, + "grad_norm": 6.061532798773382, + "learning_rate": 1.133544787818984e-06, + "loss": 0.5991, + "step": 19199 + }, + { + "epoch": 1.3871078440226128, + "grad_norm": 7.7396801565556475, + "learning_rate": 1.133299854054226e-06, + "loss": 0.6815, + "step": 19200 + }, + { + "epoch": 1.3871800892228223, + "grad_norm": 6.866256147291958, + "learning_rate": 1.1330549389985326e-06, + "loss": 0.5897, + "step": 19201 + }, + { + "epoch": 1.3872523344230316, + "grad_norm": 6.737869948950826, + "learning_rate": 1.132810042655255e-06, + "loss": 0.6137, + "step": 19202 + }, + { + "epoch": 1.3873245796232414, + "grad_norm": 8.16210259818156, + "learning_rate": 1.1325651650277462e-06, + "loss": 0.6429, + "step": 19203 + }, + { + "epoch": 1.3873968248234507, + "grad_norm": 6.772331314661656, + "learning_rate": 1.1323203061193585e-06, + "loss": 0.6094, + "step": 19204 + }, + { + "epoch": 1.3874690700236603, + "grad_norm": 6.871594504564151, + "learning_rate": 1.1320754659334449e-06, + "loss": 0.647, + "step": 19205 + }, + { + "epoch": 1.3875413152238698, + "grad_norm": 7.414251327119901, + "learning_rate": 1.1318306444733551e-06, + "loss": 0.7119, + "step": 19206 + }, + { + "epoch": 1.3876135604240794, + "grad_norm": 6.159679096830267, + "learning_rate": 1.1315858417424414e-06, + "loss": 0.5813, + "step": 19207 + }, + { + "epoch": 1.387685805624289, + "grad_norm": 8.52270392767281, + "learning_rate": 1.131341057744055e-06, + "loss": 0.6605, + "step": 19208 + }, + { + "epoch": 1.3877580508244982, + "grad_norm": 7.1122584940530995, + "learning_rate": 1.1310962924815469e-06, + "loss": 0.611, + "step": 19209 + }, + { + "epoch": 1.387830296024708, + "grad_norm": 7.525682092720457, + "learning_rate": 1.1308515459582678e-06, + "loss": 0.6278, + "step": 19210 + }, + { + "epoch": 1.3879025412249173, + "grad_norm": 6.839373857270963, + "learning_rate": 1.1306068181775675e-06, + "loss": 0.6258, + "step": 19211 + }, + { + "epoch": 1.3879747864251268, + "grad_norm": 7.505388739961406, + "learning_rate": 1.130362109142798e-06, + "loss": 0.6164, + "step": 19212 + }, + { + "epoch": 1.3880470316253364, + "grad_norm": 6.4651355320294535, + "learning_rate": 1.1301174188573067e-06, + "loss": 0.552, + "step": 19213 + }, + { + "epoch": 1.388119276825546, + "grad_norm": 7.6586688521992174, + "learning_rate": 1.1298727473244442e-06, + "loss": 0.6208, + "step": 19214 + }, + { + "epoch": 1.3881915220257555, + "grad_norm": 6.533294211808974, + "learning_rate": 1.1296280945475602e-06, + "loss": 0.6017, + "step": 19215 + }, + { + "epoch": 1.388263767225965, + "grad_norm": 5.995168966245896, + "learning_rate": 1.1293834605300032e-06, + "loss": 0.664, + "step": 19216 + }, + { + "epoch": 1.3883360124261745, + "grad_norm": 6.762298682525025, + "learning_rate": 1.1291388452751226e-06, + "loss": 0.6518, + "step": 19217 + }, + { + "epoch": 1.3884082576263839, + "grad_norm": 6.922062925108393, + "learning_rate": 1.128894248786267e-06, + "loss": 0.6245, + "step": 19218 + }, + { + "epoch": 1.3884805028265934, + "grad_norm": 6.509249048942001, + "learning_rate": 1.1286496710667851e-06, + "loss": 0.606, + "step": 19219 + }, + { + "epoch": 1.388552748026803, + "grad_norm": 6.479860062206687, + "learning_rate": 1.1284051121200237e-06, + "loss": 0.6301, + "step": 19220 + }, + { + "epoch": 1.3886249932270125, + "grad_norm": 5.543140626464454, + "learning_rate": 1.1281605719493313e-06, + "loss": 0.5895, + "step": 19221 + }, + { + "epoch": 1.388697238427222, + "grad_norm": 7.055057891889347, + "learning_rate": 1.1279160505580556e-06, + "loss": 0.578, + "step": 19222 + }, + { + "epoch": 1.3887694836274316, + "grad_norm": 6.518825559095893, + "learning_rate": 1.1276715479495445e-06, + "loss": 0.6286, + "step": 19223 + }, + { + "epoch": 1.3888417288276411, + "grad_norm": 6.6647075953520245, + "learning_rate": 1.1274270641271438e-06, + "loss": 0.5661, + "step": 19224 + }, + { + "epoch": 1.3889139740278504, + "grad_norm": 7.059519991455908, + "learning_rate": 1.1271825990942007e-06, + "loss": 0.6101, + "step": 19225 + }, + { + "epoch": 1.38898621922806, + "grad_norm": 5.900696742410886, + "learning_rate": 1.1269381528540615e-06, + "loss": 0.6252, + "step": 19226 + }, + { + "epoch": 1.3890584644282695, + "grad_norm": 7.068022872533744, + "learning_rate": 1.1266937254100736e-06, + "loss": 0.636, + "step": 19227 + }, + { + "epoch": 1.389130709628479, + "grad_norm": 7.8406876745109875, + "learning_rate": 1.1264493167655815e-06, + "loss": 0.6386, + "step": 19228 + }, + { + "epoch": 1.3892029548286886, + "grad_norm": 8.642120618187233, + "learning_rate": 1.1262049269239323e-06, + "loss": 0.6928, + "step": 19229 + }, + { + "epoch": 1.3892752000288981, + "grad_norm": 7.560875079659421, + "learning_rate": 1.1259605558884718e-06, + "loss": 0.7074, + "step": 19230 + }, + { + "epoch": 1.3893474452291077, + "grad_norm": 5.924118542996469, + "learning_rate": 1.1257162036625435e-06, + "loss": 0.579, + "step": 19231 + }, + { + "epoch": 1.389419690429317, + "grad_norm": 6.5950587994063365, + "learning_rate": 1.1254718702494932e-06, + "loss": 0.6022, + "step": 19232 + }, + { + "epoch": 1.3894919356295266, + "grad_norm": 7.768146312824648, + "learning_rate": 1.1252275556526668e-06, + "loss": 0.7072, + "step": 19233 + }, + { + "epoch": 1.389564180829736, + "grad_norm": 8.080501361815415, + "learning_rate": 1.1249832598754068e-06, + "loss": 0.6944, + "step": 19234 + }, + { + "epoch": 1.3896364260299456, + "grad_norm": 8.123698438418266, + "learning_rate": 1.124738982921058e-06, + "loss": 0.6478, + "step": 19235 + }, + { + "epoch": 1.3897086712301552, + "grad_norm": 7.3581269779265845, + "learning_rate": 1.1244947247929653e-06, + "loss": 0.611, + "step": 19236 + }, + { + "epoch": 1.3897809164303647, + "grad_norm": 7.047045788101537, + "learning_rate": 1.1242504854944714e-06, + "loss": 0.6554, + "step": 19237 + }, + { + "epoch": 1.3898531616305743, + "grad_norm": 7.035280420371438, + "learning_rate": 1.1240062650289202e-06, + "loss": 0.5835, + "step": 19238 + }, + { + "epoch": 1.3899254068307836, + "grad_norm": 7.920516219221767, + "learning_rate": 1.1237620633996548e-06, + "loss": 0.6192, + "step": 19239 + }, + { + "epoch": 1.3899976520309931, + "grad_norm": 6.605906484978519, + "learning_rate": 1.1235178806100183e-06, + "loss": 0.5591, + "step": 19240 + }, + { + "epoch": 1.3900698972312027, + "grad_norm": 7.802681818893715, + "learning_rate": 1.123273716663354e-06, + "loss": 0.5672, + "step": 19241 + }, + { + "epoch": 1.3901421424314122, + "grad_norm": 5.8678662431966595, + "learning_rate": 1.1230295715630028e-06, + "loss": 0.5151, + "step": 19242 + }, + { + "epoch": 1.3902143876316218, + "grad_norm": 6.942979452949722, + "learning_rate": 1.1227854453123075e-06, + "loss": 0.6139, + "step": 19243 + }, + { + "epoch": 1.3902866328318313, + "grad_norm": 7.713417836976239, + "learning_rate": 1.1225413379146111e-06, + "loss": 0.6288, + "step": 19244 + }, + { + "epoch": 1.3903588780320408, + "grad_norm": 8.521737968593678, + "learning_rate": 1.1222972493732526e-06, + "loss": 0.6313, + "step": 19245 + }, + { + "epoch": 1.3904311232322502, + "grad_norm": 6.625137615573957, + "learning_rate": 1.1220531796915756e-06, + "loss": 0.6917, + "step": 19246 + }, + { + "epoch": 1.3905033684324597, + "grad_norm": 7.9478999213112695, + "learning_rate": 1.1218091288729207e-06, + "loss": 0.614, + "step": 19247 + }, + { + "epoch": 1.3905756136326692, + "grad_norm": 7.941541949237197, + "learning_rate": 1.12156509692063e-06, + "loss": 0.5952, + "step": 19248 + }, + { + "epoch": 1.3906478588328788, + "grad_norm": 7.004495267253352, + "learning_rate": 1.1213210838380418e-06, + "loss": 0.6931, + "step": 19249 + }, + { + "epoch": 1.3907201040330883, + "grad_norm": 8.017095895592186, + "learning_rate": 1.1210770896284973e-06, + "loss": 0.5865, + "step": 19250 + }, + { + "epoch": 1.3907923492332979, + "grad_norm": 5.718247355066122, + "learning_rate": 1.1208331142953377e-06, + "loss": 0.546, + "step": 19251 + }, + { + "epoch": 1.3908645944335074, + "grad_norm": 6.085133569234115, + "learning_rate": 1.1205891578419013e-06, + "loss": 0.6759, + "step": 19252 + }, + { + "epoch": 1.3909368396337167, + "grad_norm": 7.034027771618089, + "learning_rate": 1.1203452202715279e-06, + "loss": 0.6299, + "step": 19253 + }, + { + "epoch": 1.3910090848339263, + "grad_norm": 8.042538083418828, + "learning_rate": 1.1201013015875574e-06, + "loss": 0.6222, + "step": 19254 + }, + { + "epoch": 1.3910813300341358, + "grad_norm": 7.718746540516684, + "learning_rate": 1.1198574017933285e-06, + "loss": 0.6471, + "step": 19255 + }, + { + "epoch": 1.3911535752343454, + "grad_norm": 7.857612427390526, + "learning_rate": 1.1196135208921804e-06, + "loss": 0.7007, + "step": 19256 + }, + { + "epoch": 1.391225820434555, + "grad_norm": 7.921532136317996, + "learning_rate": 1.1193696588874513e-06, + "loss": 0.6703, + "step": 19257 + }, + { + "epoch": 1.3912980656347644, + "grad_norm": 7.98356728844756, + "learning_rate": 1.1191258157824805e-06, + "loss": 0.6119, + "step": 19258 + }, + { + "epoch": 1.391370310834974, + "grad_norm": 9.463289505589563, + "learning_rate": 1.1188819915806042e-06, + "loss": 0.5806, + "step": 19259 + }, + { + "epoch": 1.3914425560351833, + "grad_norm": 6.37122756939091, + "learning_rate": 1.118638186285161e-06, + "loss": 0.6204, + "step": 19260 + }, + { + "epoch": 1.391514801235393, + "grad_norm": 7.65685833440992, + "learning_rate": 1.1183943998994883e-06, + "loss": 0.6435, + "step": 19261 + }, + { + "epoch": 1.3915870464356024, + "grad_norm": 8.96600448640281, + "learning_rate": 1.1181506324269247e-06, + "loss": 0.6293, + "step": 19262 + }, + { + "epoch": 1.391659291635812, + "grad_norm": 6.219576526712318, + "learning_rate": 1.1179068838708051e-06, + "loss": 0.6361, + "step": 19263 + }, + { + "epoch": 1.3917315368360215, + "grad_norm": 6.908460444564428, + "learning_rate": 1.117663154234467e-06, + "loss": 0.6816, + "step": 19264 + }, + { + "epoch": 1.391803782036231, + "grad_norm": 7.37457610383008, + "learning_rate": 1.1174194435212471e-06, + "loss": 0.6451, + "step": 19265 + }, + { + "epoch": 1.3918760272364405, + "grad_norm": 7.15465091346905, + "learning_rate": 1.1171757517344816e-06, + "loss": 0.5579, + "step": 19266 + }, + { + "epoch": 1.3919482724366499, + "grad_norm": 7.83159190631824, + "learning_rate": 1.1169320788775062e-06, + "loss": 0.6064, + "step": 19267 + }, + { + "epoch": 1.3920205176368596, + "grad_norm": 6.72897455120334, + "learning_rate": 1.116688424953657e-06, + "loss": 0.564, + "step": 19268 + }, + { + "epoch": 1.392092762837069, + "grad_norm": 5.824871719977979, + "learning_rate": 1.1164447899662697e-06, + "loss": 0.604, + "step": 19269 + }, + { + "epoch": 1.3921650080372785, + "grad_norm": 7.725115788156815, + "learning_rate": 1.1162011739186785e-06, + "loss": 0.6672, + "step": 19270 + }, + { + "epoch": 1.392237253237488, + "grad_norm": 6.539348345380185, + "learning_rate": 1.1159575768142183e-06, + "loss": 0.6369, + "step": 19271 + }, + { + "epoch": 1.3923094984376976, + "grad_norm": 6.404197103427625, + "learning_rate": 1.1157139986562245e-06, + "loss": 0.5192, + "step": 19272 + }, + { + "epoch": 1.3923817436379071, + "grad_norm": 6.951708840158644, + "learning_rate": 1.1154704394480313e-06, + "loss": 0.62, + "step": 19273 + }, + { + "epoch": 1.3924539888381164, + "grad_norm": 7.730950847934786, + "learning_rate": 1.1152268991929727e-06, + "loss": 0.6432, + "step": 19274 + }, + { + "epoch": 1.3925262340383262, + "grad_norm": 7.519702463554051, + "learning_rate": 1.1149833778943826e-06, + "loss": 0.6733, + "step": 19275 + }, + { + "epoch": 1.3925984792385355, + "grad_norm": 8.573768051224118, + "learning_rate": 1.1147398755555954e-06, + "loss": 0.6522, + "step": 19276 + }, + { + "epoch": 1.392670724438745, + "grad_norm": 6.981178320206797, + "learning_rate": 1.1144963921799432e-06, + "loss": 0.6048, + "step": 19277 + }, + { + "epoch": 1.3927429696389546, + "grad_norm": 7.011551725900656, + "learning_rate": 1.1142529277707592e-06, + "loss": 0.5737, + "step": 19278 + }, + { + "epoch": 1.3928152148391642, + "grad_norm": 6.489596036732233, + "learning_rate": 1.1140094823313768e-06, + "loss": 0.7077, + "step": 19279 + }, + { + "epoch": 1.3928874600393737, + "grad_norm": 7.835903422470766, + "learning_rate": 1.1137660558651294e-06, + "loss": 0.6077, + "step": 19280 + }, + { + "epoch": 1.392959705239583, + "grad_norm": 7.022284866139435, + "learning_rate": 1.1135226483753474e-06, + "loss": 0.6029, + "step": 19281 + }, + { + "epoch": 1.3930319504397928, + "grad_norm": 6.407905774017858, + "learning_rate": 1.113279259865364e-06, + "loss": 0.5583, + "step": 19282 + }, + { + "epoch": 1.393104195640002, + "grad_norm": 7.786646430934235, + "learning_rate": 1.1130358903385107e-06, + "loss": 0.629, + "step": 19283 + }, + { + "epoch": 1.3931764408402116, + "grad_norm": 6.062397749520289, + "learning_rate": 1.1127925397981192e-06, + "loss": 0.6201, + "step": 19284 + }, + { + "epoch": 1.3932486860404212, + "grad_norm": 5.7514447594747065, + "learning_rate": 1.1125492082475205e-06, + "loss": 0.5622, + "step": 19285 + }, + { + "epoch": 1.3933209312406307, + "grad_norm": 6.065038719714068, + "learning_rate": 1.1123058956900462e-06, + "loss": 0.6288, + "step": 19286 + }, + { + "epoch": 1.3933931764408403, + "grad_norm": 6.499080299455418, + "learning_rate": 1.1120626021290272e-06, + "loss": 0.5992, + "step": 19287 + }, + { + "epoch": 1.3934654216410496, + "grad_norm": 5.991778462958216, + "learning_rate": 1.111819327567793e-06, + "loss": 0.6049, + "step": 19288 + }, + { + "epoch": 1.3935376668412593, + "grad_norm": 6.677848339642465, + "learning_rate": 1.1115760720096743e-06, + "loss": 0.5969, + "step": 19289 + }, + { + "epoch": 1.3936099120414687, + "grad_norm": 6.422040134357982, + "learning_rate": 1.111332835458002e-06, + "loss": 0.622, + "step": 19290 + }, + { + "epoch": 1.3936821572416782, + "grad_norm": 7.358959534533955, + "learning_rate": 1.111089617916104e-06, + "loss": 0.6117, + "step": 19291 + }, + { + "epoch": 1.3937544024418878, + "grad_norm": 7.609634676973146, + "learning_rate": 1.1108464193873106e-06, + "loss": 0.5979, + "step": 19292 + }, + { + "epoch": 1.3938266476420973, + "grad_norm": 7.026595138870993, + "learning_rate": 1.1106032398749503e-06, + "loss": 0.6159, + "step": 19293 + }, + { + "epoch": 1.3938988928423068, + "grad_norm": 6.732234322525374, + "learning_rate": 1.1103600793823546e-06, + "loss": 0.5623, + "step": 19294 + }, + { + "epoch": 1.3939711380425164, + "grad_norm": 7.526088607976086, + "learning_rate": 1.1101169379128496e-06, + "loss": 0.6566, + "step": 19295 + }, + { + "epoch": 1.394043383242726, + "grad_norm": 8.351913287768632, + "learning_rate": 1.1098738154697646e-06, + "loss": 0.6363, + "step": 19296 + }, + { + "epoch": 1.3941156284429352, + "grad_norm": 6.680317381668645, + "learning_rate": 1.1096307120564276e-06, + "loss": 0.6824, + "step": 19297 + }, + { + "epoch": 1.3941878736431448, + "grad_norm": 6.233077676735527, + "learning_rate": 1.1093876276761676e-06, + "loss": 0.615, + "step": 19298 + }, + { + "epoch": 1.3942601188433543, + "grad_norm": 6.218332861323745, + "learning_rate": 1.1091445623323103e-06, + "loss": 0.6119, + "step": 19299 + }, + { + "epoch": 1.3943323640435639, + "grad_norm": 5.335667437174291, + "learning_rate": 1.1089015160281838e-06, + "loss": 0.6224, + "step": 19300 + }, + { + "epoch": 1.3944046092437734, + "grad_norm": 6.906463481658781, + "learning_rate": 1.1086584887671157e-06, + "loss": 0.6217, + "step": 19301 + }, + { + "epoch": 1.394476854443983, + "grad_norm": 7.157877774410404, + "learning_rate": 1.1084154805524324e-06, + "loss": 0.6839, + "step": 19302 + }, + { + "epoch": 1.3945490996441925, + "grad_norm": 6.598212445102319, + "learning_rate": 1.108172491387461e-06, + "loss": 0.5969, + "step": 19303 + }, + { + "epoch": 1.3946213448444018, + "grad_norm": 6.739426419899417, + "learning_rate": 1.107929521275527e-06, + "loss": 0.5587, + "step": 19304 + }, + { + "epoch": 1.3946935900446114, + "grad_norm": 8.297455675742894, + "learning_rate": 1.1076865702199581e-06, + "loss": 0.6943, + "step": 19305 + }, + { + "epoch": 1.394765835244821, + "grad_norm": 6.885926043607159, + "learning_rate": 1.1074436382240782e-06, + "loss": 0.7106, + "step": 19306 + }, + { + "epoch": 1.3948380804450304, + "grad_norm": 7.878732674634266, + "learning_rate": 1.1072007252912137e-06, + "loss": 0.6151, + "step": 19307 + }, + { + "epoch": 1.39491032564524, + "grad_norm": 8.106651833838562, + "learning_rate": 1.1069578314246906e-06, + "loss": 0.6415, + "step": 19308 + }, + { + "epoch": 1.3949825708454495, + "grad_norm": 7.2032399199414625, + "learning_rate": 1.1067149566278324e-06, + "loss": 0.5644, + "step": 19309 + }, + { + "epoch": 1.395054816045659, + "grad_norm": 6.971783307138099, + "learning_rate": 1.1064721009039645e-06, + "loss": 0.6248, + "step": 19310 + }, + { + "epoch": 1.3951270612458684, + "grad_norm": 6.761399427396437, + "learning_rate": 1.1062292642564114e-06, + "loss": 0.5968, + "step": 19311 + }, + { + "epoch": 1.395199306446078, + "grad_norm": 7.585153745477365, + "learning_rate": 1.1059864466884976e-06, + "loss": 0.6913, + "step": 19312 + }, + { + "epoch": 1.3952715516462875, + "grad_norm": 7.17624027816999, + "learning_rate": 1.105743648203547e-06, + "loss": 0.6167, + "step": 19313 + }, + { + "epoch": 1.395343796846497, + "grad_norm": 7.0186981063378235, + "learning_rate": 1.1055008688048831e-06, + "loss": 0.6585, + "step": 19314 + }, + { + "epoch": 1.3954160420467066, + "grad_norm": 6.717403117177938, + "learning_rate": 1.1052581084958308e-06, + "loss": 0.5806, + "step": 19315 + }, + { + "epoch": 1.395488287246916, + "grad_norm": 6.101101710153398, + "learning_rate": 1.105015367279711e-06, + "loss": 0.6787, + "step": 19316 + }, + { + "epoch": 1.3955605324471256, + "grad_norm": 7.417767675164466, + "learning_rate": 1.1047726451598476e-06, + "loss": 0.7041, + "step": 19317 + }, + { + "epoch": 1.395632777647335, + "grad_norm": 7.352188521515767, + "learning_rate": 1.1045299421395637e-06, + "loss": 0.6312, + "step": 19318 + }, + { + "epoch": 1.3957050228475445, + "grad_norm": 7.888243665262793, + "learning_rate": 1.1042872582221817e-06, + "loss": 0.6807, + "step": 19319 + }, + { + "epoch": 1.395777268047754, + "grad_norm": 7.278246608060498, + "learning_rate": 1.104044593411023e-06, + "loss": 0.6023, + "step": 19320 + }, + { + "epoch": 1.3958495132479636, + "grad_norm": 7.045156332735672, + "learning_rate": 1.1038019477094097e-06, + "loss": 0.6401, + "step": 19321 + }, + { + "epoch": 1.3959217584481731, + "grad_norm": 8.23948878359811, + "learning_rate": 1.1035593211206626e-06, + "loss": 0.6243, + "step": 19322 + }, + { + "epoch": 1.3959940036483827, + "grad_norm": 6.431535049809169, + "learning_rate": 1.1033167136481063e-06, + "loss": 0.6333, + "step": 19323 + }, + { + "epoch": 1.3960662488485922, + "grad_norm": 7.956449462116434, + "learning_rate": 1.1030741252950588e-06, + "loss": 0.6167, + "step": 19324 + }, + { + "epoch": 1.3961384940488015, + "grad_norm": 6.689796980213571, + "learning_rate": 1.1028315560648418e-06, + "loss": 0.583, + "step": 19325 + }, + { + "epoch": 1.396210739249011, + "grad_norm": 7.212954882684935, + "learning_rate": 1.1025890059607766e-06, + "loss": 0.6664, + "step": 19326 + }, + { + "epoch": 1.3962829844492206, + "grad_norm": 6.432666334187728, + "learning_rate": 1.1023464749861823e-06, + "loss": 0.5726, + "step": 19327 + }, + { + "epoch": 1.3963552296494302, + "grad_norm": 6.790222420062248, + "learning_rate": 1.1021039631443794e-06, + "loss": 0.6416, + "step": 19328 + }, + { + "epoch": 1.3964274748496397, + "grad_norm": 7.747739154464045, + "learning_rate": 1.1018614704386879e-06, + "loss": 0.6057, + "step": 19329 + }, + { + "epoch": 1.3964997200498492, + "grad_norm": 6.927368464134893, + "learning_rate": 1.1016189968724275e-06, + "loss": 0.6727, + "step": 19330 + }, + { + "epoch": 1.3965719652500588, + "grad_norm": 6.817476187518576, + "learning_rate": 1.101376542448917e-06, + "loss": 0.6356, + "step": 19331 + }, + { + "epoch": 1.396644210450268, + "grad_norm": 9.035562825239369, + "learning_rate": 1.1011341071714758e-06, + "loss": 0.6315, + "step": 19332 + }, + { + "epoch": 1.3967164556504776, + "grad_norm": 7.814942001156842, + "learning_rate": 1.1008916910434233e-06, + "loss": 0.6496, + "step": 19333 + }, + { + "epoch": 1.3967887008506872, + "grad_norm": 6.58064334170843, + "learning_rate": 1.1006492940680762e-06, + "loss": 0.6232, + "step": 19334 + }, + { + "epoch": 1.3968609460508967, + "grad_norm": 6.831599907320479, + "learning_rate": 1.100406916248754e-06, + "loss": 0.6361, + "step": 19335 + }, + { + "epoch": 1.3969331912511063, + "grad_norm": 6.01979899074014, + "learning_rate": 1.1001645575887742e-06, + "loss": 0.6125, + "step": 19336 + }, + { + "epoch": 1.3970054364513158, + "grad_norm": 7.3286145662861575, + "learning_rate": 1.099922218091456e-06, + "loss": 0.6717, + "step": 19337 + }, + { + "epoch": 1.3970776816515253, + "grad_norm": 6.282831249194855, + "learning_rate": 1.099679897760114e-06, + "loss": 0.5638, + "step": 19338 + }, + { + "epoch": 1.3971499268517347, + "grad_norm": 7.22786543363991, + "learning_rate": 1.0994375965980675e-06, + "loss": 0.7035, + "step": 19339 + }, + { + "epoch": 1.3972221720519444, + "grad_norm": 6.982296491444172, + "learning_rate": 1.0991953146086325e-06, + "loss": 0.6359, + "step": 19340 + }, + { + "epoch": 1.3972944172521538, + "grad_norm": 6.831548535244721, + "learning_rate": 1.0989530517951264e-06, + "loss": 0.6406, + "step": 19341 + }, + { + "epoch": 1.3973666624523633, + "grad_norm": 6.956086700250111, + "learning_rate": 1.0987108081608647e-06, + "loss": 0.7585, + "step": 19342 + }, + { + "epoch": 1.3974389076525728, + "grad_norm": 6.705364004519477, + "learning_rate": 1.0984685837091642e-06, + "loss": 0.6724, + "step": 19343 + }, + { + "epoch": 1.3975111528527824, + "grad_norm": 7.54807741052083, + "learning_rate": 1.0982263784433414e-06, + "loss": 0.5744, + "step": 19344 + }, + { + "epoch": 1.397583398052992, + "grad_norm": 7.520887915359489, + "learning_rate": 1.0979841923667104e-06, + "loss": 0.662, + "step": 19345 + }, + { + "epoch": 1.3976556432532012, + "grad_norm": 7.805131290560757, + "learning_rate": 1.0977420254825868e-06, + "loss": 0.6904, + "step": 19346 + }, + { + "epoch": 1.397727888453411, + "grad_norm": 6.855831264898785, + "learning_rate": 1.0974998777942863e-06, + "loss": 0.6153, + "step": 19347 + }, + { + "epoch": 1.3978001336536203, + "grad_norm": 7.0667096310635, + "learning_rate": 1.0972577493051246e-06, + "loss": 0.6347, + "step": 19348 + }, + { + "epoch": 1.3978723788538299, + "grad_norm": 7.577608763424858, + "learning_rate": 1.0970156400184139e-06, + "loss": 0.638, + "step": 19349 + }, + { + "epoch": 1.3979446240540394, + "grad_norm": 7.808364139633178, + "learning_rate": 1.0967735499374693e-06, + "loss": 0.6127, + "step": 19350 + }, + { + "epoch": 1.398016869254249, + "grad_norm": 7.005658995775336, + "learning_rate": 1.0965314790656067e-06, + "loss": 0.6116, + "step": 19351 + }, + { + "epoch": 1.3980891144544585, + "grad_norm": 7.1543717897217505, + "learning_rate": 1.0962894274061375e-06, + "loss": 0.6293, + "step": 19352 + }, + { + "epoch": 1.3981613596546678, + "grad_norm": 7.078233595935009, + "learning_rate": 1.0960473949623761e-06, + "loss": 0.6285, + "step": 19353 + }, + { + "epoch": 1.3982336048548776, + "grad_norm": 7.034250661800934, + "learning_rate": 1.0958053817376362e-06, + "loss": 0.5223, + "step": 19354 + }, + { + "epoch": 1.398305850055087, + "grad_norm": 6.078101150740485, + "learning_rate": 1.095563387735231e-06, + "loss": 0.6628, + "step": 19355 + }, + { + "epoch": 1.3983780952552964, + "grad_norm": 6.141240601100519, + "learning_rate": 1.0953214129584716e-06, + "loss": 0.5903, + "step": 19356 + }, + { + "epoch": 1.398450340455506, + "grad_norm": 6.385317363454321, + "learning_rate": 1.0950794574106712e-06, + "loss": 0.6191, + "step": 19357 + }, + { + "epoch": 1.3985225856557155, + "grad_norm": 6.6097008013856335, + "learning_rate": 1.0948375210951425e-06, + "loss": 0.6637, + "step": 19358 + }, + { + "epoch": 1.398594830855925, + "grad_norm": 6.245854337969531, + "learning_rate": 1.094595604015197e-06, + "loss": 0.6533, + "step": 19359 + }, + { + "epoch": 1.3986670760561344, + "grad_norm": 6.017819645834771, + "learning_rate": 1.0943537061741464e-06, + "loss": 0.6467, + "step": 19360 + }, + { + "epoch": 1.3987393212563441, + "grad_norm": 6.9632604214564235, + "learning_rate": 1.094111827575302e-06, + "loss": 0.6837, + "step": 19361 + }, + { + "epoch": 1.3988115664565535, + "grad_norm": 8.125219254103, + "learning_rate": 1.0938699682219763e-06, + "loss": 0.6502, + "step": 19362 + }, + { + "epoch": 1.398883811656763, + "grad_norm": 6.897168294376833, + "learning_rate": 1.093628128117478e-06, + "loss": 0.5889, + "step": 19363 + }, + { + "epoch": 1.3989560568569726, + "grad_norm": 7.642200518160043, + "learning_rate": 1.0933863072651185e-06, + "loss": 0.5859, + "step": 19364 + }, + { + "epoch": 1.399028302057182, + "grad_norm": 6.258661600672965, + "learning_rate": 1.0931445056682092e-06, + "loss": 0.5787, + "step": 19365 + }, + { + "epoch": 1.3991005472573916, + "grad_norm": 8.32705767521561, + "learning_rate": 1.0929027233300584e-06, + "loss": 0.6241, + "step": 19366 + }, + { + "epoch": 1.3991727924576012, + "grad_norm": 8.130250613395695, + "learning_rate": 1.0926609602539767e-06, + "loss": 0.6204, + "step": 19367 + }, + { + "epoch": 1.3992450376578107, + "grad_norm": 7.629816003983233, + "learning_rate": 1.0924192164432739e-06, + "loss": 0.679, + "step": 19368 + }, + { + "epoch": 1.39931728285802, + "grad_norm": 6.9906027249973395, + "learning_rate": 1.092177491901259e-06, + "loss": 0.6427, + "step": 19369 + }, + { + "epoch": 1.3993895280582296, + "grad_norm": 7.25949684858592, + "learning_rate": 1.091935786631241e-06, + "loss": 0.7182, + "step": 19370 + }, + { + "epoch": 1.3994617732584391, + "grad_norm": 7.594209884720948, + "learning_rate": 1.091694100636529e-06, + "loss": 0.6228, + "step": 19371 + }, + { + "epoch": 1.3995340184586487, + "grad_norm": 7.702593121533711, + "learning_rate": 1.0914524339204321e-06, + "loss": 0.6619, + "step": 19372 + }, + { + "epoch": 1.3996062636588582, + "grad_norm": 7.338635810661504, + "learning_rate": 1.091210786486257e-06, + "loss": 0.6355, + "step": 19373 + }, + { + "epoch": 1.3996785088590677, + "grad_norm": 6.4104740266296805, + "learning_rate": 1.0909691583373122e-06, + "loss": 0.604, + "step": 19374 + }, + { + "epoch": 1.3997507540592773, + "grad_norm": 7.190431212757036, + "learning_rate": 1.0907275494769055e-06, + "loss": 0.6201, + "step": 19375 + }, + { + "epoch": 1.3998229992594866, + "grad_norm": 5.661639433218734, + "learning_rate": 1.0904859599083454e-06, + "loss": 0.5581, + "step": 19376 + }, + { + "epoch": 1.3998952444596962, + "grad_norm": 7.856312695528474, + "learning_rate": 1.090244389634937e-06, + "loss": 0.6421, + "step": 19377 + }, + { + "epoch": 1.3999674896599057, + "grad_norm": 7.585842960919662, + "learning_rate": 1.0900028386599878e-06, + "loss": 0.6316, + "step": 19378 + }, + { + "epoch": 1.4000397348601152, + "grad_norm": 7.783403336116906, + "learning_rate": 1.0897613069868056e-06, + "loss": 0.6213, + "step": 19379 + }, + { + "epoch": 1.4001119800603248, + "grad_norm": 7.490908643868981, + "learning_rate": 1.0895197946186971e-06, + "loss": 0.5918, + "step": 19380 + }, + { + "epoch": 1.4001842252605343, + "grad_norm": 7.0024973637862935, + "learning_rate": 1.0892783015589664e-06, + "loss": 0.6237, + "step": 19381 + }, + { + "epoch": 1.4002564704607439, + "grad_norm": 7.078742330884812, + "learning_rate": 1.0890368278109207e-06, + "loss": 0.6217, + "step": 19382 + }, + { + "epoch": 1.4003287156609532, + "grad_norm": 5.998526074251621, + "learning_rate": 1.0887953733778657e-06, + "loss": 0.5441, + "step": 19383 + }, + { + "epoch": 1.4004009608611627, + "grad_norm": 5.934266826332743, + "learning_rate": 1.088553938263106e-06, + "loss": 0.6159, + "step": 19384 + }, + { + "epoch": 1.4004732060613723, + "grad_norm": 7.044056533872551, + "learning_rate": 1.0883125224699467e-06, + "loss": 0.6564, + "step": 19385 + }, + { + "epoch": 1.4005454512615818, + "grad_norm": 5.874976949443644, + "learning_rate": 1.0880711260016927e-06, + "loss": 0.6159, + "step": 19386 + }, + { + "epoch": 1.4006176964617914, + "grad_norm": 6.565853751481208, + "learning_rate": 1.087829748861649e-06, + "loss": 0.6149, + "step": 19387 + }, + { + "epoch": 1.400689941662001, + "grad_norm": 7.437753624959333, + "learning_rate": 1.0875883910531192e-06, + "loss": 0.6226, + "step": 19388 + }, + { + "epoch": 1.4007621868622104, + "grad_norm": 7.474707872685221, + "learning_rate": 1.087347052579408e-06, + "loss": 0.6398, + "step": 19389 + }, + { + "epoch": 1.4008344320624198, + "grad_norm": 8.460728749300399, + "learning_rate": 1.0871057334438195e-06, + "loss": 0.6604, + "step": 19390 + }, + { + "epoch": 1.4009066772626293, + "grad_norm": 7.249351472458777, + "learning_rate": 1.0868644336496556e-06, + "loss": 0.6931, + "step": 19391 + }, + { + "epoch": 1.4009789224628388, + "grad_norm": 7.342289686090617, + "learning_rate": 1.0866231532002205e-06, + "loss": 0.669, + "step": 19392 + }, + { + "epoch": 1.4010511676630484, + "grad_norm": 7.783925527171289, + "learning_rate": 1.0863818920988168e-06, + "loss": 0.6185, + "step": 19393 + }, + { + "epoch": 1.401123412863258, + "grad_norm": 7.597534743565672, + "learning_rate": 1.0861406503487484e-06, + "loss": 0.5492, + "step": 19394 + }, + { + "epoch": 1.4011956580634675, + "grad_norm": 5.918836956734841, + "learning_rate": 1.0858994279533159e-06, + "loss": 0.6106, + "step": 19395 + }, + { + "epoch": 1.401267903263677, + "grad_norm": 6.559357680997116, + "learning_rate": 1.085658224915822e-06, + "loss": 0.6187, + "step": 19396 + }, + { + "epoch": 1.4013401484638863, + "grad_norm": 7.393596930705005, + "learning_rate": 1.085417041239569e-06, + "loss": 0.6508, + "step": 19397 + }, + { + "epoch": 1.4014123936640959, + "grad_norm": 6.4479871344426165, + "learning_rate": 1.0851758769278583e-06, + "loss": 0.5887, + "step": 19398 + }, + { + "epoch": 1.4014846388643054, + "grad_norm": 5.792697110434173, + "learning_rate": 1.0849347319839915e-06, + "loss": 0.5963, + "step": 19399 + }, + { + "epoch": 1.401556884064515, + "grad_norm": 6.458728934036431, + "learning_rate": 1.0846936064112693e-06, + "loss": 0.6318, + "step": 19400 + }, + { + "epoch": 1.4016291292647245, + "grad_norm": 9.073243816680646, + "learning_rate": 1.0844525002129939e-06, + "loss": 0.6774, + "step": 19401 + }, + { + "epoch": 1.401701374464934, + "grad_norm": 7.030006943071173, + "learning_rate": 1.0842114133924633e-06, + "loss": 0.589, + "step": 19402 + }, + { + "epoch": 1.4017736196651436, + "grad_norm": 7.3181211597205476, + "learning_rate": 1.0839703459529794e-06, + "loss": 0.6885, + "step": 19403 + }, + { + "epoch": 1.401845864865353, + "grad_norm": 8.29757428859224, + "learning_rate": 1.083729297897842e-06, + "loss": 0.5888, + "step": 19404 + }, + { + "epoch": 1.4019181100655624, + "grad_norm": 8.081981218126918, + "learning_rate": 1.0834882692303517e-06, + "loss": 0.6784, + "step": 19405 + }, + { + "epoch": 1.401990355265772, + "grad_norm": 7.389670570474486, + "learning_rate": 1.0832472599538064e-06, + "loss": 0.6534, + "step": 19406 + }, + { + "epoch": 1.4020626004659815, + "grad_norm": 6.424151163396298, + "learning_rate": 1.083006270071505e-06, + "loss": 0.5835, + "step": 19407 + }, + { + "epoch": 1.402134845666191, + "grad_norm": 6.239293808713075, + "learning_rate": 1.0827652995867493e-06, + "loss": 0.6617, + "step": 19408 + }, + { + "epoch": 1.4022070908664006, + "grad_norm": 6.616678445775127, + "learning_rate": 1.0825243485028356e-06, + "loss": 0.6308, + "step": 19409 + }, + { + "epoch": 1.4022793360666101, + "grad_norm": 6.511919683275108, + "learning_rate": 1.0822834168230625e-06, + "loss": 0.5862, + "step": 19410 + }, + { + "epoch": 1.4023515812668195, + "grad_norm": 7.478752811921662, + "learning_rate": 1.082042504550729e-06, + "loss": 0.5319, + "step": 19411 + }, + { + "epoch": 1.4024238264670292, + "grad_norm": 6.634128543338681, + "learning_rate": 1.0818016116891337e-06, + "loss": 0.6084, + "step": 19412 + }, + { + "epoch": 1.4024960716672386, + "grad_norm": 6.430108725190689, + "learning_rate": 1.081560738241572e-06, + "loss": 0.6675, + "step": 19413 + }, + { + "epoch": 1.402568316867448, + "grad_norm": 6.850503176286436, + "learning_rate": 1.0813198842113424e-06, + "loss": 0.619, + "step": 19414 + }, + { + "epoch": 1.4026405620676576, + "grad_norm": 6.802712785728733, + "learning_rate": 1.081079049601742e-06, + "loss": 0.6221, + "step": 19415 + }, + { + "epoch": 1.4027128072678672, + "grad_norm": 7.0883905944496615, + "learning_rate": 1.0808382344160682e-06, + "loss": 0.6545, + "step": 19416 + }, + { + "epoch": 1.4027850524680767, + "grad_norm": 6.342517751895178, + "learning_rate": 1.0805974386576166e-06, + "loss": 0.5496, + "step": 19417 + }, + { + "epoch": 1.402857297668286, + "grad_norm": 7.6475810895338725, + "learning_rate": 1.080356662329684e-06, + "loss": 0.642, + "step": 19418 + }, + { + "epoch": 1.4029295428684958, + "grad_norm": 6.644192803053685, + "learning_rate": 1.0801159054355672e-06, + "loss": 0.6915, + "step": 19419 + }, + { + "epoch": 1.4030017880687051, + "grad_norm": 6.791550648952355, + "learning_rate": 1.0798751679785605e-06, + "loss": 0.6618, + "step": 19420 + }, + { + "epoch": 1.4030740332689147, + "grad_norm": 8.05092955725237, + "learning_rate": 1.0796344499619602e-06, + "loss": 0.5985, + "step": 19421 + }, + { + "epoch": 1.4031462784691242, + "grad_norm": 6.869074261745822, + "learning_rate": 1.079393751389062e-06, + "loss": 0.6151, + "step": 19422 + }, + { + "epoch": 1.4032185236693338, + "grad_norm": 8.204267498563675, + "learning_rate": 1.0791530722631596e-06, + "loss": 0.6554, + "step": 19423 + }, + { + "epoch": 1.4032907688695433, + "grad_norm": 7.049683140336904, + "learning_rate": 1.0789124125875485e-06, + "loss": 0.5982, + "step": 19424 + }, + { + "epoch": 1.4033630140697526, + "grad_norm": 7.019938546805644, + "learning_rate": 1.0786717723655231e-06, + "loss": 0.6614, + "step": 19425 + }, + { + "epoch": 1.4034352592699624, + "grad_norm": 6.91003203940796, + "learning_rate": 1.0784311516003773e-06, + "loss": 0.6205, + "step": 19426 + }, + { + "epoch": 1.4035075044701717, + "grad_norm": 6.499884384300796, + "learning_rate": 1.0781905502954054e-06, + "loss": 0.5741, + "step": 19427 + }, + { + "epoch": 1.4035797496703812, + "grad_norm": 7.8043189587035515, + "learning_rate": 1.0779499684539011e-06, + "loss": 0.6629, + "step": 19428 + }, + { + "epoch": 1.4036519948705908, + "grad_norm": 6.332896000000803, + "learning_rate": 1.0777094060791574e-06, + "loss": 0.6679, + "step": 19429 + }, + { + "epoch": 1.4037242400708003, + "grad_norm": 6.8449474663177945, + "learning_rate": 1.0774688631744683e-06, + "loss": 0.6344, + "step": 19430 + }, + { + "epoch": 1.4037964852710099, + "grad_norm": 8.73049758803962, + "learning_rate": 1.0772283397431255e-06, + "loss": 0.6234, + "step": 19431 + }, + { + "epoch": 1.4038687304712192, + "grad_norm": 7.932072984399096, + "learning_rate": 1.0769878357884217e-06, + "loss": 0.6372, + "step": 19432 + }, + { + "epoch": 1.403940975671429, + "grad_norm": 6.270935395720856, + "learning_rate": 1.0767473513136503e-06, + "loss": 0.6236, + "step": 19433 + }, + { + "epoch": 1.4040132208716383, + "grad_norm": 7.619468809020785, + "learning_rate": 1.076506886322102e-06, + "loss": 0.6369, + "step": 19434 + }, + { + "epoch": 1.4040854660718478, + "grad_norm": 5.821363029144449, + "learning_rate": 1.076266440817068e-06, + "loss": 0.616, + "step": 19435 + }, + { + "epoch": 1.4041577112720574, + "grad_norm": 6.039634766304297, + "learning_rate": 1.076026014801842e-06, + "loss": 0.7188, + "step": 19436 + }, + { + "epoch": 1.404229956472267, + "grad_norm": 7.83745745049567, + "learning_rate": 1.0757856082797149e-06, + "loss": 0.6926, + "step": 19437 + }, + { + "epoch": 1.4043022016724764, + "grad_norm": 6.8923130173584, + "learning_rate": 1.0755452212539764e-06, + "loss": 0.6136, + "step": 19438 + }, + { + "epoch": 1.404374446872686, + "grad_norm": 7.752212946798466, + "learning_rate": 1.0753048537279177e-06, + "loss": 0.6536, + "step": 19439 + }, + { + "epoch": 1.4044466920728955, + "grad_norm": 7.501856002039881, + "learning_rate": 1.0750645057048303e-06, + "loss": 0.6292, + "step": 19440 + }, + { + "epoch": 1.4045189372731048, + "grad_norm": 6.642331754472567, + "learning_rate": 1.0748241771880026e-06, + "loss": 0.6351, + "step": 19441 + }, + { + "epoch": 1.4045911824733144, + "grad_norm": 7.991883452030557, + "learning_rate": 1.074583868180725e-06, + "loss": 0.5952, + "step": 19442 + }, + { + "epoch": 1.404663427673524, + "grad_norm": 6.968298653667836, + "learning_rate": 1.0743435786862876e-06, + "loss": 0.6119, + "step": 19443 + }, + { + "epoch": 1.4047356728737335, + "grad_norm": 9.096326954463544, + "learning_rate": 1.0741033087079797e-06, + "loss": 0.747, + "step": 19444 + }, + { + "epoch": 1.404807918073943, + "grad_norm": 6.027704807822306, + "learning_rate": 1.0738630582490906e-06, + "loss": 0.6169, + "step": 19445 + }, + { + "epoch": 1.4048801632741525, + "grad_norm": 5.4559605612350195, + "learning_rate": 1.0736228273129085e-06, + "loss": 0.5828, + "step": 19446 + }, + { + "epoch": 1.404952408474362, + "grad_norm": 7.493417776091295, + "learning_rate": 1.0733826159027236e-06, + "loss": 0.6844, + "step": 19447 + }, + { + "epoch": 1.4050246536745714, + "grad_norm": 7.596578691259638, + "learning_rate": 1.073142424021822e-06, + "loss": 0.5526, + "step": 19448 + }, + { + "epoch": 1.405096898874781, + "grad_norm": 7.2953234703715175, + "learning_rate": 1.0729022516734928e-06, + "loss": 0.6615, + "step": 19449 + }, + { + "epoch": 1.4051691440749905, + "grad_norm": 6.734552863442963, + "learning_rate": 1.0726620988610235e-06, + "loss": 0.6077, + "step": 19450 + }, + { + "epoch": 1.4052413892752, + "grad_norm": 7.348491290684611, + "learning_rate": 1.0724219655877026e-06, + "loss": 0.5831, + "step": 19451 + }, + { + "epoch": 1.4053136344754096, + "grad_norm": 6.956737617066284, + "learning_rate": 1.0721818518568162e-06, + "loss": 0.6331, + "step": 19452 + }, + { + "epoch": 1.4053858796756191, + "grad_norm": 8.591010305479074, + "learning_rate": 1.0719417576716512e-06, + "loss": 0.6197, + "step": 19453 + }, + { + "epoch": 1.4054581248758287, + "grad_norm": 7.3456358740440555, + "learning_rate": 1.0717016830354948e-06, + "loss": 0.5585, + "step": 19454 + }, + { + "epoch": 1.405530370076038, + "grad_norm": 7.466638753475512, + "learning_rate": 1.0714616279516333e-06, + "loss": 0.6799, + "step": 19455 + }, + { + "epoch": 1.4056026152762475, + "grad_norm": 6.151161530759068, + "learning_rate": 1.071221592423353e-06, + "loss": 0.5677, + "step": 19456 + }, + { + "epoch": 1.405674860476457, + "grad_norm": 6.357701186740599, + "learning_rate": 1.0709815764539396e-06, + "loss": 0.6052, + "step": 19457 + }, + { + "epoch": 1.4057471056766666, + "grad_norm": 7.281801129222555, + "learning_rate": 1.07074158004668e-06, + "loss": 0.641, + "step": 19458 + }, + { + "epoch": 1.4058193508768762, + "grad_norm": 8.246564207476187, + "learning_rate": 1.0705016032048574e-06, + "loss": 0.6815, + "step": 19459 + }, + { + "epoch": 1.4058915960770857, + "grad_norm": 7.988358371800702, + "learning_rate": 1.070261645931758e-06, + "loss": 0.6676, + "step": 19460 + }, + { + "epoch": 1.4059638412772952, + "grad_norm": 5.918832445225024, + "learning_rate": 1.0700217082306662e-06, + "loss": 0.5838, + "step": 19461 + }, + { + "epoch": 1.4060360864775046, + "grad_norm": 5.966949670711387, + "learning_rate": 1.0697817901048683e-06, + "loss": 0.6236, + "step": 19462 + }, + { + "epoch": 1.406108331677714, + "grad_norm": 6.815677566710399, + "learning_rate": 1.0695418915576453e-06, + "loss": 0.6071, + "step": 19463 + }, + { + "epoch": 1.4061805768779236, + "grad_norm": 6.468868530956614, + "learning_rate": 1.0693020125922837e-06, + "loss": 0.6421, + "step": 19464 + }, + { + "epoch": 1.4062528220781332, + "grad_norm": 8.01619178597367, + "learning_rate": 1.069062153212068e-06, + "loss": 0.6392, + "step": 19465 + }, + { + "epoch": 1.4063250672783427, + "grad_norm": 6.059761923466284, + "learning_rate": 1.068822313420279e-06, + "loss": 0.5545, + "step": 19466 + }, + { + "epoch": 1.4063973124785523, + "grad_norm": 5.0960180931668395, + "learning_rate": 1.0685824932202016e-06, + "loss": 0.5925, + "step": 19467 + }, + { + "epoch": 1.4064695576787618, + "grad_norm": 7.271653822226601, + "learning_rate": 1.0683426926151184e-06, + "loss": 0.6593, + "step": 19468 + }, + { + "epoch": 1.4065418028789711, + "grad_norm": 7.237447988161826, + "learning_rate": 1.068102911608313e-06, + "loss": 0.6535, + "step": 19469 + }, + { + "epoch": 1.4066140480791807, + "grad_norm": 6.991340185218369, + "learning_rate": 1.0678631502030661e-06, + "loss": 0.579, + "step": 19470 + }, + { + "epoch": 1.4066862932793902, + "grad_norm": 6.873288305648786, + "learning_rate": 1.0676234084026608e-06, + "loss": 0.611, + "step": 19471 + }, + { + "epoch": 1.4067585384795998, + "grad_norm": 7.6389622663576455, + "learning_rate": 1.0673836862103785e-06, + "loss": 0.6085, + "step": 19472 + }, + { + "epoch": 1.4068307836798093, + "grad_norm": 7.787068225530856, + "learning_rate": 1.0671439836295014e-06, + "loss": 0.7337, + "step": 19473 + }, + { + "epoch": 1.4069030288800188, + "grad_norm": 7.117608502830185, + "learning_rate": 1.0669043006633107e-06, + "loss": 0.6106, + "step": 19474 + }, + { + "epoch": 1.4069752740802284, + "grad_norm": 7.350131764171551, + "learning_rate": 1.0666646373150874e-06, + "loss": 0.588, + "step": 19475 + }, + { + "epoch": 1.4070475192804377, + "grad_norm": 7.231699240667487, + "learning_rate": 1.066424993588113e-06, + "loss": 0.5848, + "step": 19476 + }, + { + "epoch": 1.4071197644806472, + "grad_norm": 8.166622187694431, + "learning_rate": 1.0661853694856667e-06, + "loss": 0.6242, + "step": 19477 + }, + { + "epoch": 1.4071920096808568, + "grad_norm": 8.148313439102319, + "learning_rate": 1.0659457650110291e-06, + "loss": 0.6807, + "step": 19478 + }, + { + "epoch": 1.4072642548810663, + "grad_norm": 7.182251182997157, + "learning_rate": 1.0657061801674806e-06, + "loss": 0.6177, + "step": 19479 + }, + { + "epoch": 1.4073365000812759, + "grad_norm": 6.416733217100689, + "learning_rate": 1.065466614958302e-06, + "loss": 0.6385, + "step": 19480 + }, + { + "epoch": 1.4074087452814854, + "grad_norm": 7.2162743488947, + "learning_rate": 1.0652270693867705e-06, + "loss": 0.7143, + "step": 19481 + }, + { + "epoch": 1.407480990481695, + "grad_norm": 6.39657651412646, + "learning_rate": 1.0649875434561662e-06, + "loss": 0.6164, + "step": 19482 + }, + { + "epoch": 1.4075532356819043, + "grad_norm": 6.417327681673627, + "learning_rate": 1.0647480371697685e-06, + "loss": 0.6645, + "step": 19483 + }, + { + "epoch": 1.407625480882114, + "grad_norm": 10.3411020998325, + "learning_rate": 1.064508550530856e-06, + "loss": 0.606, + "step": 19484 + }, + { + "epoch": 1.4076977260823234, + "grad_norm": 6.972119530288397, + "learning_rate": 1.0642690835427064e-06, + "loss": 0.6878, + "step": 19485 + }, + { + "epoch": 1.407769971282533, + "grad_norm": 7.158949163885962, + "learning_rate": 1.0640296362085984e-06, + "loss": 0.7206, + "step": 19486 + }, + { + "epoch": 1.4078422164827424, + "grad_norm": 6.293283957810327, + "learning_rate": 1.0637902085318106e-06, + "loss": 0.6129, + "step": 19487 + }, + { + "epoch": 1.407914461682952, + "grad_norm": 6.321167990522345, + "learning_rate": 1.0635508005156187e-06, + "loss": 0.6217, + "step": 19488 + }, + { + "epoch": 1.4079867068831615, + "grad_norm": 5.4235328975849795, + "learning_rate": 1.0633114121633011e-06, + "loss": 0.582, + "step": 19489 + }, + { + "epoch": 1.4080589520833708, + "grad_norm": 7.40644160155642, + "learning_rate": 1.0630720434781359e-06, + "loss": 0.6025, + "step": 19490 + }, + { + "epoch": 1.4081311972835806, + "grad_norm": 5.931433539972553, + "learning_rate": 1.0628326944633965e-06, + "loss": 0.5702, + "step": 19491 + }, + { + "epoch": 1.40820344248379, + "grad_norm": 6.787528651617848, + "learning_rate": 1.062593365122363e-06, + "loss": 0.6123, + "step": 19492 + }, + { + "epoch": 1.4082756876839995, + "grad_norm": 5.778302075162406, + "learning_rate": 1.0623540554583097e-06, + "loss": 0.6061, + "step": 19493 + }, + { + "epoch": 1.408347932884209, + "grad_norm": 5.956404936337438, + "learning_rate": 1.0621147654745145e-06, + "loss": 0.6221, + "step": 19494 + }, + { + "epoch": 1.4084201780844186, + "grad_norm": 6.32132549679076, + "learning_rate": 1.0618754951742507e-06, + "loss": 0.5584, + "step": 19495 + }, + { + "epoch": 1.408492423284628, + "grad_norm": 8.125459628676158, + "learning_rate": 1.0616362445607948e-06, + "loss": 0.5956, + "step": 19496 + }, + { + "epoch": 1.4085646684848374, + "grad_norm": 6.34175958084969, + "learning_rate": 1.0613970136374223e-06, + "loss": 0.5709, + "step": 19497 + }, + { + "epoch": 1.4086369136850472, + "grad_norm": 7.242317075094886, + "learning_rate": 1.0611578024074073e-06, + "loss": 0.6219, + "step": 19498 + }, + { + "epoch": 1.4087091588852565, + "grad_norm": 7.39959598160048, + "learning_rate": 1.0609186108740248e-06, + "loss": 0.7192, + "step": 19499 + }, + { + "epoch": 1.408781404085466, + "grad_norm": 7.993483989187718, + "learning_rate": 1.0606794390405489e-06, + "loss": 0.6828, + "step": 19500 + }, + { + "epoch": 1.4088536492856756, + "grad_norm": 7.546322504200463, + "learning_rate": 1.060440286910254e-06, + "loss": 0.5926, + "step": 19501 + }, + { + "epoch": 1.4089258944858851, + "grad_norm": 9.141370328854302, + "learning_rate": 1.0602011544864137e-06, + "loss": 0.647, + "step": 19502 + }, + { + "epoch": 1.4089981396860947, + "grad_norm": 8.05536850752293, + "learning_rate": 1.0599620417723018e-06, + "loss": 0.6215, + "step": 19503 + }, + { + "epoch": 1.409070384886304, + "grad_norm": 7.600566802723087, + "learning_rate": 1.0597229487711924e-06, + "loss": 0.6867, + "step": 19504 + }, + { + "epoch": 1.4091426300865137, + "grad_norm": 6.552974217853563, + "learning_rate": 1.0594838754863564e-06, + "loss": 0.662, + "step": 19505 + }, + { + "epoch": 1.409214875286723, + "grad_norm": 7.096720472675048, + "learning_rate": 1.0592448219210679e-06, + "loss": 0.6164, + "step": 19506 + }, + { + "epoch": 1.4092871204869326, + "grad_norm": 8.153750215583317, + "learning_rate": 1.0590057880785991e-06, + "loss": 0.6356, + "step": 19507 + }, + { + "epoch": 1.4093593656871422, + "grad_norm": 5.89947282407935, + "learning_rate": 1.058766773962223e-06, + "loss": 0.6367, + "step": 19508 + }, + { + "epoch": 1.4094316108873517, + "grad_norm": 7.599737624608538, + "learning_rate": 1.0585277795752099e-06, + "loss": 0.6817, + "step": 19509 + }, + { + "epoch": 1.4095038560875612, + "grad_norm": 6.603798680527705, + "learning_rate": 1.0582888049208323e-06, + "loss": 0.6202, + "step": 19510 + }, + { + "epoch": 1.4095761012877706, + "grad_norm": 6.435807385186318, + "learning_rate": 1.0580498500023614e-06, + "loss": 0.6466, + "step": 19511 + }, + { + "epoch": 1.4096483464879803, + "grad_norm": 7.211630476926989, + "learning_rate": 1.0578109148230685e-06, + "loss": 0.6092, + "step": 19512 + }, + { + "epoch": 1.4097205916881896, + "grad_norm": 5.867854866443322, + "learning_rate": 1.0575719993862244e-06, + "loss": 0.5327, + "step": 19513 + }, + { + "epoch": 1.4097928368883992, + "grad_norm": 6.401420745509168, + "learning_rate": 1.0573331036950995e-06, + "loss": 0.6579, + "step": 19514 + }, + { + "epoch": 1.4098650820886087, + "grad_norm": 7.410226004135977, + "learning_rate": 1.0570942277529656e-06, + "loss": 0.5846, + "step": 19515 + }, + { + "epoch": 1.4099373272888183, + "grad_norm": 5.629981823039821, + "learning_rate": 1.05685537156309e-06, + "loss": 0.6288, + "step": 19516 + }, + { + "epoch": 1.4100095724890278, + "grad_norm": 7.100768797178678, + "learning_rate": 1.056616535128744e-06, + "loss": 0.5358, + "step": 19517 + }, + { + "epoch": 1.4100818176892373, + "grad_norm": 8.003611226413625, + "learning_rate": 1.056377718453197e-06, + "loss": 0.632, + "step": 19518 + }, + { + "epoch": 1.410154062889447, + "grad_norm": 6.58975662870315, + "learning_rate": 1.0561389215397192e-06, + "loss": 0.6199, + "step": 19519 + }, + { + "epoch": 1.4102263080896562, + "grad_norm": 7.248154767116675, + "learning_rate": 1.0559001443915761e-06, + "loss": 0.6727, + "step": 19520 + }, + { + "epoch": 1.4102985532898658, + "grad_norm": 6.430817923012246, + "learning_rate": 1.0556613870120403e-06, + "loss": 0.6105, + "step": 19521 + }, + { + "epoch": 1.4103707984900753, + "grad_norm": 7.364877251601166, + "learning_rate": 1.055422649404379e-06, + "loss": 0.6907, + "step": 19522 + }, + { + "epoch": 1.4104430436902848, + "grad_norm": 6.561863168970458, + "learning_rate": 1.0551839315718596e-06, + "loss": 0.6366, + "step": 19523 + }, + { + "epoch": 1.4105152888904944, + "grad_norm": 6.391382746087299, + "learning_rate": 1.0549452335177499e-06, + "loss": 0.5849, + "step": 19524 + }, + { + "epoch": 1.410587534090704, + "grad_norm": 7.495222477375275, + "learning_rate": 1.054706555245318e-06, + "loss": 0.6625, + "step": 19525 + }, + { + "epoch": 1.4106597792909135, + "grad_norm": 6.955348244456215, + "learning_rate": 1.0544678967578324e-06, + "loss": 0.6051, + "step": 19526 + }, + { + "epoch": 1.4107320244911228, + "grad_norm": 7.559538710715015, + "learning_rate": 1.0542292580585578e-06, + "loss": 0.6297, + "step": 19527 + }, + { + "epoch": 1.4108042696913323, + "grad_norm": 6.629709549147284, + "learning_rate": 1.0539906391507621e-06, + "loss": 0.6071, + "step": 19528 + }, + { + "epoch": 1.4108765148915419, + "grad_norm": 6.501551809671126, + "learning_rate": 1.0537520400377119e-06, + "loss": 0.5831, + "step": 19529 + }, + { + "epoch": 1.4109487600917514, + "grad_norm": 7.097754604264878, + "learning_rate": 1.0535134607226733e-06, + "loss": 0.585, + "step": 19530 + }, + { + "epoch": 1.411021005291961, + "grad_norm": 6.680319094774426, + "learning_rate": 1.0532749012089118e-06, + "loss": 0.6433, + "step": 19531 + }, + { + "epoch": 1.4110932504921705, + "grad_norm": 6.726178839505077, + "learning_rate": 1.0530363614996942e-06, + "loss": 0.6056, + "step": 19532 + }, + { + "epoch": 1.41116549569238, + "grad_norm": 6.73371958364412, + "learning_rate": 1.0527978415982863e-06, + "loss": 0.6294, + "step": 19533 + }, + { + "epoch": 1.4112377408925894, + "grad_norm": 7.539533815821248, + "learning_rate": 1.052559341507951e-06, + "loss": 0.5931, + "step": 19534 + }, + { + "epoch": 1.411309986092799, + "grad_norm": 6.799255375202976, + "learning_rate": 1.0523208612319543e-06, + "loss": 0.6312, + "step": 19535 + }, + { + "epoch": 1.4113822312930084, + "grad_norm": 8.00153526833386, + "learning_rate": 1.0520824007735615e-06, + "loss": 0.6165, + "step": 19536 + }, + { + "epoch": 1.411454476493218, + "grad_norm": 6.845279052485478, + "learning_rate": 1.051843960136037e-06, + "loss": 0.5477, + "step": 19537 + }, + { + "epoch": 1.4115267216934275, + "grad_norm": 7.412279981979229, + "learning_rate": 1.0516055393226432e-06, + "loss": 0.6045, + "step": 19538 + }, + { + "epoch": 1.411598966893637, + "grad_norm": 6.776892883958292, + "learning_rate": 1.0513671383366452e-06, + "loss": 0.6154, + "step": 19539 + }, + { + "epoch": 1.4116712120938466, + "grad_norm": 5.723109183760873, + "learning_rate": 1.0511287571813061e-06, + "loss": 0.5618, + "step": 19540 + }, + { + "epoch": 1.411743457294056, + "grad_norm": 7.134786476837857, + "learning_rate": 1.0508903958598896e-06, + "loss": 0.6504, + "step": 19541 + }, + { + "epoch": 1.4118157024942655, + "grad_norm": 6.680689400829927, + "learning_rate": 1.0506520543756581e-06, + "loss": 0.6503, + "step": 19542 + }, + { + "epoch": 1.411887947694475, + "grad_norm": 7.223407250369282, + "learning_rate": 1.050413732731875e-06, + "loss": 0.6726, + "step": 19543 + }, + { + "epoch": 1.4119601928946846, + "grad_norm": 5.9759245089418815, + "learning_rate": 1.0501754309318029e-06, + "loss": 0.5718, + "step": 19544 + }, + { + "epoch": 1.412032438094894, + "grad_norm": 7.21858142474299, + "learning_rate": 1.0499371489787027e-06, + "loss": 0.5944, + "step": 19545 + }, + { + "epoch": 1.4121046832951036, + "grad_norm": 6.630745465700535, + "learning_rate": 1.049698886875837e-06, + "loss": 0.6712, + "step": 19546 + }, + { + "epoch": 1.4121769284953132, + "grad_norm": 8.062800911899156, + "learning_rate": 1.0494606446264686e-06, + "loss": 0.6068, + "step": 19547 + }, + { + "epoch": 1.4122491736955225, + "grad_norm": 6.266412775835026, + "learning_rate": 1.0492224222338557e-06, + "loss": 0.6513, + "step": 19548 + }, + { + "epoch": 1.412321418895732, + "grad_norm": 6.837707081227018, + "learning_rate": 1.0489842197012622e-06, + "loss": 0.679, + "step": 19549 + }, + { + "epoch": 1.4123936640959416, + "grad_norm": 6.7346100732971514, + "learning_rate": 1.0487460370319482e-06, + "loss": 0.6062, + "step": 19550 + }, + { + "epoch": 1.4124659092961511, + "grad_norm": 6.173669327283935, + "learning_rate": 1.048507874229175e-06, + "loss": 0.6519, + "step": 19551 + }, + { + "epoch": 1.4125381544963607, + "grad_norm": 7.324941370602476, + "learning_rate": 1.0482697312962013e-06, + "loss": 0.5835, + "step": 19552 + }, + { + "epoch": 1.4126103996965702, + "grad_norm": 7.766311216065524, + "learning_rate": 1.0480316082362873e-06, + "loss": 0.6762, + "step": 19553 + }, + { + "epoch": 1.4126826448967797, + "grad_norm": 6.804048387753472, + "learning_rate": 1.0477935050526943e-06, + "loss": 0.6802, + "step": 19554 + }, + { + "epoch": 1.412754890096989, + "grad_norm": 8.870071117025711, + "learning_rate": 1.04755542174868e-06, + "loss": 0.6594, + "step": 19555 + }, + { + "epoch": 1.4128271352971986, + "grad_norm": 6.22775031310684, + "learning_rate": 1.047317358327504e-06, + "loss": 0.5889, + "step": 19556 + }, + { + "epoch": 1.4128993804974082, + "grad_norm": 7.452789235099333, + "learning_rate": 1.0470793147924254e-06, + "loss": 0.6126, + "step": 19557 + }, + { + "epoch": 1.4129716256976177, + "grad_norm": 8.113880236374849, + "learning_rate": 1.046841291146703e-06, + "loss": 0.6363, + "step": 19558 + }, + { + "epoch": 1.4130438708978272, + "grad_norm": 6.4239151213134695, + "learning_rate": 1.046603287393595e-06, + "loss": 0.6444, + "step": 19559 + }, + { + "epoch": 1.4131161160980368, + "grad_norm": 7.061957954603517, + "learning_rate": 1.0463653035363593e-06, + "loss": 0.5083, + "step": 19560 + }, + { + "epoch": 1.4131883612982463, + "grad_norm": 6.334671816013957, + "learning_rate": 1.0461273395782541e-06, + "loss": 0.687, + "step": 19561 + }, + { + "epoch": 1.4132606064984556, + "grad_norm": 6.857040529395852, + "learning_rate": 1.0458893955225378e-06, + "loss": 0.6319, + "step": 19562 + }, + { + "epoch": 1.4133328516986654, + "grad_norm": 7.155169143010379, + "learning_rate": 1.0456514713724656e-06, + "loss": 0.5807, + "step": 19563 + }, + { + "epoch": 1.4134050968988747, + "grad_norm": 7.641125746468242, + "learning_rate": 1.0454135671312954e-06, + "loss": 0.6176, + "step": 19564 + }, + { + "epoch": 1.4134773420990843, + "grad_norm": 8.82067491920394, + "learning_rate": 1.0451756828022853e-06, + "loss": 0.6189, + "step": 19565 + }, + { + "epoch": 1.4135495872992938, + "grad_norm": 6.323718385181925, + "learning_rate": 1.0449378183886893e-06, + "loss": 0.6781, + "step": 19566 + }, + { + "epoch": 1.4136218324995034, + "grad_norm": 6.16860852296684, + "learning_rate": 1.0446999738937647e-06, + "loss": 0.5769, + "step": 19567 + }, + { + "epoch": 1.413694077699713, + "grad_norm": 7.64935490499993, + "learning_rate": 1.0444621493207669e-06, + "loss": 0.596, + "step": 19568 + }, + { + "epoch": 1.4137663228999222, + "grad_norm": 7.994532385648678, + "learning_rate": 1.0442243446729536e-06, + "loss": 0.6582, + "step": 19569 + }, + { + "epoch": 1.413838568100132, + "grad_norm": 7.55285685469283, + "learning_rate": 1.0439865599535782e-06, + "loss": 0.6747, + "step": 19570 + }, + { + "epoch": 1.4139108133003413, + "grad_norm": 6.511417045537628, + "learning_rate": 1.043748795165896e-06, + "loss": 0.6268, + "step": 19571 + }, + { + "epoch": 1.4139830585005508, + "grad_norm": 7.074670078955924, + "learning_rate": 1.043511050313163e-06, + "loss": 0.6297, + "step": 19572 + }, + { + "epoch": 1.4140553037007604, + "grad_norm": 7.450716473222926, + "learning_rate": 1.0432733253986319e-06, + "loss": 0.699, + "step": 19573 + }, + { + "epoch": 1.41412754890097, + "grad_norm": 7.556366001245268, + "learning_rate": 1.0430356204255579e-06, + "loss": 0.7047, + "step": 19574 + }, + { + "epoch": 1.4141997941011795, + "grad_norm": 7.894677177337589, + "learning_rate": 1.0427979353971952e-06, + "loss": 0.6312, + "step": 19575 + }, + { + "epoch": 1.4142720393013888, + "grad_norm": 7.169399391035055, + "learning_rate": 1.042560270316797e-06, + "loss": 0.6455, + "step": 19576 + }, + { + "epoch": 1.4143442845015985, + "grad_norm": 6.676082669010833, + "learning_rate": 1.042322625187617e-06, + "loss": 0.6649, + "step": 19577 + }, + { + "epoch": 1.4144165297018079, + "grad_norm": 6.2031517508672245, + "learning_rate": 1.0420850000129087e-06, + "loss": 0.6001, + "step": 19578 + }, + { + "epoch": 1.4144887749020174, + "grad_norm": 7.387854542752791, + "learning_rate": 1.0418473947959256e-06, + "loss": 0.6231, + "step": 19579 + }, + { + "epoch": 1.414561020102227, + "grad_norm": 7.312448093849565, + "learning_rate": 1.0416098095399187e-06, + "loss": 0.5901, + "step": 19580 + }, + { + "epoch": 1.4146332653024365, + "grad_norm": 7.426852514702745, + "learning_rate": 1.041372244248141e-06, + "loss": 0.6125, + "step": 19581 + }, + { + "epoch": 1.414705510502646, + "grad_norm": 8.192001386642339, + "learning_rate": 1.041134698923845e-06, + "loss": 0.6413, + "step": 19582 + }, + { + "epoch": 1.4147777557028554, + "grad_norm": 6.438154168825293, + "learning_rate": 1.0408971735702828e-06, + "loss": 0.7118, + "step": 19583 + }, + { + "epoch": 1.4148500009030651, + "grad_norm": 6.273256900225788, + "learning_rate": 1.0406596681907047e-06, + "loss": 0.5682, + "step": 19584 + }, + { + "epoch": 1.4149222461032744, + "grad_norm": 8.031256174760516, + "learning_rate": 1.0404221827883624e-06, + "loss": 0.6679, + "step": 19585 + }, + { + "epoch": 1.414994491303484, + "grad_norm": 6.276781001144466, + "learning_rate": 1.0401847173665073e-06, + "loss": 0.6892, + "step": 19586 + }, + { + "epoch": 1.4150667365036935, + "grad_norm": 6.600547432593157, + "learning_rate": 1.0399472719283899e-06, + "loss": 0.6303, + "step": 19587 + }, + { + "epoch": 1.415138981703903, + "grad_norm": 7.176669244950335, + "learning_rate": 1.0397098464772606e-06, + "loss": 0.7047, + "step": 19588 + }, + { + "epoch": 1.4152112269041126, + "grad_norm": 7.807311020908368, + "learning_rate": 1.0394724410163698e-06, + "loss": 0.6192, + "step": 19589 + }, + { + "epoch": 1.4152834721043221, + "grad_norm": 6.870244826572834, + "learning_rate": 1.0392350555489681e-06, + "loss": 0.5592, + "step": 19590 + }, + { + "epoch": 1.4153557173045317, + "grad_norm": 6.705151231668887, + "learning_rate": 1.0389976900783033e-06, + "loss": 0.6299, + "step": 19591 + }, + { + "epoch": 1.415427962504741, + "grad_norm": 8.116413450371, + "learning_rate": 1.038760344607626e-06, + "loss": 0.6512, + "step": 19592 + }, + { + "epoch": 1.4155002077049506, + "grad_norm": 7.210853386054648, + "learning_rate": 1.0385230191401846e-06, + "loss": 0.6228, + "step": 19593 + }, + { + "epoch": 1.41557245290516, + "grad_norm": 6.665539487280391, + "learning_rate": 1.0382857136792296e-06, + "loss": 0.6435, + "step": 19594 + }, + { + "epoch": 1.4156446981053696, + "grad_norm": 6.555598817384434, + "learning_rate": 1.038048428228007e-06, + "loss": 0.6377, + "step": 19595 + }, + { + "epoch": 1.4157169433055792, + "grad_norm": 7.184808641830801, + "learning_rate": 1.0378111627897664e-06, + "loss": 0.6656, + "step": 19596 + }, + { + "epoch": 1.4157891885057887, + "grad_norm": 6.5046766036833725, + "learning_rate": 1.0375739173677556e-06, + "loss": 0.6717, + "step": 19597 + }, + { + "epoch": 1.4158614337059983, + "grad_norm": 7.8625544947932635, + "learning_rate": 1.0373366919652224e-06, + "loss": 0.6216, + "step": 19598 + }, + { + "epoch": 1.4159336789062076, + "grad_norm": 6.518703547574633, + "learning_rate": 1.0370994865854146e-06, + "loss": 0.5498, + "step": 19599 + }, + { + "epoch": 1.4160059241064171, + "grad_norm": 7.251974954939538, + "learning_rate": 1.0368623012315785e-06, + "loss": 0.7057, + "step": 19600 + }, + { + "epoch": 1.4160781693066267, + "grad_norm": 6.632287965125139, + "learning_rate": 1.0366251359069626e-06, + "loss": 0.6718, + "step": 19601 + }, + { + "epoch": 1.4161504145068362, + "grad_norm": 6.741379778981665, + "learning_rate": 1.036387990614811e-06, + "loss": 0.633, + "step": 19602 + }, + { + "epoch": 1.4162226597070458, + "grad_norm": 5.981174818501183, + "learning_rate": 1.036150865358372e-06, + "loss": 0.5959, + "step": 19603 + }, + { + "epoch": 1.4162949049072553, + "grad_norm": 6.291495853488493, + "learning_rate": 1.0359137601408906e-06, + "loss": 0.5579, + "step": 19604 + }, + { + "epoch": 1.4163671501074648, + "grad_norm": 7.271503523259102, + "learning_rate": 1.0356766749656133e-06, + "loss": 0.6575, + "step": 19605 + }, + { + "epoch": 1.4164393953076742, + "grad_norm": 7.071537952760629, + "learning_rate": 1.0354396098357853e-06, + "loss": 0.7104, + "step": 19606 + }, + { + "epoch": 1.4165116405078837, + "grad_norm": 7.331741449220823, + "learning_rate": 1.0352025647546518e-06, + "loss": 0.6163, + "step": 19607 + }, + { + "epoch": 1.4165838857080932, + "grad_norm": 6.874685106868788, + "learning_rate": 1.0349655397254588e-06, + "loss": 0.5518, + "step": 19608 + }, + { + "epoch": 1.4166561309083028, + "grad_norm": 6.29910519012733, + "learning_rate": 1.034728534751449e-06, + "loss": 0.6645, + "step": 19609 + }, + { + "epoch": 1.4167283761085123, + "grad_norm": 6.900067492168781, + "learning_rate": 1.034491549835868e-06, + "loss": 0.5894, + "step": 19610 + }, + { + "epoch": 1.4168006213087219, + "grad_norm": 6.568387251836473, + "learning_rate": 1.0342545849819596e-06, + "loss": 0.5975, + "step": 19611 + }, + { + "epoch": 1.4168728665089314, + "grad_norm": 8.100276951705784, + "learning_rate": 1.0340176401929691e-06, + "loss": 0.7088, + "step": 19612 + }, + { + "epoch": 1.4169451117091407, + "grad_norm": 6.432704287330655, + "learning_rate": 1.0337807154721376e-06, + "loss": 0.6625, + "step": 19613 + }, + { + "epoch": 1.4170173569093503, + "grad_norm": 7.291039692399334, + "learning_rate": 1.03354381082271e-06, + "loss": 0.5896, + "step": 19614 + }, + { + "epoch": 1.4170896021095598, + "grad_norm": 6.750006499110731, + "learning_rate": 1.0333069262479287e-06, + "loss": 0.6063, + "step": 19615 + }, + { + "epoch": 1.4171618473097694, + "grad_norm": 6.270606897602321, + "learning_rate": 1.033070061751037e-06, + "loss": 0.5786, + "step": 19616 + }, + { + "epoch": 1.417234092509979, + "grad_norm": 7.87643855443295, + "learning_rate": 1.0328332173352768e-06, + "loss": 0.6804, + "step": 19617 + }, + { + "epoch": 1.4173063377101884, + "grad_norm": 6.739596225857871, + "learning_rate": 1.032596393003891e-06, + "loss": 0.6076, + "step": 19618 + }, + { + "epoch": 1.417378582910398, + "grad_norm": 6.8172898558741, + "learning_rate": 1.032359588760122e-06, + "loss": 0.6141, + "step": 19619 + }, + { + "epoch": 1.4174508281106073, + "grad_norm": 7.727746824520503, + "learning_rate": 1.03212280460721e-06, + "loss": 0.6596, + "step": 19620 + }, + { + "epoch": 1.4175230733108168, + "grad_norm": 5.798538010593941, + "learning_rate": 1.0318860405483967e-06, + "loss": 0.5972, + "step": 19621 + }, + { + "epoch": 1.4175953185110264, + "grad_norm": 6.148652476487131, + "learning_rate": 1.0316492965869247e-06, + "loss": 0.6175, + "step": 19622 + }, + { + "epoch": 1.417667563711236, + "grad_norm": 6.507922699397107, + "learning_rate": 1.0314125727260327e-06, + "loss": 0.5796, + "step": 19623 + }, + { + "epoch": 1.4177398089114455, + "grad_norm": 8.10919765506346, + "learning_rate": 1.0311758689689624e-06, + "loss": 0.6273, + "step": 19624 + }, + { + "epoch": 1.417812054111655, + "grad_norm": 7.8014208551005915, + "learning_rate": 1.030939185318953e-06, + "loss": 0.5742, + "step": 19625 + }, + { + "epoch": 1.4178842993118645, + "grad_norm": 6.797835779795103, + "learning_rate": 1.0307025217792473e-06, + "loss": 0.5674, + "step": 19626 + }, + { + "epoch": 1.4179565445120739, + "grad_norm": 6.950114560726791, + "learning_rate": 1.0304658783530825e-06, + "loss": 0.7024, + "step": 19627 + }, + { + "epoch": 1.4180287897122834, + "grad_norm": 7.343534166126843, + "learning_rate": 1.0302292550436987e-06, + "loss": 0.5866, + "step": 19628 + }, + { + "epoch": 1.418101034912493, + "grad_norm": 6.620146503037503, + "learning_rate": 1.029992651854336e-06, + "loss": 0.6388, + "step": 19629 + }, + { + "epoch": 1.4181732801127025, + "grad_norm": 5.644653403468461, + "learning_rate": 1.0297560687882319e-06, + "loss": 0.6337, + "step": 19630 + }, + { + "epoch": 1.418245525312912, + "grad_norm": 6.340665922601586, + "learning_rate": 1.0295195058486253e-06, + "loss": 0.6573, + "step": 19631 + }, + { + "epoch": 1.4183177705131216, + "grad_norm": 8.087109761794778, + "learning_rate": 1.0292829630387551e-06, + "loss": 0.5594, + "step": 19632 + }, + { + "epoch": 1.4183900157133311, + "grad_norm": 7.731846128161148, + "learning_rate": 1.0290464403618592e-06, + "loss": 0.6698, + "step": 19633 + }, + { + "epoch": 1.4184622609135404, + "grad_norm": 7.1192877106719195, + "learning_rate": 1.0288099378211755e-06, + "loss": 0.6223, + "step": 19634 + }, + { + "epoch": 1.4185345061137502, + "grad_norm": 7.289159579923586, + "learning_rate": 1.028573455419941e-06, + "loss": 0.5991, + "step": 19635 + }, + { + "epoch": 1.4186067513139595, + "grad_norm": 6.742151006111402, + "learning_rate": 1.028336993161395e-06, + "loss": 0.6676, + "step": 19636 + }, + { + "epoch": 1.418678996514169, + "grad_norm": 7.900077732524639, + "learning_rate": 1.0281005510487715e-06, + "loss": 0.6669, + "step": 19637 + }, + { + "epoch": 1.4187512417143786, + "grad_norm": 7.436633147684655, + "learning_rate": 1.0278641290853086e-06, + "loss": 0.6392, + "step": 19638 + }, + { + "epoch": 1.4188234869145882, + "grad_norm": 7.503078337575513, + "learning_rate": 1.027627727274243e-06, + "loss": 0.7185, + "step": 19639 + }, + { + "epoch": 1.4188957321147977, + "grad_norm": 8.784263881006327, + "learning_rate": 1.0273913456188115e-06, + "loss": 0.6157, + "step": 19640 + }, + { + "epoch": 1.418967977315007, + "grad_norm": 9.393304883572245, + "learning_rate": 1.0271549841222479e-06, + "loss": 0.6044, + "step": 19641 + }, + { + "epoch": 1.4190402225152168, + "grad_norm": 6.740697596640534, + "learning_rate": 1.0269186427877893e-06, + "loss": 0.6417, + "step": 19642 + }, + { + "epoch": 1.419112467715426, + "grad_norm": 7.447284833521648, + "learning_rate": 1.0266823216186703e-06, + "loss": 0.6544, + "step": 19643 + }, + { + "epoch": 1.4191847129156356, + "grad_norm": 6.4201048850169995, + "learning_rate": 1.0264460206181264e-06, + "loss": 0.6641, + "step": 19644 + }, + { + "epoch": 1.4192569581158452, + "grad_norm": 8.143537296963233, + "learning_rate": 1.0262097397893927e-06, + "loss": 0.6096, + "step": 19645 + }, + { + "epoch": 1.4193292033160547, + "grad_norm": 6.465527012609355, + "learning_rate": 1.0259734791357032e-06, + "loss": 0.6179, + "step": 19646 + }, + { + "epoch": 1.4194014485162643, + "grad_norm": 7.270055720182655, + "learning_rate": 1.0257372386602928e-06, + "loss": 0.6431, + "step": 19647 + }, + { + "epoch": 1.4194736937164736, + "grad_norm": 6.324843924183679, + "learning_rate": 1.0255010183663943e-06, + "loss": 0.6489, + "step": 19648 + }, + { + "epoch": 1.4195459389166833, + "grad_norm": 8.461869376054109, + "learning_rate": 1.0252648182572422e-06, + "loss": 0.5737, + "step": 19649 + }, + { + "epoch": 1.4196181841168927, + "grad_norm": 8.638850052212353, + "learning_rate": 1.0250286383360695e-06, + "loss": 0.6218, + "step": 19650 + }, + { + "epoch": 1.4196904293171022, + "grad_norm": 6.859165614240845, + "learning_rate": 1.0247924786061103e-06, + "loss": 0.6529, + "step": 19651 + }, + { + "epoch": 1.4197626745173118, + "grad_norm": 6.887103437302471, + "learning_rate": 1.024556339070596e-06, + "loss": 0.5654, + "step": 19652 + }, + { + "epoch": 1.4198349197175213, + "grad_norm": 8.485539421419297, + "learning_rate": 1.0243202197327587e-06, + "loss": 0.6784, + "step": 19653 + }, + { + "epoch": 1.4199071649177308, + "grad_norm": 7.079009348210438, + "learning_rate": 1.0240841205958335e-06, + "loss": 0.5494, + "step": 19654 + }, + { + "epoch": 1.4199794101179402, + "grad_norm": 7.103296519338778, + "learning_rate": 1.02384804166305e-06, + "loss": 0.546, + "step": 19655 + }, + { + "epoch": 1.42005165531815, + "grad_norm": 6.730098915676866, + "learning_rate": 1.0236119829376406e-06, + "loss": 0.6314, + "step": 19656 + }, + { + "epoch": 1.4201239005183592, + "grad_norm": 6.569974294115754, + "learning_rate": 1.0233759444228367e-06, + "loss": 0.5918, + "step": 19657 + }, + { + "epoch": 1.4201961457185688, + "grad_norm": 7.7141507303447545, + "learning_rate": 1.0231399261218709e-06, + "loss": 0.5835, + "step": 19658 + }, + { + "epoch": 1.4202683909187783, + "grad_norm": 7.0174696776166785, + "learning_rate": 1.0229039280379713e-06, + "loss": 0.5824, + "step": 19659 + }, + { + "epoch": 1.4203406361189879, + "grad_norm": 7.164777846902092, + "learning_rate": 1.0226679501743706e-06, + "loss": 0.641, + "step": 19660 + }, + { + "epoch": 1.4204128813191974, + "grad_norm": 8.704802702410355, + "learning_rate": 1.0224319925342982e-06, + "loss": 0.6445, + "step": 19661 + }, + { + "epoch": 1.420485126519407, + "grad_norm": 8.759158763468804, + "learning_rate": 1.0221960551209848e-06, + "loss": 0.6336, + "step": 19662 + }, + { + "epoch": 1.4205573717196165, + "grad_norm": 7.2950763977108215, + "learning_rate": 1.0219601379376598e-06, + "loss": 0.6196, + "step": 19663 + }, + { + "epoch": 1.4206296169198258, + "grad_norm": 6.391321270234012, + "learning_rate": 1.0217242409875531e-06, + "loss": 0.5981, + "step": 19664 + }, + { + "epoch": 1.4207018621200354, + "grad_norm": 8.490421058808531, + "learning_rate": 1.0214883642738943e-06, + "loss": 0.6362, + "step": 19665 + }, + { + "epoch": 1.420774107320245, + "grad_norm": 7.125534572959383, + "learning_rate": 1.021252507799911e-06, + "loss": 0.6651, + "step": 19666 + }, + { + "epoch": 1.4208463525204544, + "grad_norm": 8.104013894282911, + "learning_rate": 1.0210166715688327e-06, + "loss": 0.6157, + "step": 19667 + }, + { + "epoch": 1.420918597720664, + "grad_norm": 7.433433895827862, + "learning_rate": 1.020780855583888e-06, + "loss": 0.6351, + "step": 19668 + }, + { + "epoch": 1.4209908429208735, + "grad_norm": 6.2083693467019465, + "learning_rate": 1.0205450598483057e-06, + "loss": 0.6356, + "step": 19669 + }, + { + "epoch": 1.421063088121083, + "grad_norm": 5.4016736015566424, + "learning_rate": 1.0203092843653118e-06, + "loss": 0.6528, + "step": 19670 + }, + { + "epoch": 1.4211353333212924, + "grad_norm": 6.118414627874916, + "learning_rate": 1.020073529138135e-06, + "loss": 0.6617, + "step": 19671 + }, + { + "epoch": 1.421207578521502, + "grad_norm": 7.1226422608382425, + "learning_rate": 1.0198377941700025e-06, + "loss": 0.5945, + "step": 19672 + }, + { + "epoch": 1.4212798237217115, + "grad_norm": 6.156137223348365, + "learning_rate": 1.0196020794641412e-06, + "loss": 0.5603, + "step": 19673 + }, + { + "epoch": 1.421352068921921, + "grad_norm": 7.270337223590565, + "learning_rate": 1.0193663850237782e-06, + "loss": 0.6363, + "step": 19674 + }, + { + "epoch": 1.4214243141221305, + "grad_norm": 7.810670928465954, + "learning_rate": 1.0191307108521394e-06, + "loss": 0.6524, + "step": 19675 + }, + { + "epoch": 1.42149655932234, + "grad_norm": 7.738575790630598, + "learning_rate": 1.0188950569524525e-06, + "loss": 0.6156, + "step": 19676 + }, + { + "epoch": 1.4215688045225496, + "grad_norm": 7.232016259217341, + "learning_rate": 1.0186594233279413e-06, + "loss": 0.6628, + "step": 19677 + }, + { + "epoch": 1.421641049722759, + "grad_norm": 7.703746716400986, + "learning_rate": 1.0184238099818322e-06, + "loss": 0.5833, + "step": 19678 + }, + { + "epoch": 1.4217132949229685, + "grad_norm": 7.349509719793415, + "learning_rate": 1.018188216917352e-06, + "loss": 0.6706, + "step": 19679 + }, + { + "epoch": 1.421785540123178, + "grad_norm": 7.784344283892882, + "learning_rate": 1.0179526441377235e-06, + "loss": 0.6258, + "step": 19680 + }, + { + "epoch": 1.4218577853233876, + "grad_norm": 7.108771061658976, + "learning_rate": 1.0177170916461719e-06, + "loss": 0.5553, + "step": 19681 + }, + { + "epoch": 1.4219300305235971, + "grad_norm": 9.28255117010329, + "learning_rate": 1.0174815594459232e-06, + "loss": 0.696, + "step": 19682 + }, + { + "epoch": 1.4220022757238067, + "grad_norm": 7.210195517673302, + "learning_rate": 1.0172460475402012e-06, + "loss": 0.6498, + "step": 19683 + }, + { + "epoch": 1.4220745209240162, + "grad_norm": 7.599375960177971, + "learning_rate": 1.0170105559322295e-06, + "loss": 0.6067, + "step": 19684 + }, + { + "epoch": 1.4221467661242255, + "grad_norm": 6.878358055509681, + "learning_rate": 1.0167750846252312e-06, + "loss": 0.5621, + "step": 19685 + }, + { + "epoch": 1.422219011324435, + "grad_norm": 7.084179038630178, + "learning_rate": 1.0165396336224314e-06, + "loss": 0.6035, + "step": 19686 + }, + { + "epoch": 1.4222912565246446, + "grad_norm": 6.470644466135417, + "learning_rate": 1.0163042029270514e-06, + "loss": 0.5941, + "step": 19687 + }, + { + "epoch": 1.4223635017248542, + "grad_norm": 7.835114242801691, + "learning_rate": 1.0160687925423145e-06, + "loss": 0.6453, + "step": 19688 + }, + { + "epoch": 1.4224357469250637, + "grad_norm": 7.31396130689651, + "learning_rate": 1.0158334024714443e-06, + "loss": 0.6876, + "step": 19689 + }, + { + "epoch": 1.4225079921252732, + "grad_norm": 6.739843568882575, + "learning_rate": 1.0155980327176617e-06, + "loss": 0.6338, + "step": 19690 + }, + { + "epoch": 1.4225802373254828, + "grad_norm": 8.00381521803406, + "learning_rate": 1.01536268328419e-06, + "loss": 0.674, + "step": 19691 + }, + { + "epoch": 1.422652482525692, + "grad_norm": 6.484768260382875, + "learning_rate": 1.01512735417425e-06, + "loss": 0.665, + "step": 19692 + }, + { + "epoch": 1.4227247277259016, + "grad_norm": 7.077585768211961, + "learning_rate": 1.0148920453910642e-06, + "loss": 0.6329, + "step": 19693 + }, + { + "epoch": 1.4227969729261112, + "grad_norm": 6.682575444781399, + "learning_rate": 1.0146567569378536e-06, + "loss": 0.5886, + "step": 19694 + }, + { + "epoch": 1.4228692181263207, + "grad_norm": 7.210090761076381, + "learning_rate": 1.014421488817838e-06, + "loss": 0.6241, + "step": 19695 + }, + { + "epoch": 1.4229414633265303, + "grad_norm": 6.403524274837078, + "learning_rate": 1.0141862410342385e-06, + "loss": 0.6009, + "step": 19696 + }, + { + "epoch": 1.4230137085267398, + "grad_norm": 8.06483934160855, + "learning_rate": 1.013951013590277e-06, + "loss": 0.6725, + "step": 19697 + }, + { + "epoch": 1.4230859537269493, + "grad_norm": 6.021854969539695, + "learning_rate": 1.013715806489171e-06, + "loss": 0.6167, + "step": 19698 + }, + { + "epoch": 1.4231581989271587, + "grad_norm": 6.957817226008248, + "learning_rate": 1.0134806197341419e-06, + "loss": 0.6274, + "step": 19699 + }, + { + "epoch": 1.4232304441273682, + "grad_norm": 7.723762892147281, + "learning_rate": 1.0132454533284086e-06, + "loss": 0.5643, + "step": 19700 + }, + { + "epoch": 1.4233026893275778, + "grad_norm": 6.199445502803966, + "learning_rate": 1.0130103072751907e-06, + "loss": 0.6702, + "step": 19701 + }, + { + "epoch": 1.4233749345277873, + "grad_norm": 7.334238979019377, + "learning_rate": 1.0127751815777073e-06, + "loss": 0.6127, + "step": 19702 + }, + { + "epoch": 1.4234471797279968, + "grad_norm": 6.25179753203763, + "learning_rate": 1.0125400762391768e-06, + "loss": 0.6459, + "step": 19703 + }, + { + "epoch": 1.4235194249282064, + "grad_norm": 7.026432268743939, + "learning_rate": 1.0123049912628185e-06, + "loss": 0.5751, + "step": 19704 + }, + { + "epoch": 1.423591670128416, + "grad_norm": 7.378477149712407, + "learning_rate": 1.012069926651849e-06, + "loss": 0.6487, + "step": 19705 + }, + { + "epoch": 1.4236639153286252, + "grad_norm": 6.882604658645376, + "learning_rate": 1.011834882409487e-06, + "loss": 0.5711, + "step": 19706 + }, + { + "epoch": 1.423736160528835, + "grad_norm": 6.62743030812197, + "learning_rate": 1.0115998585389498e-06, + "loss": 0.6328, + "step": 19707 + }, + { + "epoch": 1.4238084057290443, + "grad_norm": 7.049788927746791, + "learning_rate": 1.0113648550434558e-06, + "loss": 0.6282, + "step": 19708 + }, + { + "epoch": 1.4238806509292539, + "grad_norm": 7.173154363609494, + "learning_rate": 1.0111298719262203e-06, + "loss": 0.599, + "step": 19709 + }, + { + "epoch": 1.4239528961294634, + "grad_norm": 5.651100301365113, + "learning_rate": 1.0108949091904596e-06, + "loss": 0.5786, + "step": 19710 + }, + { + "epoch": 1.424025141329673, + "grad_norm": 6.473063776681211, + "learning_rate": 1.0106599668393935e-06, + "loss": 0.6332, + "step": 19711 + }, + { + "epoch": 1.4240973865298825, + "grad_norm": 6.523227564922868, + "learning_rate": 1.0104250448762351e-06, + "loss": 0.6026, + "step": 19712 + }, + { + "epoch": 1.4241696317300918, + "grad_norm": 6.314548575484886, + "learning_rate": 1.010190143304201e-06, + "loss": 0.593, + "step": 19713 + }, + { + "epoch": 1.4242418769303016, + "grad_norm": 8.150290698150311, + "learning_rate": 1.0099552621265073e-06, + "loss": 0.6176, + "step": 19714 + }, + { + "epoch": 1.424314122130511, + "grad_norm": 8.614598733564554, + "learning_rate": 1.0097204013463702e-06, + "loss": 0.7113, + "step": 19715 + }, + { + "epoch": 1.4243863673307204, + "grad_norm": 7.477161475157931, + "learning_rate": 1.0094855609670026e-06, + "loss": 0.6524, + "step": 19716 + }, + { + "epoch": 1.42445861253093, + "grad_norm": 7.91462908841455, + "learning_rate": 1.0092507409916203e-06, + "loss": 0.6335, + "step": 19717 + }, + { + "epoch": 1.4245308577311395, + "grad_norm": 6.63430133191112, + "learning_rate": 1.009015941423438e-06, + "loss": 0.6203, + "step": 19718 + }, + { + "epoch": 1.424603102931349, + "grad_norm": 6.77333668916474, + "learning_rate": 1.0087811622656696e-06, + "loss": 0.6522, + "step": 19719 + }, + { + "epoch": 1.4246753481315584, + "grad_norm": 7.817291255417153, + "learning_rate": 1.0085464035215295e-06, + "loss": 0.6802, + "step": 19720 + }, + { + "epoch": 1.4247475933317681, + "grad_norm": 7.076764041172603, + "learning_rate": 1.0083116651942306e-06, + "loss": 0.6462, + "step": 19721 + }, + { + "epoch": 1.4248198385319775, + "grad_norm": 6.667392055465845, + "learning_rate": 1.0080769472869884e-06, + "loss": 0.5888, + "step": 19722 + }, + { + "epoch": 1.424892083732187, + "grad_norm": 7.62035015473127, + "learning_rate": 1.007842249803013e-06, + "loss": 0.6032, + "step": 19723 + }, + { + "epoch": 1.4249643289323966, + "grad_norm": 7.87022448867866, + "learning_rate": 1.0076075727455186e-06, + "loss": 0.6069, + "step": 19724 + }, + { + "epoch": 1.425036574132606, + "grad_norm": 6.978658841764392, + "learning_rate": 1.0073729161177178e-06, + "loss": 0.5966, + "step": 19725 + }, + { + "epoch": 1.4251088193328156, + "grad_norm": 6.7459673668134, + "learning_rate": 1.0071382799228239e-06, + "loss": 0.6101, + "step": 19726 + }, + { + "epoch": 1.425181064533025, + "grad_norm": 7.4047888751072515, + "learning_rate": 1.0069036641640468e-06, + "loss": 0.6578, + "step": 19727 + }, + { + "epoch": 1.4252533097332347, + "grad_norm": 5.9465131913182105, + "learning_rate": 1.0066690688445993e-06, + "loss": 0.5515, + "step": 19728 + }, + { + "epoch": 1.425325554933444, + "grad_norm": 6.7719747510463, + "learning_rate": 1.0064344939676927e-06, + "loss": 0.6317, + "step": 19729 + }, + { + "epoch": 1.4253978001336536, + "grad_norm": 8.078903662453651, + "learning_rate": 1.0061999395365383e-06, + "loss": 0.6419, + "step": 19730 + }, + { + "epoch": 1.4254700453338631, + "grad_norm": 6.962842139119404, + "learning_rate": 1.0059654055543466e-06, + "loss": 0.6467, + "step": 19731 + }, + { + "epoch": 1.4255422905340727, + "grad_norm": 7.865950213420485, + "learning_rate": 1.0057308920243287e-06, + "loss": 0.6663, + "step": 19732 + }, + { + "epoch": 1.4256145357342822, + "grad_norm": 7.229257310048025, + "learning_rate": 1.0054963989496955e-06, + "loss": 0.7073, + "step": 19733 + }, + { + "epoch": 1.4256867809344915, + "grad_norm": 6.726101140627836, + "learning_rate": 1.0052619263336553e-06, + "loss": 0.5975, + "step": 19734 + }, + { + "epoch": 1.4257590261347013, + "grad_norm": 9.170507377806652, + "learning_rate": 1.0050274741794186e-06, + "loss": 0.6341, + "step": 19735 + }, + { + "epoch": 1.4258312713349106, + "grad_norm": 9.409163825314211, + "learning_rate": 1.004793042490196e-06, + "loss": 0.7347, + "step": 19736 + }, + { + "epoch": 1.4259035165351202, + "grad_norm": 7.723968595025073, + "learning_rate": 1.004558631269195e-06, + "loss": 0.6404, + "step": 19737 + }, + { + "epoch": 1.4259757617353297, + "grad_norm": 6.8986752772582856, + "learning_rate": 1.0043242405196237e-06, + "loss": 0.5744, + "step": 19738 + }, + { + "epoch": 1.4260480069355392, + "grad_norm": 7.431116522255184, + "learning_rate": 1.0040898702446938e-06, + "loss": 0.6422, + "step": 19739 + }, + { + "epoch": 1.4261202521357488, + "grad_norm": 6.484885616190784, + "learning_rate": 1.0038555204476127e-06, + "loss": 0.5668, + "step": 19740 + }, + { + "epoch": 1.4261924973359583, + "grad_norm": 7.162435864865427, + "learning_rate": 1.003621191131587e-06, + "loss": 0.673, + "step": 19741 + }, + { + "epoch": 1.4262647425361679, + "grad_norm": 6.906601288435715, + "learning_rate": 1.0033868822998252e-06, + "loss": 0.5614, + "step": 19742 + }, + { + "epoch": 1.4263369877363772, + "grad_norm": 6.82860795676522, + "learning_rate": 1.0031525939555358e-06, + "loss": 0.6393, + "step": 19743 + }, + { + "epoch": 1.4264092329365867, + "grad_norm": 5.812508859935284, + "learning_rate": 1.0029183261019245e-06, + "loss": 0.6158, + "step": 19744 + }, + { + "epoch": 1.4264814781367963, + "grad_norm": 6.332397224444649, + "learning_rate": 1.0026840787421989e-06, + "loss": 0.6818, + "step": 19745 + }, + { + "epoch": 1.4265537233370058, + "grad_norm": 6.007179097888736, + "learning_rate": 1.0024498518795656e-06, + "loss": 0.6061, + "step": 19746 + }, + { + "epoch": 1.4266259685372153, + "grad_norm": 7.389964035834527, + "learning_rate": 1.0022156455172312e-06, + "loss": 0.5995, + "step": 19747 + }, + { + "epoch": 1.426698213737425, + "grad_norm": 6.8274724373528235, + "learning_rate": 1.0019814596584013e-06, + "loss": 0.6008, + "step": 19748 + }, + { + "epoch": 1.4267704589376344, + "grad_norm": 8.213743592374279, + "learning_rate": 1.0017472943062824e-06, + "loss": 0.6108, + "step": 19749 + }, + { + "epoch": 1.4268427041378438, + "grad_norm": 6.408793618976567, + "learning_rate": 1.0015131494640794e-06, + "loss": 0.6119, + "step": 19750 + }, + { + "epoch": 1.4269149493380533, + "grad_norm": 7.4565232564987705, + "learning_rate": 1.0012790251349991e-06, + "loss": 0.6695, + "step": 19751 + }, + { + "epoch": 1.4269871945382628, + "grad_norm": 8.02997884821214, + "learning_rate": 1.0010449213222442e-06, + "loss": 0.7106, + "step": 19752 + }, + { + "epoch": 1.4270594397384724, + "grad_norm": 6.328593973748855, + "learning_rate": 1.0008108380290206e-06, + "loss": 0.6394, + "step": 19753 + }, + { + "epoch": 1.427131684938682, + "grad_norm": 8.492618721802259, + "learning_rate": 1.0005767752585332e-06, + "loss": 0.6447, + "step": 19754 + }, + { + "epoch": 1.4272039301388915, + "grad_norm": 7.385673945312292, + "learning_rate": 1.0003427330139848e-06, + "loss": 0.6325, + "step": 19755 + }, + { + "epoch": 1.427276175339101, + "grad_norm": 6.954944295433853, + "learning_rate": 1.0001087112985799e-06, + "loss": 0.6306, + "step": 19756 + }, + { + "epoch": 1.4273484205393103, + "grad_norm": 6.810876215330642, + "learning_rate": 9.998747101155223e-07, + "loss": 0.5835, + "step": 19757 + }, + { + "epoch": 1.4274206657395199, + "grad_norm": 7.791831302093443, + "learning_rate": 9.996407294680149e-07, + "loss": 0.6785, + "step": 19758 + }, + { + "epoch": 1.4274929109397294, + "grad_norm": 6.987569944410384, + "learning_rate": 9.994067693592612e-07, + "loss": 0.6964, + "step": 19759 + }, + { + "epoch": 1.427565156139939, + "grad_norm": 6.652241104059817, + "learning_rate": 9.991728297924638e-07, + "loss": 0.6563, + "step": 19760 + }, + { + "epoch": 1.4276374013401485, + "grad_norm": 6.79062437082947, + "learning_rate": 9.989389107708258e-07, + "loss": 0.6138, + "step": 19761 + }, + { + "epoch": 1.427709646540358, + "grad_norm": 7.237377886491419, + "learning_rate": 9.987050122975475e-07, + "loss": 0.5907, + "step": 19762 + }, + { + "epoch": 1.4277818917405676, + "grad_norm": 7.652812824603641, + "learning_rate": 9.98471134375832e-07, + "loss": 0.6398, + "step": 19763 + }, + { + "epoch": 1.427854136940777, + "grad_norm": 6.631563785761915, + "learning_rate": 9.98237277008881e-07, + "loss": 0.6475, + "step": 19764 + }, + { + "epoch": 1.4279263821409864, + "grad_norm": 5.874739945520162, + "learning_rate": 9.980034401998964e-07, + "loss": 0.6096, + "step": 19765 + }, + { + "epoch": 1.427998627341196, + "grad_norm": 6.8216561198888925, + "learning_rate": 9.977696239520767e-07, + "loss": 0.6398, + "step": 19766 + }, + { + "epoch": 1.4280708725414055, + "grad_norm": 6.8872612941244595, + "learning_rate": 9.975358282686254e-07, + "loss": 0.7253, + "step": 19767 + }, + { + "epoch": 1.428143117741615, + "grad_norm": 6.816777277688724, + "learning_rate": 9.97302053152743e-07, + "loss": 0.6322, + "step": 19768 + }, + { + "epoch": 1.4282153629418246, + "grad_norm": 6.6513714172143725, + "learning_rate": 9.970682986076277e-07, + "loss": 0.6349, + "step": 19769 + }, + { + "epoch": 1.4282876081420341, + "grad_norm": 7.228409022957529, + "learning_rate": 9.968345646364804e-07, + "loss": 0.5967, + "step": 19770 + }, + { + "epoch": 1.4283598533422435, + "grad_norm": 7.485956140477862, + "learning_rate": 9.966008512425008e-07, + "loss": 0.6226, + "step": 19771 + }, + { + "epoch": 1.428432098542453, + "grad_norm": 6.571998624777113, + "learning_rate": 9.963671584288892e-07, + "loss": 0.6722, + "step": 19772 + }, + { + "epoch": 1.4285043437426626, + "grad_norm": 6.299920569025089, + "learning_rate": 9.961334861988425e-07, + "loss": 0.5882, + "step": 19773 + }, + { + "epoch": 1.428576588942872, + "grad_norm": 6.597599298063102, + "learning_rate": 9.958998345555612e-07, + "loss": 0.5289, + "step": 19774 + }, + { + "epoch": 1.4286488341430816, + "grad_norm": 5.8793254107215605, + "learning_rate": 9.95666203502243e-07, + "loss": 0.6076, + "step": 19775 + }, + { + "epoch": 1.4287210793432912, + "grad_norm": 6.869153120101658, + "learning_rate": 9.954325930420863e-07, + "loss": 0.5849, + "step": 19776 + }, + { + "epoch": 1.4287933245435007, + "grad_norm": 7.479974842947204, + "learning_rate": 9.951990031782895e-07, + "loss": 0.5533, + "step": 19777 + }, + { + "epoch": 1.42886556974371, + "grad_norm": 6.380743991916684, + "learning_rate": 9.949654339140498e-07, + "loss": 0.6442, + "step": 19778 + }, + { + "epoch": 1.4289378149439196, + "grad_norm": 6.917617763100056, + "learning_rate": 9.947318852525657e-07, + "loss": 0.6328, + "step": 19779 + }, + { + "epoch": 1.4290100601441291, + "grad_norm": 6.407260424317053, + "learning_rate": 9.944983571970324e-07, + "loss": 0.6013, + "step": 19780 + }, + { + "epoch": 1.4290823053443387, + "grad_norm": 7.291359101121265, + "learning_rate": 9.942648497506476e-07, + "loss": 0.6506, + "step": 19781 + }, + { + "epoch": 1.4291545505445482, + "grad_norm": 7.391978746755193, + "learning_rate": 9.94031362916608e-07, + "loss": 0.6238, + "step": 19782 + }, + { + "epoch": 1.4292267957447577, + "grad_norm": 6.6244872272629465, + "learning_rate": 9.937978966981105e-07, + "loss": 0.596, + "step": 19783 + }, + { + "epoch": 1.4292990409449673, + "grad_norm": 6.650941836554611, + "learning_rate": 9.935644510983492e-07, + "loss": 0.6273, + "step": 19784 + }, + { + "epoch": 1.4293712861451766, + "grad_norm": 7.864998172825276, + "learning_rate": 9.933310261205211e-07, + "loss": 0.6346, + "step": 19785 + }, + { + "epoch": 1.4294435313453864, + "grad_norm": 6.264528706596489, + "learning_rate": 9.930976217678214e-07, + "loss": 0.5693, + "step": 19786 + }, + { + "epoch": 1.4295157765455957, + "grad_norm": 8.62393491498914, + "learning_rate": 9.928642380434453e-07, + "loss": 0.6102, + "step": 19787 + }, + { + "epoch": 1.4295880217458052, + "grad_norm": 8.902655696048388, + "learning_rate": 9.926308749505876e-07, + "loss": 0.5609, + "step": 19788 + }, + { + "epoch": 1.4296602669460148, + "grad_norm": 7.052380088373617, + "learning_rate": 9.923975324924427e-07, + "loss": 0.6309, + "step": 19789 + }, + { + "epoch": 1.4297325121462243, + "grad_norm": 6.433275632978354, + "learning_rate": 9.92164210672206e-07, + "loss": 0.6548, + "step": 19790 + }, + { + "epoch": 1.4298047573464339, + "grad_norm": 5.661439990938356, + "learning_rate": 9.919309094930695e-07, + "loss": 0.5913, + "step": 19791 + }, + { + "epoch": 1.4298770025466432, + "grad_norm": 7.8017257248565635, + "learning_rate": 9.916976289582279e-07, + "loss": 0.6796, + "step": 19792 + }, + { + "epoch": 1.429949247746853, + "grad_norm": 6.785136853653191, + "learning_rate": 9.914643690708755e-07, + "loss": 0.6686, + "step": 19793 + }, + { + "epoch": 1.4300214929470623, + "grad_norm": 7.589706034841699, + "learning_rate": 9.912311298342028e-07, + "loss": 0.6395, + "step": 19794 + }, + { + "epoch": 1.4300937381472718, + "grad_norm": 10.208420692121958, + "learning_rate": 9.909979112514056e-07, + "loss": 0.7103, + "step": 19795 + }, + { + "epoch": 1.4301659833474814, + "grad_norm": 6.679245747910937, + "learning_rate": 9.907647133256752e-07, + "loss": 0.6124, + "step": 19796 + }, + { + "epoch": 1.430238228547691, + "grad_norm": 6.399244991114094, + "learning_rate": 9.905315360602047e-07, + "loss": 0.6097, + "step": 19797 + }, + { + "epoch": 1.4303104737479004, + "grad_norm": 6.611924421502719, + "learning_rate": 9.902983794581847e-07, + "loss": 0.5927, + "step": 19798 + }, + { + "epoch": 1.4303827189481098, + "grad_norm": 6.387333626269375, + "learning_rate": 9.900652435228079e-07, + "loss": 0.5487, + "step": 19799 + }, + { + "epoch": 1.4304549641483195, + "grad_norm": 6.55484143191197, + "learning_rate": 9.898321282572652e-07, + "loss": 0.5886, + "step": 19800 + }, + { + "epoch": 1.4305272093485288, + "grad_norm": 7.481926312387348, + "learning_rate": 9.89599033664749e-07, + "loss": 0.6396, + "step": 19801 + }, + { + "epoch": 1.4305994545487384, + "grad_norm": 6.862591452067381, + "learning_rate": 9.893659597484488e-07, + "loss": 0.5815, + "step": 19802 + }, + { + "epoch": 1.430671699748948, + "grad_norm": 6.413943253592343, + "learning_rate": 9.89132906511555e-07, + "loss": 0.6318, + "step": 19803 + }, + { + "epoch": 1.4307439449491575, + "grad_norm": 7.497385713800772, + "learning_rate": 9.88899873957259e-07, + "loss": 0.5931, + "step": 19804 + }, + { + "epoch": 1.430816190149367, + "grad_norm": 5.160490199912558, + "learning_rate": 9.886668620887504e-07, + "loss": 0.5453, + "step": 19805 + }, + { + "epoch": 1.4308884353495763, + "grad_norm": 6.286371524930967, + "learning_rate": 9.884338709092191e-07, + "loss": 0.592, + "step": 19806 + }, + { + "epoch": 1.430960680549786, + "grad_norm": 6.820598023895299, + "learning_rate": 9.882009004218544e-07, + "loss": 0.6416, + "step": 19807 + }, + { + "epoch": 1.4310329257499954, + "grad_norm": 7.136548771580043, + "learning_rate": 9.879679506298464e-07, + "loss": 0.6311, + "step": 19808 + }, + { + "epoch": 1.431105170950205, + "grad_norm": 7.038690182474526, + "learning_rate": 9.87735021536382e-07, + "loss": 0.6881, + "step": 19809 + }, + { + "epoch": 1.4311774161504145, + "grad_norm": 6.76697440488994, + "learning_rate": 9.875021131446514e-07, + "loss": 0.6146, + "step": 19810 + }, + { + "epoch": 1.431249661350624, + "grad_norm": 6.855676301167283, + "learning_rate": 9.87269225457843e-07, + "loss": 0.6382, + "step": 19811 + }, + { + "epoch": 1.4313219065508336, + "grad_norm": 6.596122134214364, + "learning_rate": 9.870363584791437e-07, + "loss": 0.5792, + "step": 19812 + }, + { + "epoch": 1.4313941517510431, + "grad_norm": 7.585774570212203, + "learning_rate": 9.86803512211742e-07, + "loss": 0.6411, + "step": 19813 + }, + { + "epoch": 1.4314663969512527, + "grad_norm": 6.89994503496917, + "learning_rate": 9.865706866588252e-07, + "loss": 0.6296, + "step": 19814 + }, + { + "epoch": 1.431538642151462, + "grad_norm": 7.321005284637182, + "learning_rate": 9.863378818235802e-07, + "loss": 0.6287, + "step": 19815 + }, + { + "epoch": 1.4316108873516715, + "grad_norm": 7.661838033686392, + "learning_rate": 9.861050977091946e-07, + "loss": 0.6185, + "step": 19816 + }, + { + "epoch": 1.431683132551881, + "grad_norm": 7.113835203752779, + "learning_rate": 9.85872334318855e-07, + "loss": 0.6834, + "step": 19817 + }, + { + "epoch": 1.4317553777520906, + "grad_norm": 6.141645895078705, + "learning_rate": 9.856395916557482e-07, + "loss": 0.653, + "step": 19818 + }, + { + "epoch": 1.4318276229523001, + "grad_norm": 6.118568625017045, + "learning_rate": 9.854068697230584e-07, + "loss": 0.5798, + "step": 19819 + }, + { + "epoch": 1.4318998681525097, + "grad_norm": 6.987780668732842, + "learning_rate": 9.851741685239729e-07, + "loss": 0.6495, + "step": 19820 + }, + { + "epoch": 1.4319721133527192, + "grad_norm": 7.452866779708597, + "learning_rate": 9.849414880616767e-07, + "loss": 0.6171, + "step": 19821 + }, + { + "epoch": 1.4320443585529286, + "grad_norm": 6.526816291431804, + "learning_rate": 9.84708828339356e-07, + "loss": 0.5827, + "step": 19822 + }, + { + "epoch": 1.432116603753138, + "grad_norm": 7.881353086301061, + "learning_rate": 9.844761893601933e-07, + "loss": 0.6488, + "step": 19823 + }, + { + "epoch": 1.4321888489533476, + "grad_norm": 7.21556384299007, + "learning_rate": 9.842435711273758e-07, + "loss": 0.6227, + "step": 19824 + }, + { + "epoch": 1.4322610941535572, + "grad_norm": 8.163319998382772, + "learning_rate": 9.840109736440867e-07, + "loss": 0.6932, + "step": 19825 + }, + { + "epoch": 1.4323333393537667, + "grad_norm": 7.672942978745387, + "learning_rate": 9.837783969135113e-07, + "loss": 0.7145, + "step": 19826 + }, + { + "epoch": 1.4324055845539763, + "grad_norm": 6.695938633025701, + "learning_rate": 9.835458409388312e-07, + "loss": 0.6393, + "step": 19827 + }, + { + "epoch": 1.4324778297541858, + "grad_norm": 7.259263007638069, + "learning_rate": 9.833133057232313e-07, + "loss": 0.6517, + "step": 19828 + }, + { + "epoch": 1.4325500749543951, + "grad_norm": 6.90565066951939, + "learning_rate": 9.830807912698957e-07, + "loss": 0.6033, + "step": 19829 + }, + { + "epoch": 1.4326223201546047, + "grad_norm": 7.295157187469821, + "learning_rate": 9.82848297582005e-07, + "loss": 0.6811, + "step": 19830 + }, + { + "epoch": 1.4326945653548142, + "grad_norm": 7.8042102014807675, + "learning_rate": 9.826158246627433e-07, + "loss": 0.631, + "step": 19831 + }, + { + "epoch": 1.4327668105550238, + "grad_norm": 6.543645895464134, + "learning_rate": 9.823833725152926e-07, + "loss": 0.6925, + "step": 19832 + }, + { + "epoch": 1.4328390557552333, + "grad_norm": 6.785903669883869, + "learning_rate": 9.821509411428353e-07, + "loss": 0.6982, + "step": 19833 + }, + { + "epoch": 1.4329113009554428, + "grad_norm": 6.435019894964116, + "learning_rate": 9.81918530548553e-07, + "loss": 0.6589, + "step": 19834 + }, + { + "epoch": 1.4329835461556524, + "grad_norm": 8.031337396168446, + "learning_rate": 9.816861407356275e-07, + "loss": 0.6841, + "step": 19835 + }, + { + "epoch": 1.4330557913558617, + "grad_norm": 7.681502320853222, + "learning_rate": 9.814537717072405e-07, + "loss": 0.6445, + "step": 19836 + }, + { + "epoch": 1.4331280365560712, + "grad_norm": 6.653324540972713, + "learning_rate": 9.812214234665717e-07, + "loss": 0.5502, + "step": 19837 + }, + { + "epoch": 1.4332002817562808, + "grad_norm": 7.959555194254316, + "learning_rate": 9.809890960168022e-07, + "loss": 0.631, + "step": 19838 + }, + { + "epoch": 1.4332725269564903, + "grad_norm": 6.19857844391291, + "learning_rate": 9.807567893611124e-07, + "loss": 0.6253, + "step": 19839 + }, + { + "epoch": 1.4333447721566999, + "grad_norm": 6.631079421514539, + "learning_rate": 9.80524503502684e-07, + "loss": 0.579, + "step": 19840 + }, + { + "epoch": 1.4334170173569094, + "grad_norm": 6.339455038530274, + "learning_rate": 9.80292238444694e-07, + "loss": 0.6283, + "step": 19841 + }, + { + "epoch": 1.433489262557119, + "grad_norm": 5.9623221892708775, + "learning_rate": 9.80059994190323e-07, + "loss": 0.5813, + "step": 19842 + }, + { + "epoch": 1.4335615077573283, + "grad_norm": 8.246502221497316, + "learning_rate": 9.798277707427508e-07, + "loss": 0.6328, + "step": 19843 + }, + { + "epoch": 1.4336337529575378, + "grad_norm": 6.368758493404696, + "learning_rate": 9.795955681051563e-07, + "loss": 0.6152, + "step": 19844 + }, + { + "epoch": 1.4337059981577474, + "grad_norm": 7.318069814711273, + "learning_rate": 9.793633862807178e-07, + "loss": 0.5857, + "step": 19845 + }, + { + "epoch": 1.433778243357957, + "grad_norm": 6.439343234863841, + "learning_rate": 9.791312252726137e-07, + "loss": 0.619, + "step": 19846 + }, + { + "epoch": 1.4338504885581664, + "grad_norm": 7.483923147813121, + "learning_rate": 9.788990850840232e-07, + "loss": 0.6506, + "step": 19847 + }, + { + "epoch": 1.433922733758376, + "grad_norm": 6.825729822954231, + "learning_rate": 9.786669657181225e-07, + "loss": 0.6355, + "step": 19848 + }, + { + "epoch": 1.4339949789585855, + "grad_norm": 7.133038453516545, + "learning_rate": 9.784348671780893e-07, + "loss": 0.6078, + "step": 19849 + }, + { + "epoch": 1.4340672241587948, + "grad_norm": 7.215508331769831, + "learning_rate": 9.78202789467102e-07, + "loss": 0.5401, + "step": 19850 + }, + { + "epoch": 1.4341394693590044, + "grad_norm": 6.1775117653735085, + "learning_rate": 9.779707325883365e-07, + "loss": 0.5882, + "step": 19851 + }, + { + "epoch": 1.434211714559214, + "grad_norm": 9.579664196906293, + "learning_rate": 9.777386965449701e-07, + "loss": 0.6144, + "step": 19852 + }, + { + "epoch": 1.4342839597594235, + "grad_norm": 8.432527108034362, + "learning_rate": 9.775066813401788e-07, + "loss": 0.6998, + "step": 19853 + }, + { + "epoch": 1.434356204959633, + "grad_norm": 6.274828308917821, + "learning_rate": 9.7727468697714e-07, + "loss": 0.5855, + "step": 19854 + }, + { + "epoch": 1.4344284501598425, + "grad_norm": 6.397806995892378, + "learning_rate": 9.77042713459027e-07, + "loss": 0.5605, + "step": 19855 + }, + { + "epoch": 1.434500695360052, + "grad_norm": 8.096241924546305, + "learning_rate": 9.768107607890173e-07, + "loss": 0.6236, + "step": 19856 + }, + { + "epoch": 1.4345729405602614, + "grad_norm": 7.0360422037234915, + "learning_rate": 9.765788289702855e-07, + "loss": 0.6409, + "step": 19857 + }, + { + "epoch": 1.4346451857604712, + "grad_norm": 7.772600506525033, + "learning_rate": 9.763469180060072e-07, + "loss": 0.638, + "step": 19858 + }, + { + "epoch": 1.4347174309606805, + "grad_norm": 6.48957957782504, + "learning_rate": 9.761150278993561e-07, + "loss": 0.5734, + "step": 19859 + }, + { + "epoch": 1.43478967616089, + "grad_norm": 7.320302859435354, + "learning_rate": 9.75883158653507e-07, + "loss": 0.6958, + "step": 19860 + }, + { + "epoch": 1.4348619213610996, + "grad_norm": 6.211003832192959, + "learning_rate": 9.75651310271634e-07, + "loss": 0.6086, + "step": 19861 + }, + { + "epoch": 1.4349341665613091, + "grad_norm": 7.2569499411757326, + "learning_rate": 9.754194827569107e-07, + "loss": 0.6699, + "step": 19862 + }, + { + "epoch": 1.4350064117615187, + "grad_norm": 6.697467826489742, + "learning_rate": 9.751876761125111e-07, + "loss": 0.584, + "step": 19863 + }, + { + "epoch": 1.435078656961728, + "grad_norm": 6.524891655851879, + "learning_rate": 9.749558903416085e-07, + "loss": 0.5938, + "step": 19864 + }, + { + "epoch": 1.4351509021619377, + "grad_norm": 7.069522573276253, + "learning_rate": 9.747241254473762e-07, + "loss": 0.5945, + "step": 19865 + }, + { + "epoch": 1.435223147362147, + "grad_norm": 7.081626985755523, + "learning_rate": 9.744923814329856e-07, + "loss": 0.6289, + "step": 19866 + }, + { + "epoch": 1.4352953925623566, + "grad_norm": 6.545943819890609, + "learning_rate": 9.742606583016102e-07, + "loss": 0.6244, + "step": 19867 + }, + { + "epoch": 1.4353676377625662, + "grad_norm": 7.4120216253863775, + "learning_rate": 9.74028956056422e-07, + "loss": 0.5605, + "step": 19868 + }, + { + "epoch": 1.4354398829627757, + "grad_norm": 6.566030660640766, + "learning_rate": 9.73797274700592e-07, + "loss": 0.5853, + "step": 19869 + }, + { + "epoch": 1.4355121281629852, + "grad_norm": 6.954646460558396, + "learning_rate": 9.735656142372921e-07, + "loss": 0.5968, + "step": 19870 + }, + { + "epoch": 1.4355843733631946, + "grad_norm": 6.05197047151874, + "learning_rate": 9.73333974669693e-07, + "loss": 0.6214, + "step": 19871 + }, + { + "epoch": 1.4356566185634043, + "grad_norm": 8.51633857747056, + "learning_rate": 9.73102356000968e-07, + "loss": 0.6218, + "step": 19872 + }, + { + "epoch": 1.4357288637636136, + "grad_norm": 6.838286147872546, + "learning_rate": 9.728707582342856e-07, + "loss": 0.691, + "step": 19873 + }, + { + "epoch": 1.4358011089638232, + "grad_norm": 8.608303550331415, + "learning_rate": 9.726391813728164e-07, + "loss": 0.6158, + "step": 19874 + }, + { + "epoch": 1.4358733541640327, + "grad_norm": 7.545576090423908, + "learning_rate": 9.72407625419732e-07, + "loss": 0.6629, + "step": 19875 + }, + { + "epoch": 1.4359455993642423, + "grad_norm": 5.8830271323312875, + "learning_rate": 9.721760903782e-07, + "loss": 0.6023, + "step": 19876 + }, + { + "epoch": 1.4360178445644518, + "grad_norm": 6.835727396324786, + "learning_rate": 9.719445762513916e-07, + "loss": 0.6121, + "step": 19877 + }, + { + "epoch": 1.4360900897646611, + "grad_norm": 6.239872776592998, + "learning_rate": 9.717130830424752e-07, + "loss": 0.5728, + "step": 19878 + }, + { + "epoch": 1.436162334964871, + "grad_norm": 7.133452905858944, + "learning_rate": 9.714816107546199e-07, + "loss": 0.6039, + "step": 19879 + }, + { + "epoch": 1.4362345801650802, + "grad_norm": 6.573789640351728, + "learning_rate": 9.712501593909945e-07, + "loss": 0.6652, + "step": 19880 + }, + { + "epoch": 1.4363068253652898, + "grad_norm": 6.458419438270094, + "learning_rate": 9.710187289547676e-07, + "loss": 0.712, + "step": 19881 + }, + { + "epoch": 1.4363790705654993, + "grad_norm": 6.141066363054502, + "learning_rate": 9.707873194491072e-07, + "loss": 0.6091, + "step": 19882 + }, + { + "epoch": 1.4364513157657088, + "grad_norm": 6.290975301116498, + "learning_rate": 9.705559308771817e-07, + "loss": 0.6241, + "step": 19883 + }, + { + "epoch": 1.4365235609659184, + "grad_norm": 6.0453812678831325, + "learning_rate": 9.70324563242157e-07, + "loss": 0.6048, + "step": 19884 + }, + { + "epoch": 1.436595806166128, + "grad_norm": 7.521399676555051, + "learning_rate": 9.700932165472015e-07, + "loss": 0.6194, + "step": 19885 + }, + { + "epoch": 1.4366680513663375, + "grad_norm": 6.121553346558033, + "learning_rate": 9.69861890795483e-07, + "loss": 0.6122, + "step": 19886 + }, + { + "epoch": 1.4367402965665468, + "grad_norm": 8.261429239165004, + "learning_rate": 9.696305859901661e-07, + "loss": 0.6472, + "step": 19887 + }, + { + "epoch": 1.4368125417667563, + "grad_norm": 7.325595964267307, + "learning_rate": 9.69399302134418e-07, + "loss": 0.6349, + "step": 19888 + }, + { + "epoch": 1.4368847869669659, + "grad_norm": 6.652415715901836, + "learning_rate": 9.691680392314053e-07, + "loss": 0.5954, + "step": 19889 + }, + { + "epoch": 1.4369570321671754, + "grad_norm": 6.068536643492251, + "learning_rate": 9.689367972842936e-07, + "loss": 0.6126, + "step": 19890 + }, + { + "epoch": 1.437029277367385, + "grad_norm": 6.508948988212486, + "learning_rate": 9.68705576296248e-07, + "loss": 0.6266, + "step": 19891 + }, + { + "epoch": 1.4371015225675945, + "grad_norm": 7.500345095006593, + "learning_rate": 9.684743762704343e-07, + "loss": 0.5892, + "step": 19892 + }, + { + "epoch": 1.437173767767804, + "grad_norm": 6.603612385142175, + "learning_rate": 9.682431972100182e-07, + "loss": 0.6884, + "step": 19893 + }, + { + "epoch": 1.4372460129680134, + "grad_norm": 7.51178299057619, + "learning_rate": 9.680120391181624e-07, + "loss": 0.5416, + "step": 19894 + }, + { + "epoch": 1.437318258168223, + "grad_norm": 8.143507785664344, + "learning_rate": 9.677809019980324e-07, + "loss": 0.6031, + "step": 19895 + }, + { + "epoch": 1.4373905033684324, + "grad_norm": 6.903812858741566, + "learning_rate": 9.67549785852792e-07, + "loss": 0.6777, + "step": 19896 + }, + { + "epoch": 1.437462748568642, + "grad_norm": 6.425288200594069, + "learning_rate": 9.673186906856061e-07, + "loss": 0.5766, + "step": 19897 + }, + { + "epoch": 1.4375349937688515, + "grad_norm": 7.421009149740829, + "learning_rate": 9.670876164996366e-07, + "loss": 0.631, + "step": 19898 + }, + { + "epoch": 1.437607238969061, + "grad_norm": 6.43756340514133, + "learning_rate": 9.66856563298047e-07, + "loss": 0.5626, + "step": 19899 + }, + { + "epoch": 1.4376794841692706, + "grad_norm": 6.533568632339169, + "learning_rate": 9.666255310840008e-07, + "loss": 0.6207, + "step": 19900 + }, + { + "epoch": 1.43775172936948, + "grad_norm": 6.75633055915056, + "learning_rate": 9.663945198606605e-07, + "loss": 0.6256, + "step": 19901 + }, + { + "epoch": 1.4378239745696895, + "grad_norm": 7.290819420411188, + "learning_rate": 9.661635296311885e-07, + "loss": 0.6101, + "step": 19902 + }, + { + "epoch": 1.437896219769899, + "grad_norm": 7.045761933990443, + "learning_rate": 9.659325603987472e-07, + "loss": 0.6645, + "step": 19903 + }, + { + "epoch": 1.4379684649701086, + "grad_norm": 7.007209062909309, + "learning_rate": 9.657016121664984e-07, + "loss": 0.6214, + "step": 19904 + }, + { + "epoch": 1.438040710170318, + "grad_norm": 8.239417947748239, + "learning_rate": 9.654706849376024e-07, + "loss": 0.6043, + "step": 19905 + }, + { + "epoch": 1.4381129553705276, + "grad_norm": 8.900555848326535, + "learning_rate": 9.652397787152212e-07, + "loss": 0.702, + "step": 19906 + }, + { + "epoch": 1.4381852005707372, + "grad_norm": 6.657838443829996, + "learning_rate": 9.650088935025159e-07, + "loss": 0.6448, + "step": 19907 + }, + { + "epoch": 1.4382574457709465, + "grad_norm": 6.6329672068603776, + "learning_rate": 9.64778029302647e-07, + "loss": 0.6065, + "step": 19908 + }, + { + "epoch": 1.438329690971156, + "grad_norm": 6.786932326827456, + "learning_rate": 9.645471861187749e-07, + "loss": 0.5673, + "step": 19909 + }, + { + "epoch": 1.4384019361713656, + "grad_norm": 6.977679224121465, + "learning_rate": 9.643163639540596e-07, + "loss": 0.718, + "step": 19910 + }, + { + "epoch": 1.4384741813715751, + "grad_norm": 8.139298753830182, + "learning_rate": 9.640855628116616e-07, + "loss": 0.6458, + "step": 19911 + }, + { + "epoch": 1.4385464265717847, + "grad_norm": 7.801458017087755, + "learning_rate": 9.638547826947388e-07, + "loss": 0.6167, + "step": 19912 + }, + { + "epoch": 1.4386186717719942, + "grad_norm": 7.526722920525458, + "learning_rate": 9.636240236064516e-07, + "loss": 0.633, + "step": 19913 + }, + { + "epoch": 1.4386909169722037, + "grad_norm": 6.12908612407336, + "learning_rate": 9.633932855499584e-07, + "loss": 0.6069, + "step": 19914 + }, + { + "epoch": 1.438763162172413, + "grad_norm": 6.311620972115982, + "learning_rate": 9.63162568528419e-07, + "loss": 0.5483, + "step": 19915 + }, + { + "epoch": 1.4388354073726226, + "grad_norm": 7.072744585583477, + "learning_rate": 9.629318725449898e-07, + "loss": 0.7714, + "step": 19916 + }, + { + "epoch": 1.4389076525728322, + "grad_norm": 6.785505655853161, + "learning_rate": 9.627011976028297e-07, + "loss": 0.6263, + "step": 19917 + }, + { + "epoch": 1.4389798977730417, + "grad_norm": 6.6199425160658025, + "learning_rate": 9.624705437050968e-07, + "loss": 0.6857, + "step": 19918 + }, + { + "epoch": 1.4390521429732512, + "grad_norm": 5.799776322885894, + "learning_rate": 9.622399108549483e-07, + "loss": 0.629, + "step": 19919 + }, + { + "epoch": 1.4391243881734608, + "grad_norm": 6.897569819971866, + "learning_rate": 9.620092990555412e-07, + "loss": 0.7035, + "step": 19920 + }, + { + "epoch": 1.4391966333736703, + "grad_norm": 6.148336678711969, + "learning_rate": 9.617787083100328e-07, + "loss": 0.5732, + "step": 19921 + }, + { + "epoch": 1.4392688785738796, + "grad_norm": 9.223364344375732, + "learning_rate": 9.615481386215805e-07, + "loss": 0.6696, + "step": 19922 + }, + { + "epoch": 1.4393411237740892, + "grad_norm": 6.90783617955679, + "learning_rate": 9.613175899933386e-07, + "loss": 0.6258, + "step": 19923 + }, + { + "epoch": 1.4394133689742987, + "grad_norm": 7.256875296896525, + "learning_rate": 9.610870624284643e-07, + "loss": 0.5589, + "step": 19924 + }, + { + "epoch": 1.4394856141745083, + "grad_norm": 6.759897111188707, + "learning_rate": 9.60856555930114e-07, + "loss": 0.597, + "step": 19925 + }, + { + "epoch": 1.4395578593747178, + "grad_norm": 8.350184784632015, + "learning_rate": 9.606260705014415e-07, + "loss": 0.723, + "step": 19926 + }, + { + "epoch": 1.4396301045749273, + "grad_norm": 5.725281034676411, + "learning_rate": 9.60395606145603e-07, + "loss": 0.656, + "step": 19927 + }, + { + "epoch": 1.439702349775137, + "grad_norm": 5.9454908707638205, + "learning_rate": 9.601651628657521e-07, + "loss": 0.6302, + "step": 19928 + }, + { + "epoch": 1.4397745949753462, + "grad_norm": 6.775448053050077, + "learning_rate": 9.599347406650464e-07, + "loss": 0.5657, + "step": 19929 + }, + { + "epoch": 1.4398468401755558, + "grad_norm": 6.138379489636419, + "learning_rate": 9.597043395466374e-07, + "loss": 0.6195, + "step": 19930 + }, + { + "epoch": 1.4399190853757653, + "grad_norm": 7.994801024059738, + "learning_rate": 9.594739595136801e-07, + "loss": 0.6602, + "step": 19931 + }, + { + "epoch": 1.4399913305759748, + "grad_norm": 7.732590101415804, + "learning_rate": 9.592436005693282e-07, + "loss": 0.6424, + "step": 19932 + }, + { + "epoch": 1.4400635757761844, + "grad_norm": 7.625406723431473, + "learning_rate": 9.590132627167359e-07, + "loss": 0.6253, + "step": 19933 + }, + { + "epoch": 1.440135820976394, + "grad_norm": 5.83954130174161, + "learning_rate": 9.587829459590546e-07, + "loss": 0.6474, + "step": 19934 + }, + { + "epoch": 1.4402080661766035, + "grad_norm": 8.125892942218623, + "learning_rate": 9.585526502994382e-07, + "loss": 0.5978, + "step": 19935 + }, + { + "epoch": 1.4402803113768128, + "grad_norm": 6.339588322315223, + "learning_rate": 9.583223757410392e-07, + "loss": 0.6066, + "step": 19936 + }, + { + "epoch": 1.4403525565770225, + "grad_norm": 8.051082126079631, + "learning_rate": 9.580921222870097e-07, + "loss": 0.5692, + "step": 19937 + }, + { + "epoch": 1.4404248017772319, + "grad_norm": 7.1680831692162155, + "learning_rate": 9.578618899405019e-07, + "loss": 0.6668, + "step": 19938 + }, + { + "epoch": 1.4404970469774414, + "grad_norm": 7.23006038656382, + "learning_rate": 9.576316787046675e-07, + "loss": 0.657, + "step": 19939 + }, + { + "epoch": 1.440569292177651, + "grad_norm": 7.920158847410378, + "learning_rate": 9.574014885826585e-07, + "loss": 0.682, + "step": 19940 + }, + { + "epoch": 1.4406415373778605, + "grad_norm": 6.642898565300915, + "learning_rate": 9.571713195776248e-07, + "loss": 0.599, + "step": 19941 + }, + { + "epoch": 1.44071378257807, + "grad_norm": 5.936487693823345, + "learning_rate": 9.569411716927174e-07, + "loss": 0.6483, + "step": 19942 + }, + { + "epoch": 1.4407860277782794, + "grad_norm": 8.117554055125881, + "learning_rate": 9.567110449310884e-07, + "loss": 0.6527, + "step": 19943 + }, + { + "epoch": 1.4408582729784891, + "grad_norm": 8.542260969623003, + "learning_rate": 9.56480939295886e-07, + "loss": 0.6233, + "step": 19944 + }, + { + "epoch": 1.4409305181786984, + "grad_norm": 7.082239433043964, + "learning_rate": 9.56250854790261e-07, + "loss": 0.6673, + "step": 19945 + }, + { + "epoch": 1.441002763378908, + "grad_norm": 6.482468952262018, + "learning_rate": 9.560207914173634e-07, + "loss": 0.639, + "step": 19946 + }, + { + "epoch": 1.4410750085791175, + "grad_norm": 6.487699754759784, + "learning_rate": 9.557907491803422e-07, + "loss": 0.6383, + "step": 19947 + }, + { + "epoch": 1.441147253779327, + "grad_norm": 7.477487983145539, + "learning_rate": 9.555607280823465e-07, + "loss": 0.6528, + "step": 19948 + }, + { + "epoch": 1.4412194989795366, + "grad_norm": 6.608617139432838, + "learning_rate": 9.553307281265254e-07, + "loss": 0.5761, + "step": 19949 + }, + { + "epoch": 1.441291744179746, + "grad_norm": 7.448181176101401, + "learning_rate": 9.551007493160282e-07, + "loss": 0.6574, + "step": 19950 + }, + { + "epoch": 1.4413639893799557, + "grad_norm": 5.3749180277406685, + "learning_rate": 9.548707916540011e-07, + "loss": 0.6154, + "step": 19951 + }, + { + "epoch": 1.441436234580165, + "grad_norm": 7.0238464711643775, + "learning_rate": 9.546408551435935e-07, + "loss": 0.6183, + "step": 19952 + }, + { + "epoch": 1.4415084797803746, + "grad_norm": 7.716621630587341, + "learning_rate": 9.544109397879525e-07, + "loss": 0.729, + "step": 19953 + }, + { + "epoch": 1.441580724980584, + "grad_norm": 6.469641877393872, + "learning_rate": 9.541810455902264e-07, + "loss": 0.5862, + "step": 19954 + }, + { + "epoch": 1.4416529701807936, + "grad_norm": 7.09755171350852, + "learning_rate": 9.539511725535608e-07, + "loss": 0.7134, + "step": 19955 + }, + { + "epoch": 1.4417252153810032, + "grad_norm": 5.711660524618592, + "learning_rate": 9.537213206811025e-07, + "loss": 0.5952, + "step": 19956 + }, + { + "epoch": 1.4417974605812125, + "grad_norm": 6.655383416300578, + "learning_rate": 9.534914899759992e-07, + "loss": 0.6912, + "step": 19957 + }, + { + "epoch": 1.4418697057814223, + "grad_norm": 6.913618618784656, + "learning_rate": 9.532616804413977e-07, + "loss": 0.6538, + "step": 19958 + }, + { + "epoch": 1.4419419509816316, + "grad_norm": 9.758600598678647, + "learning_rate": 9.530318920804421e-07, + "loss": 0.6666, + "step": 19959 + }, + { + "epoch": 1.4420141961818411, + "grad_norm": 7.251976269994038, + "learning_rate": 9.528021248962785e-07, + "loss": 0.7157, + "step": 19960 + }, + { + "epoch": 1.4420864413820507, + "grad_norm": 6.905906979426822, + "learning_rate": 9.525723788920536e-07, + "loss": 0.64, + "step": 19961 + }, + { + "epoch": 1.4421586865822602, + "grad_norm": 7.3961960681201635, + "learning_rate": 9.523426540709104e-07, + "loss": 0.6926, + "step": 19962 + }, + { + "epoch": 1.4422309317824697, + "grad_norm": 6.241758787785105, + "learning_rate": 9.521129504359944e-07, + "loss": 0.6368, + "step": 19963 + }, + { + "epoch": 1.4423031769826793, + "grad_norm": 7.202741302399185, + "learning_rate": 9.518832679904505e-07, + "loss": 0.6993, + "step": 19964 + }, + { + "epoch": 1.4423754221828888, + "grad_norm": 6.439314799374947, + "learning_rate": 9.516536067374224e-07, + "loss": 0.5786, + "step": 19965 + }, + { + "epoch": 1.4424476673830982, + "grad_norm": 5.553219715313178, + "learning_rate": 9.514239666800543e-07, + "loss": 0.55, + "step": 19966 + }, + { + "epoch": 1.4425199125833077, + "grad_norm": 6.230308028393308, + "learning_rate": 9.5119434782149e-07, + "loss": 0.6153, + "step": 19967 + }, + { + "epoch": 1.4425921577835172, + "grad_norm": 5.820103747829418, + "learning_rate": 9.509647501648731e-07, + "loss": 0.5973, + "step": 19968 + }, + { + "epoch": 1.4426644029837268, + "grad_norm": 6.442030118268443, + "learning_rate": 9.50735173713345e-07, + "loss": 0.5739, + "step": 19969 + }, + { + "epoch": 1.4427366481839363, + "grad_norm": 7.03997234599144, + "learning_rate": 9.505056184700495e-07, + "loss": 0.6452, + "step": 19970 + }, + { + "epoch": 1.4428088933841459, + "grad_norm": 6.1904049614242345, + "learning_rate": 9.502760844381293e-07, + "loss": 0.6798, + "step": 19971 + }, + { + "epoch": 1.4428811385843554, + "grad_norm": 6.105121976116589, + "learning_rate": 9.500465716207266e-07, + "loss": 0.592, + "step": 19972 + }, + { + "epoch": 1.4429533837845647, + "grad_norm": 7.436920399881909, + "learning_rate": 9.498170800209824e-07, + "loss": 0.6555, + "step": 19973 + }, + { + "epoch": 1.4430256289847743, + "grad_norm": 6.717995960594387, + "learning_rate": 9.495876096420386e-07, + "loss": 0.6283, + "step": 19974 + }, + { + "epoch": 1.4430978741849838, + "grad_norm": 7.204505506776157, + "learning_rate": 9.493581604870367e-07, + "loss": 0.523, + "step": 19975 + }, + { + "epoch": 1.4431701193851934, + "grad_norm": 6.611521703683748, + "learning_rate": 9.491287325591175e-07, + "loss": 0.6204, + "step": 19976 + }, + { + "epoch": 1.443242364585403, + "grad_norm": 6.828114385181967, + "learning_rate": 9.488993258614218e-07, + "loss": 0.5954, + "step": 19977 + }, + { + "epoch": 1.4433146097856124, + "grad_norm": 5.62866684816848, + "learning_rate": 9.486699403970897e-07, + "loss": 0.5313, + "step": 19978 + }, + { + "epoch": 1.443386854985822, + "grad_norm": 7.561384891234454, + "learning_rate": 9.484405761692628e-07, + "loss": 0.6457, + "step": 19979 + }, + { + "epoch": 1.4434591001860313, + "grad_norm": 7.350875968375566, + "learning_rate": 9.482112331810789e-07, + "loss": 0.7207, + "step": 19980 + }, + { + "epoch": 1.4435313453862408, + "grad_norm": 7.1138987474804765, + "learning_rate": 9.479819114356781e-07, + "loss": 0.6289, + "step": 19981 + }, + { + "epoch": 1.4436035905864504, + "grad_norm": 6.6457428004237995, + "learning_rate": 9.477526109362001e-07, + "loss": 0.6688, + "step": 19982 + }, + { + "epoch": 1.44367583578666, + "grad_norm": 6.484164094367565, + "learning_rate": 9.475233316857843e-07, + "loss": 0.6495, + "step": 19983 + }, + { + "epoch": 1.4437480809868695, + "grad_norm": 7.849711914607694, + "learning_rate": 9.472940736875677e-07, + "loss": 0.6492, + "step": 19984 + }, + { + "epoch": 1.443820326187079, + "grad_norm": 7.590278240024296, + "learning_rate": 9.470648369446889e-07, + "loss": 0.7549, + "step": 19985 + }, + { + "epoch": 1.4438925713872885, + "grad_norm": 6.652416862762495, + "learning_rate": 9.468356214602883e-07, + "loss": 0.6071, + "step": 19986 + }, + { + "epoch": 1.4439648165874979, + "grad_norm": 6.973869325061742, + "learning_rate": 9.466064272375014e-07, + "loss": 0.6318, + "step": 19987 + }, + { + "epoch": 1.4440370617877074, + "grad_norm": 7.527085288440215, + "learning_rate": 9.463772542794664e-07, + "loss": 0.5915, + "step": 19988 + }, + { + "epoch": 1.444109306987917, + "grad_norm": 6.752530224198255, + "learning_rate": 9.461481025893205e-07, + "loss": 0.6566, + "step": 19989 + }, + { + "epoch": 1.4441815521881265, + "grad_norm": 6.916596959999916, + "learning_rate": 9.459189721702014e-07, + "loss": 0.5704, + "step": 19990 + }, + { + "epoch": 1.444253797388336, + "grad_norm": 6.4215619458747755, + "learning_rate": 9.45689863025244e-07, + "loss": 0.6069, + "step": 19991 + }, + { + "epoch": 1.4443260425885456, + "grad_norm": 8.833347632438555, + "learning_rate": 9.454607751575856e-07, + "loss": 0.6107, + "step": 19992 + }, + { + "epoch": 1.4443982877887551, + "grad_norm": 6.616273422790729, + "learning_rate": 9.452317085703619e-07, + "loss": 0.6995, + "step": 19993 + }, + { + "epoch": 1.4444705329889644, + "grad_norm": 8.920744014211474, + "learning_rate": 9.450026632667092e-07, + "loss": 0.5902, + "step": 19994 + }, + { + "epoch": 1.444542778189174, + "grad_norm": 7.417657364531466, + "learning_rate": 9.447736392497625e-07, + "loss": 0.6495, + "step": 19995 + }, + { + "epoch": 1.4446150233893835, + "grad_norm": 6.839229118229673, + "learning_rate": 9.445446365226574e-07, + "loss": 0.6903, + "step": 19996 + }, + { + "epoch": 1.444687268589593, + "grad_norm": 6.860107938845947, + "learning_rate": 9.443156550885291e-07, + "loss": 0.6485, + "step": 19997 + }, + { + "epoch": 1.4447595137898026, + "grad_norm": 7.602977029071562, + "learning_rate": 9.440866949505106e-07, + "loss": 0.6817, + "step": 19998 + }, + { + "epoch": 1.4448317589900121, + "grad_norm": 6.709673761433061, + "learning_rate": 9.438577561117374e-07, + "loss": 0.6018, + "step": 19999 + }, + { + "epoch": 1.4449040041902217, + "grad_norm": 7.312909497915794, + "learning_rate": 9.436288385753442e-07, + "loss": 0.6884, + "step": 20000 + }, + { + "epoch": 1.444976249390431, + "grad_norm": 5.701706466885286, + "learning_rate": 9.433999423444626e-07, + "loss": 0.606, + "step": 20001 + }, + { + "epoch": 1.4450484945906406, + "grad_norm": 6.736581816946634, + "learning_rate": 9.431710674222275e-07, + "loss": 0.5683, + "step": 20002 + }, + { + "epoch": 1.44512073979085, + "grad_norm": 5.96734219071145, + "learning_rate": 9.429422138117713e-07, + "loss": 0.5678, + "step": 20003 + }, + { + "epoch": 1.4451929849910596, + "grad_norm": 6.886543154844465, + "learning_rate": 9.427133815162273e-07, + "loss": 0.6331, + "step": 20004 + }, + { + "epoch": 1.4452652301912692, + "grad_norm": 7.209901613365751, + "learning_rate": 9.424845705387281e-07, + "loss": 0.6155, + "step": 20005 + }, + { + "epoch": 1.4453374753914787, + "grad_norm": 6.932166721546897, + "learning_rate": 9.42255780882406e-07, + "loss": 0.5423, + "step": 20006 + }, + { + "epoch": 1.4454097205916883, + "grad_norm": 7.246072428792756, + "learning_rate": 9.420270125503933e-07, + "loss": 0.6697, + "step": 20007 + }, + { + "epoch": 1.4454819657918976, + "grad_norm": 6.969335420249995, + "learning_rate": 9.417982655458202e-07, + "loss": 0.5751, + "step": 20008 + }, + { + "epoch": 1.4455542109921073, + "grad_norm": 7.1007456965463485, + "learning_rate": 9.415695398718192e-07, + "loss": 0.696, + "step": 20009 + }, + { + "epoch": 1.4456264561923167, + "grad_norm": 6.619699336942988, + "learning_rate": 9.413408355315209e-07, + "loss": 0.6713, + "step": 20010 + }, + { + "epoch": 1.4456987013925262, + "grad_norm": 7.549343550482577, + "learning_rate": 9.411121525280575e-07, + "loss": 0.7539, + "step": 20011 + }, + { + "epoch": 1.4457709465927358, + "grad_norm": 6.053427603291462, + "learning_rate": 9.408834908645573e-07, + "loss": 0.5707, + "step": 20012 + }, + { + "epoch": 1.4458431917929453, + "grad_norm": 5.597775337565666, + "learning_rate": 9.406548505441507e-07, + "loss": 0.6381, + "step": 20013 + }, + { + "epoch": 1.4459154369931548, + "grad_norm": 6.442938127996528, + "learning_rate": 9.404262315699691e-07, + "loss": 0.5828, + "step": 20014 + }, + { + "epoch": 1.4459876821933642, + "grad_norm": 6.99702881017437, + "learning_rate": 9.401976339451427e-07, + "loss": 0.6882, + "step": 20015 + }, + { + "epoch": 1.446059927393574, + "grad_norm": 6.7739547655179395, + "learning_rate": 9.399690576727985e-07, + "loss": 0.6163, + "step": 20016 + }, + { + "epoch": 1.4461321725937832, + "grad_norm": 6.752738114713056, + "learning_rate": 9.397405027560666e-07, + "loss": 0.5963, + "step": 20017 + }, + { + "epoch": 1.4462044177939928, + "grad_norm": 6.846043868570419, + "learning_rate": 9.395119691980767e-07, + "loss": 0.6094, + "step": 20018 + }, + { + "epoch": 1.4462766629942023, + "grad_norm": 6.322458702064146, + "learning_rate": 9.392834570019555e-07, + "loss": 0.6722, + "step": 20019 + }, + { + "epoch": 1.4463489081944119, + "grad_norm": 7.069950730989788, + "learning_rate": 9.390549661708318e-07, + "loss": 0.6435, + "step": 20020 + }, + { + "epoch": 1.4464211533946214, + "grad_norm": 6.732222989882172, + "learning_rate": 9.388264967078337e-07, + "loss": 0.6031, + "step": 20021 + }, + { + "epoch": 1.4464933985948307, + "grad_norm": 5.430549125093044, + "learning_rate": 9.385980486160887e-07, + "loss": 0.5775, + "step": 20022 + }, + { + "epoch": 1.4465656437950405, + "grad_norm": 6.9445300525051, + "learning_rate": 9.38369621898724e-07, + "loss": 0.5867, + "step": 20023 + }, + { + "epoch": 1.4466378889952498, + "grad_norm": 7.180162216946767, + "learning_rate": 9.381412165588666e-07, + "loss": 0.5975, + "step": 20024 + }, + { + "epoch": 1.4467101341954594, + "grad_norm": 8.579601278933, + "learning_rate": 9.379128325996442e-07, + "loss": 0.6669, + "step": 20025 + }, + { + "epoch": 1.446782379395669, + "grad_norm": 6.571686917314989, + "learning_rate": 9.376844700241813e-07, + "loss": 0.5993, + "step": 20026 + }, + { + "epoch": 1.4468546245958784, + "grad_norm": 8.415417518706969, + "learning_rate": 9.374561288356051e-07, + "loss": 0.6122, + "step": 20027 + }, + { + "epoch": 1.446926869796088, + "grad_norm": 7.636542916773699, + "learning_rate": 9.372278090370413e-07, + "loss": 0.6638, + "step": 20028 + }, + { + "epoch": 1.4469991149962973, + "grad_norm": 8.050492682692285, + "learning_rate": 9.36999510631616e-07, + "loss": 0.6366, + "step": 20029 + }, + { + "epoch": 1.447071360196507, + "grad_norm": 5.880123744882156, + "learning_rate": 9.36771233622453e-07, + "loss": 0.6287, + "step": 20030 + }, + { + "epoch": 1.4471436053967164, + "grad_norm": 7.615655645246995, + "learning_rate": 9.365429780126781e-07, + "loss": 0.6725, + "step": 20031 + }, + { + "epoch": 1.447215850596926, + "grad_norm": 6.18293516757025, + "learning_rate": 9.363147438054159e-07, + "loss": 0.6614, + "step": 20032 + }, + { + "epoch": 1.4472880957971355, + "grad_norm": 7.308747200754125, + "learning_rate": 9.360865310037909e-07, + "loss": 0.5971, + "step": 20033 + }, + { + "epoch": 1.447360340997345, + "grad_norm": 6.710082527028356, + "learning_rate": 9.358583396109266e-07, + "loss": 0.6174, + "step": 20034 + }, + { + "epoch": 1.4474325861975545, + "grad_norm": 6.83514309000119, + "learning_rate": 9.356301696299475e-07, + "loss": 0.6379, + "step": 20035 + }, + { + "epoch": 1.447504831397764, + "grad_norm": 6.856895885107139, + "learning_rate": 9.354020210639775e-07, + "loss": 0.6718, + "step": 20036 + }, + { + "epoch": 1.4475770765979736, + "grad_norm": 6.559417581987945, + "learning_rate": 9.351738939161381e-07, + "loss": 0.625, + "step": 20037 + }, + { + "epoch": 1.447649321798183, + "grad_norm": 6.9493543366129185, + "learning_rate": 9.34945788189553e-07, + "loss": 0.6921, + "step": 20038 + }, + { + "epoch": 1.4477215669983925, + "grad_norm": 6.4520289342317785, + "learning_rate": 9.347177038873448e-07, + "loss": 0.5459, + "step": 20039 + }, + { + "epoch": 1.447793812198602, + "grad_norm": 6.845606164612914, + "learning_rate": 9.344896410126369e-07, + "loss": 0.6498, + "step": 20040 + }, + { + "epoch": 1.4478660573988116, + "grad_norm": 6.620627633851135, + "learning_rate": 9.342615995685487e-07, + "loss": 0.6072, + "step": 20041 + }, + { + "epoch": 1.4479383025990211, + "grad_norm": 7.031148002732422, + "learning_rate": 9.340335795582039e-07, + "loss": 0.6367, + "step": 20042 + }, + { + "epoch": 1.4480105477992307, + "grad_norm": 7.246336175611141, + "learning_rate": 9.338055809847249e-07, + "loss": 0.6059, + "step": 20043 + }, + { + "epoch": 1.4480827929994402, + "grad_norm": 5.743816533912286, + "learning_rate": 9.335776038512301e-07, + "loss": 0.6345, + "step": 20044 + }, + { + "epoch": 1.4481550381996495, + "grad_norm": 7.141029129445411, + "learning_rate": 9.33349648160842e-07, + "loss": 0.6758, + "step": 20045 + }, + { + "epoch": 1.448227283399859, + "grad_norm": 7.52224408247811, + "learning_rate": 9.331217139166807e-07, + "loss": 0.6452, + "step": 20046 + }, + { + "epoch": 1.4482995286000686, + "grad_norm": 8.093559586925235, + "learning_rate": 9.328938011218671e-07, + "loss": 0.677, + "step": 20047 + }, + { + "epoch": 1.4483717738002782, + "grad_norm": 7.232680845350039, + "learning_rate": 9.326659097795202e-07, + "loss": 0.608, + "step": 20048 + }, + { + "epoch": 1.4484440190004877, + "grad_norm": 7.198705546259553, + "learning_rate": 9.324380398927596e-07, + "loss": 0.6409, + "step": 20049 + }, + { + "epoch": 1.4485162642006972, + "grad_norm": 6.164837265156265, + "learning_rate": 9.322101914647052e-07, + "loss": 0.6218, + "step": 20050 + }, + { + "epoch": 1.4485885094009068, + "grad_norm": 7.35050413517738, + "learning_rate": 9.319823644984763e-07, + "loss": 0.6313, + "step": 20051 + }, + { + "epoch": 1.448660754601116, + "grad_norm": 6.8233600812101045, + "learning_rate": 9.317545589971911e-07, + "loss": 0.6403, + "step": 20052 + }, + { + "epoch": 1.4487329998013256, + "grad_norm": 6.308307728616747, + "learning_rate": 9.315267749639684e-07, + "loss": 0.6172, + "step": 20053 + }, + { + "epoch": 1.4488052450015352, + "grad_norm": 6.664929672293707, + "learning_rate": 9.31299012401927e-07, + "loss": 0.6259, + "step": 20054 + }, + { + "epoch": 1.4488774902017447, + "grad_norm": 6.6418002176997115, + "learning_rate": 9.310712713141834e-07, + "loss": 0.664, + "step": 20055 + }, + { + "epoch": 1.4489497354019543, + "grad_norm": 7.497841333631077, + "learning_rate": 9.308435517038559e-07, + "loss": 0.6834, + "step": 20056 + }, + { + "epoch": 1.4490219806021638, + "grad_norm": 5.635169098423069, + "learning_rate": 9.306158535740625e-07, + "loss": 0.5152, + "step": 20057 + }, + { + "epoch": 1.4490942258023733, + "grad_norm": 6.778419571943783, + "learning_rate": 9.303881769279188e-07, + "loss": 0.6591, + "step": 20058 + }, + { + "epoch": 1.4491664710025827, + "grad_norm": 7.081001556159276, + "learning_rate": 9.301605217685423e-07, + "loss": 0.7181, + "step": 20059 + }, + { + "epoch": 1.4492387162027922, + "grad_norm": 7.826476755432186, + "learning_rate": 9.299328880990491e-07, + "loss": 0.5988, + "step": 20060 + }, + { + "epoch": 1.4493109614030018, + "grad_norm": 7.973321059907549, + "learning_rate": 9.297052759225558e-07, + "loss": 0.6942, + "step": 20061 + }, + { + "epoch": 1.4493832066032113, + "grad_norm": 6.865129355526758, + "learning_rate": 9.29477685242178e-07, + "loss": 0.6623, + "step": 20062 + }, + { + "epoch": 1.4494554518034208, + "grad_norm": 5.876969352337184, + "learning_rate": 9.292501160610312e-07, + "loss": 0.5926, + "step": 20063 + }, + { + "epoch": 1.4495276970036304, + "grad_norm": 7.053711140302676, + "learning_rate": 9.290225683822308e-07, + "loss": 0.5574, + "step": 20064 + }, + { + "epoch": 1.44959994220384, + "grad_norm": 7.915226480758413, + "learning_rate": 9.287950422088923e-07, + "loss": 0.6699, + "step": 20065 + }, + { + "epoch": 1.4496721874040492, + "grad_norm": 6.848181556968757, + "learning_rate": 9.285675375441292e-07, + "loss": 0.6034, + "step": 20066 + }, + { + "epoch": 1.4497444326042588, + "grad_norm": 7.368521479589105, + "learning_rate": 9.283400543910559e-07, + "loss": 0.6442, + "step": 20067 + }, + { + "epoch": 1.4498166778044683, + "grad_norm": 7.695847360060845, + "learning_rate": 9.281125927527881e-07, + "loss": 0.6175, + "step": 20068 + }, + { + "epoch": 1.4498889230046779, + "grad_norm": 7.670132751907542, + "learning_rate": 9.278851526324367e-07, + "loss": 0.5921, + "step": 20069 + }, + { + "epoch": 1.4499611682048874, + "grad_norm": 7.75450274193376, + "learning_rate": 9.276577340331177e-07, + "loss": 0.631, + "step": 20070 + }, + { + "epoch": 1.450033413405097, + "grad_norm": 7.592567383156599, + "learning_rate": 9.274303369579435e-07, + "loss": 0.6345, + "step": 20071 + }, + { + "epoch": 1.4501056586053065, + "grad_norm": 7.118039931200035, + "learning_rate": 9.272029614100278e-07, + "loss": 0.6172, + "step": 20072 + }, + { + "epoch": 1.4501779038055158, + "grad_norm": 6.22433236460022, + "learning_rate": 9.269756073924815e-07, + "loss": 0.5791, + "step": 20073 + }, + { + "epoch": 1.4502501490057254, + "grad_norm": 7.3310931280837455, + "learning_rate": 9.267482749084178e-07, + "loss": 0.6214, + "step": 20074 + }, + { + "epoch": 1.450322394205935, + "grad_norm": 6.289319676418706, + "learning_rate": 9.265209639609496e-07, + "loss": 0.618, + "step": 20075 + }, + { + "epoch": 1.4503946394061444, + "grad_norm": 6.870625439288767, + "learning_rate": 9.262936745531867e-07, + "loss": 0.6437, + "step": 20076 + }, + { + "epoch": 1.450466884606354, + "grad_norm": 7.396113287413458, + "learning_rate": 9.260664066882413e-07, + "loss": 0.5338, + "step": 20077 + }, + { + "epoch": 1.4505391298065635, + "grad_norm": 5.625706946135644, + "learning_rate": 9.258391603692249e-07, + "loss": 0.5385, + "step": 20078 + }, + { + "epoch": 1.450611375006773, + "grad_norm": 6.406102025253655, + "learning_rate": 9.256119355992482e-07, + "loss": 0.5861, + "step": 20079 + }, + { + "epoch": 1.4506836202069824, + "grad_norm": 6.625647171367724, + "learning_rate": 9.253847323814216e-07, + "loss": 0.5546, + "step": 20080 + }, + { + "epoch": 1.4507558654071921, + "grad_norm": 7.984164778299058, + "learning_rate": 9.251575507188554e-07, + "loss": 0.6614, + "step": 20081 + }, + { + "epoch": 1.4508281106074015, + "grad_norm": 8.31810233262465, + "learning_rate": 9.249303906146606e-07, + "loss": 0.6662, + "step": 20082 + }, + { + "epoch": 1.450900355807611, + "grad_norm": 6.920659976864985, + "learning_rate": 9.247032520719446e-07, + "loss": 0.6331, + "step": 20083 + }, + { + "epoch": 1.4509726010078206, + "grad_norm": 6.735235668157071, + "learning_rate": 9.24476135093818e-07, + "loss": 0.656, + "step": 20084 + }, + { + "epoch": 1.45104484620803, + "grad_norm": 6.808796567188608, + "learning_rate": 9.2424903968339e-07, + "loss": 0.6318, + "step": 20085 + }, + { + "epoch": 1.4511170914082396, + "grad_norm": 7.369195496582086, + "learning_rate": 9.240219658437699e-07, + "loss": 0.6345, + "step": 20086 + }, + { + "epoch": 1.451189336608449, + "grad_norm": 6.925747375921814, + "learning_rate": 9.237949135780646e-07, + "loss": 0.5804, + "step": 20087 + }, + { + "epoch": 1.4512615818086587, + "grad_norm": 6.526960652750162, + "learning_rate": 9.235678828893829e-07, + "loss": 0.6219, + "step": 20088 + }, + { + "epoch": 1.451333827008868, + "grad_norm": 6.473214935152369, + "learning_rate": 9.23340873780833e-07, + "loss": 0.6433, + "step": 20089 + }, + { + "epoch": 1.4514060722090776, + "grad_norm": 7.097870424069797, + "learning_rate": 9.231138862555225e-07, + "loss": 0.6152, + "step": 20090 + }, + { + "epoch": 1.4514783174092871, + "grad_norm": 7.946848252971848, + "learning_rate": 9.228869203165583e-07, + "loss": 0.592, + "step": 20091 + }, + { + "epoch": 1.4515505626094967, + "grad_norm": 7.470767718266508, + "learning_rate": 9.226599759670479e-07, + "loss": 0.6393, + "step": 20092 + }, + { + "epoch": 1.4516228078097062, + "grad_norm": 7.038695331104432, + "learning_rate": 9.224330532100984e-07, + "loss": 0.6182, + "step": 20093 + }, + { + "epoch": 1.4516950530099155, + "grad_norm": 6.65636891719992, + "learning_rate": 9.222061520488146e-07, + "loss": 0.6453, + "step": 20094 + }, + { + "epoch": 1.4517672982101253, + "grad_norm": 6.629889645125548, + "learning_rate": 9.219792724863033e-07, + "loss": 0.5422, + "step": 20095 + }, + { + "epoch": 1.4518395434103346, + "grad_norm": 5.609547561923983, + "learning_rate": 9.217524145256706e-07, + "loss": 0.5163, + "step": 20096 + }, + { + "epoch": 1.4519117886105442, + "grad_norm": 6.452938788362063, + "learning_rate": 9.21525578170023e-07, + "loss": 0.6047, + "step": 20097 + }, + { + "epoch": 1.4519840338107537, + "grad_norm": 8.00903191934932, + "learning_rate": 9.212987634224629e-07, + "loss": 0.6351, + "step": 20098 + }, + { + "epoch": 1.4520562790109632, + "grad_norm": 8.34721479734689, + "learning_rate": 9.210719702860976e-07, + "loss": 0.6354, + "step": 20099 + }, + { + "epoch": 1.4521285242111728, + "grad_norm": 7.399756824518744, + "learning_rate": 9.208451987640321e-07, + "loss": 0.6313, + "step": 20100 + }, + { + "epoch": 1.452200769411382, + "grad_norm": 6.389589258404118, + "learning_rate": 9.206184488593686e-07, + "loss": 0.5998, + "step": 20101 + }, + { + "epoch": 1.4522730146115919, + "grad_norm": 5.559275517637695, + "learning_rate": 9.203917205752125e-07, + "loss": 0.5755, + "step": 20102 + }, + { + "epoch": 1.4523452598118012, + "grad_norm": 5.842785434369474, + "learning_rate": 9.20165013914667e-07, + "loss": 0.6442, + "step": 20103 + }, + { + "epoch": 1.4524175050120107, + "grad_norm": 6.611485354020364, + "learning_rate": 9.19938328880837e-07, + "loss": 0.6177, + "step": 20104 + }, + { + "epoch": 1.4524897502122203, + "grad_norm": 7.659033444530932, + "learning_rate": 9.197116654768231e-07, + "loss": 0.5936, + "step": 20105 + }, + { + "epoch": 1.4525619954124298, + "grad_norm": 6.867719089386421, + "learning_rate": 9.194850237057299e-07, + "loss": 0.6631, + "step": 20106 + }, + { + "epoch": 1.4526342406126393, + "grad_norm": 7.155473025591719, + "learning_rate": 9.192584035706595e-07, + "loss": 0.6533, + "step": 20107 + }, + { + "epoch": 1.452706485812849, + "grad_norm": 6.752919731215545, + "learning_rate": 9.190318050747141e-07, + "loss": 0.5925, + "step": 20108 + }, + { + "epoch": 1.4527787310130584, + "grad_norm": 8.339892692332972, + "learning_rate": 9.188052282209956e-07, + "loss": 0.5644, + "step": 20109 + }, + { + "epoch": 1.4528509762132678, + "grad_norm": 7.625140079399771, + "learning_rate": 9.185786730126059e-07, + "loss": 0.6089, + "step": 20110 + }, + { + "epoch": 1.4529232214134773, + "grad_norm": 6.851264625259488, + "learning_rate": 9.183521394526473e-07, + "loss": 0.6068, + "step": 20111 + }, + { + "epoch": 1.4529954666136868, + "grad_norm": 5.415715217169064, + "learning_rate": 9.181256275442188e-07, + "loss": 0.6067, + "step": 20112 + }, + { + "epoch": 1.4530677118138964, + "grad_norm": 6.979318584857374, + "learning_rate": 9.178991372904223e-07, + "loss": 0.5491, + "step": 20113 + }, + { + "epoch": 1.453139957014106, + "grad_norm": 6.274663859935111, + "learning_rate": 9.17672668694359e-07, + "loss": 0.5344, + "step": 20114 + }, + { + "epoch": 1.4532122022143155, + "grad_norm": 6.487324312804313, + "learning_rate": 9.174462217591274e-07, + "loss": 0.5982, + "step": 20115 + }, + { + "epoch": 1.453284447414525, + "grad_norm": 6.434173020559526, + "learning_rate": 9.172197964878282e-07, + "loss": 0.6445, + "step": 20116 + }, + { + "epoch": 1.4533566926147343, + "grad_norm": 6.788870612309077, + "learning_rate": 9.169933928835612e-07, + "loss": 0.5934, + "step": 20117 + }, + { + "epoch": 1.4534289378149439, + "grad_norm": 7.995256925251449, + "learning_rate": 9.167670109494253e-07, + "loss": 0.5725, + "step": 20118 + }, + { + "epoch": 1.4535011830151534, + "grad_norm": 7.492287294043414, + "learning_rate": 9.165406506885199e-07, + "loss": 0.6321, + "step": 20119 + }, + { + "epoch": 1.453573428215363, + "grad_norm": 8.80522212357575, + "learning_rate": 9.163143121039436e-07, + "loss": 0.6345, + "step": 20120 + }, + { + "epoch": 1.4536456734155725, + "grad_norm": 7.12224030170867, + "learning_rate": 9.160879951987945e-07, + "loss": 0.6027, + "step": 20121 + }, + { + "epoch": 1.453717918615782, + "grad_norm": 6.911110112938736, + "learning_rate": 9.158616999761719e-07, + "loss": 0.6575, + "step": 20122 + }, + { + "epoch": 1.4537901638159916, + "grad_norm": 7.653994356422945, + "learning_rate": 9.156354264391717e-07, + "loss": 0.6884, + "step": 20123 + }, + { + "epoch": 1.453862409016201, + "grad_norm": 7.788945448750638, + "learning_rate": 9.154091745908925e-07, + "loss": 0.6254, + "step": 20124 + }, + { + "epoch": 1.4539346542164104, + "grad_norm": 6.747045364808596, + "learning_rate": 9.151829444344321e-07, + "loss": 0.6424, + "step": 20125 + }, + { + "epoch": 1.45400689941662, + "grad_norm": 5.934782671271511, + "learning_rate": 9.149567359728848e-07, + "loss": 0.6354, + "step": 20126 + }, + { + "epoch": 1.4540791446168295, + "grad_norm": 6.707717993652763, + "learning_rate": 9.1473054920935e-07, + "loss": 0.6119, + "step": 20127 + }, + { + "epoch": 1.454151389817039, + "grad_norm": 6.940830015738251, + "learning_rate": 9.145043841469231e-07, + "loss": 0.6772, + "step": 20128 + }, + { + "epoch": 1.4542236350172486, + "grad_norm": 6.9838511976752145, + "learning_rate": 9.14278240788701e-07, + "loss": 0.6766, + "step": 20129 + }, + { + "epoch": 1.4542958802174581, + "grad_norm": 6.450429727596007, + "learning_rate": 9.140521191377777e-07, + "loss": 0.5429, + "step": 20130 + }, + { + "epoch": 1.4543681254176675, + "grad_norm": 6.445647481825309, + "learning_rate": 9.138260191972495e-07, + "loss": 0.5734, + "step": 20131 + }, + { + "epoch": 1.454440370617877, + "grad_norm": 7.639088856555447, + "learning_rate": 9.135999409702123e-07, + "loss": 0.6233, + "step": 20132 + }, + { + "epoch": 1.4545126158180866, + "grad_norm": 7.2981481319582615, + "learning_rate": 9.133738844597595e-07, + "loss": 0.6168, + "step": 20133 + }, + { + "epoch": 1.454584861018296, + "grad_norm": 7.213680452372791, + "learning_rate": 9.13147849668986e-07, + "loss": 0.5943, + "step": 20134 + }, + { + "epoch": 1.4546571062185056, + "grad_norm": 6.556672915190819, + "learning_rate": 9.129218366009865e-07, + "loss": 0.616, + "step": 20135 + }, + { + "epoch": 1.4547293514187152, + "grad_norm": 6.884195455113763, + "learning_rate": 9.126958452588547e-07, + "loss": 0.6107, + "step": 20136 + }, + { + "epoch": 1.4548015966189247, + "grad_norm": 6.831327407551911, + "learning_rate": 9.124698756456843e-07, + "loss": 0.6229, + "step": 20137 + }, + { + "epoch": 1.454873841819134, + "grad_norm": 6.566063195139715, + "learning_rate": 9.122439277645689e-07, + "loss": 0.6381, + "step": 20138 + }, + { + "epoch": 1.4549460870193436, + "grad_norm": 5.710893748229296, + "learning_rate": 9.12018001618602e-07, + "loss": 0.5697, + "step": 20139 + }, + { + "epoch": 1.4550183322195531, + "grad_norm": 6.497969970614951, + "learning_rate": 9.117920972108749e-07, + "loss": 0.5969, + "step": 20140 + }, + { + "epoch": 1.4550905774197627, + "grad_norm": 7.562400628059674, + "learning_rate": 9.115662145444806e-07, + "loss": 0.6196, + "step": 20141 + }, + { + "epoch": 1.4551628226199722, + "grad_norm": 6.2822009832809345, + "learning_rate": 9.113403536225115e-07, + "loss": 0.6704, + "step": 20142 + }, + { + "epoch": 1.4552350678201817, + "grad_norm": 6.130854708404564, + "learning_rate": 9.111145144480604e-07, + "loss": 0.5794, + "step": 20143 + }, + { + "epoch": 1.4553073130203913, + "grad_norm": 6.9587538662531925, + "learning_rate": 9.10888697024217e-07, + "loss": 0.611, + "step": 20144 + }, + { + "epoch": 1.4553795582206006, + "grad_norm": 7.625011006331708, + "learning_rate": 9.106629013540736e-07, + "loss": 0.6721, + "step": 20145 + }, + { + "epoch": 1.4554518034208102, + "grad_norm": 7.169709853256463, + "learning_rate": 9.104371274407203e-07, + "loss": 0.6617, + "step": 20146 + }, + { + "epoch": 1.4555240486210197, + "grad_norm": 6.073066548176793, + "learning_rate": 9.102113752872499e-07, + "loss": 0.5296, + "step": 20147 + }, + { + "epoch": 1.4555962938212292, + "grad_norm": 6.4496253917643775, + "learning_rate": 9.099856448967506e-07, + "loss": 0.6028, + "step": 20148 + }, + { + "epoch": 1.4556685390214388, + "grad_norm": 6.489126647588064, + "learning_rate": 9.097599362723134e-07, + "loss": 0.5781, + "step": 20149 + }, + { + "epoch": 1.4557407842216483, + "grad_norm": 6.512810627563788, + "learning_rate": 9.095342494170287e-07, + "loss": 0.6174, + "step": 20150 + }, + { + "epoch": 1.4558130294218579, + "grad_norm": 6.605745080799725, + "learning_rate": 9.093085843339844e-07, + "loss": 0.6478, + "step": 20151 + }, + { + "epoch": 1.4558852746220672, + "grad_norm": 8.023327196174602, + "learning_rate": 9.090829410262706e-07, + "loss": 0.6671, + "step": 20152 + }, + { + "epoch": 1.4559575198222767, + "grad_norm": 6.334806705889025, + "learning_rate": 9.088573194969758e-07, + "loss": 0.5955, + "step": 20153 + }, + { + "epoch": 1.4560297650224863, + "grad_norm": 6.069212667819984, + "learning_rate": 9.086317197491889e-07, + "loss": 0.5385, + "step": 20154 + }, + { + "epoch": 1.4561020102226958, + "grad_norm": 6.561225694730178, + "learning_rate": 9.084061417859982e-07, + "loss": 0.6212, + "step": 20155 + }, + { + "epoch": 1.4561742554229054, + "grad_norm": 7.414305350343011, + "learning_rate": 9.081805856104916e-07, + "loss": 0.6252, + "step": 20156 + }, + { + "epoch": 1.456246500623115, + "grad_norm": 7.0481131876279655, + "learning_rate": 9.079550512257579e-07, + "loss": 0.6712, + "step": 20157 + }, + { + "epoch": 1.4563187458233244, + "grad_norm": 6.324115302292444, + "learning_rate": 9.077295386348822e-07, + "loss": 0.5941, + "step": 20158 + }, + { + "epoch": 1.4563909910235338, + "grad_norm": 6.1531664876473995, + "learning_rate": 9.07504047840953e-07, + "loss": 0.6882, + "step": 20159 + }, + { + "epoch": 1.4564632362237435, + "grad_norm": 6.89943002751603, + "learning_rate": 9.072785788470568e-07, + "loss": 0.6989, + "step": 20160 + }, + { + "epoch": 1.4565354814239528, + "grad_norm": 6.45053854180006, + "learning_rate": 9.070531316562811e-07, + "loss": 0.6759, + "step": 20161 + }, + { + "epoch": 1.4566077266241624, + "grad_norm": 6.37986928441184, + "learning_rate": 9.068277062717105e-07, + "loss": 0.6201, + "step": 20162 + }, + { + "epoch": 1.456679971824372, + "grad_norm": 6.783890030095542, + "learning_rate": 9.066023026964312e-07, + "loss": 0.6045, + "step": 20163 + }, + { + "epoch": 1.4567522170245815, + "grad_norm": 7.526167678218065, + "learning_rate": 9.063769209335293e-07, + "loss": 0.6528, + "step": 20164 + }, + { + "epoch": 1.456824462224791, + "grad_norm": 6.1863149413613705, + "learning_rate": 9.061515609860902e-07, + "loss": 0.6024, + "step": 20165 + }, + { + "epoch": 1.4568967074250003, + "grad_norm": 5.733483518067627, + "learning_rate": 9.059262228571985e-07, + "loss": 0.544, + "step": 20166 + }, + { + "epoch": 1.45696895262521, + "grad_norm": 7.843521175618323, + "learning_rate": 9.057009065499392e-07, + "loss": 0.6414, + "step": 20167 + }, + { + "epoch": 1.4570411978254194, + "grad_norm": 9.075539514567096, + "learning_rate": 9.054756120673975e-07, + "loss": 0.6507, + "step": 20168 + }, + { + "epoch": 1.457113443025629, + "grad_norm": 6.5544343337953395, + "learning_rate": 9.052503394126555e-07, + "loss": 0.6273, + "step": 20169 + }, + { + "epoch": 1.4571856882258385, + "grad_norm": 6.159412172186951, + "learning_rate": 9.050250885887982e-07, + "loss": 0.5897, + "step": 20170 + }, + { + "epoch": 1.457257933426048, + "grad_norm": 6.627121207896411, + "learning_rate": 9.047998595989091e-07, + "loss": 0.5607, + "step": 20171 + }, + { + "epoch": 1.4573301786262576, + "grad_norm": 7.069883824625492, + "learning_rate": 9.045746524460722e-07, + "loss": 0.6926, + "step": 20172 + }, + { + "epoch": 1.457402423826467, + "grad_norm": 5.540452790744168, + "learning_rate": 9.043494671333686e-07, + "loss": 0.6116, + "step": 20173 + }, + { + "epoch": 1.4574746690266767, + "grad_norm": 7.333002025171674, + "learning_rate": 9.041243036638819e-07, + "loss": 0.6854, + "step": 20174 + }, + { + "epoch": 1.457546914226886, + "grad_norm": 6.591583040513141, + "learning_rate": 9.038991620406945e-07, + "loss": 0.6265, + "step": 20175 + }, + { + "epoch": 1.4576191594270955, + "grad_norm": 6.397998985755509, + "learning_rate": 9.03674042266888e-07, + "loss": 0.6444, + "step": 20176 + }, + { + "epoch": 1.457691404627305, + "grad_norm": 5.922536526169222, + "learning_rate": 9.034489443455446e-07, + "loss": 0.598, + "step": 20177 + }, + { + "epoch": 1.4577636498275146, + "grad_norm": 7.217017556250404, + "learning_rate": 9.032238682797453e-07, + "loss": 0.5855, + "step": 20178 + }, + { + "epoch": 1.4578358950277241, + "grad_norm": 7.905490280862443, + "learning_rate": 9.029988140725726e-07, + "loss": 0.6277, + "step": 20179 + }, + { + "epoch": 1.4579081402279335, + "grad_norm": 6.2819292450873085, + "learning_rate": 9.027737817271051e-07, + "loss": 0.646, + "step": 20180 + }, + { + "epoch": 1.4579803854281432, + "grad_norm": 7.449395164157126, + "learning_rate": 9.025487712464243e-07, + "loss": 0.6956, + "step": 20181 + }, + { + "epoch": 1.4580526306283526, + "grad_norm": 6.8285247196896135, + "learning_rate": 9.023237826336106e-07, + "loss": 0.6961, + "step": 20182 + }, + { + "epoch": 1.458124875828562, + "grad_norm": 7.014697041131386, + "learning_rate": 9.020988158917437e-07, + "loss": 0.6249, + "step": 20183 + }, + { + "epoch": 1.4581971210287716, + "grad_norm": 6.835137229942676, + "learning_rate": 9.018738710239036e-07, + "loss": 0.5941, + "step": 20184 + }, + { + "epoch": 1.4582693662289812, + "grad_norm": 6.539546971393695, + "learning_rate": 9.016489480331688e-07, + "loss": 0.574, + "step": 20185 + }, + { + "epoch": 1.4583416114291907, + "grad_norm": 7.733120903298312, + "learning_rate": 9.014240469226201e-07, + "loss": 0.6892, + "step": 20186 + }, + { + "epoch": 1.4584138566294003, + "grad_norm": 6.3297379310197845, + "learning_rate": 9.011991676953341e-07, + "loss": 0.5655, + "step": 20187 + }, + { + "epoch": 1.4584861018296098, + "grad_norm": 6.62852010237267, + "learning_rate": 9.009743103543902e-07, + "loss": 0.59, + "step": 20188 + }, + { + "epoch": 1.4585583470298191, + "grad_norm": 7.134534646719724, + "learning_rate": 9.007494749028673e-07, + "loss": 0.6638, + "step": 20189 + }, + { + "epoch": 1.4586305922300287, + "grad_norm": 6.81706434860401, + "learning_rate": 9.005246613438412e-07, + "loss": 0.6175, + "step": 20190 + }, + { + "epoch": 1.4587028374302382, + "grad_norm": 6.607824265385452, + "learning_rate": 9.002998696803908e-07, + "loss": 0.621, + "step": 20191 + }, + { + "epoch": 1.4587750826304478, + "grad_norm": 6.559525460548051, + "learning_rate": 9.00075099915593e-07, + "loss": 0.5905, + "step": 20192 + }, + { + "epoch": 1.4588473278306573, + "grad_norm": 6.275744098452216, + "learning_rate": 8.998503520525248e-07, + "loss": 0.6495, + "step": 20193 + }, + { + "epoch": 1.4589195730308668, + "grad_norm": 7.985932618141361, + "learning_rate": 8.996256260942629e-07, + "loss": 0.6587, + "step": 20194 + }, + { + "epoch": 1.4589918182310764, + "grad_norm": 7.623036866666502, + "learning_rate": 8.994009220438835e-07, + "loss": 0.5956, + "step": 20195 + }, + { + "epoch": 1.4590640634312857, + "grad_norm": 6.809330194053626, + "learning_rate": 8.991762399044626e-07, + "loss": 0.6484, + "step": 20196 + }, + { + "epoch": 1.4591363086314952, + "grad_norm": 7.966322106394646, + "learning_rate": 8.989515796790771e-07, + "loss": 0.5415, + "step": 20197 + }, + { + "epoch": 1.4592085538317048, + "grad_norm": 7.073109177978488, + "learning_rate": 8.987269413708005e-07, + "loss": 0.6362, + "step": 20198 + }, + { + "epoch": 1.4592807990319143, + "grad_norm": 8.347022854128516, + "learning_rate": 8.985023249827085e-07, + "loss": 0.666, + "step": 20199 + }, + { + "epoch": 1.4593530442321239, + "grad_norm": 7.011861288592976, + "learning_rate": 8.982777305178775e-07, + "loss": 0.6182, + "step": 20200 + }, + { + "epoch": 1.4594252894323334, + "grad_norm": 7.752589623733701, + "learning_rate": 8.980531579793795e-07, + "loss": 0.5983, + "step": 20201 + }, + { + "epoch": 1.459497534632543, + "grad_norm": 7.5613432701035235, + "learning_rate": 8.978286073702899e-07, + "loss": 0.5482, + "step": 20202 + }, + { + "epoch": 1.4595697798327523, + "grad_norm": 5.468818533331738, + "learning_rate": 8.976040786936818e-07, + "loss": 0.6184, + "step": 20203 + }, + { + "epoch": 1.4596420250329618, + "grad_norm": 7.29381031315439, + "learning_rate": 8.973795719526316e-07, + "loss": 0.5884, + "step": 20204 + }, + { + "epoch": 1.4597142702331714, + "grad_norm": 7.394658155606739, + "learning_rate": 8.971550871502096e-07, + "loss": 0.6577, + "step": 20205 + }, + { + "epoch": 1.459786515433381, + "grad_norm": 6.1829857590591715, + "learning_rate": 8.969306242894904e-07, + "loss": 0.5984, + "step": 20206 + }, + { + "epoch": 1.4598587606335904, + "grad_norm": 7.25969862887771, + "learning_rate": 8.967061833735466e-07, + "loss": 0.6573, + "step": 20207 + }, + { + "epoch": 1.4599310058338, + "grad_norm": 6.021281330200564, + "learning_rate": 8.964817644054496e-07, + "loss": 0.6282, + "step": 20208 + }, + { + "epoch": 1.4600032510340095, + "grad_norm": 8.063342146546036, + "learning_rate": 8.962573673882721e-07, + "loss": 0.5467, + "step": 20209 + }, + { + "epoch": 1.4600754962342188, + "grad_norm": 6.524884640196414, + "learning_rate": 8.960329923250863e-07, + "loss": 0.5848, + "step": 20210 + }, + { + "epoch": 1.4601477414344284, + "grad_norm": 7.648333508853185, + "learning_rate": 8.958086392189633e-07, + "loss": 0.5886, + "step": 20211 + }, + { + "epoch": 1.460219986634638, + "grad_norm": 8.38079223748161, + "learning_rate": 8.955843080729742e-07, + "loss": 0.6479, + "step": 20212 + }, + { + "epoch": 1.4602922318348475, + "grad_norm": 6.556558589745956, + "learning_rate": 8.953599988901904e-07, + "loss": 0.6074, + "step": 20213 + }, + { + "epoch": 1.460364477035057, + "grad_norm": 7.116743959577416, + "learning_rate": 8.951357116736834e-07, + "loss": 0.5975, + "step": 20214 + }, + { + "epoch": 1.4604367222352665, + "grad_norm": 7.835266875667953, + "learning_rate": 8.94911446426521e-07, + "loss": 0.6489, + "step": 20215 + }, + { + "epoch": 1.460508967435476, + "grad_norm": 7.550329077414182, + "learning_rate": 8.94687203151775e-07, + "loss": 0.6839, + "step": 20216 + }, + { + "epoch": 1.4605812126356854, + "grad_norm": 7.328554445872206, + "learning_rate": 8.944629818525147e-07, + "loss": 0.617, + "step": 20217 + }, + { + "epoch": 1.460653457835895, + "grad_norm": 6.429987699932953, + "learning_rate": 8.942387825318102e-07, + "loss": 0.6224, + "step": 20218 + }, + { + "epoch": 1.4607257030361045, + "grad_norm": 6.4376522898021245, + "learning_rate": 8.940146051927295e-07, + "loss": 0.5735, + "step": 20219 + }, + { + "epoch": 1.460797948236314, + "grad_norm": 8.617261870712616, + "learning_rate": 8.937904498383415e-07, + "loss": 0.5431, + "step": 20220 + }, + { + "epoch": 1.4608701934365236, + "grad_norm": 7.841373159474242, + "learning_rate": 8.935663164717154e-07, + "loss": 0.597, + "step": 20221 + }, + { + "epoch": 1.4609424386367331, + "grad_norm": 7.14785901006269, + "learning_rate": 8.933422050959189e-07, + "loss": 0.6214, + "step": 20222 + }, + { + "epoch": 1.4610146838369427, + "grad_norm": 7.164747232476065, + "learning_rate": 8.931181157140203e-07, + "loss": 0.6163, + "step": 20223 + }, + { + "epoch": 1.461086929037152, + "grad_norm": 8.258495695928403, + "learning_rate": 8.928940483290869e-07, + "loss": 0.6275, + "step": 20224 + }, + { + "epoch": 1.4611591742373615, + "grad_norm": 6.552344611638491, + "learning_rate": 8.926700029441871e-07, + "loss": 0.6152, + "step": 20225 + }, + { + "epoch": 1.461231419437571, + "grad_norm": 6.518765870291683, + "learning_rate": 8.924459795623861e-07, + "loss": 0.5773, + "step": 20226 + }, + { + "epoch": 1.4613036646377806, + "grad_norm": 8.531249552855986, + "learning_rate": 8.922219781867511e-07, + "loss": 0.6084, + "step": 20227 + }, + { + "epoch": 1.4613759098379902, + "grad_norm": 6.4077602188072325, + "learning_rate": 8.919979988203492e-07, + "loss": 0.6004, + "step": 20228 + }, + { + "epoch": 1.4614481550381997, + "grad_norm": 6.719740683683934, + "learning_rate": 8.917740414662471e-07, + "loss": 0.6446, + "step": 20229 + }, + { + "epoch": 1.4615204002384092, + "grad_norm": 7.0694394747606895, + "learning_rate": 8.915501061275087e-07, + "loss": 0.6761, + "step": 20230 + }, + { + "epoch": 1.4615926454386186, + "grad_norm": 6.706143924268835, + "learning_rate": 8.913261928071995e-07, + "loss": 0.5974, + "step": 20231 + }, + { + "epoch": 1.4616648906388283, + "grad_norm": 6.383056635956026, + "learning_rate": 8.911023015083875e-07, + "loss": 0.5993, + "step": 20232 + }, + { + "epoch": 1.4617371358390376, + "grad_norm": 7.247284479321807, + "learning_rate": 8.908784322341349e-07, + "loss": 0.5908, + "step": 20233 + }, + { + "epoch": 1.4618093810392472, + "grad_norm": 7.102027668002309, + "learning_rate": 8.90654584987507e-07, + "loss": 0.5743, + "step": 20234 + }, + { + "epoch": 1.4618816262394567, + "grad_norm": 7.138697791933119, + "learning_rate": 8.904307597715683e-07, + "loss": 0.6131, + "step": 20235 + }, + { + "epoch": 1.4619538714396663, + "grad_norm": 6.274681490531387, + "learning_rate": 8.902069565893839e-07, + "loss": 0.5348, + "step": 20236 + }, + { + "epoch": 1.4620261166398758, + "grad_norm": 7.952812503453599, + "learning_rate": 8.899831754440152e-07, + "loss": 0.6443, + "step": 20237 + }, + { + "epoch": 1.4620983618400851, + "grad_norm": 6.9709291343999595, + "learning_rate": 8.897594163385268e-07, + "loss": 0.641, + "step": 20238 + }, + { + "epoch": 1.462170607040295, + "grad_norm": 5.724827273232758, + "learning_rate": 8.895356792759818e-07, + "loss": 0.6373, + "step": 20239 + }, + { + "epoch": 1.4622428522405042, + "grad_norm": 6.331204039697754, + "learning_rate": 8.893119642594428e-07, + "loss": 0.5562, + "step": 20240 + }, + { + "epoch": 1.4623150974407138, + "grad_norm": 6.654277097215251, + "learning_rate": 8.890882712919724e-07, + "loss": 0.5466, + "step": 20241 + }, + { + "epoch": 1.4623873426409233, + "grad_norm": 7.283942907504503, + "learning_rate": 8.888646003766327e-07, + "loss": 0.6416, + "step": 20242 + }, + { + "epoch": 1.4624595878411328, + "grad_norm": 6.848267061765887, + "learning_rate": 8.886409515164868e-07, + "loss": 0.5698, + "step": 20243 + }, + { + "epoch": 1.4625318330413424, + "grad_norm": 6.716468885111281, + "learning_rate": 8.884173247145941e-07, + "loss": 0.5766, + "step": 20244 + }, + { + "epoch": 1.4626040782415517, + "grad_norm": 7.119903079884119, + "learning_rate": 8.881937199740167e-07, + "loss": 0.5941, + "step": 20245 + }, + { + "epoch": 1.4626763234417615, + "grad_norm": 8.341806706927594, + "learning_rate": 8.87970137297817e-07, + "loss": 0.726, + "step": 20246 + }, + { + "epoch": 1.4627485686419708, + "grad_norm": 8.795177126473074, + "learning_rate": 8.877465766890533e-07, + "loss": 0.6809, + "step": 20247 + }, + { + "epoch": 1.4628208138421803, + "grad_norm": 6.844369520600234, + "learning_rate": 8.875230381507874e-07, + "loss": 0.6218, + "step": 20248 + }, + { + "epoch": 1.4628930590423899, + "grad_norm": 7.294412266754794, + "learning_rate": 8.872995216860788e-07, + "loss": 0.6284, + "step": 20249 + }, + { + "epoch": 1.4629653042425994, + "grad_norm": 6.6083423718666845, + "learning_rate": 8.870760272979878e-07, + "loss": 0.6744, + "step": 20250 + }, + { + "epoch": 1.463037549442809, + "grad_norm": 5.64133100238903, + "learning_rate": 8.868525549895737e-07, + "loss": 0.6208, + "step": 20251 + }, + { + "epoch": 1.4631097946430183, + "grad_norm": 6.014140314960015, + "learning_rate": 8.866291047638953e-07, + "loss": 0.5714, + "step": 20252 + }, + { + "epoch": 1.463182039843228, + "grad_norm": 9.004135029631438, + "learning_rate": 8.86405676624012e-07, + "loss": 0.6548, + "step": 20253 + }, + { + "epoch": 1.4632542850434374, + "grad_norm": 6.962027690215414, + "learning_rate": 8.861822705729831e-07, + "loss": 0.5843, + "step": 20254 + }, + { + "epoch": 1.463326530243647, + "grad_norm": 7.2762620073804944, + "learning_rate": 8.859588866138647e-07, + "loss": 0.6434, + "step": 20255 + }, + { + "epoch": 1.4633987754438564, + "grad_norm": 6.759794405457368, + "learning_rate": 8.85735524749716e-07, + "loss": 0.5506, + "step": 20256 + }, + { + "epoch": 1.463471020644066, + "grad_norm": 6.024272776675453, + "learning_rate": 8.855121849835954e-07, + "loss": 0.5351, + "step": 20257 + }, + { + "epoch": 1.4635432658442755, + "grad_norm": 7.830750898697466, + "learning_rate": 8.852888673185586e-07, + "loss": 0.6103, + "step": 20258 + }, + { + "epoch": 1.463615511044485, + "grad_norm": 6.328410125713905, + "learning_rate": 8.850655717576626e-07, + "loss": 0.6371, + "step": 20259 + }, + { + "epoch": 1.4636877562446946, + "grad_norm": 6.193150262060257, + "learning_rate": 8.848422983039659e-07, + "loss": 0.6648, + "step": 20260 + }, + { + "epoch": 1.463760001444904, + "grad_norm": 7.280973895817923, + "learning_rate": 8.846190469605248e-07, + "loss": 0.5697, + "step": 20261 + }, + { + "epoch": 1.4638322466451135, + "grad_norm": 8.872916823350796, + "learning_rate": 8.843958177303941e-07, + "loss": 0.7284, + "step": 20262 + }, + { + "epoch": 1.463904491845323, + "grad_norm": 8.038459362244549, + "learning_rate": 8.841726106166298e-07, + "loss": 0.6697, + "step": 20263 + }, + { + "epoch": 1.4639767370455326, + "grad_norm": 6.309878267543733, + "learning_rate": 8.839494256222891e-07, + "loss": 0.6051, + "step": 20264 + }, + { + "epoch": 1.464048982245742, + "grad_norm": 8.415106550098562, + "learning_rate": 8.83726262750425e-07, + "loss": 0.6608, + "step": 20265 + }, + { + "epoch": 1.4641212274459516, + "grad_norm": 6.480774249693341, + "learning_rate": 8.835031220040932e-07, + "loss": 0.6716, + "step": 20266 + }, + { + "epoch": 1.4641934726461612, + "grad_norm": 7.270783723882361, + "learning_rate": 8.83280003386349e-07, + "loss": 0.6128, + "step": 20267 + }, + { + "epoch": 1.4642657178463705, + "grad_norm": 6.675501246687865, + "learning_rate": 8.830569069002459e-07, + "loss": 0.6093, + "step": 20268 + }, + { + "epoch": 1.46433796304658, + "grad_norm": 6.414141302519117, + "learning_rate": 8.828338325488383e-07, + "loss": 0.5787, + "step": 20269 + }, + { + "epoch": 1.4644102082467896, + "grad_norm": 6.283847556418336, + "learning_rate": 8.826107803351799e-07, + "loss": 0.6007, + "step": 20270 + }, + { + "epoch": 1.4644824534469991, + "grad_norm": 8.420679577294045, + "learning_rate": 8.823877502623249e-07, + "loss": 0.5861, + "step": 20271 + }, + { + "epoch": 1.4645546986472087, + "grad_norm": 8.429790223974429, + "learning_rate": 8.821647423333249e-07, + "loss": 0.6331, + "step": 20272 + }, + { + "epoch": 1.4646269438474182, + "grad_norm": 7.440947430901094, + "learning_rate": 8.819417565512334e-07, + "loss": 0.6569, + "step": 20273 + }, + { + "epoch": 1.4646991890476277, + "grad_norm": 7.932201870071661, + "learning_rate": 8.817187929191026e-07, + "loss": 0.6321, + "step": 20274 + }, + { + "epoch": 1.464771434247837, + "grad_norm": 7.83342437150147, + "learning_rate": 8.814958514399863e-07, + "loss": 0.6898, + "step": 20275 + }, + { + "epoch": 1.4648436794480466, + "grad_norm": 6.01675681824214, + "learning_rate": 8.812729321169338e-07, + "loss": 0.5654, + "step": 20276 + }, + { + "epoch": 1.4649159246482562, + "grad_norm": 6.901933382591041, + "learning_rate": 8.810500349529983e-07, + "loss": 0.5914, + "step": 20277 + }, + { + "epoch": 1.4649881698484657, + "grad_norm": 7.538717153278711, + "learning_rate": 8.808271599512308e-07, + "loss": 0.6875, + "step": 20278 + }, + { + "epoch": 1.4650604150486752, + "grad_norm": 6.961763035450688, + "learning_rate": 8.806043071146822e-07, + "loss": 0.611, + "step": 20279 + }, + { + "epoch": 1.4651326602488848, + "grad_norm": 7.054892975274545, + "learning_rate": 8.803814764464033e-07, + "loss": 0.583, + "step": 20280 + }, + { + "epoch": 1.4652049054490943, + "grad_norm": 9.450656234319252, + "learning_rate": 8.801586679494445e-07, + "loss": 0.6857, + "step": 20281 + }, + { + "epoch": 1.4652771506493036, + "grad_norm": 7.0962649017225905, + "learning_rate": 8.799358816268563e-07, + "loss": 0.5831, + "step": 20282 + }, + { + "epoch": 1.4653493958495132, + "grad_norm": 7.527607016916003, + "learning_rate": 8.797131174816875e-07, + "loss": 0.6568, + "step": 20283 + }, + { + "epoch": 1.4654216410497227, + "grad_norm": 8.33696937073504, + "learning_rate": 8.794903755169879e-07, + "loss": 0.5917, + "step": 20284 + }, + { + "epoch": 1.4654938862499323, + "grad_norm": 7.325183790589692, + "learning_rate": 8.792676557358071e-07, + "loss": 0.6969, + "step": 20285 + }, + { + "epoch": 1.4655661314501418, + "grad_norm": 6.288713414710001, + "learning_rate": 8.790449581411941e-07, + "loss": 0.5641, + "step": 20286 + }, + { + "epoch": 1.4656383766503513, + "grad_norm": 7.723988843991774, + "learning_rate": 8.788222827361965e-07, + "loss": 0.6615, + "step": 20287 + }, + { + "epoch": 1.465710621850561, + "grad_norm": 6.986136468226594, + "learning_rate": 8.785996295238619e-07, + "loss": 0.6337, + "step": 20288 + }, + { + "epoch": 1.4657828670507702, + "grad_norm": 7.010099754331163, + "learning_rate": 8.783769985072416e-07, + "loss": 0.6387, + "step": 20289 + }, + { + "epoch": 1.4658551122509798, + "grad_norm": 6.808880325563212, + "learning_rate": 8.781543896893798e-07, + "loss": 0.5896, + "step": 20290 + }, + { + "epoch": 1.4659273574511893, + "grad_norm": 12.071191058236678, + "learning_rate": 8.779318030733253e-07, + "loss": 0.6053, + "step": 20291 + }, + { + "epoch": 1.4659996026513988, + "grad_norm": 6.290635418113424, + "learning_rate": 8.777092386621249e-07, + "loss": 0.5797, + "step": 20292 + }, + { + "epoch": 1.4660718478516084, + "grad_norm": 7.238034601841921, + "learning_rate": 8.774866964588263e-07, + "loss": 0.5828, + "step": 20293 + }, + { + "epoch": 1.466144093051818, + "grad_norm": 6.327285221091504, + "learning_rate": 8.772641764664741e-07, + "loss": 0.6595, + "step": 20294 + }, + { + "epoch": 1.4662163382520275, + "grad_norm": 5.961736103123087, + "learning_rate": 8.770416786881156e-07, + "loss": 0.6061, + "step": 20295 + }, + { + "epoch": 1.4662885834522368, + "grad_norm": 8.354751021067202, + "learning_rate": 8.768192031267961e-07, + "loss": 0.668, + "step": 20296 + }, + { + "epoch": 1.4663608286524463, + "grad_norm": 6.774023750040539, + "learning_rate": 8.765967497855615e-07, + "loss": 0.6512, + "step": 20297 + }, + { + "epoch": 1.4664330738526559, + "grad_norm": 8.160110436982455, + "learning_rate": 8.76374318667457e-07, + "loss": 0.6135, + "step": 20298 + }, + { + "epoch": 1.4665053190528654, + "grad_norm": 7.149219507686961, + "learning_rate": 8.761519097755272e-07, + "loss": 0.657, + "step": 20299 + }, + { + "epoch": 1.466577564253075, + "grad_norm": 6.465062661512659, + "learning_rate": 8.759295231128179e-07, + "loss": 0.6318, + "step": 20300 + }, + { + "epoch": 1.4666498094532845, + "grad_norm": 6.822296378401913, + "learning_rate": 8.757071586823715e-07, + "loss": 0.6579, + "step": 20301 + }, + { + "epoch": 1.466722054653494, + "grad_norm": 7.0140593890962695, + "learning_rate": 8.754848164872332e-07, + "loss": 0.5805, + "step": 20302 + }, + { + "epoch": 1.4667942998537034, + "grad_norm": 7.144364661125075, + "learning_rate": 8.752624965304459e-07, + "loss": 0.6463, + "step": 20303 + }, + { + "epoch": 1.4668665450539131, + "grad_norm": 6.219332720421435, + "learning_rate": 8.750401988150547e-07, + "loss": 0.6732, + "step": 20304 + }, + { + "epoch": 1.4669387902541224, + "grad_norm": 6.464095205436811, + "learning_rate": 8.748179233441007e-07, + "loss": 0.5292, + "step": 20305 + }, + { + "epoch": 1.467011035454332, + "grad_norm": 7.179236131163395, + "learning_rate": 8.74595670120627e-07, + "loss": 0.6356, + "step": 20306 + }, + { + "epoch": 1.4670832806545415, + "grad_norm": 7.725095789046966, + "learning_rate": 8.743734391476772e-07, + "loss": 0.5837, + "step": 20307 + }, + { + "epoch": 1.467155525854751, + "grad_norm": 7.225909229340888, + "learning_rate": 8.741512304282923e-07, + "loss": 0.6132, + "step": 20308 + }, + { + "epoch": 1.4672277710549606, + "grad_norm": 6.429458780960062, + "learning_rate": 8.739290439655149e-07, + "loss": 0.5739, + "step": 20309 + }, + { + "epoch": 1.46730001625517, + "grad_norm": 7.479084350926354, + "learning_rate": 8.73706879762386e-07, + "loss": 0.5536, + "step": 20310 + }, + { + "epoch": 1.4673722614553797, + "grad_norm": 8.145696484391026, + "learning_rate": 8.734847378219483e-07, + "loss": 0.6877, + "step": 20311 + }, + { + "epoch": 1.467444506655589, + "grad_norm": 6.629282592808403, + "learning_rate": 8.732626181472409e-07, + "loss": 0.6019, + "step": 20312 + }, + { + "epoch": 1.4675167518557986, + "grad_norm": 7.614253491425765, + "learning_rate": 8.730405207413048e-07, + "loss": 0.6518, + "step": 20313 + }, + { + "epoch": 1.467588997056008, + "grad_norm": 8.788297275715454, + "learning_rate": 8.728184456071819e-07, + "loss": 0.6891, + "step": 20314 + }, + { + "epoch": 1.4676612422562176, + "grad_norm": 6.739539341338343, + "learning_rate": 8.7259639274791e-07, + "loss": 0.5497, + "step": 20315 + }, + { + "epoch": 1.4677334874564272, + "grad_norm": 6.828668847674573, + "learning_rate": 8.723743621665293e-07, + "loss": 0.6798, + "step": 20316 + }, + { + "epoch": 1.4678057326566365, + "grad_norm": 6.9718987574358495, + "learning_rate": 8.721523538660803e-07, + "loss": 0.6051, + "step": 20317 + }, + { + "epoch": 1.4678779778568463, + "grad_norm": 6.375273455570418, + "learning_rate": 8.719303678496027e-07, + "loss": 0.563, + "step": 20318 + }, + { + "epoch": 1.4679502230570556, + "grad_norm": 6.386993794249624, + "learning_rate": 8.717084041201335e-07, + "loss": 0.5444, + "step": 20319 + }, + { + "epoch": 1.4680224682572651, + "grad_norm": 6.838529084642684, + "learning_rate": 8.714864626807118e-07, + "loss": 0.6539, + "step": 20320 + }, + { + "epoch": 1.4680947134574747, + "grad_norm": 7.943306308603785, + "learning_rate": 8.712645435343767e-07, + "loss": 0.694, + "step": 20321 + }, + { + "epoch": 1.4681669586576842, + "grad_norm": 7.143582121978249, + "learning_rate": 8.710426466841648e-07, + "loss": 0.6575, + "step": 20322 + }, + { + "epoch": 1.4682392038578937, + "grad_norm": 7.186377661608362, + "learning_rate": 8.708207721331141e-07, + "loss": 0.6034, + "step": 20323 + }, + { + "epoch": 1.468311449058103, + "grad_norm": 6.611026927223236, + "learning_rate": 8.705989198842621e-07, + "loss": 0.6747, + "step": 20324 + }, + { + "epoch": 1.4683836942583128, + "grad_norm": 6.523086629785588, + "learning_rate": 8.703770899406458e-07, + "loss": 0.5948, + "step": 20325 + }, + { + "epoch": 1.4684559394585222, + "grad_norm": 6.84841439488887, + "learning_rate": 8.701552823053016e-07, + "loss": 0.6046, + "step": 20326 + }, + { + "epoch": 1.4685281846587317, + "grad_norm": 6.585038201779583, + "learning_rate": 8.699334969812662e-07, + "loss": 0.5917, + "step": 20327 + }, + { + "epoch": 1.4686004298589412, + "grad_norm": 7.55793990429428, + "learning_rate": 8.697117339715755e-07, + "loss": 0.6183, + "step": 20328 + }, + { + "epoch": 1.4686726750591508, + "grad_norm": 7.639646627432516, + "learning_rate": 8.694899932792664e-07, + "loss": 0.5783, + "step": 20329 + }, + { + "epoch": 1.4687449202593603, + "grad_norm": 6.003437646908833, + "learning_rate": 8.692682749073722e-07, + "loss": 0.5865, + "step": 20330 + }, + { + "epoch": 1.4688171654595699, + "grad_norm": 7.222025866977424, + "learning_rate": 8.690465788589295e-07, + "loss": 0.5849, + "step": 20331 + }, + { + "epoch": 1.4688894106597794, + "grad_norm": 6.660980119955866, + "learning_rate": 8.688249051369732e-07, + "loss": 0.5772, + "step": 20332 + }, + { + "epoch": 1.4689616558599887, + "grad_norm": 7.159958856986556, + "learning_rate": 8.686032537445369e-07, + "loss": 0.5992, + "step": 20333 + }, + { + "epoch": 1.4690339010601983, + "grad_norm": 7.307381166946403, + "learning_rate": 8.683816246846549e-07, + "loss": 0.5674, + "step": 20334 + }, + { + "epoch": 1.4691061462604078, + "grad_norm": 6.933068311956959, + "learning_rate": 8.68160017960362e-07, + "loss": 0.6435, + "step": 20335 + }, + { + "epoch": 1.4691783914606174, + "grad_norm": 6.16153393460152, + "learning_rate": 8.679384335746913e-07, + "loss": 0.5951, + "step": 20336 + }, + { + "epoch": 1.469250636660827, + "grad_norm": 6.725143158817647, + "learning_rate": 8.677168715306764e-07, + "loss": 0.6554, + "step": 20337 + }, + { + "epoch": 1.4693228818610364, + "grad_norm": 7.491449441048321, + "learning_rate": 8.674953318313498e-07, + "loss": 0.5798, + "step": 20338 + }, + { + "epoch": 1.469395127061246, + "grad_norm": 7.025426190572333, + "learning_rate": 8.672738144797454e-07, + "loss": 0.6354, + "step": 20339 + }, + { + "epoch": 1.4694673722614553, + "grad_norm": 6.765224383256666, + "learning_rate": 8.670523194788944e-07, + "loss": 0.6898, + "step": 20340 + }, + { + "epoch": 1.4695396174616648, + "grad_norm": 7.784301159606881, + "learning_rate": 8.668308468318287e-07, + "loss": 0.5971, + "step": 20341 + }, + { + "epoch": 1.4696118626618744, + "grad_norm": 7.097547951241049, + "learning_rate": 8.666093965415809e-07, + "loss": 0.6, + "step": 20342 + }, + { + "epoch": 1.469684107862084, + "grad_norm": 7.363683261572147, + "learning_rate": 8.663879686111831e-07, + "loss": 0.6405, + "step": 20343 + }, + { + "epoch": 1.4697563530622935, + "grad_norm": 6.5112020359627145, + "learning_rate": 8.661665630436638e-07, + "loss": 0.5947, + "step": 20344 + }, + { + "epoch": 1.469828598262503, + "grad_norm": 7.445238975650212, + "learning_rate": 8.659451798420566e-07, + "loss": 0.6004, + "step": 20345 + }, + { + "epoch": 1.4699008434627125, + "grad_norm": 6.2156002176518586, + "learning_rate": 8.657238190093917e-07, + "loss": 0.6263, + "step": 20346 + }, + { + "epoch": 1.4699730886629219, + "grad_norm": 7.3095227562957, + "learning_rate": 8.655024805486981e-07, + "loss": 0.6257, + "step": 20347 + }, + { + "epoch": 1.4700453338631314, + "grad_norm": 7.000464287755536, + "learning_rate": 8.652811644630066e-07, + "loss": 0.622, + "step": 20348 + }, + { + "epoch": 1.470117579063341, + "grad_norm": 7.319719190787473, + "learning_rate": 8.650598707553465e-07, + "loss": 0.6965, + "step": 20349 + }, + { + "epoch": 1.4701898242635505, + "grad_norm": 7.566436247601223, + "learning_rate": 8.648385994287481e-07, + "loss": 0.5754, + "step": 20350 + }, + { + "epoch": 1.47026206946376, + "grad_norm": 6.289837636626019, + "learning_rate": 8.64617350486239e-07, + "loss": 0.5679, + "step": 20351 + }, + { + "epoch": 1.4703343146639696, + "grad_norm": 7.557525511965783, + "learning_rate": 8.643961239308485e-07, + "loss": 0.6029, + "step": 20352 + }, + { + "epoch": 1.4704065598641791, + "grad_norm": 7.121647899533621, + "learning_rate": 8.641749197656052e-07, + "loss": 0.6876, + "step": 20353 + }, + { + "epoch": 1.4704788050643884, + "grad_norm": 7.119369155711268, + "learning_rate": 8.639537379935369e-07, + "loss": 0.6097, + "step": 20354 + }, + { + "epoch": 1.470551050264598, + "grad_norm": 7.525465140430456, + "learning_rate": 8.637325786176718e-07, + "loss": 0.6187, + "step": 20355 + }, + { + "epoch": 1.4706232954648075, + "grad_norm": 6.532032604876669, + "learning_rate": 8.63511441641037e-07, + "loss": 0.5873, + "step": 20356 + }, + { + "epoch": 1.470695540665017, + "grad_norm": 5.745353479890225, + "learning_rate": 8.63290327066661e-07, + "loss": 0.6216, + "step": 20357 + }, + { + "epoch": 1.4707677858652266, + "grad_norm": 6.5522590294242695, + "learning_rate": 8.630692348975686e-07, + "loss": 0.6493, + "step": 20358 + }, + { + "epoch": 1.4708400310654361, + "grad_norm": 7.983441143329966, + "learning_rate": 8.628481651367876e-07, + "loss": 0.6517, + "step": 20359 + }, + { + "epoch": 1.4709122762656457, + "grad_norm": 6.5683010074904455, + "learning_rate": 8.626271177873438e-07, + "loss": 0.5636, + "step": 20360 + }, + { + "epoch": 1.470984521465855, + "grad_norm": 6.34684429903179, + "learning_rate": 8.624060928522643e-07, + "loss": 0.6468, + "step": 20361 + }, + { + "epoch": 1.4710567666660646, + "grad_norm": 6.887801580178552, + "learning_rate": 8.621850903345732e-07, + "loss": 0.6051, + "step": 20362 + }, + { + "epoch": 1.471129011866274, + "grad_norm": 7.160024655262358, + "learning_rate": 8.619641102372964e-07, + "loss": 0.6244, + "step": 20363 + }, + { + "epoch": 1.4712012570664836, + "grad_norm": 7.167637722260949, + "learning_rate": 8.61743152563459e-07, + "loss": 0.6784, + "step": 20364 + }, + { + "epoch": 1.4712735022666932, + "grad_norm": 7.881310492819612, + "learning_rate": 8.615222173160859e-07, + "loss": 0.6236, + "step": 20365 + }, + { + "epoch": 1.4713457474669027, + "grad_norm": 7.464737201211728, + "learning_rate": 8.613013044982016e-07, + "loss": 0.6225, + "step": 20366 + }, + { + "epoch": 1.4714179926671123, + "grad_norm": 7.464073600545179, + "learning_rate": 8.610804141128299e-07, + "loss": 0.6025, + "step": 20367 + }, + { + "epoch": 1.4714902378673216, + "grad_norm": 6.9227454195779226, + "learning_rate": 8.608595461629957e-07, + "loss": 0.5845, + "step": 20368 + }, + { + "epoch": 1.4715624830675311, + "grad_norm": 8.318011528959216, + "learning_rate": 8.606387006517209e-07, + "loss": 0.5977, + "step": 20369 + }, + { + "epoch": 1.4716347282677407, + "grad_norm": 7.210118537595056, + "learning_rate": 8.604178775820291e-07, + "loss": 0.6219, + "step": 20370 + }, + { + "epoch": 1.4717069734679502, + "grad_norm": 7.2515246662582316, + "learning_rate": 8.601970769569445e-07, + "loss": 0.6623, + "step": 20371 + }, + { + "epoch": 1.4717792186681597, + "grad_norm": 6.8612800333676125, + "learning_rate": 8.599762987794869e-07, + "loss": 0.6068, + "step": 20372 + }, + { + "epoch": 1.4718514638683693, + "grad_norm": 7.322807416105461, + "learning_rate": 8.597555430526813e-07, + "loss": 0.7034, + "step": 20373 + }, + { + "epoch": 1.4719237090685788, + "grad_norm": 7.200327961128072, + "learning_rate": 8.595348097795489e-07, + "loss": 0.6274, + "step": 20374 + }, + { + "epoch": 1.4719959542687882, + "grad_norm": 6.0521724867259055, + "learning_rate": 8.593140989631119e-07, + "loss": 0.5845, + "step": 20375 + }, + { + "epoch": 1.4720681994689977, + "grad_norm": 8.207669871928482, + "learning_rate": 8.5909341060639e-07, + "loss": 0.6039, + "step": 20376 + }, + { + "epoch": 1.4721404446692072, + "grad_norm": 6.336048280474195, + "learning_rate": 8.588727447124054e-07, + "loss": 0.5918, + "step": 20377 + }, + { + "epoch": 1.4722126898694168, + "grad_norm": 6.244420727510917, + "learning_rate": 8.586521012841795e-07, + "loss": 0.6875, + "step": 20378 + }, + { + "epoch": 1.4722849350696263, + "grad_norm": 8.000185010678074, + "learning_rate": 8.584314803247312e-07, + "loss": 0.6649, + "step": 20379 + }, + { + "epoch": 1.4723571802698359, + "grad_norm": 7.113175602421109, + "learning_rate": 8.58210881837081e-07, + "loss": 0.6334, + "step": 20380 + }, + { + "epoch": 1.4724294254700454, + "grad_norm": 8.348917414675409, + "learning_rate": 8.579903058242494e-07, + "loss": 0.6636, + "step": 20381 + }, + { + "epoch": 1.4725016706702547, + "grad_norm": 6.157070047202818, + "learning_rate": 8.577697522892553e-07, + "loss": 0.6412, + "step": 20382 + }, + { + "epoch": 1.4725739158704645, + "grad_norm": 7.206302099378694, + "learning_rate": 8.575492212351183e-07, + "loss": 0.6252, + "step": 20383 + }, + { + "epoch": 1.4726461610706738, + "grad_norm": 7.411191683967814, + "learning_rate": 8.573287126648571e-07, + "loss": 0.6144, + "step": 20384 + }, + { + "epoch": 1.4727184062708834, + "grad_norm": 7.622440846715041, + "learning_rate": 8.571082265814907e-07, + "loss": 0.683, + "step": 20385 + }, + { + "epoch": 1.472790651471093, + "grad_norm": 6.698708241754845, + "learning_rate": 8.568877629880376e-07, + "loss": 0.5387, + "step": 20386 + }, + { + "epoch": 1.4728628966713024, + "grad_norm": 8.753758958209753, + "learning_rate": 8.566673218875146e-07, + "loss": 0.7862, + "step": 20387 + }, + { + "epoch": 1.472935141871512, + "grad_norm": 6.383506933230477, + "learning_rate": 8.564469032829398e-07, + "loss": 0.6279, + "step": 20388 + }, + { + "epoch": 1.4730073870717213, + "grad_norm": 8.997181345052557, + "learning_rate": 8.562265071773315e-07, + "loss": 0.6808, + "step": 20389 + }, + { + "epoch": 1.473079632271931, + "grad_norm": 7.292074515426004, + "learning_rate": 8.560061335737055e-07, + "loss": 0.6223, + "step": 20390 + }, + { + "epoch": 1.4731518774721404, + "grad_norm": 6.883928362210412, + "learning_rate": 8.557857824750787e-07, + "loss": 0.6661, + "step": 20391 + }, + { + "epoch": 1.47322412267235, + "grad_norm": 6.578643415958638, + "learning_rate": 8.555654538844683e-07, + "loss": 0.6683, + "step": 20392 + }, + { + "epoch": 1.4732963678725595, + "grad_norm": 6.5200648560838, + "learning_rate": 8.553451478048896e-07, + "loss": 0.5839, + "step": 20393 + }, + { + "epoch": 1.473368613072769, + "grad_norm": 7.416758104539341, + "learning_rate": 8.551248642393589e-07, + "loss": 0.6433, + "step": 20394 + }, + { + "epoch": 1.4734408582729785, + "grad_norm": 8.54987066516876, + "learning_rate": 8.549046031908919e-07, + "loss": 0.6085, + "step": 20395 + }, + { + "epoch": 1.4735131034731879, + "grad_norm": 6.6247598856472125, + "learning_rate": 8.546843646625041e-07, + "loss": 0.5883, + "step": 20396 + }, + { + "epoch": 1.4735853486733976, + "grad_norm": 7.691641335602362, + "learning_rate": 8.544641486572092e-07, + "loss": 0.6424, + "step": 20397 + }, + { + "epoch": 1.473657593873607, + "grad_norm": 8.447962173232732, + "learning_rate": 8.542439551780224e-07, + "loss": 0.6849, + "step": 20398 + }, + { + "epoch": 1.4737298390738165, + "grad_norm": 6.607753257098017, + "learning_rate": 8.54023784227958e-07, + "loss": 0.578, + "step": 20399 + }, + { + "epoch": 1.473802084274026, + "grad_norm": 6.716274071413684, + "learning_rate": 8.538036358100308e-07, + "loss": 0.6656, + "step": 20400 + }, + { + "epoch": 1.4738743294742356, + "grad_norm": 6.698988129230146, + "learning_rate": 8.535835099272519e-07, + "loss": 0.6575, + "step": 20401 + }, + { + "epoch": 1.4739465746744451, + "grad_norm": 6.929403168049963, + "learning_rate": 8.533634065826374e-07, + "loss": 0.5944, + "step": 20402 + }, + { + "epoch": 1.4740188198746544, + "grad_norm": 8.916169007100729, + "learning_rate": 8.531433257791999e-07, + "loss": 0.6767, + "step": 20403 + }, + { + "epoch": 1.4740910650748642, + "grad_norm": 7.403994446144927, + "learning_rate": 8.529232675199509e-07, + "loss": 0.6508, + "step": 20404 + }, + { + "epoch": 1.4741633102750735, + "grad_norm": 7.605146733360081, + "learning_rate": 8.527032318079034e-07, + "loss": 0.5917, + "step": 20405 + }, + { + "epoch": 1.474235555475283, + "grad_norm": 6.247082754232764, + "learning_rate": 8.524832186460699e-07, + "loss": 0.5571, + "step": 20406 + }, + { + "epoch": 1.4743078006754926, + "grad_norm": 7.041023751729013, + "learning_rate": 8.522632280374624e-07, + "loss": 0.6297, + "step": 20407 + }, + { + "epoch": 1.4743800458757021, + "grad_norm": 7.440054062267484, + "learning_rate": 8.520432599850914e-07, + "loss": 0.7903, + "step": 20408 + }, + { + "epoch": 1.4744522910759117, + "grad_norm": 6.377297679244498, + "learning_rate": 8.518233144919683e-07, + "loss": 0.5757, + "step": 20409 + }, + { + "epoch": 1.4745245362761212, + "grad_norm": 6.79295723789637, + "learning_rate": 8.516033915611046e-07, + "loss": 0.606, + "step": 20410 + }, + { + "epoch": 1.4745967814763308, + "grad_norm": 9.886581389800442, + "learning_rate": 8.513834911955104e-07, + "loss": 0.7021, + "step": 20411 + }, + { + "epoch": 1.47466902667654, + "grad_norm": 6.661559087504843, + "learning_rate": 8.511636133981963e-07, + "loss": 0.6081, + "step": 20412 + }, + { + "epoch": 1.4747412718767496, + "grad_norm": 7.093834964730709, + "learning_rate": 8.509437581721719e-07, + "loss": 0.6421, + "step": 20413 + }, + { + "epoch": 1.4748135170769592, + "grad_norm": 6.385821264827129, + "learning_rate": 8.507239255204478e-07, + "loss": 0.6518, + "step": 20414 + }, + { + "epoch": 1.4748857622771687, + "grad_norm": 6.709770696277992, + "learning_rate": 8.505041154460319e-07, + "loss": 0.6233, + "step": 20415 + }, + { + "epoch": 1.4749580074773783, + "grad_norm": 6.181484187890212, + "learning_rate": 8.502843279519338e-07, + "loss": 0.5605, + "step": 20416 + }, + { + "epoch": 1.4750302526775878, + "grad_norm": 6.788139535129539, + "learning_rate": 8.500645630411624e-07, + "loss": 0.6321, + "step": 20417 + }, + { + "epoch": 1.4751024978777973, + "grad_norm": 7.350086611282848, + "learning_rate": 8.49844820716727e-07, + "loss": 0.6403, + "step": 20418 + }, + { + "epoch": 1.4751747430780067, + "grad_norm": 6.680192894805988, + "learning_rate": 8.496251009816337e-07, + "loss": 0.5852, + "step": 20419 + }, + { + "epoch": 1.4752469882782162, + "grad_norm": 7.4574839642896125, + "learning_rate": 8.494054038388916e-07, + "loss": 0.6287, + "step": 20420 + }, + { + "epoch": 1.4753192334784258, + "grad_norm": 7.080583225517271, + "learning_rate": 8.491857292915076e-07, + "loss": 0.6944, + "step": 20421 + }, + { + "epoch": 1.4753914786786353, + "grad_norm": 5.915899880738837, + "learning_rate": 8.489660773424893e-07, + "loss": 0.6603, + "step": 20422 + }, + { + "epoch": 1.4754637238788448, + "grad_norm": 7.057355509301481, + "learning_rate": 8.487464479948434e-07, + "loss": 0.6028, + "step": 20423 + }, + { + "epoch": 1.4755359690790544, + "grad_norm": 6.285217853858288, + "learning_rate": 8.485268412515768e-07, + "loss": 0.644, + "step": 20424 + }, + { + "epoch": 1.475608214279264, + "grad_norm": 7.2412000721639656, + "learning_rate": 8.483072571156961e-07, + "loss": 0.6852, + "step": 20425 + }, + { + "epoch": 1.4756804594794732, + "grad_norm": 7.936839399106977, + "learning_rate": 8.480876955902057e-07, + "loss": 0.6136, + "step": 20426 + }, + { + "epoch": 1.4757527046796828, + "grad_norm": 7.59219859409562, + "learning_rate": 8.478681566781122e-07, + "loss": 0.6204, + "step": 20427 + }, + { + "epoch": 1.4758249498798923, + "grad_norm": 7.096491750147541, + "learning_rate": 8.476486403824216e-07, + "loss": 0.5686, + "step": 20428 + }, + { + "epoch": 1.4758971950801019, + "grad_norm": 7.417554252176682, + "learning_rate": 8.474291467061366e-07, + "loss": 0.6013, + "step": 20429 + }, + { + "epoch": 1.4759694402803114, + "grad_norm": 7.602648133149041, + "learning_rate": 8.47209675652264e-07, + "loss": 0.6027, + "step": 20430 + }, + { + "epoch": 1.476041685480521, + "grad_norm": 7.416927575940596, + "learning_rate": 8.469902272238081e-07, + "loss": 0.6859, + "step": 20431 + }, + { + "epoch": 1.4761139306807305, + "grad_norm": 6.927337075846847, + "learning_rate": 8.46770801423773e-07, + "loss": 0.6446, + "step": 20432 + }, + { + "epoch": 1.4761861758809398, + "grad_norm": 7.09787472360854, + "learning_rate": 8.465513982551612e-07, + "loss": 0.6753, + "step": 20433 + }, + { + "epoch": 1.4762584210811494, + "grad_norm": 7.043185670276365, + "learning_rate": 8.463320177209769e-07, + "loss": 0.6274, + "step": 20434 + }, + { + "epoch": 1.476330666281359, + "grad_norm": 7.201969873316074, + "learning_rate": 8.461126598242233e-07, + "loss": 0.5745, + "step": 20435 + }, + { + "epoch": 1.4764029114815684, + "grad_norm": 7.324496348905958, + "learning_rate": 8.458933245679043e-07, + "loss": 0.6097, + "step": 20436 + }, + { + "epoch": 1.476475156681778, + "grad_norm": 7.860283558319477, + "learning_rate": 8.456740119550203e-07, + "loss": 0.6589, + "step": 20437 + }, + { + "epoch": 1.4765474018819875, + "grad_norm": 6.930132553792515, + "learning_rate": 8.454547219885744e-07, + "loss": 0.659, + "step": 20438 + }, + { + "epoch": 1.476619647082197, + "grad_norm": 5.6367175656725195, + "learning_rate": 8.452354546715691e-07, + "loss": 0.6265, + "step": 20439 + }, + { + "epoch": 1.4766918922824064, + "grad_norm": 6.3297340137107, + "learning_rate": 8.450162100070053e-07, + "loss": 0.5564, + "step": 20440 + }, + { + "epoch": 1.476764137482616, + "grad_norm": 6.863529971571064, + "learning_rate": 8.447969879978846e-07, + "loss": 0.5511, + "step": 20441 + }, + { + "epoch": 1.4768363826828255, + "grad_norm": 7.043992630876177, + "learning_rate": 8.445777886472082e-07, + "loss": 0.6264, + "step": 20442 + }, + { + "epoch": 1.476908627883035, + "grad_norm": 7.571225515763407, + "learning_rate": 8.443586119579769e-07, + "loss": 0.6803, + "step": 20443 + }, + { + "epoch": 1.4769808730832445, + "grad_norm": 6.944292472024181, + "learning_rate": 8.441394579331902e-07, + "loss": 0.5799, + "step": 20444 + }, + { + "epoch": 1.477053118283454, + "grad_norm": 5.64565232703484, + "learning_rate": 8.439203265758486e-07, + "loss": 0.5434, + "step": 20445 + }, + { + "epoch": 1.4771253634836636, + "grad_norm": 6.751869507907172, + "learning_rate": 8.437012178889525e-07, + "loss": 0.6437, + "step": 20446 + }, + { + "epoch": 1.477197608683873, + "grad_norm": 6.636980560150116, + "learning_rate": 8.434821318755002e-07, + "loss": 0.6048, + "step": 20447 + }, + { + "epoch": 1.4772698538840825, + "grad_norm": 6.821813254398046, + "learning_rate": 8.43263068538491e-07, + "loss": 0.6543, + "step": 20448 + }, + { + "epoch": 1.477342099084292, + "grad_norm": 7.129675151171368, + "learning_rate": 8.430440278809232e-07, + "loss": 0.6524, + "step": 20449 + }, + { + "epoch": 1.4774143442845016, + "grad_norm": 6.350113406632956, + "learning_rate": 8.428250099057981e-07, + "loss": 0.6539, + "step": 20450 + }, + { + "epoch": 1.4774865894847111, + "grad_norm": 7.5176839248693, + "learning_rate": 8.426060146161108e-07, + "loss": 0.7214, + "step": 20451 + }, + { + "epoch": 1.4775588346849207, + "grad_norm": 8.28641081452107, + "learning_rate": 8.423870420148602e-07, + "loss": 0.6039, + "step": 20452 + }, + { + "epoch": 1.4776310798851302, + "grad_norm": 6.819696107956924, + "learning_rate": 8.421680921050448e-07, + "loss": 0.6123, + "step": 20453 + }, + { + "epoch": 1.4777033250853395, + "grad_norm": 6.754062136720367, + "learning_rate": 8.419491648896602e-07, + "loss": 0.725, + "step": 20454 + }, + { + "epoch": 1.4777755702855493, + "grad_norm": 7.5541800138170085, + "learning_rate": 8.417302603717042e-07, + "loss": 0.6533, + "step": 20455 + }, + { + "epoch": 1.4778478154857586, + "grad_norm": 7.883580694961815, + "learning_rate": 8.415113785541734e-07, + "loss": 0.6957, + "step": 20456 + }, + { + "epoch": 1.4779200606859682, + "grad_norm": 6.712241919757918, + "learning_rate": 8.412925194400643e-07, + "loss": 0.627, + "step": 20457 + }, + { + "epoch": 1.4779923058861777, + "grad_norm": 6.682117898950789, + "learning_rate": 8.410736830323723e-07, + "loss": 0.6282, + "step": 20458 + }, + { + "epoch": 1.4780645510863872, + "grad_norm": 7.035716897080167, + "learning_rate": 8.408548693340937e-07, + "loss": 0.6262, + "step": 20459 + }, + { + "epoch": 1.4781367962865968, + "grad_norm": 7.130484628351446, + "learning_rate": 8.406360783482237e-07, + "loss": 0.6495, + "step": 20460 + }, + { + "epoch": 1.478209041486806, + "grad_norm": 7.350373612853645, + "learning_rate": 8.404173100777582e-07, + "loss": 0.6646, + "step": 20461 + }, + { + "epoch": 1.4782812866870159, + "grad_norm": 7.84011160673772, + "learning_rate": 8.401985645256902e-07, + "loss": 0.6646, + "step": 20462 + }, + { + "epoch": 1.4783535318872252, + "grad_norm": 6.645998802237757, + "learning_rate": 8.399798416950152e-07, + "loss": 0.6335, + "step": 20463 + }, + { + "epoch": 1.4784257770874347, + "grad_norm": 6.811049840726562, + "learning_rate": 8.39761141588728e-07, + "loss": 0.5985, + "step": 20464 + }, + { + "epoch": 1.4784980222876443, + "grad_norm": 6.073983240008351, + "learning_rate": 8.39542464209821e-07, + "loss": 0.6389, + "step": 20465 + }, + { + "epoch": 1.4785702674878538, + "grad_norm": 6.26040058694595, + "learning_rate": 8.393238095612882e-07, + "loss": 0.5817, + "step": 20466 + }, + { + "epoch": 1.4786425126880633, + "grad_norm": 7.328866754337401, + "learning_rate": 8.391051776461232e-07, + "loss": 0.5869, + "step": 20467 + }, + { + "epoch": 1.4787147578882727, + "grad_norm": 7.5244165340075435, + "learning_rate": 8.388865684673187e-07, + "loss": 0.6416, + "step": 20468 + }, + { + "epoch": 1.4787870030884824, + "grad_norm": 8.101912811857302, + "learning_rate": 8.386679820278671e-07, + "loss": 0.6919, + "step": 20469 + }, + { + "epoch": 1.4788592482886918, + "grad_norm": 7.1994081783780315, + "learning_rate": 8.38449418330761e-07, + "loss": 0.6391, + "step": 20470 + }, + { + "epoch": 1.4789314934889013, + "grad_norm": 7.251114595171866, + "learning_rate": 8.382308773789932e-07, + "loss": 0.6231, + "step": 20471 + }, + { + "epoch": 1.4790037386891108, + "grad_norm": 6.821570281537604, + "learning_rate": 8.380123591755534e-07, + "loss": 0.5901, + "step": 20472 + }, + { + "epoch": 1.4790759838893204, + "grad_norm": 5.3872188698334185, + "learning_rate": 8.377938637234342e-07, + "loss": 0.5764, + "step": 20473 + }, + { + "epoch": 1.47914822908953, + "grad_norm": 8.871867741994654, + "learning_rate": 8.375753910256263e-07, + "loss": 0.6804, + "step": 20474 + }, + { + "epoch": 1.4792204742897392, + "grad_norm": 7.232777099786452, + "learning_rate": 8.373569410851212e-07, + "loss": 0.5385, + "step": 20475 + }, + { + "epoch": 1.479292719489949, + "grad_norm": 7.317830286785911, + "learning_rate": 8.371385139049077e-07, + "loss": 0.684, + "step": 20476 + }, + { + "epoch": 1.4793649646901583, + "grad_norm": 7.6278621117261824, + "learning_rate": 8.369201094879769e-07, + "loss": 0.6402, + "step": 20477 + }, + { + "epoch": 1.4794372098903679, + "grad_norm": 7.454078725257333, + "learning_rate": 8.367017278373188e-07, + "loss": 0.5917, + "step": 20478 + }, + { + "epoch": 1.4795094550905774, + "grad_norm": 6.934305916330759, + "learning_rate": 8.364833689559224e-07, + "loss": 0.643, + "step": 20479 + }, + { + "epoch": 1.479581700290787, + "grad_norm": 7.558587441454086, + "learning_rate": 8.36265032846777e-07, + "loss": 0.6652, + "step": 20480 + }, + { + "epoch": 1.4796539454909965, + "grad_norm": 9.222120179782497, + "learning_rate": 8.360467195128713e-07, + "loss": 0.6721, + "step": 20481 + }, + { + "epoch": 1.479726190691206, + "grad_norm": 6.665899741246606, + "learning_rate": 8.35828428957195e-07, + "loss": 0.5285, + "step": 20482 + }, + { + "epoch": 1.4797984358914156, + "grad_norm": 5.924764044203971, + "learning_rate": 8.356101611827347e-07, + "loss": 0.5291, + "step": 20483 + }, + { + "epoch": 1.479870681091625, + "grad_norm": 7.142296393045677, + "learning_rate": 8.353919161924789e-07, + "loss": 0.6524, + "step": 20484 + }, + { + "epoch": 1.4799429262918344, + "grad_norm": 6.737963647406353, + "learning_rate": 8.351736939894159e-07, + "loss": 0.6182, + "step": 20485 + }, + { + "epoch": 1.480015171492044, + "grad_norm": 7.173889276216362, + "learning_rate": 8.349554945765309e-07, + "loss": 0.6577, + "step": 20486 + }, + { + "epoch": 1.4800874166922535, + "grad_norm": 7.030478473295865, + "learning_rate": 8.347373179568133e-07, + "loss": 0.6745, + "step": 20487 + }, + { + "epoch": 1.480159661892463, + "grad_norm": 7.2973287633269015, + "learning_rate": 8.345191641332487e-07, + "loss": 0.6546, + "step": 20488 + }, + { + "epoch": 1.4802319070926726, + "grad_norm": 6.7123652439336725, + "learning_rate": 8.343010331088244e-07, + "loss": 0.6665, + "step": 20489 + }, + { + "epoch": 1.4803041522928821, + "grad_norm": 7.103402582359409, + "learning_rate": 8.340829248865248e-07, + "loss": 0.5735, + "step": 20490 + }, + { + "epoch": 1.4803763974930915, + "grad_norm": 7.11912990825668, + "learning_rate": 8.338648394693363e-07, + "loss": 0.6068, + "step": 20491 + }, + { + "epoch": 1.480448642693301, + "grad_norm": 8.833150705812823, + "learning_rate": 8.336467768602447e-07, + "loss": 0.5993, + "step": 20492 + }, + { + "epoch": 1.4805208878935106, + "grad_norm": 7.211984081096292, + "learning_rate": 8.334287370622357e-07, + "loss": 0.5826, + "step": 20493 + }, + { + "epoch": 1.48059313309372, + "grad_norm": 6.533504991157688, + "learning_rate": 8.332107200782924e-07, + "loss": 0.577, + "step": 20494 + }, + { + "epoch": 1.4806653782939296, + "grad_norm": 10.259021417600088, + "learning_rate": 8.329927259114001e-07, + "loss": 0.5864, + "step": 20495 + }, + { + "epoch": 1.4807376234941392, + "grad_norm": 8.715418900507773, + "learning_rate": 8.327747545645431e-07, + "loss": 0.649, + "step": 20496 + }, + { + "epoch": 1.4808098686943487, + "grad_norm": 6.261732761798109, + "learning_rate": 8.325568060407052e-07, + "loss": 0.5971, + "step": 20497 + }, + { + "epoch": 1.480882113894558, + "grad_norm": 7.368957889332512, + "learning_rate": 8.3233888034287e-07, + "loss": 0.5834, + "step": 20498 + }, + { + "epoch": 1.4809543590947676, + "grad_norm": 7.544512582332968, + "learning_rate": 8.321209774740207e-07, + "loss": 0.6149, + "step": 20499 + }, + { + "epoch": 1.4810266042949771, + "grad_norm": 7.5648736932853895, + "learning_rate": 8.319030974371408e-07, + "loss": 0.6547, + "step": 20500 + }, + { + "epoch": 1.4810988494951867, + "grad_norm": 9.37760095756043, + "learning_rate": 8.316852402352116e-07, + "loss": 0.5867, + "step": 20501 + }, + { + "epoch": 1.4811710946953962, + "grad_norm": 6.2327566699484676, + "learning_rate": 8.314674058712163e-07, + "loss": 0.5345, + "step": 20502 + }, + { + "epoch": 1.4812433398956057, + "grad_norm": 7.055449080310446, + "learning_rate": 8.312495943481372e-07, + "loss": 0.6188, + "step": 20503 + }, + { + "epoch": 1.4813155850958153, + "grad_norm": 6.76500052373846, + "learning_rate": 8.310318056689548e-07, + "loss": 0.5794, + "step": 20504 + }, + { + "epoch": 1.4813878302960246, + "grad_norm": 8.004037315148222, + "learning_rate": 8.30814039836651e-07, + "loss": 0.6124, + "step": 20505 + }, + { + "epoch": 1.4814600754962342, + "grad_norm": 6.717871320177463, + "learning_rate": 8.305962968542061e-07, + "loss": 0.6166, + "step": 20506 + }, + { + "epoch": 1.4815323206964437, + "grad_norm": 8.335560869683997, + "learning_rate": 8.303785767246034e-07, + "loss": 0.6941, + "step": 20507 + }, + { + "epoch": 1.4816045658966532, + "grad_norm": 6.810484981884665, + "learning_rate": 8.301608794508209e-07, + "loss": 0.6298, + "step": 20508 + }, + { + "epoch": 1.4816768110968628, + "grad_norm": 6.175199052946056, + "learning_rate": 8.299432050358395e-07, + "loss": 0.6701, + "step": 20509 + }, + { + "epoch": 1.4817490562970723, + "grad_norm": 7.125157940101168, + "learning_rate": 8.297255534826393e-07, + "loss": 0.6435, + "step": 20510 + }, + { + "epoch": 1.4818213014972819, + "grad_norm": 7.010480392037941, + "learning_rate": 8.29507924794199e-07, + "loss": 0.6561, + "step": 20511 + }, + { + "epoch": 1.4818935466974912, + "grad_norm": 7.73994039034499, + "learning_rate": 8.292903189734977e-07, + "loss": 0.5953, + "step": 20512 + }, + { + "epoch": 1.4819657918977007, + "grad_norm": 6.7458503118171365, + "learning_rate": 8.290727360235151e-07, + "loss": 0.6724, + "step": 20513 + }, + { + "epoch": 1.4820380370979103, + "grad_norm": 6.277459514431158, + "learning_rate": 8.288551759472294e-07, + "loss": 0.5499, + "step": 20514 + }, + { + "epoch": 1.4821102822981198, + "grad_norm": 6.677529576506359, + "learning_rate": 8.286376387476183e-07, + "loss": 0.6099, + "step": 20515 + }, + { + "epoch": 1.4821825274983293, + "grad_norm": 6.3801210067897065, + "learning_rate": 8.284201244276607e-07, + "loss": 0.5956, + "step": 20516 + }, + { + "epoch": 1.482254772698539, + "grad_norm": 6.423355711479118, + "learning_rate": 8.282026329903333e-07, + "loss": 0.6613, + "step": 20517 + }, + { + "epoch": 1.4823270178987484, + "grad_norm": 8.626437302399326, + "learning_rate": 8.27985164438615e-07, + "loss": 0.6684, + "step": 20518 + }, + { + "epoch": 1.4823992630989578, + "grad_norm": 7.480527139540394, + "learning_rate": 8.277677187754804e-07, + "loss": 0.6802, + "step": 20519 + }, + { + "epoch": 1.4824715082991673, + "grad_norm": 6.874670679701371, + "learning_rate": 8.275502960039075e-07, + "loss": 0.5915, + "step": 20520 + }, + { + "epoch": 1.4825437534993768, + "grad_norm": 7.059802595193825, + "learning_rate": 8.273328961268734e-07, + "loss": 0.6466, + "step": 20521 + }, + { + "epoch": 1.4826159986995864, + "grad_norm": 6.453795612949073, + "learning_rate": 8.271155191473523e-07, + "loss": 0.6012, + "step": 20522 + }, + { + "epoch": 1.482688243899796, + "grad_norm": 6.940230923199457, + "learning_rate": 8.268981650683208e-07, + "loss": 0.6067, + "step": 20523 + }, + { + "epoch": 1.4827604891000055, + "grad_norm": 6.6364175549664735, + "learning_rate": 8.266808338927543e-07, + "loss": 0.5857, + "step": 20524 + }, + { + "epoch": 1.482832734300215, + "grad_norm": 7.9842039563993366, + "learning_rate": 8.264635256236281e-07, + "loss": 0.6556, + "step": 20525 + }, + { + "epoch": 1.4829049795004243, + "grad_norm": 7.123031076466576, + "learning_rate": 8.262462402639166e-07, + "loss": 0.6086, + "step": 20526 + }, + { + "epoch": 1.482977224700634, + "grad_norm": 7.788187021777381, + "learning_rate": 8.260289778165945e-07, + "loss": 0.6212, + "step": 20527 + }, + { + "epoch": 1.4830494699008434, + "grad_norm": 5.82361188729283, + "learning_rate": 8.25811738284637e-07, + "loss": 0.5753, + "step": 20528 + }, + { + "epoch": 1.483121715101053, + "grad_norm": 7.204781099868439, + "learning_rate": 8.255945216710157e-07, + "loss": 0.7313, + "step": 20529 + }, + { + "epoch": 1.4831939603012625, + "grad_norm": 7.496030392759218, + "learning_rate": 8.253773279787056e-07, + "loss": 0.6196, + "step": 20530 + }, + { + "epoch": 1.483266205501472, + "grad_norm": 6.395291217439545, + "learning_rate": 8.251601572106796e-07, + "loss": 0.6005, + "step": 20531 + }, + { + "epoch": 1.4833384507016816, + "grad_norm": 6.474118276529715, + "learning_rate": 8.249430093699112e-07, + "loss": 0.5925, + "step": 20532 + }, + { + "epoch": 1.483410695901891, + "grad_norm": 6.813166874434144, + "learning_rate": 8.247258844593717e-07, + "loss": 0.589, + "step": 20533 + }, + { + "epoch": 1.4834829411021007, + "grad_norm": 6.838493662711926, + "learning_rate": 8.245087824820333e-07, + "loss": 0.5767, + "step": 20534 + }, + { + "epoch": 1.48355518630231, + "grad_norm": 6.523594800685827, + "learning_rate": 8.242917034408704e-07, + "loss": 0.5635, + "step": 20535 + }, + { + "epoch": 1.4836274315025195, + "grad_norm": 7.5391275195544685, + "learning_rate": 8.240746473388523e-07, + "loss": 0.646, + "step": 20536 + }, + { + "epoch": 1.483699676702729, + "grad_norm": 7.221108319710364, + "learning_rate": 8.238576141789506e-07, + "loss": 0.6013, + "step": 20537 + }, + { + "epoch": 1.4837719219029386, + "grad_norm": 6.912501754898483, + "learning_rate": 8.23640603964137e-07, + "loss": 0.5635, + "step": 20538 + }, + { + "epoch": 1.4838441671031481, + "grad_norm": 7.101734390108958, + "learning_rate": 8.234236166973827e-07, + "loss": 0.5965, + "step": 20539 + }, + { + "epoch": 1.4839164123033575, + "grad_norm": 6.313601558241577, + "learning_rate": 8.232066523816565e-07, + "loss": 0.5491, + "step": 20540 + }, + { + "epoch": 1.4839886575035672, + "grad_norm": 7.043101448590438, + "learning_rate": 8.229897110199295e-07, + "loss": 0.5984, + "step": 20541 + }, + { + "epoch": 1.4840609027037766, + "grad_norm": 7.3195302702062, + "learning_rate": 8.22772792615171e-07, + "loss": 0.5858, + "step": 20542 + }, + { + "epoch": 1.484133147903986, + "grad_norm": 7.811830049399636, + "learning_rate": 8.225558971703507e-07, + "loss": 0.6434, + "step": 20543 + }, + { + "epoch": 1.4842053931041956, + "grad_norm": 8.23395805099509, + "learning_rate": 8.223390246884374e-07, + "loss": 0.6454, + "step": 20544 + }, + { + "epoch": 1.4842776383044052, + "grad_norm": 7.105409425862162, + "learning_rate": 8.221221751724006e-07, + "loss": 0.6254, + "step": 20545 + }, + { + "epoch": 1.4843498835046147, + "grad_norm": 7.073437348996299, + "learning_rate": 8.219053486252094e-07, + "loss": 0.5578, + "step": 20546 + }, + { + "epoch": 1.484422128704824, + "grad_norm": 7.621166829160127, + "learning_rate": 8.2168854504983e-07, + "loss": 0.6172, + "step": 20547 + }, + { + "epoch": 1.4844943739050338, + "grad_norm": 6.949470982897811, + "learning_rate": 8.214717644492312e-07, + "loss": 0.6774, + "step": 20548 + }, + { + "epoch": 1.4845666191052431, + "grad_norm": 6.793752369533712, + "learning_rate": 8.212550068263808e-07, + "loss": 0.6655, + "step": 20549 + }, + { + "epoch": 1.4846388643054527, + "grad_norm": 8.921365325853648, + "learning_rate": 8.210382721842467e-07, + "loss": 0.6435, + "step": 20550 + }, + { + "epoch": 1.4847111095056622, + "grad_norm": 7.141591613921178, + "learning_rate": 8.208215605257941e-07, + "loss": 0.619, + "step": 20551 + }, + { + "epoch": 1.4847833547058717, + "grad_norm": 6.211389220503251, + "learning_rate": 8.206048718539905e-07, + "loss": 0.611, + "step": 20552 + }, + { + "epoch": 1.4848555999060813, + "grad_norm": 7.175741910956403, + "learning_rate": 8.203882061718024e-07, + "loss": 0.6616, + "step": 20553 + }, + { + "epoch": 1.4849278451062908, + "grad_norm": 7.121506754856914, + "learning_rate": 8.201715634821958e-07, + "loss": 0.6879, + "step": 20554 + }, + { + "epoch": 1.4850000903065004, + "grad_norm": 7.915699494828719, + "learning_rate": 8.199549437881357e-07, + "loss": 0.6526, + "step": 20555 + }, + { + "epoch": 1.4850723355067097, + "grad_norm": 6.853273497789071, + "learning_rate": 8.197383470925882e-07, + "loss": 0.6062, + "step": 20556 + }, + { + "epoch": 1.4851445807069192, + "grad_norm": 7.879687654917706, + "learning_rate": 8.195217733985192e-07, + "loss": 0.6943, + "step": 20557 + }, + { + "epoch": 1.4852168259071288, + "grad_norm": 7.777802833637872, + "learning_rate": 8.19305222708891e-07, + "loss": 0.6564, + "step": 20558 + }, + { + "epoch": 1.4852890711073383, + "grad_norm": 6.682423598983381, + "learning_rate": 8.190886950266697e-07, + "loss": 0.5728, + "step": 20559 + }, + { + "epoch": 1.4853613163075479, + "grad_norm": 7.507141527883815, + "learning_rate": 8.188721903548197e-07, + "loss": 0.6777, + "step": 20560 + }, + { + "epoch": 1.4854335615077574, + "grad_norm": 7.494693913638559, + "learning_rate": 8.186557086963032e-07, + "loss": 0.6292, + "step": 20561 + }, + { + "epoch": 1.485505806707967, + "grad_norm": 9.017064764667872, + "learning_rate": 8.184392500540847e-07, + "loss": 0.6631, + "step": 20562 + }, + { + "epoch": 1.4855780519081763, + "grad_norm": 6.298674598419682, + "learning_rate": 8.182228144311263e-07, + "loss": 0.5918, + "step": 20563 + }, + { + "epoch": 1.4856502971083858, + "grad_norm": 6.213514721778158, + "learning_rate": 8.180064018303935e-07, + "loss": 0.6212, + "step": 20564 + }, + { + "epoch": 1.4857225423085954, + "grad_norm": 6.455284366977564, + "learning_rate": 8.17790012254846e-07, + "loss": 0.6546, + "step": 20565 + }, + { + "epoch": 1.485794787508805, + "grad_norm": 8.935320395012338, + "learning_rate": 8.175736457074473e-07, + "loss": 0.6522, + "step": 20566 + }, + { + "epoch": 1.4858670327090144, + "grad_norm": 6.643034087371999, + "learning_rate": 8.17357302191159e-07, + "loss": 0.6403, + "step": 20567 + }, + { + "epoch": 1.485939277909224, + "grad_norm": 8.411375390619186, + "learning_rate": 8.171409817089437e-07, + "loss": 0.6891, + "step": 20568 + }, + { + "epoch": 1.4860115231094335, + "grad_norm": 6.805222289906411, + "learning_rate": 8.169246842637607e-07, + "loss": 0.6175, + "step": 20569 + }, + { + "epoch": 1.4860837683096428, + "grad_norm": 7.318334876193315, + "learning_rate": 8.16708409858572e-07, + "loss": 0.6057, + "step": 20570 + }, + { + "epoch": 1.4861560135098524, + "grad_norm": 5.366045390275148, + "learning_rate": 8.164921584963384e-07, + "loss": 0.5744, + "step": 20571 + }, + { + "epoch": 1.486228258710062, + "grad_norm": 6.935999527611106, + "learning_rate": 8.162759301800197e-07, + "loss": 0.5484, + "step": 20572 + }, + { + "epoch": 1.4863005039102715, + "grad_norm": 7.552159577785025, + "learning_rate": 8.160597249125763e-07, + "loss": 0.6159, + "step": 20573 + }, + { + "epoch": 1.486372749110481, + "grad_norm": 7.1491757538012894, + "learning_rate": 8.158435426969677e-07, + "loss": 0.658, + "step": 20574 + }, + { + "epoch": 1.4864449943106905, + "grad_norm": 6.103752495403667, + "learning_rate": 8.156273835361542e-07, + "loss": 0.6226, + "step": 20575 + }, + { + "epoch": 1.4865172395109, + "grad_norm": 8.06777001183519, + "learning_rate": 8.154112474330933e-07, + "loss": 0.6086, + "step": 20576 + }, + { + "epoch": 1.4865894847111094, + "grad_norm": 7.807985512047756, + "learning_rate": 8.151951343907444e-07, + "loss": 0.5838, + "step": 20577 + }, + { + "epoch": 1.486661729911319, + "grad_norm": 7.054334122603641, + "learning_rate": 8.149790444120664e-07, + "loss": 0.6972, + "step": 20578 + }, + { + "epoch": 1.4867339751115285, + "grad_norm": 7.101740030182661, + "learning_rate": 8.147629775000165e-07, + "loss": 0.6168, + "step": 20579 + }, + { + "epoch": 1.486806220311738, + "grad_norm": 6.133608004736015, + "learning_rate": 8.145469336575529e-07, + "loss": 0.6167, + "step": 20580 + }, + { + "epoch": 1.4868784655119476, + "grad_norm": 5.890923530441039, + "learning_rate": 8.143309128876331e-07, + "loss": 0.6348, + "step": 20581 + }, + { + "epoch": 1.4869507107121571, + "grad_norm": 6.6298790006111075, + "learning_rate": 8.14114915193214e-07, + "loss": 0.6448, + "step": 20582 + }, + { + "epoch": 1.4870229559123667, + "grad_norm": 6.944995851453691, + "learning_rate": 8.13898940577253e-07, + "loss": 0.5654, + "step": 20583 + }, + { + "epoch": 1.487095201112576, + "grad_norm": 6.634349343910149, + "learning_rate": 8.136829890427062e-07, + "loss": 0.6159, + "step": 20584 + }, + { + "epoch": 1.4871674463127855, + "grad_norm": 6.266839497124116, + "learning_rate": 8.13467060592531e-07, + "loss": 0.608, + "step": 20585 + }, + { + "epoch": 1.487239691512995, + "grad_norm": 7.444827533310119, + "learning_rate": 8.132511552296812e-07, + "loss": 0.6879, + "step": 20586 + }, + { + "epoch": 1.4873119367132046, + "grad_norm": 6.728376722078586, + "learning_rate": 8.130352729571134e-07, + "loss": 0.5854, + "step": 20587 + }, + { + "epoch": 1.4873841819134141, + "grad_norm": 6.440542446534704, + "learning_rate": 8.128194137777828e-07, + "loss": 0.5926, + "step": 20588 + }, + { + "epoch": 1.4874564271136237, + "grad_norm": 6.676350649082323, + "learning_rate": 8.126035776946453e-07, + "loss": 0.6139, + "step": 20589 + }, + { + "epoch": 1.4875286723138332, + "grad_norm": 6.838757230949728, + "learning_rate": 8.12387764710654e-07, + "loss": 0.6509, + "step": 20590 + }, + { + "epoch": 1.4876009175140426, + "grad_norm": 6.7066922599248535, + "learning_rate": 8.121719748287629e-07, + "loss": 0.5817, + "step": 20591 + }, + { + "epoch": 1.487673162714252, + "grad_norm": 7.242655486918957, + "learning_rate": 8.119562080519278e-07, + "loss": 0.5294, + "step": 20592 + }, + { + "epoch": 1.4877454079144616, + "grad_norm": 6.4114584986058905, + "learning_rate": 8.117404643831022e-07, + "loss": 0.6759, + "step": 20593 + }, + { + "epoch": 1.4878176531146712, + "grad_norm": 7.123596320836571, + "learning_rate": 8.11524743825238e-07, + "loss": 0.624, + "step": 20594 + }, + { + "epoch": 1.4878898983148807, + "grad_norm": 5.638390095682787, + "learning_rate": 8.113090463812892e-07, + "loss": 0.6114, + "step": 20595 + }, + { + "epoch": 1.4879621435150903, + "grad_norm": 8.811732481952461, + "learning_rate": 8.110933720542091e-07, + "loss": 0.5701, + "step": 20596 + }, + { + "epoch": 1.4880343887152998, + "grad_norm": 6.478918661622924, + "learning_rate": 8.108777208469487e-07, + "loss": 0.5771, + "step": 20597 + }, + { + "epoch": 1.4881066339155091, + "grad_norm": 6.763254221601878, + "learning_rate": 8.106620927624606e-07, + "loss": 0.6002, + "step": 20598 + }, + { + "epoch": 1.4881788791157187, + "grad_norm": 6.9259822878675825, + "learning_rate": 8.104464878036969e-07, + "loss": 0.5786, + "step": 20599 + }, + { + "epoch": 1.4882511243159282, + "grad_norm": 8.859701375987775, + "learning_rate": 8.102309059736089e-07, + "loss": 0.6631, + "step": 20600 + }, + { + "epoch": 1.4883233695161378, + "grad_norm": 6.890290059709649, + "learning_rate": 8.100153472751476e-07, + "loss": 0.5952, + "step": 20601 + }, + { + "epoch": 1.4883956147163473, + "grad_norm": 6.308389363949209, + "learning_rate": 8.097998117112641e-07, + "loss": 0.6196, + "step": 20602 + }, + { + "epoch": 1.4884678599165568, + "grad_norm": 6.511112983596295, + "learning_rate": 8.095842992849099e-07, + "loss": 0.6322, + "step": 20603 + }, + { + "epoch": 1.4885401051167664, + "grad_norm": 6.576267297254683, + "learning_rate": 8.09368809999033e-07, + "loss": 0.6209, + "step": 20604 + }, + { + "epoch": 1.4886123503169757, + "grad_norm": 7.391777739036161, + "learning_rate": 8.091533438565844e-07, + "loss": 0.6731, + "step": 20605 + }, + { + "epoch": 1.4886845955171855, + "grad_norm": 8.178734666476771, + "learning_rate": 8.089379008605138e-07, + "loss": 0.6549, + "step": 20606 + }, + { + "epoch": 1.4887568407173948, + "grad_norm": 7.755105828572136, + "learning_rate": 8.087224810137712e-07, + "loss": 0.6229, + "step": 20607 + }, + { + "epoch": 1.4888290859176043, + "grad_norm": 6.138826917486073, + "learning_rate": 8.085070843193035e-07, + "loss": 0.5543, + "step": 20608 + }, + { + "epoch": 1.4889013311178139, + "grad_norm": 7.0820643765600035, + "learning_rate": 8.082917107800605e-07, + "loss": 0.6491, + "step": 20609 + }, + { + "epoch": 1.4889735763180234, + "grad_norm": 6.362074970071061, + "learning_rate": 8.080763603989908e-07, + "loss": 0.6691, + "step": 20610 + }, + { + "epoch": 1.489045821518233, + "grad_norm": 8.5345819135996, + "learning_rate": 8.078610331790418e-07, + "loss": 0.5926, + "step": 20611 + }, + { + "epoch": 1.4891180667184423, + "grad_norm": 7.283827165961157, + "learning_rate": 8.076457291231615e-07, + "loss": 0.631, + "step": 20612 + }, + { + "epoch": 1.489190311918652, + "grad_norm": 5.512594368401527, + "learning_rate": 8.07430448234297e-07, + "loss": 0.5425, + "step": 20613 + }, + { + "epoch": 1.4892625571188614, + "grad_norm": 7.017266096423705, + "learning_rate": 8.072151905153963e-07, + "loss": 0.5688, + "step": 20614 + }, + { + "epoch": 1.489334802319071, + "grad_norm": 7.09277014329832, + "learning_rate": 8.069999559694048e-07, + "loss": 0.6326, + "step": 20615 + }, + { + "epoch": 1.4894070475192804, + "grad_norm": 7.038487486051857, + "learning_rate": 8.067847445992691e-07, + "loss": 0.6871, + "step": 20616 + }, + { + "epoch": 1.48947929271949, + "grad_norm": 7.076404488741528, + "learning_rate": 8.065695564079362e-07, + "loss": 0.5205, + "step": 20617 + }, + { + "epoch": 1.4895515379196995, + "grad_norm": 7.137609734167675, + "learning_rate": 8.063543913983507e-07, + "loss": 0.6727, + "step": 20618 + }, + { + "epoch": 1.4896237831199088, + "grad_norm": 7.132618094027196, + "learning_rate": 8.061392495734574e-07, + "loss": 0.6556, + "step": 20619 + }, + { + "epoch": 1.4896960283201186, + "grad_norm": 8.296889253018595, + "learning_rate": 8.059241309362037e-07, + "loss": 0.5612, + "step": 20620 + }, + { + "epoch": 1.489768273520328, + "grad_norm": 6.647694995626892, + "learning_rate": 8.057090354895339e-07, + "loss": 0.6192, + "step": 20621 + }, + { + "epoch": 1.4898405187205375, + "grad_norm": 7.06806577951319, + "learning_rate": 8.054939632363912e-07, + "loss": 0.5561, + "step": 20622 + }, + { + "epoch": 1.489912763920747, + "grad_norm": 8.416726090123872, + "learning_rate": 8.052789141797204e-07, + "loss": 0.6816, + "step": 20623 + }, + { + "epoch": 1.4899850091209565, + "grad_norm": 8.769450340662392, + "learning_rate": 8.050638883224654e-07, + "loss": 0.57, + "step": 20624 + }, + { + "epoch": 1.490057254321166, + "grad_norm": 7.467764434270269, + "learning_rate": 8.048488856675704e-07, + "loss": 0.634, + "step": 20625 + }, + { + "epoch": 1.4901294995213754, + "grad_norm": 6.241709894999132, + "learning_rate": 8.046339062179775e-07, + "loss": 0.6554, + "step": 20626 + }, + { + "epoch": 1.4902017447215852, + "grad_norm": 5.726595806946114, + "learning_rate": 8.044189499766297e-07, + "loss": 0.5633, + "step": 20627 + }, + { + "epoch": 1.4902739899217945, + "grad_norm": 7.773890435139834, + "learning_rate": 8.042040169464702e-07, + "loss": 0.6276, + "step": 20628 + }, + { + "epoch": 1.490346235122004, + "grad_norm": 6.521478818038213, + "learning_rate": 8.039891071304409e-07, + "loss": 0.5643, + "step": 20629 + }, + { + "epoch": 1.4904184803222136, + "grad_norm": 7.160040638557911, + "learning_rate": 8.037742205314839e-07, + "loss": 0.64, + "step": 20630 + }, + { + "epoch": 1.4904907255224231, + "grad_norm": 8.052200244155175, + "learning_rate": 8.035593571525404e-07, + "loss": 0.652, + "step": 20631 + }, + { + "epoch": 1.4905629707226327, + "grad_norm": 6.435353634238377, + "learning_rate": 8.033445169965534e-07, + "loss": 0.6156, + "step": 20632 + }, + { + "epoch": 1.4906352159228422, + "grad_norm": 7.665917594914616, + "learning_rate": 8.031297000664617e-07, + "loss": 0.64, + "step": 20633 + }, + { + "epoch": 1.4907074611230517, + "grad_norm": 8.105039051110088, + "learning_rate": 8.029149063652067e-07, + "loss": 0.6211, + "step": 20634 + }, + { + "epoch": 1.490779706323261, + "grad_norm": 7.276962131030044, + "learning_rate": 8.027001358957298e-07, + "loss": 0.6198, + "step": 20635 + }, + { + "epoch": 1.4908519515234706, + "grad_norm": 7.093847064055786, + "learning_rate": 8.024853886609693e-07, + "loss": 0.6014, + "step": 20636 + }, + { + "epoch": 1.4909241967236802, + "grad_norm": 7.272062997391944, + "learning_rate": 8.022706646638661e-07, + "loss": 0.6219, + "step": 20637 + }, + { + "epoch": 1.4909964419238897, + "grad_norm": 7.748971193999219, + "learning_rate": 8.020559639073591e-07, + "loss": 0.6359, + "step": 20638 + }, + { + "epoch": 1.4910686871240992, + "grad_norm": 7.59970173501445, + "learning_rate": 8.018412863943875e-07, + "loss": 0.6493, + "step": 20639 + }, + { + "epoch": 1.4911409323243088, + "grad_norm": 8.060838631668592, + "learning_rate": 8.016266321278901e-07, + "loss": 0.5951, + "step": 20640 + }, + { + "epoch": 1.4912131775245183, + "grad_norm": 6.639428603164106, + "learning_rate": 8.014120011108057e-07, + "loss": 0.5518, + "step": 20641 + }, + { + "epoch": 1.4912854227247276, + "grad_norm": 7.0138513578747395, + "learning_rate": 8.011973933460726e-07, + "loss": 0.5891, + "step": 20642 + }, + { + "epoch": 1.4913576679249372, + "grad_norm": 6.1122487521655735, + "learning_rate": 8.009828088366275e-07, + "loss": 0.5709, + "step": 20643 + }, + { + "epoch": 1.4914299131251467, + "grad_norm": 6.815622436578925, + "learning_rate": 8.007682475854086e-07, + "loss": 0.6394, + "step": 20644 + }, + { + "epoch": 1.4915021583253563, + "grad_norm": 6.522422556243751, + "learning_rate": 8.005537095953531e-07, + "loss": 0.6002, + "step": 20645 + }, + { + "epoch": 1.4915744035255658, + "grad_norm": 8.070317226825818, + "learning_rate": 8.003391948693984e-07, + "loss": 0.6984, + "step": 20646 + }, + { + "epoch": 1.4916466487257753, + "grad_norm": 6.036955511526032, + "learning_rate": 8.00124703410479e-07, + "loss": 0.5559, + "step": 20647 + }, + { + "epoch": 1.491718893925985, + "grad_norm": 7.392504076164807, + "learning_rate": 7.999102352215332e-07, + "loss": 0.6626, + "step": 20648 + }, + { + "epoch": 1.4917911391261942, + "grad_norm": 7.447654908033158, + "learning_rate": 7.996957903054964e-07, + "loss": 0.6735, + "step": 20649 + }, + { + "epoch": 1.4918633843264038, + "grad_norm": 5.946433003087235, + "learning_rate": 7.994813686653047e-07, + "loss": 0.6134, + "step": 20650 + }, + { + "epoch": 1.4919356295266133, + "grad_norm": 6.537544685050589, + "learning_rate": 7.992669703038919e-07, + "loss": 0.5275, + "step": 20651 + }, + { + "epoch": 1.4920078747268228, + "grad_norm": 6.887937268121009, + "learning_rate": 7.990525952241937e-07, + "loss": 0.6138, + "step": 20652 + }, + { + "epoch": 1.4920801199270324, + "grad_norm": 7.357800876221839, + "learning_rate": 7.988382434291455e-07, + "loss": 0.6441, + "step": 20653 + }, + { + "epoch": 1.492152365127242, + "grad_norm": 8.907715579798557, + "learning_rate": 7.986239149216802e-07, + "loss": 0.61, + "step": 20654 + }, + { + "epoch": 1.4922246103274515, + "grad_norm": 7.498417750986574, + "learning_rate": 7.984096097047325e-07, + "loss": 0.6651, + "step": 20655 + }, + { + "epoch": 1.4922968555276608, + "grad_norm": 6.360057297454505, + "learning_rate": 7.98195327781236e-07, + "loss": 0.6053, + "step": 20656 + }, + { + "epoch": 1.4923691007278703, + "grad_norm": 8.301035382139228, + "learning_rate": 7.97981069154124e-07, + "loss": 0.6345, + "step": 20657 + }, + { + "epoch": 1.4924413459280799, + "grad_norm": 8.123931814549, + "learning_rate": 7.977668338263297e-07, + "loss": 0.6058, + "step": 20658 + }, + { + "epoch": 1.4925135911282894, + "grad_norm": 7.388557257735104, + "learning_rate": 7.975526218007859e-07, + "loss": 0.6233, + "step": 20659 + }, + { + "epoch": 1.492585836328499, + "grad_norm": 7.249466712017, + "learning_rate": 7.973384330804257e-07, + "loss": 0.6405, + "step": 20660 + }, + { + "epoch": 1.4926580815287085, + "grad_norm": 5.694896206600703, + "learning_rate": 7.971242676681793e-07, + "loss": 0.5978, + "step": 20661 + }, + { + "epoch": 1.492730326728918, + "grad_norm": 7.210579876695294, + "learning_rate": 7.969101255669795e-07, + "loss": 0.6077, + "step": 20662 + }, + { + "epoch": 1.4928025719291274, + "grad_norm": 6.923322332548249, + "learning_rate": 7.966960067797577e-07, + "loss": 0.6, + "step": 20663 + }, + { + "epoch": 1.492874817129337, + "grad_norm": 6.674029893423504, + "learning_rate": 7.964819113094457e-07, + "loss": 0.6109, + "step": 20664 + }, + { + "epoch": 1.4929470623295464, + "grad_norm": 6.615964666196115, + "learning_rate": 7.962678391589732e-07, + "loss": 0.5525, + "step": 20665 + }, + { + "epoch": 1.493019307529756, + "grad_norm": 6.300548154205079, + "learning_rate": 7.96053790331271e-07, + "loss": 0.6205, + "step": 20666 + }, + { + "epoch": 1.4930915527299655, + "grad_norm": 6.391182798184891, + "learning_rate": 7.958397648292693e-07, + "loss": 0.5179, + "step": 20667 + }, + { + "epoch": 1.493163797930175, + "grad_norm": 7.699871745837313, + "learning_rate": 7.956257626558981e-07, + "loss": 0.6517, + "step": 20668 + }, + { + "epoch": 1.4932360431303846, + "grad_norm": 6.7279886291262665, + "learning_rate": 7.954117838140868e-07, + "loss": 0.6165, + "step": 20669 + }, + { + "epoch": 1.493308288330594, + "grad_norm": 8.777046473604015, + "learning_rate": 7.951978283067646e-07, + "loss": 0.6691, + "step": 20670 + }, + { + "epoch": 1.4933805335308035, + "grad_norm": 6.696131474761747, + "learning_rate": 7.949838961368611e-07, + "loss": 0.5387, + "step": 20671 + }, + { + "epoch": 1.493452778731013, + "grad_norm": 7.654787009457301, + "learning_rate": 7.947699873073036e-07, + "loss": 0.6985, + "step": 20672 + }, + { + "epoch": 1.4935250239312226, + "grad_norm": 7.738754234898166, + "learning_rate": 7.94556101821021e-07, + "loss": 0.6377, + "step": 20673 + }, + { + "epoch": 1.493597269131432, + "grad_norm": 8.750880169877691, + "learning_rate": 7.943422396809411e-07, + "loss": 0.6349, + "step": 20674 + }, + { + "epoch": 1.4936695143316416, + "grad_norm": 7.283324114039309, + "learning_rate": 7.941284008899924e-07, + "loss": 0.6473, + "step": 20675 + }, + { + "epoch": 1.4937417595318512, + "grad_norm": 7.261604492864578, + "learning_rate": 7.939145854510996e-07, + "loss": 0.6435, + "step": 20676 + }, + { + "epoch": 1.4938140047320605, + "grad_norm": 6.9340457049003135, + "learning_rate": 7.937007933671925e-07, + "loss": 0.5915, + "step": 20677 + }, + { + "epoch": 1.4938862499322703, + "grad_norm": 6.838377354767094, + "learning_rate": 7.934870246411974e-07, + "loss": 0.5881, + "step": 20678 + }, + { + "epoch": 1.4939584951324796, + "grad_norm": 7.08504540191296, + "learning_rate": 7.932732792760392e-07, + "loss": 0.6097, + "step": 20679 + }, + { + "epoch": 1.4940307403326891, + "grad_norm": 6.936605034914442, + "learning_rate": 7.930595572746444e-07, + "loss": 0.6977, + "step": 20680 + }, + { + "epoch": 1.4941029855328987, + "grad_norm": 6.157522621872142, + "learning_rate": 7.92845858639939e-07, + "loss": 0.5696, + "step": 20681 + }, + { + "epoch": 1.4941752307331082, + "grad_norm": 8.776549251981724, + "learning_rate": 7.92632183374849e-07, + "loss": 0.6836, + "step": 20682 + }, + { + "epoch": 1.4942474759333177, + "grad_norm": 6.605722270230999, + "learning_rate": 7.924185314822982e-07, + "loss": 0.6096, + "step": 20683 + }, + { + "epoch": 1.494319721133527, + "grad_norm": 6.937338337432291, + "learning_rate": 7.922049029652115e-07, + "loss": 0.5483, + "step": 20684 + }, + { + "epoch": 1.4943919663337368, + "grad_norm": 6.281810223092949, + "learning_rate": 7.91991297826514e-07, + "loss": 0.5823, + "step": 20685 + }, + { + "epoch": 1.4944642115339462, + "grad_norm": 7.266741398185001, + "learning_rate": 7.917777160691293e-07, + "loss": 0.6156, + "step": 20686 + }, + { + "epoch": 1.4945364567341557, + "grad_norm": 7.417296850741615, + "learning_rate": 7.915641576959812e-07, + "loss": 0.6464, + "step": 20687 + }, + { + "epoch": 1.4946087019343652, + "grad_norm": 5.939825947584462, + "learning_rate": 7.913506227099932e-07, + "loss": 0.5248, + "step": 20688 + }, + { + "epoch": 1.4946809471345748, + "grad_norm": 6.542585110021497, + "learning_rate": 7.911371111140895e-07, + "loss": 0.7257, + "step": 20689 + }, + { + "epoch": 1.4947531923347843, + "grad_norm": 7.67210272860839, + "learning_rate": 7.909236229111911e-07, + "loss": 0.5885, + "step": 20690 + }, + { + "epoch": 1.4948254375349936, + "grad_norm": 8.134359749403442, + "learning_rate": 7.907101581042212e-07, + "loss": 0.6481, + "step": 20691 + }, + { + "epoch": 1.4948976827352034, + "grad_norm": 6.509821587386967, + "learning_rate": 7.90496716696103e-07, + "loss": 0.5617, + "step": 20692 + }, + { + "epoch": 1.4949699279354127, + "grad_norm": 5.72235394092712, + "learning_rate": 7.902832986897566e-07, + "loss": 0.5855, + "step": 20693 + }, + { + "epoch": 1.4950421731356223, + "grad_norm": 6.473283588808896, + "learning_rate": 7.900699040881041e-07, + "loss": 0.6218, + "step": 20694 + }, + { + "epoch": 1.4951144183358318, + "grad_norm": 7.326391604776065, + "learning_rate": 7.898565328940671e-07, + "loss": 0.6022, + "step": 20695 + }, + { + "epoch": 1.4951866635360413, + "grad_norm": 6.962057278321606, + "learning_rate": 7.896431851105663e-07, + "loss": 0.6005, + "step": 20696 + }, + { + "epoch": 1.495258908736251, + "grad_norm": 7.541117555077111, + "learning_rate": 7.894298607405224e-07, + "loss": 0.6408, + "step": 20697 + }, + { + "epoch": 1.4953311539364602, + "grad_norm": 6.7093388848294415, + "learning_rate": 7.892165597868553e-07, + "loss": 0.6134, + "step": 20698 + }, + { + "epoch": 1.49540339913667, + "grad_norm": 6.441456884181674, + "learning_rate": 7.890032822524854e-07, + "loss": 0.6529, + "step": 20699 + }, + { + "epoch": 1.4954756443368793, + "grad_norm": 7.439554395559736, + "learning_rate": 7.887900281403327e-07, + "loss": 0.7059, + "step": 20700 + }, + { + "epoch": 1.4955478895370888, + "grad_norm": 7.685877434917895, + "learning_rate": 7.885767974533152e-07, + "loss": 0.739, + "step": 20701 + }, + { + "epoch": 1.4956201347372984, + "grad_norm": 5.63472517451902, + "learning_rate": 7.883635901943523e-07, + "loss": 0.6829, + "step": 20702 + }, + { + "epoch": 1.495692379937508, + "grad_norm": 7.946527588841232, + "learning_rate": 7.881504063663637e-07, + "loss": 0.6616, + "step": 20703 + }, + { + "epoch": 1.4957646251377175, + "grad_norm": 6.290139962535891, + "learning_rate": 7.879372459722653e-07, + "loss": 0.6153, + "step": 20704 + }, + { + "epoch": 1.495836870337927, + "grad_norm": 6.910804593047832, + "learning_rate": 7.877241090149775e-07, + "loss": 0.6502, + "step": 20705 + }, + { + "epoch": 1.4959091155381365, + "grad_norm": 6.960874203994631, + "learning_rate": 7.875109954974172e-07, + "loss": 0.677, + "step": 20706 + }, + { + "epoch": 1.4959813607383459, + "grad_norm": 6.258348306298493, + "learning_rate": 7.872979054225025e-07, + "loss": 0.6712, + "step": 20707 + }, + { + "epoch": 1.4960536059385554, + "grad_norm": 6.612625657385786, + "learning_rate": 7.870848387931487e-07, + "loss": 0.6332, + "step": 20708 + }, + { + "epoch": 1.496125851138765, + "grad_norm": 6.904710416805429, + "learning_rate": 7.868717956122737e-07, + "loss": 0.5258, + "step": 20709 + }, + { + "epoch": 1.4961980963389745, + "grad_norm": 6.186077840725387, + "learning_rate": 7.866587758827942e-07, + "loss": 0.5897, + "step": 20710 + }, + { + "epoch": 1.496270341539184, + "grad_norm": 7.5135360638070345, + "learning_rate": 7.86445779607625e-07, + "loss": 0.6571, + "step": 20711 + }, + { + "epoch": 1.4963425867393936, + "grad_norm": 7.518319707593309, + "learning_rate": 7.862328067896827e-07, + "loss": 0.5772, + "step": 20712 + }, + { + "epoch": 1.4964148319396031, + "grad_norm": 6.901901325927646, + "learning_rate": 7.860198574318825e-07, + "loss": 0.6777, + "step": 20713 + }, + { + "epoch": 1.4964870771398124, + "grad_norm": 6.844818449879909, + "learning_rate": 7.858069315371397e-07, + "loss": 0.6164, + "step": 20714 + }, + { + "epoch": 1.496559322340022, + "grad_norm": 7.510251985632394, + "learning_rate": 7.855940291083692e-07, + "loss": 0.6084, + "step": 20715 + }, + { + "epoch": 1.4966315675402315, + "grad_norm": 7.76913108965112, + "learning_rate": 7.853811501484851e-07, + "loss": 0.6787, + "step": 20716 + }, + { + "epoch": 1.496703812740441, + "grad_norm": 8.295073405211769, + "learning_rate": 7.851682946604025e-07, + "loss": 0.6371, + "step": 20717 + }, + { + "epoch": 1.4967760579406506, + "grad_norm": 7.338315860549303, + "learning_rate": 7.849554626470338e-07, + "loss": 0.5858, + "step": 20718 + }, + { + "epoch": 1.4968483031408601, + "grad_norm": 5.82060120141788, + "learning_rate": 7.847426541112929e-07, + "loss": 0.5389, + "step": 20719 + }, + { + "epoch": 1.4969205483410697, + "grad_norm": 6.947752653718744, + "learning_rate": 7.845298690560935e-07, + "loss": 0.6151, + "step": 20720 + }, + { + "epoch": 1.496992793541279, + "grad_norm": 7.0239884921003535, + "learning_rate": 7.843171074843492e-07, + "loss": 0.596, + "step": 20721 + }, + { + "epoch": 1.4970650387414886, + "grad_norm": 6.007532477828811, + "learning_rate": 7.841043693989703e-07, + "loss": 0.5975, + "step": 20722 + }, + { + "epoch": 1.497137283941698, + "grad_norm": 7.2674516248060765, + "learning_rate": 7.838916548028705e-07, + "loss": 0.6254, + "step": 20723 + }, + { + "epoch": 1.4972095291419076, + "grad_norm": 6.410066388957282, + "learning_rate": 7.836789636989614e-07, + "loss": 0.6726, + "step": 20724 + }, + { + "epoch": 1.4972817743421172, + "grad_norm": 7.666500532036428, + "learning_rate": 7.834662960901548e-07, + "loss": 0.636, + "step": 20725 + }, + { + "epoch": 1.4973540195423267, + "grad_norm": 6.304787333665895, + "learning_rate": 7.832536519793618e-07, + "loss": 0.6248, + "step": 20726 + }, + { + "epoch": 1.4974262647425363, + "grad_norm": 6.510468196410655, + "learning_rate": 7.830410313694934e-07, + "loss": 0.6625, + "step": 20727 + }, + { + "epoch": 1.4974985099427456, + "grad_norm": 7.6409892051969, + "learning_rate": 7.828284342634607e-07, + "loss": 0.5775, + "step": 20728 + }, + { + "epoch": 1.4975707551429551, + "grad_norm": 7.423131061476645, + "learning_rate": 7.826158606641726e-07, + "loss": 0.6371, + "step": 20729 + }, + { + "epoch": 1.4976430003431647, + "grad_norm": 8.139960029372697, + "learning_rate": 7.824033105745402e-07, + "loss": 0.5753, + "step": 20730 + }, + { + "epoch": 1.4977152455433742, + "grad_norm": 6.651882117533834, + "learning_rate": 7.821907839974727e-07, + "loss": 0.5845, + "step": 20731 + }, + { + "epoch": 1.4977874907435837, + "grad_norm": 7.283217528562454, + "learning_rate": 7.819782809358795e-07, + "loss": 0.6562, + "step": 20732 + }, + { + "epoch": 1.4978597359437933, + "grad_norm": 6.926565815421184, + "learning_rate": 7.817658013926699e-07, + "loss": 0.6645, + "step": 20733 + }, + { + "epoch": 1.4979319811440028, + "grad_norm": 5.90926481938036, + "learning_rate": 7.815533453707522e-07, + "loss": 0.5722, + "step": 20734 + }, + { + "epoch": 1.4980042263442122, + "grad_norm": 5.6164315979136585, + "learning_rate": 7.813409128730357e-07, + "loss": 0.4802, + "step": 20735 + }, + { + "epoch": 1.4980764715444217, + "grad_norm": 7.21917168512676, + "learning_rate": 7.811285039024269e-07, + "loss": 0.6004, + "step": 20736 + }, + { + "epoch": 1.4981487167446312, + "grad_norm": 6.446520660914056, + "learning_rate": 7.809161184618342e-07, + "loss": 0.6062, + "step": 20737 + }, + { + "epoch": 1.4982209619448408, + "grad_norm": 7.919695733130856, + "learning_rate": 7.80703756554165e-07, + "loss": 0.5663, + "step": 20738 + }, + { + "epoch": 1.4982932071450503, + "grad_norm": 6.670504069576476, + "learning_rate": 7.804914181823273e-07, + "loss": 0.6777, + "step": 20739 + }, + { + "epoch": 1.4983654523452599, + "grad_norm": 7.099637583764454, + "learning_rate": 7.802791033492263e-07, + "loss": 0.6286, + "step": 20740 + }, + { + "epoch": 1.4984376975454694, + "grad_norm": 8.238640102278001, + "learning_rate": 7.80066812057769e-07, + "loss": 0.5888, + "step": 20741 + }, + { + "epoch": 1.4985099427456787, + "grad_norm": 7.603235168536889, + "learning_rate": 7.798545443108615e-07, + "loss": 0.6193, + "step": 20742 + }, + { + "epoch": 1.4985821879458883, + "grad_norm": 6.003434469813892, + "learning_rate": 7.796423001114098e-07, + "loss": 0.6132, + "step": 20743 + }, + { + "epoch": 1.4986544331460978, + "grad_norm": 8.178778975929681, + "learning_rate": 7.794300794623191e-07, + "loss": 0.6126, + "step": 20744 + }, + { + "epoch": 1.4987266783463074, + "grad_norm": 7.256587225859268, + "learning_rate": 7.792178823664948e-07, + "loss": 0.6523, + "step": 20745 + }, + { + "epoch": 1.498798923546517, + "grad_norm": 7.698288952846189, + "learning_rate": 7.790057088268424e-07, + "loss": 0.7251, + "step": 20746 + }, + { + "epoch": 1.4988711687467264, + "grad_norm": 8.82948430491953, + "learning_rate": 7.787935588462647e-07, + "loss": 0.6227, + "step": 20747 + }, + { + "epoch": 1.498943413946936, + "grad_norm": 7.051564350123705, + "learning_rate": 7.78581432427667e-07, + "loss": 0.6018, + "step": 20748 + }, + { + "epoch": 1.4990156591471453, + "grad_norm": 7.163336164788417, + "learning_rate": 7.783693295739533e-07, + "loss": 0.6407, + "step": 20749 + }, + { + "epoch": 1.499087904347355, + "grad_norm": 6.628777920273736, + "learning_rate": 7.781572502880263e-07, + "loss": 0.5849, + "step": 20750 + }, + { + "epoch": 1.4991601495475644, + "grad_norm": 8.435573781865516, + "learning_rate": 7.779451945727894e-07, + "loss": 0.698, + "step": 20751 + }, + { + "epoch": 1.499232394747774, + "grad_norm": 7.037811608470927, + "learning_rate": 7.777331624311457e-07, + "loss": 0.5872, + "step": 20752 + }, + { + "epoch": 1.4993046399479835, + "grad_norm": 6.716844013355964, + "learning_rate": 7.775211538659977e-07, + "loss": 0.6017, + "step": 20753 + }, + { + "epoch": 1.499376885148193, + "grad_norm": 7.007992677275433, + "learning_rate": 7.773091688802478e-07, + "loss": 0.6101, + "step": 20754 + }, + { + "epoch": 1.4994491303484025, + "grad_norm": 7.8946684797573, + "learning_rate": 7.77097207476798e-07, + "loss": 0.5992, + "step": 20755 + }, + { + "epoch": 1.4995213755486119, + "grad_norm": 6.216485152607371, + "learning_rate": 7.768852696585493e-07, + "loss": 0.5894, + "step": 20756 + }, + { + "epoch": 1.4995936207488216, + "grad_norm": 7.653542799038167, + "learning_rate": 7.766733554284042e-07, + "loss": 0.6596, + "step": 20757 + }, + { + "epoch": 1.499665865949031, + "grad_norm": 6.622477789064331, + "learning_rate": 7.764614647892621e-07, + "loss": 0.5516, + "step": 20758 + }, + { + "epoch": 1.4997381111492405, + "grad_norm": 7.107916713847056, + "learning_rate": 7.762495977440243e-07, + "loss": 0.6193, + "step": 20759 + }, + { + "epoch": 1.49981035634945, + "grad_norm": 6.034171706202239, + "learning_rate": 7.76037754295591e-07, + "loss": 0.5958, + "step": 20760 + }, + { + "epoch": 1.4998826015496596, + "grad_norm": 7.524101441936102, + "learning_rate": 7.758259344468624e-07, + "loss": 0.6072, + "step": 20761 + }, + { + "epoch": 1.4999548467498691, + "grad_norm": 7.646470655718855, + "learning_rate": 7.756141382007382e-07, + "loss": 0.6426, + "step": 20762 + }, + { + "epoch": 1.5000270919500784, + "grad_norm": 7.18846137210136, + "learning_rate": 7.754023655601173e-07, + "loss": 0.5782, + "step": 20763 + }, + { + "epoch": 1.5000993371502882, + "grad_norm": 66.25330180451387, + "learning_rate": 7.751906165278997e-07, + "loss": 0.6687, + "step": 20764 + }, + { + "epoch": 1.5001715823504975, + "grad_norm": 6.917115101199501, + "learning_rate": 7.749788911069828e-07, + "loss": 0.6546, + "step": 20765 + }, + { + "epoch": 1.500243827550707, + "grad_norm": 6.345589807871733, + "learning_rate": 7.747671893002651e-07, + "loss": 0.6214, + "step": 20766 + }, + { + "epoch": 1.5003160727509166, + "grad_norm": 7.69743833660269, + "learning_rate": 7.745555111106462e-07, + "loss": 0.5913, + "step": 20767 + }, + { + "epoch": 1.5003883179511261, + "grad_norm": 7.404137676341831, + "learning_rate": 7.743438565410216e-07, + "loss": 0.5985, + "step": 20768 + }, + { + "epoch": 1.5004605631513357, + "grad_norm": 6.382571641913638, + "learning_rate": 7.741322255942896e-07, + "loss": 0.5972, + "step": 20769 + }, + { + "epoch": 1.500532808351545, + "grad_norm": 7.509407247219307, + "learning_rate": 7.739206182733478e-07, + "loss": 0.6123, + "step": 20770 + }, + { + "epoch": 1.5006050535517548, + "grad_norm": 7.342531013773957, + "learning_rate": 7.737090345810922e-07, + "loss": 0.6038, + "step": 20771 + }, + { + "epoch": 1.500677298751964, + "grad_norm": 7.5782629256134, + "learning_rate": 7.734974745204196e-07, + "loss": 0.5844, + "step": 20772 + }, + { + "epoch": 1.5007495439521736, + "grad_norm": 7.2508699618637085, + "learning_rate": 7.73285938094226e-07, + "loss": 0.5707, + "step": 20773 + }, + { + "epoch": 1.5008217891523832, + "grad_norm": 6.683274404721, + "learning_rate": 7.730744253054081e-07, + "loss": 0.5873, + "step": 20774 + }, + { + "epoch": 1.5008940343525927, + "grad_norm": 7.088684155342176, + "learning_rate": 7.728629361568596e-07, + "loss": 0.6116, + "step": 20775 + }, + { + "epoch": 1.5009662795528023, + "grad_norm": 8.31431662994315, + "learning_rate": 7.726514706514765e-07, + "loss": 0.6152, + "step": 20776 + }, + { + "epoch": 1.5010385247530116, + "grad_norm": 6.276113659035567, + "learning_rate": 7.724400287921535e-07, + "loss": 0.6205, + "step": 20777 + }, + { + "epoch": 1.5011107699532213, + "grad_norm": 7.1135670805079725, + "learning_rate": 7.722286105817861e-07, + "loss": 0.6576, + "step": 20778 + }, + { + "epoch": 1.5011830151534307, + "grad_norm": 8.362059659440781, + "learning_rate": 7.720172160232664e-07, + "loss": 0.613, + "step": 20779 + }, + { + "epoch": 1.5012552603536402, + "grad_norm": 6.686446632778733, + "learning_rate": 7.718058451194896e-07, + "loss": 0.5973, + "step": 20780 + }, + { + "epoch": 1.5013275055538498, + "grad_norm": 7.1115177488622505, + "learning_rate": 7.71594497873348e-07, + "loss": 0.6629, + "step": 20781 + }, + { + "epoch": 1.5013997507540593, + "grad_norm": 7.412667242675525, + "learning_rate": 7.713831742877373e-07, + "loss": 0.6784, + "step": 20782 + }, + { + "epoch": 1.5014719959542688, + "grad_norm": 5.98201854714924, + "learning_rate": 7.71171874365548e-07, + "loss": 0.6464, + "step": 20783 + }, + { + "epoch": 1.5015442411544782, + "grad_norm": 7.03581069537216, + "learning_rate": 7.709605981096732e-07, + "loss": 0.6581, + "step": 20784 + }, + { + "epoch": 1.501616486354688, + "grad_norm": 7.741633421378583, + "learning_rate": 7.707493455230064e-07, + "loss": 0.6609, + "step": 20785 + }, + { + "epoch": 1.5016887315548972, + "grad_norm": 7.866825765951343, + "learning_rate": 7.705381166084375e-07, + "loss": 0.6207, + "step": 20786 + }, + { + "epoch": 1.5017609767551068, + "grad_norm": 6.562638926170856, + "learning_rate": 7.703269113688589e-07, + "loss": 0.6861, + "step": 20787 + }, + { + "epoch": 1.5018332219553163, + "grad_norm": 7.823663167134894, + "learning_rate": 7.701157298071618e-07, + "loss": 0.6626, + "step": 20788 + }, + { + "epoch": 1.5019054671555259, + "grad_norm": 6.9308569092384165, + "learning_rate": 7.699045719262374e-07, + "loss": 0.5866, + "step": 20789 + }, + { + "epoch": 1.5019777123557354, + "grad_norm": 6.1592307063696765, + "learning_rate": 7.696934377289759e-07, + "loss": 0.5647, + "step": 20790 + }, + { + "epoch": 1.5020499575559447, + "grad_norm": 6.780533009960937, + "learning_rate": 7.694823272182677e-07, + "loss": 0.547, + "step": 20791 + }, + { + "epoch": 1.5021222027561545, + "grad_norm": 7.710349511760531, + "learning_rate": 7.692712403970038e-07, + "loss": 0.592, + "step": 20792 + }, + { + "epoch": 1.5021944479563638, + "grad_norm": 7.605145980968787, + "learning_rate": 7.690601772680717e-07, + "loss": 0.6384, + "step": 20793 + }, + { + "epoch": 1.5022666931565736, + "grad_norm": 7.211862688649958, + "learning_rate": 7.688491378343621e-07, + "loss": 0.641, + "step": 20794 + }, + { + "epoch": 1.502338938356783, + "grad_norm": 6.442658662925784, + "learning_rate": 7.686381220987635e-07, + "loss": 0.5882, + "step": 20795 + }, + { + "epoch": 1.5024111835569924, + "grad_norm": 7.287217473987146, + "learning_rate": 7.684271300641655e-07, + "loss": 0.6235, + "step": 20796 + }, + { + "epoch": 1.502483428757202, + "grad_norm": 8.117492963754362, + "learning_rate": 7.682161617334546e-07, + "loss": 0.639, + "step": 20797 + }, + { + "epoch": 1.5025556739574113, + "grad_norm": 6.895077882012895, + "learning_rate": 7.680052171095201e-07, + "loss": 0.5834, + "step": 20798 + }, + { + "epoch": 1.502627919157621, + "grad_norm": 6.982021677779608, + "learning_rate": 7.677942961952492e-07, + "loss": 0.5684, + "step": 20799 + }, + { + "epoch": 1.5027001643578304, + "grad_norm": 7.058266510209882, + "learning_rate": 7.675833989935297e-07, + "loss": 0.647, + "step": 20800 + }, + { + "epoch": 1.5027724095580401, + "grad_norm": 8.077609313206283, + "learning_rate": 7.673725255072481e-07, + "loss": 0.5992, + "step": 20801 + }, + { + "epoch": 1.5028446547582495, + "grad_norm": 8.245070400735578, + "learning_rate": 7.671616757392913e-07, + "loss": 0.6298, + "step": 20802 + }, + { + "epoch": 1.502916899958459, + "grad_norm": 7.110689213608572, + "learning_rate": 7.669508496925466e-07, + "loss": 0.6439, + "step": 20803 + }, + { + "epoch": 1.5029891451586685, + "grad_norm": 5.991959588641604, + "learning_rate": 7.667400473698982e-07, + "loss": 0.6734, + "step": 20804 + }, + { + "epoch": 1.5030613903588779, + "grad_norm": 6.679335699800168, + "learning_rate": 7.665292687742329e-07, + "loss": 0.5864, + "step": 20805 + }, + { + "epoch": 1.5031336355590876, + "grad_norm": 6.502102438489067, + "learning_rate": 7.663185139084358e-07, + "loss": 0.6164, + "step": 20806 + }, + { + "epoch": 1.503205880759297, + "grad_norm": 7.193349829446558, + "learning_rate": 7.661077827753929e-07, + "loss": 0.5776, + "step": 20807 + }, + { + "epoch": 1.5032781259595067, + "grad_norm": 7.435931128434836, + "learning_rate": 7.658970753779876e-07, + "loss": 0.6553, + "step": 20808 + }, + { + "epoch": 1.503350371159716, + "grad_norm": 7.0031077434526345, + "learning_rate": 7.65686391719104e-07, + "loss": 0.6945, + "step": 20809 + }, + { + "epoch": 1.5034226163599256, + "grad_norm": 6.702430386777591, + "learning_rate": 7.654757318016287e-07, + "loss": 0.66, + "step": 20810 + }, + { + "epoch": 1.5034948615601351, + "grad_norm": 7.269369099984978, + "learning_rate": 7.65265095628443e-07, + "loss": 0.6185, + "step": 20811 + }, + { + "epoch": 1.5035671067603447, + "grad_norm": 6.956079022687497, + "learning_rate": 7.650544832024315e-07, + "loss": 0.5984, + "step": 20812 + }, + { + "epoch": 1.5036393519605542, + "grad_norm": 7.560453279823024, + "learning_rate": 7.648438945264767e-07, + "loss": 0.6607, + "step": 20813 + }, + { + "epoch": 1.5037115971607635, + "grad_norm": 7.457797523162209, + "learning_rate": 7.646333296034625e-07, + "loss": 0.6429, + "step": 20814 + }, + { + "epoch": 1.5037838423609733, + "grad_norm": 6.603442837331766, + "learning_rate": 7.6442278843627e-07, + "loss": 0.6026, + "step": 20815 + }, + { + "epoch": 1.5038560875611826, + "grad_norm": 7.089988215212046, + "learning_rate": 7.642122710277819e-07, + "loss": 0.6804, + "step": 20816 + }, + { + "epoch": 1.5039283327613922, + "grad_norm": 8.589963654517842, + "learning_rate": 7.6400177738088e-07, + "loss": 0.609, + "step": 20817 + }, + { + "epoch": 1.5040005779616017, + "grad_norm": 7.271741429615704, + "learning_rate": 7.637913074984457e-07, + "loss": 0.6159, + "step": 20818 + }, + { + "epoch": 1.5040728231618112, + "grad_norm": 5.743713591318059, + "learning_rate": 7.635808613833607e-07, + "loss": 0.6006, + "step": 20819 + }, + { + "epoch": 1.5041450683620208, + "grad_norm": 7.19239600569479, + "learning_rate": 7.633704390385049e-07, + "loss": 0.6398, + "step": 20820 + }, + { + "epoch": 1.50421731356223, + "grad_norm": 6.41098428105266, + "learning_rate": 7.631600404667608e-07, + "loss": 0.6349, + "step": 20821 + }, + { + "epoch": 1.5042895587624399, + "grad_norm": 7.5826317039896916, + "learning_rate": 7.629496656710058e-07, + "loss": 0.6624, + "step": 20822 + }, + { + "epoch": 1.5043618039626492, + "grad_norm": 7.304110417545672, + "learning_rate": 7.627393146541215e-07, + "loss": 0.6391, + "step": 20823 + }, + { + "epoch": 1.5044340491628587, + "grad_norm": 6.114071184986165, + "learning_rate": 7.625289874189878e-07, + "loss": 0.6007, + "step": 20824 + }, + { + "epoch": 1.5045062943630683, + "grad_norm": 8.58166764278657, + "learning_rate": 7.623186839684826e-07, + "loss": 0.6607, + "step": 20825 + }, + { + "epoch": 1.5045785395632778, + "grad_norm": 7.335479104504828, + "learning_rate": 7.621084043054853e-07, + "loss": 0.6098, + "step": 20826 + }, + { + "epoch": 1.5046507847634873, + "grad_norm": 7.653987877311107, + "learning_rate": 7.618981484328744e-07, + "loss": 0.642, + "step": 20827 + }, + { + "epoch": 1.5047230299636967, + "grad_norm": 7.605979334106777, + "learning_rate": 7.616879163535284e-07, + "loss": 0.6192, + "step": 20828 + }, + { + "epoch": 1.5047952751639064, + "grad_norm": 7.054221102990255, + "learning_rate": 7.614777080703251e-07, + "loss": 0.6226, + "step": 20829 + }, + { + "epoch": 1.5048675203641158, + "grad_norm": 7.452707594853347, + "learning_rate": 7.612675235861419e-07, + "loss": 0.5924, + "step": 20830 + }, + { + "epoch": 1.5049397655643253, + "grad_norm": 7.520827810297776, + "learning_rate": 7.610573629038567e-07, + "loss": 0.7037, + "step": 20831 + }, + { + "epoch": 1.5050120107645348, + "grad_norm": 7.646288561110261, + "learning_rate": 7.608472260263464e-07, + "loss": 0.713, + "step": 20832 + }, + { + "epoch": 1.5050842559647444, + "grad_norm": 7.017020377745062, + "learning_rate": 7.606371129564866e-07, + "loss": 0.623, + "step": 20833 + }, + { + "epoch": 1.505156501164954, + "grad_norm": 7.758028363531257, + "learning_rate": 7.604270236971542e-07, + "loss": 0.6937, + "step": 20834 + }, + { + "epoch": 1.5052287463651632, + "grad_norm": 7.955353130604397, + "learning_rate": 7.602169582512259e-07, + "loss": 0.6147, + "step": 20835 + }, + { + "epoch": 1.505300991565373, + "grad_norm": 6.9582777496980786, + "learning_rate": 7.60006916621576e-07, + "loss": 0.6612, + "step": 20836 + }, + { + "epoch": 1.5053732367655823, + "grad_norm": 8.291270286504185, + "learning_rate": 7.597968988110793e-07, + "loss": 0.6373, + "step": 20837 + }, + { + "epoch": 1.5054454819657919, + "grad_norm": 6.21838255136943, + "learning_rate": 7.595869048226126e-07, + "loss": 0.6024, + "step": 20838 + }, + { + "epoch": 1.5055177271660014, + "grad_norm": 5.703340374785996, + "learning_rate": 7.59376934659051e-07, + "loss": 0.6964, + "step": 20839 + }, + { + "epoch": 1.505589972366211, + "grad_norm": 8.469393209545217, + "learning_rate": 7.591669883232666e-07, + "loss": 0.6463, + "step": 20840 + }, + { + "epoch": 1.5056622175664205, + "grad_norm": 7.480004932177544, + "learning_rate": 7.589570658181345e-07, + "loss": 0.6293, + "step": 20841 + }, + { + "epoch": 1.5057344627666298, + "grad_norm": 7.10980692798494, + "learning_rate": 7.587471671465291e-07, + "loss": 0.6522, + "step": 20842 + }, + { + "epoch": 1.5058067079668396, + "grad_norm": 7.328968772218923, + "learning_rate": 7.585372923113224e-07, + "loss": 0.6045, + "step": 20843 + }, + { + "epoch": 1.505878953167049, + "grad_norm": 6.987791313963931, + "learning_rate": 7.583274413153877e-07, + "loss": 0.5733, + "step": 20844 + }, + { + "epoch": 1.5059511983672584, + "grad_norm": 7.2719415587037775, + "learning_rate": 7.581176141615982e-07, + "loss": 0.6383, + "step": 20845 + }, + { + "epoch": 1.506023443567468, + "grad_norm": 6.712905683991692, + "learning_rate": 7.579078108528263e-07, + "loss": 0.5466, + "step": 20846 + }, + { + "epoch": 1.5060956887676775, + "grad_norm": 7.176221938869921, + "learning_rate": 7.576980313919432e-07, + "loss": 0.6803, + "step": 20847 + }, + { + "epoch": 1.506167933967887, + "grad_norm": 7.505438549385606, + "learning_rate": 7.574882757818217e-07, + "loss": 0.6064, + "step": 20848 + }, + { + "epoch": 1.5062401791680964, + "grad_norm": 6.036248700174747, + "learning_rate": 7.572785440253336e-07, + "loss": 0.59, + "step": 20849 + }, + { + "epoch": 1.5063124243683061, + "grad_norm": 6.803611065835278, + "learning_rate": 7.570688361253481e-07, + "loss": 0.5818, + "step": 20850 + }, + { + "epoch": 1.5063846695685155, + "grad_norm": 7.459039097076043, + "learning_rate": 7.568591520847368e-07, + "loss": 0.6059, + "step": 20851 + }, + { + "epoch": 1.506456914768725, + "grad_norm": 6.6946857386043055, + "learning_rate": 7.566494919063705e-07, + "loss": 0.6158, + "step": 20852 + }, + { + "epoch": 1.5065291599689346, + "grad_norm": 7.004960346204987, + "learning_rate": 7.564398555931199e-07, + "loss": 0.6135, + "step": 20853 + }, + { + "epoch": 1.506601405169144, + "grad_norm": 7.497461779561555, + "learning_rate": 7.562302431478527e-07, + "loss": 0.5759, + "step": 20854 + }, + { + "epoch": 1.5066736503693536, + "grad_norm": 7.88425978955534, + "learning_rate": 7.560206545734399e-07, + "loss": 0.5772, + "step": 20855 + }, + { + "epoch": 1.506745895569563, + "grad_norm": 7.356810558290609, + "learning_rate": 7.5581108987275e-07, + "loss": 0.6242, + "step": 20856 + }, + { + "epoch": 1.5068181407697727, + "grad_norm": 6.395028162007661, + "learning_rate": 7.556015490486521e-07, + "loss": 0.6017, + "step": 20857 + }, + { + "epoch": 1.506890385969982, + "grad_norm": 6.482215613715368, + "learning_rate": 7.553920321040145e-07, + "loss": 0.6452, + "step": 20858 + }, + { + "epoch": 1.5069626311701916, + "grad_norm": 5.8264160923505255, + "learning_rate": 7.551825390417055e-07, + "loss": 0.5412, + "step": 20859 + }, + { + "epoch": 1.5070348763704011, + "grad_norm": 7.33633685015255, + "learning_rate": 7.549730698645935e-07, + "loss": 0.6893, + "step": 20860 + }, + { + "epoch": 1.5071071215706107, + "grad_norm": 7.78167429307259, + "learning_rate": 7.547636245755443e-07, + "loss": 0.6436, + "step": 20861 + }, + { + "epoch": 1.5071793667708202, + "grad_norm": 6.748544606734257, + "learning_rate": 7.545542031774263e-07, + "loss": 0.6179, + "step": 20862 + }, + { + "epoch": 1.5072516119710295, + "grad_norm": 7.104266869816221, + "learning_rate": 7.543448056731059e-07, + "loss": 0.6804, + "step": 20863 + }, + { + "epoch": 1.5073238571712393, + "grad_norm": 6.852017917090761, + "learning_rate": 7.541354320654503e-07, + "loss": 0.6565, + "step": 20864 + }, + { + "epoch": 1.5073961023714486, + "grad_norm": 7.625975499523713, + "learning_rate": 7.539260823573244e-07, + "loss": 0.6833, + "step": 20865 + }, + { + "epoch": 1.5074683475716584, + "grad_norm": 6.954057060948925, + "learning_rate": 7.537167565515941e-07, + "loss": 0.5802, + "step": 20866 + }, + { + "epoch": 1.5075405927718677, + "grad_norm": 6.491725055920049, + "learning_rate": 7.535074546511267e-07, + "loss": 0.5746, + "step": 20867 + }, + { + "epoch": 1.5076128379720772, + "grad_norm": 7.396899795651828, + "learning_rate": 7.532981766587857e-07, + "loss": 0.5793, + "step": 20868 + }, + { + "epoch": 1.5076850831722868, + "grad_norm": 6.880298029632328, + "learning_rate": 7.530889225774362e-07, + "loss": 0.6636, + "step": 20869 + }, + { + "epoch": 1.507757328372496, + "grad_norm": 6.845620374410699, + "learning_rate": 7.528796924099432e-07, + "loss": 0.6095, + "step": 20870 + }, + { + "epoch": 1.5078295735727059, + "grad_norm": 6.112246255737041, + "learning_rate": 7.526704861591714e-07, + "loss": 0.5915, + "step": 20871 + }, + { + "epoch": 1.5079018187729152, + "grad_norm": 7.1540068063295665, + "learning_rate": 7.52461303827983e-07, + "loss": 0.6133, + "step": 20872 + }, + { + "epoch": 1.507974063973125, + "grad_norm": 7.146099504531273, + "learning_rate": 7.522521454192425e-07, + "loss": 0.5803, + "step": 20873 + }, + { + "epoch": 1.5080463091733343, + "grad_norm": 6.662691107150821, + "learning_rate": 7.520430109358131e-07, + "loss": 0.5623, + "step": 20874 + }, + { + "epoch": 1.5081185543735438, + "grad_norm": 6.120780581391323, + "learning_rate": 7.518339003805578e-07, + "loss": 0.6239, + "step": 20875 + }, + { + "epoch": 1.5081907995737533, + "grad_norm": 7.1976382196635935, + "learning_rate": 7.51624813756339e-07, + "loss": 0.6153, + "step": 20876 + }, + { + "epoch": 1.5082630447739627, + "grad_norm": 7.104831190902477, + "learning_rate": 7.514157510660189e-07, + "loss": 0.5921, + "step": 20877 + }, + { + "epoch": 1.5083352899741724, + "grad_norm": 7.6704567649144515, + "learning_rate": 7.512067123124603e-07, + "loss": 0.6585, + "step": 20878 + }, + { + "epoch": 1.5084075351743818, + "grad_norm": 7.029364167675952, + "learning_rate": 7.50997697498523e-07, + "loss": 0.6259, + "step": 20879 + }, + { + "epoch": 1.5084797803745915, + "grad_norm": 7.650256742270566, + "learning_rate": 7.507887066270695e-07, + "loss": 0.6509, + "step": 20880 + }, + { + "epoch": 1.5085520255748008, + "grad_norm": 7.558222546198367, + "learning_rate": 7.505797397009609e-07, + "loss": 0.6148, + "step": 20881 + }, + { + "epoch": 1.5086242707750104, + "grad_norm": 7.454396009533124, + "learning_rate": 7.50370796723057e-07, + "loss": 0.6459, + "step": 20882 + }, + { + "epoch": 1.50869651597522, + "grad_norm": 7.40825822217845, + "learning_rate": 7.50161877696218e-07, + "loss": 0.6339, + "step": 20883 + }, + { + "epoch": 1.5087687611754295, + "grad_norm": 9.06023004976626, + "learning_rate": 7.499529826233043e-07, + "loss": 0.6224, + "step": 20884 + }, + { + "epoch": 1.508841006375639, + "grad_norm": 7.614633486087259, + "learning_rate": 7.497441115071755e-07, + "loss": 0.6499, + "step": 20885 + }, + { + "epoch": 1.5089132515758483, + "grad_norm": 6.998402276943763, + "learning_rate": 7.495352643506909e-07, + "loss": 0.6075, + "step": 20886 + }, + { + "epoch": 1.508985496776058, + "grad_norm": 6.849070810378308, + "learning_rate": 7.493264411567094e-07, + "loss": 0.6272, + "step": 20887 + }, + { + "epoch": 1.5090577419762674, + "grad_norm": 7.081078054389283, + "learning_rate": 7.491176419280893e-07, + "loss": 0.5818, + "step": 20888 + }, + { + "epoch": 1.509129987176477, + "grad_norm": 6.3762454144046865, + "learning_rate": 7.489088666676903e-07, + "loss": 0.5921, + "step": 20889 + }, + { + "epoch": 1.5092022323766865, + "grad_norm": 6.34980822896863, + "learning_rate": 7.487001153783685e-07, + "loss": 0.5439, + "step": 20890 + }, + { + "epoch": 1.509274477576896, + "grad_norm": 6.05281567588186, + "learning_rate": 7.484913880629821e-07, + "loss": 0.6231, + "step": 20891 + }, + { + "epoch": 1.5093467227771056, + "grad_norm": 8.313937184852426, + "learning_rate": 7.482826847243895e-07, + "loss": 0.6571, + "step": 20892 + }, + { + "epoch": 1.509418967977315, + "grad_norm": 6.255143757364252, + "learning_rate": 7.480740053654461e-07, + "loss": 0.5739, + "step": 20893 + }, + { + "epoch": 1.5094912131775247, + "grad_norm": 6.813943972362562, + "learning_rate": 7.478653499890087e-07, + "loss": 0.6543, + "step": 20894 + }, + { + "epoch": 1.509563458377734, + "grad_norm": 6.79531737270606, + "learning_rate": 7.476567185979347e-07, + "loss": 0.6194, + "step": 20895 + }, + { + "epoch": 1.5096357035779435, + "grad_norm": 8.10090797892482, + "learning_rate": 7.474481111950808e-07, + "loss": 0.661, + "step": 20896 + }, + { + "epoch": 1.509707948778153, + "grad_norm": 8.437632919959968, + "learning_rate": 7.472395277833003e-07, + "loss": 0.6787, + "step": 20897 + }, + { + "epoch": 1.5097801939783626, + "grad_norm": 6.039824562303793, + "learning_rate": 7.470309683654497e-07, + "loss": 0.5995, + "step": 20898 + }, + { + "epoch": 1.5098524391785721, + "grad_norm": 7.953366259080127, + "learning_rate": 7.468224329443849e-07, + "loss": 0.6053, + "step": 20899 + }, + { + "epoch": 1.5099246843787815, + "grad_norm": 6.646664590379294, + "learning_rate": 7.466139215229593e-07, + "loss": 0.6081, + "step": 20900 + }, + { + "epoch": 1.5099969295789912, + "grad_norm": 6.401097155941682, + "learning_rate": 7.464054341040272e-07, + "loss": 0.6401, + "step": 20901 + }, + { + "epoch": 1.5100691747792006, + "grad_norm": 7.404324438174099, + "learning_rate": 7.461969706904434e-07, + "loss": 0.627, + "step": 20902 + }, + { + "epoch": 1.51014141997941, + "grad_norm": 9.111149139118178, + "learning_rate": 7.459885312850613e-07, + "loss": 0.6238, + "step": 20903 + }, + { + "epoch": 1.5102136651796196, + "grad_norm": 6.733830334793356, + "learning_rate": 7.457801158907343e-07, + "loss": 0.5489, + "step": 20904 + }, + { + "epoch": 1.5102859103798292, + "grad_norm": 6.338160264872835, + "learning_rate": 7.455717245103153e-07, + "loss": 0.5906, + "step": 20905 + }, + { + "epoch": 1.5103581555800387, + "grad_norm": 6.394968510744832, + "learning_rate": 7.453633571466581e-07, + "loss": 0.5822, + "step": 20906 + }, + { + "epoch": 1.510430400780248, + "grad_norm": 7.555214643578757, + "learning_rate": 7.451550138026134e-07, + "loss": 0.5632, + "step": 20907 + }, + { + "epoch": 1.5105026459804578, + "grad_norm": 6.62916980729747, + "learning_rate": 7.449466944810341e-07, + "loss": 0.6094, + "step": 20908 + }, + { + "epoch": 1.5105748911806671, + "grad_norm": 6.650570160939299, + "learning_rate": 7.447383991847718e-07, + "loss": 0.5555, + "step": 20909 + }, + { + "epoch": 1.5106471363808767, + "grad_norm": 7.248785509133366, + "learning_rate": 7.445301279166786e-07, + "loss": 0.644, + "step": 20910 + }, + { + "epoch": 1.5107193815810862, + "grad_norm": 6.912821823271791, + "learning_rate": 7.44321880679604e-07, + "loss": 0.6846, + "step": 20911 + }, + { + "epoch": 1.5107916267812957, + "grad_norm": 6.735395668375416, + "learning_rate": 7.441136574763999e-07, + "loss": 0.5962, + "step": 20912 + }, + { + "epoch": 1.5108638719815053, + "grad_norm": 6.203740548213697, + "learning_rate": 7.439054583099164e-07, + "loss": 0.5513, + "step": 20913 + }, + { + "epoch": 1.5109361171817146, + "grad_norm": 7.100679617578457, + "learning_rate": 7.436972831830037e-07, + "loss": 0.7301, + "step": 20914 + }, + { + "epoch": 1.5110083623819244, + "grad_norm": 6.6437782609971885, + "learning_rate": 7.434891320985113e-07, + "loss": 0.6789, + "step": 20915 + }, + { + "epoch": 1.5110806075821337, + "grad_norm": 6.579881592122872, + "learning_rate": 7.432810050592892e-07, + "loss": 0.6505, + "step": 20916 + }, + { + "epoch": 1.5111528527823432, + "grad_norm": 8.63029383860843, + "learning_rate": 7.430729020681868e-07, + "loss": 0.6713, + "step": 20917 + }, + { + "epoch": 1.5112250979825528, + "grad_norm": 6.699404379716808, + "learning_rate": 7.42864823128051e-07, + "loss": 0.5775, + "step": 20918 + }, + { + "epoch": 1.5112973431827623, + "grad_norm": 7.797513457858748, + "learning_rate": 7.426567682417321e-07, + "loss": 0.6105, + "step": 20919 + }, + { + "epoch": 1.5113695883829719, + "grad_norm": 7.836044356434301, + "learning_rate": 7.42448737412077e-07, + "loss": 0.6094, + "step": 20920 + }, + { + "epoch": 1.5114418335831812, + "grad_norm": 7.414616876583286, + "learning_rate": 7.422407306419352e-07, + "loss": 0.6139, + "step": 20921 + }, + { + "epoch": 1.511514078783391, + "grad_norm": 6.902754369329737, + "learning_rate": 7.420327479341513e-07, + "loss": 0.5952, + "step": 20922 + }, + { + "epoch": 1.5115863239836003, + "grad_norm": 7.761415503481036, + "learning_rate": 7.418247892915747e-07, + "loss": 0.5898, + "step": 20923 + }, + { + "epoch": 1.5116585691838098, + "grad_norm": 7.171226056560822, + "learning_rate": 7.416168547170527e-07, + "loss": 0.6066, + "step": 20924 + }, + { + "epoch": 1.5117308143840194, + "grad_norm": 6.618176094799635, + "learning_rate": 7.414089442134298e-07, + "loss": 0.6273, + "step": 20925 + }, + { + "epoch": 1.511803059584229, + "grad_norm": 7.548979471720209, + "learning_rate": 7.412010577835529e-07, + "loss": 0.586, + "step": 20926 + }, + { + "epoch": 1.5118753047844384, + "grad_norm": 5.935280475368376, + "learning_rate": 7.409931954302679e-07, + "loss": 0.6807, + "step": 20927 + }, + { + "epoch": 1.5119475499846478, + "grad_norm": 7.132917589330585, + "learning_rate": 7.40785357156421e-07, + "loss": 0.6689, + "step": 20928 + }, + { + "epoch": 1.5120197951848575, + "grad_norm": 8.386485408811431, + "learning_rate": 7.40577542964856e-07, + "loss": 0.5662, + "step": 20929 + }, + { + "epoch": 1.5120920403850668, + "grad_norm": 8.372129489267268, + "learning_rate": 7.403697528584183e-07, + "loss": 0.675, + "step": 20930 + }, + { + "epoch": 1.5121642855852764, + "grad_norm": 6.713023881696718, + "learning_rate": 7.401619868399524e-07, + "loss": 0.5657, + "step": 20931 + }, + { + "epoch": 1.512236530785486, + "grad_norm": 6.299169988300356, + "learning_rate": 7.399542449123023e-07, + "loss": 0.6287, + "step": 20932 + }, + { + "epoch": 1.5123087759856955, + "grad_norm": 8.323210964482238, + "learning_rate": 7.397465270783122e-07, + "loss": 0.6931, + "step": 20933 + }, + { + "epoch": 1.512381021185905, + "grad_norm": 5.927355643275929, + "learning_rate": 7.395388333408254e-07, + "loss": 0.5558, + "step": 20934 + }, + { + "epoch": 1.5124532663861143, + "grad_norm": 8.12754382879384, + "learning_rate": 7.39331163702686e-07, + "loss": 0.6168, + "step": 20935 + }, + { + "epoch": 1.512525511586324, + "grad_norm": 8.642792855736388, + "learning_rate": 7.391235181667347e-07, + "loss": 0.5722, + "step": 20936 + }, + { + "epoch": 1.5125977567865334, + "grad_norm": 7.3337344435609255, + "learning_rate": 7.389158967358154e-07, + "loss": 0.6229, + "step": 20937 + }, + { + "epoch": 1.512670001986743, + "grad_norm": 6.601179375901857, + "learning_rate": 7.3870829941277e-07, + "loss": 0.6395, + "step": 20938 + }, + { + "epoch": 1.5127422471869525, + "grad_norm": 6.973482859791138, + "learning_rate": 7.385007262004415e-07, + "loss": 0.6432, + "step": 20939 + }, + { + "epoch": 1.512814492387162, + "grad_norm": 5.86076415824376, + "learning_rate": 7.382931771016694e-07, + "loss": 0.6177, + "step": 20940 + }, + { + "epoch": 1.5128867375873716, + "grad_norm": 7.7358409378599005, + "learning_rate": 7.380856521192956e-07, + "loss": 0.6111, + "step": 20941 + }, + { + "epoch": 1.512958982787581, + "grad_norm": 7.157536153698695, + "learning_rate": 7.378781512561614e-07, + "loss": 0.6048, + "step": 20942 + }, + { + "epoch": 1.5130312279877907, + "grad_norm": 6.564071031259333, + "learning_rate": 7.376706745151072e-07, + "loss": 0.6258, + "step": 20943 + }, + { + "epoch": 1.513103473188, + "grad_norm": 7.924089772994041, + "learning_rate": 7.37463221898973e-07, + "loss": 0.6308, + "step": 20944 + }, + { + "epoch": 1.5131757183882097, + "grad_norm": 6.853409312726307, + "learning_rate": 7.372557934105986e-07, + "loss": 0.6363, + "step": 20945 + }, + { + "epoch": 1.513247963588419, + "grad_norm": 7.7599696361301636, + "learning_rate": 7.370483890528247e-07, + "loss": 0.6725, + "step": 20946 + }, + { + "epoch": 1.5133202087886286, + "grad_norm": 6.181158033097597, + "learning_rate": 7.368410088284886e-07, + "loss": 0.5943, + "step": 20947 + }, + { + "epoch": 1.5133924539888381, + "grad_norm": 6.683551513966489, + "learning_rate": 7.366336527404302e-07, + "loss": 0.5463, + "step": 20948 + }, + { + "epoch": 1.5134646991890475, + "grad_norm": 6.322052026628109, + "learning_rate": 7.364263207914887e-07, + "loss": 0.6146, + "step": 20949 + }, + { + "epoch": 1.5135369443892572, + "grad_norm": 6.950169996199677, + "learning_rate": 7.362190129845007e-07, + "loss": 0.6027, + "step": 20950 + }, + { + "epoch": 1.5136091895894666, + "grad_norm": 7.927453599286952, + "learning_rate": 7.360117293223043e-07, + "loss": 0.716, + "step": 20951 + }, + { + "epoch": 1.5136814347896763, + "grad_norm": 8.36208292515495, + "learning_rate": 7.358044698077383e-07, + "loss": 0.6356, + "step": 20952 + }, + { + "epoch": 1.5137536799898856, + "grad_norm": 9.268460385099363, + "learning_rate": 7.355972344436402e-07, + "loss": 0.5854, + "step": 20953 + }, + { + "epoch": 1.5138259251900952, + "grad_norm": 6.240618263817878, + "learning_rate": 7.353900232328451e-07, + "loss": 0.5733, + "step": 20954 + }, + { + "epoch": 1.5138981703903047, + "grad_norm": 6.9159414378434185, + "learning_rate": 7.351828361781904e-07, + "loss": 0.5919, + "step": 20955 + }, + { + "epoch": 1.513970415590514, + "grad_norm": 7.635358184826414, + "learning_rate": 7.349756732825133e-07, + "loss": 0.5995, + "step": 20956 + }, + { + "epoch": 1.5140426607907238, + "grad_norm": 6.4149735764325735, + "learning_rate": 7.347685345486479e-07, + "loss": 0.6338, + "step": 20957 + }, + { + "epoch": 1.5141149059909331, + "grad_norm": 6.292787498359667, + "learning_rate": 7.345614199794307e-07, + "loss": 0.5865, + "step": 20958 + }, + { + "epoch": 1.5141871511911429, + "grad_norm": 6.93102670349428, + "learning_rate": 7.343543295776967e-07, + "loss": 0.6527, + "step": 20959 + }, + { + "epoch": 1.5142593963913522, + "grad_norm": 7.688807182514891, + "learning_rate": 7.341472633462813e-07, + "loss": 0.6244, + "step": 20960 + }, + { + "epoch": 1.5143316415915618, + "grad_norm": 5.932855013880135, + "learning_rate": 7.339402212880187e-07, + "loss": 0.6325, + "step": 20961 + }, + { + "epoch": 1.5144038867917713, + "grad_norm": 7.580138263498233, + "learning_rate": 7.337332034057432e-07, + "loss": 0.6453, + "step": 20962 + }, + { + "epoch": 1.5144761319919808, + "grad_norm": 6.933904592504083, + "learning_rate": 7.335262097022885e-07, + "loss": 0.6264, + "step": 20963 + }, + { + "epoch": 1.5145483771921904, + "grad_norm": 6.604510598197206, + "learning_rate": 7.333192401804895e-07, + "loss": 0.6354, + "step": 20964 + }, + { + "epoch": 1.5146206223923997, + "grad_norm": 6.832044650501437, + "learning_rate": 7.331122948431774e-07, + "loss": 0.5433, + "step": 20965 + }, + { + "epoch": 1.5146928675926095, + "grad_norm": 6.534879561163025, + "learning_rate": 7.329053736931859e-07, + "loss": 0.599, + "step": 20966 + }, + { + "epoch": 1.5147651127928188, + "grad_norm": 6.77080156367747, + "learning_rate": 7.326984767333489e-07, + "loss": 0.6013, + "step": 20967 + }, + { + "epoch": 1.5148373579930283, + "grad_norm": 8.889883965849585, + "learning_rate": 7.324916039664967e-07, + "loss": 0.7192, + "step": 20968 + }, + { + "epoch": 1.5149096031932379, + "grad_norm": 6.771859272233121, + "learning_rate": 7.32284755395462e-07, + "loss": 0.5831, + "step": 20969 + }, + { + "epoch": 1.5149818483934474, + "grad_norm": 7.4551026861332375, + "learning_rate": 7.320779310230763e-07, + "loss": 0.5904, + "step": 20970 + }, + { + "epoch": 1.515054093593657, + "grad_norm": 6.637157010341976, + "learning_rate": 7.318711308521712e-07, + "loss": 0.6214, + "step": 20971 + }, + { + "epoch": 1.5151263387938663, + "grad_norm": 8.800271498219974, + "learning_rate": 7.316643548855776e-07, + "loss": 0.6603, + "step": 20972 + }, + { + "epoch": 1.515198583994076, + "grad_norm": 5.962695501229361, + "learning_rate": 7.314576031261256e-07, + "loss": 0.5863, + "step": 20973 + }, + { + "epoch": 1.5152708291942854, + "grad_norm": 5.436836991502317, + "learning_rate": 7.312508755766468e-07, + "loss": 0.5745, + "step": 20974 + }, + { + "epoch": 1.515343074394495, + "grad_norm": 7.187130064358911, + "learning_rate": 7.310441722399694e-07, + "loss": 0.5814, + "step": 20975 + }, + { + "epoch": 1.5154153195947044, + "grad_norm": 7.49598382866254, + "learning_rate": 7.308374931189239e-07, + "loss": 0.6356, + "step": 20976 + }, + { + "epoch": 1.515487564794914, + "grad_norm": 6.168592753623681, + "learning_rate": 7.306308382163393e-07, + "loss": 0.6037, + "step": 20977 + }, + { + "epoch": 1.5155598099951235, + "grad_norm": 6.489040819476784, + "learning_rate": 7.304242075350454e-07, + "loss": 0.678, + "step": 20978 + }, + { + "epoch": 1.5156320551953328, + "grad_norm": 6.652473058692599, + "learning_rate": 7.302176010778686e-07, + "loss": 0.6505, + "step": 20979 + }, + { + "epoch": 1.5157043003955426, + "grad_norm": 8.868533505357272, + "learning_rate": 7.300110188476395e-07, + "loss": 0.6305, + "step": 20980 + }, + { + "epoch": 1.515776545595752, + "grad_norm": 7.849890019320633, + "learning_rate": 7.29804460847186e-07, + "loss": 0.6354, + "step": 20981 + }, + { + "epoch": 1.5158487907959615, + "grad_norm": 7.9999277588443585, + "learning_rate": 7.295979270793343e-07, + "loss": 0.6374, + "step": 20982 + }, + { + "epoch": 1.515921035996171, + "grad_norm": 6.242327749421779, + "learning_rate": 7.293914175469125e-07, + "loss": 0.6497, + "step": 20983 + }, + { + "epoch": 1.5159932811963805, + "grad_norm": 5.808565838765322, + "learning_rate": 7.291849322527469e-07, + "loss": 0.5325, + "step": 20984 + }, + { + "epoch": 1.51606552639659, + "grad_norm": 6.925473623226491, + "learning_rate": 7.289784711996659e-07, + "loss": 0.655, + "step": 20985 + }, + { + "epoch": 1.5161377715967994, + "grad_norm": 6.2098189588316375, + "learning_rate": 7.287720343904933e-07, + "loss": 0.5983, + "step": 20986 + }, + { + "epoch": 1.5162100167970092, + "grad_norm": 7.346471404003677, + "learning_rate": 7.285656218280565e-07, + "loss": 0.6336, + "step": 20987 + }, + { + "epoch": 1.5162822619972185, + "grad_norm": 6.255725831777879, + "learning_rate": 7.283592335151809e-07, + "loss": 0.5595, + "step": 20988 + }, + { + "epoch": 1.516354507197428, + "grad_norm": 6.00975832537337, + "learning_rate": 7.281528694546918e-07, + "loss": 0.5823, + "step": 20989 + }, + { + "epoch": 1.5164267523976376, + "grad_norm": 6.996654801100432, + "learning_rate": 7.279465296494143e-07, + "loss": 0.58, + "step": 20990 + }, + { + "epoch": 1.5164989975978471, + "grad_norm": 7.746417786654075, + "learning_rate": 7.277402141021727e-07, + "loss": 0.5985, + "step": 20991 + }, + { + "epoch": 1.5165712427980567, + "grad_norm": 6.687687666211751, + "learning_rate": 7.275339228157924e-07, + "loss": 0.6031, + "step": 20992 + }, + { + "epoch": 1.516643487998266, + "grad_norm": 7.201956631450169, + "learning_rate": 7.273276557930959e-07, + "loss": 0.6073, + "step": 20993 + }, + { + "epoch": 1.5167157331984757, + "grad_norm": 7.0474734168551, + "learning_rate": 7.27121413036907e-07, + "loss": 0.6103, + "step": 20994 + }, + { + "epoch": 1.516787978398685, + "grad_norm": 7.5206616946542315, + "learning_rate": 7.269151945500497e-07, + "loss": 0.6183, + "step": 20995 + }, + { + "epoch": 1.5168602235988946, + "grad_norm": 6.964135528174704, + "learning_rate": 7.267090003353472e-07, + "loss": 0.6298, + "step": 20996 + }, + { + "epoch": 1.5169324687991042, + "grad_norm": 7.62877374036865, + "learning_rate": 7.265028303956209e-07, + "loss": 0.579, + "step": 20997 + }, + { + "epoch": 1.5170047139993137, + "grad_norm": 6.73259837353863, + "learning_rate": 7.262966847336939e-07, + "loss": 0.6196, + "step": 20998 + }, + { + "epoch": 1.5170769591995232, + "grad_norm": 7.770346727723479, + "learning_rate": 7.260905633523879e-07, + "loss": 0.571, + "step": 20999 + }, + { + "epoch": 1.5171492043997326, + "grad_norm": 6.567855829271209, + "learning_rate": 7.25884466254525e-07, + "loss": 0.6271, + "step": 21000 + }, + { + "epoch": 1.5172214495999423, + "grad_norm": 6.508004175411074, + "learning_rate": 7.256783934429262e-07, + "loss": 0.5933, + "step": 21001 + }, + { + "epoch": 1.5172936948001516, + "grad_norm": 6.381880095605644, + "learning_rate": 7.254723449204124e-07, + "loss": 0.539, + "step": 21002 + }, + { + "epoch": 1.5173659400003612, + "grad_norm": 7.238950003678829, + "learning_rate": 7.252663206898053e-07, + "loss": 0.6506, + "step": 21003 + }, + { + "epoch": 1.5174381852005707, + "grad_norm": 6.785847173551212, + "learning_rate": 7.250603207539234e-07, + "loss": 0.6894, + "step": 21004 + }, + { + "epoch": 1.5175104304007803, + "grad_norm": 6.882184522820668, + "learning_rate": 7.248543451155873e-07, + "loss": 0.6487, + "step": 21005 + }, + { + "epoch": 1.5175826756009898, + "grad_norm": 6.031026727368274, + "learning_rate": 7.246483937776181e-07, + "loss": 0.58, + "step": 21006 + }, + { + "epoch": 1.5176549208011991, + "grad_norm": 6.492666068496204, + "learning_rate": 7.244424667428321e-07, + "loss": 0.543, + "step": 21007 + }, + { + "epoch": 1.517727166001409, + "grad_norm": 7.391229907195123, + "learning_rate": 7.242365640140512e-07, + "loss": 0.6401, + "step": 21008 + }, + { + "epoch": 1.5177994112016182, + "grad_norm": 6.579230208892342, + "learning_rate": 7.240306855940926e-07, + "loss": 0.6147, + "step": 21009 + }, + { + "epoch": 1.5178716564018278, + "grad_norm": 7.046773228893713, + "learning_rate": 7.23824831485776e-07, + "loss": 0.6316, + "step": 21010 + }, + { + "epoch": 1.5179439016020373, + "grad_norm": 7.302423038462371, + "learning_rate": 7.236190016919173e-07, + "loss": 0.5827, + "step": 21011 + }, + { + "epoch": 1.5180161468022468, + "grad_norm": 6.868943199377145, + "learning_rate": 7.234131962153354e-07, + "loss": 0.6219, + "step": 21012 + }, + { + "epoch": 1.5180883920024564, + "grad_norm": 7.904474473696671, + "learning_rate": 7.232074150588481e-07, + "loss": 0.7034, + "step": 21013 + }, + { + "epoch": 1.5181606372026657, + "grad_norm": 6.477741571277586, + "learning_rate": 7.230016582252708e-07, + "loss": 0.6511, + "step": 21014 + }, + { + "epoch": 1.5182328824028755, + "grad_norm": 7.464081522194857, + "learning_rate": 7.227959257174214e-07, + "loss": 0.6588, + "step": 21015 + }, + { + "epoch": 1.5183051276030848, + "grad_norm": 7.19862075946657, + "learning_rate": 7.225902175381155e-07, + "loss": 0.6276, + "step": 21016 + }, + { + "epoch": 1.5183773728032945, + "grad_norm": 7.732764983941708, + "learning_rate": 7.223845336901697e-07, + "loss": 0.6711, + "step": 21017 + }, + { + "epoch": 1.5184496180035039, + "grad_norm": 7.257832998322775, + "learning_rate": 7.221788741763993e-07, + "loss": 0.5964, + "step": 21018 + }, + { + "epoch": 1.5185218632037134, + "grad_norm": 7.841718798257019, + "learning_rate": 7.219732389996198e-07, + "loss": 0.5627, + "step": 21019 + }, + { + "epoch": 1.518594108403923, + "grad_norm": 6.727351870059552, + "learning_rate": 7.217676281626459e-07, + "loss": 0.6148, + "step": 21020 + }, + { + "epoch": 1.5186663536041323, + "grad_norm": 6.035171419883847, + "learning_rate": 7.215620416682934e-07, + "loss": 0.6541, + "step": 21021 + }, + { + "epoch": 1.518738598804342, + "grad_norm": 7.582037790601249, + "learning_rate": 7.213564795193751e-07, + "loss": 0.6132, + "step": 21022 + }, + { + "epoch": 1.5188108440045514, + "grad_norm": 7.77967861502179, + "learning_rate": 7.211509417187051e-07, + "loss": 0.7094, + "step": 21023 + }, + { + "epoch": 1.5188830892047611, + "grad_norm": 8.758397895243528, + "learning_rate": 7.209454282690987e-07, + "loss": 0.6188, + "step": 21024 + }, + { + "epoch": 1.5189553344049704, + "grad_norm": 7.1785448096892726, + "learning_rate": 7.207399391733672e-07, + "loss": 0.5935, + "step": 21025 + }, + { + "epoch": 1.51902757960518, + "grad_norm": 6.41406844739371, + "learning_rate": 7.205344744343243e-07, + "loss": 0.6253, + "step": 21026 + }, + { + "epoch": 1.5190998248053895, + "grad_norm": 6.596003287514075, + "learning_rate": 7.203290340547819e-07, + "loss": 0.5544, + "step": 21027 + }, + { + "epoch": 1.5191720700055988, + "grad_norm": 6.939857151199348, + "learning_rate": 7.201236180375551e-07, + "loss": 0.7381, + "step": 21028 + }, + { + "epoch": 1.5192443152058086, + "grad_norm": 7.257742595027988, + "learning_rate": 7.199182263854529e-07, + "loss": 0.6189, + "step": 21029 + }, + { + "epoch": 1.519316560406018, + "grad_norm": 7.465283726731722, + "learning_rate": 7.197128591012881e-07, + "loss": 0.5802, + "step": 21030 + }, + { + "epoch": 1.5193888056062277, + "grad_norm": 6.670981854372502, + "learning_rate": 7.195075161878729e-07, + "loss": 0.5555, + "step": 21031 + }, + { + "epoch": 1.519461050806437, + "grad_norm": 7.049225070649943, + "learning_rate": 7.193021976480164e-07, + "loss": 0.5985, + "step": 21032 + }, + { + "epoch": 1.5195332960066465, + "grad_norm": 8.278297214581585, + "learning_rate": 7.190969034845302e-07, + "loss": 0.6516, + "step": 21033 + }, + { + "epoch": 1.519605541206856, + "grad_norm": 7.578373415306623, + "learning_rate": 7.188916337002247e-07, + "loss": 0.6636, + "step": 21034 + }, + { + "epoch": 1.5196777864070656, + "grad_norm": 6.5380006791724465, + "learning_rate": 7.186863882979098e-07, + "loss": 0.6104, + "step": 21035 + }, + { + "epoch": 1.5197500316072752, + "grad_norm": 6.2632379336939215, + "learning_rate": 7.184811672803952e-07, + "loss": 0.5713, + "step": 21036 + }, + { + "epoch": 1.5198222768074845, + "grad_norm": 6.392512483431696, + "learning_rate": 7.182759706504899e-07, + "loss": 0.6067, + "step": 21037 + }, + { + "epoch": 1.5198945220076943, + "grad_norm": 8.360217356799385, + "learning_rate": 7.180707984110041e-07, + "loss": 0.6469, + "step": 21038 + }, + { + "epoch": 1.5199667672079036, + "grad_norm": 6.785582955602509, + "learning_rate": 7.178656505647447e-07, + "loss": 0.6227, + "step": 21039 + }, + { + "epoch": 1.5200390124081131, + "grad_norm": 7.833517383550551, + "learning_rate": 7.176605271145206e-07, + "loss": 0.5692, + "step": 21040 + }, + { + "epoch": 1.5201112576083227, + "grad_norm": 7.899149605093107, + "learning_rate": 7.174554280631402e-07, + "loss": 0.6162, + "step": 21041 + }, + { + "epoch": 1.5201835028085322, + "grad_norm": 6.431544539784851, + "learning_rate": 7.172503534134118e-07, + "loss": 0.5918, + "step": 21042 + }, + { + "epoch": 1.5202557480087417, + "grad_norm": 6.751116342420451, + "learning_rate": 7.17045303168141e-07, + "loss": 0.6311, + "step": 21043 + }, + { + "epoch": 1.520327993208951, + "grad_norm": 6.876656072734464, + "learning_rate": 7.168402773301356e-07, + "loss": 0.5664, + "step": 21044 + }, + { + "epoch": 1.5204002384091608, + "grad_norm": 6.9919239868292085, + "learning_rate": 7.166352759022022e-07, + "loss": 0.6303, + "step": 21045 + }, + { + "epoch": 1.5204724836093702, + "grad_norm": 7.37181876154232, + "learning_rate": 7.164302988871472e-07, + "loss": 0.5623, + "step": 21046 + }, + { + "epoch": 1.5205447288095797, + "grad_norm": 6.3890952072369895, + "learning_rate": 7.162253462877763e-07, + "loss": 0.6969, + "step": 21047 + }, + { + "epoch": 1.5206169740097892, + "grad_norm": 8.000616049888556, + "learning_rate": 7.160204181068958e-07, + "loss": 0.6269, + "step": 21048 + }, + { + "epoch": 1.5206892192099988, + "grad_norm": 10.217984698741281, + "learning_rate": 7.158155143473114e-07, + "loss": 0.6295, + "step": 21049 + }, + { + "epoch": 1.5207614644102083, + "grad_norm": 6.156668866619604, + "learning_rate": 7.156106350118264e-07, + "loss": 0.5505, + "step": 21050 + }, + { + "epoch": 1.5208337096104176, + "grad_norm": 9.411369882043774, + "learning_rate": 7.15405780103246e-07, + "loss": 0.6741, + "step": 21051 + }, + { + "epoch": 1.5209059548106274, + "grad_norm": 5.812356926582553, + "learning_rate": 7.152009496243753e-07, + "loss": 0.5947, + "step": 21052 + }, + { + "epoch": 1.5209782000108367, + "grad_norm": 7.6362242087314085, + "learning_rate": 7.149961435780184e-07, + "loss": 0.6414, + "step": 21053 + }, + { + "epoch": 1.5210504452110463, + "grad_norm": 5.947731598637719, + "learning_rate": 7.147913619669775e-07, + "loss": 0.679, + "step": 21054 + }, + { + "epoch": 1.5211226904112558, + "grad_norm": 7.641739528007744, + "learning_rate": 7.145866047940567e-07, + "loss": 0.6563, + "step": 21055 + }, + { + "epoch": 1.5211949356114653, + "grad_norm": 7.341237259198861, + "learning_rate": 7.14381872062059e-07, + "loss": 0.593, + "step": 21056 + }, + { + "epoch": 1.521267180811675, + "grad_norm": 7.684641484948876, + "learning_rate": 7.141771637737871e-07, + "loss": 0.6923, + "step": 21057 + }, + { + "epoch": 1.5213394260118842, + "grad_norm": 6.475728716403063, + "learning_rate": 7.139724799320433e-07, + "loss": 0.5983, + "step": 21058 + }, + { + "epoch": 1.521411671212094, + "grad_norm": 8.265782836099355, + "learning_rate": 7.137678205396292e-07, + "loss": 0.5806, + "step": 21059 + }, + { + "epoch": 1.5214839164123033, + "grad_norm": 8.409406445181128, + "learning_rate": 7.135631855993477e-07, + "loss": 0.6597, + "step": 21060 + }, + { + "epoch": 1.5215561616125128, + "grad_norm": 7.010561196668661, + "learning_rate": 7.133585751139985e-07, + "loss": 0.5863, + "step": 21061 + }, + { + "epoch": 1.5216284068127224, + "grad_norm": 6.759368893659834, + "learning_rate": 7.131539890863828e-07, + "loss": 0.6104, + "step": 21062 + }, + { + "epoch": 1.521700652012932, + "grad_norm": 6.645475882201946, + "learning_rate": 7.129494275193027e-07, + "loss": 0.593, + "step": 21063 + }, + { + "epoch": 1.5217728972131415, + "grad_norm": 7.774915698413176, + "learning_rate": 7.127448904155557e-07, + "loss": 0.618, + "step": 21064 + }, + { + "epoch": 1.5218451424133508, + "grad_norm": 7.027505513354961, + "learning_rate": 7.125403777779441e-07, + "loss": 0.6468, + "step": 21065 + }, + { + "epoch": 1.5219173876135605, + "grad_norm": 6.625668473976854, + "learning_rate": 7.123358896092669e-07, + "loss": 0.5547, + "step": 21066 + }, + { + "epoch": 1.5219896328137699, + "grad_norm": 6.926453464716048, + "learning_rate": 7.121314259123241e-07, + "loss": 0.6277, + "step": 21067 + }, + { + "epoch": 1.5220618780139794, + "grad_norm": 6.912347233827729, + "learning_rate": 7.119269866899134e-07, + "loss": 0.6018, + "step": 21068 + }, + { + "epoch": 1.522134123214189, + "grad_norm": 7.165015836326844, + "learning_rate": 7.117225719448337e-07, + "loss": 0.6771, + "step": 21069 + }, + { + "epoch": 1.5222063684143985, + "grad_norm": 7.346371965861177, + "learning_rate": 7.115181816798833e-07, + "loss": 0.5997, + "step": 21070 + }, + { + "epoch": 1.522278613614608, + "grad_norm": 6.934206894104339, + "learning_rate": 7.113138158978614e-07, + "loss": 0.6925, + "step": 21071 + }, + { + "epoch": 1.5223508588148174, + "grad_norm": 6.12255468017784, + "learning_rate": 7.111094746015635e-07, + "loss": 0.6536, + "step": 21072 + }, + { + "epoch": 1.5224231040150271, + "grad_norm": 6.696240853721207, + "learning_rate": 7.109051577937882e-07, + "loss": 0.6433, + "step": 21073 + }, + { + "epoch": 1.5224953492152364, + "grad_norm": 5.980711450517045, + "learning_rate": 7.107008654773321e-07, + "loss": 0.5083, + "step": 21074 + }, + { + "epoch": 1.522567594415446, + "grad_norm": 6.240091633246149, + "learning_rate": 7.104965976549916e-07, + "loss": 0.5958, + "step": 21075 + }, + { + "epoch": 1.5226398396156555, + "grad_norm": 7.835251052622193, + "learning_rate": 7.102923543295634e-07, + "loss": 0.6028, + "step": 21076 + }, + { + "epoch": 1.522712084815865, + "grad_norm": 7.760026414177369, + "learning_rate": 7.100881355038431e-07, + "loss": 0.6688, + "step": 21077 + }, + { + "epoch": 1.5227843300160746, + "grad_norm": 7.3865163080403, + "learning_rate": 7.098839411806274e-07, + "loss": 0.553, + "step": 21078 + }, + { + "epoch": 1.522856575216284, + "grad_norm": 7.456978303652397, + "learning_rate": 7.096797713627096e-07, + "loss": 0.5691, + "step": 21079 + }, + { + "epoch": 1.5229288204164937, + "grad_norm": 6.6330054516803685, + "learning_rate": 7.094756260528859e-07, + "loss": 0.606, + "step": 21080 + }, + { + "epoch": 1.523001065616703, + "grad_norm": 6.74180699199658, + "learning_rate": 7.092715052539512e-07, + "loss": 0.5451, + "step": 21081 + }, + { + "epoch": 1.5230733108169126, + "grad_norm": 7.4880002983451845, + "learning_rate": 7.090674089686986e-07, + "loss": 0.5389, + "step": 21082 + }, + { + "epoch": 1.523145556017122, + "grad_norm": 8.871412384881943, + "learning_rate": 7.088633371999226e-07, + "loss": 0.6236, + "step": 21083 + }, + { + "epoch": 1.5232178012173316, + "grad_norm": 6.427576212065794, + "learning_rate": 7.086592899504158e-07, + "loss": 0.5651, + "step": 21084 + }, + { + "epoch": 1.5232900464175412, + "grad_norm": 6.359382198245312, + "learning_rate": 7.084552672229741e-07, + "loss": 0.6745, + "step": 21085 + }, + { + "epoch": 1.5233622916177505, + "grad_norm": 8.094703883220884, + "learning_rate": 7.08251269020388e-07, + "loss": 0.6495, + "step": 21086 + }, + { + "epoch": 1.5234345368179603, + "grad_norm": 6.845090133931794, + "learning_rate": 7.080472953454509e-07, + "loss": 0.5948, + "step": 21087 + }, + { + "epoch": 1.5235067820181696, + "grad_norm": 6.513095867991238, + "learning_rate": 7.078433462009556e-07, + "loss": 0.5987, + "step": 21088 + }, + { + "epoch": 1.5235790272183793, + "grad_norm": 7.583676285027846, + "learning_rate": 7.076394215896926e-07, + "loss": 0.7108, + "step": 21089 + }, + { + "epoch": 1.5236512724185887, + "grad_norm": 6.569176465680904, + "learning_rate": 7.074355215144543e-07, + "loss": 0.5253, + "step": 21090 + }, + { + "epoch": 1.5237235176187982, + "grad_norm": 6.186486671778085, + "learning_rate": 7.072316459780318e-07, + "loss": 0.6203, + "step": 21091 + }, + { + "epoch": 1.5237957628190077, + "grad_norm": 6.625826226302071, + "learning_rate": 7.07027794983216e-07, + "loss": 0.5978, + "step": 21092 + }, + { + "epoch": 1.523868008019217, + "grad_norm": 6.914265809599094, + "learning_rate": 7.06823968532798e-07, + "loss": 0.639, + "step": 21093 + }, + { + "epoch": 1.5239402532194268, + "grad_norm": 6.405628006676013, + "learning_rate": 7.066201666295669e-07, + "loss": 0.5775, + "step": 21094 + }, + { + "epoch": 1.5240124984196362, + "grad_norm": 8.149515111809855, + "learning_rate": 7.064163892763146e-07, + "loss": 0.5903, + "step": 21095 + }, + { + "epoch": 1.524084743619846, + "grad_norm": 7.385252727918294, + "learning_rate": 7.062126364758282e-07, + "loss": 0.6624, + "step": 21096 + }, + { + "epoch": 1.5241569888200552, + "grad_norm": 7.022151230937443, + "learning_rate": 7.060089082308979e-07, + "loss": 0.6303, + "step": 21097 + }, + { + "epoch": 1.5242292340202648, + "grad_norm": 5.920662869123224, + "learning_rate": 7.058052045443131e-07, + "loss": 0.6085, + "step": 21098 + }, + { + "epoch": 1.5243014792204743, + "grad_norm": 7.133370819468652, + "learning_rate": 7.056015254188625e-07, + "loss": 0.63, + "step": 21099 + }, + { + "epoch": 1.5243737244206836, + "grad_norm": 7.322245828027386, + "learning_rate": 7.05397870857333e-07, + "loss": 0.6642, + "step": 21100 + }, + { + "epoch": 1.5244459696208934, + "grad_norm": 7.046599456745685, + "learning_rate": 7.051942408625131e-07, + "loss": 0.6674, + "step": 21101 + }, + { + "epoch": 1.5245182148211027, + "grad_norm": 8.191210655747778, + "learning_rate": 7.049906354371908e-07, + "loss": 0.6116, + "step": 21102 + }, + { + "epoch": 1.5245904600213125, + "grad_norm": 7.443567702001329, + "learning_rate": 7.047870545841526e-07, + "loss": 0.6095, + "step": 21103 + }, + { + "epoch": 1.5246627052215218, + "grad_norm": 8.48332148802804, + "learning_rate": 7.045834983061859e-07, + "loss": 0.5748, + "step": 21104 + }, + { + "epoch": 1.5247349504217313, + "grad_norm": 7.491210364624049, + "learning_rate": 7.043799666060771e-07, + "loss": 0.6244, + "step": 21105 + }, + { + "epoch": 1.524807195621941, + "grad_norm": 6.509588065760281, + "learning_rate": 7.041764594866129e-07, + "loss": 0.5715, + "step": 21106 + }, + { + "epoch": 1.5248794408221504, + "grad_norm": 7.102557524918429, + "learning_rate": 7.039729769505782e-07, + "loss": 0.6188, + "step": 21107 + }, + { + "epoch": 1.52495168602236, + "grad_norm": 7.973878892884483, + "learning_rate": 7.037695190007587e-07, + "loss": 0.6482, + "step": 21108 + }, + { + "epoch": 1.5250239312225693, + "grad_norm": 26.490994056722016, + "learning_rate": 7.035660856399399e-07, + "loss": 0.6809, + "step": 21109 + }, + { + "epoch": 1.525096176422779, + "grad_norm": 7.026088600388399, + "learning_rate": 7.03362676870907e-07, + "loss": 0.6509, + "step": 21110 + }, + { + "epoch": 1.5251684216229884, + "grad_norm": 8.937716821561414, + "learning_rate": 7.031592926964436e-07, + "loss": 0.5844, + "step": 21111 + }, + { + "epoch": 1.525240666823198, + "grad_norm": 7.629831503111268, + "learning_rate": 7.029559331193336e-07, + "loss": 0.6852, + "step": 21112 + }, + { + "epoch": 1.5253129120234075, + "grad_norm": 6.555583978939759, + "learning_rate": 7.02752598142363e-07, + "loss": 0.5477, + "step": 21113 + }, + { + "epoch": 1.525385157223617, + "grad_norm": 7.8891278650088426, + "learning_rate": 7.025492877683129e-07, + "loss": 0.684, + "step": 21114 + }, + { + "epoch": 1.5254574024238265, + "grad_norm": 6.9324473634159824, + "learning_rate": 7.023460019999675e-07, + "loss": 0.6491, + "step": 21115 + }, + { + "epoch": 1.5255296476240359, + "grad_norm": 5.884962036217788, + "learning_rate": 7.021427408401097e-07, + "loss": 0.552, + "step": 21116 + }, + { + "epoch": 1.5256018928242456, + "grad_norm": 6.391185484093686, + "learning_rate": 7.019395042915225e-07, + "loss": 0.5338, + "step": 21117 + }, + { + "epoch": 1.525674138024455, + "grad_norm": 7.276496612663374, + "learning_rate": 7.017362923569867e-07, + "loss": 0.6145, + "step": 21118 + }, + { + "epoch": 1.5257463832246645, + "grad_norm": 7.98434633373034, + "learning_rate": 7.015331050392849e-07, + "loss": 0.5852, + "step": 21119 + }, + { + "epoch": 1.525818628424874, + "grad_norm": 7.541876043571109, + "learning_rate": 7.013299423411982e-07, + "loss": 0.5644, + "step": 21120 + }, + { + "epoch": 1.5258908736250836, + "grad_norm": 6.195660690778964, + "learning_rate": 7.011268042655081e-07, + "loss": 0.599, + "step": 21121 + }, + { + "epoch": 1.5259631188252931, + "grad_norm": 6.4461620533853985, + "learning_rate": 7.009236908149955e-07, + "loss": 0.5324, + "step": 21122 + }, + { + "epoch": 1.5260353640255024, + "grad_norm": 7.0906386757353514, + "learning_rate": 7.007206019924403e-07, + "loss": 0.7129, + "step": 21123 + }, + { + "epoch": 1.5261076092257122, + "grad_norm": 6.660788265008827, + "learning_rate": 7.00517537800624e-07, + "loss": 0.5835, + "step": 21124 + }, + { + "epoch": 1.5261798544259215, + "grad_norm": 7.444452450243439, + "learning_rate": 7.003144982423247e-07, + "loss": 0.6537, + "step": 21125 + }, + { + "epoch": 1.526252099626131, + "grad_norm": 7.6178643028650255, + "learning_rate": 7.001114833203227e-07, + "loss": 0.5993, + "step": 21126 + }, + { + "epoch": 1.5263243448263406, + "grad_norm": 5.6722227047778775, + "learning_rate": 6.999084930373967e-07, + "loss": 0.6022, + "step": 21127 + }, + { + "epoch": 1.5263965900265501, + "grad_norm": 7.114502518633786, + "learning_rate": 6.997055273963266e-07, + "loss": 0.6045, + "step": 21128 + }, + { + "epoch": 1.5264688352267597, + "grad_norm": 7.593820579420588, + "learning_rate": 6.995025863998891e-07, + "loss": 0.6063, + "step": 21129 + }, + { + "epoch": 1.526541080426969, + "grad_norm": 7.488638091800852, + "learning_rate": 6.992996700508634e-07, + "loss": 0.6503, + "step": 21130 + }, + { + "epoch": 1.5266133256271788, + "grad_norm": 7.733860314709055, + "learning_rate": 6.99096778352027e-07, + "loss": 0.6125, + "step": 21131 + }, + { + "epoch": 1.526685570827388, + "grad_norm": 6.097595879245508, + "learning_rate": 6.988939113061571e-07, + "loss": 0.5627, + "step": 21132 + }, + { + "epoch": 1.5267578160275976, + "grad_norm": 7.549594681325278, + "learning_rate": 6.986910689160315e-07, + "loss": 0.6076, + "step": 21133 + }, + { + "epoch": 1.5268300612278072, + "grad_norm": 7.849454105129551, + "learning_rate": 6.984882511844263e-07, + "loss": 0.7027, + "step": 21134 + }, + { + "epoch": 1.5269023064280167, + "grad_norm": 6.68651823163683, + "learning_rate": 6.982854581141191e-07, + "loss": 0.6607, + "step": 21135 + }, + { + "epoch": 1.5269745516282263, + "grad_norm": 7.802992750337297, + "learning_rate": 6.980826897078841e-07, + "loss": 0.6769, + "step": 21136 + }, + { + "epoch": 1.5270467968284356, + "grad_norm": 8.913031529310354, + "learning_rate": 6.978799459684979e-07, + "loss": 0.6577, + "step": 21137 + }, + { + "epoch": 1.5271190420286453, + "grad_norm": 6.555096619197288, + "learning_rate": 6.97677226898737e-07, + "loss": 0.6164, + "step": 21138 + }, + { + "epoch": 1.5271912872288547, + "grad_norm": 6.851517402039436, + "learning_rate": 6.974745325013746e-07, + "loss": 0.6394, + "step": 21139 + }, + { + "epoch": 1.5272635324290642, + "grad_norm": 7.3888283092083675, + "learning_rate": 6.972718627791863e-07, + "loss": 0.6262, + "step": 21140 + }, + { + "epoch": 1.5273357776292737, + "grad_norm": 8.784714636382663, + "learning_rate": 6.970692177349456e-07, + "loss": 0.6017, + "step": 21141 + }, + { + "epoch": 1.5274080228294833, + "grad_norm": 7.722859759584589, + "learning_rate": 6.968665973714289e-07, + "loss": 0.6277, + "step": 21142 + }, + { + "epoch": 1.5274802680296928, + "grad_norm": 7.278950995438432, + "learning_rate": 6.966640016914081e-07, + "loss": 0.6287, + "step": 21143 + }, + { + "epoch": 1.5275525132299022, + "grad_norm": 6.89854173592328, + "learning_rate": 6.964614306976563e-07, + "loss": 0.6064, + "step": 21144 + }, + { + "epoch": 1.527624758430112, + "grad_norm": 5.9038365191230735, + "learning_rate": 6.962588843929486e-07, + "loss": 0.6121, + "step": 21145 + }, + { + "epoch": 1.5276970036303212, + "grad_norm": 8.94716692363972, + "learning_rate": 6.960563627800549e-07, + "loss": 0.6686, + "step": 21146 + }, + { + "epoch": 1.5277692488305308, + "grad_norm": 6.209252240407839, + "learning_rate": 6.958538658617492e-07, + "loss": 0.6011, + "step": 21147 + }, + { + "epoch": 1.5278414940307403, + "grad_norm": 7.4753307256607595, + "learning_rate": 6.956513936408032e-07, + "loss": 0.5689, + "step": 21148 + }, + { + "epoch": 1.5279137392309499, + "grad_norm": 7.638469118857281, + "learning_rate": 6.954489461199887e-07, + "loss": 0.6315, + "step": 21149 + }, + { + "epoch": 1.5279859844311594, + "grad_norm": 8.986246408357538, + "learning_rate": 6.95246523302077e-07, + "loss": 0.5998, + "step": 21150 + }, + { + "epoch": 1.5280582296313687, + "grad_norm": 6.324757374569239, + "learning_rate": 6.950441251898388e-07, + "loss": 0.5865, + "step": 21151 + }, + { + "epoch": 1.5281304748315785, + "grad_norm": 7.0036990065371825, + "learning_rate": 6.948417517860454e-07, + "loss": 0.5918, + "step": 21152 + }, + { + "epoch": 1.5282027200317878, + "grad_norm": 6.855792872056522, + "learning_rate": 6.946394030934675e-07, + "loss": 0.7035, + "step": 21153 + }, + { + "epoch": 1.5282749652319974, + "grad_norm": 6.825767546591201, + "learning_rate": 6.944370791148736e-07, + "loss": 0.5826, + "step": 21154 + }, + { + "epoch": 1.528347210432207, + "grad_norm": 7.173113946549681, + "learning_rate": 6.942347798530341e-07, + "loss": 0.6802, + "step": 21155 + }, + { + "epoch": 1.5284194556324164, + "grad_norm": 7.544048656755738, + "learning_rate": 6.940325053107191e-07, + "loss": 0.5939, + "step": 21156 + }, + { + "epoch": 1.528491700832626, + "grad_norm": 7.014478424106518, + "learning_rate": 6.938302554906961e-07, + "loss": 0.6045, + "step": 21157 + }, + { + "epoch": 1.5285639460328353, + "grad_norm": 7.40757849001233, + "learning_rate": 6.936280303957346e-07, + "loss": 0.5783, + "step": 21158 + }, + { + "epoch": 1.528636191233045, + "grad_norm": 7.1067034386309125, + "learning_rate": 6.934258300286028e-07, + "loss": 0.6022, + "step": 21159 + }, + { + "epoch": 1.5287084364332544, + "grad_norm": 6.154995398314403, + "learning_rate": 6.932236543920687e-07, + "loss": 0.5566, + "step": 21160 + }, + { + "epoch": 1.528780681633464, + "grad_norm": 7.86839768497413, + "learning_rate": 6.930215034888998e-07, + "loss": 0.6322, + "step": 21161 + }, + { + "epoch": 1.5288529268336735, + "grad_norm": 5.808018423088006, + "learning_rate": 6.928193773218636e-07, + "loss": 0.6265, + "step": 21162 + }, + { + "epoch": 1.528925172033883, + "grad_norm": 7.385754777049464, + "learning_rate": 6.926172758937278e-07, + "loss": 0.5836, + "step": 21163 + }, + { + "epoch": 1.5289974172340925, + "grad_norm": 6.743969236527731, + "learning_rate": 6.924151992072572e-07, + "loss": 0.5747, + "step": 21164 + }, + { + "epoch": 1.5290696624343019, + "grad_norm": 7.503294411954705, + "learning_rate": 6.922131472652191e-07, + "loss": 0.617, + "step": 21165 + }, + { + "epoch": 1.5291419076345116, + "grad_norm": 6.671119379141102, + "learning_rate": 6.920111200703791e-07, + "loss": 0.5789, + "step": 21166 + }, + { + "epoch": 1.529214152834721, + "grad_norm": 8.747848682240841, + "learning_rate": 6.918091176255043e-07, + "loss": 0.6556, + "step": 21167 + }, + { + "epoch": 1.5292863980349307, + "grad_norm": 6.8949984902759915, + "learning_rate": 6.916071399333576e-07, + "loss": 0.6209, + "step": 21168 + }, + { + "epoch": 1.52935864323514, + "grad_norm": 7.399656813628092, + "learning_rate": 6.914051869967042e-07, + "loss": 0.6582, + "step": 21169 + }, + { + "epoch": 1.5294308884353496, + "grad_norm": 7.352412402718474, + "learning_rate": 6.912032588183112e-07, + "loss": 0.6339, + "step": 21170 + }, + { + "epoch": 1.5295031336355591, + "grad_norm": 7.863400348002064, + "learning_rate": 6.910013554009404e-07, + "loss": 0.6352, + "step": 21171 + }, + { + "epoch": 1.5295753788357684, + "grad_norm": 6.567108861038061, + "learning_rate": 6.907994767473564e-07, + "loss": 0.6041, + "step": 21172 + }, + { + "epoch": 1.5296476240359782, + "grad_norm": 9.08472322335259, + "learning_rate": 6.905976228603228e-07, + "loss": 0.6942, + "step": 21173 + }, + { + "epoch": 1.5297198692361875, + "grad_norm": 6.809713091078985, + "learning_rate": 6.903957937426037e-07, + "loss": 0.6165, + "step": 21174 + }, + { + "epoch": 1.5297921144363973, + "grad_norm": 7.444531362731529, + "learning_rate": 6.901939893969603e-07, + "loss": 0.5595, + "step": 21175 + }, + { + "epoch": 1.5298643596366066, + "grad_norm": 8.063041727954673, + "learning_rate": 6.899922098261561e-07, + "loss": 0.6595, + "step": 21176 + }, + { + "epoch": 1.5299366048368161, + "grad_norm": 8.102120449235853, + "learning_rate": 6.897904550329529e-07, + "loss": 0.6756, + "step": 21177 + }, + { + "epoch": 1.5300088500370257, + "grad_norm": 6.408705226919621, + "learning_rate": 6.895887250201128e-07, + "loss": 0.5947, + "step": 21178 + }, + { + "epoch": 1.530081095237235, + "grad_norm": 6.959707645984716, + "learning_rate": 6.893870197903976e-07, + "loss": 0.5322, + "step": 21179 + }, + { + "epoch": 1.5301533404374448, + "grad_norm": 6.888878702682566, + "learning_rate": 6.891853393465681e-07, + "loss": 0.7053, + "step": 21180 + }, + { + "epoch": 1.530225585637654, + "grad_norm": 6.793818345555196, + "learning_rate": 6.88983683691386e-07, + "loss": 0.5853, + "step": 21181 + }, + { + "epoch": 1.5302978308378639, + "grad_norm": 7.475112822328701, + "learning_rate": 6.887820528276107e-07, + "loss": 0.6422, + "step": 21182 + }, + { + "epoch": 1.5303700760380732, + "grad_norm": 8.119953172670836, + "learning_rate": 6.885804467580023e-07, + "loss": 0.6377, + "step": 21183 + }, + { + "epoch": 1.5304423212382827, + "grad_norm": 7.505412628195896, + "learning_rate": 6.883788654853215e-07, + "loss": 0.6471, + "step": 21184 + }, + { + "epoch": 1.5305145664384923, + "grad_norm": 7.645442886846723, + "learning_rate": 6.881773090123281e-07, + "loss": 0.6357, + "step": 21185 + }, + { + "epoch": 1.5305868116387018, + "grad_norm": 6.4795221392723334, + "learning_rate": 6.879757773417794e-07, + "loss": 0.5934, + "step": 21186 + }, + { + "epoch": 1.5306590568389113, + "grad_norm": 7.045958736423374, + "learning_rate": 6.877742704764359e-07, + "loss": 0.5397, + "step": 21187 + }, + { + "epoch": 1.5307313020391207, + "grad_norm": 7.1241609848998735, + "learning_rate": 6.875727884190553e-07, + "loss": 0.5999, + "step": 21188 + }, + { + "epoch": 1.5308035472393304, + "grad_norm": 6.555049772560995, + "learning_rate": 6.873713311723959e-07, + "loss": 0.6685, + "step": 21189 + }, + { + "epoch": 1.5308757924395398, + "grad_norm": 5.88302551127052, + "learning_rate": 6.871698987392155e-07, + "loss": 0.6304, + "step": 21190 + }, + { + "epoch": 1.5309480376397493, + "grad_norm": 6.449025031634052, + "learning_rate": 6.869684911222718e-07, + "loss": 0.5272, + "step": 21191 + }, + { + "epoch": 1.5310202828399588, + "grad_norm": 7.341612419537085, + "learning_rate": 6.867671083243224e-07, + "loss": 0.6315, + "step": 21192 + }, + { + "epoch": 1.5310925280401684, + "grad_norm": 7.561967310784608, + "learning_rate": 6.865657503481227e-07, + "loss": 0.655, + "step": 21193 + }, + { + "epoch": 1.531164773240378, + "grad_norm": 5.606609034444287, + "learning_rate": 6.863644171964298e-07, + "loss": 0.5724, + "step": 21194 + }, + { + "epoch": 1.5312370184405872, + "grad_norm": 7.381712622684284, + "learning_rate": 6.861631088720005e-07, + "loss": 0.6146, + "step": 21195 + }, + { + "epoch": 1.531309263640797, + "grad_norm": 6.156707901554093, + "learning_rate": 6.859618253775893e-07, + "loss": 0.5454, + "step": 21196 + }, + { + "epoch": 1.5313815088410063, + "grad_norm": 7.606723832704919, + "learning_rate": 6.857605667159514e-07, + "loss": 0.6233, + "step": 21197 + }, + { + "epoch": 1.5314537540412159, + "grad_norm": 7.988505211560966, + "learning_rate": 6.855593328898433e-07, + "loss": 0.6126, + "step": 21198 + }, + { + "epoch": 1.5315259992414254, + "grad_norm": 7.218195642100366, + "learning_rate": 6.853581239020198e-07, + "loss": 0.6687, + "step": 21199 + }, + { + "epoch": 1.531598244441635, + "grad_norm": 8.40769339156132, + "learning_rate": 6.85156939755234e-07, + "loss": 0.6115, + "step": 21200 + }, + { + "epoch": 1.5316704896418445, + "grad_norm": 8.66991505511953, + "learning_rate": 6.849557804522408e-07, + "loss": 0.685, + "step": 21201 + }, + { + "epoch": 1.5317427348420538, + "grad_norm": 6.197603242359665, + "learning_rate": 6.847546459957932e-07, + "loss": 0.5604, + "step": 21202 + }, + { + "epoch": 1.5318149800422636, + "grad_norm": 6.046806551917992, + "learning_rate": 6.845535363886463e-07, + "loss": 0.5265, + "step": 21203 + }, + { + "epoch": 1.531887225242473, + "grad_norm": 6.756746664503654, + "learning_rate": 6.84352451633551e-07, + "loss": 0.6156, + "step": 21204 + }, + { + "epoch": 1.5319594704426824, + "grad_norm": 8.172464090561881, + "learning_rate": 6.841513917332607e-07, + "loss": 0.6048, + "step": 21205 + }, + { + "epoch": 1.532031715642892, + "grad_norm": 6.434614997971451, + "learning_rate": 6.839503566905279e-07, + "loss": 0.5881, + "step": 21206 + }, + { + "epoch": 1.5321039608431015, + "grad_norm": 6.978072017019245, + "learning_rate": 6.837493465081047e-07, + "loss": 0.6398, + "step": 21207 + }, + { + "epoch": 1.532176206043311, + "grad_norm": 7.543667129276332, + "learning_rate": 6.835483611887428e-07, + "loss": 0.6186, + "step": 21208 + }, + { + "epoch": 1.5322484512435204, + "grad_norm": 6.984702151699041, + "learning_rate": 6.833474007351934e-07, + "loss": 0.6243, + "step": 21209 + }, + { + "epoch": 1.5323206964437301, + "grad_norm": 5.935843146192559, + "learning_rate": 6.831464651502081e-07, + "loss": 0.5525, + "step": 21210 + }, + { + "epoch": 1.5323929416439395, + "grad_norm": 6.759918837201145, + "learning_rate": 6.829455544365366e-07, + "loss": 0.6349, + "step": 21211 + }, + { + "epoch": 1.532465186844149, + "grad_norm": 7.962855174528642, + "learning_rate": 6.827446685969294e-07, + "loss": 0.5768, + "step": 21212 + }, + { + "epoch": 1.5325374320443585, + "grad_norm": 6.798008335378889, + "learning_rate": 6.825438076341376e-07, + "loss": 0.6223, + "step": 21213 + }, + { + "epoch": 1.532609677244568, + "grad_norm": 8.134080712838344, + "learning_rate": 6.823429715509089e-07, + "loss": 0.5978, + "step": 21214 + }, + { + "epoch": 1.5326819224447776, + "grad_norm": 7.257051130257919, + "learning_rate": 6.821421603499936e-07, + "loss": 0.6512, + "step": 21215 + }, + { + "epoch": 1.532754167644987, + "grad_norm": 7.033470786440429, + "learning_rate": 6.81941374034141e-07, + "loss": 0.5644, + "step": 21216 + }, + { + "epoch": 1.5328264128451967, + "grad_norm": 6.207786211658566, + "learning_rate": 6.817406126060991e-07, + "loss": 0.6125, + "step": 21217 + }, + { + "epoch": 1.532898658045406, + "grad_norm": 6.157336763610369, + "learning_rate": 6.815398760686165e-07, + "loss": 0.6924, + "step": 21218 + }, + { + "epoch": 1.5329709032456156, + "grad_norm": 8.115790209228848, + "learning_rate": 6.81339164424441e-07, + "loss": 0.5647, + "step": 21219 + }, + { + "epoch": 1.5330431484458251, + "grad_norm": 7.248382124758164, + "learning_rate": 6.81138477676321e-07, + "loss": 0.5635, + "step": 21220 + }, + { + "epoch": 1.5331153936460347, + "grad_norm": 6.352934404132699, + "learning_rate": 6.809378158270022e-07, + "loss": 0.6896, + "step": 21221 + }, + { + "epoch": 1.5331876388462442, + "grad_norm": 6.456538814499202, + "learning_rate": 6.807371788792325e-07, + "loss": 0.6055, + "step": 21222 + }, + { + "epoch": 1.5332598840464535, + "grad_norm": 6.724975540513721, + "learning_rate": 6.805365668357581e-07, + "loss": 0.5625, + "step": 21223 + }, + { + "epoch": 1.5333321292466633, + "grad_norm": 6.639168900561692, + "learning_rate": 6.803359796993261e-07, + "loss": 0.628, + "step": 21224 + }, + { + "epoch": 1.5334043744468726, + "grad_norm": 6.513215641818269, + "learning_rate": 6.801354174726802e-07, + "loss": 0.5968, + "step": 21225 + }, + { + "epoch": 1.5334766196470822, + "grad_norm": 7.081374881245889, + "learning_rate": 6.799348801585681e-07, + "loss": 0.5575, + "step": 21226 + }, + { + "epoch": 1.5335488648472917, + "grad_norm": 7.432912229005638, + "learning_rate": 6.797343677597352e-07, + "loss": 0.6638, + "step": 21227 + }, + { + "epoch": 1.5336211100475012, + "grad_norm": 6.845012670244705, + "learning_rate": 6.795338802789244e-07, + "loss": 0.5732, + "step": 21228 + }, + { + "epoch": 1.5336933552477108, + "grad_norm": 6.120652816461509, + "learning_rate": 6.793334177188815e-07, + "loss": 0.6008, + "step": 21229 + }, + { + "epoch": 1.53376560044792, + "grad_norm": 7.611039686525309, + "learning_rate": 6.791329800823504e-07, + "loss": 0.6668, + "step": 21230 + }, + { + "epoch": 1.5338378456481299, + "grad_norm": 8.259963521438111, + "learning_rate": 6.789325673720759e-07, + "loss": 0.6257, + "step": 21231 + }, + { + "epoch": 1.5339100908483392, + "grad_norm": 6.640164563541193, + "learning_rate": 6.787321795907998e-07, + "loss": 0.5429, + "step": 21232 + }, + { + "epoch": 1.5339823360485487, + "grad_norm": 6.809181735135258, + "learning_rate": 6.785318167412658e-07, + "loss": 0.608, + "step": 21233 + }, + { + "epoch": 1.5340545812487583, + "grad_norm": 6.456111337034965, + "learning_rate": 6.783314788262171e-07, + "loss": 0.6541, + "step": 21234 + }, + { + "epoch": 1.5341268264489678, + "grad_norm": 7.1893377193773365, + "learning_rate": 6.78131165848396e-07, + "loss": 0.6017, + "step": 21235 + }, + { + "epoch": 1.5341990716491773, + "grad_norm": 8.644946047436383, + "learning_rate": 6.779308778105446e-07, + "loss": 0.7319, + "step": 21236 + }, + { + "epoch": 1.5342713168493867, + "grad_norm": 8.113463207366207, + "learning_rate": 6.777306147154047e-07, + "loss": 0.6017, + "step": 21237 + }, + { + "epoch": 1.5343435620495964, + "grad_norm": 6.247337995110116, + "learning_rate": 6.775303765657187e-07, + "loss": 0.5725, + "step": 21238 + }, + { + "epoch": 1.5344158072498058, + "grad_norm": 8.094397088183369, + "learning_rate": 6.77330163364226e-07, + "loss": 0.5919, + "step": 21239 + }, + { + "epoch": 1.5344880524500155, + "grad_norm": 6.970573973481336, + "learning_rate": 6.771299751136682e-07, + "loss": 0.571, + "step": 21240 + }, + { + "epoch": 1.5345602976502248, + "grad_norm": 6.209971917867514, + "learning_rate": 6.769298118167855e-07, + "loss": 0.6649, + "step": 21241 + }, + { + "epoch": 1.5346325428504344, + "grad_norm": 6.929583457329152, + "learning_rate": 6.76729673476319e-07, + "loss": 0.6338, + "step": 21242 + }, + { + "epoch": 1.534704788050644, + "grad_norm": 7.7847385170008225, + "learning_rate": 6.76529560095007e-07, + "loss": 0.6854, + "step": 21243 + }, + { + "epoch": 1.5347770332508532, + "grad_norm": 6.837102858256907, + "learning_rate": 6.763294716755892e-07, + "loss": 0.5667, + "step": 21244 + }, + { + "epoch": 1.534849278451063, + "grad_norm": 7.408908029012983, + "learning_rate": 6.761294082208053e-07, + "loss": 0.5706, + "step": 21245 + }, + { + "epoch": 1.5349215236512723, + "grad_norm": 7.607205498768144, + "learning_rate": 6.759293697333932e-07, + "loss": 0.6538, + "step": 21246 + }, + { + "epoch": 1.534993768851482, + "grad_norm": 7.738274102012726, + "learning_rate": 6.757293562160921e-07, + "loss": 0.6087, + "step": 21247 + }, + { + "epoch": 1.5350660140516914, + "grad_norm": 7.020876951299118, + "learning_rate": 6.755293676716396e-07, + "loss": 0.5764, + "step": 21248 + }, + { + "epoch": 1.535138259251901, + "grad_norm": 7.289032669251659, + "learning_rate": 6.753294041027742e-07, + "loss": 0.5378, + "step": 21249 + }, + { + "epoch": 1.5352105044521105, + "grad_norm": 6.22703085254324, + "learning_rate": 6.751294655122315e-07, + "loss": 0.5912, + "step": 21250 + }, + { + "epoch": 1.5352827496523198, + "grad_norm": 6.838013635622339, + "learning_rate": 6.749295519027496e-07, + "loss": 0.6203, + "step": 21251 + }, + { + "epoch": 1.5353549948525296, + "grad_norm": 7.380964550471198, + "learning_rate": 6.747296632770659e-07, + "loss": 0.5592, + "step": 21252 + }, + { + "epoch": 1.535427240052739, + "grad_norm": 7.766104669934907, + "learning_rate": 6.745297996379152e-07, + "loss": 0.5993, + "step": 21253 + }, + { + "epoch": 1.5354994852529487, + "grad_norm": 7.345796600209564, + "learning_rate": 6.743299609880332e-07, + "loss": 0.5946, + "step": 21254 + }, + { + "epoch": 1.535571730453158, + "grad_norm": 6.535177847580869, + "learning_rate": 6.741301473301573e-07, + "loss": 0.5667, + "step": 21255 + }, + { + "epoch": 1.5356439756533675, + "grad_norm": 7.358160935222388, + "learning_rate": 6.739303586670229e-07, + "loss": 0.597, + "step": 21256 + }, + { + "epoch": 1.535716220853577, + "grad_norm": 7.0428430902544825, + "learning_rate": 6.737305950013631e-07, + "loss": 0.6979, + "step": 21257 + }, + { + "epoch": 1.5357884660537866, + "grad_norm": 8.113532792050314, + "learning_rate": 6.735308563359136e-07, + "loss": 0.6232, + "step": 21258 + }, + { + "epoch": 1.5358607112539961, + "grad_norm": 7.947930878859412, + "learning_rate": 6.733311426734085e-07, + "loss": 0.6483, + "step": 21259 + }, + { + "epoch": 1.5359329564542055, + "grad_norm": 6.455063055485436, + "learning_rate": 6.731314540165823e-07, + "loss": 0.6398, + "step": 21260 + }, + { + "epoch": 1.5360052016544152, + "grad_norm": 6.976795700977192, + "learning_rate": 6.729317903681673e-07, + "loss": 0.6568, + "step": 21261 + }, + { + "epoch": 1.5360774468546246, + "grad_norm": 7.257149426872447, + "learning_rate": 6.727321517308977e-07, + "loss": 0.6295, + "step": 21262 + }, + { + "epoch": 1.536149692054834, + "grad_norm": 6.668862871181931, + "learning_rate": 6.72532538107506e-07, + "loss": 0.6831, + "step": 21263 + }, + { + "epoch": 1.5362219372550436, + "grad_norm": 7.097636363999385, + "learning_rate": 6.72332949500725e-07, + "loss": 0.622, + "step": 21264 + }, + { + "epoch": 1.5362941824552532, + "grad_norm": 7.338557838727153, + "learning_rate": 6.721333859132867e-07, + "loss": 0.6194, + "step": 21265 + }, + { + "epoch": 1.5363664276554627, + "grad_norm": 6.85330327711429, + "learning_rate": 6.719338473479231e-07, + "loss": 0.5755, + "step": 21266 + }, + { + "epoch": 1.536438672855672, + "grad_norm": 6.092333506742145, + "learning_rate": 6.717343338073667e-07, + "loss": 0.5507, + "step": 21267 + }, + { + "epoch": 1.5365109180558818, + "grad_norm": 6.340869569080399, + "learning_rate": 6.715348452943468e-07, + "loss": 0.5862, + "step": 21268 + }, + { + "epoch": 1.5365831632560911, + "grad_norm": 7.759834694428953, + "learning_rate": 6.71335381811595e-07, + "loss": 0.5645, + "step": 21269 + }, + { + "epoch": 1.5366554084563007, + "grad_norm": 7.012347910926621, + "learning_rate": 6.711359433618431e-07, + "loss": 0.6167, + "step": 21270 + }, + { + "epoch": 1.5367276536565102, + "grad_norm": 8.329379504335607, + "learning_rate": 6.70936529947819e-07, + "loss": 0.691, + "step": 21271 + }, + { + "epoch": 1.5367998988567197, + "grad_norm": 7.018073863125909, + "learning_rate": 6.707371415722538e-07, + "loss": 0.642, + "step": 21272 + }, + { + "epoch": 1.5368721440569293, + "grad_norm": 6.672408142944, + "learning_rate": 6.705377782378769e-07, + "loss": 0.6342, + "step": 21273 + }, + { + "epoch": 1.5369443892571386, + "grad_norm": 6.860002840892326, + "learning_rate": 6.703384399474172e-07, + "loss": 0.5897, + "step": 21274 + }, + { + "epoch": 1.5370166344573484, + "grad_norm": 7.114148358908365, + "learning_rate": 6.701391267036034e-07, + "loss": 0.591, + "step": 21275 + }, + { + "epoch": 1.5370888796575577, + "grad_norm": 7.193701680780926, + "learning_rate": 6.699398385091647e-07, + "loss": 0.6133, + "step": 21276 + }, + { + "epoch": 1.5371611248577672, + "grad_norm": 6.640641658986457, + "learning_rate": 6.697405753668287e-07, + "loss": 0.6038, + "step": 21277 + }, + { + "epoch": 1.5372333700579768, + "grad_norm": 5.7427106346523855, + "learning_rate": 6.69541337279323e-07, + "loss": 0.553, + "step": 21278 + }, + { + "epoch": 1.5373056152581863, + "grad_norm": 7.663372603696491, + "learning_rate": 6.693421242493748e-07, + "loss": 0.5906, + "step": 21279 + }, + { + "epoch": 1.5373778604583959, + "grad_norm": 6.401961216836213, + "learning_rate": 6.691429362797116e-07, + "loss": 0.5607, + "step": 21280 + }, + { + "epoch": 1.5374501056586052, + "grad_norm": 7.245297189202336, + "learning_rate": 6.689437733730609e-07, + "loss": 0.6179, + "step": 21281 + }, + { + "epoch": 1.537522350858815, + "grad_norm": 5.938291798047631, + "learning_rate": 6.687446355321464e-07, + "loss": 0.5301, + "step": 21282 + }, + { + "epoch": 1.5375945960590243, + "grad_norm": 7.451871946552256, + "learning_rate": 6.685455227596971e-07, + "loss": 0.6227, + "step": 21283 + }, + { + "epoch": 1.5376668412592338, + "grad_norm": 6.5187685036328045, + "learning_rate": 6.683464350584373e-07, + "loss": 0.6387, + "step": 21284 + }, + { + "epoch": 1.5377390864594433, + "grad_norm": 6.359467076940138, + "learning_rate": 6.681473724310933e-07, + "loss": 0.5974, + "step": 21285 + }, + { + "epoch": 1.537811331659653, + "grad_norm": 7.255737927987661, + "learning_rate": 6.679483348803889e-07, + "loss": 0.6292, + "step": 21286 + }, + { + "epoch": 1.5378835768598624, + "grad_norm": 7.645337607574244, + "learning_rate": 6.677493224090492e-07, + "loss": 0.6966, + "step": 21287 + }, + { + "epoch": 1.5379558220600718, + "grad_norm": 7.093918852960105, + "learning_rate": 6.675503350197995e-07, + "loss": 0.5815, + "step": 21288 + }, + { + "epoch": 1.5380280672602815, + "grad_norm": 6.383398171569503, + "learning_rate": 6.673513727153618e-07, + "loss": 0.5503, + "step": 21289 + }, + { + "epoch": 1.5381003124604908, + "grad_norm": 7.420268637840199, + "learning_rate": 6.671524354984613e-07, + "loss": 0.5556, + "step": 21290 + }, + { + "epoch": 1.5381725576607004, + "grad_norm": 6.398159370385941, + "learning_rate": 6.669535233718205e-07, + "loss": 0.6799, + "step": 21291 + }, + { + "epoch": 1.53824480286091, + "grad_norm": 6.992228144522233, + "learning_rate": 6.66754636338163e-07, + "loss": 0.6982, + "step": 21292 + }, + { + "epoch": 1.5383170480611195, + "grad_norm": 8.606264857637411, + "learning_rate": 6.66555774400211e-07, + "loss": 0.578, + "step": 21293 + }, + { + "epoch": 1.538389293261329, + "grad_norm": 6.436749164533981, + "learning_rate": 6.663569375606868e-07, + "loss": 0.6266, + "step": 21294 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 6.3017653035607495, + "learning_rate": 6.661581258223129e-07, + "loss": 0.6029, + "step": 21295 + }, + { + "epoch": 1.538533783661748, + "grad_norm": 7.307594153482507, + "learning_rate": 6.6595933918781e-07, + "loss": 0.6491, + "step": 21296 + }, + { + "epoch": 1.5386060288619574, + "grad_norm": 7.4415884881720595, + "learning_rate": 6.657605776598996e-07, + "loss": 0.681, + "step": 21297 + }, + { + "epoch": 1.538678274062167, + "grad_norm": 7.189490266645436, + "learning_rate": 6.655618412413025e-07, + "loss": 0.5961, + "step": 21298 + }, + { + "epoch": 1.5387505192623765, + "grad_norm": 7.2070640666597034, + "learning_rate": 6.653631299347404e-07, + "loss": 0.5902, + "step": 21299 + }, + { + "epoch": 1.538822764462586, + "grad_norm": 6.568634653575901, + "learning_rate": 6.651644437429319e-07, + "loss": 0.5989, + "step": 21300 + }, + { + "epoch": 1.5388950096627956, + "grad_norm": 7.488275646352885, + "learning_rate": 6.649657826685974e-07, + "loss": 0.6024, + "step": 21301 + }, + { + "epoch": 1.538967254863005, + "grad_norm": 5.814164118300347, + "learning_rate": 6.647671467144564e-07, + "loss": 0.5868, + "step": 21302 + }, + { + "epoch": 1.5390395000632147, + "grad_norm": 8.138091353861117, + "learning_rate": 6.645685358832285e-07, + "loss": 0.6196, + "step": 21303 + }, + { + "epoch": 1.539111745263424, + "grad_norm": 6.338203899711369, + "learning_rate": 6.643699501776318e-07, + "loss": 0.5956, + "step": 21304 + }, + { + "epoch": 1.5391839904636335, + "grad_norm": 7.168919438402599, + "learning_rate": 6.641713896003856e-07, + "loss": 0.661, + "step": 21305 + }, + { + "epoch": 1.539256235663843, + "grad_norm": 7.085412591672998, + "learning_rate": 6.639728541542082e-07, + "loss": 0.6161, + "step": 21306 + }, + { + "epoch": 1.5393284808640526, + "grad_norm": 6.842193844642668, + "learning_rate": 6.637743438418162e-07, + "loss": 0.571, + "step": 21307 + }, + { + "epoch": 1.5394007260642621, + "grad_norm": 7.161208121812689, + "learning_rate": 6.635758586659274e-07, + "loss": 0.6199, + "step": 21308 + }, + { + "epoch": 1.5394729712644715, + "grad_norm": 6.872860939869168, + "learning_rate": 6.633773986292597e-07, + "loss": 0.5454, + "step": 21309 + }, + { + "epoch": 1.5395452164646812, + "grad_norm": 7.328906052209916, + "learning_rate": 6.63178963734529e-07, + "loss": 0.6524, + "step": 21310 + }, + { + "epoch": 1.5396174616648906, + "grad_norm": 5.938371453831567, + "learning_rate": 6.629805539844522e-07, + "loss": 0.669, + "step": 21311 + }, + { + "epoch": 1.5396897068651003, + "grad_norm": 7.601481955749417, + "learning_rate": 6.627821693817452e-07, + "loss": 0.624, + "step": 21312 + }, + { + "epoch": 1.5397619520653096, + "grad_norm": 7.628847245995582, + "learning_rate": 6.625838099291246e-07, + "loss": 0.5764, + "step": 21313 + }, + { + "epoch": 1.5398341972655192, + "grad_norm": 6.989890291283525, + "learning_rate": 6.623854756293041e-07, + "loss": 0.5596, + "step": 21314 + }, + { + "epoch": 1.5399064424657287, + "grad_norm": 7.191634075611066, + "learning_rate": 6.62187166485e-07, + "loss": 0.6742, + "step": 21315 + }, + { + "epoch": 1.539978687665938, + "grad_norm": 7.254319318391861, + "learning_rate": 6.619888824989262e-07, + "loss": 0.6807, + "step": 21316 + }, + { + "epoch": 1.5400509328661478, + "grad_norm": 8.061927479369755, + "learning_rate": 6.617906236737983e-07, + "loss": 0.6519, + "step": 21317 + }, + { + "epoch": 1.5401231780663571, + "grad_norm": 7.325988067542253, + "learning_rate": 6.615923900123289e-07, + "loss": 0.6088, + "step": 21318 + }, + { + "epoch": 1.5401954232665669, + "grad_norm": 7.759460581485352, + "learning_rate": 6.61394181517232e-07, + "loss": 0.5874, + "step": 21319 + }, + { + "epoch": 1.5402676684667762, + "grad_norm": 6.502942812932579, + "learning_rate": 6.611959981912214e-07, + "loss": 0.6397, + "step": 21320 + }, + { + "epoch": 1.5403399136669857, + "grad_norm": 7.338735873430158, + "learning_rate": 6.609978400370098e-07, + "loss": 0.5903, + "step": 21321 + }, + { + "epoch": 1.5404121588671953, + "grad_norm": 7.200886078458776, + "learning_rate": 6.607997070573099e-07, + "loss": 0.6525, + "step": 21322 + }, + { + "epoch": 1.5404844040674046, + "grad_norm": 6.61875650119192, + "learning_rate": 6.606015992548339e-07, + "loss": 0.5895, + "step": 21323 + }, + { + "epoch": 1.5405566492676144, + "grad_norm": 7.191876215292763, + "learning_rate": 6.604035166322944e-07, + "loss": 0.731, + "step": 21324 + }, + { + "epoch": 1.5406288944678237, + "grad_norm": 8.431172124391322, + "learning_rate": 6.60205459192402e-07, + "loss": 0.6072, + "step": 21325 + }, + { + "epoch": 1.5407011396680335, + "grad_norm": 6.982860289995056, + "learning_rate": 6.600074269378681e-07, + "loss": 0.5735, + "step": 21326 + }, + { + "epoch": 1.5407733848682428, + "grad_norm": 7.5441060485505735, + "learning_rate": 6.598094198714047e-07, + "loss": 0.6072, + "step": 21327 + }, + { + "epoch": 1.5408456300684523, + "grad_norm": 8.145314335854867, + "learning_rate": 6.596114379957208e-07, + "loss": 0.6238, + "step": 21328 + }, + { + "epoch": 1.5409178752686619, + "grad_norm": 8.265655459832818, + "learning_rate": 6.594134813135275e-07, + "loss": 0.6352, + "step": 21329 + }, + { + "epoch": 1.5409901204688712, + "grad_norm": 5.937966739981536, + "learning_rate": 6.592155498275343e-07, + "loss": 0.5676, + "step": 21330 + }, + { + "epoch": 1.541062365669081, + "grad_norm": 6.838496451846987, + "learning_rate": 6.590176435404508e-07, + "loss": 0.5852, + "step": 21331 + }, + { + "epoch": 1.5411346108692903, + "grad_norm": 7.043494655039244, + "learning_rate": 6.588197624549867e-07, + "loss": 0.567, + "step": 21332 + }, + { + "epoch": 1.5412068560695, + "grad_norm": 7.496534691385773, + "learning_rate": 6.586219065738503e-07, + "loss": 0.6319, + "step": 21333 + }, + { + "epoch": 1.5412791012697094, + "grad_norm": 8.05159666919176, + "learning_rate": 6.584240758997501e-07, + "loss": 0.6061, + "step": 21334 + }, + { + "epoch": 1.541351346469919, + "grad_norm": 5.93280518284955, + "learning_rate": 6.582262704353954e-07, + "loss": 0.642, + "step": 21335 + }, + { + "epoch": 1.5414235916701284, + "grad_norm": 6.298436276571296, + "learning_rate": 6.580284901834921e-07, + "loss": 0.6073, + "step": 21336 + }, + { + "epoch": 1.541495836870338, + "grad_norm": 6.332379152129029, + "learning_rate": 6.578307351467486e-07, + "loss": 0.6483, + "step": 21337 + }, + { + "epoch": 1.5415680820705475, + "grad_norm": 7.7790406549099735, + "learning_rate": 6.576330053278721e-07, + "loss": 0.6146, + "step": 21338 + }, + { + "epoch": 1.5416403272707568, + "grad_norm": 6.237136726274031, + "learning_rate": 6.574353007295692e-07, + "loss": 0.6278, + "step": 21339 + }, + { + "epoch": 1.5417125724709666, + "grad_norm": 7.603629761296574, + "learning_rate": 6.572376213545462e-07, + "loss": 0.6383, + "step": 21340 + }, + { + "epoch": 1.541784817671176, + "grad_norm": 7.653652201975773, + "learning_rate": 6.570399672055094e-07, + "loss": 0.6451, + "step": 21341 + }, + { + "epoch": 1.5418570628713855, + "grad_norm": 6.516403042013057, + "learning_rate": 6.568423382851655e-07, + "loss": 0.6192, + "step": 21342 + }, + { + "epoch": 1.541929308071595, + "grad_norm": 6.697597118327127, + "learning_rate": 6.566447345962179e-07, + "loss": 0.6385, + "step": 21343 + }, + { + "epoch": 1.5420015532718045, + "grad_norm": 8.171381102127775, + "learning_rate": 6.564471561413727e-07, + "loss": 0.6599, + "step": 21344 + }, + { + "epoch": 1.542073798472014, + "grad_norm": 6.311656933375649, + "learning_rate": 6.562496029233351e-07, + "loss": 0.6368, + "step": 21345 + }, + { + "epoch": 1.5421460436722234, + "grad_norm": 6.655717568750981, + "learning_rate": 6.560520749448082e-07, + "loss": 0.637, + "step": 21346 + }, + { + "epoch": 1.5422182888724332, + "grad_norm": 6.703970048896691, + "learning_rate": 6.55854572208497e-07, + "loss": 0.6472, + "step": 21347 + }, + { + "epoch": 1.5422905340726425, + "grad_norm": 7.620049292137132, + "learning_rate": 6.556570947171043e-07, + "loss": 0.5871, + "step": 21348 + }, + { + "epoch": 1.542362779272852, + "grad_norm": 6.980927779395287, + "learning_rate": 6.554596424733342e-07, + "loss": 0.5821, + "step": 21349 + }, + { + "epoch": 1.5424350244730616, + "grad_norm": 8.127144867886463, + "learning_rate": 6.552622154798893e-07, + "loss": 0.6667, + "step": 21350 + }, + { + "epoch": 1.5425072696732711, + "grad_norm": 6.3748997699113685, + "learning_rate": 6.550648137394725e-07, + "loss": 0.581, + "step": 21351 + }, + { + "epoch": 1.5425795148734807, + "grad_norm": 6.308517559524434, + "learning_rate": 6.548674372547865e-07, + "loss": 0.5303, + "step": 21352 + }, + { + "epoch": 1.54265176007369, + "grad_norm": 6.549709680544174, + "learning_rate": 6.546700860285318e-07, + "loss": 0.622, + "step": 21353 + }, + { + "epoch": 1.5427240052738997, + "grad_norm": 8.185785711067663, + "learning_rate": 6.544727600634107e-07, + "loss": 0.5974, + "step": 21354 + }, + { + "epoch": 1.542796250474109, + "grad_norm": 6.899782768921119, + "learning_rate": 6.542754593621247e-07, + "loss": 0.6523, + "step": 21355 + }, + { + "epoch": 1.5428684956743186, + "grad_norm": 6.874536394613759, + "learning_rate": 6.540781839273755e-07, + "loss": 0.7182, + "step": 21356 + }, + { + "epoch": 1.5429407408745281, + "grad_norm": 8.10359635753048, + "learning_rate": 6.538809337618617e-07, + "loss": 0.653, + "step": 21357 + }, + { + "epoch": 1.5430129860747377, + "grad_norm": 7.014611117839783, + "learning_rate": 6.536837088682849e-07, + "loss": 0.5995, + "step": 21358 + }, + { + "epoch": 1.5430852312749472, + "grad_norm": 6.3661074980331325, + "learning_rate": 6.534865092493442e-07, + "loss": 0.6101, + "step": 21359 + }, + { + "epoch": 1.5431574764751566, + "grad_norm": 7.6825478583253215, + "learning_rate": 6.532893349077393e-07, + "loss": 0.6173, + "step": 21360 + }, + { + "epoch": 1.5432297216753663, + "grad_norm": 9.331891856823107, + "learning_rate": 6.5309218584617e-07, + "loss": 0.6579, + "step": 21361 + }, + { + "epoch": 1.5433019668755756, + "grad_norm": 6.406984463899869, + "learning_rate": 6.528950620673344e-07, + "loss": 0.5237, + "step": 21362 + }, + { + "epoch": 1.5433742120757852, + "grad_norm": 7.641911497808126, + "learning_rate": 6.52697963573932e-07, + "loss": 0.6544, + "step": 21363 + }, + { + "epoch": 1.5434464572759947, + "grad_norm": 7.7618239251178105, + "learning_rate": 6.525008903686592e-07, + "loss": 0.6078, + "step": 21364 + }, + { + "epoch": 1.5435187024762043, + "grad_norm": 6.5920558398205715, + "learning_rate": 6.52303842454215e-07, + "loss": 0.561, + "step": 21365 + }, + { + "epoch": 1.5435909476764138, + "grad_norm": 6.9651397796209515, + "learning_rate": 6.521068198332963e-07, + "loss": 0.6518, + "step": 21366 + }, + { + "epoch": 1.5436631928766231, + "grad_norm": 8.109284681577963, + "learning_rate": 6.519098225086007e-07, + "loss": 0.5869, + "step": 21367 + }, + { + "epoch": 1.5437354380768329, + "grad_norm": 7.123686284538194, + "learning_rate": 6.517128504828243e-07, + "loss": 0.6747, + "step": 21368 + }, + { + "epoch": 1.5438076832770422, + "grad_norm": 7.15009532944945, + "learning_rate": 6.515159037586643e-07, + "loss": 0.6199, + "step": 21369 + }, + { + "epoch": 1.5438799284772518, + "grad_norm": 7.070760032020598, + "learning_rate": 6.513189823388169e-07, + "loss": 0.644, + "step": 21370 + }, + { + "epoch": 1.5439521736774613, + "grad_norm": 7.0441686334911475, + "learning_rate": 6.511220862259765e-07, + "loss": 0.6615, + "step": 21371 + }, + { + "epoch": 1.5440244188776708, + "grad_norm": 7.048616790351605, + "learning_rate": 6.50925215422839e-07, + "loss": 0.5923, + "step": 21372 + }, + { + "epoch": 1.5440966640778804, + "grad_norm": 6.774874876774276, + "learning_rate": 6.507283699320999e-07, + "loss": 0.5406, + "step": 21373 + }, + { + "epoch": 1.5441689092780897, + "grad_norm": 7.275007854863806, + "learning_rate": 6.50531549756454e-07, + "loss": 0.6844, + "step": 21374 + }, + { + "epoch": 1.5442411544782995, + "grad_norm": 7.9360936638626915, + "learning_rate": 6.503347548985944e-07, + "loss": 0.611, + "step": 21375 + }, + { + "epoch": 1.5443133996785088, + "grad_norm": 8.045136436125798, + "learning_rate": 6.501379853612161e-07, + "loss": 0.602, + "step": 21376 + }, + { + "epoch": 1.5443856448787183, + "grad_norm": 6.689415773162143, + "learning_rate": 6.499412411470124e-07, + "loss": 0.6689, + "step": 21377 + }, + { + "epoch": 1.5444578900789279, + "grad_norm": 7.382339446236734, + "learning_rate": 6.497445222586765e-07, + "loss": 0.6602, + "step": 21378 + }, + { + "epoch": 1.5445301352791374, + "grad_norm": 7.409465108039572, + "learning_rate": 6.495478286989015e-07, + "loss": 0.5877, + "step": 21379 + }, + { + "epoch": 1.544602380479347, + "grad_norm": 8.349525081811352, + "learning_rate": 6.4935116047038e-07, + "loss": 0.5607, + "step": 21380 + }, + { + "epoch": 1.5446746256795563, + "grad_norm": 7.111968051748531, + "learning_rate": 6.491545175758049e-07, + "loss": 0.6823, + "step": 21381 + }, + { + "epoch": 1.544746870879766, + "grad_norm": 6.029028606965098, + "learning_rate": 6.489579000178667e-07, + "loss": 0.6352, + "step": 21382 + }, + { + "epoch": 1.5448191160799754, + "grad_norm": 6.20409379979937, + "learning_rate": 6.487613077992574e-07, + "loss": 0.5971, + "step": 21383 + }, + { + "epoch": 1.544891361280185, + "grad_norm": 6.91957401926985, + "learning_rate": 6.485647409226692e-07, + "loss": 0.6133, + "step": 21384 + }, + { + "epoch": 1.5449636064803944, + "grad_norm": 8.2253832139368, + "learning_rate": 6.483681993907914e-07, + "loss": 0.6655, + "step": 21385 + }, + { + "epoch": 1.545035851680604, + "grad_norm": 8.032210355701734, + "learning_rate": 6.481716832063154e-07, + "loss": 0.5975, + "step": 21386 + }, + { + "epoch": 1.5451080968808135, + "grad_norm": 7.232582216187799, + "learning_rate": 6.479751923719305e-07, + "loss": 0.583, + "step": 21387 + }, + { + "epoch": 1.5451803420810228, + "grad_norm": 6.193090514209162, + "learning_rate": 6.477787268903285e-07, + "loss": 0.6263, + "step": 21388 + }, + { + "epoch": 1.5452525872812326, + "grad_norm": 7.929521530744649, + "learning_rate": 6.47582286764197e-07, + "loss": 0.7335, + "step": 21389 + }, + { + "epoch": 1.545324832481442, + "grad_norm": 6.951652593836979, + "learning_rate": 6.473858719962256e-07, + "loss": 0.6501, + "step": 21390 + }, + { + "epoch": 1.5453970776816517, + "grad_norm": 7.6258476911772375, + "learning_rate": 6.471894825891031e-07, + "loss": 0.5389, + "step": 21391 + }, + { + "epoch": 1.545469322881861, + "grad_norm": 7.036618501735393, + "learning_rate": 6.469931185455186e-07, + "loss": 0.6459, + "step": 21392 + }, + { + "epoch": 1.5455415680820705, + "grad_norm": 6.0854498260162275, + "learning_rate": 6.467967798681591e-07, + "loss": 0.628, + "step": 21393 + }, + { + "epoch": 1.54561381328228, + "grad_norm": 7.992383669238396, + "learning_rate": 6.466004665597126e-07, + "loss": 0.6527, + "step": 21394 + }, + { + "epoch": 1.5456860584824894, + "grad_norm": 7.284931087338138, + "learning_rate": 6.464041786228667e-07, + "loss": 0.6276, + "step": 21395 + }, + { + "epoch": 1.5457583036826992, + "grad_norm": 7.6903001601195236, + "learning_rate": 6.462079160603083e-07, + "loss": 0.6008, + "step": 21396 + }, + { + "epoch": 1.5458305488829085, + "grad_norm": 7.328610402117539, + "learning_rate": 6.460116788747242e-07, + "loss": 0.5371, + "step": 21397 + }, + { + "epoch": 1.5459027940831183, + "grad_norm": 6.81383984190363, + "learning_rate": 6.458154670688008e-07, + "loss": 0.5869, + "step": 21398 + }, + { + "epoch": 1.5459750392833276, + "grad_norm": 7.697540425426437, + "learning_rate": 6.456192806452243e-07, + "loss": 0.7043, + "step": 21399 + }, + { + "epoch": 1.5460472844835371, + "grad_norm": 7.641604994508016, + "learning_rate": 6.454231196066796e-07, + "loss": 0.5892, + "step": 21400 + }, + { + "epoch": 1.5461195296837467, + "grad_norm": 7.7972514763034795, + "learning_rate": 6.452269839558526e-07, + "loss": 0.6424, + "step": 21401 + }, + { + "epoch": 1.546191774883956, + "grad_norm": 7.899862853199746, + "learning_rate": 6.450308736954283e-07, + "loss": 0.5573, + "step": 21402 + }, + { + "epoch": 1.5462640200841657, + "grad_norm": 6.5132724530834505, + "learning_rate": 6.448347888280907e-07, + "loss": 0.5905, + "step": 21403 + }, + { + "epoch": 1.546336265284375, + "grad_norm": 7.177123698511838, + "learning_rate": 6.446387293565242e-07, + "loss": 0.6131, + "step": 21404 + }, + { + "epoch": 1.5464085104845848, + "grad_norm": 6.409339420363063, + "learning_rate": 6.444426952834129e-07, + "loss": 0.5836, + "step": 21405 + }, + { + "epoch": 1.5464807556847942, + "grad_norm": 7.083142655741997, + "learning_rate": 6.442466866114405e-07, + "loss": 0.5893, + "step": 21406 + }, + { + "epoch": 1.5465530008850037, + "grad_norm": 7.4886587223914285, + "learning_rate": 6.4405070334329e-07, + "loss": 0.6121, + "step": 21407 + }, + { + "epoch": 1.5466252460852132, + "grad_norm": 6.43597156935174, + "learning_rate": 6.438547454816444e-07, + "loss": 0.5858, + "step": 21408 + }, + { + "epoch": 1.5466974912854228, + "grad_norm": 6.559869729586147, + "learning_rate": 6.436588130291868e-07, + "loss": 0.6439, + "step": 21409 + }, + { + "epoch": 1.5467697364856323, + "grad_norm": 7.420872927829065, + "learning_rate": 6.434629059885977e-07, + "loss": 0.6837, + "step": 21410 + }, + { + "epoch": 1.5468419816858416, + "grad_norm": 7.0018697693783185, + "learning_rate": 6.432670243625602e-07, + "loss": 0.5943, + "step": 21411 + }, + { + "epoch": 1.5469142268860514, + "grad_norm": 6.932698280643653, + "learning_rate": 6.430711681537555e-07, + "loss": 0.6312, + "step": 21412 + }, + { + "epoch": 1.5469864720862607, + "grad_norm": 7.572919745009547, + "learning_rate": 6.428753373648653e-07, + "loss": 0.6113, + "step": 21413 + }, + { + "epoch": 1.5470587172864703, + "grad_norm": 6.738207936648272, + "learning_rate": 6.42679531998569e-07, + "loss": 0.5871, + "step": 21414 + }, + { + "epoch": 1.5471309624866798, + "grad_norm": 7.026528634025153, + "learning_rate": 6.424837520575472e-07, + "loss": 0.5376, + "step": 21415 + }, + { + "epoch": 1.5472032076868893, + "grad_norm": 6.586898353507028, + "learning_rate": 6.422879975444812e-07, + "loss": 0.6271, + "step": 21416 + }, + { + "epoch": 1.547275452887099, + "grad_norm": 6.561935255182947, + "learning_rate": 6.420922684620509e-07, + "loss": 0.5915, + "step": 21417 + }, + { + "epoch": 1.5473476980873082, + "grad_norm": 7.626844589496628, + "learning_rate": 6.418965648129341e-07, + "loss": 0.6218, + "step": 21418 + }, + { + "epoch": 1.547419943287518, + "grad_norm": 6.646486096826107, + "learning_rate": 6.417008865998106e-07, + "loss": 0.6325, + "step": 21419 + }, + { + "epoch": 1.5474921884877273, + "grad_norm": 7.135055941260174, + "learning_rate": 6.415052338253596e-07, + "loss": 0.6296, + "step": 21420 + }, + { + "epoch": 1.5475644336879368, + "grad_norm": 6.1874873613941315, + "learning_rate": 6.413096064922586e-07, + "loss": 0.5367, + "step": 21421 + }, + { + "epoch": 1.5476366788881464, + "grad_norm": 6.796342920984003, + "learning_rate": 6.411140046031858e-07, + "loss": 0.6619, + "step": 21422 + }, + { + "epoch": 1.547708924088356, + "grad_norm": 7.314597847783888, + "learning_rate": 6.409184281608189e-07, + "loss": 0.6259, + "step": 21423 + }, + { + "epoch": 1.5477811692885655, + "grad_norm": 7.089400113100053, + "learning_rate": 6.407228771678351e-07, + "loss": 0.5783, + "step": 21424 + }, + { + "epoch": 1.5478534144887748, + "grad_norm": 6.829660622695055, + "learning_rate": 6.405273516269115e-07, + "loss": 0.5945, + "step": 21425 + }, + { + "epoch": 1.5479256596889845, + "grad_norm": 8.071399597038802, + "learning_rate": 6.403318515407247e-07, + "loss": 0.577, + "step": 21426 + }, + { + "epoch": 1.5479979048891939, + "grad_norm": 7.0156119501549, + "learning_rate": 6.401363769119517e-07, + "loss": 0.6517, + "step": 21427 + }, + { + "epoch": 1.5480701500894034, + "grad_norm": 6.410107153891051, + "learning_rate": 6.39940927743267e-07, + "loss": 0.6288, + "step": 21428 + }, + { + "epoch": 1.548142395289613, + "grad_norm": 7.46181149781053, + "learning_rate": 6.397455040373465e-07, + "loss": 0.5831, + "step": 21429 + }, + { + "epoch": 1.5482146404898225, + "grad_norm": 6.860506629141619, + "learning_rate": 6.395501057968659e-07, + "loss": 0.6232, + "step": 21430 + }, + { + "epoch": 1.548286885690032, + "grad_norm": 6.87881835326794, + "learning_rate": 6.393547330245003e-07, + "loss": 0.6336, + "step": 21431 + }, + { + "epoch": 1.5483591308902414, + "grad_norm": 8.150376349810767, + "learning_rate": 6.39159385722923e-07, + "loss": 0.6996, + "step": 21432 + }, + { + "epoch": 1.5484313760904511, + "grad_norm": 6.63033727464058, + "learning_rate": 6.389640638948091e-07, + "loss": 0.6508, + "step": 21433 + }, + { + "epoch": 1.5485036212906604, + "grad_norm": 7.362742437182566, + "learning_rate": 6.38768767542832e-07, + "loss": 0.5785, + "step": 21434 + }, + { + "epoch": 1.54857586649087, + "grad_norm": 8.935486039751927, + "learning_rate": 6.385734966696652e-07, + "loss": 0.6269, + "step": 21435 + }, + { + "epoch": 1.5486481116910795, + "grad_norm": 6.817858067542209, + "learning_rate": 6.383782512779821e-07, + "loss": 0.5438, + "step": 21436 + }, + { + "epoch": 1.548720356891289, + "grad_norm": 6.816875487520182, + "learning_rate": 6.381830313704554e-07, + "loss": 0.6708, + "step": 21437 + }, + { + "epoch": 1.5487926020914986, + "grad_norm": 5.894086635194939, + "learning_rate": 6.379878369497577e-07, + "loss": 0.5743, + "step": 21438 + }, + { + "epoch": 1.548864847291708, + "grad_norm": 7.907243146074527, + "learning_rate": 6.377926680185606e-07, + "loss": 0.6328, + "step": 21439 + }, + { + "epoch": 1.5489370924919177, + "grad_norm": 7.2074784964546, + "learning_rate": 6.375975245795355e-07, + "loss": 0.6141, + "step": 21440 + }, + { + "epoch": 1.549009337692127, + "grad_norm": 6.260018520632515, + "learning_rate": 6.374024066353543e-07, + "loss": 0.6059, + "step": 21441 + }, + { + "epoch": 1.5490815828923366, + "grad_norm": 6.558813312938142, + "learning_rate": 6.37207314188689e-07, + "loss": 0.5831, + "step": 21442 + }, + { + "epoch": 1.549153828092546, + "grad_norm": 7.038352803516429, + "learning_rate": 6.37012247242208e-07, + "loss": 0.5764, + "step": 21443 + }, + { + "epoch": 1.5492260732927556, + "grad_norm": 7.658548313758288, + "learning_rate": 6.368172057985825e-07, + "loss": 0.6014, + "step": 21444 + }, + { + "epoch": 1.5492983184929652, + "grad_norm": 7.577222129285455, + "learning_rate": 6.366221898604838e-07, + "loss": 0.6083, + "step": 21445 + }, + { + "epoch": 1.5493705636931745, + "grad_norm": 6.679012152902781, + "learning_rate": 6.364271994305801e-07, + "loss": 0.6802, + "step": 21446 + }, + { + "epoch": 1.5494428088933843, + "grad_norm": 6.929346465413902, + "learning_rate": 6.36232234511541e-07, + "loss": 0.6003, + "step": 21447 + }, + { + "epoch": 1.5495150540935936, + "grad_norm": 6.848797891765814, + "learning_rate": 6.360372951060356e-07, + "loss": 0.5701, + "step": 21448 + }, + { + "epoch": 1.5495872992938031, + "grad_norm": 9.449736285692877, + "learning_rate": 6.35842381216733e-07, + "loss": 0.7646, + "step": 21449 + }, + { + "epoch": 1.5496595444940127, + "grad_norm": 7.9945750916478415, + "learning_rate": 6.356474928463e-07, + "loss": 0.572, + "step": 21450 + }, + { + "epoch": 1.5497317896942222, + "grad_norm": 6.7047586651067625, + "learning_rate": 6.354526299974051e-07, + "loss": 0.5908, + "step": 21451 + }, + { + "epoch": 1.5498040348944317, + "grad_norm": 7.074083669248017, + "learning_rate": 6.352577926727163e-07, + "loss": 0.6338, + "step": 21452 + }, + { + "epoch": 1.549876280094641, + "grad_norm": 7.607215026472196, + "learning_rate": 6.350629808749e-07, + "loss": 0.5927, + "step": 21453 + }, + { + "epoch": 1.5499485252948508, + "grad_norm": 6.455895962752557, + "learning_rate": 6.34868194606624e-07, + "loss": 0.662, + "step": 21454 + }, + { + "epoch": 1.5500207704950602, + "grad_norm": 6.801638562867963, + "learning_rate": 6.346734338705537e-07, + "loss": 0.6234, + "step": 21455 + }, + { + "epoch": 1.5500930156952697, + "grad_norm": 6.170737632543414, + "learning_rate": 6.344786986693568e-07, + "loss": 0.6399, + "step": 21456 + }, + { + "epoch": 1.5501652608954792, + "grad_norm": 7.940445481109769, + "learning_rate": 6.342839890056973e-07, + "loss": 0.6314, + "step": 21457 + }, + { + "epoch": 1.5502375060956888, + "grad_norm": 7.344691179715498, + "learning_rate": 6.340893048822413e-07, + "loss": 0.6105, + "step": 21458 + }, + { + "epoch": 1.5503097512958983, + "grad_norm": 7.00562360211019, + "learning_rate": 6.338946463016548e-07, + "loss": 0.6261, + "step": 21459 + }, + { + "epoch": 1.5503819964961076, + "grad_norm": 8.775128277224116, + "learning_rate": 6.337000132666008e-07, + "loss": 0.6405, + "step": 21460 + }, + { + "epoch": 1.5504542416963174, + "grad_norm": 7.498923669512304, + "learning_rate": 6.335054057797446e-07, + "loss": 0.5703, + "step": 21461 + }, + { + "epoch": 1.5505264868965267, + "grad_norm": 7.3912526160573755, + "learning_rate": 6.3331082384375e-07, + "loss": 0.6495, + "step": 21462 + }, + { + "epoch": 1.5505987320967365, + "grad_norm": 6.309740426457144, + "learning_rate": 6.331162674612809e-07, + "loss": 0.6106, + "step": 21463 + }, + { + "epoch": 1.5506709772969458, + "grad_norm": 6.176089159089706, + "learning_rate": 6.329217366350005e-07, + "loss": 0.5718, + "step": 21464 + }, + { + "epoch": 1.5507432224971553, + "grad_norm": 6.854115061773423, + "learning_rate": 6.32727231367572e-07, + "loss": 0.6198, + "step": 21465 + }, + { + "epoch": 1.550815467697365, + "grad_norm": 6.375218107194972, + "learning_rate": 6.325327516616583e-07, + "loss": 0.5869, + "step": 21466 + }, + { + "epoch": 1.5508877128975742, + "grad_norm": 6.280123633364378, + "learning_rate": 6.323382975199208e-07, + "loss": 0.5569, + "step": 21467 + }, + { + "epoch": 1.550959958097784, + "grad_norm": 6.601416591860198, + "learning_rate": 6.321438689450218e-07, + "loss": 0.5658, + "step": 21468 + }, + { + "epoch": 1.5510322032979933, + "grad_norm": 7.126155023482241, + "learning_rate": 6.319494659396231e-07, + "loss": 0.6698, + "step": 21469 + }, + { + "epoch": 1.551104448498203, + "grad_norm": 6.683996120449472, + "learning_rate": 6.317550885063861e-07, + "loss": 0.5853, + "step": 21470 + }, + { + "epoch": 1.5511766936984124, + "grad_norm": 8.37379059670825, + "learning_rate": 6.315607366479709e-07, + "loss": 0.5976, + "step": 21471 + }, + { + "epoch": 1.551248938898622, + "grad_norm": 6.37267036506574, + "learning_rate": 6.313664103670375e-07, + "loss": 0.6194, + "step": 21472 + }, + { + "epoch": 1.5513211840988315, + "grad_norm": 6.305012407404091, + "learning_rate": 6.311721096662479e-07, + "loss": 0.6519, + "step": 21473 + }, + { + "epoch": 1.5513934292990408, + "grad_norm": 6.57909627163677, + "learning_rate": 6.309778345482617e-07, + "loss": 0.6023, + "step": 21474 + }, + { + "epoch": 1.5514656744992505, + "grad_norm": 7.104195453962095, + "learning_rate": 6.307835850157374e-07, + "loss": 0.6226, + "step": 21475 + }, + { + "epoch": 1.5515379196994599, + "grad_norm": 7.922377711490133, + "learning_rate": 6.30589361071334e-07, + "loss": 0.636, + "step": 21476 + }, + { + "epoch": 1.5516101648996696, + "grad_norm": 5.61409074631297, + "learning_rate": 6.303951627177115e-07, + "loss": 0.6454, + "step": 21477 + }, + { + "epoch": 1.551682410099879, + "grad_norm": 6.591137987903601, + "learning_rate": 6.302009899575273e-07, + "loss": 0.5773, + "step": 21478 + }, + { + "epoch": 1.5517546553000885, + "grad_norm": 7.627686824504832, + "learning_rate": 6.300068427934394e-07, + "loss": 0.5662, + "step": 21479 + }, + { + "epoch": 1.551826900500298, + "grad_norm": 6.405552970368848, + "learning_rate": 6.298127212281057e-07, + "loss": 0.5318, + "step": 21480 + }, + { + "epoch": 1.5518991457005076, + "grad_norm": 7.535418737733061, + "learning_rate": 6.296186252641842e-07, + "loss": 0.6326, + "step": 21481 + }, + { + "epoch": 1.5519713909007171, + "grad_norm": 6.0217634316419435, + "learning_rate": 6.29424554904331e-07, + "loss": 0.5782, + "step": 21482 + }, + { + "epoch": 1.5520436361009264, + "grad_norm": 7.606725838665355, + "learning_rate": 6.292305101512036e-07, + "loss": 0.6432, + "step": 21483 + }, + { + "epoch": 1.5521158813011362, + "grad_norm": 7.767637794040995, + "learning_rate": 6.290364910074584e-07, + "loss": 0.6043, + "step": 21484 + }, + { + "epoch": 1.5521881265013455, + "grad_norm": 6.592026327025053, + "learning_rate": 6.288424974757504e-07, + "loss": 0.6449, + "step": 21485 + }, + { + "epoch": 1.552260371701555, + "grad_norm": 9.532122862824034, + "learning_rate": 6.286485295587358e-07, + "loss": 0.5939, + "step": 21486 + }, + { + "epoch": 1.5523326169017646, + "grad_norm": 7.110038173052633, + "learning_rate": 6.284545872590695e-07, + "loss": 0.5684, + "step": 21487 + }, + { + "epoch": 1.5524048621019741, + "grad_norm": 7.712908430119991, + "learning_rate": 6.282606705794078e-07, + "loss": 0.6243, + "step": 21488 + }, + { + "epoch": 1.5524771073021837, + "grad_norm": 7.396741984967942, + "learning_rate": 6.280667795224033e-07, + "loss": 0.6417, + "step": 21489 + }, + { + "epoch": 1.552549352502393, + "grad_norm": 7.340571588197685, + "learning_rate": 6.278729140907111e-07, + "loss": 0.6432, + "step": 21490 + }, + { + "epoch": 1.5526215977026028, + "grad_norm": 7.674018016707541, + "learning_rate": 6.276790742869851e-07, + "loss": 0.6279, + "step": 21491 + }, + { + "epoch": 1.552693842902812, + "grad_norm": 5.636389158979784, + "learning_rate": 6.27485260113879e-07, + "loss": 0.5196, + "step": 21492 + }, + { + "epoch": 1.5527660881030216, + "grad_norm": 8.028178655638866, + "learning_rate": 6.272914715740455e-07, + "loss": 0.6095, + "step": 21493 + }, + { + "epoch": 1.5528383333032312, + "grad_norm": 6.417292015352172, + "learning_rate": 6.27097708670138e-07, + "loss": 0.5919, + "step": 21494 + }, + { + "epoch": 1.5529105785034407, + "grad_norm": 6.702418434588701, + "learning_rate": 6.269039714048092e-07, + "loss": 0.6039, + "step": 21495 + }, + { + "epoch": 1.5529828237036503, + "grad_norm": 7.433451857142224, + "learning_rate": 6.267102597807101e-07, + "loss": 0.6029, + "step": 21496 + }, + { + "epoch": 1.5530550689038596, + "grad_norm": 6.852047423510944, + "learning_rate": 6.265165738004928e-07, + "loss": 0.6283, + "step": 21497 + }, + { + "epoch": 1.5531273141040693, + "grad_norm": 6.387832591137807, + "learning_rate": 6.263229134668089e-07, + "loss": 0.5362, + "step": 21498 + }, + { + "epoch": 1.5531995593042787, + "grad_norm": 6.468688964555803, + "learning_rate": 6.261292787823103e-07, + "loss": 0.5262, + "step": 21499 + }, + { + "epoch": 1.5532718045044882, + "grad_norm": 7.580632186333962, + "learning_rate": 6.259356697496458e-07, + "loss": 0.6223, + "step": 21500 + }, + { + "epoch": 1.5533440497046977, + "grad_norm": 8.080112353006426, + "learning_rate": 6.257420863714672e-07, + "loss": 0.6736, + "step": 21501 + }, + { + "epoch": 1.5534162949049073, + "grad_norm": 7.339552178715131, + "learning_rate": 6.255485286504248e-07, + "loss": 0.6876, + "step": 21502 + }, + { + "epoch": 1.5534885401051168, + "grad_norm": 7.341747514133083, + "learning_rate": 6.253549965891673e-07, + "loss": 0.5588, + "step": 21503 + }, + { + "epoch": 1.5535607853053262, + "grad_norm": 7.568408265971116, + "learning_rate": 6.251614901903441e-07, + "loss": 0.6061, + "step": 21504 + }, + { + "epoch": 1.553633030505536, + "grad_norm": 8.006412796878728, + "learning_rate": 6.249680094566044e-07, + "loss": 0.656, + "step": 21505 + }, + { + "epoch": 1.5537052757057452, + "grad_norm": 8.612203642199075, + "learning_rate": 6.247745543905975e-07, + "loss": 0.6186, + "step": 21506 + }, + { + "epoch": 1.5537775209059548, + "grad_norm": 6.75974925958724, + "learning_rate": 6.245811249949702e-07, + "loss": 0.6091, + "step": 21507 + }, + { + "epoch": 1.5538497661061643, + "grad_norm": 6.714264263826096, + "learning_rate": 6.243877212723712e-07, + "loss": 0.5868, + "step": 21508 + }, + { + "epoch": 1.5539220113063739, + "grad_norm": 7.398877816556774, + "learning_rate": 6.24194343225448e-07, + "loss": 0.6547, + "step": 21509 + }, + { + "epoch": 1.5539942565065834, + "grad_norm": 7.427563609412914, + "learning_rate": 6.240009908568476e-07, + "loss": 0.6746, + "step": 21510 + }, + { + "epoch": 1.5540665017067927, + "grad_norm": 7.220170578988925, + "learning_rate": 6.23807664169217e-07, + "loss": 0.6685, + "step": 21511 + }, + { + "epoch": 1.5541387469070025, + "grad_norm": 7.696532362863009, + "learning_rate": 6.236143631652028e-07, + "loss": 0.5883, + "step": 21512 + }, + { + "epoch": 1.5542109921072118, + "grad_norm": 7.113433015095729, + "learning_rate": 6.23421087847452e-07, + "loss": 0.6146, + "step": 21513 + }, + { + "epoch": 1.5542832373074214, + "grad_norm": 5.92986441085733, + "learning_rate": 6.232278382186083e-07, + "loss": 0.6587, + "step": 21514 + }, + { + "epoch": 1.554355482507631, + "grad_norm": 7.353048468311885, + "learning_rate": 6.230346142813185e-07, + "loss": 0.6298, + "step": 21515 + }, + { + "epoch": 1.5544277277078404, + "grad_norm": 6.194014077662975, + "learning_rate": 6.228414160382282e-07, + "loss": 0.5857, + "step": 21516 + }, + { + "epoch": 1.55449997290805, + "grad_norm": 7.245935287182404, + "learning_rate": 6.226482434919806e-07, + "loss": 0.638, + "step": 21517 + }, + { + "epoch": 1.5545722181082593, + "grad_norm": 6.927490987231512, + "learning_rate": 6.224550966452208e-07, + "loss": 0.6497, + "step": 21518 + }, + { + "epoch": 1.554644463308469, + "grad_norm": 6.732976002543754, + "learning_rate": 6.222619755005932e-07, + "loss": 0.5618, + "step": 21519 + }, + { + "epoch": 1.5547167085086784, + "grad_norm": 6.127409791815457, + "learning_rate": 6.220688800607411e-07, + "loss": 0.6414, + "step": 21520 + }, + { + "epoch": 1.554788953708888, + "grad_norm": 8.23907487138745, + "learning_rate": 6.218758103283076e-07, + "loss": 0.6154, + "step": 21521 + }, + { + "epoch": 1.5548611989090975, + "grad_norm": 8.357780860702638, + "learning_rate": 6.216827663059363e-07, + "loss": 0.5892, + "step": 21522 + }, + { + "epoch": 1.554933444109307, + "grad_norm": 5.9010212677089315, + "learning_rate": 6.214897479962695e-07, + "loss": 0.5661, + "step": 21523 + }, + { + "epoch": 1.5550056893095165, + "grad_norm": 7.481463859643666, + "learning_rate": 6.2129675540195e-07, + "loss": 0.6139, + "step": 21524 + }, + { + "epoch": 1.5550779345097259, + "grad_norm": 7.9903394066978475, + "learning_rate": 6.211037885256188e-07, + "loss": 0.5253, + "step": 21525 + }, + { + "epoch": 1.5551501797099356, + "grad_norm": 8.229163009707126, + "learning_rate": 6.209108473699177e-07, + "loss": 0.5818, + "step": 21526 + }, + { + "epoch": 1.555222424910145, + "grad_norm": 6.534604028147957, + "learning_rate": 6.207179319374887e-07, + "loss": 0.5681, + "step": 21527 + }, + { + "epoch": 1.5552946701103545, + "grad_norm": 8.595307476054021, + "learning_rate": 6.205250422309714e-07, + "loss": 0.6316, + "step": 21528 + }, + { + "epoch": 1.555366915310564, + "grad_norm": 6.676571481717865, + "learning_rate": 6.203321782530062e-07, + "loss": 0.5818, + "step": 21529 + }, + { + "epoch": 1.5554391605107736, + "grad_norm": 6.2199810094022165, + "learning_rate": 6.201393400062347e-07, + "loss": 0.619, + "step": 21530 + }, + { + "epoch": 1.5555114057109831, + "grad_norm": 6.266677882137137, + "learning_rate": 6.199465274932967e-07, + "loss": 0.624, + "step": 21531 + }, + { + "epoch": 1.5555836509111924, + "grad_norm": 6.941847255352886, + "learning_rate": 6.197537407168305e-07, + "loss": 0.607, + "step": 21532 + }, + { + "epoch": 1.5556558961114022, + "grad_norm": 7.839081244838811, + "learning_rate": 6.195609796794755e-07, + "loss": 0.5878, + "step": 21533 + }, + { + "epoch": 1.5557281413116115, + "grad_norm": 6.808812254454241, + "learning_rate": 6.193682443838714e-07, + "loss": 0.5529, + "step": 21534 + }, + { + "epoch": 1.5558003865118213, + "grad_norm": 6.606696702187657, + "learning_rate": 6.19175534832655e-07, + "loss": 0.5959, + "step": 21535 + }, + { + "epoch": 1.5558726317120306, + "grad_norm": 5.95762900433314, + "learning_rate": 6.189828510284651e-07, + "loss": 0.5636, + "step": 21536 + }, + { + "epoch": 1.5559448769122401, + "grad_norm": 8.115651077974803, + "learning_rate": 6.187901929739396e-07, + "loss": 0.636, + "step": 21537 + }, + { + "epoch": 1.5560171221124497, + "grad_norm": 7.065176397606565, + "learning_rate": 6.185975606717157e-07, + "loss": 0.6028, + "step": 21538 + }, + { + "epoch": 1.556089367312659, + "grad_norm": 6.409581950774278, + "learning_rate": 6.184049541244306e-07, + "loss": 0.5506, + "step": 21539 + }, + { + "epoch": 1.5561616125128688, + "grad_norm": 7.477126527755201, + "learning_rate": 6.182123733347204e-07, + "loss": 0.6371, + "step": 21540 + }, + { + "epoch": 1.556233857713078, + "grad_norm": 7.952974629057951, + "learning_rate": 6.180198183052224e-07, + "loss": 0.6322, + "step": 21541 + }, + { + "epoch": 1.5563061029132879, + "grad_norm": 8.275602437473964, + "learning_rate": 6.178272890385712e-07, + "loss": 0.6053, + "step": 21542 + }, + { + "epoch": 1.5563783481134972, + "grad_norm": 6.667178356878691, + "learning_rate": 6.176347855374029e-07, + "loss": 0.6303, + "step": 21543 + }, + { + "epoch": 1.5564505933137067, + "grad_norm": 7.208470543527545, + "learning_rate": 6.174423078043529e-07, + "loss": 0.6172, + "step": 21544 + }, + { + "epoch": 1.5565228385139163, + "grad_norm": 6.17929365250959, + "learning_rate": 6.172498558420567e-07, + "loss": 0.5577, + "step": 21545 + }, + { + "epoch": 1.5565950837141256, + "grad_norm": 7.638780492148374, + "learning_rate": 6.170574296531476e-07, + "loss": 0.6571, + "step": 21546 + }, + { + "epoch": 1.5566673289143353, + "grad_norm": 8.058536945388742, + "learning_rate": 6.1686502924026e-07, + "loss": 0.6069, + "step": 21547 + }, + { + "epoch": 1.5567395741145447, + "grad_norm": 6.079976170141803, + "learning_rate": 6.166726546060281e-07, + "loss": 0.6489, + "step": 21548 + }, + { + "epoch": 1.5568118193147544, + "grad_norm": 6.965354742366803, + "learning_rate": 6.164803057530852e-07, + "loss": 0.5428, + "step": 21549 + }, + { + "epoch": 1.5568840645149638, + "grad_norm": 6.348202500534539, + "learning_rate": 6.162879826840645e-07, + "loss": 0.6101, + "step": 21550 + }, + { + "epoch": 1.5569563097151733, + "grad_norm": 6.7413461100501495, + "learning_rate": 6.160956854015987e-07, + "loss": 0.6085, + "step": 21551 + }, + { + "epoch": 1.5570285549153828, + "grad_norm": 6.4134269892039475, + "learning_rate": 6.159034139083209e-07, + "loss": 0.6277, + "step": 21552 + }, + { + "epoch": 1.5571008001155922, + "grad_norm": 7.243660095025555, + "learning_rate": 6.157111682068617e-07, + "loss": 0.5856, + "step": 21553 + }, + { + "epoch": 1.557173045315802, + "grad_norm": 8.597059909464052, + "learning_rate": 6.155189482998536e-07, + "loss": 0.6091, + "step": 21554 + }, + { + "epoch": 1.5572452905160112, + "grad_norm": 7.407718561136476, + "learning_rate": 6.15326754189928e-07, + "loss": 0.5542, + "step": 21555 + }, + { + "epoch": 1.557317535716221, + "grad_norm": 7.640911822405962, + "learning_rate": 6.151345858797164e-07, + "loss": 0.5988, + "step": 21556 + }, + { + "epoch": 1.5573897809164303, + "grad_norm": 8.526457505910509, + "learning_rate": 6.149424433718473e-07, + "loss": 0.6029, + "step": 21557 + }, + { + "epoch": 1.5574620261166399, + "grad_norm": 8.514429298938426, + "learning_rate": 6.147503266689533e-07, + "loss": 0.574, + "step": 21558 + }, + { + "epoch": 1.5575342713168494, + "grad_norm": 8.305370973756606, + "learning_rate": 6.145582357736644e-07, + "loss": 0.5805, + "step": 21559 + }, + { + "epoch": 1.557606516517059, + "grad_norm": 7.419130866052788, + "learning_rate": 6.143661706886083e-07, + "loss": 0.6852, + "step": 21560 + }, + { + "epoch": 1.5576787617172685, + "grad_norm": 7.292480190853066, + "learning_rate": 6.141741314164154e-07, + "loss": 0.5706, + "step": 21561 + }, + { + "epoch": 1.5577510069174778, + "grad_norm": 6.324332450047254, + "learning_rate": 6.139821179597144e-07, + "loss": 0.6, + "step": 21562 + }, + { + "epoch": 1.5578232521176876, + "grad_norm": 7.335312951815636, + "learning_rate": 6.137901303211344e-07, + "loss": 0.5895, + "step": 21563 + }, + { + "epoch": 1.557895497317897, + "grad_norm": 7.770124578667808, + "learning_rate": 6.135981685033024e-07, + "loss": 0.5991, + "step": 21564 + }, + { + "epoch": 1.5579677425181064, + "grad_norm": 6.371946314047347, + "learning_rate": 6.134062325088467e-07, + "loss": 0.6228, + "step": 21565 + }, + { + "epoch": 1.558039987718316, + "grad_norm": 6.701658288507254, + "learning_rate": 6.132143223403947e-07, + "loss": 0.5606, + "step": 21566 + }, + { + "epoch": 1.5581122329185255, + "grad_norm": 6.308955339869569, + "learning_rate": 6.130224380005736e-07, + "loss": 0.542, + "step": 21567 + }, + { + "epoch": 1.558184478118735, + "grad_norm": 6.908713889444343, + "learning_rate": 6.128305794920106e-07, + "loss": 0.6386, + "step": 21568 + }, + { + "epoch": 1.5582567233189444, + "grad_norm": 7.725844115523124, + "learning_rate": 6.126387468173314e-07, + "loss": 0.6462, + "step": 21569 + }, + { + "epoch": 1.5583289685191541, + "grad_norm": 6.955655098400178, + "learning_rate": 6.12446939979163e-07, + "loss": 0.6176, + "step": 21570 + }, + { + "epoch": 1.5584012137193635, + "grad_norm": 7.571858566083719, + "learning_rate": 6.122551589801296e-07, + "loss": 0.6043, + "step": 21571 + }, + { + "epoch": 1.558473458919573, + "grad_norm": 6.625714533438132, + "learning_rate": 6.120634038228576e-07, + "loss": 0.6462, + "step": 21572 + }, + { + "epoch": 1.5585457041197825, + "grad_norm": 9.377309285896429, + "learning_rate": 6.118716745099715e-07, + "loss": 0.6481, + "step": 21573 + }, + { + "epoch": 1.558617949319992, + "grad_norm": 7.5730804327704675, + "learning_rate": 6.116799710440968e-07, + "loss": 0.5968, + "step": 21574 + }, + { + "epoch": 1.5586901945202016, + "grad_norm": 6.8622418019790095, + "learning_rate": 6.114882934278569e-07, + "loss": 0.6151, + "step": 21575 + }, + { + "epoch": 1.558762439720411, + "grad_norm": 7.390885652941958, + "learning_rate": 6.112966416638752e-07, + "loss": 0.5871, + "step": 21576 + }, + { + "epoch": 1.5588346849206207, + "grad_norm": 8.049728805996986, + "learning_rate": 6.111050157547765e-07, + "loss": 0.589, + "step": 21577 + }, + { + "epoch": 1.55890693012083, + "grad_norm": 6.341819431839311, + "learning_rate": 6.109134157031835e-07, + "loss": 0.5354, + "step": 21578 + }, + { + "epoch": 1.5589791753210396, + "grad_norm": 6.9984649609969996, + "learning_rate": 6.107218415117188e-07, + "loss": 0.617, + "step": 21579 + }, + { + "epoch": 1.5590514205212491, + "grad_norm": 7.144727200531839, + "learning_rate": 6.105302931830051e-07, + "loss": 0.6184, + "step": 21580 + }, + { + "epoch": 1.5591236657214587, + "grad_norm": 7.636493213181735, + "learning_rate": 6.103387707196656e-07, + "loss": 0.5764, + "step": 21581 + }, + { + "epoch": 1.5591959109216682, + "grad_norm": 6.57254625375984, + "learning_rate": 6.101472741243202e-07, + "loss": 0.6714, + "step": 21582 + }, + { + "epoch": 1.5592681561218775, + "grad_norm": 6.15210751034535, + "learning_rate": 6.099558033995915e-07, + "loss": 0.5827, + "step": 21583 + }, + { + "epoch": 1.5593404013220873, + "grad_norm": 6.322311179651295, + "learning_rate": 6.097643585481009e-07, + "loss": 0.5833, + "step": 21584 + }, + { + "epoch": 1.5594126465222966, + "grad_norm": 6.866501427794955, + "learning_rate": 6.095729395724669e-07, + "loss": 0.6246, + "step": 21585 + }, + { + "epoch": 1.5594848917225062, + "grad_norm": 6.4230596558818895, + "learning_rate": 6.093815464753125e-07, + "loss": 0.5216, + "step": 21586 + }, + { + "epoch": 1.5595571369227157, + "grad_norm": 7.156179635980491, + "learning_rate": 6.091901792592566e-07, + "loss": 0.6004, + "step": 21587 + }, + { + "epoch": 1.5596293821229252, + "grad_norm": 7.262618035360187, + "learning_rate": 6.089988379269198e-07, + "loss": 0.6927, + "step": 21588 + }, + { + "epoch": 1.5597016273231348, + "grad_norm": 6.155291332598333, + "learning_rate": 6.088075224809201e-07, + "loss": 0.6437, + "step": 21589 + }, + { + "epoch": 1.559773872523344, + "grad_norm": 8.141711142453314, + "learning_rate": 6.086162329238768e-07, + "loss": 0.6192, + "step": 21590 + }, + { + "epoch": 1.5598461177235539, + "grad_norm": 7.673618840999556, + "learning_rate": 6.084249692584097e-07, + "loss": 0.5922, + "step": 21591 + }, + { + "epoch": 1.5599183629237632, + "grad_norm": 6.70821359973779, + "learning_rate": 6.08233731487135e-07, + "loss": 0.5684, + "step": 21592 + }, + { + "epoch": 1.5599906081239727, + "grad_norm": 6.797973824612535, + "learning_rate": 6.080425196126719e-07, + "loss": 0.5695, + "step": 21593 + }, + { + "epoch": 1.5600628533241823, + "grad_norm": 6.711854030563231, + "learning_rate": 6.078513336376374e-07, + "loss": 0.58, + "step": 21594 + }, + { + "epoch": 1.5601350985243918, + "grad_norm": 7.0745700557269675, + "learning_rate": 6.076601735646493e-07, + "loss": 0.6612, + "step": 21595 + }, + { + "epoch": 1.5602073437246013, + "grad_norm": 6.739413118251974, + "learning_rate": 6.07469039396324e-07, + "loss": 0.5906, + "step": 21596 + }, + { + "epoch": 1.5602795889248107, + "grad_norm": 6.282715280155858, + "learning_rate": 6.07277931135278e-07, + "loss": 0.6103, + "step": 21597 + }, + { + "epoch": 1.5603518341250204, + "grad_norm": 7.203822964760946, + "learning_rate": 6.070868487841283e-07, + "loss": 0.5879, + "step": 21598 + }, + { + "epoch": 1.5604240793252298, + "grad_norm": 7.152958408613972, + "learning_rate": 6.06895792345489e-07, + "loss": 0.534, + "step": 21599 + }, + { + "epoch": 1.5604963245254393, + "grad_norm": 5.821164799698093, + "learning_rate": 6.067047618219768e-07, + "loss": 0.5713, + "step": 21600 + }, + { + "epoch": 1.5605685697256488, + "grad_norm": 7.268780554250459, + "learning_rate": 6.065137572162061e-07, + "loss": 0.6461, + "step": 21601 + }, + { + "epoch": 1.5606408149258584, + "grad_norm": 7.149730127750152, + "learning_rate": 6.063227785307926e-07, + "loss": 0.6455, + "step": 21602 + }, + { + "epoch": 1.560713060126068, + "grad_norm": 9.19969644460357, + "learning_rate": 6.061318257683493e-07, + "loss": 0.5901, + "step": 21603 + }, + { + "epoch": 1.5607853053262772, + "grad_norm": 6.788881850390714, + "learning_rate": 6.059408989314907e-07, + "loss": 0.5982, + "step": 21604 + }, + { + "epoch": 1.560857550526487, + "grad_norm": 5.709275365167851, + "learning_rate": 6.057499980228298e-07, + "loss": 0.5272, + "step": 21605 + }, + { + "epoch": 1.5609297957266963, + "grad_norm": 6.693424345869381, + "learning_rate": 6.055591230449822e-07, + "loss": 0.6338, + "step": 21606 + }, + { + "epoch": 1.5610020409269059, + "grad_norm": 6.150279291525672, + "learning_rate": 6.053682740005585e-07, + "loss": 0.652, + "step": 21607 + }, + { + "epoch": 1.5610742861271154, + "grad_norm": 11.611252398039436, + "learning_rate": 6.051774508921721e-07, + "loss": 0.6384, + "step": 21608 + }, + { + "epoch": 1.561146531327325, + "grad_norm": 6.823258051206155, + "learning_rate": 6.049866537224358e-07, + "loss": 0.5576, + "step": 21609 + }, + { + "epoch": 1.5612187765275345, + "grad_norm": 7.370429481611517, + "learning_rate": 6.047958824939601e-07, + "loss": 0.6447, + "step": 21610 + }, + { + "epoch": 1.5612910217277438, + "grad_norm": 7.392573480843984, + "learning_rate": 6.046051372093572e-07, + "loss": 0.577, + "step": 21611 + }, + { + "epoch": 1.5613632669279536, + "grad_norm": 7.0607246260474605, + "learning_rate": 6.044144178712386e-07, + "loss": 0.5668, + "step": 21612 + }, + { + "epoch": 1.561435512128163, + "grad_norm": 7.287941669670359, + "learning_rate": 6.042237244822144e-07, + "loss": 0.613, + "step": 21613 + }, + { + "epoch": 1.5615077573283727, + "grad_norm": 7.825488470754283, + "learning_rate": 6.040330570448957e-07, + "loss": 0.6082, + "step": 21614 + }, + { + "epoch": 1.561580002528582, + "grad_norm": 6.533804508008864, + "learning_rate": 6.038424155618922e-07, + "loss": 0.5563, + "step": 21615 + }, + { + "epoch": 1.5616522477287915, + "grad_norm": 6.673124459904299, + "learning_rate": 6.036518000358144e-07, + "loss": 0.5835, + "step": 21616 + }, + { + "epoch": 1.561724492929001, + "grad_norm": 6.624837909371025, + "learning_rate": 6.034612104692706e-07, + "loss": 0.5995, + "step": 21617 + }, + { + "epoch": 1.5617967381292104, + "grad_norm": 5.951306162847389, + "learning_rate": 6.0327064686487e-07, + "loss": 0.5423, + "step": 21618 + }, + { + "epoch": 1.5618689833294201, + "grad_norm": 6.6862911664829765, + "learning_rate": 6.030801092252215e-07, + "loss": 0.6276, + "step": 21619 + }, + { + "epoch": 1.5619412285296295, + "grad_norm": 8.994730784451841, + "learning_rate": 6.028895975529341e-07, + "loss": 0.6741, + "step": 21620 + }, + { + "epoch": 1.5620134737298392, + "grad_norm": 6.4935558359822805, + "learning_rate": 6.026991118506148e-07, + "loss": 0.5598, + "step": 21621 + }, + { + "epoch": 1.5620857189300486, + "grad_norm": 6.997890699572555, + "learning_rate": 6.025086521208712e-07, + "loss": 0.6281, + "step": 21622 + }, + { + "epoch": 1.562157964130258, + "grad_norm": 8.69004817339118, + "learning_rate": 6.023182183663109e-07, + "loss": 0.5725, + "step": 21623 + }, + { + "epoch": 1.5622302093304676, + "grad_norm": 5.281485106522589, + "learning_rate": 6.021278105895407e-07, + "loss": 0.5159, + "step": 21624 + }, + { + "epoch": 1.562302454530677, + "grad_norm": 7.4501130677930245, + "learning_rate": 6.019374287931673e-07, + "loss": 0.6122, + "step": 21625 + }, + { + "epoch": 1.5623746997308867, + "grad_norm": 7.24125301582936, + "learning_rate": 6.017470729797966e-07, + "loss": 0.6775, + "step": 21626 + }, + { + "epoch": 1.562446944931096, + "grad_norm": 6.362564824042009, + "learning_rate": 6.015567431520353e-07, + "loss": 0.6131, + "step": 21627 + }, + { + "epoch": 1.5625191901313058, + "grad_norm": 7.55186534347744, + "learning_rate": 6.013664393124874e-07, + "loss": 0.5986, + "step": 21628 + }, + { + "epoch": 1.5625914353315151, + "grad_norm": 7.284416852558339, + "learning_rate": 6.01176161463759e-07, + "loss": 0.5713, + "step": 21629 + }, + { + "epoch": 1.5626636805317247, + "grad_norm": 7.287638338061702, + "learning_rate": 6.009859096084544e-07, + "loss": 0.6913, + "step": 21630 + }, + { + "epoch": 1.5627359257319342, + "grad_norm": 7.641656412043567, + "learning_rate": 6.007956837491788e-07, + "loss": 0.6148, + "step": 21631 + }, + { + "epoch": 1.5628081709321437, + "grad_norm": 8.238978103677123, + "learning_rate": 6.00605483888535e-07, + "loss": 0.6671, + "step": 21632 + }, + { + "epoch": 1.5628804161323533, + "grad_norm": 7.424332446777517, + "learning_rate": 6.004153100291277e-07, + "loss": 0.6336, + "step": 21633 + }, + { + "epoch": 1.5629526613325626, + "grad_norm": 7.351405532247595, + "learning_rate": 6.002251621735594e-07, + "loss": 0.6021, + "step": 21634 + }, + { + "epoch": 1.5630249065327724, + "grad_norm": 6.7520135772532415, + "learning_rate": 6.000350403244338e-07, + "loss": 0.6211, + "step": 21635 + }, + { + "epoch": 1.5630971517329817, + "grad_norm": 6.377841690103591, + "learning_rate": 5.998449444843532e-07, + "loss": 0.5521, + "step": 21636 + }, + { + "epoch": 1.5631693969331912, + "grad_norm": 7.019896160781461, + "learning_rate": 5.9965487465592e-07, + "loss": 0.6859, + "step": 21637 + }, + { + "epoch": 1.5632416421334008, + "grad_norm": 7.459677321177102, + "learning_rate": 5.994648308417367e-07, + "loss": 0.6053, + "step": 21638 + }, + { + "epoch": 1.5633138873336103, + "grad_norm": 6.4675035243251475, + "learning_rate": 5.992748130444034e-07, + "loss": 0.5986, + "step": 21639 + }, + { + "epoch": 1.5633861325338199, + "grad_norm": 6.505617208792259, + "learning_rate": 5.990848212665223e-07, + "loss": 0.5692, + "step": 21640 + }, + { + "epoch": 1.5634583777340292, + "grad_norm": 6.120669020941748, + "learning_rate": 5.988948555106947e-07, + "loss": 0.5814, + "step": 21641 + }, + { + "epoch": 1.563530622934239, + "grad_norm": 6.898615557229241, + "learning_rate": 5.987049157795188e-07, + "loss": 0.6668, + "step": 21642 + }, + { + "epoch": 1.5636028681344483, + "grad_norm": 7.490948874005148, + "learning_rate": 5.985150020755972e-07, + "loss": 0.6275, + "step": 21643 + }, + { + "epoch": 1.5636751133346578, + "grad_norm": 6.758201420347597, + "learning_rate": 5.983251144015287e-07, + "loss": 0.576, + "step": 21644 + }, + { + "epoch": 1.5637473585348673, + "grad_norm": 6.868954028777809, + "learning_rate": 5.981352527599138e-07, + "loss": 0.6249, + "step": 21645 + }, + { + "epoch": 1.563819603735077, + "grad_norm": 8.001647779520166, + "learning_rate": 5.979454171533497e-07, + "loss": 0.5427, + "step": 21646 + }, + { + "epoch": 1.5638918489352864, + "grad_norm": 7.1206761675596475, + "learning_rate": 5.977556075844359e-07, + "loss": 0.5526, + "step": 21647 + }, + { + "epoch": 1.5639640941354958, + "grad_norm": 6.147212331773183, + "learning_rate": 5.975658240557716e-07, + "loss": 0.6833, + "step": 21648 + }, + { + "epoch": 1.5640363393357055, + "grad_norm": 6.101159545252924, + "learning_rate": 5.973760665699535e-07, + "loss": 0.5685, + "step": 21649 + }, + { + "epoch": 1.5641085845359148, + "grad_norm": 6.9142641544568875, + "learning_rate": 5.971863351295795e-07, + "loss": 0.6529, + "step": 21650 + }, + { + "epoch": 1.5641808297361244, + "grad_norm": 7.454006310904748, + "learning_rate": 5.969966297372473e-07, + "loss": 0.6932, + "step": 21651 + }, + { + "epoch": 1.564253074936334, + "grad_norm": 7.570227088092087, + "learning_rate": 5.968069503955534e-07, + "loss": 0.6146, + "step": 21652 + }, + { + "epoch": 1.5643253201365435, + "grad_norm": 8.369032370580497, + "learning_rate": 5.966172971070949e-07, + "loss": 0.6426, + "step": 21653 + }, + { + "epoch": 1.564397565336753, + "grad_norm": 8.327201061992888, + "learning_rate": 5.964276698744675e-07, + "loss": 0.6175, + "step": 21654 + }, + { + "epoch": 1.5644698105369623, + "grad_norm": 7.471326567123634, + "learning_rate": 5.962380687002675e-07, + "loss": 0.6268, + "step": 21655 + }, + { + "epoch": 1.564542055737172, + "grad_norm": 6.181316021295704, + "learning_rate": 5.960484935870909e-07, + "loss": 0.6106, + "step": 21656 + }, + { + "epoch": 1.5646143009373814, + "grad_norm": 6.258687809413657, + "learning_rate": 5.958589445375313e-07, + "loss": 0.6015, + "step": 21657 + }, + { + "epoch": 1.564686546137591, + "grad_norm": 7.422122477370729, + "learning_rate": 5.956694215541841e-07, + "loss": 0.5959, + "step": 21658 + }, + { + "epoch": 1.5647587913378005, + "grad_norm": 5.730112929371428, + "learning_rate": 5.954799246396447e-07, + "loss": 0.5635, + "step": 21659 + }, + { + "epoch": 1.56483103653801, + "grad_norm": 6.970075951576995, + "learning_rate": 5.952904537965057e-07, + "loss": 0.5834, + "step": 21660 + }, + { + "epoch": 1.5649032817382196, + "grad_norm": 6.505620433825194, + "learning_rate": 5.951010090273615e-07, + "loss": 0.6455, + "step": 21661 + }, + { + "epoch": 1.564975526938429, + "grad_norm": 7.9285126515766455, + "learning_rate": 5.949115903348049e-07, + "loss": 0.5925, + "step": 21662 + }, + { + "epoch": 1.5650477721386387, + "grad_norm": 7.373422744972258, + "learning_rate": 5.947221977214304e-07, + "loss": 0.6265, + "step": 21663 + }, + { + "epoch": 1.565120017338848, + "grad_norm": 7.684905816259895, + "learning_rate": 5.945328311898294e-07, + "loss": 0.6344, + "step": 21664 + }, + { + "epoch": 1.5651922625390575, + "grad_norm": 7.753688795754331, + "learning_rate": 5.943434907425941e-07, + "loss": 0.6807, + "step": 21665 + }, + { + "epoch": 1.565264507739267, + "grad_norm": 8.453964783984356, + "learning_rate": 5.941541763823177e-07, + "loss": 0.6194, + "step": 21666 + }, + { + "epoch": 1.5653367529394766, + "grad_norm": 7.281227210013288, + "learning_rate": 5.939648881115901e-07, + "loss": 0.5678, + "step": 21667 + }, + { + "epoch": 1.5654089981396861, + "grad_norm": 6.97217150801021, + "learning_rate": 5.93775625933003e-07, + "loss": 0.5642, + "step": 21668 + }, + { + "epoch": 1.5654812433398955, + "grad_norm": 6.731444958313679, + "learning_rate": 5.935863898491479e-07, + "loss": 0.5619, + "step": 21669 + }, + { + "epoch": 1.5655534885401052, + "grad_norm": 6.6092664921648545, + "learning_rate": 5.933971798626145e-07, + "loss": 0.609, + "step": 21670 + }, + { + "epoch": 1.5656257337403146, + "grad_norm": 6.222073534294022, + "learning_rate": 5.932079959759935e-07, + "loss": 0.6033, + "step": 21671 + }, + { + "epoch": 1.565697978940524, + "grad_norm": 6.632193061291725, + "learning_rate": 5.930188381918745e-07, + "loss": 0.5764, + "step": 21672 + }, + { + "epoch": 1.5657702241407336, + "grad_norm": 7.506110690155794, + "learning_rate": 5.928297065128477e-07, + "loss": 0.6323, + "step": 21673 + }, + { + "epoch": 1.5658424693409432, + "grad_norm": 7.961094668468181, + "learning_rate": 5.926406009415006e-07, + "loss": 0.6362, + "step": 21674 + }, + { + "epoch": 1.5659147145411527, + "grad_norm": 6.332883350381643, + "learning_rate": 5.924515214804227e-07, + "loss": 0.557, + "step": 21675 + }, + { + "epoch": 1.565986959741362, + "grad_norm": 7.021637580908527, + "learning_rate": 5.922624681322023e-07, + "loss": 0.5667, + "step": 21676 + }, + { + "epoch": 1.5660592049415718, + "grad_norm": 7.138777679619474, + "learning_rate": 5.920734408994283e-07, + "loss": 0.6438, + "step": 21677 + }, + { + "epoch": 1.5661314501417811, + "grad_norm": 7.156045302749278, + "learning_rate": 5.918844397846868e-07, + "loss": 0.5689, + "step": 21678 + }, + { + "epoch": 1.5662036953419907, + "grad_norm": 5.734351716786859, + "learning_rate": 5.916954647905656e-07, + "loss": 0.6151, + "step": 21679 + }, + { + "epoch": 1.5662759405422002, + "grad_norm": 7.535030698150494, + "learning_rate": 5.915065159196517e-07, + "loss": 0.6074, + "step": 21680 + }, + { + "epoch": 1.5663481857424097, + "grad_norm": 6.009360958406307, + "learning_rate": 5.913175931745321e-07, + "loss": 0.5943, + "step": 21681 + }, + { + "epoch": 1.5664204309426193, + "grad_norm": 7.303137891282215, + "learning_rate": 5.911286965577922e-07, + "loss": 0.6611, + "step": 21682 + }, + { + "epoch": 1.5664926761428286, + "grad_norm": 5.95679622999508, + "learning_rate": 5.909398260720186e-07, + "loss": 0.5857, + "step": 21683 + }, + { + "epoch": 1.5665649213430384, + "grad_norm": 8.558100328799199, + "learning_rate": 5.907509817197974e-07, + "loss": 0.6514, + "step": 21684 + }, + { + "epoch": 1.5666371665432477, + "grad_norm": 6.3748575830446255, + "learning_rate": 5.905621635037117e-07, + "loss": 0.586, + "step": 21685 + }, + { + "epoch": 1.5667094117434575, + "grad_norm": 6.428082438445327, + "learning_rate": 5.903733714263476e-07, + "loss": 0.5519, + "step": 21686 + }, + { + "epoch": 1.5667816569436668, + "grad_norm": 5.89549511410033, + "learning_rate": 5.901846054902893e-07, + "loss": 0.5391, + "step": 21687 + }, + { + "epoch": 1.5668539021438763, + "grad_norm": 7.448377588766159, + "learning_rate": 5.899958656981217e-07, + "loss": 0.6689, + "step": 21688 + }, + { + "epoch": 1.5669261473440859, + "grad_norm": 6.951286295683036, + "learning_rate": 5.898071520524267e-07, + "loss": 0.5333, + "step": 21689 + }, + { + "epoch": 1.5669983925442952, + "grad_norm": 7.062152145056199, + "learning_rate": 5.896184645557882e-07, + "loss": 0.6227, + "step": 21690 + }, + { + "epoch": 1.567070637744505, + "grad_norm": 7.632522374459891, + "learning_rate": 5.89429803210791e-07, + "loss": 0.6274, + "step": 21691 + }, + { + "epoch": 1.5671428829447143, + "grad_norm": 6.589939553152895, + "learning_rate": 5.892411680200152e-07, + "loss": 0.5997, + "step": 21692 + }, + { + "epoch": 1.567215128144924, + "grad_norm": 6.097503288740497, + "learning_rate": 5.890525589860447e-07, + "loss": 0.5539, + "step": 21693 + }, + { + "epoch": 1.5672873733451333, + "grad_norm": 7.167337282726775, + "learning_rate": 5.888639761114606e-07, + "loss": 0.6186, + "step": 21694 + }, + { + "epoch": 1.567359618545343, + "grad_norm": 7.45362938667704, + "learning_rate": 5.886754193988456e-07, + "loss": 0.5704, + "step": 21695 + }, + { + "epoch": 1.5674318637455524, + "grad_norm": 6.210542870226396, + "learning_rate": 5.884868888507794e-07, + "loss": 0.5401, + "step": 21696 + }, + { + "epoch": 1.5675041089457618, + "grad_norm": 6.620886047166236, + "learning_rate": 5.882983844698434e-07, + "loss": 0.6452, + "step": 21697 + }, + { + "epoch": 1.5675763541459715, + "grad_norm": 7.117801710942014, + "learning_rate": 5.881099062586182e-07, + "loss": 0.589, + "step": 21698 + }, + { + "epoch": 1.5676485993461808, + "grad_norm": 7.601567016474763, + "learning_rate": 5.879214542196839e-07, + "loss": 0.706, + "step": 21699 + }, + { + "epoch": 1.5677208445463906, + "grad_norm": 7.134491604801591, + "learning_rate": 5.877330283556204e-07, + "loss": 0.6249, + "step": 21700 + }, + { + "epoch": 1.5677930897466, + "grad_norm": 6.368810903091403, + "learning_rate": 5.875446286690067e-07, + "loss": 0.5901, + "step": 21701 + }, + { + "epoch": 1.5678653349468095, + "grad_norm": 6.866710867631492, + "learning_rate": 5.873562551624229e-07, + "loss": 0.6013, + "step": 21702 + }, + { + "epoch": 1.567937580147019, + "grad_norm": 7.900848352778549, + "learning_rate": 5.871679078384463e-07, + "loss": 0.5857, + "step": 21703 + }, + { + "epoch": 1.5680098253472285, + "grad_norm": 6.662896934265202, + "learning_rate": 5.869795866996558e-07, + "loss": 0.6189, + "step": 21704 + }, + { + "epoch": 1.568082070547438, + "grad_norm": 6.53475230371423, + "learning_rate": 5.867912917486293e-07, + "loss": 0.6036, + "step": 21705 + }, + { + "epoch": 1.5681543157476474, + "grad_norm": 7.384507855525007, + "learning_rate": 5.866030229879452e-07, + "loss": 0.6215, + "step": 21706 + }, + { + "epoch": 1.5682265609478572, + "grad_norm": 7.1275598043543935, + "learning_rate": 5.864147804201794e-07, + "loss": 0.6974, + "step": 21707 + }, + { + "epoch": 1.5682988061480665, + "grad_norm": 6.743041796615118, + "learning_rate": 5.862265640479095e-07, + "loss": 0.6009, + "step": 21708 + }, + { + "epoch": 1.568371051348276, + "grad_norm": 6.787191995899997, + "learning_rate": 5.860383738737119e-07, + "loss": 0.6123, + "step": 21709 + }, + { + "epoch": 1.5684432965484856, + "grad_norm": 6.666901711453063, + "learning_rate": 5.858502099001631e-07, + "loss": 0.6195, + "step": 21710 + }, + { + "epoch": 1.5685155417486951, + "grad_norm": 7.1956491464067085, + "learning_rate": 5.856620721298384e-07, + "loss": 0.6538, + "step": 21711 + }, + { + "epoch": 1.5685877869489047, + "grad_norm": 6.815138839460503, + "learning_rate": 5.854739605653137e-07, + "loss": 0.5893, + "step": 21712 + }, + { + "epoch": 1.568660032149114, + "grad_norm": 7.406413273681561, + "learning_rate": 5.852858752091645e-07, + "loss": 0.5901, + "step": 21713 + }, + { + "epoch": 1.5687322773493237, + "grad_norm": 8.534821485854883, + "learning_rate": 5.850978160639645e-07, + "loss": 0.6602, + "step": 21714 + }, + { + "epoch": 1.568804522549533, + "grad_norm": 8.462345418264805, + "learning_rate": 5.849097831322884e-07, + "loss": 0.7048, + "step": 21715 + }, + { + "epoch": 1.5688767677497426, + "grad_norm": 7.232355944203943, + "learning_rate": 5.847217764167112e-07, + "loss": 0.6379, + "step": 21716 + }, + { + "epoch": 1.5689490129499521, + "grad_norm": 6.069005562835522, + "learning_rate": 5.84533795919805e-07, + "loss": 0.5721, + "step": 21717 + }, + { + "epoch": 1.5690212581501617, + "grad_norm": 6.698125651072009, + "learning_rate": 5.843458416441439e-07, + "loss": 0.5962, + "step": 21718 + }, + { + "epoch": 1.5690935033503712, + "grad_norm": 6.631490443092425, + "learning_rate": 5.841579135923e-07, + "loss": 0.5726, + "step": 21719 + }, + { + "epoch": 1.5691657485505806, + "grad_norm": 8.81523273924576, + "learning_rate": 5.839700117668485e-07, + "loss": 0.6914, + "step": 21720 + }, + { + "epoch": 1.5692379937507903, + "grad_norm": 6.747961796225606, + "learning_rate": 5.837821361703589e-07, + "loss": 0.6051, + "step": 21721 + }, + { + "epoch": 1.5693102389509996, + "grad_norm": 6.879889674078374, + "learning_rate": 5.835942868054037e-07, + "loss": 0.6345, + "step": 21722 + }, + { + "epoch": 1.5693824841512092, + "grad_norm": 6.926583714262291, + "learning_rate": 5.834064636745557e-07, + "loss": 0.6448, + "step": 21723 + }, + { + "epoch": 1.5694547293514187, + "grad_norm": 6.857128705282594, + "learning_rate": 5.832186667803844e-07, + "loss": 0.6349, + "step": 21724 + }, + { + "epoch": 1.5695269745516283, + "grad_norm": 7.584885937833304, + "learning_rate": 5.830308961254613e-07, + "loss": 0.7168, + "step": 21725 + }, + { + "epoch": 1.5695992197518378, + "grad_norm": 6.311106008578344, + "learning_rate": 5.828431517123567e-07, + "loss": 0.6894, + "step": 21726 + }, + { + "epoch": 1.5696714649520471, + "grad_norm": 6.209808515707857, + "learning_rate": 5.826554335436407e-07, + "loss": 0.6301, + "step": 21727 + }, + { + "epoch": 1.5697437101522569, + "grad_norm": 9.791566715677252, + "learning_rate": 5.824677416218832e-07, + "loss": 0.6489, + "step": 21728 + }, + { + "epoch": 1.5698159553524662, + "grad_norm": 6.752726251567827, + "learning_rate": 5.822800759496536e-07, + "loss": 0.622, + "step": 21729 + }, + { + "epoch": 1.5698882005526757, + "grad_norm": 6.036042676103619, + "learning_rate": 5.820924365295214e-07, + "loss": 0.6613, + "step": 21730 + }, + { + "epoch": 1.5699604457528853, + "grad_norm": 7.295766350049972, + "learning_rate": 5.819048233640537e-07, + "loss": 0.6132, + "step": 21731 + }, + { + "epoch": 1.5700326909530948, + "grad_norm": 6.420014271943103, + "learning_rate": 5.817172364558196e-07, + "loss": 0.5604, + "step": 21732 + }, + { + "epoch": 1.5701049361533044, + "grad_norm": 6.504443777055572, + "learning_rate": 5.815296758073874e-07, + "loss": 0.6682, + "step": 21733 + }, + { + "epoch": 1.5701771813535137, + "grad_norm": 7.01850407238341, + "learning_rate": 5.813421414213247e-07, + "loss": 0.5182, + "step": 21734 + }, + { + "epoch": 1.5702494265537235, + "grad_norm": 7.077260754097884, + "learning_rate": 5.81154633300198e-07, + "loss": 0.5663, + "step": 21735 + }, + { + "epoch": 1.5703216717539328, + "grad_norm": 6.2246850604317725, + "learning_rate": 5.809671514465743e-07, + "loss": 0.595, + "step": 21736 + }, + { + "epoch": 1.5703939169541423, + "grad_norm": 8.572508816540696, + "learning_rate": 5.807796958630202e-07, + "loss": 0.6782, + "step": 21737 + }, + { + "epoch": 1.5704661621543519, + "grad_norm": 8.406243193102496, + "learning_rate": 5.805922665521019e-07, + "loss": 0.6399, + "step": 21738 + }, + { + "epoch": 1.5705384073545614, + "grad_norm": 8.457667232148923, + "learning_rate": 5.804048635163853e-07, + "loss": 0.666, + "step": 21739 + }, + { + "epoch": 1.570610652554771, + "grad_norm": 7.37851540785973, + "learning_rate": 5.802174867584354e-07, + "loss": 0.6328, + "step": 21740 + }, + { + "epoch": 1.5706828977549803, + "grad_norm": 7.0886093536292, + "learning_rate": 5.800301362808183e-07, + "loss": 0.5523, + "step": 21741 + }, + { + "epoch": 1.57075514295519, + "grad_norm": 6.525138660579936, + "learning_rate": 5.798428120860972e-07, + "loss": 0.5758, + "step": 21742 + }, + { + "epoch": 1.5708273881553994, + "grad_norm": 6.6309957184067985, + "learning_rate": 5.796555141768373e-07, + "loss": 0.6748, + "step": 21743 + }, + { + "epoch": 1.570899633355609, + "grad_norm": 6.825675332888315, + "learning_rate": 5.79468242555602e-07, + "loss": 0.5316, + "step": 21744 + }, + { + "epoch": 1.5709718785558184, + "grad_norm": 6.279112798623847, + "learning_rate": 5.792809972249561e-07, + "loss": 0.6623, + "step": 21745 + }, + { + "epoch": 1.571044123756028, + "grad_norm": 6.9100389400549815, + "learning_rate": 5.790937781874614e-07, + "loss": 0.5842, + "step": 21746 + }, + { + "epoch": 1.5711163689562375, + "grad_norm": 7.36177869284733, + "learning_rate": 5.789065854456805e-07, + "loss": 0.5868, + "step": 21747 + }, + { + "epoch": 1.5711886141564468, + "grad_norm": 7.471702343307208, + "learning_rate": 5.787194190021784e-07, + "loss": 0.6222, + "step": 21748 + }, + { + "epoch": 1.5712608593566566, + "grad_norm": 7.0202594224429085, + "learning_rate": 5.785322788595149e-07, + "loss": 0.5934, + "step": 21749 + }, + { + "epoch": 1.571333104556866, + "grad_norm": 7.665654848198488, + "learning_rate": 5.783451650202526e-07, + "loss": 0.5841, + "step": 21750 + }, + { + "epoch": 1.5714053497570755, + "grad_norm": 7.938705713205327, + "learning_rate": 5.78158077486953e-07, + "loss": 0.5835, + "step": 21751 + }, + { + "epoch": 1.571477594957285, + "grad_norm": 6.670394268593599, + "learning_rate": 5.779710162621777e-07, + "loss": 0.5944, + "step": 21752 + }, + { + "epoch": 1.5715498401574945, + "grad_norm": 7.410617490257749, + "learning_rate": 5.777839813484862e-07, + "loss": 0.5545, + "step": 21753 + }, + { + "epoch": 1.571622085357704, + "grad_norm": 6.887712966201855, + "learning_rate": 5.775969727484393e-07, + "loss": 0.5934, + "step": 21754 + }, + { + "epoch": 1.5716943305579134, + "grad_norm": 7.661959516123077, + "learning_rate": 5.774099904645975e-07, + "loss": 0.6918, + "step": 21755 + }, + { + "epoch": 1.5717665757581232, + "grad_norm": 6.575999879414726, + "learning_rate": 5.772230344995198e-07, + "loss": 0.6094, + "step": 21756 + }, + { + "epoch": 1.5718388209583325, + "grad_norm": 7.144956514561212, + "learning_rate": 5.77036104855766e-07, + "loss": 0.6042, + "step": 21757 + }, + { + "epoch": 1.5719110661585423, + "grad_norm": 7.740008404552224, + "learning_rate": 5.768492015358948e-07, + "loss": 0.5708, + "step": 21758 + }, + { + "epoch": 1.5719833113587516, + "grad_norm": 6.709407396598107, + "learning_rate": 5.766623245424655e-07, + "loss": 0.5291, + "step": 21759 + }, + { + "epoch": 1.5720555565589611, + "grad_norm": 7.102350743090689, + "learning_rate": 5.764754738780351e-07, + "loss": 0.6159, + "step": 21760 + }, + { + "epoch": 1.5721278017591707, + "grad_norm": 6.507595905853171, + "learning_rate": 5.762886495451616e-07, + "loss": 0.5525, + "step": 21761 + }, + { + "epoch": 1.57220004695938, + "grad_norm": 6.010282606440178, + "learning_rate": 5.761018515464031e-07, + "loss": 0.5751, + "step": 21762 + }, + { + "epoch": 1.5722722921595897, + "grad_norm": 6.492931630883701, + "learning_rate": 5.759150798843169e-07, + "loss": 0.5726, + "step": 21763 + }, + { + "epoch": 1.572344537359799, + "grad_norm": 7.7625090029260395, + "learning_rate": 5.757283345614589e-07, + "loss": 0.6394, + "step": 21764 + }, + { + "epoch": 1.5724167825600088, + "grad_norm": 8.677259378528777, + "learning_rate": 5.755416155803856e-07, + "loss": 0.6873, + "step": 21765 + }, + { + "epoch": 1.5724890277602181, + "grad_norm": 8.948022157153073, + "learning_rate": 5.753549229436533e-07, + "loss": 0.7436, + "step": 21766 + }, + { + "epoch": 1.5725612729604277, + "grad_norm": 6.7695549173706295, + "learning_rate": 5.75168256653818e-07, + "loss": 0.6499, + "step": 21767 + }, + { + "epoch": 1.5726335181606372, + "grad_norm": 6.0345276144080024, + "learning_rate": 5.749816167134348e-07, + "loss": 0.6216, + "step": 21768 + }, + { + "epoch": 1.5727057633608466, + "grad_norm": 7.394949359630135, + "learning_rate": 5.747950031250582e-07, + "loss": 0.5768, + "step": 21769 + }, + { + "epoch": 1.5727780085610563, + "grad_norm": 6.6832704092410555, + "learning_rate": 5.74608415891244e-07, + "loss": 0.5787, + "step": 21770 + }, + { + "epoch": 1.5728502537612656, + "grad_norm": 9.265195170784821, + "learning_rate": 5.744218550145453e-07, + "loss": 0.61, + "step": 21771 + }, + { + "epoch": 1.5729224989614754, + "grad_norm": 7.441101740816846, + "learning_rate": 5.74235320497516e-07, + "loss": 0.6874, + "step": 21772 + }, + { + "epoch": 1.5729947441616847, + "grad_norm": 6.312989168759364, + "learning_rate": 5.740488123427104e-07, + "loss": 0.6523, + "step": 21773 + }, + { + "epoch": 1.5730669893618943, + "grad_norm": 7.073556263413045, + "learning_rate": 5.738623305526808e-07, + "loss": 0.6883, + "step": 21774 + }, + { + "epoch": 1.5731392345621038, + "grad_norm": 7.282510934776323, + "learning_rate": 5.736758751299795e-07, + "loss": 0.6885, + "step": 21775 + }, + { + "epoch": 1.5732114797623131, + "grad_norm": 6.687954896090334, + "learning_rate": 5.734894460771603e-07, + "loss": 0.5809, + "step": 21776 + }, + { + "epoch": 1.573283724962523, + "grad_norm": 6.572424659108555, + "learning_rate": 5.733030433967756e-07, + "loss": 0.5443, + "step": 21777 + }, + { + "epoch": 1.5733559701627322, + "grad_norm": 7.077458567119069, + "learning_rate": 5.731166670913757e-07, + "loss": 0.6414, + "step": 21778 + }, + { + "epoch": 1.573428215362942, + "grad_norm": 8.295332310822314, + "learning_rate": 5.729303171635123e-07, + "loss": 0.6898, + "step": 21779 + }, + { + "epoch": 1.5735004605631513, + "grad_norm": 6.417998468814333, + "learning_rate": 5.727439936157372e-07, + "loss": 0.5918, + "step": 21780 + }, + { + "epoch": 1.5735727057633608, + "grad_norm": 8.850681136477002, + "learning_rate": 5.725576964505999e-07, + "loss": 0.6137, + "step": 21781 + }, + { + "epoch": 1.5736449509635704, + "grad_norm": 8.40104338205836, + "learning_rate": 5.723714256706512e-07, + "loss": 0.6285, + "step": 21782 + }, + { + "epoch": 1.57371719616378, + "grad_norm": 6.70794006837699, + "learning_rate": 5.721851812784409e-07, + "loss": 0.5141, + "step": 21783 + }, + { + "epoch": 1.5737894413639895, + "grad_norm": 7.544877887847443, + "learning_rate": 5.719989632765186e-07, + "loss": 0.6003, + "step": 21784 + }, + { + "epoch": 1.5738616865641988, + "grad_norm": 6.47679691787767, + "learning_rate": 5.718127716674335e-07, + "loss": 0.6514, + "step": 21785 + }, + { + "epoch": 1.5739339317644085, + "grad_norm": 8.3974018256765, + "learning_rate": 5.716266064537346e-07, + "loss": 0.6244, + "step": 21786 + }, + { + "epoch": 1.5740061769646179, + "grad_norm": 7.928171999577382, + "learning_rate": 5.714404676379701e-07, + "loss": 0.5477, + "step": 21787 + }, + { + "epoch": 1.5740784221648274, + "grad_norm": 6.8544375782525115, + "learning_rate": 5.712543552226887e-07, + "loss": 0.6002, + "step": 21788 + }, + { + "epoch": 1.574150667365037, + "grad_norm": 8.026994936156253, + "learning_rate": 5.710682692104369e-07, + "loss": 0.6066, + "step": 21789 + }, + { + "epoch": 1.5742229125652465, + "grad_norm": 8.810768769027453, + "learning_rate": 5.708822096037628e-07, + "loss": 0.6243, + "step": 21790 + }, + { + "epoch": 1.574295157765456, + "grad_norm": 5.906207695057651, + "learning_rate": 5.706961764052141e-07, + "loss": 0.6155, + "step": 21791 + }, + { + "epoch": 1.5743674029656654, + "grad_norm": 6.112130482743798, + "learning_rate": 5.705101696173363e-07, + "loss": 0.607, + "step": 21792 + }, + { + "epoch": 1.5744396481658751, + "grad_norm": 6.65809770468758, + "learning_rate": 5.703241892426758e-07, + "loss": 0.6234, + "step": 21793 + }, + { + "epoch": 1.5745118933660844, + "grad_norm": 8.778486692664131, + "learning_rate": 5.70138235283779e-07, + "loss": 0.6644, + "step": 21794 + }, + { + "epoch": 1.574584138566294, + "grad_norm": 6.974148562293846, + "learning_rate": 5.699523077431912e-07, + "loss": 0.5791, + "step": 21795 + }, + { + "epoch": 1.5746563837665035, + "grad_norm": 6.453854424952314, + "learning_rate": 5.697664066234579e-07, + "loss": 0.6157, + "step": 21796 + }, + { + "epoch": 1.574728628966713, + "grad_norm": 6.582232644823196, + "learning_rate": 5.695805319271236e-07, + "loss": 0.542, + "step": 21797 + }, + { + "epoch": 1.5748008741669226, + "grad_norm": 5.551087748839186, + "learning_rate": 5.693946836567337e-07, + "loss": 0.494, + "step": 21798 + }, + { + "epoch": 1.574873119367132, + "grad_norm": 7.676220317833505, + "learning_rate": 5.692088618148309e-07, + "loss": 0.6866, + "step": 21799 + }, + { + "epoch": 1.5749453645673417, + "grad_norm": 6.659935732236425, + "learning_rate": 5.690230664039595e-07, + "loss": 0.595, + "step": 21800 + }, + { + "epoch": 1.575017609767551, + "grad_norm": 7.241913593795497, + "learning_rate": 5.688372974266631e-07, + "loss": 0.6022, + "step": 21801 + }, + { + "epoch": 1.5750898549677605, + "grad_norm": 6.440924465097218, + "learning_rate": 5.686515548854854e-07, + "loss": 0.6325, + "step": 21802 + }, + { + "epoch": 1.57516210016797, + "grad_norm": 6.194651160713547, + "learning_rate": 5.68465838782967e-07, + "loss": 0.6524, + "step": 21803 + }, + { + "epoch": 1.5752343453681796, + "grad_norm": 7.527476271456057, + "learning_rate": 5.682801491216519e-07, + "loss": 0.5912, + "step": 21804 + }, + { + "epoch": 1.5753065905683892, + "grad_norm": 6.771215653018129, + "learning_rate": 5.680944859040826e-07, + "loss": 0.6234, + "step": 21805 + }, + { + "epoch": 1.5753788357685985, + "grad_norm": 8.59473760305293, + "learning_rate": 5.679088491327991e-07, + "loss": 0.6048, + "step": 21806 + }, + { + "epoch": 1.5754510809688083, + "grad_norm": 7.410677459677823, + "learning_rate": 5.677232388103432e-07, + "loss": 0.6139, + "step": 21807 + }, + { + "epoch": 1.5755233261690176, + "grad_norm": 6.039981194776351, + "learning_rate": 5.675376549392559e-07, + "loss": 0.5751, + "step": 21808 + }, + { + "epoch": 1.5755955713692271, + "grad_norm": 7.3308587089372, + "learning_rate": 5.673520975220781e-07, + "loss": 0.5135, + "step": 21809 + }, + { + "epoch": 1.5756678165694367, + "grad_norm": 7.38531445287681, + "learning_rate": 5.671665665613491e-07, + "loss": 0.5587, + "step": 21810 + }, + { + "epoch": 1.5757400617696462, + "grad_norm": 7.686425855073157, + "learning_rate": 5.669810620596089e-07, + "loss": 0.6158, + "step": 21811 + }, + { + "epoch": 1.5758123069698557, + "grad_norm": 7.750602452642391, + "learning_rate": 5.667955840193973e-07, + "loss": 0.6704, + "step": 21812 + }, + { + "epoch": 1.575884552170065, + "grad_norm": 7.886896744524368, + "learning_rate": 5.666101324432527e-07, + "loss": 0.56, + "step": 21813 + }, + { + "epoch": 1.5759567973702748, + "grad_norm": 6.2858866568871905, + "learning_rate": 5.664247073337145e-07, + "loss": 0.6446, + "step": 21814 + }, + { + "epoch": 1.5760290425704842, + "grad_norm": 7.698374430493123, + "learning_rate": 5.662393086933208e-07, + "loss": 0.6589, + "step": 21815 + }, + { + "epoch": 1.5761012877706937, + "grad_norm": 7.979573876039, + "learning_rate": 5.660539365246098e-07, + "loss": 0.6122, + "step": 21816 + }, + { + "epoch": 1.5761735329709032, + "grad_norm": 5.734499397279796, + "learning_rate": 5.658685908301184e-07, + "loss": 0.5916, + "step": 21817 + }, + { + "epoch": 1.5762457781711128, + "grad_norm": 7.172045470894078, + "learning_rate": 5.656832716123842e-07, + "loss": 0.5615, + "step": 21818 + }, + { + "epoch": 1.5763180233713223, + "grad_norm": 5.88628877462957, + "learning_rate": 5.654979788739442e-07, + "loss": 0.6412, + "step": 21819 + }, + { + "epoch": 1.5763902685715316, + "grad_norm": 6.686956936186014, + "learning_rate": 5.653127126173355e-07, + "loss": 0.5866, + "step": 21820 + }, + { + "epoch": 1.5764625137717414, + "grad_norm": 7.601074955861121, + "learning_rate": 5.651274728450928e-07, + "loss": 0.5837, + "step": 21821 + }, + { + "epoch": 1.5765347589719507, + "grad_norm": 8.317981260850457, + "learning_rate": 5.649422595597526e-07, + "loss": 0.6367, + "step": 21822 + }, + { + "epoch": 1.5766070041721603, + "grad_norm": 7.847871829651569, + "learning_rate": 5.647570727638504e-07, + "loss": 0.7095, + "step": 21823 + }, + { + "epoch": 1.5766792493723698, + "grad_norm": 8.750915043532368, + "learning_rate": 5.645719124599213e-07, + "loss": 0.6836, + "step": 21824 + }, + { + "epoch": 1.5767514945725793, + "grad_norm": 6.427426057536381, + "learning_rate": 5.643867786505e-07, + "loss": 0.5862, + "step": 21825 + }, + { + "epoch": 1.576823739772789, + "grad_norm": 6.765880694414063, + "learning_rate": 5.642016713381207e-07, + "loss": 0.6408, + "step": 21826 + }, + { + "epoch": 1.5768959849729982, + "grad_norm": 7.09205882881667, + "learning_rate": 5.640165905253184e-07, + "loss": 0.6114, + "step": 21827 + }, + { + "epoch": 1.576968230173208, + "grad_norm": 6.226084615444274, + "learning_rate": 5.638315362146249e-07, + "loss": 0.5711, + "step": 21828 + }, + { + "epoch": 1.5770404753734173, + "grad_norm": 7.317866516213663, + "learning_rate": 5.636465084085743e-07, + "loss": 0.6094, + "step": 21829 + }, + { + "epoch": 1.5771127205736268, + "grad_norm": 8.48562843238956, + "learning_rate": 5.634615071097002e-07, + "loss": 0.6806, + "step": 21830 + }, + { + "epoch": 1.5771849657738364, + "grad_norm": 7.504489063861652, + "learning_rate": 5.632765323205339e-07, + "loss": 0.6008, + "step": 21831 + }, + { + "epoch": 1.577257210974046, + "grad_norm": 7.169937836232575, + "learning_rate": 5.630915840436074e-07, + "loss": 0.5887, + "step": 21832 + }, + { + "epoch": 1.5773294561742555, + "grad_norm": 7.27175533129, + "learning_rate": 5.629066622814539e-07, + "loss": 0.6304, + "step": 21833 + }, + { + "epoch": 1.5774017013744648, + "grad_norm": 7.9154117861231, + "learning_rate": 5.627217670366044e-07, + "loss": 0.6121, + "step": 21834 + }, + { + "epoch": 1.5774739465746745, + "grad_norm": 6.864675086126505, + "learning_rate": 5.625368983115895e-07, + "loss": 0.5743, + "step": 21835 + }, + { + "epoch": 1.5775461917748839, + "grad_norm": 6.14456756833275, + "learning_rate": 5.623520561089401e-07, + "loss": 0.5558, + "step": 21836 + }, + { + "epoch": 1.5776184369750936, + "grad_norm": 6.852431552103934, + "learning_rate": 5.62167240431187e-07, + "loss": 0.5935, + "step": 21837 + }, + { + "epoch": 1.577690682175303, + "grad_norm": 7.2219516540447275, + "learning_rate": 5.619824512808591e-07, + "loss": 0.6107, + "step": 21838 + }, + { + "epoch": 1.5777629273755125, + "grad_norm": 6.82584578754399, + "learning_rate": 5.61797688660487e-07, + "loss": 0.6225, + "step": 21839 + }, + { + "epoch": 1.577835172575722, + "grad_norm": 8.057079770228674, + "learning_rate": 5.616129525725994e-07, + "loss": 0.5633, + "step": 21840 + }, + { + "epoch": 1.5779074177759314, + "grad_norm": 7.119861556861003, + "learning_rate": 5.614282430197254e-07, + "loss": 0.6726, + "step": 21841 + }, + { + "epoch": 1.5779796629761411, + "grad_norm": 6.116890657708324, + "learning_rate": 5.612435600043936e-07, + "loss": 0.5957, + "step": 21842 + }, + { + "epoch": 1.5780519081763504, + "grad_norm": 7.001991261228084, + "learning_rate": 5.61058903529132e-07, + "loss": 0.6696, + "step": 21843 + }, + { + "epoch": 1.5781241533765602, + "grad_norm": 7.391009524385067, + "learning_rate": 5.608742735964687e-07, + "loss": 0.5411, + "step": 21844 + }, + { + "epoch": 1.5781963985767695, + "grad_norm": 6.6753878134320255, + "learning_rate": 5.606896702089315e-07, + "loss": 0.668, + "step": 21845 + }, + { + "epoch": 1.578268643776979, + "grad_norm": 7.298419927707785, + "learning_rate": 5.605050933690461e-07, + "loss": 0.6027, + "step": 21846 + }, + { + "epoch": 1.5783408889771886, + "grad_norm": 7.105164071000055, + "learning_rate": 5.603205430793405e-07, + "loss": 0.6238, + "step": 21847 + }, + { + "epoch": 1.578413134177398, + "grad_norm": 9.037106631253636, + "learning_rate": 5.601360193423408e-07, + "loss": 0.612, + "step": 21848 + }, + { + "epoch": 1.5784853793776077, + "grad_norm": 6.2446070482857445, + "learning_rate": 5.599515221605725e-07, + "loss": 0.6345, + "step": 21849 + }, + { + "epoch": 1.578557624577817, + "grad_norm": 6.208888837637901, + "learning_rate": 5.597670515365613e-07, + "loss": 0.673, + "step": 21850 + }, + { + "epoch": 1.5786298697780268, + "grad_norm": 7.164712092277984, + "learning_rate": 5.595826074728328e-07, + "loss": 0.6057, + "step": 21851 + }, + { + "epoch": 1.578702114978236, + "grad_norm": 7.205253634095099, + "learning_rate": 5.593981899719117e-07, + "loss": 0.5206, + "step": 21852 + }, + { + "epoch": 1.5787743601784456, + "grad_norm": 6.788497778404464, + "learning_rate": 5.592137990363225e-07, + "loss": 0.5769, + "step": 21853 + }, + { + "epoch": 1.5788466053786552, + "grad_norm": 7.7022967096361254, + "learning_rate": 5.590294346685896e-07, + "loss": 0.6074, + "step": 21854 + }, + { + "epoch": 1.5789188505788647, + "grad_norm": 7.7220780460550404, + "learning_rate": 5.588450968712372e-07, + "loss": 0.6607, + "step": 21855 + }, + { + "epoch": 1.5789910957790743, + "grad_norm": 7.025158765634628, + "learning_rate": 5.586607856467877e-07, + "loss": 0.5623, + "step": 21856 + }, + { + "epoch": 1.5790633409792836, + "grad_norm": 7.071082917279847, + "learning_rate": 5.584765009977647e-07, + "loss": 0.6084, + "step": 21857 + }, + { + "epoch": 1.5791355861794933, + "grad_norm": 6.4367316815283075, + "learning_rate": 5.582922429266907e-07, + "loss": 0.6256, + "step": 21858 + }, + { + "epoch": 1.5792078313797027, + "grad_norm": 6.576749608012619, + "learning_rate": 5.581080114360892e-07, + "loss": 0.5929, + "step": 21859 + }, + { + "epoch": 1.5792800765799122, + "grad_norm": 7.044124227092321, + "learning_rate": 5.579238065284798e-07, + "loss": 0.5857, + "step": 21860 + }, + { + "epoch": 1.5793523217801217, + "grad_norm": 7.5380863802994105, + "learning_rate": 5.57739628206386e-07, + "loss": 0.7635, + "step": 21861 + }, + { + "epoch": 1.5794245669803313, + "grad_norm": 8.995050976933745, + "learning_rate": 5.575554764723295e-07, + "loss": 0.6071, + "step": 21862 + }, + { + "epoch": 1.5794968121805408, + "grad_norm": 7.853193335761425, + "learning_rate": 5.573713513288298e-07, + "loss": 0.6305, + "step": 21863 + }, + { + "epoch": 1.5795690573807502, + "grad_norm": 6.8095427927926995, + "learning_rate": 5.571872527784078e-07, + "loss": 0.5738, + "step": 21864 + }, + { + "epoch": 1.57964130258096, + "grad_norm": 6.858192289747663, + "learning_rate": 5.570031808235837e-07, + "loss": 0.568, + "step": 21865 + }, + { + "epoch": 1.5797135477811692, + "grad_norm": 7.150701112317944, + "learning_rate": 5.568191354668784e-07, + "loss": 0.6042, + "step": 21866 + }, + { + "epoch": 1.5797857929813788, + "grad_norm": 6.768790475824705, + "learning_rate": 5.566351167108094e-07, + "loss": 0.5591, + "step": 21867 + }, + { + "epoch": 1.5798580381815883, + "grad_norm": 6.497827900578555, + "learning_rate": 5.564511245578966e-07, + "loss": 0.6249, + "step": 21868 + }, + { + "epoch": 1.5799302833817979, + "grad_norm": 7.113646446041393, + "learning_rate": 5.562671590106591e-07, + "loss": 0.5974, + "step": 21869 + }, + { + "epoch": 1.5800025285820074, + "grad_norm": 7.648472163401405, + "learning_rate": 5.560832200716149e-07, + "loss": 0.5997, + "step": 21870 + }, + { + "epoch": 1.5800747737822167, + "grad_norm": 6.948143569771864, + "learning_rate": 5.55899307743282e-07, + "loss": 0.5439, + "step": 21871 + }, + { + "epoch": 1.5801470189824265, + "grad_norm": 6.89818312455692, + "learning_rate": 5.557154220281782e-07, + "loss": 0.5422, + "step": 21872 + }, + { + "epoch": 1.5802192641826358, + "grad_norm": 5.299896548719043, + "learning_rate": 5.555315629288213e-07, + "loss": 0.6252, + "step": 21873 + }, + { + "epoch": 1.5802915093828453, + "grad_norm": 7.357338398783148, + "learning_rate": 5.55347730447727e-07, + "loss": 0.629, + "step": 21874 + }, + { + "epoch": 1.580363754583055, + "grad_norm": 8.553338042823087, + "learning_rate": 5.551639245874122e-07, + "loss": 0.634, + "step": 21875 + }, + { + "epoch": 1.5804359997832644, + "grad_norm": 6.335610264842568, + "learning_rate": 5.54980145350393e-07, + "loss": 0.6211, + "step": 21876 + }, + { + "epoch": 1.580508244983474, + "grad_norm": 6.649748730035401, + "learning_rate": 5.547963927391861e-07, + "loss": 0.6276, + "step": 21877 + }, + { + "epoch": 1.5805804901836833, + "grad_norm": 6.877462050271865, + "learning_rate": 5.546126667563059e-07, + "loss": 0.6735, + "step": 21878 + }, + { + "epoch": 1.580652735383893, + "grad_norm": 7.255276831426542, + "learning_rate": 5.544289674042674e-07, + "loss": 0.5881, + "step": 21879 + }, + { + "epoch": 1.5807249805841024, + "grad_norm": 7.183910501869922, + "learning_rate": 5.542452946855858e-07, + "loss": 0.6253, + "step": 21880 + }, + { + "epoch": 1.580797225784312, + "grad_norm": 6.575780310216607, + "learning_rate": 5.540616486027756e-07, + "loss": 0.5672, + "step": 21881 + }, + { + "epoch": 1.5808694709845215, + "grad_norm": 6.7447924836107935, + "learning_rate": 5.538780291583504e-07, + "loss": 0.6047, + "step": 21882 + }, + { + "epoch": 1.580941716184731, + "grad_norm": 8.264573605448911, + "learning_rate": 5.536944363548238e-07, + "loss": 0.6534, + "step": 21883 + }, + { + "epoch": 1.5810139613849405, + "grad_norm": 6.97692063662933, + "learning_rate": 5.535108701947098e-07, + "loss": 0.6363, + "step": 21884 + }, + { + "epoch": 1.5810862065851499, + "grad_norm": 7.622199122570292, + "learning_rate": 5.5332733068052e-07, + "loss": 0.6248, + "step": 21885 + }, + { + "epoch": 1.5811584517853596, + "grad_norm": 8.195842191235311, + "learning_rate": 5.531438178147677e-07, + "loss": 0.6036, + "step": 21886 + }, + { + "epoch": 1.581230696985569, + "grad_norm": 7.438507268320666, + "learning_rate": 5.529603315999654e-07, + "loss": 0.6766, + "step": 21887 + }, + { + "epoch": 1.5813029421857785, + "grad_norm": 8.68776082257975, + "learning_rate": 5.527768720386231e-07, + "loss": 0.6948, + "step": 21888 + }, + { + "epoch": 1.581375187385988, + "grad_norm": 6.387639698509047, + "learning_rate": 5.525934391332541e-07, + "loss": 0.5452, + "step": 21889 + }, + { + "epoch": 1.5814474325861976, + "grad_norm": 9.454043408382368, + "learning_rate": 5.524100328863691e-07, + "loss": 0.5853, + "step": 21890 + }, + { + "epoch": 1.5815196777864071, + "grad_norm": 6.158772372992363, + "learning_rate": 5.522266533004789e-07, + "loss": 0.5718, + "step": 21891 + }, + { + "epoch": 1.5815919229866164, + "grad_norm": 6.391727417711445, + "learning_rate": 5.520433003780929e-07, + "loss": 0.5581, + "step": 21892 + }, + { + "epoch": 1.5816641681868262, + "grad_norm": 7.949989885047859, + "learning_rate": 5.518599741217217e-07, + "loss": 0.6693, + "step": 21893 + }, + { + "epoch": 1.5817364133870355, + "grad_norm": 6.457978499314427, + "learning_rate": 5.516766745338747e-07, + "loss": 0.6697, + "step": 21894 + }, + { + "epoch": 1.581808658587245, + "grad_norm": 7.359833202959876, + "learning_rate": 5.51493401617062e-07, + "loss": 0.6059, + "step": 21895 + }, + { + "epoch": 1.5818809037874546, + "grad_norm": 7.003785608362123, + "learning_rate": 5.513101553737912e-07, + "loss": 0.6138, + "step": 21896 + }, + { + "epoch": 1.5819531489876641, + "grad_norm": 8.263431596970625, + "learning_rate": 5.51126935806571e-07, + "loss": 0.6263, + "step": 21897 + }, + { + "epoch": 1.5820253941878737, + "grad_norm": 6.3052888982837505, + "learning_rate": 5.509437429179102e-07, + "loss": 0.6286, + "step": 21898 + }, + { + "epoch": 1.582097639388083, + "grad_norm": 6.345326495536769, + "learning_rate": 5.507605767103161e-07, + "loss": 0.5636, + "step": 21899 + }, + { + "epoch": 1.5821698845882928, + "grad_norm": 8.162975593019164, + "learning_rate": 5.50577437186296e-07, + "loss": 0.5687, + "step": 21900 + }, + { + "epoch": 1.582242129788502, + "grad_norm": 6.767167759022955, + "learning_rate": 5.503943243483572e-07, + "loss": 0.6057, + "step": 21901 + }, + { + "epoch": 1.5823143749887116, + "grad_norm": 6.348654368642261, + "learning_rate": 5.502112381990072e-07, + "loss": 0.6206, + "step": 21902 + }, + { + "epoch": 1.5823866201889212, + "grad_norm": 6.725967008702052, + "learning_rate": 5.500281787407507e-07, + "loss": 0.578, + "step": 21903 + }, + { + "epoch": 1.5824588653891307, + "grad_norm": 7.003713984916827, + "learning_rate": 5.498451459760943e-07, + "loss": 0.5733, + "step": 21904 + }, + { + "epoch": 1.5825311105893403, + "grad_norm": 6.391474361917051, + "learning_rate": 5.496621399075444e-07, + "loss": 0.5666, + "step": 21905 + }, + { + "epoch": 1.5826033557895496, + "grad_norm": 5.99107714614774, + "learning_rate": 5.494791605376046e-07, + "loss": 0.5316, + "step": 21906 + }, + { + "epoch": 1.5826756009897593, + "grad_norm": 6.378200251328342, + "learning_rate": 5.492962078687808e-07, + "loss": 0.547, + "step": 21907 + }, + { + "epoch": 1.5827478461899687, + "grad_norm": 7.116283773921369, + "learning_rate": 5.491132819035774e-07, + "loss": 0.681, + "step": 21908 + }, + { + "epoch": 1.5828200913901784, + "grad_norm": 8.280978677011618, + "learning_rate": 5.489303826444981e-07, + "loss": 0.6379, + "step": 21909 + }, + { + "epoch": 1.5828923365903877, + "grad_norm": 6.976333665672459, + "learning_rate": 5.487475100940473e-07, + "loss": 0.5181, + "step": 21910 + }, + { + "epoch": 1.5829645817905973, + "grad_norm": 5.8394031371196435, + "learning_rate": 5.485646642547277e-07, + "loss": 0.6306, + "step": 21911 + }, + { + "epoch": 1.5830368269908068, + "grad_norm": 7.481727721243551, + "learning_rate": 5.483818451290435e-07, + "loss": 0.6803, + "step": 21912 + }, + { + "epoch": 1.5831090721910162, + "grad_norm": 8.097728797009806, + "learning_rate": 5.481990527194958e-07, + "loss": 0.6648, + "step": 21913 + }, + { + "epoch": 1.583181317391226, + "grad_norm": 7.234083132695667, + "learning_rate": 5.480162870285877e-07, + "loss": 0.6452, + "step": 21914 + }, + { + "epoch": 1.5832535625914352, + "grad_norm": 8.285792994007924, + "learning_rate": 5.478335480588209e-07, + "loss": 0.6405, + "step": 21915 + }, + { + "epoch": 1.583325807791645, + "grad_norm": 7.3140976945518865, + "learning_rate": 5.47650835812697e-07, + "loss": 0.5968, + "step": 21916 + }, + { + "epoch": 1.5833980529918543, + "grad_norm": 8.239680454020958, + "learning_rate": 5.474681502927174e-07, + "loss": 0.6657, + "step": 21917 + }, + { + "epoch": 1.5834702981920639, + "grad_norm": 6.768324104358524, + "learning_rate": 5.472854915013829e-07, + "loss": 0.5676, + "step": 21918 + }, + { + "epoch": 1.5835425433922734, + "grad_norm": 7.085072322629779, + "learning_rate": 5.471028594411934e-07, + "loss": 0.6395, + "step": 21919 + }, + { + "epoch": 1.5836147885924827, + "grad_norm": 8.221882850356144, + "learning_rate": 5.469202541146504e-07, + "loss": 0.6226, + "step": 21920 + }, + { + "epoch": 1.5836870337926925, + "grad_norm": 6.460517989611343, + "learning_rate": 5.467376755242521e-07, + "loss": 0.6716, + "step": 21921 + }, + { + "epoch": 1.5837592789929018, + "grad_norm": 6.968035879452256, + "learning_rate": 5.46555123672498e-07, + "loss": 0.6457, + "step": 21922 + }, + { + "epoch": 1.5838315241931116, + "grad_norm": 6.799987220752152, + "learning_rate": 5.463725985618887e-07, + "loss": 0.5878, + "step": 21923 + }, + { + "epoch": 1.583903769393321, + "grad_norm": 6.413108169240626, + "learning_rate": 5.461901001949204e-07, + "loss": 0.6468, + "step": 21924 + }, + { + "epoch": 1.5839760145935304, + "grad_norm": 7.577886141129327, + "learning_rate": 5.460076285740931e-07, + "loss": 0.6254, + "step": 21925 + }, + { + "epoch": 1.58404825979374, + "grad_norm": 6.956686072378079, + "learning_rate": 5.458251837019038e-07, + "loss": 0.5697, + "step": 21926 + }, + { + "epoch": 1.5841205049939495, + "grad_norm": 7.620274314448359, + "learning_rate": 5.456427655808508e-07, + "loss": 0.6531, + "step": 21927 + }, + { + "epoch": 1.584192750194159, + "grad_norm": 7.433780028374119, + "learning_rate": 5.454603742134306e-07, + "loss": 0.6523, + "step": 21928 + }, + { + "epoch": 1.5842649953943684, + "grad_norm": 7.418820300648905, + "learning_rate": 5.452780096021404e-07, + "loss": 0.5748, + "step": 21929 + }, + { + "epoch": 1.5843372405945781, + "grad_norm": 7.8518063939109535, + "learning_rate": 5.45095671749477e-07, + "loss": 0.664, + "step": 21930 + }, + { + "epoch": 1.5844094857947875, + "grad_norm": 7.138451443672631, + "learning_rate": 5.449133606579354e-07, + "loss": 0.6219, + "step": 21931 + }, + { + "epoch": 1.584481730994997, + "grad_norm": 6.795455468683131, + "learning_rate": 5.44731076330012e-07, + "loss": 0.6066, + "step": 21932 + }, + { + "epoch": 1.5845539761952065, + "grad_norm": 7.776888808658404, + "learning_rate": 5.445488187682019e-07, + "loss": 0.5972, + "step": 21933 + }, + { + "epoch": 1.584626221395416, + "grad_norm": 7.320178573181239, + "learning_rate": 5.443665879750007e-07, + "loss": 0.6211, + "step": 21934 + }, + { + "epoch": 1.5846984665956256, + "grad_norm": 7.453261400170235, + "learning_rate": 5.441843839529018e-07, + "loss": 0.5303, + "step": 21935 + }, + { + "epoch": 1.584770711795835, + "grad_norm": 6.121485421917784, + "learning_rate": 5.440022067044004e-07, + "loss": 0.584, + "step": 21936 + }, + { + "epoch": 1.5848429569960447, + "grad_norm": 6.51907367140479, + "learning_rate": 5.438200562319898e-07, + "loss": 0.5912, + "step": 21937 + }, + { + "epoch": 1.584915202196254, + "grad_norm": 8.848723721476642, + "learning_rate": 5.436379325381638e-07, + "loss": 0.6216, + "step": 21938 + }, + { + "epoch": 1.5849874473964636, + "grad_norm": 7.066667525573294, + "learning_rate": 5.434558356254155e-07, + "loss": 0.6748, + "step": 21939 + }, + { + "epoch": 1.5850596925966731, + "grad_norm": 7.652804101374454, + "learning_rate": 5.432737654962375e-07, + "loss": 0.6122, + "step": 21940 + }, + { + "epoch": 1.5851319377968827, + "grad_norm": 7.157392785391549, + "learning_rate": 5.430917221531232e-07, + "loss": 0.6082, + "step": 21941 + }, + { + "epoch": 1.5852041829970922, + "grad_norm": 7.4457359553798526, + "learning_rate": 5.42909705598563e-07, + "loss": 0.6918, + "step": 21942 + }, + { + "epoch": 1.5852764281973015, + "grad_norm": 7.172158229446203, + "learning_rate": 5.427277158350494e-07, + "loss": 0.6364, + "step": 21943 + }, + { + "epoch": 1.5853486733975113, + "grad_norm": 8.146715459516203, + "learning_rate": 5.425457528650737e-07, + "loss": 0.5776, + "step": 21944 + }, + { + "epoch": 1.5854209185977206, + "grad_norm": 9.530074050151299, + "learning_rate": 5.423638166911269e-07, + "loss": 0.6515, + "step": 21945 + }, + { + "epoch": 1.5854931637979301, + "grad_norm": 7.5248844581427585, + "learning_rate": 5.421819073156992e-07, + "loss": 0.6535, + "step": 21946 + }, + { + "epoch": 1.5855654089981397, + "grad_norm": 7.20423493346535, + "learning_rate": 5.42000024741281e-07, + "loss": 0.6234, + "step": 21947 + }, + { + "epoch": 1.5856376541983492, + "grad_norm": 7.10306505479201, + "learning_rate": 5.41818168970363e-07, + "loss": 0.5848, + "step": 21948 + }, + { + "epoch": 1.5857098993985588, + "grad_norm": 6.087352974935418, + "learning_rate": 5.416363400054333e-07, + "loss": 0.6353, + "step": 21949 + }, + { + "epoch": 1.585782144598768, + "grad_norm": 5.877117019046533, + "learning_rate": 5.414545378489813e-07, + "loss": 0.624, + "step": 21950 + }, + { + "epoch": 1.5858543897989779, + "grad_norm": 6.876414482952392, + "learning_rate": 5.412727625034961e-07, + "loss": 0.571, + "step": 21951 + }, + { + "epoch": 1.5859266349991872, + "grad_norm": 8.2793491701138, + "learning_rate": 5.410910139714664e-07, + "loss": 0.6576, + "step": 21952 + }, + { + "epoch": 1.5859988801993967, + "grad_norm": 7.285208874223474, + "learning_rate": 5.409092922553794e-07, + "loss": 0.6203, + "step": 21953 + }, + { + "epoch": 1.5860711253996063, + "grad_norm": 7.679208393228488, + "learning_rate": 5.407275973577228e-07, + "loss": 0.6657, + "step": 21954 + }, + { + "epoch": 1.5861433705998158, + "grad_norm": 8.292807751322265, + "learning_rate": 5.405459292809842e-07, + "loss": 0.6198, + "step": 21955 + }, + { + "epoch": 1.5862156158000253, + "grad_norm": 7.880483020366032, + "learning_rate": 5.403642880276502e-07, + "loss": 0.5956, + "step": 21956 + }, + { + "epoch": 1.5862878610002347, + "grad_norm": 7.463697184725758, + "learning_rate": 5.401826736002077e-07, + "loss": 0.6273, + "step": 21957 + }, + { + "epoch": 1.5863601062004444, + "grad_norm": 6.346721986534878, + "learning_rate": 5.400010860011429e-07, + "loss": 0.6377, + "step": 21958 + }, + { + "epoch": 1.5864323514006538, + "grad_norm": 8.207060068071726, + "learning_rate": 5.398195252329416e-07, + "loss": 0.6073, + "step": 21959 + }, + { + "epoch": 1.5865045966008633, + "grad_norm": 8.265109934920897, + "learning_rate": 5.396379912980887e-07, + "loss": 0.6576, + "step": 21960 + }, + { + "epoch": 1.5865768418010728, + "grad_norm": 7.077316002160254, + "learning_rate": 5.394564841990696e-07, + "loss": 0.6, + "step": 21961 + }, + { + "epoch": 1.5866490870012824, + "grad_norm": 7.46319884537557, + "learning_rate": 5.392750039383696e-07, + "loss": 0.6175, + "step": 21962 + }, + { + "epoch": 1.586721332201492, + "grad_norm": 8.405186241941266, + "learning_rate": 5.390935505184716e-07, + "loss": 0.7035, + "step": 21963 + }, + { + "epoch": 1.5867935774017012, + "grad_norm": 6.784997423209458, + "learning_rate": 5.389121239418605e-07, + "loss": 0.6137, + "step": 21964 + }, + { + "epoch": 1.586865822601911, + "grad_norm": 7.159758261750128, + "learning_rate": 5.387307242110188e-07, + "loss": 0.5658, + "step": 21965 + }, + { + "epoch": 1.5869380678021203, + "grad_norm": 7.08717370737075, + "learning_rate": 5.38549351328432e-07, + "loss": 0.6685, + "step": 21966 + }, + { + "epoch": 1.5870103130023299, + "grad_norm": 7.488207383405893, + "learning_rate": 5.383680052965812e-07, + "loss": 0.5885, + "step": 21967 + }, + { + "epoch": 1.5870825582025394, + "grad_norm": 6.14203811964592, + "learning_rate": 5.381866861179491e-07, + "loss": 0.6138, + "step": 21968 + }, + { + "epoch": 1.587154803402749, + "grad_norm": 7.086916686582296, + "learning_rate": 5.380053937950188e-07, + "loss": 0.5859, + "step": 21969 + }, + { + "epoch": 1.5872270486029585, + "grad_norm": 8.197085295178145, + "learning_rate": 5.378241283302707e-07, + "loss": 0.6469, + "step": 21970 + }, + { + "epoch": 1.5872992938031678, + "grad_norm": 7.44485161586686, + "learning_rate": 5.376428897261867e-07, + "loss": 0.6354, + "step": 21971 + }, + { + "epoch": 1.5873715390033776, + "grad_norm": 6.4910471955662405, + "learning_rate": 5.374616779852476e-07, + "loss": 0.614, + "step": 21972 + }, + { + "epoch": 1.587443784203587, + "grad_norm": 7.521460284222001, + "learning_rate": 5.372804931099346e-07, + "loss": 0.6406, + "step": 21973 + }, + { + "epoch": 1.5875160294037964, + "grad_norm": 5.8921657338292945, + "learning_rate": 5.370993351027276e-07, + "loss": 0.5237, + "step": 21974 + }, + { + "epoch": 1.587588274604006, + "grad_norm": 7.947393304301226, + "learning_rate": 5.369182039661067e-07, + "loss": 0.5997, + "step": 21975 + }, + { + "epoch": 1.5876605198042155, + "grad_norm": 7.460779254293671, + "learning_rate": 5.367370997025511e-07, + "loss": 0.6574, + "step": 21976 + }, + { + "epoch": 1.587732765004425, + "grad_norm": 7.138014568746871, + "learning_rate": 5.365560223145413e-07, + "loss": 0.5673, + "step": 21977 + }, + { + "epoch": 1.5878050102046344, + "grad_norm": 7.949212031631068, + "learning_rate": 5.363749718045542e-07, + "loss": 0.607, + "step": 21978 + }, + { + "epoch": 1.5878772554048441, + "grad_norm": 7.369430766800117, + "learning_rate": 5.361939481750689e-07, + "loss": 0.6096, + "step": 21979 + }, + { + "epoch": 1.5879495006050535, + "grad_norm": 7.044425860172203, + "learning_rate": 5.360129514285645e-07, + "loss": 0.6244, + "step": 21980 + }, + { + "epoch": 1.588021745805263, + "grad_norm": 6.7088456363020095, + "learning_rate": 5.358319815675173e-07, + "loss": 0.6192, + "step": 21981 + }, + { + "epoch": 1.5880939910054725, + "grad_norm": 7.2904463555220085, + "learning_rate": 5.35651038594405e-07, + "loss": 0.7151, + "step": 21982 + }, + { + "epoch": 1.588166236205682, + "grad_norm": 7.957684422212619, + "learning_rate": 5.354701225117048e-07, + "loss": 0.5259, + "step": 21983 + }, + { + "epoch": 1.5882384814058916, + "grad_norm": 7.674477565020051, + "learning_rate": 5.352892333218932e-07, + "loss": 0.6299, + "step": 21984 + }, + { + "epoch": 1.588310726606101, + "grad_norm": 8.47782608669522, + "learning_rate": 5.351083710274466e-07, + "loss": 0.6632, + "step": 21985 + }, + { + "epoch": 1.5883829718063107, + "grad_norm": 7.764335909102272, + "learning_rate": 5.349275356308406e-07, + "loss": 0.7202, + "step": 21986 + }, + { + "epoch": 1.58845521700652, + "grad_norm": 6.936190094873883, + "learning_rate": 5.347467271345516e-07, + "loss": 0.5549, + "step": 21987 + }, + { + "epoch": 1.5885274622067298, + "grad_norm": 7.6936742287692255, + "learning_rate": 5.345659455410535e-07, + "loss": 0.5673, + "step": 21988 + }, + { + "epoch": 1.5885997074069391, + "grad_norm": 7.234763610816495, + "learning_rate": 5.343851908528213e-07, + "loss": 0.6887, + "step": 21989 + }, + { + "epoch": 1.5886719526071487, + "grad_norm": 8.628760361204485, + "learning_rate": 5.342044630723297e-07, + "loss": 0.5781, + "step": 21990 + }, + { + "epoch": 1.5887441978073582, + "grad_norm": 7.878173430920775, + "learning_rate": 5.340237622020536e-07, + "loss": 0.6024, + "step": 21991 + }, + { + "epoch": 1.5888164430075675, + "grad_norm": 7.828598160444489, + "learning_rate": 5.338430882444648e-07, + "loss": 0.6228, + "step": 21992 + }, + { + "epoch": 1.5888886882077773, + "grad_norm": 8.378381302757385, + "learning_rate": 5.336624412020366e-07, + "loss": 0.5742, + "step": 21993 + }, + { + "epoch": 1.5889609334079866, + "grad_norm": 8.200128265866343, + "learning_rate": 5.334818210772447e-07, + "loss": 0.5574, + "step": 21994 + }, + { + "epoch": 1.5890331786081964, + "grad_norm": 8.737451328911519, + "learning_rate": 5.333012278725585e-07, + "loss": 0.7413, + "step": 21995 + }, + { + "epoch": 1.5891054238084057, + "grad_norm": 6.877265972896707, + "learning_rate": 5.331206615904519e-07, + "loss": 0.6186, + "step": 21996 + }, + { + "epoch": 1.5891776690086152, + "grad_norm": 7.573840505157116, + "learning_rate": 5.32940122233396e-07, + "loss": 0.6679, + "step": 21997 + }, + { + "epoch": 1.5892499142088248, + "grad_norm": 7.61243641894117, + "learning_rate": 5.327596098038634e-07, + "loss": 0.6396, + "step": 21998 + }, + { + "epoch": 1.589322159409034, + "grad_norm": 6.846530018711606, + "learning_rate": 5.325791243043236e-07, + "loss": 0.5744, + "step": 21999 + }, + { + "epoch": 1.5893944046092439, + "grad_norm": 6.6182118313210765, + "learning_rate": 5.323986657372476e-07, + "loss": 0.5932, + "step": 22000 + }, + { + "epoch": 1.5894666498094532, + "grad_norm": 6.929114970711899, + "learning_rate": 5.322182341051066e-07, + "loss": 0.6197, + "step": 22001 + }, + { + "epoch": 1.589538895009663, + "grad_norm": 9.70782111348805, + "learning_rate": 5.320378294103698e-07, + "loss": 0.6062, + "step": 22002 + }, + { + "epoch": 1.5896111402098723, + "grad_norm": 7.587351674070828, + "learning_rate": 5.318574516555072e-07, + "loss": 0.5741, + "step": 22003 + }, + { + "epoch": 1.5896833854100818, + "grad_norm": 8.518695242825435, + "learning_rate": 5.316771008429877e-07, + "loss": 0.5998, + "step": 22004 + }, + { + "epoch": 1.5897556306102913, + "grad_norm": 7.691972625477404, + "learning_rate": 5.314967769752813e-07, + "loss": 0.5861, + "step": 22005 + }, + { + "epoch": 1.589827875810501, + "grad_norm": 6.6662339387961635, + "learning_rate": 5.313164800548551e-07, + "loss": 0.6859, + "step": 22006 + }, + { + "epoch": 1.5899001210107104, + "grad_norm": 5.529003139550843, + "learning_rate": 5.311362100841774e-07, + "loss": 0.5918, + "step": 22007 + }, + { + "epoch": 1.5899723662109198, + "grad_norm": 7.090983788334717, + "learning_rate": 5.309559670657163e-07, + "loss": 0.6276, + "step": 22008 + }, + { + "epoch": 1.5900446114111295, + "grad_norm": 6.81203494103207, + "learning_rate": 5.307757510019398e-07, + "loss": 0.6679, + "step": 22009 + }, + { + "epoch": 1.5901168566113388, + "grad_norm": 7.000966414088928, + "learning_rate": 5.305955618953138e-07, + "loss": 0.5204, + "step": 22010 + }, + { + "epoch": 1.5901891018115484, + "grad_norm": 6.936769189231249, + "learning_rate": 5.304153997483052e-07, + "loss": 0.5479, + "step": 22011 + }, + { + "epoch": 1.590261347011758, + "grad_norm": 7.905220537674301, + "learning_rate": 5.302352645633804e-07, + "loss": 0.624, + "step": 22012 + }, + { + "epoch": 1.5903335922119675, + "grad_norm": 6.881516021658865, + "learning_rate": 5.300551563430056e-07, + "loss": 0.5825, + "step": 22013 + }, + { + "epoch": 1.590405837412177, + "grad_norm": 6.4548893103058536, + "learning_rate": 5.298750750896462e-07, + "loss": 0.6546, + "step": 22014 + }, + { + "epoch": 1.5904780826123863, + "grad_norm": 6.858439805358119, + "learning_rate": 5.296950208057672e-07, + "loss": 0.5685, + "step": 22015 + }, + { + "epoch": 1.590550327812596, + "grad_norm": 6.36187260200567, + "learning_rate": 5.295149934938343e-07, + "loss": 0.6195, + "step": 22016 + }, + { + "epoch": 1.5906225730128054, + "grad_norm": 7.101106971088645, + "learning_rate": 5.293349931563102e-07, + "loss": 0.5835, + "step": 22017 + }, + { + "epoch": 1.590694818213015, + "grad_norm": 7.255050478548655, + "learning_rate": 5.291550197956602e-07, + "loss": 0.5973, + "step": 22018 + }, + { + "epoch": 1.5907670634132245, + "grad_norm": 6.417843037893121, + "learning_rate": 5.289750734143484e-07, + "loss": 0.494, + "step": 22019 + }, + { + "epoch": 1.590839308613434, + "grad_norm": 7.265577484303634, + "learning_rate": 5.287951540148364e-07, + "loss": 0.6191, + "step": 22020 + }, + { + "epoch": 1.5909115538136436, + "grad_norm": 8.012448638894227, + "learning_rate": 5.286152615995888e-07, + "loss": 0.6004, + "step": 22021 + }, + { + "epoch": 1.590983799013853, + "grad_norm": 6.7628255430827044, + "learning_rate": 5.284353961710664e-07, + "loss": 0.531, + "step": 22022 + }, + { + "epoch": 1.5910560442140627, + "grad_norm": 7.9369999284988175, + "learning_rate": 5.282555577317341e-07, + "loss": 0.5594, + "step": 22023 + }, + { + "epoch": 1.591128289414272, + "grad_norm": 8.831670160685828, + "learning_rate": 5.280757462840516e-07, + "loss": 0.668, + "step": 22024 + }, + { + "epoch": 1.5912005346144815, + "grad_norm": 6.895860408795112, + "learning_rate": 5.27895961830481e-07, + "loss": 0.5508, + "step": 22025 + }, + { + "epoch": 1.591272779814691, + "grad_norm": 7.637383832531918, + "learning_rate": 5.277162043734838e-07, + "loss": 0.6204, + "step": 22026 + }, + { + "epoch": 1.5913450250149006, + "grad_norm": 6.918133070818086, + "learning_rate": 5.275364739155206e-07, + "loss": 0.5439, + "step": 22027 + }, + { + "epoch": 1.5914172702151101, + "grad_norm": 7.242491681769337, + "learning_rate": 5.27356770459051e-07, + "loss": 0.632, + "step": 22028 + }, + { + "epoch": 1.5914895154153195, + "grad_norm": 6.570272438752719, + "learning_rate": 5.271770940065357e-07, + "loss": 0.5659, + "step": 22029 + }, + { + "epoch": 1.5915617606155292, + "grad_norm": 6.492022387152787, + "learning_rate": 5.26997444560434e-07, + "loss": 0.6009, + "step": 22030 + }, + { + "epoch": 1.5916340058157386, + "grad_norm": 7.5265217101084865, + "learning_rate": 5.268178221232053e-07, + "loss": 0.5539, + "step": 22031 + }, + { + "epoch": 1.591706251015948, + "grad_norm": 6.077725826384662, + "learning_rate": 5.266382266973089e-07, + "loss": 0.5455, + "step": 22032 + }, + { + "epoch": 1.5917784962161576, + "grad_norm": 6.466649400878089, + "learning_rate": 5.264586582852025e-07, + "loss": 0.5864, + "step": 22033 + }, + { + "epoch": 1.5918507414163672, + "grad_norm": 7.170948375936865, + "learning_rate": 5.262791168893458e-07, + "loss": 0.6339, + "step": 22034 + }, + { + "epoch": 1.5919229866165767, + "grad_norm": 6.986148208037672, + "learning_rate": 5.260996025121945e-07, + "loss": 0.6804, + "step": 22035 + }, + { + "epoch": 1.591995231816786, + "grad_norm": 6.404496413481917, + "learning_rate": 5.259201151562071e-07, + "loss": 0.6297, + "step": 22036 + }, + { + "epoch": 1.5920674770169958, + "grad_norm": 7.317470849868271, + "learning_rate": 5.257406548238414e-07, + "loss": 0.5706, + "step": 22037 + }, + { + "epoch": 1.5921397222172051, + "grad_norm": 8.030150341940066, + "learning_rate": 5.255612215175523e-07, + "loss": 0.5637, + "step": 22038 + }, + { + "epoch": 1.5922119674174147, + "grad_norm": 7.752074179421399, + "learning_rate": 5.253818152397971e-07, + "loss": 0.6083, + "step": 22039 + }, + { + "epoch": 1.5922842126176242, + "grad_norm": 7.024731136077714, + "learning_rate": 5.252024359930316e-07, + "loss": 0.6607, + "step": 22040 + }, + { + "epoch": 1.5923564578178337, + "grad_norm": 7.718426284488441, + "learning_rate": 5.250230837797115e-07, + "loss": 0.5616, + "step": 22041 + }, + { + "epoch": 1.5924287030180433, + "grad_norm": 6.478397859383359, + "learning_rate": 5.248437586022919e-07, + "loss": 0.5792, + "step": 22042 + }, + { + "epoch": 1.5925009482182526, + "grad_norm": 7.605441665013405, + "learning_rate": 5.246644604632278e-07, + "loss": 0.6649, + "step": 22043 + }, + { + "epoch": 1.5925731934184624, + "grad_norm": 7.27387725451294, + "learning_rate": 5.244851893649741e-07, + "loss": 0.5883, + "step": 22044 + }, + { + "epoch": 1.5926454386186717, + "grad_norm": 7.460425937550599, + "learning_rate": 5.243059453099833e-07, + "loss": 0.6732, + "step": 22045 + }, + { + "epoch": 1.5927176838188812, + "grad_norm": 7.383749217195923, + "learning_rate": 5.241267283007104e-07, + "loss": 0.5383, + "step": 22046 + }, + { + "epoch": 1.5927899290190908, + "grad_norm": 7.656354381861794, + "learning_rate": 5.239475383396084e-07, + "loss": 0.6515, + "step": 22047 + }, + { + "epoch": 1.5928621742193003, + "grad_norm": 6.831141174510577, + "learning_rate": 5.237683754291309e-07, + "loss": 0.5696, + "step": 22048 + }, + { + "epoch": 1.5929344194195099, + "grad_norm": 5.949741313023158, + "learning_rate": 5.23589239571729e-07, + "loss": 0.5444, + "step": 22049 + }, + { + "epoch": 1.5930066646197192, + "grad_norm": 6.691253749448003, + "learning_rate": 5.234101307698555e-07, + "loss": 0.6248, + "step": 22050 + }, + { + "epoch": 1.593078909819929, + "grad_norm": 6.769114521669042, + "learning_rate": 5.232310490259631e-07, + "loss": 0.5707, + "step": 22051 + }, + { + "epoch": 1.5931511550201383, + "grad_norm": 7.475940770175344, + "learning_rate": 5.230519943425033e-07, + "loss": 0.5743, + "step": 22052 + }, + { + "epoch": 1.5932234002203478, + "grad_norm": 7.692080489984767, + "learning_rate": 5.228729667219263e-07, + "loss": 0.5713, + "step": 22053 + }, + { + "epoch": 1.5932956454205573, + "grad_norm": 7.8925914164853435, + "learning_rate": 5.226939661666833e-07, + "loss": 0.5869, + "step": 22054 + }, + { + "epoch": 1.593367890620767, + "grad_norm": 6.948444428385769, + "learning_rate": 5.22514992679225e-07, + "loss": 0.5853, + "step": 22055 + }, + { + "epoch": 1.5934401358209764, + "grad_norm": 6.7280728265039045, + "learning_rate": 5.223360462620006e-07, + "loss": 0.5749, + "step": 22056 + }, + { + "epoch": 1.5935123810211858, + "grad_norm": 8.767618805945386, + "learning_rate": 5.2215712691746e-07, + "loss": 0.6526, + "step": 22057 + }, + { + "epoch": 1.5935846262213955, + "grad_norm": 7.108972559007846, + "learning_rate": 5.219782346480526e-07, + "loss": 0.5931, + "step": 22058 + }, + { + "epoch": 1.5936568714216048, + "grad_norm": 7.3068275295194, + "learning_rate": 5.217993694562274e-07, + "loss": 0.5877, + "step": 22059 + }, + { + "epoch": 1.5937291166218146, + "grad_norm": 6.558870892465654, + "learning_rate": 5.216205313444331e-07, + "loss": 0.5897, + "step": 22060 + }, + { + "epoch": 1.593801361822024, + "grad_norm": 7.308458564523402, + "learning_rate": 5.214417203151173e-07, + "loss": 0.5461, + "step": 22061 + }, + { + "epoch": 1.5938736070222335, + "grad_norm": 8.125292493984015, + "learning_rate": 5.212629363707289e-07, + "loss": 0.7094, + "step": 22062 + }, + { + "epoch": 1.593945852222443, + "grad_norm": 6.124920591501351, + "learning_rate": 5.210841795137137e-07, + "loss": 0.6323, + "step": 22063 + }, + { + "epoch": 1.5940180974226523, + "grad_norm": 9.81748379046659, + "learning_rate": 5.209054497465196e-07, + "loss": 0.6144, + "step": 22064 + }, + { + "epoch": 1.594090342622862, + "grad_norm": 8.661633081254212, + "learning_rate": 5.207267470715935e-07, + "loss": 0.6577, + "step": 22065 + }, + { + "epoch": 1.5941625878230714, + "grad_norm": 6.9835922860847, + "learning_rate": 5.205480714913818e-07, + "loss": 0.6048, + "step": 22066 + }, + { + "epoch": 1.5942348330232812, + "grad_norm": 7.058100857860177, + "learning_rate": 5.203694230083295e-07, + "loss": 0.5948, + "step": 22067 + }, + { + "epoch": 1.5943070782234905, + "grad_norm": 6.1717387123995175, + "learning_rate": 5.201908016248828e-07, + "loss": 0.611, + "step": 22068 + }, + { + "epoch": 1.5943793234237, + "grad_norm": 7.922365914507459, + "learning_rate": 5.200122073434866e-07, + "loss": 0.6291, + "step": 22069 + }, + { + "epoch": 1.5944515686239096, + "grad_norm": 7.368105235538027, + "learning_rate": 5.19833640166586e-07, + "loss": 0.6376, + "step": 22070 + }, + { + "epoch": 1.594523813824119, + "grad_norm": 7.728258955334823, + "learning_rate": 5.196551000966254e-07, + "loss": 0.5813, + "step": 22071 + }, + { + "epoch": 1.5945960590243287, + "grad_norm": 7.5430982154865625, + "learning_rate": 5.194765871360488e-07, + "loss": 0.6034, + "step": 22072 + }, + { + "epoch": 1.594668304224538, + "grad_norm": 7.115408347361874, + "learning_rate": 5.192981012873005e-07, + "loss": 0.6573, + "step": 22073 + }, + { + "epoch": 1.5947405494247477, + "grad_norm": 6.010143606593356, + "learning_rate": 5.191196425528228e-07, + "loss": 0.6254, + "step": 22074 + }, + { + "epoch": 1.594812794624957, + "grad_norm": 7.4443253686342725, + "learning_rate": 5.189412109350592e-07, + "loss": 0.7364, + "step": 22075 + }, + { + "epoch": 1.5948850398251666, + "grad_norm": 6.859798479150867, + "learning_rate": 5.187628064364519e-07, + "loss": 0.5542, + "step": 22076 + }, + { + "epoch": 1.5949572850253761, + "grad_norm": 7.374973103102436, + "learning_rate": 5.185844290594444e-07, + "loss": 0.5314, + "step": 22077 + }, + { + "epoch": 1.5950295302255857, + "grad_norm": 7.428229958158222, + "learning_rate": 5.184060788064763e-07, + "loss": 0.5926, + "step": 22078 + }, + { + "epoch": 1.5951017754257952, + "grad_norm": 6.640801066140201, + "learning_rate": 5.182277556799908e-07, + "loss": 0.6009, + "step": 22079 + }, + { + "epoch": 1.5951740206260046, + "grad_norm": 6.870820040561764, + "learning_rate": 5.180494596824296e-07, + "loss": 0.6036, + "step": 22080 + }, + { + "epoch": 1.5952462658262143, + "grad_norm": 7.666951823805514, + "learning_rate": 5.178711908162315e-07, + "loss": 0.611, + "step": 22081 + }, + { + "epoch": 1.5953185110264236, + "grad_norm": 6.2753960959359905, + "learning_rate": 5.176929490838375e-07, + "loss": 0.5993, + "step": 22082 + }, + { + "epoch": 1.5953907562266332, + "grad_norm": 7.327475317682755, + "learning_rate": 5.175147344876882e-07, + "loss": 0.6087, + "step": 22083 + }, + { + "epoch": 1.5954630014268427, + "grad_norm": 6.31180863324105, + "learning_rate": 5.173365470302235e-07, + "loss": 0.5727, + "step": 22084 + }, + { + "epoch": 1.5955352466270523, + "grad_norm": 7.740185090841079, + "learning_rate": 5.171583867138816e-07, + "loss": 0.6843, + "step": 22085 + }, + { + "epoch": 1.5956074918272618, + "grad_norm": 6.852612474778666, + "learning_rate": 5.169802535411014e-07, + "loss": 0.5789, + "step": 22086 + }, + { + "epoch": 1.5956797370274711, + "grad_norm": 8.675435642879844, + "learning_rate": 5.16802147514322e-07, + "loss": 0.5536, + "step": 22087 + }, + { + "epoch": 1.5957519822276809, + "grad_norm": 8.23681233940949, + "learning_rate": 5.166240686359814e-07, + "loss": 0.5764, + "step": 22088 + }, + { + "epoch": 1.5958242274278902, + "grad_norm": 7.444565694526911, + "learning_rate": 5.164460169085173e-07, + "loss": 0.6106, + "step": 22089 + }, + { + "epoch": 1.5958964726280997, + "grad_norm": 6.827274644918603, + "learning_rate": 5.16267992334367e-07, + "loss": 0.6017, + "step": 22090 + }, + { + "epoch": 1.5959687178283093, + "grad_norm": 6.53980042225971, + "learning_rate": 5.160899949159684e-07, + "loss": 0.6527, + "step": 22091 + }, + { + "epoch": 1.5960409630285188, + "grad_norm": 8.486223613064736, + "learning_rate": 5.159120246557567e-07, + "loss": 0.6075, + "step": 22092 + }, + { + "epoch": 1.5961132082287284, + "grad_norm": 8.823389994697909, + "learning_rate": 5.157340815561687e-07, + "loss": 0.6063, + "step": 22093 + }, + { + "epoch": 1.5961854534289377, + "grad_norm": 7.745595603387819, + "learning_rate": 5.155561656196411e-07, + "loss": 0.6354, + "step": 22094 + }, + { + "epoch": 1.5962576986291475, + "grad_norm": 7.411166462587495, + "learning_rate": 5.153782768486082e-07, + "loss": 0.6094, + "step": 22095 + }, + { + "epoch": 1.5963299438293568, + "grad_norm": 7.026914624804197, + "learning_rate": 5.152004152455056e-07, + "loss": 0.6337, + "step": 22096 + }, + { + "epoch": 1.5964021890295663, + "grad_norm": 7.6162872564975075, + "learning_rate": 5.150225808127685e-07, + "loss": 0.6031, + "step": 22097 + }, + { + "epoch": 1.5964744342297759, + "grad_norm": 8.235028177663636, + "learning_rate": 5.148447735528309e-07, + "loss": 0.6964, + "step": 22098 + }, + { + "epoch": 1.5965466794299854, + "grad_norm": 8.005343559941403, + "learning_rate": 5.14666993468127e-07, + "loss": 0.549, + "step": 22099 + }, + { + "epoch": 1.596618924630195, + "grad_norm": 8.141773457706933, + "learning_rate": 5.144892405610901e-07, + "loss": 0.5897, + "step": 22100 + }, + { + "epoch": 1.5966911698304043, + "grad_norm": 6.723812023868917, + "learning_rate": 5.143115148341549e-07, + "loss": 0.6513, + "step": 22101 + }, + { + "epoch": 1.596763415030614, + "grad_norm": 6.933096373005557, + "learning_rate": 5.141338162897528e-07, + "loss": 0.5301, + "step": 22102 + }, + { + "epoch": 1.5968356602308234, + "grad_norm": 7.782909538063869, + "learning_rate": 5.139561449303162e-07, + "loss": 0.5287, + "step": 22103 + }, + { + "epoch": 1.596907905431033, + "grad_norm": 9.204776220714846, + "learning_rate": 5.137785007582785e-07, + "loss": 0.6232, + "step": 22104 + }, + { + "epoch": 1.5969801506312424, + "grad_norm": 6.721010759269822, + "learning_rate": 5.136008837760711e-07, + "loss": 0.5593, + "step": 22105 + }, + { + "epoch": 1.597052395831452, + "grad_norm": 8.069418134735807, + "learning_rate": 5.13423293986125e-07, + "loss": 0.6024, + "step": 22106 + }, + { + "epoch": 1.5971246410316615, + "grad_norm": 7.056035687120631, + "learning_rate": 5.132457313908707e-07, + "loss": 0.5856, + "step": 22107 + }, + { + "epoch": 1.5971968862318708, + "grad_norm": 8.186976758238746, + "learning_rate": 5.130681959927402e-07, + "loss": 0.528, + "step": 22108 + }, + { + "epoch": 1.5972691314320806, + "grad_norm": 6.341449188767557, + "learning_rate": 5.128906877941644e-07, + "loss": 0.6282, + "step": 22109 + }, + { + "epoch": 1.59734137663229, + "grad_norm": 6.827477745262928, + "learning_rate": 5.127132067975712e-07, + "loss": 0.5943, + "step": 22110 + }, + { + "epoch": 1.5974136218324995, + "grad_norm": 6.448350668882466, + "learning_rate": 5.125357530053915e-07, + "loss": 0.6415, + "step": 22111 + }, + { + "epoch": 1.597485867032709, + "grad_norm": 7.533519355480181, + "learning_rate": 5.123583264200546e-07, + "loss": 0.5622, + "step": 22112 + }, + { + "epoch": 1.5975581122329185, + "grad_norm": 8.22326257785293, + "learning_rate": 5.121809270439881e-07, + "loss": 0.6395, + "step": 22113 + }, + { + "epoch": 1.597630357433128, + "grad_norm": 8.282691686123709, + "learning_rate": 5.120035548796215e-07, + "loss": 0.6443, + "step": 22114 + }, + { + "epoch": 1.5977026026333374, + "grad_norm": 6.81642527755224, + "learning_rate": 5.118262099293825e-07, + "loss": 0.5549, + "step": 22115 + }, + { + "epoch": 1.5977748478335472, + "grad_norm": 6.98539190548692, + "learning_rate": 5.116488921956991e-07, + "loss": 0.6232, + "step": 22116 + }, + { + "epoch": 1.5978470930337565, + "grad_norm": 6.645675641364436, + "learning_rate": 5.114716016809987e-07, + "loss": 0.6518, + "step": 22117 + }, + { + "epoch": 1.597919338233966, + "grad_norm": 6.819887128326629, + "learning_rate": 5.112943383877078e-07, + "loss": 0.6057, + "step": 22118 + }, + { + "epoch": 1.5979915834341756, + "grad_norm": 7.608497396693913, + "learning_rate": 5.111171023182541e-07, + "loss": 0.5641, + "step": 22119 + }, + { + "epoch": 1.5980638286343851, + "grad_norm": 7.9674892250948135, + "learning_rate": 5.109398934750621e-07, + "loss": 0.6772, + "step": 22120 + }, + { + "epoch": 1.5981360738345947, + "grad_norm": 7.687259577271319, + "learning_rate": 5.107627118605588e-07, + "loss": 0.6762, + "step": 22121 + }, + { + "epoch": 1.598208319034804, + "grad_norm": 7.621003150890236, + "learning_rate": 5.105855574771693e-07, + "loss": 0.5951, + "step": 22122 + }, + { + "epoch": 1.5982805642350137, + "grad_norm": 6.70848996313248, + "learning_rate": 5.104084303273194e-07, + "loss": 0.6397, + "step": 22123 + }, + { + "epoch": 1.598352809435223, + "grad_norm": 7.171078174348484, + "learning_rate": 5.102313304134327e-07, + "loss": 0.5116, + "step": 22124 + }, + { + "epoch": 1.5984250546354326, + "grad_norm": 8.162882129084084, + "learning_rate": 5.100542577379341e-07, + "loss": 0.6478, + "step": 22125 + }, + { + "epoch": 1.5984972998356421, + "grad_norm": 7.5287362489886664, + "learning_rate": 5.098772123032478e-07, + "loss": 0.6013, + "step": 22126 + }, + { + "epoch": 1.5985695450358517, + "grad_norm": 6.685830166270082, + "learning_rate": 5.097001941117972e-07, + "loss": 0.6194, + "step": 22127 + }, + { + "epoch": 1.5986417902360612, + "grad_norm": 7.0034860377463595, + "learning_rate": 5.095232031660053e-07, + "loss": 0.5356, + "step": 22128 + }, + { + "epoch": 1.5987140354362706, + "grad_norm": 6.317291556309703, + "learning_rate": 5.093462394682955e-07, + "loss": 0.5807, + "step": 22129 + }, + { + "epoch": 1.5987862806364803, + "grad_norm": 6.280643870397191, + "learning_rate": 5.091693030210907e-07, + "loss": 0.5589, + "step": 22130 + }, + { + "epoch": 1.5988585258366896, + "grad_norm": 7.6309171136956895, + "learning_rate": 5.089923938268116e-07, + "loss": 0.5737, + "step": 22131 + }, + { + "epoch": 1.5989307710368994, + "grad_norm": 7.352984137914475, + "learning_rate": 5.088155118878807e-07, + "loss": 0.567, + "step": 22132 + }, + { + "epoch": 1.5990030162371087, + "grad_norm": 7.140267594823292, + "learning_rate": 5.086386572067195e-07, + "loss": 0.5939, + "step": 22133 + }, + { + "epoch": 1.5990752614373183, + "grad_norm": 7.601363521531819, + "learning_rate": 5.084618297857494e-07, + "loss": 0.655, + "step": 22134 + }, + { + "epoch": 1.5991475066375278, + "grad_norm": 6.382107531161367, + "learning_rate": 5.082850296273892e-07, + "loss": 0.6002, + "step": 22135 + }, + { + "epoch": 1.5992197518377371, + "grad_norm": 7.017369654484206, + "learning_rate": 5.081082567340611e-07, + "loss": 0.6453, + "step": 22136 + }, + { + "epoch": 1.5992919970379469, + "grad_norm": 6.740056095512433, + "learning_rate": 5.079315111081854e-07, + "loss": 0.5072, + "step": 22137 + }, + { + "epoch": 1.5993642422381562, + "grad_norm": 7.351730360978775, + "learning_rate": 5.077547927521795e-07, + "loss": 0.6505, + "step": 22138 + }, + { + "epoch": 1.599436487438366, + "grad_norm": 7.955296068322534, + "learning_rate": 5.075781016684639e-07, + "loss": 0.5813, + "step": 22139 + }, + { + "epoch": 1.5995087326385753, + "grad_norm": 8.030268152683565, + "learning_rate": 5.074014378594569e-07, + "loss": 0.5765, + "step": 22140 + }, + { + "epoch": 1.5995809778387848, + "grad_norm": 7.56612366171769, + "learning_rate": 5.072248013275777e-07, + "loss": 0.6068, + "step": 22141 + }, + { + "epoch": 1.5996532230389944, + "grad_norm": 6.946768952322592, + "learning_rate": 5.070481920752432e-07, + "loss": 0.6477, + "step": 22142 + }, + { + "epoch": 1.5997254682392037, + "grad_norm": 5.916608174352799, + "learning_rate": 5.068716101048713e-07, + "loss": 0.5688, + "step": 22143 + }, + { + "epoch": 1.5997977134394135, + "grad_norm": 8.278003675321292, + "learning_rate": 5.066950554188796e-07, + "loss": 0.6258, + "step": 22144 + }, + { + "epoch": 1.5998699586396228, + "grad_norm": 6.509954898988567, + "learning_rate": 5.06518528019685e-07, + "loss": 0.5696, + "step": 22145 + }, + { + "epoch": 1.5999422038398325, + "grad_norm": 7.894315736481891, + "learning_rate": 5.063420279097037e-07, + "loss": 0.6334, + "step": 22146 + }, + { + "epoch": 1.6000144490400419, + "grad_norm": 8.292234110624841, + "learning_rate": 5.061655550913522e-07, + "loss": 0.637, + "step": 22147 + }, + { + "epoch": 1.6000866942402514, + "grad_norm": 7.182578616165129, + "learning_rate": 5.059891095670466e-07, + "loss": 0.6332, + "step": 22148 + }, + { + "epoch": 1.600158939440461, + "grad_norm": 7.026154566471147, + "learning_rate": 5.058126913392014e-07, + "loss": 0.5981, + "step": 22149 + }, + { + "epoch": 1.6002311846406705, + "grad_norm": 7.270595892788325, + "learning_rate": 5.056363004102322e-07, + "loss": 0.6394, + "step": 22150 + }, + { + "epoch": 1.60030342984088, + "grad_norm": 6.62863980478809, + "learning_rate": 5.054599367825538e-07, + "loss": 0.5765, + "step": 22151 + }, + { + "epoch": 1.6003756750410894, + "grad_norm": 6.320228906180034, + "learning_rate": 5.052836004585798e-07, + "loss": 0.6087, + "step": 22152 + }, + { + "epoch": 1.6004479202412991, + "grad_norm": 8.15544504453998, + "learning_rate": 5.051072914407246e-07, + "loss": 0.6071, + "step": 22153 + }, + { + "epoch": 1.6005201654415084, + "grad_norm": 6.863073372990185, + "learning_rate": 5.049310097314014e-07, + "loss": 0.6063, + "step": 22154 + }, + { + "epoch": 1.600592410641718, + "grad_norm": 6.756357942719323, + "learning_rate": 5.047547553330237e-07, + "loss": 0.6017, + "step": 22155 + }, + { + "epoch": 1.6006646558419275, + "grad_norm": 6.376004607386821, + "learning_rate": 5.045785282480042e-07, + "loss": 0.5535, + "step": 22156 + }, + { + "epoch": 1.600736901042137, + "grad_norm": 7.024899475898881, + "learning_rate": 5.044023284787553e-07, + "loss": 0.5976, + "step": 22157 + }, + { + "epoch": 1.6008091462423466, + "grad_norm": 6.824224898931116, + "learning_rate": 5.04226156027689e-07, + "loss": 0.6205, + "step": 22158 + }, + { + "epoch": 1.600881391442556, + "grad_norm": 7.4673672597177045, + "learning_rate": 5.040500108972176e-07, + "loss": 0.5855, + "step": 22159 + }, + { + "epoch": 1.6009536366427657, + "grad_norm": 9.706814321016722, + "learning_rate": 5.038738930897513e-07, + "loss": 0.6076, + "step": 22160 + }, + { + "epoch": 1.601025881842975, + "grad_norm": 7.0086874867456626, + "learning_rate": 5.036978026077014e-07, + "loss": 0.5868, + "step": 22161 + }, + { + "epoch": 1.6010981270431845, + "grad_norm": 7.388057206209665, + "learning_rate": 5.035217394534794e-07, + "loss": 0.6271, + "step": 22162 + }, + { + "epoch": 1.601170372243394, + "grad_norm": 7.460727868455493, + "learning_rate": 5.033457036294931e-07, + "loss": 0.666, + "step": 22163 + }, + { + "epoch": 1.6012426174436036, + "grad_norm": 8.024206731930441, + "learning_rate": 5.031696951381545e-07, + "loss": 0.5908, + "step": 22164 + }, + { + "epoch": 1.6013148626438132, + "grad_norm": 6.4651198959149, + "learning_rate": 5.029937139818725e-07, + "loss": 0.6107, + "step": 22165 + }, + { + "epoch": 1.6013871078440225, + "grad_norm": 7.448759387875073, + "learning_rate": 5.028177601630563e-07, + "loss": 0.7167, + "step": 22166 + }, + { + "epoch": 1.6014593530442323, + "grad_norm": 6.545909728493358, + "learning_rate": 5.026418336841138e-07, + "loss": 0.6164, + "step": 22167 + }, + { + "epoch": 1.6015315982444416, + "grad_norm": 7.286440329612392, + "learning_rate": 5.02465934547454e-07, + "loss": 0.6001, + "step": 22168 + }, + { + "epoch": 1.6016038434446511, + "grad_norm": 7.070923769328881, + "learning_rate": 5.02290062755485e-07, + "loss": 0.7039, + "step": 22169 + }, + { + "epoch": 1.6016760886448607, + "grad_norm": 7.823537613106453, + "learning_rate": 5.021142183106132e-07, + "loss": 0.5341, + "step": 22170 + }, + { + "epoch": 1.6017483338450702, + "grad_norm": 6.854572813606349, + "learning_rate": 5.019384012152464e-07, + "loss": 0.6711, + "step": 22171 + }, + { + "epoch": 1.6018205790452797, + "grad_norm": 8.512819832550484, + "learning_rate": 5.017626114717916e-07, + "loss": 0.6439, + "step": 22172 + }, + { + "epoch": 1.601892824245489, + "grad_norm": 7.285868870139724, + "learning_rate": 5.015868490826553e-07, + "loss": 0.6354, + "step": 22173 + }, + { + "epoch": 1.6019650694456988, + "grad_norm": 6.955738185208805, + "learning_rate": 5.014111140502431e-07, + "loss": 0.575, + "step": 22174 + }, + { + "epoch": 1.6020373146459082, + "grad_norm": 6.428879382311095, + "learning_rate": 5.012354063769612e-07, + "loss": 0.5987, + "step": 22175 + }, + { + "epoch": 1.6021095598461177, + "grad_norm": 8.437166334548344, + "learning_rate": 5.01059726065215e-07, + "loss": 0.6034, + "step": 22176 + }, + { + "epoch": 1.6021818050463272, + "grad_norm": 6.907293318570562, + "learning_rate": 5.008840731174086e-07, + "loss": 0.6275, + "step": 22177 + }, + { + "epoch": 1.6022540502465368, + "grad_norm": 7.183071463901607, + "learning_rate": 5.007084475359469e-07, + "loss": 0.5973, + "step": 22178 + }, + { + "epoch": 1.6023262954467463, + "grad_norm": 7.048848419287078, + "learning_rate": 5.005328493232345e-07, + "loss": 0.5453, + "step": 22179 + }, + { + "epoch": 1.6023985406469556, + "grad_norm": 8.291823751472789, + "learning_rate": 5.003572784816754e-07, + "loss": 0.6362, + "step": 22180 + }, + { + "epoch": 1.6024707858471654, + "grad_norm": 7.985302776025758, + "learning_rate": 5.001817350136718e-07, + "loss": 0.6401, + "step": 22181 + }, + { + "epoch": 1.6025430310473747, + "grad_norm": 7.15511689523126, + "learning_rate": 5.000062189216276e-07, + "loss": 0.6716, + "step": 22182 + }, + { + "epoch": 1.6026152762475843, + "grad_norm": 8.264315120985241, + "learning_rate": 4.998307302079453e-07, + "loss": 0.5952, + "step": 22183 + }, + { + "epoch": 1.6026875214477938, + "grad_norm": 8.409750284123103, + "learning_rate": 4.996552688750273e-07, + "loss": 0.5905, + "step": 22184 + }, + { + "epoch": 1.6027597666480033, + "grad_norm": 7.058755878049505, + "learning_rate": 4.994798349252756e-07, + "loss": 0.6312, + "step": 22185 + }, + { + "epoch": 1.602832011848213, + "grad_norm": 8.489829317675458, + "learning_rate": 4.993044283610915e-07, + "loss": 0.6501, + "step": 22186 + }, + { + "epoch": 1.6029042570484222, + "grad_norm": 6.548303853343467, + "learning_rate": 4.991290491848768e-07, + "loss": 0.6155, + "step": 22187 + }, + { + "epoch": 1.602976502248632, + "grad_norm": 7.177320087958982, + "learning_rate": 4.989536973990317e-07, + "loss": 0.6393, + "step": 22188 + }, + { + "epoch": 1.6030487474488413, + "grad_norm": 6.7525505615962675, + "learning_rate": 4.987783730059564e-07, + "loss": 0.5934, + "step": 22189 + }, + { + "epoch": 1.6031209926490508, + "grad_norm": 7.159798487805808, + "learning_rate": 4.986030760080513e-07, + "loss": 0.6021, + "step": 22190 + }, + { + "epoch": 1.6031932378492604, + "grad_norm": 6.9050189686044385, + "learning_rate": 4.984278064077164e-07, + "loss": 0.6011, + "step": 22191 + }, + { + "epoch": 1.60326548304947, + "grad_norm": 6.962782147647283, + "learning_rate": 4.982525642073504e-07, + "loss": 0.5938, + "step": 22192 + }, + { + "epoch": 1.6033377282496795, + "grad_norm": 6.1053840890043585, + "learning_rate": 4.980773494093527e-07, + "loss": 0.579, + "step": 22193 + }, + { + "epoch": 1.6034099734498888, + "grad_norm": 6.625517626999689, + "learning_rate": 4.979021620161223e-07, + "loss": 0.6029, + "step": 22194 + }, + { + "epoch": 1.6034822186500985, + "grad_norm": 8.746975185178972, + "learning_rate": 4.977270020300561e-07, + "loss": 0.672, + "step": 22195 + }, + { + "epoch": 1.6035544638503079, + "grad_norm": 7.031603723741432, + "learning_rate": 4.975518694535525e-07, + "loss": 0.6805, + "step": 22196 + }, + { + "epoch": 1.6036267090505174, + "grad_norm": 6.775505199106927, + "learning_rate": 4.973767642890093e-07, + "loss": 0.6283, + "step": 22197 + }, + { + "epoch": 1.603698954250727, + "grad_norm": 6.8595468417042795, + "learning_rate": 4.972016865388238e-07, + "loss": 0.5955, + "step": 22198 + }, + { + "epoch": 1.6037711994509365, + "grad_norm": 6.707098363452801, + "learning_rate": 4.970266362053913e-07, + "loss": 0.5776, + "step": 22199 + }, + { + "epoch": 1.603843444651146, + "grad_norm": 7.5890955848278985, + "learning_rate": 4.968516132911091e-07, + "loss": 0.5956, + "step": 22200 + }, + { + "epoch": 1.6039156898513554, + "grad_norm": 6.685033040161199, + "learning_rate": 4.966766177983728e-07, + "loss": 0.6324, + "step": 22201 + }, + { + "epoch": 1.6039879350515651, + "grad_norm": 7.986334095666025, + "learning_rate": 4.965016497295783e-07, + "loss": 0.5578, + "step": 22202 + }, + { + "epoch": 1.6040601802517744, + "grad_norm": 8.528600260294906, + "learning_rate": 4.963267090871208e-07, + "loss": 0.5346, + "step": 22203 + }, + { + "epoch": 1.604132425451984, + "grad_norm": 6.067908008438901, + "learning_rate": 4.961517958733944e-07, + "loss": 0.5766, + "step": 22204 + }, + { + "epoch": 1.6042046706521935, + "grad_norm": 6.6635770632104006, + "learning_rate": 4.959769100907949e-07, + "loss": 0.6477, + "step": 22205 + }, + { + "epoch": 1.604276915852403, + "grad_norm": 6.597309616134639, + "learning_rate": 4.95802051741715e-07, + "loss": 0.6721, + "step": 22206 + }, + { + "epoch": 1.6043491610526126, + "grad_norm": 6.755850058265475, + "learning_rate": 4.956272208285487e-07, + "loss": 0.58, + "step": 22207 + }, + { + "epoch": 1.604421406252822, + "grad_norm": 6.346890578821353, + "learning_rate": 4.954524173536901e-07, + "loss": 0.5854, + "step": 22208 + }, + { + "epoch": 1.6044936514530317, + "grad_norm": 6.928696829288843, + "learning_rate": 4.952776413195309e-07, + "loss": 0.5987, + "step": 22209 + }, + { + "epoch": 1.604565896653241, + "grad_norm": 6.832961684874494, + "learning_rate": 4.95102892728464e-07, + "loss": 0.5523, + "step": 22210 + }, + { + "epoch": 1.6046381418534508, + "grad_norm": 8.291553694627952, + "learning_rate": 4.949281715828822e-07, + "loss": 0.6116, + "step": 22211 + }, + { + "epoch": 1.60471038705366, + "grad_norm": 7.1317994985076565, + "learning_rate": 4.947534778851764e-07, + "loss": 0.5967, + "step": 22212 + }, + { + "epoch": 1.6047826322538696, + "grad_norm": 7.7370682159883675, + "learning_rate": 4.945788116377389e-07, + "loss": 0.6692, + "step": 22213 + }, + { + "epoch": 1.6048548774540792, + "grad_norm": 7.9719303740788, + "learning_rate": 4.944041728429602e-07, + "loss": 0.6222, + "step": 22214 + }, + { + "epoch": 1.6049271226542885, + "grad_norm": 7.151823182175407, + "learning_rate": 4.942295615032311e-07, + "loss": 0.5835, + "step": 22215 + }, + { + "epoch": 1.6049993678544983, + "grad_norm": 6.3744431794776135, + "learning_rate": 4.940549776209427e-07, + "loss": 0.5531, + "step": 22216 + }, + { + "epoch": 1.6050716130547076, + "grad_norm": 7.279421073195908, + "learning_rate": 4.938804211984835e-07, + "loss": 0.6638, + "step": 22217 + }, + { + "epoch": 1.6051438582549173, + "grad_norm": 6.686637466280698, + "learning_rate": 4.937058922382435e-07, + "loss": 0.5291, + "step": 22218 + }, + { + "epoch": 1.6052161034551267, + "grad_norm": 8.062741298169682, + "learning_rate": 4.935313907426129e-07, + "loss": 0.63, + "step": 22219 + }, + { + "epoch": 1.6052883486553362, + "grad_norm": 6.773037063747291, + "learning_rate": 4.933569167139782e-07, + "loss": 0.6297, + "step": 22220 + }, + { + "epoch": 1.6053605938555457, + "grad_norm": 8.113663496803122, + "learning_rate": 4.9318247015473e-07, + "loss": 0.5834, + "step": 22221 + }, + { + "epoch": 1.605432839055755, + "grad_norm": 6.409163542876398, + "learning_rate": 4.930080510672555e-07, + "loss": 0.6091, + "step": 22222 + }, + { + "epoch": 1.6055050842559648, + "grad_norm": 8.516272731959516, + "learning_rate": 4.928336594539432e-07, + "loss": 0.6312, + "step": 22223 + }, + { + "epoch": 1.6055773294561742, + "grad_norm": 7.213400704192044, + "learning_rate": 4.926592953171788e-07, + "loss": 0.6319, + "step": 22224 + }, + { + "epoch": 1.605649574656384, + "grad_norm": 7.989941472494446, + "learning_rate": 4.924849586593503e-07, + "loss": 0.6166, + "step": 22225 + }, + { + "epoch": 1.6057218198565932, + "grad_norm": 9.63273606962278, + "learning_rate": 4.923106494828445e-07, + "loss": 0.623, + "step": 22226 + }, + { + "epoch": 1.6057940650568028, + "grad_norm": 7.433187051653755, + "learning_rate": 4.921363677900462e-07, + "loss": 0.6569, + "step": 22227 + }, + { + "epoch": 1.6058663102570123, + "grad_norm": 5.8184088749363, + "learning_rate": 4.919621135833422e-07, + "loss": 0.5705, + "step": 22228 + }, + { + "epoch": 1.6059385554572219, + "grad_norm": 7.641638440905421, + "learning_rate": 4.917878868651177e-07, + "loss": 0.5241, + "step": 22229 + }, + { + "epoch": 1.6060108006574314, + "grad_norm": 6.73388698429342, + "learning_rate": 4.916136876377578e-07, + "loss": 0.6261, + "step": 22230 + }, + { + "epoch": 1.6060830458576407, + "grad_norm": 6.883513294789298, + "learning_rate": 4.914395159036472e-07, + "loss": 0.6105, + "step": 22231 + }, + { + "epoch": 1.6061552910578505, + "grad_norm": 6.538063984863257, + "learning_rate": 4.9126537166517e-07, + "loss": 0.5431, + "step": 22232 + }, + { + "epoch": 1.6062275362580598, + "grad_norm": 7.254863554646166, + "learning_rate": 4.910912549247107e-07, + "loss": 0.6121, + "step": 22233 + }, + { + "epoch": 1.6062997814582693, + "grad_norm": 6.967093916694605, + "learning_rate": 4.909171656846518e-07, + "loss": 0.5902, + "step": 22234 + }, + { + "epoch": 1.606372026658479, + "grad_norm": 6.4519763136616355, + "learning_rate": 4.90743103947377e-07, + "loss": 0.642, + "step": 22235 + }, + { + "epoch": 1.6064442718586884, + "grad_norm": 8.091713196148914, + "learning_rate": 4.905690697152688e-07, + "loss": 0.6108, + "step": 22236 + }, + { + "epoch": 1.606516517058898, + "grad_norm": 6.770867199935334, + "learning_rate": 4.903950629907106e-07, + "loss": 0.65, + "step": 22237 + }, + { + "epoch": 1.6065887622591073, + "grad_norm": 6.465552677802831, + "learning_rate": 4.902210837760829e-07, + "loss": 0.5688, + "step": 22238 + }, + { + "epoch": 1.606661007459317, + "grad_norm": 7.419601831163706, + "learning_rate": 4.900471320737681e-07, + "loss": 0.6385, + "step": 22239 + }, + { + "epoch": 1.6067332526595264, + "grad_norm": 8.207675449193692, + "learning_rate": 4.898732078861471e-07, + "loss": 0.6256, + "step": 22240 + }, + { + "epoch": 1.606805497859736, + "grad_norm": 7.06454951162819, + "learning_rate": 4.896993112156021e-07, + "loss": 0.5364, + "step": 22241 + }, + { + "epoch": 1.6068777430599455, + "grad_norm": 8.04783538716749, + "learning_rate": 4.895254420645118e-07, + "loss": 0.6065, + "step": 22242 + }, + { + "epoch": 1.606949988260155, + "grad_norm": 7.280435803123219, + "learning_rate": 4.893516004352575e-07, + "loss": 0.6246, + "step": 22243 + }, + { + "epoch": 1.6070222334603645, + "grad_norm": 6.980645807771276, + "learning_rate": 4.891777863302191e-07, + "loss": 0.6176, + "step": 22244 + }, + { + "epoch": 1.6070944786605739, + "grad_norm": 6.866984463099875, + "learning_rate": 4.890039997517751e-07, + "loss": 0.5802, + "step": 22245 + }, + { + "epoch": 1.6071667238607836, + "grad_norm": 8.087022496650304, + "learning_rate": 4.888302407023047e-07, + "loss": 0.6389, + "step": 22246 + }, + { + "epoch": 1.607238969060993, + "grad_norm": 7.329223290232461, + "learning_rate": 4.886565091841867e-07, + "loss": 0.6597, + "step": 22247 + }, + { + "epoch": 1.6073112142612025, + "grad_norm": 7.4487547787469985, + "learning_rate": 4.884828051997995e-07, + "loss": 0.6055, + "step": 22248 + }, + { + "epoch": 1.607383459461412, + "grad_norm": 8.385731667867077, + "learning_rate": 4.883091287515207e-07, + "loss": 0.6476, + "step": 22249 + }, + { + "epoch": 1.6074557046616216, + "grad_norm": 7.220690445789517, + "learning_rate": 4.881354798417282e-07, + "loss": 0.6502, + "step": 22250 + }, + { + "epoch": 1.6075279498618311, + "grad_norm": 8.959095294100266, + "learning_rate": 4.879618584727993e-07, + "loss": 0.6032, + "step": 22251 + }, + { + "epoch": 1.6076001950620404, + "grad_norm": 9.009892007771192, + "learning_rate": 4.877882646471097e-07, + "loss": 0.6277, + "step": 22252 + }, + { + "epoch": 1.6076724402622502, + "grad_norm": 7.509529163621202, + "learning_rate": 4.876146983670363e-07, + "loss": 0.583, + "step": 22253 + }, + { + "epoch": 1.6077446854624595, + "grad_norm": 6.971015048884027, + "learning_rate": 4.874411596349554e-07, + "loss": 0.5817, + "step": 22254 + }, + { + "epoch": 1.607816930662669, + "grad_norm": 6.347842244871204, + "learning_rate": 4.872676484532429e-07, + "loss": 0.641, + "step": 22255 + }, + { + "epoch": 1.6078891758628786, + "grad_norm": 7.130114677116904, + "learning_rate": 4.870941648242727e-07, + "loss": 0.6511, + "step": 22256 + }, + { + "epoch": 1.6079614210630881, + "grad_norm": 5.580575783942215, + "learning_rate": 4.869207087504208e-07, + "loss": 0.6026, + "step": 22257 + }, + { + "epoch": 1.6080336662632977, + "grad_norm": 7.547669047092194, + "learning_rate": 4.867472802340608e-07, + "loss": 0.657, + "step": 22258 + }, + { + "epoch": 1.608105911463507, + "grad_norm": 6.6226284172766565, + "learning_rate": 4.865738792775679e-07, + "loss": 0.5911, + "step": 22259 + }, + { + "epoch": 1.6081781566637168, + "grad_norm": 7.245356947462463, + "learning_rate": 4.864005058833151e-07, + "loss": 0.5984, + "step": 22260 + }, + { + "epoch": 1.608250401863926, + "grad_norm": 7.046956740622098, + "learning_rate": 4.862271600536758e-07, + "loss": 0.587, + "step": 22261 + }, + { + "epoch": 1.6083226470641356, + "grad_norm": 7.376608204472766, + "learning_rate": 4.860538417910238e-07, + "loss": 0.5806, + "step": 22262 + }, + { + "epoch": 1.6083948922643452, + "grad_norm": 6.355005060969753, + "learning_rate": 4.858805510977305e-07, + "loss": 0.5577, + "step": 22263 + }, + { + "epoch": 1.6084671374645547, + "grad_norm": 7.798563497263429, + "learning_rate": 4.857072879761681e-07, + "loss": 0.5753, + "step": 22264 + }, + { + "epoch": 1.6085393826647643, + "grad_norm": 6.103667185618247, + "learning_rate": 4.855340524287094e-07, + "loss": 0.5996, + "step": 22265 + }, + { + "epoch": 1.6086116278649736, + "grad_norm": 6.798947914863257, + "learning_rate": 4.85360844457726e-07, + "loss": 0.5828, + "step": 22266 + }, + { + "epoch": 1.6086838730651833, + "grad_norm": 7.55291190677373, + "learning_rate": 4.851876640655875e-07, + "loss": 0.623, + "step": 22267 + }, + { + "epoch": 1.6087561182653927, + "grad_norm": 5.904042956829158, + "learning_rate": 4.85014511254665e-07, + "loss": 0.5438, + "step": 22268 + }, + { + "epoch": 1.6088283634656022, + "grad_norm": 6.9059036651352805, + "learning_rate": 4.848413860273307e-07, + "loss": 0.6281, + "step": 22269 + }, + { + "epoch": 1.6089006086658117, + "grad_norm": 7.795155796258305, + "learning_rate": 4.846682883859522e-07, + "loss": 0.5511, + "step": 22270 + }, + { + "epoch": 1.6089728538660213, + "grad_norm": 7.974531641872553, + "learning_rate": 4.844952183329002e-07, + "loss": 0.6069, + "step": 22271 + }, + { + "epoch": 1.6090450990662308, + "grad_norm": 6.981982066568265, + "learning_rate": 4.843221758705438e-07, + "loss": 0.565, + "step": 22272 + }, + { + "epoch": 1.6091173442664402, + "grad_norm": 7.23429089491135, + "learning_rate": 4.841491610012522e-07, + "loss": 0.5356, + "step": 22273 + }, + { + "epoch": 1.60918958946665, + "grad_norm": 6.964060758094493, + "learning_rate": 4.839761737273929e-07, + "loss": 0.5621, + "step": 22274 + }, + { + "epoch": 1.6092618346668592, + "grad_norm": 7.679308240651084, + "learning_rate": 4.838032140513344e-07, + "loss": 0.6527, + "step": 22275 + }, + { + "epoch": 1.6093340798670688, + "grad_norm": 6.97471055828313, + "learning_rate": 4.836302819754443e-07, + "loss": 0.6098, + "step": 22276 + }, + { + "epoch": 1.6094063250672783, + "grad_norm": 6.401335827178141, + "learning_rate": 4.834573775020901e-07, + "loss": 0.5648, + "step": 22277 + }, + { + "epoch": 1.6094785702674879, + "grad_norm": 7.270361359442528, + "learning_rate": 4.832845006336387e-07, + "loss": 0.6168, + "step": 22278 + }, + { + "epoch": 1.6095508154676974, + "grad_norm": 7.615499863126727, + "learning_rate": 4.831116513724568e-07, + "loss": 0.6149, + "step": 22279 + }, + { + "epoch": 1.6096230606679067, + "grad_norm": 7.306454498848946, + "learning_rate": 4.829388297209106e-07, + "loss": 0.5746, + "step": 22280 + }, + { + "epoch": 1.6096953058681165, + "grad_norm": 6.893735291784372, + "learning_rate": 4.827660356813655e-07, + "loss": 0.6416, + "step": 22281 + }, + { + "epoch": 1.6097675510683258, + "grad_norm": 6.192273081078105, + "learning_rate": 4.825932692561866e-07, + "loss": 0.5917, + "step": 22282 + }, + { + "epoch": 1.6098397962685356, + "grad_norm": 7.884105202091515, + "learning_rate": 4.824205304477405e-07, + "loss": 0.73, + "step": 22283 + }, + { + "epoch": 1.609912041468745, + "grad_norm": 7.5249511210226006, + "learning_rate": 4.822478192583899e-07, + "loss": 0.5799, + "step": 22284 + }, + { + "epoch": 1.6099842866689544, + "grad_norm": 6.0065053006314475, + "learning_rate": 4.820751356905001e-07, + "loss": 0.6262, + "step": 22285 + }, + { + "epoch": 1.610056531869164, + "grad_norm": 7.524878121336371, + "learning_rate": 4.819024797464347e-07, + "loss": 0.5814, + "step": 22286 + }, + { + "epoch": 1.6101287770693733, + "grad_norm": 7.5506248872357675, + "learning_rate": 4.817298514285576e-07, + "loss": 0.6563, + "step": 22287 + }, + { + "epoch": 1.610201022269583, + "grad_norm": 6.872934204461543, + "learning_rate": 4.815572507392316e-07, + "loss": 0.5876, + "step": 22288 + }, + { + "epoch": 1.6102732674697924, + "grad_norm": 8.431935828093216, + "learning_rate": 4.813846776808195e-07, + "loss": 0.6872, + "step": 22289 + }, + { + "epoch": 1.6103455126700021, + "grad_norm": 7.316021454443323, + "learning_rate": 4.81212132255684e-07, + "loss": 0.5439, + "step": 22290 + }, + { + "epoch": 1.6104177578702115, + "grad_norm": 7.426422075436414, + "learning_rate": 4.810396144661877e-07, + "loss": 0.5537, + "step": 22291 + }, + { + "epoch": 1.610490003070421, + "grad_norm": 8.060013261883524, + "learning_rate": 4.808671243146906e-07, + "loss": 0.5782, + "step": 22292 + }, + { + "epoch": 1.6105622482706305, + "grad_norm": 7.749759301170212, + "learning_rate": 4.806946618035549e-07, + "loss": 0.579, + "step": 22293 + }, + { + "epoch": 1.6106344934708399, + "grad_norm": 5.655229676672986, + "learning_rate": 4.80522226935142e-07, + "loss": 0.649, + "step": 22294 + }, + { + "epoch": 1.6107067386710496, + "grad_norm": 7.442669009902751, + "learning_rate": 4.803498197118112e-07, + "loss": 0.591, + "step": 22295 + }, + { + "epoch": 1.610778983871259, + "grad_norm": 7.094309511679749, + "learning_rate": 4.80177440135923e-07, + "loss": 0.5718, + "step": 22296 + }, + { + "epoch": 1.6108512290714687, + "grad_norm": 7.8720232469168945, + "learning_rate": 4.800050882098369e-07, + "loss": 0.6331, + "step": 22297 + }, + { + "epoch": 1.610923474271678, + "grad_norm": 6.843876807026545, + "learning_rate": 4.79832763935914e-07, + "loss": 0.5908, + "step": 22298 + }, + { + "epoch": 1.6109957194718876, + "grad_norm": 7.109662060653743, + "learning_rate": 4.796604673165114e-07, + "loss": 0.5876, + "step": 22299 + }, + { + "epoch": 1.6110679646720971, + "grad_norm": 7.508189498534146, + "learning_rate": 4.794881983539881e-07, + "loss": 0.685, + "step": 22300 + }, + { + "epoch": 1.6111402098723067, + "grad_norm": 8.628182971045291, + "learning_rate": 4.793159570507034e-07, + "loss": 0.5845, + "step": 22301 + }, + { + "epoch": 1.6112124550725162, + "grad_norm": 6.533868438190206, + "learning_rate": 4.791437434090137e-07, + "loss": 0.6057, + "step": 22302 + }, + { + "epoch": 1.6112847002727255, + "grad_norm": 6.320963709305367, + "learning_rate": 4.78971557431277e-07, + "loss": 0.6484, + "step": 22303 + }, + { + "epoch": 1.6113569454729353, + "grad_norm": 7.313521876215775, + "learning_rate": 4.787993991198503e-07, + "loss": 0.5982, + "step": 22304 + }, + { + "epoch": 1.6114291906731446, + "grad_norm": 6.714054329778162, + "learning_rate": 4.786272684770904e-07, + "loss": 0.6188, + "step": 22305 + }, + { + "epoch": 1.6115014358733541, + "grad_norm": 7.955271612933582, + "learning_rate": 4.78455165505354e-07, + "loss": 0.649, + "step": 22306 + }, + { + "epoch": 1.6115736810735637, + "grad_norm": 6.456035410295868, + "learning_rate": 4.782830902069965e-07, + "loss": 0.6495, + "step": 22307 + }, + { + "epoch": 1.6116459262737732, + "grad_norm": 6.601919600178918, + "learning_rate": 4.781110425843747e-07, + "loss": 0.5574, + "step": 22308 + }, + { + "epoch": 1.6117181714739828, + "grad_norm": 7.5481287070875895, + "learning_rate": 4.779390226398417e-07, + "loss": 0.5641, + "step": 22309 + }, + { + "epoch": 1.611790416674192, + "grad_norm": 8.176224493778058, + "learning_rate": 4.777670303757539e-07, + "loss": 0.5752, + "step": 22310 + }, + { + "epoch": 1.6118626618744019, + "grad_norm": 7.110653001546232, + "learning_rate": 4.77595065794465e-07, + "loss": 0.6661, + "step": 22311 + }, + { + "epoch": 1.6119349070746112, + "grad_norm": 7.795125455403429, + "learning_rate": 4.774231288983302e-07, + "loss": 0.6442, + "step": 22312 + }, + { + "epoch": 1.6120071522748207, + "grad_norm": 8.492540564475263, + "learning_rate": 4.772512196897017e-07, + "loss": 0.602, + "step": 22313 + }, + { + "epoch": 1.6120793974750303, + "grad_norm": 6.230998642729218, + "learning_rate": 4.770793381709332e-07, + "loss": 0.5552, + "step": 22314 + }, + { + "epoch": 1.6121516426752398, + "grad_norm": 8.870141647222827, + "learning_rate": 4.76907484344378e-07, + "loss": 0.6228, + "step": 22315 + }, + { + "epoch": 1.6122238878754493, + "grad_norm": 7.053930974974665, + "learning_rate": 4.767356582123886e-07, + "loss": 0.6067, + "step": 22316 + }, + { + "epoch": 1.6122961330756587, + "grad_norm": 7.3261393315226275, + "learning_rate": 4.76563859777317e-07, + "loss": 0.6757, + "step": 22317 + }, + { + "epoch": 1.6123683782758684, + "grad_norm": 6.294653110051332, + "learning_rate": 4.7639208904151494e-07, + "loss": 0.5606, + "step": 22318 + }, + { + "epoch": 1.6124406234760778, + "grad_norm": 7.469599591698523, + "learning_rate": 4.762203460073347e-07, + "loss": 0.6586, + "step": 22319 + }, + { + "epoch": 1.6125128686762873, + "grad_norm": 6.6335894485691504, + "learning_rate": 4.7604863067712585e-07, + "loss": 0.6146, + "step": 22320 + }, + { + "epoch": 1.6125851138764968, + "grad_norm": 7.154193699348164, + "learning_rate": 4.758769430532398e-07, + "loss": 0.582, + "step": 22321 + }, + { + "epoch": 1.6126573590767064, + "grad_norm": 6.306601913205747, + "learning_rate": 4.7570528313802683e-07, + "loss": 0.6154, + "step": 22322 + }, + { + "epoch": 1.612729604276916, + "grad_norm": 7.3491443058078465, + "learning_rate": 4.7553365093383734e-07, + "loss": 0.6076, + "step": 22323 + }, + { + "epoch": 1.6128018494771252, + "grad_norm": 7.587227740108599, + "learning_rate": 4.753620464430198e-07, + "loss": 0.497, + "step": 22324 + }, + { + "epoch": 1.612874094677335, + "grad_norm": 6.963567474553273, + "learning_rate": 4.75190469667923e-07, + "loss": 0.6049, + "step": 22325 + }, + { + "epoch": 1.6129463398775443, + "grad_norm": 7.263987235608418, + "learning_rate": 4.750189206108979e-07, + "loss": 0.562, + "step": 22326 + }, + { + "epoch": 1.6130185850777539, + "grad_norm": 7.987843098084685, + "learning_rate": 4.748473992742908e-07, + "loss": 0.6126, + "step": 22327 + }, + { + "epoch": 1.6130908302779634, + "grad_norm": 7.480952171063667, + "learning_rate": 4.7467590566045026e-07, + "loss": 0.5755, + "step": 22328 + }, + { + "epoch": 1.613163075478173, + "grad_norm": 9.806565743901029, + "learning_rate": 4.745044397717241e-07, + "loss": 0.6076, + "step": 22329 + }, + { + "epoch": 1.6132353206783825, + "grad_norm": 7.309780038845806, + "learning_rate": 4.7433300161046e-07, + "loss": 0.6095, + "step": 22330 + }, + { + "epoch": 1.6133075658785918, + "grad_norm": 7.916338729758684, + "learning_rate": 4.7416159117900374e-07, + "loss": 0.5868, + "step": 22331 + }, + { + "epoch": 1.6133798110788016, + "grad_norm": 8.18625357655313, + "learning_rate": 4.739902084797024e-07, + "loss": 0.682, + "step": 22332 + }, + { + "epoch": 1.613452056279011, + "grad_norm": 6.84983046941432, + "learning_rate": 4.7381885351490186e-07, + "loss": 0.6022, + "step": 22333 + }, + { + "epoch": 1.6135243014792204, + "grad_norm": 7.415795207170132, + "learning_rate": 4.7364752628694806e-07, + "loss": 0.582, + "step": 22334 + }, + { + "epoch": 1.61359654667943, + "grad_norm": 6.153044664934722, + "learning_rate": 4.734762267981863e-07, + "loss": 0.5989, + "step": 22335 + }, + { + "epoch": 1.6136687918796395, + "grad_norm": 7.72241766275986, + "learning_rate": 4.7330495505096164e-07, + "loss": 0.5776, + "step": 22336 + }, + { + "epoch": 1.613741037079849, + "grad_norm": 7.751930734645795, + "learning_rate": 4.731337110476189e-07, + "loss": 0.6425, + "step": 22337 + }, + { + "epoch": 1.6138132822800584, + "grad_norm": 6.888024494672546, + "learning_rate": 4.729624947905012e-07, + "loss": 0.6118, + "step": 22338 + }, + { + "epoch": 1.6138855274802681, + "grad_norm": 7.271607395128447, + "learning_rate": 4.7279130628195335e-07, + "loss": 0.6355, + "step": 22339 + }, + { + "epoch": 1.6139577726804775, + "grad_norm": 7.227992890532332, + "learning_rate": 4.72620145524319e-07, + "loss": 0.5615, + "step": 22340 + }, + { + "epoch": 1.614030017880687, + "grad_norm": 7.767114016811295, + "learning_rate": 4.724490125199399e-07, + "loss": 0.6064, + "step": 22341 + }, + { + "epoch": 1.6141022630808965, + "grad_norm": 6.944483635685852, + "learning_rate": 4.7227790727115973e-07, + "loss": 0.6073, + "step": 22342 + }, + { + "epoch": 1.614174508281106, + "grad_norm": 8.30689480749416, + "learning_rate": 4.721068297803205e-07, + "loss": 0.6613, + "step": 22343 + }, + { + "epoch": 1.6142467534813156, + "grad_norm": 8.167964546716997, + "learning_rate": 4.7193578004976427e-07, + "loss": 0.6288, + "step": 22344 + }, + { + "epoch": 1.614318998681525, + "grad_norm": 7.439715143718725, + "learning_rate": 4.717647580818324e-07, + "loss": 0.5925, + "step": 22345 + }, + { + "epoch": 1.6143912438817347, + "grad_norm": 6.623266317140182, + "learning_rate": 4.7159376387886645e-07, + "loss": 0.6037, + "step": 22346 + }, + { + "epoch": 1.614463489081944, + "grad_norm": 7.855697226367212, + "learning_rate": 4.714227974432067e-07, + "loss": 0.5576, + "step": 22347 + }, + { + "epoch": 1.6145357342821536, + "grad_norm": 7.154533346914345, + "learning_rate": 4.7125185877719445e-07, + "loss": 0.5572, + "step": 22348 + }, + { + "epoch": 1.6146079794823631, + "grad_norm": 7.765867417541081, + "learning_rate": 4.710809478831682e-07, + "loss": 0.6146, + "step": 22349 + }, + { + "epoch": 1.6146802246825727, + "grad_norm": 6.668841992559766, + "learning_rate": 4.7091006476346875e-07, + "loss": 0.5976, + "step": 22350 + }, + { + "epoch": 1.6147524698827822, + "grad_norm": 7.714667719510733, + "learning_rate": 4.707392094204355e-07, + "loss": 0.6398, + "step": 22351 + }, + { + "epoch": 1.6148247150829915, + "grad_norm": 8.117937980375476, + "learning_rate": 4.7056838185640607e-07, + "loss": 0.6392, + "step": 22352 + }, + { + "epoch": 1.6148969602832013, + "grad_norm": 5.807373082797654, + "learning_rate": 4.7039758207371944e-07, + "loss": 0.5934, + "step": 22353 + }, + { + "epoch": 1.6149692054834106, + "grad_norm": 8.474724383078227, + "learning_rate": 4.7022681007471425e-07, + "loss": 0.6537, + "step": 22354 + }, + { + "epoch": 1.6150414506836204, + "grad_norm": 7.852308733194731, + "learning_rate": 4.7005606586172895e-07, + "loss": 0.6996, + "step": 22355 + }, + { + "epoch": 1.6151136958838297, + "grad_norm": 7.131979217824219, + "learning_rate": 4.698853494370992e-07, + "loss": 0.6465, + "step": 22356 + }, + { + "epoch": 1.6151859410840392, + "grad_norm": 6.905671384574314, + "learning_rate": 4.6971466080316305e-07, + "loss": 0.6266, + "step": 22357 + }, + { + "epoch": 1.6152581862842488, + "grad_norm": 6.139038191524157, + "learning_rate": 4.6954399996225697e-07, + "loss": 0.5971, + "step": 22358 + }, + { + "epoch": 1.615330431484458, + "grad_norm": 6.6137409624994365, + "learning_rate": 4.6937336691671666e-07, + "loss": 0.5069, + "step": 22359 + }, + { + "epoch": 1.6154026766846679, + "grad_norm": 6.993814732965334, + "learning_rate": 4.6920276166887794e-07, + "loss": 0.5517, + "step": 22360 + }, + { + "epoch": 1.6154749218848772, + "grad_norm": 6.530133749744394, + "learning_rate": 4.690321842210768e-07, + "loss": 0.5959, + "step": 22361 + }, + { + "epoch": 1.615547167085087, + "grad_norm": 7.0510812457820355, + "learning_rate": 4.6886163457564833e-07, + "loss": 0.6765, + "step": 22362 + }, + { + "epoch": 1.6156194122852963, + "grad_norm": 7.935348579620062, + "learning_rate": 4.6869111273492666e-07, + "loss": 0.5771, + "step": 22363 + }, + { + "epoch": 1.6156916574855058, + "grad_norm": 7.227942488593016, + "learning_rate": 4.685206187012467e-07, + "loss": 0.6612, + "step": 22364 + }, + { + "epoch": 1.6157639026857153, + "grad_norm": 9.470604642520026, + "learning_rate": 4.6835015247694266e-07, + "loss": 0.6619, + "step": 22365 + }, + { + "epoch": 1.6158361478859247, + "grad_norm": 5.727523327583885, + "learning_rate": 4.6817971406434684e-07, + "loss": 0.5718, + "step": 22366 + }, + { + "epoch": 1.6159083930861344, + "grad_norm": 8.001678290755633, + "learning_rate": 4.6800930346579314e-07, + "loss": 0.6011, + "step": 22367 + }, + { + "epoch": 1.6159806382863438, + "grad_norm": 6.47388552939085, + "learning_rate": 4.6783892068361423e-07, + "loss": 0.6033, + "step": 22368 + }, + { + "epoch": 1.6160528834865535, + "grad_norm": 6.7755446098997965, + "learning_rate": 4.6766856572014353e-07, + "loss": 0.6031, + "step": 22369 + }, + { + "epoch": 1.6161251286867628, + "grad_norm": 8.192475881671996, + "learning_rate": 4.674982385777116e-07, + "loss": 0.5805, + "step": 22370 + }, + { + "epoch": 1.6161973738869724, + "grad_norm": 7.180825227370802, + "learning_rate": 4.673279392586502e-07, + "loss": 0.6298, + "step": 22371 + }, + { + "epoch": 1.616269619087182, + "grad_norm": 7.887528399191148, + "learning_rate": 4.671576677652914e-07, + "loss": 0.6519, + "step": 22372 + }, + { + "epoch": 1.6163418642873915, + "grad_norm": 7.521207453192279, + "learning_rate": 4.6698742409996556e-07, + "loss": 0.6247, + "step": 22373 + }, + { + "epoch": 1.616414109487601, + "grad_norm": 8.386357591490107, + "learning_rate": 4.6681720826500355e-07, + "loss": 0.724, + "step": 22374 + }, + { + "epoch": 1.6164863546878103, + "grad_norm": 7.61364876734475, + "learning_rate": 4.6664702026273517e-07, + "loss": 0.6027, + "step": 22375 + }, + { + "epoch": 1.61655859988802, + "grad_norm": 8.112646953268637, + "learning_rate": 4.664768600954908e-07, + "loss": 0.6203, + "step": 22376 + }, + { + "epoch": 1.6166308450882294, + "grad_norm": 9.048744443345528, + "learning_rate": 4.6630672776559887e-07, + "loss": 0.6425, + "step": 22377 + }, + { + "epoch": 1.616703090288439, + "grad_norm": 7.4976365498074, + "learning_rate": 4.661366232753886e-07, + "loss": 0.5817, + "step": 22378 + }, + { + "epoch": 1.6167753354886485, + "grad_norm": 6.471813121667459, + "learning_rate": 4.65966546627189e-07, + "loss": 0.5667, + "step": 22379 + }, + { + "epoch": 1.616847580688858, + "grad_norm": 7.304350656504902, + "learning_rate": 4.657964978233284e-07, + "loss": 0.6247, + "step": 22380 + }, + { + "epoch": 1.6169198258890676, + "grad_norm": 6.47626828766674, + "learning_rate": 4.656264768661331e-07, + "loss": 0.6151, + "step": 22381 + }, + { + "epoch": 1.616992071089277, + "grad_norm": 8.071162811418064, + "learning_rate": 4.654564837579326e-07, + "loss": 0.5804, + "step": 22382 + }, + { + "epoch": 1.6170643162894867, + "grad_norm": 6.80547873871024, + "learning_rate": 4.6528651850105353e-07, + "loss": 0.6132, + "step": 22383 + }, + { + "epoch": 1.617136561489696, + "grad_norm": 6.185284458733423, + "learning_rate": 4.6511658109782173e-07, + "loss": 0.5658, + "step": 22384 + }, + { + "epoch": 1.6172088066899055, + "grad_norm": 6.43125538580162, + "learning_rate": 4.649466715505638e-07, + "loss": 0.5399, + "step": 22385 + }, + { + "epoch": 1.617281051890115, + "grad_norm": 6.275200962936895, + "learning_rate": 4.647767898616057e-07, + "loss": 0.6502, + "step": 22386 + }, + { + "epoch": 1.6173532970903246, + "grad_norm": 7.058266780438924, + "learning_rate": 4.6460693603327387e-07, + "loss": 0.603, + "step": 22387 + }, + { + "epoch": 1.6174255422905341, + "grad_norm": 6.726507773713934, + "learning_rate": 4.644371100678921e-07, + "loss": 0.531, + "step": 22388 + }, + { + "epoch": 1.6174977874907435, + "grad_norm": 6.9598693370612095, + "learning_rate": 4.642673119677857e-07, + "loss": 0.6233, + "step": 22389 + }, + { + "epoch": 1.6175700326909532, + "grad_norm": 6.186325424144608, + "learning_rate": 4.6409754173527913e-07, + "loss": 0.5396, + "step": 22390 + }, + { + "epoch": 1.6176422778911625, + "grad_norm": 8.069287658790737, + "learning_rate": 4.639277993726965e-07, + "loss": 0.6465, + "step": 22391 + }, + { + "epoch": 1.617714523091372, + "grad_norm": 6.737907598335773, + "learning_rate": 4.637580848823614e-07, + "loss": 0.6321, + "step": 22392 + }, + { + "epoch": 1.6177867682915816, + "grad_norm": 6.956277539797658, + "learning_rate": 4.63588398266597e-07, + "loss": 0.5865, + "step": 22393 + }, + { + "epoch": 1.6178590134917912, + "grad_norm": 6.107476527277858, + "learning_rate": 4.63418739527727e-07, + "loss": 0.6127, + "step": 22394 + }, + { + "epoch": 1.6179312586920007, + "grad_norm": 7.319840618545475, + "learning_rate": 4.6324910866807237e-07, + "loss": 0.6303, + "step": 22395 + }, + { + "epoch": 1.61800350389221, + "grad_norm": 7.137388735570347, + "learning_rate": 4.630795056899562e-07, + "loss": 0.5458, + "step": 22396 + }, + { + "epoch": 1.6180757490924198, + "grad_norm": 8.041654860202906, + "learning_rate": 4.629099305956999e-07, + "loss": 0.6282, + "step": 22397 + }, + { + "epoch": 1.6181479942926291, + "grad_norm": 6.960651430359154, + "learning_rate": 4.6274038338762566e-07, + "loss": 0.589, + "step": 22398 + }, + { + "epoch": 1.6182202394928387, + "grad_norm": 6.970029157580299, + "learning_rate": 4.6257086406805317e-07, + "loss": 0.6242, + "step": 22399 + }, + { + "epoch": 1.6182924846930482, + "grad_norm": 6.066533585370779, + "learning_rate": 4.624013726393034e-07, + "loss": 0.6365, + "step": 22400 + }, + { + "epoch": 1.6183647298932577, + "grad_norm": 6.827562950623492, + "learning_rate": 4.6223190910369697e-07, + "loss": 0.5653, + "step": 22401 + }, + { + "epoch": 1.6184369750934673, + "grad_norm": 8.222710065144563, + "learning_rate": 4.620624734635534e-07, + "loss": 0.5685, + "step": 22402 + }, + { + "epoch": 1.6185092202936766, + "grad_norm": 8.955166089739722, + "learning_rate": 4.6189306572119203e-07, + "loss": 0.6004, + "step": 22403 + }, + { + "epoch": 1.6185814654938864, + "grad_norm": 6.822034690480197, + "learning_rate": 4.6172368587893227e-07, + "loss": 0.5945, + "step": 22404 + }, + { + "epoch": 1.6186537106940957, + "grad_norm": 6.633547181687179, + "learning_rate": 4.615543339390932e-07, + "loss": 0.5519, + "step": 22405 + }, + { + "epoch": 1.6187259558943052, + "grad_norm": 6.786204413589203, + "learning_rate": 4.613850099039921e-07, + "loss": 0.562, + "step": 22406 + }, + { + "epoch": 1.6187982010945148, + "grad_norm": 6.746756728152939, + "learning_rate": 4.612157137759471e-07, + "loss": 0.6619, + "step": 22407 + }, + { + "epoch": 1.6188704462947243, + "grad_norm": 7.4710054064662055, + "learning_rate": 4.610464455572766e-07, + "loss": 0.657, + "step": 22408 + }, + { + "epoch": 1.6189426914949339, + "grad_norm": 7.9777011040014, + "learning_rate": 4.608772052502966e-07, + "loss": 0.6136, + "step": 22409 + }, + { + "epoch": 1.6190149366951432, + "grad_norm": 7.048775359580146, + "learning_rate": 4.607079928573235e-07, + "loss": 0.523, + "step": 22410 + }, + { + "epoch": 1.619087181895353, + "grad_norm": 6.790918445522685, + "learning_rate": 4.6053880838067554e-07, + "loss": 0.6128, + "step": 22411 + }, + { + "epoch": 1.6191594270955623, + "grad_norm": 9.827275711019338, + "learning_rate": 4.603696518226683e-07, + "loss": 0.6536, + "step": 22412 + }, + { + "epoch": 1.6192316722957718, + "grad_norm": 8.672126175275723, + "learning_rate": 4.602005231856163e-07, + "loss": 0.6701, + "step": 22413 + }, + { + "epoch": 1.6193039174959813, + "grad_norm": 6.503800968085563, + "learning_rate": 4.60031422471835e-07, + "loss": 0.6124, + "step": 22414 + }, + { + "epoch": 1.619376162696191, + "grad_norm": 6.400634984464096, + "learning_rate": 4.5986234968364044e-07, + "loss": 0.5592, + "step": 22415 + }, + { + "epoch": 1.6194484078964004, + "grad_norm": 6.974985386549432, + "learning_rate": 4.596933048233457e-07, + "loss": 0.6067, + "step": 22416 + }, + { + "epoch": 1.6195206530966098, + "grad_norm": 6.3392287806327925, + "learning_rate": 4.595242878932654e-07, + "loss": 0.5883, + "step": 22417 + }, + { + "epoch": 1.6195928982968195, + "grad_norm": 8.740671034832518, + "learning_rate": 4.5935529889571314e-07, + "loss": 0.6846, + "step": 22418 + }, + { + "epoch": 1.6196651434970288, + "grad_norm": 7.287422936164547, + "learning_rate": 4.591863378330025e-07, + "loss": 0.544, + "step": 22419 + }, + { + "epoch": 1.6197373886972384, + "grad_norm": 6.760558453578229, + "learning_rate": 4.590174047074461e-07, + "loss": 0.6377, + "step": 22420 + }, + { + "epoch": 1.619809633897448, + "grad_norm": 7.691967666153339, + "learning_rate": 4.5884849952135684e-07, + "loss": 0.5859, + "step": 22421 + }, + { + "epoch": 1.6198818790976575, + "grad_norm": 8.4032916567822, + "learning_rate": 4.5867962227704704e-07, + "loss": 0.6267, + "step": 22422 + }, + { + "epoch": 1.619954124297867, + "grad_norm": 7.243857313649792, + "learning_rate": 4.5851077297682845e-07, + "loss": 0.5278, + "step": 22423 + }, + { + "epoch": 1.6200263694980763, + "grad_norm": 6.817885483746586, + "learning_rate": 4.58341951623012e-07, + "loss": 0.5802, + "step": 22424 + }, + { + "epoch": 1.620098614698286, + "grad_norm": 7.150544536635665, + "learning_rate": 4.581731582179086e-07, + "loss": 0.6155, + "step": 22425 + }, + { + "epoch": 1.6201708598984954, + "grad_norm": 7.100509582285274, + "learning_rate": 4.5800439276383e-07, + "loss": 0.6647, + "step": 22426 + }, + { + "epoch": 1.620243105098705, + "grad_norm": 7.738343856227057, + "learning_rate": 4.5783565526308525e-07, + "loss": 0.6499, + "step": 22427 + }, + { + "epoch": 1.6203153502989145, + "grad_norm": 7.029353585402664, + "learning_rate": 4.576669457179847e-07, + "loss": 0.6536, + "step": 22428 + }, + { + "epoch": 1.620387595499124, + "grad_norm": 6.328294690529633, + "learning_rate": 4.574982641308381e-07, + "loss": 0.5767, + "step": 22429 + }, + { + "epoch": 1.6204598406993336, + "grad_norm": 7.008026698144371, + "learning_rate": 4.57329610503954e-07, + "loss": 0.589, + "step": 22430 + }, + { + "epoch": 1.620532085899543, + "grad_norm": 7.527601189162849, + "learning_rate": 4.5716098483964186e-07, + "loss": 0.596, + "step": 22431 + }, + { + "epoch": 1.6206043310997527, + "grad_norm": 7.912265825949497, + "learning_rate": 4.569923871402093e-07, + "loss": 0.5597, + "step": 22432 + }, + { + "epoch": 1.620676576299962, + "grad_norm": 7.465924738171751, + "learning_rate": 4.568238174079656e-07, + "loss": 0.6366, + "step": 22433 + }, + { + "epoch": 1.6207488215001717, + "grad_norm": 6.903204475107664, + "learning_rate": 4.566552756452167e-07, + "loss": 0.5738, + "step": 22434 + }, + { + "epoch": 1.620821066700381, + "grad_norm": 7.940349157617255, + "learning_rate": 4.564867618542704e-07, + "loss": 0.6099, + "step": 22435 + }, + { + "epoch": 1.6208933119005906, + "grad_norm": 7.479121329365837, + "learning_rate": 4.5631827603743377e-07, + "loss": 0.5519, + "step": 22436 + }, + { + "epoch": 1.6209655571008001, + "grad_norm": 7.167684290592018, + "learning_rate": 4.5614981819701393e-07, + "loss": 0.5573, + "step": 22437 + }, + { + "epoch": 1.6210378023010095, + "grad_norm": 7.249462765487081, + "learning_rate": 4.559813883353145e-07, + "loss": 0.6089, + "step": 22438 + }, + { + "epoch": 1.6211100475012192, + "grad_norm": 7.170575458412327, + "learning_rate": 4.558129864546437e-07, + "loss": 0.6934, + "step": 22439 + }, + { + "epoch": 1.6211822927014286, + "grad_norm": 7.452318575754865, + "learning_rate": 4.5564461255730636e-07, + "loss": 0.6363, + "step": 22440 + }, + { + "epoch": 1.6212545379016383, + "grad_norm": 7.591374029499648, + "learning_rate": 4.554762666456064e-07, + "loss": 0.6333, + "step": 22441 + }, + { + "epoch": 1.6213267831018476, + "grad_norm": 6.611155890020174, + "learning_rate": 4.553079487218487e-07, + "loss": 0.6512, + "step": 22442 + }, + { + "epoch": 1.6213990283020572, + "grad_norm": 8.01483305054603, + "learning_rate": 4.5513965878833776e-07, + "loss": 0.6024, + "step": 22443 + }, + { + "epoch": 1.6214712735022667, + "grad_norm": 6.990738600318175, + "learning_rate": 4.549713968473779e-07, + "loss": 0.5883, + "step": 22444 + }, + { + "epoch": 1.621543518702476, + "grad_norm": 7.414995012236632, + "learning_rate": 4.5480316290127085e-07, + "loss": 0.6234, + "step": 22445 + }, + { + "epoch": 1.6216157639026858, + "grad_norm": 7.110121333593258, + "learning_rate": 4.5463495695232064e-07, + "loss": 0.6048, + "step": 22446 + }, + { + "epoch": 1.6216880091028951, + "grad_norm": 6.6971676547587835, + "learning_rate": 4.544667790028298e-07, + "loss": 0.6411, + "step": 22447 + }, + { + "epoch": 1.6217602543031049, + "grad_norm": 8.429195584146852, + "learning_rate": 4.542986290551005e-07, + "loss": 0.6439, + "step": 22448 + }, + { + "epoch": 1.6218324995033142, + "grad_norm": 6.503904783689775, + "learning_rate": 4.541305071114344e-07, + "loss": 0.6076, + "step": 22449 + }, + { + "epoch": 1.6219047447035237, + "grad_norm": 5.93457858844762, + "learning_rate": 4.5396241317413323e-07, + "loss": 0.5413, + "step": 22450 + }, + { + "epoch": 1.6219769899037333, + "grad_norm": 6.8399750910969095, + "learning_rate": 4.5379434724549866e-07, + "loss": 0.6237, + "step": 22451 + }, + { + "epoch": 1.6220492351039428, + "grad_norm": 8.488828161309787, + "learning_rate": 4.536263093278301e-07, + "loss": 0.7034, + "step": 22452 + }, + { + "epoch": 1.6221214803041524, + "grad_norm": 7.141929991358418, + "learning_rate": 4.534582994234285e-07, + "loss": 0.571, + "step": 22453 + }, + { + "epoch": 1.6221937255043617, + "grad_norm": 7.489162498680384, + "learning_rate": 4.532903175345937e-07, + "loss": 0.5342, + "step": 22454 + }, + { + "epoch": 1.6222659707045715, + "grad_norm": 7.711906333591955, + "learning_rate": 4.531223636636259e-07, + "loss": 0.6016, + "step": 22455 + }, + { + "epoch": 1.6223382159047808, + "grad_norm": 5.893252001987998, + "learning_rate": 4.529544378128228e-07, + "loss": 0.5068, + "step": 22456 + }, + { + "epoch": 1.6224104611049903, + "grad_norm": 6.112216298515129, + "learning_rate": 4.527865399844844e-07, + "loss": 0.5838, + "step": 22457 + }, + { + "epoch": 1.6224827063051999, + "grad_norm": 6.604062951012619, + "learning_rate": 4.526186701809085e-07, + "loss": 0.5883, + "step": 22458 + }, + { + "epoch": 1.6225549515054094, + "grad_norm": 6.989452317246921, + "learning_rate": 4.5245082840439327e-07, + "loss": 0.6261, + "step": 22459 + }, + { + "epoch": 1.622627196705619, + "grad_norm": 6.777346845730561, + "learning_rate": 4.522830146572366e-07, + "loss": 0.6188, + "step": 22460 + }, + { + "epoch": 1.6226994419058283, + "grad_norm": 6.311716767622538, + "learning_rate": 4.521152289417352e-07, + "loss": 0.6031, + "step": 22461 + }, + { + "epoch": 1.622771687106038, + "grad_norm": 6.529959081080519, + "learning_rate": 4.519474712601871e-07, + "loss": 0.5593, + "step": 22462 + }, + { + "epoch": 1.6228439323062473, + "grad_norm": 7.128783710419429, + "learning_rate": 4.5177974161488694e-07, + "loss": 0.5284, + "step": 22463 + }, + { + "epoch": 1.622916177506457, + "grad_norm": 6.840632037581321, + "learning_rate": 4.5161204000813225e-07, + "loss": 0.6229, + "step": 22464 + }, + { + "epoch": 1.6229884227066664, + "grad_norm": 8.187618342119455, + "learning_rate": 4.5144436644221857e-07, + "loss": 0.5971, + "step": 22465 + }, + { + "epoch": 1.623060667906876, + "grad_norm": 5.478169726435622, + "learning_rate": 4.5127672091943993e-07, + "loss": 0.5424, + "step": 22466 + }, + { + "epoch": 1.6231329131070855, + "grad_norm": 7.555770780577125, + "learning_rate": 4.5110910344209254e-07, + "loss": 0.6347, + "step": 22467 + }, + { + "epoch": 1.6232051583072948, + "grad_norm": 7.153095466282822, + "learning_rate": 4.5094151401247095e-07, + "loss": 0.5775, + "step": 22468 + }, + { + "epoch": 1.6232774035075046, + "grad_norm": 6.6543587876852675, + "learning_rate": 4.5077395263286945e-07, + "loss": 0.5704, + "step": 22469 + }, + { + "epoch": 1.623349648707714, + "grad_norm": 7.163646357273653, + "learning_rate": 4.506064193055809e-07, + "loss": 0.6018, + "step": 22470 + }, + { + "epoch": 1.6234218939079235, + "grad_norm": 6.52914088020353, + "learning_rate": 4.504389140328994e-07, + "loss": 0.5864, + "step": 22471 + }, + { + "epoch": 1.623494139108133, + "grad_norm": 7.705847949502353, + "learning_rate": 4.502714368171182e-07, + "loss": 0.6293, + "step": 22472 + }, + { + "epoch": 1.6235663843083425, + "grad_norm": 8.636813269779037, + "learning_rate": 4.5010398766052873e-07, + "loss": 0.6624, + "step": 22473 + }, + { + "epoch": 1.623638629508552, + "grad_norm": 7.310137766571829, + "learning_rate": 4.499365665654243e-07, + "loss": 0.5511, + "step": 22474 + }, + { + "epoch": 1.6237108747087614, + "grad_norm": 7.152914144292187, + "learning_rate": 4.497691735340964e-07, + "loss": 0.6301, + "step": 22475 + }, + { + "epoch": 1.6237831199089712, + "grad_norm": 5.895547201629432, + "learning_rate": 4.4960180856883667e-07, + "loss": 0.573, + "step": 22476 + }, + { + "epoch": 1.6238553651091805, + "grad_norm": 7.548512030493939, + "learning_rate": 4.494344716719359e-07, + "loss": 0.5641, + "step": 22477 + }, + { + "epoch": 1.62392761030939, + "grad_norm": 7.225868579437003, + "learning_rate": 4.4926716284568545e-07, + "loss": 0.5498, + "step": 22478 + }, + { + "epoch": 1.6239998555095996, + "grad_norm": 6.992269334367487, + "learning_rate": 4.4909988209237504e-07, + "loss": 0.6035, + "step": 22479 + }, + { + "epoch": 1.6240721007098091, + "grad_norm": 6.720931865476755, + "learning_rate": 4.489326294142954e-07, + "loss": 0.6255, + "step": 22480 + }, + { + "epoch": 1.6241443459100187, + "grad_norm": 7.628693983462081, + "learning_rate": 4.4876540481373494e-07, + "loss": 0.6112, + "step": 22481 + }, + { + "epoch": 1.624216591110228, + "grad_norm": 6.565545528898647, + "learning_rate": 4.485982082929835e-07, + "loss": 0.6669, + "step": 22482 + }, + { + "epoch": 1.6242888363104377, + "grad_norm": 5.916428288058949, + "learning_rate": 4.484310398543304e-07, + "loss": 0.5721, + "step": 22483 + }, + { + "epoch": 1.624361081510647, + "grad_norm": 6.963306165282757, + "learning_rate": 4.4826389950006266e-07, + "loss": 0.5812, + "step": 22484 + }, + { + "epoch": 1.6244333267108566, + "grad_norm": 7.1207768824352184, + "learning_rate": 4.4809678723246903e-07, + "loss": 0.7039, + "step": 22485 + }, + { + "epoch": 1.6245055719110661, + "grad_norm": 7.227793655684312, + "learning_rate": 4.4792970305383736e-07, + "loss": 0.5999, + "step": 22486 + }, + { + "epoch": 1.6245778171112757, + "grad_norm": 6.566528827492129, + "learning_rate": 4.477626469664548e-07, + "loss": 0.625, + "step": 22487 + }, + { + "epoch": 1.6246500623114852, + "grad_norm": 6.112107078233415, + "learning_rate": 4.4759561897260775e-07, + "loss": 0.5512, + "step": 22488 + }, + { + "epoch": 1.6247223075116946, + "grad_norm": 6.841919257844204, + "learning_rate": 4.474286190745833e-07, + "loss": 0.5877, + "step": 22489 + }, + { + "epoch": 1.6247945527119043, + "grad_norm": 6.681788206770454, + "learning_rate": 4.47261647274668e-07, + "loss": 0.5888, + "step": 22490 + }, + { + "epoch": 1.6248667979121136, + "grad_norm": 6.7413506369750955, + "learning_rate": 4.4709470357514636e-07, + "loss": 0.5892, + "step": 22491 + }, + { + "epoch": 1.6249390431123232, + "grad_norm": 7.771481184420682, + "learning_rate": 4.4692778797830406e-07, + "loss": 0.6017, + "step": 22492 + }, + { + "epoch": 1.6250112883125327, + "grad_norm": 6.737228461687395, + "learning_rate": 4.467609004864265e-07, + "loss": 0.6227, + "step": 22493 + }, + { + "epoch": 1.6250835335127423, + "grad_norm": 6.536320666399707, + "learning_rate": 4.465940411017977e-07, + "loss": 0.5593, + "step": 22494 + }, + { + "epoch": 1.6251557787129518, + "grad_norm": 6.722898542082047, + "learning_rate": 4.4642720982670225e-07, + "loss": 0.6265, + "step": 22495 + }, + { + "epoch": 1.6252280239131611, + "grad_norm": 7.453889883527389, + "learning_rate": 4.462604066634238e-07, + "loss": 0.6232, + "step": 22496 + }, + { + "epoch": 1.6253002691133709, + "grad_norm": 7.161506954143663, + "learning_rate": 4.460936316142464e-07, + "loss": 0.6294, + "step": 22497 + }, + { + "epoch": 1.6253725143135802, + "grad_norm": 8.653282786232959, + "learning_rate": 4.4592688468145155e-07, + "loss": 0.6525, + "step": 22498 + }, + { + "epoch": 1.6254447595137897, + "grad_norm": 6.6430019298426055, + "learning_rate": 4.45760165867323e-07, + "loss": 0.5984, + "step": 22499 + }, + { + "epoch": 1.6255170047139993, + "grad_norm": 7.944490253283812, + "learning_rate": 4.455934751741428e-07, + "loss": 0.649, + "step": 22500 + }, + { + "epoch": 1.6255892499142088, + "grad_norm": 7.646106711618599, + "learning_rate": 4.454268126041933e-07, + "loss": 0.6152, + "step": 22501 + }, + { + "epoch": 1.6256614951144184, + "grad_norm": 6.734250379502393, + "learning_rate": 4.452601781597549e-07, + "loss": 0.6771, + "step": 22502 + }, + { + "epoch": 1.6257337403146277, + "grad_norm": 7.129911637153246, + "learning_rate": 4.450935718431093e-07, + "loss": 0.6179, + "step": 22503 + }, + { + "epoch": 1.6258059855148375, + "grad_norm": 7.882925979810659, + "learning_rate": 4.4492699365653704e-07, + "loss": 0.6433, + "step": 22504 + }, + { + "epoch": 1.6258782307150468, + "grad_norm": 7.683488248751421, + "learning_rate": 4.4476044360231866e-07, + "loss": 0.6362, + "step": 22505 + }, + { + "epoch": 1.6259504759152565, + "grad_norm": 7.090266107426511, + "learning_rate": 4.4459392168273383e-07, + "loss": 0.6262, + "step": 22506 + }, + { + "epoch": 1.6260227211154659, + "grad_norm": 6.667554541909345, + "learning_rate": 4.4442742790006266e-07, + "loss": 0.5605, + "step": 22507 + }, + { + "epoch": 1.6260949663156754, + "grad_norm": 6.032480381199806, + "learning_rate": 4.4426096225658436e-07, + "loss": 0.5843, + "step": 22508 + }, + { + "epoch": 1.626167211515885, + "grad_norm": 6.719885158011648, + "learning_rate": 4.4409452475457666e-07, + "loss": 0.6082, + "step": 22509 + }, + { + "epoch": 1.6262394567160943, + "grad_norm": 8.48770464059147, + "learning_rate": 4.4392811539631875e-07, + "loss": 0.5811, + "step": 22510 + }, + { + "epoch": 1.626311701916304, + "grad_norm": 7.953374892470226, + "learning_rate": 4.437617341840883e-07, + "loss": 0.5803, + "step": 22511 + }, + { + "epoch": 1.6263839471165134, + "grad_norm": 7.762732844154615, + "learning_rate": 4.4359538112016375e-07, + "loss": 0.5908, + "step": 22512 + }, + { + "epoch": 1.6264561923167231, + "grad_norm": 6.263119470106424, + "learning_rate": 4.4342905620682137e-07, + "loss": 0.588, + "step": 22513 + }, + { + "epoch": 1.6265284375169324, + "grad_norm": 9.332371344263288, + "learning_rate": 4.4326275944633815e-07, + "loss": 0.691, + "step": 22514 + }, + { + "epoch": 1.626600682717142, + "grad_norm": 6.60939895228742, + "learning_rate": 4.4309649084099097e-07, + "loss": 0.5952, + "step": 22515 + }, + { + "epoch": 1.6266729279173515, + "grad_norm": 8.512226063357982, + "learning_rate": 4.4293025039305576e-07, + "loss": 0.6734, + "step": 22516 + }, + { + "epoch": 1.6267451731175608, + "grad_norm": 6.18307614404249, + "learning_rate": 4.4276403810480816e-07, + "loss": 0.5556, + "step": 22517 + }, + { + "epoch": 1.6268174183177706, + "grad_norm": 7.140043740018334, + "learning_rate": 4.425978539785233e-07, + "loss": 0.6134, + "step": 22518 + }, + { + "epoch": 1.62688966351798, + "grad_norm": 8.314127597794151, + "learning_rate": 4.424316980164772e-07, + "loss": 0.6227, + "step": 22519 + }, + { + "epoch": 1.6269619087181897, + "grad_norm": 6.710908510014509, + "learning_rate": 4.42265570220943e-07, + "loss": 0.6139, + "step": 22520 + }, + { + "epoch": 1.627034153918399, + "grad_norm": 7.369671723341173, + "learning_rate": 4.4209947059419526e-07, + "loss": 0.6394, + "step": 22521 + }, + { + "epoch": 1.6271063991186085, + "grad_norm": 7.280979135083917, + "learning_rate": 4.4193339913850856e-07, + "loss": 0.5938, + "step": 22522 + }, + { + "epoch": 1.627178644318818, + "grad_norm": 6.886346505112603, + "learning_rate": 4.4176735585615444e-07, + "loss": 0.5462, + "step": 22523 + }, + { + "epoch": 1.6272508895190276, + "grad_norm": 6.655215474000154, + "learning_rate": 4.416013407494077e-07, + "loss": 0.6523, + "step": 22524 + }, + { + "epoch": 1.6273231347192372, + "grad_norm": 7.39310908760321, + "learning_rate": 4.4143535382054045e-07, + "loss": 0.609, + "step": 22525 + }, + { + "epoch": 1.6273953799194465, + "grad_norm": 8.58904345550008, + "learning_rate": 4.4126939507182554e-07, + "loss": 0.6777, + "step": 22526 + }, + { + "epoch": 1.6274676251196563, + "grad_norm": 8.492536072653003, + "learning_rate": 4.4110346450553316e-07, + "loss": 0.5202, + "step": 22527 + }, + { + "epoch": 1.6275398703198656, + "grad_norm": 6.325294445557301, + "learning_rate": 4.4093756212393593e-07, + "loss": 0.5481, + "step": 22528 + }, + { + "epoch": 1.6276121155200751, + "grad_norm": 7.860372612738151, + "learning_rate": 4.4077168792930476e-07, + "loss": 0.8134, + "step": 22529 + }, + { + "epoch": 1.6276843607202847, + "grad_norm": 8.523423625819367, + "learning_rate": 4.406058419239109e-07, + "loss": 0.5906, + "step": 22530 + }, + { + "epoch": 1.6277566059204942, + "grad_norm": 8.20011849666668, + "learning_rate": 4.404400241100232e-07, + "loss": 0.6815, + "step": 22531 + }, + { + "epoch": 1.6278288511207037, + "grad_norm": 7.122719650764634, + "learning_rate": 4.4027423448991276e-07, + "loss": 0.6943, + "step": 22532 + }, + { + "epoch": 1.627901096320913, + "grad_norm": 9.341626388458435, + "learning_rate": 4.4010847306584867e-07, + "loss": 0.6108, + "step": 22533 + }, + { + "epoch": 1.6279733415211228, + "grad_norm": 7.866257674151195, + "learning_rate": 4.3994273984009987e-07, + "loss": 0.6623, + "step": 22534 + }, + { + "epoch": 1.6280455867213321, + "grad_norm": 8.06340080965764, + "learning_rate": 4.3977703481493577e-07, + "loss": 0.6393, + "step": 22535 + }, + { + "epoch": 1.6281178319215417, + "grad_norm": 8.369376500832981, + "learning_rate": 4.396113579926242e-07, + "loss": 0.641, + "step": 22536 + }, + { + "epoch": 1.6281900771217512, + "grad_norm": 7.120828042903547, + "learning_rate": 4.3944570937543426e-07, + "loss": 0.6717, + "step": 22537 + }, + { + "epoch": 1.6282623223219608, + "grad_norm": 6.549366624799314, + "learning_rate": 4.392800889656318e-07, + "loss": 0.5759, + "step": 22538 + }, + { + "epoch": 1.6283345675221703, + "grad_norm": 6.542545170524206, + "learning_rate": 4.3911449676548477e-07, + "loss": 0.6184, + "step": 22539 + }, + { + "epoch": 1.6284068127223796, + "grad_norm": 6.566986294372672, + "learning_rate": 4.3894893277726114e-07, + "loss": 0.6112, + "step": 22540 + }, + { + "epoch": 1.6284790579225894, + "grad_norm": 7.633491165772689, + "learning_rate": 4.387833970032254e-07, + "loss": 0.5804, + "step": 22541 + }, + { + "epoch": 1.6285513031227987, + "grad_norm": 8.099591713493524, + "learning_rate": 4.3861788944564443e-07, + "loss": 0.5676, + "step": 22542 + }, + { + "epoch": 1.6286235483230083, + "grad_norm": 7.252313442099092, + "learning_rate": 4.3845241010678356e-07, + "loss": 0.6055, + "step": 22543 + }, + { + "epoch": 1.6286957935232178, + "grad_norm": 7.314768121555129, + "learning_rate": 4.3828695898890966e-07, + "loss": 0.5919, + "step": 22544 + }, + { + "epoch": 1.6287680387234273, + "grad_norm": 7.619014704724016, + "learning_rate": 4.3812153609428585e-07, + "loss": 0.7499, + "step": 22545 + }, + { + "epoch": 1.628840283923637, + "grad_norm": 6.568143615899551, + "learning_rate": 4.3795614142517736e-07, + "loss": 0.5762, + "step": 22546 + }, + { + "epoch": 1.6289125291238462, + "grad_norm": 6.997576975397364, + "learning_rate": 4.3779077498384863e-07, + "loss": 0.5744, + "step": 22547 + }, + { + "epoch": 1.628984774324056, + "grad_norm": 7.795446475237388, + "learning_rate": 4.376254367725627e-07, + "loss": 0.6439, + "step": 22548 + }, + { + "epoch": 1.6290570195242653, + "grad_norm": 7.239898221053989, + "learning_rate": 4.37460126793583e-07, + "loss": 0.5944, + "step": 22549 + }, + { + "epoch": 1.6291292647244748, + "grad_norm": 7.826968779617024, + "learning_rate": 4.372948450491726e-07, + "loss": 0.5786, + "step": 22550 + }, + { + "epoch": 1.6292015099246844, + "grad_norm": 6.9253954061831315, + "learning_rate": 4.3712959154159434e-07, + "loss": 0.6482, + "step": 22551 + }, + { + "epoch": 1.629273755124894, + "grad_norm": 6.770795647931983, + "learning_rate": 4.3696436627311045e-07, + "loss": 0.6086, + "step": 22552 + }, + { + "epoch": 1.6293460003251035, + "grad_norm": 6.355657217165615, + "learning_rate": 4.367991692459822e-07, + "loss": 0.5311, + "step": 22553 + }, + { + "epoch": 1.6294182455253128, + "grad_norm": 6.212186944392456, + "learning_rate": 4.366340004624714e-07, + "loss": 0.5345, + "step": 22554 + }, + { + "epoch": 1.6294904907255225, + "grad_norm": 9.673473022770345, + "learning_rate": 4.3646885992483984e-07, + "loss": 0.6218, + "step": 22555 + }, + { + "epoch": 1.6295627359257319, + "grad_norm": 7.945902064532423, + "learning_rate": 4.3630374763534655e-07, + "loss": 0.59, + "step": 22556 + }, + { + "epoch": 1.6296349811259414, + "grad_norm": 7.256370113669954, + "learning_rate": 4.361386635962528e-07, + "loss": 0.5632, + "step": 22557 + }, + { + "epoch": 1.629707226326151, + "grad_norm": 7.449583863790806, + "learning_rate": 4.359736078098187e-07, + "loss": 0.709, + "step": 22558 + }, + { + "epoch": 1.6297794715263605, + "grad_norm": 6.900648788688929, + "learning_rate": 4.358085802783027e-07, + "loss": 0.5452, + "step": 22559 + }, + { + "epoch": 1.62985171672657, + "grad_norm": 8.198845144686109, + "learning_rate": 4.356435810039644e-07, + "loss": 0.6198, + "step": 22560 + }, + { + "epoch": 1.6299239619267794, + "grad_norm": 7.768637857525407, + "learning_rate": 4.3547860998906287e-07, + "loss": 0.6098, + "step": 22561 + }, + { + "epoch": 1.6299962071269891, + "grad_norm": 9.197389431304636, + "learning_rate": 4.35313667235856e-07, + "loss": 0.6602, + "step": 22562 + }, + { + "epoch": 1.6300684523271984, + "grad_norm": 7.295406871846234, + "learning_rate": 4.3514875274660195e-07, + "loss": 0.6223, + "step": 22563 + }, + { + "epoch": 1.630140697527408, + "grad_norm": 6.606769454007624, + "learning_rate": 4.349838665235581e-07, + "loss": 0.6598, + "step": 22564 + }, + { + "epoch": 1.6302129427276175, + "grad_norm": 8.12316920854925, + "learning_rate": 4.3481900856898266e-07, + "loss": 0.6306, + "step": 22565 + }, + { + "epoch": 1.630285187927827, + "grad_norm": 7.596913373466561, + "learning_rate": 4.346541788851305e-07, + "loss": 0.6136, + "step": 22566 + }, + { + "epoch": 1.6303574331280366, + "grad_norm": 6.109398414976709, + "learning_rate": 4.3448937747425924e-07, + "loss": 0.5938, + "step": 22567 + }, + { + "epoch": 1.630429678328246, + "grad_norm": 7.366255408957797, + "learning_rate": 4.3432460433862457e-07, + "loss": 0.6169, + "step": 22568 + }, + { + "epoch": 1.6305019235284557, + "grad_norm": 7.567754008047458, + "learning_rate": 4.341598594804827e-07, + "loss": 0.5855, + "step": 22569 + }, + { + "epoch": 1.630574168728665, + "grad_norm": 5.918730613088629, + "learning_rate": 4.33995142902088e-07, + "loss": 0.5712, + "step": 22570 + }, + { + "epoch": 1.6306464139288745, + "grad_norm": 6.951410043510423, + "learning_rate": 4.3383045460569486e-07, + "loss": 0.5393, + "step": 22571 + }, + { + "epoch": 1.630718659129084, + "grad_norm": 7.029978726220534, + "learning_rate": 4.3366579459355964e-07, + "loss": 0.6562, + "step": 22572 + }, + { + "epoch": 1.6307909043292936, + "grad_norm": 8.731311567959155, + "learning_rate": 4.3350116286793475e-07, + "loss": 0.6226, + "step": 22573 + }, + { + "epoch": 1.6308631495295032, + "grad_norm": 6.942268451130264, + "learning_rate": 4.333365594310743e-07, + "loss": 0.6302, + "step": 22574 + }, + { + "epoch": 1.6309353947297125, + "grad_norm": 8.013809682659229, + "learning_rate": 4.3317198428523194e-07, + "loss": 0.586, + "step": 22575 + }, + { + "epoch": 1.6310076399299223, + "grad_norm": 8.978970011411127, + "learning_rate": 4.330074374326607e-07, + "loss": 0.5961, + "step": 22576 + }, + { + "epoch": 1.6310798851301316, + "grad_norm": 8.20836002971613, + "learning_rate": 4.3284291887561226e-07, + "loss": 0.6406, + "step": 22577 + }, + { + "epoch": 1.6311521303303413, + "grad_norm": 6.604270316512962, + "learning_rate": 4.3267842861633907e-07, + "loss": 0.603, + "step": 22578 + }, + { + "epoch": 1.6312243755305507, + "grad_norm": 7.851518287344423, + "learning_rate": 4.325139666570932e-07, + "loss": 0.6189, + "step": 22579 + }, + { + "epoch": 1.6312966207307602, + "grad_norm": 6.540558382980893, + "learning_rate": 4.3234953300012566e-07, + "loss": 0.5671, + "step": 22580 + }, + { + "epoch": 1.6313688659309697, + "grad_norm": 6.488677507089994, + "learning_rate": 4.3218512764768767e-07, + "loss": 0.6308, + "step": 22581 + }, + { + "epoch": 1.631441111131179, + "grad_norm": 7.94280564212079, + "learning_rate": 4.3202075060202993e-07, + "loss": 0.5574, + "step": 22582 + }, + { + "epoch": 1.6315133563313888, + "grad_norm": 6.668912350395367, + "learning_rate": 4.3185640186540284e-07, + "loss": 0.5661, + "step": 22583 + }, + { + "epoch": 1.6315856015315982, + "grad_norm": 7.879472945736148, + "learning_rate": 4.316920814400549e-07, + "loss": 0.6464, + "step": 22584 + }, + { + "epoch": 1.631657846731808, + "grad_norm": 7.3180176874297835, + "learning_rate": 4.3152778932823683e-07, + "loss": 0.5654, + "step": 22585 + }, + { + "epoch": 1.6317300919320172, + "grad_norm": 7.054185412209831, + "learning_rate": 4.3136352553219715e-07, + "loss": 0.6264, + "step": 22586 + }, + { + "epoch": 1.6318023371322268, + "grad_norm": 6.885962883487223, + "learning_rate": 4.311992900541853e-07, + "loss": 0.5466, + "step": 22587 + }, + { + "epoch": 1.6318745823324363, + "grad_norm": 6.841558793673772, + "learning_rate": 4.3103508289644824e-07, + "loss": 0.5499, + "step": 22588 + }, + { + "epoch": 1.6319468275326456, + "grad_norm": 6.323500613083081, + "learning_rate": 4.308709040612341e-07, + "loss": 0.6053, + "step": 22589 + }, + { + "epoch": 1.6320190727328554, + "grad_norm": 6.967530832431164, + "learning_rate": 4.307067535507911e-07, + "loss": 0.6346, + "step": 22590 + }, + { + "epoch": 1.6320913179330647, + "grad_norm": 7.247794506435701, + "learning_rate": 4.305426313673658e-07, + "loss": 0.6143, + "step": 22591 + }, + { + "epoch": 1.6321635631332745, + "grad_norm": 8.72479497892738, + "learning_rate": 4.30378537513205e-07, + "loss": 0.6659, + "step": 22592 + }, + { + "epoch": 1.6322358083334838, + "grad_norm": 7.299114528933913, + "learning_rate": 4.3021447199055517e-07, + "loss": 0.6511, + "step": 22593 + }, + { + "epoch": 1.6323080535336933, + "grad_norm": 6.817488777310878, + "learning_rate": 4.3005043480166264e-07, + "loss": 0.5536, + "step": 22594 + }, + { + "epoch": 1.632380298733903, + "grad_norm": 5.809192988277187, + "learning_rate": 4.29886425948772e-07, + "loss": 0.5705, + "step": 22595 + }, + { + "epoch": 1.6324525439341124, + "grad_norm": 7.519138839192733, + "learning_rate": 4.2972244543412867e-07, + "loss": 0.6259, + "step": 22596 + }, + { + "epoch": 1.632524789134322, + "grad_norm": 6.338091050374852, + "learning_rate": 4.295584932599783e-07, + "loss": 0.5672, + "step": 22597 + }, + { + "epoch": 1.6325970343345313, + "grad_norm": 7.764778075673066, + "learning_rate": 4.2939456942856396e-07, + "loss": 0.6666, + "step": 22598 + }, + { + "epoch": 1.632669279534741, + "grad_norm": 6.741278771682644, + "learning_rate": 4.2923067394213033e-07, + "loss": 0.5447, + "step": 22599 + }, + { + "epoch": 1.6327415247349504, + "grad_norm": 7.0075424975856375, + "learning_rate": 4.2906680680292024e-07, + "loss": 0.6107, + "step": 22600 + }, + { + "epoch": 1.63281376993516, + "grad_norm": 6.919897895015605, + "learning_rate": 4.289029680131787e-07, + "loss": 0.6297, + "step": 22601 + }, + { + "epoch": 1.6328860151353695, + "grad_norm": 7.697657627673554, + "learning_rate": 4.287391575751468e-07, + "loss": 0.6494, + "step": 22602 + }, + { + "epoch": 1.632958260335579, + "grad_norm": 7.221036474588256, + "learning_rate": 4.2857537549106766e-07, + "loss": 0.5569, + "step": 22603 + }, + { + "epoch": 1.6330305055357885, + "grad_norm": 7.611943306948833, + "learning_rate": 4.2841162176318395e-07, + "loss": 0.6668, + "step": 22604 + }, + { + "epoch": 1.6331027507359979, + "grad_norm": 7.659911731978327, + "learning_rate": 4.2824789639373615e-07, + "loss": 0.6277, + "step": 22605 + }, + { + "epoch": 1.6331749959362076, + "grad_norm": 7.12805565345008, + "learning_rate": 4.2808419938496577e-07, + "loss": 0.5559, + "step": 22606 + }, + { + "epoch": 1.633247241136417, + "grad_norm": 7.404152875067024, + "learning_rate": 4.2792053073911407e-07, + "loss": 0.602, + "step": 22607 + }, + { + "epoch": 1.6333194863366265, + "grad_norm": 8.280906814085895, + "learning_rate": 4.2775689045842144e-07, + "loss": 0.6385, + "step": 22608 + }, + { + "epoch": 1.633391731536836, + "grad_norm": 7.021167358542321, + "learning_rate": 4.2759327854512813e-07, + "loss": 0.548, + "step": 22609 + }, + { + "epoch": 1.6334639767370456, + "grad_norm": 7.032708182130125, + "learning_rate": 4.274296950014734e-07, + "loss": 0.6544, + "step": 22610 + }, + { + "epoch": 1.6335362219372551, + "grad_norm": 6.603256531906956, + "learning_rate": 4.2726613982969705e-07, + "loss": 0.6386, + "step": 22611 + }, + { + "epoch": 1.6336084671374644, + "grad_norm": 6.819458653507938, + "learning_rate": 4.2710261303203855e-07, + "loss": 0.5781, + "step": 22612 + }, + { + "epoch": 1.6336807123376742, + "grad_norm": 7.064559771191846, + "learning_rate": 4.2693911461073516e-07, + "loss": 0.6617, + "step": 22613 + }, + { + "epoch": 1.6337529575378835, + "grad_norm": 8.031996636621752, + "learning_rate": 4.2677564456802574e-07, + "loss": 0.5847, + "step": 22614 + }, + { + "epoch": 1.633825202738093, + "grad_norm": 7.245545959331104, + "learning_rate": 4.266122029061484e-07, + "loss": 0.6727, + "step": 22615 + }, + { + "epoch": 1.6338974479383026, + "grad_norm": 6.613359120739118, + "learning_rate": 4.2644878962733977e-07, + "loss": 0.6537, + "step": 22616 + }, + { + "epoch": 1.6339696931385121, + "grad_norm": 7.686929821175916, + "learning_rate": 4.262854047338369e-07, + "loss": 0.5441, + "step": 22617 + }, + { + "epoch": 1.6340419383387217, + "grad_norm": 7.591362220641973, + "learning_rate": 4.261220482278769e-07, + "loss": 0.598, + "step": 22618 + }, + { + "epoch": 1.634114183538931, + "grad_norm": 6.614277349488531, + "learning_rate": 4.2595872011169575e-07, + "loss": 0.583, + "step": 22619 + }, + { + "epoch": 1.6341864287391408, + "grad_norm": 7.0371727984236, + "learning_rate": 4.257954203875292e-07, + "loss": 0.5789, + "step": 22620 + }, + { + "epoch": 1.63425867393935, + "grad_norm": 6.893106096065208, + "learning_rate": 4.256321490576129e-07, + "loss": 0.6051, + "step": 22621 + }, + { + "epoch": 1.6343309191395596, + "grad_norm": 7.518331631177054, + "learning_rate": 4.254689061241826e-07, + "loss": 0.6515, + "step": 22622 + }, + { + "epoch": 1.6344031643397692, + "grad_norm": 7.758870862237931, + "learning_rate": 4.2530569158947154e-07, + "loss": 0.5791, + "step": 22623 + }, + { + "epoch": 1.6344754095399787, + "grad_norm": 6.829608118926405, + "learning_rate": 4.2514250545571457e-07, + "loss": 0.5594, + "step": 22624 + }, + { + "epoch": 1.6345476547401883, + "grad_norm": 7.1621706262720375, + "learning_rate": 4.249793477251457e-07, + "loss": 0.6328, + "step": 22625 + }, + { + "epoch": 1.6346198999403976, + "grad_norm": 8.285406717689789, + "learning_rate": 4.2481621839999903e-07, + "loss": 0.624, + "step": 22626 + }, + { + "epoch": 1.6346921451406073, + "grad_norm": 7.105069040691867, + "learning_rate": 4.2465311748250667e-07, + "loss": 0.5616, + "step": 22627 + }, + { + "epoch": 1.6347643903408167, + "grad_norm": 6.560570714827183, + "learning_rate": 4.2449004497490095e-07, + "loss": 0.6214, + "step": 22628 + }, + { + "epoch": 1.6348366355410262, + "grad_norm": 7.0273991190842615, + "learning_rate": 4.2432700087941593e-07, + "loss": 0.5944, + "step": 22629 + }, + { + "epoch": 1.6349088807412357, + "grad_norm": 8.56749807793557, + "learning_rate": 4.241639851982823e-07, + "loss": 0.6405, + "step": 22630 + }, + { + "epoch": 1.6349811259414453, + "grad_norm": 6.174339092268772, + "learning_rate": 4.24000997933732e-07, + "loss": 0.6271, + "step": 22631 + }, + { + "epoch": 1.6350533711416548, + "grad_norm": 6.293921599184382, + "learning_rate": 4.238380390879959e-07, + "loss": 0.6058, + "step": 22632 + }, + { + "epoch": 1.6351256163418642, + "grad_norm": 6.703713984722562, + "learning_rate": 4.2367510866330566e-07, + "loss": 0.6585, + "step": 22633 + }, + { + "epoch": 1.635197861542074, + "grad_norm": 7.553542704271546, + "learning_rate": 4.2351220666189046e-07, + "loss": 0.6033, + "step": 22634 + }, + { + "epoch": 1.6352701067422832, + "grad_norm": 7.434515346182103, + "learning_rate": 4.2334933308598084e-07, + "loss": 0.5682, + "step": 22635 + }, + { + "epoch": 1.6353423519424928, + "grad_norm": 7.076550036903907, + "learning_rate": 4.2318648793780643e-07, + "loss": 0.6453, + "step": 22636 + }, + { + "epoch": 1.6354145971427023, + "grad_norm": 8.958309250436985, + "learning_rate": 4.230236712195965e-07, + "loss": 0.7079, + "step": 22637 + }, + { + "epoch": 1.6354868423429119, + "grad_norm": 6.618080988600826, + "learning_rate": 4.2286088293358006e-07, + "loss": 0.5706, + "step": 22638 + }, + { + "epoch": 1.6355590875431214, + "grad_norm": 6.983286932864692, + "learning_rate": 4.2269812308198486e-07, + "loss": 0.611, + "step": 22639 + }, + { + "epoch": 1.6356313327433307, + "grad_norm": 6.230628855211686, + "learning_rate": 4.225353916670405e-07, + "loss": 0.6472, + "step": 22640 + }, + { + "epoch": 1.6357035779435405, + "grad_norm": 7.241708947494687, + "learning_rate": 4.2237268869097263e-07, + "loss": 0.5852, + "step": 22641 + }, + { + "epoch": 1.6357758231437498, + "grad_norm": 7.365398816862894, + "learning_rate": 4.2221001415600983e-07, + "loss": 0.6122, + "step": 22642 + }, + { + "epoch": 1.6358480683439593, + "grad_norm": 7.2391030863549, + "learning_rate": 4.220473680643783e-07, + "loss": 0.5783, + "step": 22643 + }, + { + "epoch": 1.635920313544169, + "grad_norm": 6.990192900303057, + "learning_rate": 4.2188475041830573e-07, + "loss": 0.6018, + "step": 22644 + }, + { + "epoch": 1.6359925587443784, + "grad_norm": 6.794068487119188, + "learning_rate": 4.217221612200165e-07, + "loss": 0.5587, + "step": 22645 + }, + { + "epoch": 1.636064803944588, + "grad_norm": 7.188138717798813, + "learning_rate": 4.215596004717373e-07, + "loss": 0.621, + "step": 22646 + }, + { + "epoch": 1.6361370491447973, + "grad_norm": 6.60722788752333, + "learning_rate": 4.213970681756935e-07, + "loss": 0.5493, + "step": 22647 + }, + { + "epoch": 1.636209294345007, + "grad_norm": 6.3258649392968085, + "learning_rate": 4.212345643341098e-07, + "loss": 0.6356, + "step": 22648 + }, + { + "epoch": 1.6362815395452164, + "grad_norm": 6.368489849553597, + "learning_rate": 4.210720889492109e-07, + "loss": 0.5936, + "step": 22649 + }, + { + "epoch": 1.636353784745426, + "grad_norm": 6.965615700963258, + "learning_rate": 4.209096420232209e-07, + "loss": 0.6623, + "step": 22650 + }, + { + "epoch": 1.6364260299456355, + "grad_norm": 7.082098580243922, + "learning_rate": 4.2074722355836427e-07, + "loss": 0.5752, + "step": 22651 + }, + { + "epoch": 1.636498275145845, + "grad_norm": 6.8122085368951, + "learning_rate": 4.2058483355686295e-07, + "loss": 0.6496, + "step": 22652 + }, + { + "epoch": 1.6365705203460545, + "grad_norm": 7.867604249373239, + "learning_rate": 4.2042247202094074e-07, + "loss": 0.6532, + "step": 22653 + }, + { + "epoch": 1.6366427655462639, + "grad_norm": 8.200212001386049, + "learning_rate": 4.2026013895282075e-07, + "loss": 0.6184, + "step": 22654 + }, + { + "epoch": 1.6367150107464736, + "grad_norm": 6.390470395631405, + "learning_rate": 4.2009783435472436e-07, + "loss": 0.6231, + "step": 22655 + }, + { + "epoch": 1.636787255946683, + "grad_norm": 9.23191095774004, + "learning_rate": 4.1993555822887277e-07, + "loss": 0.5747, + "step": 22656 + }, + { + "epoch": 1.6368595011468927, + "grad_norm": 7.96124871930618, + "learning_rate": 4.19773310577489e-07, + "loss": 0.6101, + "step": 22657 + }, + { + "epoch": 1.636931746347102, + "grad_norm": 6.721265313143647, + "learning_rate": 4.19611091402794e-07, + "loss": 0.6408, + "step": 22658 + }, + { + "epoch": 1.6370039915473116, + "grad_norm": 7.169658509532414, + "learning_rate": 4.194489007070071e-07, + "loss": 0.5261, + "step": 22659 + }, + { + "epoch": 1.6370762367475211, + "grad_norm": 7.316786854422293, + "learning_rate": 4.192867384923496e-07, + "loss": 0.5846, + "step": 22660 + }, + { + "epoch": 1.6371484819477304, + "grad_norm": 6.764093168369346, + "learning_rate": 4.191246047610409e-07, + "loss": 0.6057, + "step": 22661 + }, + { + "epoch": 1.6372207271479402, + "grad_norm": 6.495316799034842, + "learning_rate": 4.1896249951530133e-07, + "loss": 0.5886, + "step": 22662 + }, + { + "epoch": 1.6372929723481495, + "grad_norm": 7.319007521088498, + "learning_rate": 4.1880042275734864e-07, + "loss": 0.641, + "step": 22663 + }, + { + "epoch": 1.6373652175483593, + "grad_norm": 6.627162652338981, + "learning_rate": 4.186383744894024e-07, + "loss": 0.5534, + "step": 22664 + }, + { + "epoch": 1.6374374627485686, + "grad_norm": 6.6668522173173175, + "learning_rate": 4.184763547136805e-07, + "loss": 0.5762, + "step": 22665 + }, + { + "epoch": 1.6375097079487781, + "grad_norm": 7.537711383139302, + "learning_rate": 4.1831436343240136e-07, + "loss": 0.6112, + "step": 22666 + }, + { + "epoch": 1.6375819531489877, + "grad_norm": 6.506619823839596, + "learning_rate": 4.1815240064778215e-07, + "loss": 0.6023, + "step": 22667 + }, + { + "epoch": 1.637654198349197, + "grad_norm": 6.95397313120854, + "learning_rate": 4.1799046636204014e-07, + "loss": 0.6138, + "step": 22668 + }, + { + "epoch": 1.6377264435494068, + "grad_norm": 7.049800020452293, + "learning_rate": 4.178285605773924e-07, + "loss": 0.6113, + "step": 22669 + }, + { + "epoch": 1.637798688749616, + "grad_norm": 6.899526784398537, + "learning_rate": 4.1766668329605463e-07, + "loss": 0.5997, + "step": 22670 + }, + { + "epoch": 1.6378709339498259, + "grad_norm": 7.834080058549921, + "learning_rate": 4.175048345202432e-07, + "loss": 0.629, + "step": 22671 + }, + { + "epoch": 1.6379431791500352, + "grad_norm": 6.8143338876068515, + "learning_rate": 4.1734301425217423e-07, + "loss": 0.6229, + "step": 22672 + }, + { + "epoch": 1.6380154243502447, + "grad_norm": 6.886835349356799, + "learning_rate": 4.1718122249406163e-07, + "loss": 0.5424, + "step": 22673 + }, + { + "epoch": 1.6380876695504543, + "grad_norm": 7.0741216862474126, + "learning_rate": 4.1701945924812084e-07, + "loss": 0.589, + "step": 22674 + }, + { + "epoch": 1.6381599147506638, + "grad_norm": 6.547236539558628, + "learning_rate": 4.168577245165664e-07, + "loss": 0.5452, + "step": 22675 + }, + { + "epoch": 1.6382321599508733, + "grad_norm": 7.701470806793377, + "learning_rate": 4.166960183016122e-07, + "loss": 0.615, + "step": 22676 + }, + { + "epoch": 1.6383044051510827, + "grad_norm": 5.661586878164452, + "learning_rate": 4.1653434060547197e-07, + "loss": 0.5292, + "step": 22677 + }, + { + "epoch": 1.6383766503512924, + "grad_norm": 6.21026216110231, + "learning_rate": 4.163726914303587e-07, + "loss": 0.6089, + "step": 22678 + }, + { + "epoch": 1.6384488955515017, + "grad_norm": 6.432479530298938, + "learning_rate": 4.162110707784864e-07, + "loss": 0.5713, + "step": 22679 + }, + { + "epoch": 1.6385211407517113, + "grad_norm": 7.358047657220476, + "learning_rate": 4.160494786520658e-07, + "loss": 0.65, + "step": 22680 + }, + { + "epoch": 1.6385933859519208, + "grad_norm": 6.681389985433508, + "learning_rate": 4.158879150533096e-07, + "loss": 0.6199, + "step": 22681 + }, + { + "epoch": 1.6386656311521304, + "grad_norm": 6.595242733598809, + "learning_rate": 4.157263799844299e-07, + "loss": 0.5397, + "step": 22682 + }, + { + "epoch": 1.63873787635234, + "grad_norm": 5.884328699968041, + "learning_rate": 4.155648734476384e-07, + "loss": 0.5863, + "step": 22683 + }, + { + "epoch": 1.6388101215525492, + "grad_norm": 9.114468284244133, + "learning_rate": 4.1540339544514446e-07, + "loss": 0.5954, + "step": 22684 + }, + { + "epoch": 1.638882366752759, + "grad_norm": 7.584713680971382, + "learning_rate": 4.15241945979159e-07, + "loss": 0.6269, + "step": 22685 + }, + { + "epoch": 1.6389546119529683, + "grad_norm": 8.328467614317436, + "learning_rate": 4.15080525051893e-07, + "loss": 0.5806, + "step": 22686 + }, + { + "epoch": 1.6390268571531779, + "grad_norm": 6.856923701553473, + "learning_rate": 4.149191326655566e-07, + "loss": 0.5268, + "step": 22687 + }, + { + "epoch": 1.6390991023533874, + "grad_norm": 6.928769503474292, + "learning_rate": 4.1475776882235755e-07, + "loss": 0.578, + "step": 22688 + }, + { + "epoch": 1.639171347553597, + "grad_norm": 7.663353439023708, + "learning_rate": 4.145964335245056e-07, + "loss": 0.6023, + "step": 22689 + }, + { + "epoch": 1.6392435927538065, + "grad_norm": 6.945767811460635, + "learning_rate": 4.1443512677421e-07, + "loss": 0.6274, + "step": 22690 + }, + { + "epoch": 1.6393158379540158, + "grad_norm": 7.502709217300457, + "learning_rate": 4.1427384857367747e-07, + "loss": 0.5946, + "step": 22691 + }, + { + "epoch": 1.6393880831542256, + "grad_norm": 6.431284153498204, + "learning_rate": 4.1411259892511657e-07, + "loss": 0.6118, + "step": 22692 + }, + { + "epoch": 1.639460328354435, + "grad_norm": 6.616733215663574, + "learning_rate": 4.1395137783073444e-07, + "loss": 0.5625, + "step": 22693 + }, + { + "epoch": 1.6395325735546444, + "grad_norm": 7.890586082438062, + "learning_rate": 4.137901852927384e-07, + "loss": 0.6262, + "step": 22694 + }, + { + "epoch": 1.639604818754854, + "grad_norm": 7.709605866289495, + "learning_rate": 4.136290213133348e-07, + "loss": 0.5669, + "step": 22695 + }, + { + "epoch": 1.6396770639550635, + "grad_norm": 6.723991301377188, + "learning_rate": 4.1346788589472997e-07, + "loss": 0.589, + "step": 22696 + }, + { + "epoch": 1.639749309155273, + "grad_norm": 6.564164886002999, + "learning_rate": 4.133067790391301e-07, + "loss": 0.6309, + "step": 22697 + }, + { + "epoch": 1.6398215543554824, + "grad_norm": 7.838527688724732, + "learning_rate": 4.1314570074873994e-07, + "loss": 0.626, + "step": 22698 + }, + { + "epoch": 1.6398937995556921, + "grad_norm": 6.34962169091008, + "learning_rate": 4.129846510257646e-07, + "loss": 0.5612, + "step": 22699 + }, + { + "epoch": 1.6399660447559015, + "grad_norm": 7.97600557211908, + "learning_rate": 4.1282362987240894e-07, + "loss": 0.6699, + "step": 22700 + }, + { + "epoch": 1.640038289956111, + "grad_norm": 6.858776301018434, + "learning_rate": 4.126626372908779e-07, + "loss": 0.6297, + "step": 22701 + }, + { + "epoch": 1.6401105351563205, + "grad_norm": 7.028568553370155, + "learning_rate": 4.125016732833739e-07, + "loss": 0.6967, + "step": 22702 + }, + { + "epoch": 1.64018278035653, + "grad_norm": 6.603976594935439, + "learning_rate": 4.123407378521013e-07, + "loss": 0.6679, + "step": 22703 + }, + { + "epoch": 1.6402550255567396, + "grad_norm": 7.614299332262231, + "learning_rate": 4.121798309992631e-07, + "loss": 0.5765, + "step": 22704 + }, + { + "epoch": 1.640327270756949, + "grad_norm": 6.8917686568635315, + "learning_rate": 4.1201895272706155e-07, + "loss": 0.6244, + "step": 22705 + }, + { + "epoch": 1.6403995159571587, + "grad_norm": 7.081827639447584, + "learning_rate": 4.1185810303769973e-07, + "loss": 0.5944, + "step": 22706 + }, + { + "epoch": 1.640471761157368, + "grad_norm": 7.365610384693948, + "learning_rate": 4.1169728193337895e-07, + "loss": 0.6669, + "step": 22707 + }, + { + "epoch": 1.6405440063575776, + "grad_norm": 7.29379697652168, + "learning_rate": 4.115364894163015e-07, + "loss": 0.5726, + "step": 22708 + }, + { + "epoch": 1.6406162515577871, + "grad_norm": 6.884873669364982, + "learning_rate": 4.1137572548866747e-07, + "loss": 0.5948, + "step": 22709 + }, + { + "epoch": 1.6406884967579967, + "grad_norm": 5.853454365503604, + "learning_rate": 4.1121499015267794e-07, + "loss": 0.5513, + "step": 22710 + }, + { + "epoch": 1.6407607419582062, + "grad_norm": 7.2016001512698224, + "learning_rate": 4.1105428341053383e-07, + "loss": 0.5296, + "step": 22711 + }, + { + "epoch": 1.6408329871584155, + "grad_norm": 7.810407434598632, + "learning_rate": 4.108936052644341e-07, + "loss": 0.5724, + "step": 22712 + }, + { + "epoch": 1.6409052323586253, + "grad_norm": 8.272326228189138, + "learning_rate": 4.1073295571657844e-07, + "loss": 0.6527, + "step": 22713 + }, + { + "epoch": 1.6409774775588346, + "grad_norm": 7.450936369967646, + "learning_rate": 4.1057233476916674e-07, + "loss": 0.5927, + "step": 22714 + }, + { + "epoch": 1.6410497227590441, + "grad_norm": 7.454860141396565, + "learning_rate": 4.104117424243981e-07, + "loss": 0.5959, + "step": 22715 + }, + { + "epoch": 1.6411219679592537, + "grad_norm": 7.064711233016407, + "learning_rate": 4.1025117868446985e-07, + "loss": 0.63, + "step": 22716 + }, + { + "epoch": 1.6411942131594632, + "grad_norm": 6.371386831759009, + "learning_rate": 4.100906435515803e-07, + "loss": 0.5838, + "step": 22717 + }, + { + "epoch": 1.6412664583596728, + "grad_norm": 7.20939234486359, + "learning_rate": 4.099301370279268e-07, + "loss": 0.5834, + "step": 22718 + }, + { + "epoch": 1.641338703559882, + "grad_norm": 6.626335153455337, + "learning_rate": 4.0976965911570796e-07, + "loss": 0.6133, + "step": 22719 + }, + { + "epoch": 1.6414109487600919, + "grad_norm": 8.658887550844165, + "learning_rate": 4.0960920981711867e-07, + "loss": 0.6507, + "step": 22720 + }, + { + "epoch": 1.6414831939603012, + "grad_norm": 9.33656795400044, + "learning_rate": 4.0944878913435626e-07, + "loss": 0.6136, + "step": 22721 + }, + { + "epoch": 1.6415554391605107, + "grad_norm": 7.61132662143942, + "learning_rate": 4.092883970696165e-07, + "loss": 0.5669, + "step": 22722 + }, + { + "epoch": 1.6416276843607203, + "grad_norm": 6.6058279489118545, + "learning_rate": 4.091280336250955e-07, + "loss": 0.5856, + "step": 22723 + }, + { + "epoch": 1.6416999295609298, + "grad_norm": 7.568722521529651, + "learning_rate": 4.0896769880298835e-07, + "loss": 0.579, + "step": 22724 + }, + { + "epoch": 1.6417721747611393, + "grad_norm": 6.95975478343223, + "learning_rate": 4.0880739260548973e-07, + "loss": 0.629, + "step": 22725 + }, + { + "epoch": 1.6418444199613487, + "grad_norm": 7.61281399885753, + "learning_rate": 4.086471150347948e-07, + "loss": 0.562, + "step": 22726 + }, + { + "epoch": 1.6419166651615584, + "grad_norm": 7.239166320938952, + "learning_rate": 4.0848686609309627e-07, + "loss": 0.6203, + "step": 22727 + }, + { + "epoch": 1.6419889103617678, + "grad_norm": 6.532828838306973, + "learning_rate": 4.083266457825888e-07, + "loss": 0.6239, + "step": 22728 + }, + { + "epoch": 1.6420611555619775, + "grad_norm": 6.972035270842445, + "learning_rate": 4.0816645410546586e-07, + "loss": 0.5657, + "step": 22729 + }, + { + "epoch": 1.6421334007621868, + "grad_norm": 8.48141602119654, + "learning_rate": 4.080062910639196e-07, + "loss": 0.5895, + "step": 22730 + }, + { + "epoch": 1.6422056459623964, + "grad_norm": 7.332541365196275, + "learning_rate": 4.078461566601427e-07, + "loss": 0.5744, + "step": 22731 + }, + { + "epoch": 1.642277891162606, + "grad_norm": 7.048516668011667, + "learning_rate": 4.076860508963276e-07, + "loss": 0.5751, + "step": 22732 + }, + { + "epoch": 1.6423501363628152, + "grad_norm": 7.019606244382364, + "learning_rate": 4.075259737746659e-07, + "loss": 0.6718, + "step": 22733 + }, + { + "epoch": 1.642422381563025, + "grad_norm": 8.110726833675747, + "learning_rate": 4.0736592529734887e-07, + "loss": 0.6223, + "step": 22734 + }, + { + "epoch": 1.6424946267632343, + "grad_norm": 6.5981841161315815, + "learning_rate": 4.0720590546656727e-07, + "loss": 0.5954, + "step": 22735 + }, + { + "epoch": 1.642566871963444, + "grad_norm": 7.95327512827975, + "learning_rate": 4.0704591428451263e-07, + "loss": 0.609, + "step": 22736 + }, + { + "epoch": 1.6426391171636534, + "grad_norm": 7.598700271697246, + "learning_rate": 4.068859517533738e-07, + "loss": 0.6219, + "step": 22737 + }, + { + "epoch": 1.642711362363863, + "grad_norm": 8.30157073451502, + "learning_rate": 4.0672601787534126e-07, + "loss": 0.6486, + "step": 22738 + }, + { + "epoch": 1.6427836075640725, + "grad_norm": 8.538664449923003, + "learning_rate": 4.065661126526041e-07, + "loss": 0.6068, + "step": 22739 + }, + { + "epoch": 1.6428558527642818, + "grad_norm": 7.18216832649564, + "learning_rate": 4.064062360873519e-07, + "loss": 0.6706, + "step": 22740 + }, + { + "epoch": 1.6429280979644916, + "grad_norm": 6.874734491943438, + "learning_rate": 4.062463881817716e-07, + "loss": 0.6353, + "step": 22741 + }, + { + "epoch": 1.643000343164701, + "grad_norm": 6.99789887638297, + "learning_rate": 4.0608656893805304e-07, + "loss": 0.5725, + "step": 22742 + }, + { + "epoch": 1.6430725883649107, + "grad_norm": 6.7017001258200155, + "learning_rate": 4.0592677835838366e-07, + "loss": 0.6438, + "step": 22743 + }, + { + "epoch": 1.64314483356512, + "grad_norm": 6.071745439483178, + "learning_rate": 4.057670164449515e-07, + "loss": 0.5501, + "step": 22744 + }, + { + "epoch": 1.6432170787653295, + "grad_norm": 8.268035594846955, + "learning_rate": 4.056072831999419e-07, + "loss": 0.6068, + "step": 22745 + }, + { + "epoch": 1.643289323965539, + "grad_norm": 6.352239331058219, + "learning_rate": 4.054475786255427e-07, + "loss": 0.6013, + "step": 22746 + }, + { + "epoch": 1.6433615691657486, + "grad_norm": 6.1507056971139304, + "learning_rate": 4.0528790272394007e-07, + "loss": 0.5535, + "step": 22747 + }, + { + "epoch": 1.6434338143659581, + "grad_norm": 7.405387989211095, + "learning_rate": 4.051282554973193e-07, + "loss": 0.5823, + "step": 22748 + }, + { + "epoch": 1.6435060595661675, + "grad_norm": 6.694686878223726, + "learning_rate": 4.0496863694786617e-07, + "loss": 0.6144, + "step": 22749 + }, + { + "epoch": 1.6435783047663772, + "grad_norm": 8.254281522467503, + "learning_rate": 4.048090470777655e-07, + "loss": 0.6098, + "step": 22750 + }, + { + "epoch": 1.6436505499665865, + "grad_norm": 7.098688633125036, + "learning_rate": 4.046494858892022e-07, + "loss": 0.6188, + "step": 22751 + }, + { + "epoch": 1.643722795166796, + "grad_norm": 6.584381535132613, + "learning_rate": 4.0448995338436077e-07, + "loss": 0.5861, + "step": 22752 + }, + { + "epoch": 1.6437950403670056, + "grad_norm": 7.481338171570801, + "learning_rate": 4.043304495654246e-07, + "loss": 0.6675, + "step": 22753 + }, + { + "epoch": 1.6438672855672152, + "grad_norm": 7.793534597251409, + "learning_rate": 4.0417097443457813e-07, + "loss": 0.5857, + "step": 22754 + }, + { + "epoch": 1.6439395307674247, + "grad_norm": 7.151633560197677, + "learning_rate": 4.0401152799400297e-07, + "loss": 0.6319, + "step": 22755 + }, + { + "epoch": 1.644011775967634, + "grad_norm": 9.216344694155291, + "learning_rate": 4.0385211024588253e-07, + "loss": 0.6487, + "step": 22756 + }, + { + "epoch": 1.6440840211678438, + "grad_norm": 7.85359309873811, + "learning_rate": 4.036927211923991e-07, + "loss": 0.6556, + "step": 22757 + }, + { + "epoch": 1.6441562663680531, + "grad_norm": 7.021196969083128, + "learning_rate": 4.0353336083573557e-07, + "loss": 0.5932, + "step": 22758 + }, + { + "epoch": 1.6442285115682627, + "grad_norm": 6.761731725688012, + "learning_rate": 4.033740291780716e-07, + "loss": 0.6258, + "step": 22759 + }, + { + "epoch": 1.6443007567684722, + "grad_norm": 7.46672049684648, + "learning_rate": 4.0321472622158934e-07, + "loss": 0.5706, + "step": 22760 + }, + { + "epoch": 1.6443730019686817, + "grad_norm": 7.153947884693742, + "learning_rate": 4.0305545196846955e-07, + "loss": 0.6065, + "step": 22761 + }, + { + "epoch": 1.6444452471688913, + "grad_norm": 6.953189189486013, + "learning_rate": 4.0289620642089246e-07, + "loss": 0.5926, + "step": 22762 + }, + { + "epoch": 1.6445174923691006, + "grad_norm": 8.501733042249661, + "learning_rate": 4.027369895810379e-07, + "loss": 0.6811, + "step": 22763 + }, + { + "epoch": 1.6445897375693104, + "grad_norm": 7.743609839549508, + "learning_rate": 4.0257780145108534e-07, + "loss": 0.5932, + "step": 22764 + }, + { + "epoch": 1.6446619827695197, + "grad_norm": 7.506729413047529, + "learning_rate": 4.024186420332149e-07, + "loss": 0.6378, + "step": 22765 + }, + { + "epoch": 1.6447342279697292, + "grad_norm": 6.15442394148418, + "learning_rate": 4.0225951132960405e-07, + "loss": 0.6004, + "step": 22766 + }, + { + "epoch": 1.6448064731699388, + "grad_norm": 6.400748519040871, + "learning_rate": 4.021004093424319e-07, + "loss": 0.5709, + "step": 22767 + }, + { + "epoch": 1.6448787183701483, + "grad_norm": 8.277820265601893, + "learning_rate": 4.019413360738758e-07, + "loss": 0.615, + "step": 22768 + }, + { + "epoch": 1.6449509635703579, + "grad_norm": 6.4105885770762905, + "learning_rate": 4.0178229152611407e-07, + "loss": 0.5693, + "step": 22769 + }, + { + "epoch": 1.6450232087705672, + "grad_norm": 7.425037874581169, + "learning_rate": 4.016232757013236e-07, + "loss": 0.6352, + "step": 22770 + }, + { + "epoch": 1.645095453970777, + "grad_norm": 7.519933785897793, + "learning_rate": 4.014642886016809e-07, + "loss": 0.6026, + "step": 22771 + }, + { + "epoch": 1.6451676991709863, + "grad_norm": 6.660072053170143, + "learning_rate": 4.013053302293635e-07, + "loss": 0.5727, + "step": 22772 + }, + { + "epoch": 1.6452399443711958, + "grad_norm": 7.651020365963017, + "learning_rate": 4.011464005865462e-07, + "loss": 0.6536, + "step": 22773 + }, + { + "epoch": 1.6453121895714053, + "grad_norm": 6.601443751243366, + "learning_rate": 4.0098749967540456e-07, + "loss": 0.6067, + "step": 22774 + }, + { + "epoch": 1.645384434771615, + "grad_norm": 8.768578560445423, + "learning_rate": 4.0082862749811463e-07, + "loss": 0.6753, + "step": 22775 + }, + { + "epoch": 1.6454566799718244, + "grad_norm": 6.588745242706764, + "learning_rate": 4.0066978405685133e-07, + "loss": 0.578, + "step": 22776 + }, + { + "epoch": 1.6455289251720338, + "grad_norm": 9.64467482616399, + "learning_rate": 4.0051096935378814e-07, + "loss": 0.7025, + "step": 22777 + }, + { + "epoch": 1.6456011703722435, + "grad_norm": 6.764010829291989, + "learning_rate": 4.0035218339109977e-07, + "loss": 0.5671, + "step": 22778 + }, + { + "epoch": 1.6456734155724528, + "grad_norm": 6.6595098545377605, + "learning_rate": 4.001934261709595e-07, + "loss": 0.6057, + "step": 22779 + }, + { + "epoch": 1.6457456607726624, + "grad_norm": 8.420890679809862, + "learning_rate": 4.0003469769554074e-07, + "loss": 0.6501, + "step": 22780 + }, + { + "epoch": 1.645817905972872, + "grad_norm": 7.289757207901131, + "learning_rate": 3.9987599796701664e-07, + "loss": 0.5427, + "step": 22781 + }, + { + "epoch": 1.6458901511730815, + "grad_norm": 8.000733818730854, + "learning_rate": 3.9971732698755955e-07, + "loss": 0.5524, + "step": 22782 + }, + { + "epoch": 1.645962396373291, + "grad_norm": 7.367147111832558, + "learning_rate": 3.995586847593419e-07, + "loss": 0.6543, + "step": 22783 + }, + { + "epoch": 1.6460346415735003, + "grad_norm": 6.486578362475787, + "learning_rate": 3.994000712845347e-07, + "loss": 0.623, + "step": 22784 + }, + { + "epoch": 1.64610688677371, + "grad_norm": 8.764629232656537, + "learning_rate": 3.992414865653091e-07, + "loss": 0.5823, + "step": 22785 + }, + { + "epoch": 1.6461791319739194, + "grad_norm": 8.213935863555635, + "learning_rate": 3.990829306038374e-07, + "loss": 0.6511, + "step": 22786 + }, + { + "epoch": 1.646251377174129, + "grad_norm": 5.920256622525961, + "learning_rate": 3.989244034022882e-07, + "loss": 0.5187, + "step": 22787 + }, + { + "epoch": 1.6463236223743385, + "grad_norm": 6.355812068170238, + "learning_rate": 3.9876590496283255e-07, + "loss": 0.6048, + "step": 22788 + }, + { + "epoch": 1.646395867574548, + "grad_norm": 7.3322586081225145, + "learning_rate": 3.986074352876404e-07, + "loss": 0.6627, + "step": 22789 + }, + { + "epoch": 1.6464681127747576, + "grad_norm": 6.964742423372582, + "learning_rate": 3.984489943788808e-07, + "loss": 0.5988, + "step": 22790 + }, + { + "epoch": 1.646540357974967, + "grad_norm": 6.137765776867789, + "learning_rate": 3.9829058223872257e-07, + "loss": 0.5985, + "step": 22791 + }, + { + "epoch": 1.6466126031751767, + "grad_norm": 6.5234421781420355, + "learning_rate": 3.981321988693343e-07, + "loss": 0.5981, + "step": 22792 + }, + { + "epoch": 1.646684848375386, + "grad_norm": 8.556193681619346, + "learning_rate": 3.979738442728845e-07, + "loss": 0.6401, + "step": 22793 + }, + { + "epoch": 1.6467570935755955, + "grad_norm": 7.315944805756732, + "learning_rate": 3.9781551845154113e-07, + "loss": 0.6082, + "step": 22794 + }, + { + "epoch": 1.646829338775805, + "grad_norm": 6.101000106623945, + "learning_rate": 3.976572214074706e-07, + "loss": 0.6195, + "step": 22795 + }, + { + "epoch": 1.6469015839760146, + "grad_norm": 8.196322048906724, + "learning_rate": 3.9749895314284e-07, + "loss": 0.5922, + "step": 22796 + }, + { + "epoch": 1.6469738291762241, + "grad_norm": 7.214697026205026, + "learning_rate": 3.9734071365981704e-07, + "loss": 0.5781, + "step": 22797 + }, + { + "epoch": 1.6470460743764335, + "grad_norm": 6.03062032466347, + "learning_rate": 3.971825029605661e-07, + "loss": 0.6087, + "step": 22798 + }, + { + "epoch": 1.6471183195766432, + "grad_norm": 6.115821656665015, + "learning_rate": 3.970243210472544e-07, + "loss": 0.5669, + "step": 22799 + }, + { + "epoch": 1.6471905647768526, + "grad_norm": 7.957715581382004, + "learning_rate": 3.9686616792204677e-07, + "loss": 0.6934, + "step": 22800 + }, + { + "epoch": 1.6472628099770623, + "grad_norm": 8.610876045983758, + "learning_rate": 3.9670804358710903e-07, + "loss": 0.671, + "step": 22801 + }, + { + "epoch": 1.6473350551772716, + "grad_norm": 8.10576524307121, + "learning_rate": 3.9654994804460445e-07, + "loss": 0.5437, + "step": 22802 + }, + { + "epoch": 1.6474073003774812, + "grad_norm": 6.130355984431052, + "learning_rate": 3.9639188129669797e-07, + "loss": 0.6444, + "step": 22803 + }, + { + "epoch": 1.6474795455776907, + "grad_norm": 6.756519701124217, + "learning_rate": 3.962338433455537e-07, + "loss": 0.6363, + "step": 22804 + }, + { + "epoch": 1.6475517907779, + "grad_norm": 7.454994463032662, + "learning_rate": 3.9607583419333433e-07, + "loss": 0.6254, + "step": 22805 + }, + { + "epoch": 1.6476240359781098, + "grad_norm": 6.467215093317403, + "learning_rate": 3.9591785384220286e-07, + "loss": 0.5869, + "step": 22806 + }, + { + "epoch": 1.6476962811783191, + "grad_norm": 6.30773594932417, + "learning_rate": 3.957599022943226e-07, + "loss": 0.5976, + "step": 22807 + }, + { + "epoch": 1.6477685263785289, + "grad_norm": 7.53396772678593, + "learning_rate": 3.956019795518551e-07, + "loss": 0.6244, + "step": 22808 + }, + { + "epoch": 1.6478407715787382, + "grad_norm": 7.854992594578857, + "learning_rate": 3.954440856169628e-07, + "loss": 0.635, + "step": 22809 + }, + { + "epoch": 1.6479130167789477, + "grad_norm": 7.848774915213057, + "learning_rate": 3.9528622049180675e-07, + "loss": 0.5685, + "step": 22810 + }, + { + "epoch": 1.6479852619791573, + "grad_norm": 8.372384645129253, + "learning_rate": 3.951283841785486e-07, + "loss": 0.6159, + "step": 22811 + }, + { + "epoch": 1.6480575071793666, + "grad_norm": 6.564733216093883, + "learning_rate": 3.9497057667934824e-07, + "loss": 0.6005, + "step": 22812 + }, + { + "epoch": 1.6481297523795764, + "grad_norm": 6.701881417823969, + "learning_rate": 3.948127979963662e-07, + "loss": 0.5883, + "step": 22813 + }, + { + "epoch": 1.6482019975797857, + "grad_norm": 6.070863283912996, + "learning_rate": 3.9465504813176206e-07, + "loss": 0.5446, + "step": 22814 + }, + { + "epoch": 1.6482742427799955, + "grad_norm": 7.138822833133702, + "learning_rate": 3.9449732708769643e-07, + "loss": 0.5625, + "step": 22815 + }, + { + "epoch": 1.6483464879802048, + "grad_norm": 7.062191846758337, + "learning_rate": 3.9433963486632726e-07, + "loss": 0.6005, + "step": 22816 + }, + { + "epoch": 1.6484187331804143, + "grad_norm": 7.614810576659685, + "learning_rate": 3.941819714698131e-07, + "loss": 0.6112, + "step": 22817 + }, + { + "epoch": 1.6484909783806239, + "grad_norm": 7.8956345773876, + "learning_rate": 3.94024336900313e-07, + "loss": 0.5852, + "step": 22818 + }, + { + "epoch": 1.6485632235808334, + "grad_norm": 6.877042709672819, + "learning_rate": 3.938667311599842e-07, + "loss": 0.6935, + "step": 22819 + }, + { + "epoch": 1.648635468781043, + "grad_norm": 7.151541814426196, + "learning_rate": 3.937091542509849e-07, + "loss": 0.5838, + "step": 22820 + }, + { + "epoch": 1.6487077139812523, + "grad_norm": 6.797120819908212, + "learning_rate": 3.9355160617547153e-07, + "loss": 0.6272, + "step": 22821 + }, + { + "epoch": 1.648779959181462, + "grad_norm": 7.926038742582942, + "learning_rate": 3.9339408693560205e-07, + "loss": 0.5536, + "step": 22822 + }, + { + "epoch": 1.6488522043816713, + "grad_norm": 6.137868325549768, + "learning_rate": 3.932365965335311e-07, + "loss": 0.5905, + "step": 22823 + }, + { + "epoch": 1.648924449581881, + "grad_norm": 7.336990167169012, + "learning_rate": 3.9307913497141524e-07, + "loss": 0.6059, + "step": 22824 + }, + { + "epoch": 1.6489966947820904, + "grad_norm": 7.612476758498619, + "learning_rate": 3.9292170225141037e-07, + "loss": 0.578, + "step": 22825 + }, + { + "epoch": 1.6490689399823, + "grad_norm": 8.336213440694078, + "learning_rate": 3.9276429837567103e-07, + "loss": 0.6519, + "step": 22826 + }, + { + "epoch": 1.6491411851825095, + "grad_norm": 6.337068092330706, + "learning_rate": 3.9260692334635254e-07, + "loss": 0.5926, + "step": 22827 + }, + { + "epoch": 1.6492134303827188, + "grad_norm": 8.7894626645014, + "learning_rate": 3.924495771656089e-07, + "loss": 0.6266, + "step": 22828 + }, + { + "epoch": 1.6492856755829286, + "grad_norm": 7.522708085016529, + "learning_rate": 3.922922598355947e-07, + "loss": 0.6252, + "step": 22829 + }, + { + "epoch": 1.649357920783138, + "grad_norm": 6.986120360081615, + "learning_rate": 3.921349713584624e-07, + "loss": 0.5318, + "step": 22830 + }, + { + "epoch": 1.6494301659833475, + "grad_norm": 6.432572339893193, + "learning_rate": 3.9197771173636545e-07, + "loss": 0.6084, + "step": 22831 + }, + { + "epoch": 1.649502411183557, + "grad_norm": 6.72234131398423, + "learning_rate": 3.918204809714571e-07, + "loss": 0.5457, + "step": 22832 + }, + { + "epoch": 1.6495746563837665, + "grad_norm": 8.357952931027167, + "learning_rate": 3.9166327906589004e-07, + "loss": 0.6066, + "step": 22833 + }, + { + "epoch": 1.649646901583976, + "grad_norm": 6.773452705973416, + "learning_rate": 3.9150610602181507e-07, + "loss": 0.5461, + "step": 22834 + }, + { + "epoch": 1.6497191467841854, + "grad_norm": 8.337276390542938, + "learning_rate": 3.913489618413843e-07, + "loss": 0.6778, + "step": 22835 + }, + { + "epoch": 1.6497913919843952, + "grad_norm": 6.060052751368195, + "learning_rate": 3.9119184652674917e-07, + "loss": 0.5542, + "step": 22836 + }, + { + "epoch": 1.6498636371846045, + "grad_norm": 7.664785928442974, + "learning_rate": 3.9103476008006016e-07, + "loss": 0.6173, + "step": 22837 + }, + { + "epoch": 1.649935882384814, + "grad_norm": 7.170856611297908, + "learning_rate": 3.908777025034677e-07, + "loss": 0.5845, + "step": 22838 + }, + { + "epoch": 1.6500081275850236, + "grad_norm": 8.96862410081495, + "learning_rate": 3.9072067379912206e-07, + "loss": 0.6769, + "step": 22839 + }, + { + "epoch": 1.6500803727852331, + "grad_norm": 7.9026838258664585, + "learning_rate": 3.905636739691729e-07, + "loss": 0.5403, + "step": 22840 + }, + { + "epoch": 1.6501526179854427, + "grad_norm": 7.466563140060607, + "learning_rate": 3.9040670301576875e-07, + "loss": 0.5981, + "step": 22841 + }, + { + "epoch": 1.650224863185652, + "grad_norm": 7.630838628955034, + "learning_rate": 3.902497609410591e-07, + "loss": 0.544, + "step": 22842 + }, + { + "epoch": 1.6502971083858617, + "grad_norm": 6.308918758559156, + "learning_rate": 3.900928477471924e-07, + "loss": 0.6765, + "step": 22843 + }, + { + "epoch": 1.650369353586071, + "grad_norm": 9.272050710465372, + "learning_rate": 3.899359634363159e-07, + "loss": 0.6314, + "step": 22844 + }, + { + "epoch": 1.6504415987862806, + "grad_norm": 7.618119684546586, + "learning_rate": 3.8977910801057757e-07, + "loss": 0.5798, + "step": 22845 + }, + { + "epoch": 1.6505138439864901, + "grad_norm": 8.187963108622563, + "learning_rate": 3.896222814721243e-07, + "loss": 0.6663, + "step": 22846 + }, + { + "epoch": 1.6505860891866997, + "grad_norm": 7.940144496342792, + "learning_rate": 3.894654838231046e-07, + "loss": 0.6136, + "step": 22847 + }, + { + "epoch": 1.6506583343869092, + "grad_norm": 8.380395773780336, + "learning_rate": 3.89308715065663e-07, + "loss": 0.6416, + "step": 22848 + }, + { + "epoch": 1.6507305795871186, + "grad_norm": 6.432718816208913, + "learning_rate": 3.8915197520194624e-07, + "loss": 0.5975, + "step": 22849 + }, + { + "epoch": 1.6508028247873283, + "grad_norm": 7.379294486709555, + "learning_rate": 3.889952642340999e-07, + "loss": 0.683, + "step": 22850 + }, + { + "epoch": 1.6508750699875376, + "grad_norm": 7.697601380729036, + "learning_rate": 3.888385821642701e-07, + "loss": 0.5873, + "step": 22851 + }, + { + "epoch": 1.6509473151877472, + "grad_norm": 7.748386338117541, + "learning_rate": 3.8868192899460034e-07, + "loss": 0.4857, + "step": 22852 + }, + { + "epoch": 1.6510195603879567, + "grad_norm": 6.404090182315707, + "learning_rate": 3.8852530472723555e-07, + "loss": 0.6136, + "step": 22853 + }, + { + "epoch": 1.6510918055881663, + "grad_norm": 8.921483767828583, + "learning_rate": 3.883687093643199e-07, + "loss": 0.605, + "step": 22854 + }, + { + "epoch": 1.6511640507883758, + "grad_norm": 6.337456348276381, + "learning_rate": 3.8821214290799687e-07, + "loss": 0.5896, + "step": 22855 + }, + { + "epoch": 1.6512362959885851, + "grad_norm": 7.924305920938015, + "learning_rate": 3.8805560536041007e-07, + "loss": 0.5742, + "step": 22856 + }, + { + "epoch": 1.6513085411887949, + "grad_norm": 7.154081990698644, + "learning_rate": 3.878990967237023e-07, + "loss": 0.6962, + "step": 22857 + }, + { + "epoch": 1.6513807863890042, + "grad_norm": 6.027868083662184, + "learning_rate": 3.877426170000165e-07, + "loss": 0.5747, + "step": 22858 + }, + { + "epoch": 1.6514530315892137, + "grad_norm": 7.105544447954391, + "learning_rate": 3.875861661914934e-07, + "loss": 0.5677, + "step": 22859 + }, + { + "epoch": 1.6515252767894233, + "grad_norm": 8.220773426062554, + "learning_rate": 3.8742974430027587e-07, + "loss": 0.5454, + "step": 22860 + }, + { + "epoch": 1.6515975219896328, + "grad_norm": 7.3320010737924575, + "learning_rate": 3.8727335132850514e-07, + "loss": 0.6166, + "step": 22861 + }, + { + "epoch": 1.6516697671898424, + "grad_norm": 6.035467224997865, + "learning_rate": 3.8711698727832117e-07, + "loss": 0.5882, + "step": 22862 + }, + { + "epoch": 1.6517420123900517, + "grad_norm": 6.421658477474363, + "learning_rate": 3.869606521518654e-07, + "loss": 0.6232, + "step": 22863 + }, + { + "epoch": 1.6518142575902615, + "grad_norm": 7.145545916620591, + "learning_rate": 3.8680434595127743e-07, + "loss": 0.6045, + "step": 22864 + }, + { + "epoch": 1.6518865027904708, + "grad_norm": 6.791183579315095, + "learning_rate": 3.866480686786972e-07, + "loss": 0.5726, + "step": 22865 + }, + { + "epoch": 1.6519587479906803, + "grad_norm": 7.052369270181337, + "learning_rate": 3.864918203362639e-07, + "loss": 0.567, + "step": 22866 + }, + { + "epoch": 1.6520309931908899, + "grad_norm": 6.90422145689007, + "learning_rate": 3.8633560092611653e-07, + "loss": 0.563, + "step": 22867 + }, + { + "epoch": 1.6521032383910994, + "grad_norm": 8.193470416038377, + "learning_rate": 3.861794104503944e-07, + "loss": 0.6572, + "step": 22868 + }, + { + "epoch": 1.652175483591309, + "grad_norm": 7.689875290036787, + "learning_rate": 3.860232489112342e-07, + "loss": 0.6462, + "step": 22869 + }, + { + "epoch": 1.6522477287915183, + "grad_norm": 7.185224886184082, + "learning_rate": 3.858671163107744e-07, + "loss": 0.6058, + "step": 22870 + }, + { + "epoch": 1.652319973991728, + "grad_norm": 8.223435607364879, + "learning_rate": 3.857110126511521e-07, + "loss": 0.5874, + "step": 22871 + }, + { + "epoch": 1.6523922191919374, + "grad_norm": 6.953998365093189, + "learning_rate": 3.8555493793450527e-07, + "loss": 0.5026, + "step": 22872 + }, + { + "epoch": 1.652464464392147, + "grad_norm": 6.581963440859909, + "learning_rate": 3.8539889216296866e-07, + "loss": 0.626, + "step": 22873 + }, + { + "epoch": 1.6525367095923564, + "grad_norm": 6.854328219317686, + "learning_rate": 3.8524287533867943e-07, + "loss": 0.6392, + "step": 22874 + }, + { + "epoch": 1.652608954792566, + "grad_norm": 7.613532025516439, + "learning_rate": 3.850868874637728e-07, + "loss": 0.7058, + "step": 22875 + }, + { + "epoch": 1.6526811999927755, + "grad_norm": 8.44167919503199, + "learning_rate": 3.8493092854038545e-07, + "loss": 0.6231, + "step": 22876 + }, + { + "epoch": 1.6527534451929848, + "grad_norm": 7.089335004560355, + "learning_rate": 3.847749985706512e-07, + "loss": 0.5448, + "step": 22877 + }, + { + "epoch": 1.6528256903931946, + "grad_norm": 6.644877142764314, + "learning_rate": 3.8461909755670464e-07, + "loss": 0.6087, + "step": 22878 + }, + { + "epoch": 1.652897935593404, + "grad_norm": 6.50420478387677, + "learning_rate": 3.8446322550068085e-07, + "loss": 0.6325, + "step": 22879 + }, + { + "epoch": 1.6529701807936137, + "grad_norm": 6.750261089785597, + "learning_rate": 3.843073824047122e-07, + "loss": 0.623, + "step": 22880 + }, + { + "epoch": 1.653042425993823, + "grad_norm": 7.0316994756509, + "learning_rate": 3.841515682709326e-07, + "loss": 0.6302, + "step": 22881 + }, + { + "epoch": 1.6531146711940325, + "grad_norm": 6.6389172321532115, + "learning_rate": 3.839957831014754e-07, + "loss": 0.6582, + "step": 22882 + }, + { + "epoch": 1.653186916394242, + "grad_norm": 7.362784403827542, + "learning_rate": 3.8384002689847296e-07, + "loss": 0.5997, + "step": 22883 + }, + { + "epoch": 1.6532591615944514, + "grad_norm": 6.371132668680946, + "learning_rate": 3.8368429966405745e-07, + "loss": 0.6305, + "step": 22884 + }, + { + "epoch": 1.6533314067946612, + "grad_norm": 7.435893165695084, + "learning_rate": 3.8352860140036055e-07, + "loss": 0.5372, + "step": 22885 + }, + { + "epoch": 1.6534036519948705, + "grad_norm": 9.371563090711367, + "learning_rate": 3.833729321095145e-07, + "loss": 0.5747, + "step": 22886 + }, + { + "epoch": 1.6534758971950803, + "grad_norm": 8.209701140118232, + "learning_rate": 3.8321729179364883e-07, + "loss": 0.6191, + "step": 22887 + }, + { + "epoch": 1.6535481423952896, + "grad_norm": 6.645913565060699, + "learning_rate": 3.8306168045489507e-07, + "loss": 0.6273, + "step": 22888 + }, + { + "epoch": 1.6536203875954991, + "grad_norm": 7.564981605119144, + "learning_rate": 3.82906098095383e-07, + "loss": 0.6148, + "step": 22889 + }, + { + "epoch": 1.6536926327957087, + "grad_norm": 7.093562590018211, + "learning_rate": 3.8275054471724333e-07, + "loss": 0.6597, + "step": 22890 + }, + { + "epoch": 1.653764877995918, + "grad_norm": 7.738682019688848, + "learning_rate": 3.825950203226042e-07, + "loss": 0.6559, + "step": 22891 + }, + { + "epoch": 1.6538371231961277, + "grad_norm": 6.621965109053171, + "learning_rate": 3.8243952491359507e-07, + "loss": 0.6142, + "step": 22892 + }, + { + "epoch": 1.653909368396337, + "grad_norm": 6.744968092880396, + "learning_rate": 3.8228405849234484e-07, + "loss": 0.6423, + "step": 22893 + }, + { + "epoch": 1.6539816135965468, + "grad_norm": 6.133467135144721, + "learning_rate": 3.821286210609812e-07, + "loss": 0.5773, + "step": 22894 + }, + { + "epoch": 1.6540538587967561, + "grad_norm": 6.327158008679479, + "learning_rate": 3.8197321262163275e-07, + "loss": 0.5883, + "step": 22895 + }, + { + "epoch": 1.6541261039969657, + "grad_norm": 7.130384585587155, + "learning_rate": 3.8181783317642605e-07, + "loss": 0.6268, + "step": 22896 + }, + { + "epoch": 1.6541983491971752, + "grad_norm": 6.533454486476161, + "learning_rate": 3.816624827274895e-07, + "loss": 0.5692, + "step": 22897 + }, + { + "epoch": 1.6542705943973848, + "grad_norm": 7.044090109790868, + "learning_rate": 3.81507161276948e-07, + "loss": 0.5956, + "step": 22898 + }, + { + "epoch": 1.6543428395975943, + "grad_norm": 7.323763527519936, + "learning_rate": 3.813518688269285e-07, + "loss": 0.6125, + "step": 22899 + }, + { + "epoch": 1.6544150847978036, + "grad_norm": 7.344227617031724, + "learning_rate": 3.811966053795571e-07, + "loss": 0.5866, + "step": 22900 + }, + { + "epoch": 1.6544873299980134, + "grad_norm": 6.643799505470984, + "learning_rate": 3.8104137093695955e-07, + "loss": 0.5904, + "step": 22901 + }, + { + "epoch": 1.6545595751982227, + "grad_norm": 7.775841238748052, + "learning_rate": 3.808861655012597e-07, + "loss": 0.6016, + "step": 22902 + }, + { + "epoch": 1.6546318203984323, + "grad_norm": 9.76880846549103, + "learning_rate": 3.8073098907458224e-07, + "loss": 0.6261, + "step": 22903 + }, + { + "epoch": 1.6547040655986418, + "grad_norm": 8.160209542370259, + "learning_rate": 3.8057584165905324e-07, + "loss": 0.6172, + "step": 22904 + }, + { + "epoch": 1.6547763107988513, + "grad_norm": 7.060965312203192, + "learning_rate": 3.8042072325679497e-07, + "loss": 0.5982, + "step": 22905 + }, + { + "epoch": 1.6548485559990609, + "grad_norm": 6.796922144233052, + "learning_rate": 3.8026563386993094e-07, + "loss": 0.6069, + "step": 22906 + }, + { + "epoch": 1.6549208011992702, + "grad_norm": 6.952184309172274, + "learning_rate": 3.801105735005847e-07, + "loss": 0.6145, + "step": 22907 + }, + { + "epoch": 1.65499304639948, + "grad_norm": 7.404535666569512, + "learning_rate": 3.799555421508794e-07, + "loss": 0.537, + "step": 22908 + }, + { + "epoch": 1.6550652915996893, + "grad_norm": 6.7325343473162444, + "learning_rate": 3.7980053982293597e-07, + "loss": 0.5655, + "step": 22909 + }, + { + "epoch": 1.6551375367998988, + "grad_norm": 7.002220619018806, + "learning_rate": 3.796455665188767e-07, + "loss": 0.6373, + "step": 22910 + }, + { + "epoch": 1.6552097820001084, + "grad_norm": 6.519015740366373, + "learning_rate": 3.7949062224082343e-07, + "loss": 0.5907, + "step": 22911 + }, + { + "epoch": 1.655282027200318, + "grad_norm": 7.17177048151986, + "learning_rate": 3.7933570699089704e-07, + "loss": 0.6274, + "step": 22912 + }, + { + "epoch": 1.6553542724005275, + "grad_norm": 7.986818660039153, + "learning_rate": 3.791808207712183e-07, + "loss": 0.6947, + "step": 22913 + }, + { + "epoch": 1.6554265176007368, + "grad_norm": 6.024881588100696, + "learning_rate": 3.7902596358390745e-07, + "loss": 0.5853, + "step": 22914 + }, + { + "epoch": 1.6554987628009465, + "grad_norm": 5.36005417326695, + "learning_rate": 3.7887113543108476e-07, + "loss": 0.5341, + "step": 22915 + }, + { + "epoch": 1.6555710080011559, + "grad_norm": 9.038316753620085, + "learning_rate": 3.787163363148688e-07, + "loss": 0.6153, + "step": 22916 + }, + { + "epoch": 1.6556432532013654, + "grad_norm": 7.233341942472945, + "learning_rate": 3.78561566237379e-07, + "loss": 0.5688, + "step": 22917 + }, + { + "epoch": 1.655715498401575, + "grad_norm": 8.058431382422688, + "learning_rate": 3.784068252007347e-07, + "loss": 0.6487, + "step": 22918 + }, + { + "epoch": 1.6557877436017845, + "grad_norm": 6.024051145176893, + "learning_rate": 3.7825211320705353e-07, + "loss": 0.564, + "step": 22919 + }, + { + "epoch": 1.655859988801994, + "grad_norm": 8.923792003442435, + "learning_rate": 3.7809743025845307e-07, + "loss": 0.5995, + "step": 22920 + }, + { + "epoch": 1.6559322340022034, + "grad_norm": 9.006470791230116, + "learning_rate": 3.779427763570512e-07, + "loss": 0.6849, + "step": 22921 + }, + { + "epoch": 1.6560044792024131, + "grad_norm": 6.659426222362103, + "learning_rate": 3.7778815150496527e-07, + "loss": 0.725, + "step": 22922 + }, + { + "epoch": 1.6560767244026224, + "grad_norm": 7.405864206559293, + "learning_rate": 3.7763355570431177e-07, + "loss": 0.7407, + "step": 22923 + }, + { + "epoch": 1.656148969602832, + "grad_norm": 6.317820506870328, + "learning_rate": 3.77478988957207e-07, + "loss": 0.5686, + "step": 22924 + }, + { + "epoch": 1.6562212148030415, + "grad_norm": 6.680453857719338, + "learning_rate": 3.7732445126576674e-07, + "loss": 0.6373, + "step": 22925 + }, + { + "epoch": 1.656293460003251, + "grad_norm": 7.792141932172006, + "learning_rate": 3.771699426321071e-07, + "loss": 0.6007, + "step": 22926 + }, + { + "epoch": 1.6563657052034606, + "grad_norm": 6.130385541879878, + "learning_rate": 3.7701546305834197e-07, + "loss": 0.6193, + "step": 22927 + }, + { + "epoch": 1.65643795040367, + "grad_norm": 5.280549132661872, + "learning_rate": 3.7686101254658715e-07, + "loss": 0.5812, + "step": 22928 + }, + { + "epoch": 1.6565101956038797, + "grad_norm": 6.867290821535015, + "learning_rate": 3.7670659109895675e-07, + "loss": 0.5489, + "step": 22929 + }, + { + "epoch": 1.656582440804089, + "grad_norm": 8.897849313857218, + "learning_rate": 3.765521987175641e-07, + "loss": 0.6238, + "step": 22930 + }, + { + "epoch": 1.6566546860042985, + "grad_norm": 7.070412583459454, + "learning_rate": 3.763978354045225e-07, + "loss": 0.5887, + "step": 22931 + }, + { + "epoch": 1.656726931204508, + "grad_norm": 8.031906397969426, + "learning_rate": 3.7624350116194643e-07, + "loss": 0.5899, + "step": 22932 + }, + { + "epoch": 1.6567991764047176, + "grad_norm": 7.224904000987579, + "learning_rate": 3.760891959919483e-07, + "loss": 0.6074, + "step": 22933 + }, + { + "epoch": 1.6568714216049272, + "grad_norm": 6.964113343729167, + "learning_rate": 3.759349198966392e-07, + "loss": 0.5159, + "step": 22934 + }, + { + "epoch": 1.6569436668051365, + "grad_norm": 7.267074998121343, + "learning_rate": 3.7578067287813186e-07, + "loss": 0.6244, + "step": 22935 + }, + { + "epoch": 1.6570159120053463, + "grad_norm": 7.6990624290479, + "learning_rate": 3.7562645493853854e-07, + "loss": 0.4872, + "step": 22936 + }, + { + "epoch": 1.6570881572055556, + "grad_norm": 6.345463563595182, + "learning_rate": 3.7547226607996894e-07, + "loss": 0.559, + "step": 22937 + }, + { + "epoch": 1.6571604024057651, + "grad_norm": 7.505449731047648, + "learning_rate": 3.753181063045344e-07, + "loss": 0.6346, + "step": 22938 + }, + { + "epoch": 1.6572326476059747, + "grad_norm": 7.931858490913277, + "learning_rate": 3.751639756143452e-07, + "loss": 0.5751, + "step": 22939 + }, + { + "epoch": 1.6573048928061842, + "grad_norm": 6.758422400675121, + "learning_rate": 3.7500987401151127e-07, + "loss": 0.594, + "step": 22940 + }, + { + "epoch": 1.6573771380063937, + "grad_norm": 6.443924744065353, + "learning_rate": 3.748558014981424e-07, + "loss": 0.6131, + "step": 22941 + }, + { + "epoch": 1.657449383206603, + "grad_norm": 6.733881036118306, + "learning_rate": 3.7470175807634764e-07, + "loss": 0.5506, + "step": 22942 + }, + { + "epoch": 1.6575216284068128, + "grad_norm": 8.63595195318358, + "learning_rate": 3.7454774374823587e-07, + "loss": 0.5739, + "step": 22943 + }, + { + "epoch": 1.6575938736070222, + "grad_norm": 5.446747206361324, + "learning_rate": 3.7439375851591494e-07, + "loss": 0.5938, + "step": 22944 + }, + { + "epoch": 1.6576661188072317, + "grad_norm": 6.07017393211777, + "learning_rate": 3.7423980238149275e-07, + "loss": 0.5925, + "step": 22945 + }, + { + "epoch": 1.6577383640074412, + "grad_norm": 6.653114690812967, + "learning_rate": 3.740858753470772e-07, + "loss": 0.6234, + "step": 22946 + }, + { + "epoch": 1.6578106092076508, + "grad_norm": 8.622468438625777, + "learning_rate": 3.739319774147759e-07, + "loss": 0.6561, + "step": 22947 + }, + { + "epoch": 1.6578828544078603, + "grad_norm": 6.692758649545377, + "learning_rate": 3.7377810858669453e-07, + "loss": 0.6436, + "step": 22948 + }, + { + "epoch": 1.6579550996080696, + "grad_norm": 6.618312123108634, + "learning_rate": 3.7362426886493997e-07, + "loss": 0.5784, + "step": 22949 + }, + { + "epoch": 1.6580273448082794, + "grad_norm": 7.256236846675336, + "learning_rate": 3.7347045825161827e-07, + "loss": 0.5763, + "step": 22950 + }, + { + "epoch": 1.6580995900084887, + "grad_norm": 7.056519263158581, + "learning_rate": 3.733166767488347e-07, + "loss": 0.6317, + "step": 22951 + }, + { + "epoch": 1.6581718352086985, + "grad_norm": 6.885366496385707, + "learning_rate": 3.7316292435869453e-07, + "loss": 0.6531, + "step": 22952 + }, + { + "epoch": 1.6582440804089078, + "grad_norm": 6.685175126149288, + "learning_rate": 3.730092010833025e-07, + "loss": 0.5763, + "step": 22953 + }, + { + "epoch": 1.6583163256091173, + "grad_norm": 6.9878227036535, + "learning_rate": 3.7285550692476386e-07, + "loss": 0.6314, + "step": 22954 + }, + { + "epoch": 1.658388570809327, + "grad_norm": 6.065453193507222, + "learning_rate": 3.7270184188518077e-07, + "loss": 0.5717, + "step": 22955 + }, + { + "epoch": 1.6584608160095362, + "grad_norm": 7.064123726596116, + "learning_rate": 3.7254820596665797e-07, + "loss": 0.6417, + "step": 22956 + }, + { + "epoch": 1.658533061209746, + "grad_norm": 6.227708660781605, + "learning_rate": 3.7239459917129825e-07, + "loss": 0.6027, + "step": 22957 + }, + { + "epoch": 1.6586053064099553, + "grad_norm": 6.491238778360071, + "learning_rate": 3.722410215012051e-07, + "loss": 0.5894, + "step": 22958 + }, + { + "epoch": 1.658677551610165, + "grad_norm": 6.593165656497904, + "learning_rate": 3.7208747295847917e-07, + "loss": 0.6143, + "step": 22959 + }, + { + "epoch": 1.6587497968103744, + "grad_norm": 7.703177492531593, + "learning_rate": 3.7193395354522406e-07, + "loss": 0.6501, + "step": 22960 + }, + { + "epoch": 1.658822042010584, + "grad_norm": 6.9829572565725435, + "learning_rate": 3.7178046326354133e-07, + "loss": 0.561, + "step": 22961 + }, + { + "epoch": 1.6588942872107935, + "grad_norm": 6.596474902562483, + "learning_rate": 3.7162700211553104e-07, + "loss": 0.5198, + "step": 22962 + }, + { + "epoch": 1.6589665324110028, + "grad_norm": 6.229452307370583, + "learning_rate": 3.7147357010329457e-07, + "loss": 0.5848, + "step": 22963 + }, + { + "epoch": 1.6590387776112125, + "grad_norm": 7.158953693170325, + "learning_rate": 3.713201672289321e-07, + "loss": 0.572, + "step": 22964 + }, + { + "epoch": 1.6591110228114219, + "grad_norm": 7.2233552321152095, + "learning_rate": 3.7116679349454455e-07, + "loss": 0.5577, + "step": 22965 + }, + { + "epoch": 1.6591832680116316, + "grad_norm": 7.966186589842073, + "learning_rate": 3.710134489022299e-07, + "loss": 0.6993, + "step": 22966 + }, + { + "epoch": 1.659255513211841, + "grad_norm": 6.870897768488447, + "learning_rate": 3.7086013345408815e-07, + "loss": 0.596, + "step": 22967 + }, + { + "epoch": 1.6593277584120505, + "grad_norm": 8.242770639916268, + "learning_rate": 3.707068471522179e-07, + "loss": 0.6154, + "step": 22968 + }, + { + "epoch": 1.65940000361226, + "grad_norm": 5.921325497056368, + "learning_rate": 3.705535899987178e-07, + "loss": 0.5544, + "step": 22969 + }, + { + "epoch": 1.6594722488124696, + "grad_norm": 7.3885461573199676, + "learning_rate": 3.7040036199568556e-07, + "loss": 0.5636, + "step": 22970 + }, + { + "epoch": 1.6595444940126791, + "grad_norm": 5.456769627038807, + "learning_rate": 3.7024716314521867e-07, + "loss": 0.5137, + "step": 22971 + }, + { + "epoch": 1.6596167392128884, + "grad_norm": 8.029674806007673, + "learning_rate": 3.7009399344941523e-07, + "loss": 0.6099, + "step": 22972 + }, + { + "epoch": 1.6596889844130982, + "grad_norm": 8.902646697757788, + "learning_rate": 3.699408529103707e-07, + "loss": 0.5768, + "step": 22973 + }, + { + "epoch": 1.6597612296133075, + "grad_norm": 6.821109476094287, + "learning_rate": 3.697877415301818e-07, + "loss": 0.6804, + "step": 22974 + }, + { + "epoch": 1.659833474813517, + "grad_norm": 6.627479808867271, + "learning_rate": 3.6963465931094566e-07, + "loss": 0.574, + "step": 22975 + }, + { + "epoch": 1.6599057200137266, + "grad_norm": 8.621784440076912, + "learning_rate": 3.69481606254756e-07, + "loss": 0.6333, + "step": 22976 + }, + { + "epoch": 1.6599779652139361, + "grad_norm": 7.204718357699316, + "learning_rate": 3.693285823637091e-07, + "loss": 0.6025, + "step": 22977 + }, + { + "epoch": 1.6600502104141457, + "grad_norm": 6.761977694890113, + "learning_rate": 3.6917558763989946e-07, + "loss": 0.6209, + "step": 22978 + }, + { + "epoch": 1.660122455614355, + "grad_norm": 8.00965013213797, + "learning_rate": 3.690226220854215e-07, + "loss": 0.627, + "step": 22979 + }, + { + "epoch": 1.6601947008145648, + "grad_norm": 7.253536808628661, + "learning_rate": 3.6886968570236937e-07, + "loss": 0.5493, + "step": 22980 + }, + { + "epoch": 1.660266946014774, + "grad_norm": 6.455850464385453, + "learning_rate": 3.687167784928364e-07, + "loss": 0.5801, + "step": 22981 + }, + { + "epoch": 1.6603391912149836, + "grad_norm": 6.518712032862229, + "learning_rate": 3.6856390045891586e-07, + "loss": 0.6267, + "step": 22982 + }, + { + "epoch": 1.6604114364151932, + "grad_norm": 7.58156232449971, + "learning_rate": 3.6841105160270116e-07, + "loss": 0.6763, + "step": 22983 + }, + { + "epoch": 1.6604836816154027, + "grad_norm": 7.3062393906306635, + "learning_rate": 3.6825823192628366e-07, + "loss": 0.6886, + "step": 22984 + }, + { + "epoch": 1.6605559268156123, + "grad_norm": 8.207233439174107, + "learning_rate": 3.6810544143175584e-07, + "loss": 0.6345, + "step": 22985 + }, + { + "epoch": 1.6606281720158216, + "grad_norm": 6.73155319678617, + "learning_rate": 3.6795268012120966e-07, + "loss": 0.5942, + "step": 22986 + }, + { + "epoch": 1.6607004172160313, + "grad_norm": 6.152665852183349, + "learning_rate": 3.6779994799673534e-07, + "loss": 0.6568, + "step": 22987 + }, + { + "epoch": 1.6607726624162407, + "grad_norm": 7.469835019006888, + "learning_rate": 3.6764724506042346e-07, + "loss": 0.6005, + "step": 22988 + }, + { + "epoch": 1.6608449076164502, + "grad_norm": 7.798412347262707, + "learning_rate": 3.674945713143657e-07, + "loss": 0.5958, + "step": 22989 + }, + { + "epoch": 1.6609171528166597, + "grad_norm": 5.985629356138125, + "learning_rate": 3.673419267606523e-07, + "loss": 0.6164, + "step": 22990 + }, + { + "epoch": 1.6609893980168693, + "grad_norm": 9.089907119326814, + "learning_rate": 3.6718931140137136e-07, + "loss": 0.5758, + "step": 22991 + }, + { + "epoch": 1.6610616432170788, + "grad_norm": 7.500163775881101, + "learning_rate": 3.6703672523861253e-07, + "loss": 0.6331, + "step": 22992 + }, + { + "epoch": 1.6611338884172882, + "grad_norm": 8.580037888663337, + "learning_rate": 3.668841682744656e-07, + "loss": 0.6412, + "step": 22993 + }, + { + "epoch": 1.661206133617498, + "grad_norm": 6.161277615714955, + "learning_rate": 3.6673164051101743e-07, + "loss": 0.5769, + "step": 22994 + }, + { + "epoch": 1.6612783788177072, + "grad_norm": 6.511459812263203, + "learning_rate": 3.66579141950357e-07, + "loss": 0.6542, + "step": 22995 + }, + { + "epoch": 1.6613506240179168, + "grad_norm": 7.146428860982798, + "learning_rate": 3.6642667259457116e-07, + "loss": 0.6353, + "step": 22996 + }, + { + "epoch": 1.6614228692181263, + "grad_norm": 6.455365916428769, + "learning_rate": 3.6627423244574803e-07, + "loss": 0.5835, + "step": 22997 + }, + { + "epoch": 1.6614951144183359, + "grad_norm": 6.528889352654034, + "learning_rate": 3.6612182150597364e-07, + "loss": 0.6395, + "step": 22998 + }, + { + "epoch": 1.6615673596185454, + "grad_norm": 6.780622462053199, + "learning_rate": 3.6596943977733475e-07, + "loss": 0.6218, + "step": 22999 + }, + { + "epoch": 1.6616396048187547, + "grad_norm": 7.534392528284126, + "learning_rate": 3.6581708726191767e-07, + "loss": 0.6163, + "step": 23000 + }, + { + "epoch": 1.6617118500189645, + "grad_norm": 6.932746427099504, + "learning_rate": 3.6566476396180716e-07, + "loss": 0.5324, + "step": 23001 + }, + { + "epoch": 1.6617840952191738, + "grad_norm": 7.186071635415027, + "learning_rate": 3.655124698790888e-07, + "loss": 0.6299, + "step": 23002 + }, + { + "epoch": 1.6618563404193833, + "grad_norm": 7.664078676476939, + "learning_rate": 3.653602050158475e-07, + "loss": 0.6279, + "step": 23003 + }, + { + "epoch": 1.661928585619593, + "grad_norm": 7.999996900557871, + "learning_rate": 3.652079693741681e-07, + "loss": 0.5723, + "step": 23004 + }, + { + "epoch": 1.6620008308198024, + "grad_norm": 7.678334797099411, + "learning_rate": 3.6505576295613327e-07, + "loss": 0.6426, + "step": 23005 + }, + { + "epoch": 1.662073076020012, + "grad_norm": 6.271969748229425, + "learning_rate": 3.649035857638275e-07, + "loss": 0.5684, + "step": 23006 + }, + { + "epoch": 1.6621453212202213, + "grad_norm": 6.900925737282944, + "learning_rate": 3.647514377993339e-07, + "loss": 0.6216, + "step": 23007 + }, + { + "epoch": 1.662217566420431, + "grad_norm": 6.680820444856025, + "learning_rate": 3.645993190647351e-07, + "loss": 0.6141, + "step": 23008 + }, + { + "epoch": 1.6622898116206404, + "grad_norm": 7.522986472802587, + "learning_rate": 3.6444722956211374e-07, + "loss": 0.5976, + "step": 23009 + }, + { + "epoch": 1.66236205682085, + "grad_norm": 6.5689935437613665, + "learning_rate": 3.6429516929355144e-07, + "loss": 0.5923, + "step": 23010 + }, + { + "epoch": 1.6624343020210595, + "grad_norm": 7.0738005575074405, + "learning_rate": 3.6414313826113076e-07, + "loss": 0.5796, + "step": 23011 + }, + { + "epoch": 1.662506547221269, + "grad_norm": 7.357340213493586, + "learning_rate": 3.639911364669316e-07, + "loss": 0.6654, + "step": 23012 + }, + { + "epoch": 1.6625787924214785, + "grad_norm": 7.321243666596789, + "learning_rate": 3.6383916391303507e-07, + "loss": 0.6168, + "step": 23013 + }, + { + "epoch": 1.6626510376216879, + "grad_norm": 6.886849197130258, + "learning_rate": 3.6368722060152185e-07, + "loss": 0.655, + "step": 23014 + }, + { + "epoch": 1.6627232828218976, + "grad_norm": 11.129552017066363, + "learning_rate": 3.635353065344724e-07, + "loss": 0.6152, + "step": 23015 + }, + { + "epoch": 1.662795528022107, + "grad_norm": 7.398863122574427, + "learning_rate": 3.633834217139648e-07, + "loss": 0.6134, + "step": 23016 + }, + { + "epoch": 1.6628677732223165, + "grad_norm": 7.069862781363897, + "learning_rate": 3.632315661420796e-07, + "loss": 0.6881, + "step": 23017 + }, + { + "epoch": 1.662940018422526, + "grad_norm": 7.662244295268904, + "learning_rate": 3.6307973982089573e-07, + "loss": 0.5637, + "step": 23018 + }, + { + "epoch": 1.6630122636227356, + "grad_norm": 6.9451815031761805, + "learning_rate": 3.629279427524904e-07, + "loss": 0.6139, + "step": 23019 + }, + { + "epoch": 1.6630845088229451, + "grad_norm": 8.032829159101817, + "learning_rate": 3.627761749389422e-07, + "loss": 0.5795, + "step": 23020 + }, + { + "epoch": 1.6631567540231544, + "grad_norm": 8.01530137626426, + "learning_rate": 3.6262443638232865e-07, + "loss": 0.55, + "step": 23021 + }, + { + "epoch": 1.6632289992233642, + "grad_norm": 5.724734317661606, + "learning_rate": 3.624727270847278e-07, + "loss": 0.536, + "step": 23022 + }, + { + "epoch": 1.6633012444235735, + "grad_norm": 5.8717090236071945, + "learning_rate": 3.62321047048215e-07, + "loss": 0.5504, + "step": 23023 + }, + { + "epoch": 1.6633734896237833, + "grad_norm": 7.0623765782737475, + "learning_rate": 3.6216939627486706e-07, + "loss": 0.5806, + "step": 23024 + }, + { + "epoch": 1.6634457348239926, + "grad_norm": 6.990222369191981, + "learning_rate": 3.6201777476676025e-07, + "loss": 0.6154, + "step": 23025 + }, + { + "epoch": 1.6635179800242021, + "grad_norm": 7.306220594466061, + "learning_rate": 3.618661825259703e-07, + "loss": 0.504, + "step": 23026 + }, + { + "epoch": 1.6635902252244117, + "grad_norm": 6.990253202248326, + "learning_rate": 3.6171461955457206e-07, + "loss": 0.5805, + "step": 23027 + }, + { + "epoch": 1.663662470424621, + "grad_norm": 6.630272548591311, + "learning_rate": 3.6156308585464013e-07, + "loss": 0.5791, + "step": 23028 + }, + { + "epoch": 1.6637347156248308, + "grad_norm": 7.377528387601316, + "learning_rate": 3.6141158142825014e-07, + "loss": 0.6284, + "step": 23029 + }, + { + "epoch": 1.66380696082504, + "grad_norm": 8.254311562013696, + "learning_rate": 3.612601062774743e-07, + "loss": 0.6295, + "step": 23030 + }, + { + "epoch": 1.6638792060252499, + "grad_norm": 7.508088899698861, + "learning_rate": 3.61108660404387e-07, + "loss": 0.6309, + "step": 23031 + }, + { + "epoch": 1.6639514512254592, + "grad_norm": 6.785310575479422, + "learning_rate": 3.6095724381106145e-07, + "loss": 0.5904, + "step": 23032 + }, + { + "epoch": 1.6640236964256687, + "grad_norm": 8.374066201148098, + "learning_rate": 3.608058564995709e-07, + "loss": 0.6403, + "step": 23033 + }, + { + "epoch": 1.6640959416258783, + "grad_norm": 7.156392058582494, + "learning_rate": 3.6065449847198645e-07, + "loss": 0.5691, + "step": 23034 + }, + { + "epoch": 1.6641681868260876, + "grad_norm": 7.528168993829655, + "learning_rate": 3.60503169730381e-07, + "loss": 0.5643, + "step": 23035 + }, + { + "epoch": 1.6642404320262973, + "grad_norm": 6.8412872481835025, + "learning_rate": 3.6035187027682607e-07, + "loss": 0.5878, + "step": 23036 + }, + { + "epoch": 1.6643126772265067, + "grad_norm": 6.515059705037635, + "learning_rate": 3.602006001133926e-07, + "loss": 0.6396, + "step": 23037 + }, + { + "epoch": 1.6643849224267164, + "grad_norm": 7.886502539291171, + "learning_rate": 3.600493592421514e-07, + "loss": 0.6461, + "step": 23038 + }, + { + "epoch": 1.6644571676269257, + "grad_norm": 7.030726434500019, + "learning_rate": 3.598981476651728e-07, + "loss": 0.6754, + "step": 23039 + }, + { + "epoch": 1.6645294128271353, + "grad_norm": 6.851477314714176, + "learning_rate": 3.5974696538452784e-07, + "loss": 0.6411, + "step": 23040 + }, + { + "epoch": 1.6646016580273448, + "grad_norm": 7.7713810486502535, + "learning_rate": 3.5959581240228413e-07, + "loss": 0.6124, + "step": 23041 + }, + { + "epoch": 1.6646739032275544, + "grad_norm": 7.637056668115142, + "learning_rate": 3.5944468872051217e-07, + "loss": 0.5645, + "step": 23042 + }, + { + "epoch": 1.664746148427764, + "grad_norm": 7.86687910579757, + "learning_rate": 3.5929359434128064e-07, + "loss": 0.5706, + "step": 23043 + }, + { + "epoch": 1.6648183936279732, + "grad_norm": 7.180692683307744, + "learning_rate": 3.591425292666567e-07, + "loss": 0.5718, + "step": 23044 + }, + { + "epoch": 1.664890638828183, + "grad_norm": 7.872828350663672, + "learning_rate": 3.589914934987099e-07, + "loss": 0.6219, + "step": 23045 + }, + { + "epoch": 1.6649628840283923, + "grad_norm": 6.680073545131025, + "learning_rate": 3.5884048703950707e-07, + "loss": 0.5664, + "step": 23046 + }, + { + "epoch": 1.6650351292286019, + "grad_norm": 7.950251871593587, + "learning_rate": 3.586895098911161e-07, + "loss": 0.544, + "step": 23047 + }, + { + "epoch": 1.6651073744288114, + "grad_norm": 7.343388974669718, + "learning_rate": 3.585385620556026e-07, + "loss": 0.6178, + "step": 23048 + }, + { + "epoch": 1.665179619629021, + "grad_norm": 6.11160463979742, + "learning_rate": 3.5838764353503307e-07, + "loss": 0.5962, + "step": 23049 + }, + { + "epoch": 1.6652518648292305, + "grad_norm": 8.872531601330117, + "learning_rate": 3.582367543314749e-07, + "loss": 0.5842, + "step": 23050 + }, + { + "epoch": 1.6653241100294398, + "grad_norm": 7.436959126784737, + "learning_rate": 3.580858944469917e-07, + "loss": 0.616, + "step": 23051 + }, + { + "epoch": 1.6653963552296496, + "grad_norm": 7.447550162165324, + "learning_rate": 3.5793506388364957e-07, + "loss": 0.6146, + "step": 23052 + }, + { + "epoch": 1.665468600429859, + "grad_norm": 6.3980600993322945, + "learning_rate": 3.5778426264351324e-07, + "loss": 0.5833, + "step": 23053 + }, + { + "epoch": 1.6655408456300684, + "grad_norm": 6.265303225352723, + "learning_rate": 3.576334907286472e-07, + "loss": 0.5975, + "step": 23054 + }, + { + "epoch": 1.665613090830278, + "grad_norm": 7.781605662601544, + "learning_rate": 3.57482748141115e-07, + "loss": 0.6499, + "step": 23055 + }, + { + "epoch": 1.6656853360304875, + "grad_norm": 7.353002814539445, + "learning_rate": 3.573320348829806e-07, + "loss": 0.583, + "step": 23056 + }, + { + "epoch": 1.665757581230697, + "grad_norm": 6.317632118597719, + "learning_rate": 3.5718135095630687e-07, + "loss": 0.6266, + "step": 23057 + }, + { + "epoch": 1.6658298264309064, + "grad_norm": 7.611387014222403, + "learning_rate": 3.570306963631573e-07, + "loss": 0.6449, + "step": 23058 + }, + { + "epoch": 1.6659020716311161, + "grad_norm": 7.336648567358787, + "learning_rate": 3.568800711055928e-07, + "loss": 0.5826, + "step": 23059 + }, + { + "epoch": 1.6659743168313255, + "grad_norm": 6.146607879280316, + "learning_rate": 3.567294751856762e-07, + "loss": 0.5996, + "step": 23060 + }, + { + "epoch": 1.666046562031535, + "grad_norm": 7.719885862228151, + "learning_rate": 3.565789086054697e-07, + "loss": 0.6633, + "step": 23061 + }, + { + "epoch": 1.6661188072317445, + "grad_norm": 6.957148041727423, + "learning_rate": 3.56428371367033e-07, + "loss": 0.5851, + "step": 23062 + }, + { + "epoch": 1.666191052431954, + "grad_norm": 6.506888627399043, + "learning_rate": 3.562778634724276e-07, + "loss": 0.6114, + "step": 23063 + }, + { + "epoch": 1.6662632976321636, + "grad_norm": 6.73454323401378, + "learning_rate": 3.5612738492371345e-07, + "loss": 0.604, + "step": 23064 + }, + { + "epoch": 1.666335542832373, + "grad_norm": 8.833040148589927, + "learning_rate": 3.559769357229512e-07, + "loss": 0.6225, + "step": 23065 + }, + { + "epoch": 1.6664077880325827, + "grad_norm": 6.1157823607677, + "learning_rate": 3.558265158721996e-07, + "loss": 0.576, + "step": 23066 + }, + { + "epoch": 1.666480033232792, + "grad_norm": 6.2234810320274825, + "learning_rate": 3.5567612537351833e-07, + "loss": 0.577, + "step": 23067 + }, + { + "epoch": 1.6665522784330016, + "grad_norm": 10.09107316842117, + "learning_rate": 3.5552576422896666e-07, + "loss": 0.5968, + "step": 23068 + }, + { + "epoch": 1.6666245236332111, + "grad_norm": 7.626227608338194, + "learning_rate": 3.553754324406014e-07, + "loss": 0.5953, + "step": 23069 + }, + { + "epoch": 1.6666967688334207, + "grad_norm": 7.2528733446083065, + "learning_rate": 3.552251300104814e-07, + "loss": 0.575, + "step": 23070 + }, + { + "epoch": 1.6667690140336302, + "grad_norm": 6.878008929620367, + "learning_rate": 3.5507485694066397e-07, + "loss": 0.6076, + "step": 23071 + }, + { + "epoch": 1.6668412592338395, + "grad_norm": 7.616050345807402, + "learning_rate": 3.549246132332068e-07, + "loss": 0.6881, + "step": 23072 + }, + { + "epoch": 1.6669135044340493, + "grad_norm": 6.612602005227671, + "learning_rate": 3.547743988901653e-07, + "loss": 0.6234, + "step": 23073 + }, + { + "epoch": 1.6669857496342586, + "grad_norm": 7.708903716008233, + "learning_rate": 3.546242139135969e-07, + "loss": 0.6824, + "step": 23074 + }, + { + "epoch": 1.6670579948344681, + "grad_norm": 7.91416637356252, + "learning_rate": 3.544740583055581e-07, + "loss": 0.5783, + "step": 23075 + }, + { + "epoch": 1.6671302400346777, + "grad_norm": 7.538106875557946, + "learning_rate": 3.543239320681027e-07, + "loss": 0.618, + "step": 23076 + }, + { + "epoch": 1.6672024852348872, + "grad_norm": 6.862784613594389, + "learning_rate": 3.541738352032867e-07, + "loss": 0.5888, + "step": 23077 + }, + { + "epoch": 1.6672747304350968, + "grad_norm": 7.347837179208464, + "learning_rate": 3.54023767713165e-07, + "loss": 0.716, + "step": 23078 + }, + { + "epoch": 1.667346975635306, + "grad_norm": 6.475250074351617, + "learning_rate": 3.538737295997921e-07, + "loss": 0.5965, + "step": 23079 + }, + { + "epoch": 1.6674192208355159, + "grad_norm": 7.7211929964910535, + "learning_rate": 3.537237208652208e-07, + "loss": 0.6291, + "step": 23080 + }, + { + "epoch": 1.6674914660357252, + "grad_norm": 7.617589632814624, + "learning_rate": 3.535737415115054e-07, + "loss": 0.5555, + "step": 23081 + }, + { + "epoch": 1.6675637112359347, + "grad_norm": 8.178559291970998, + "learning_rate": 3.5342379154069876e-07, + "loss": 0.6334, + "step": 23082 + }, + { + "epoch": 1.6676359564361443, + "grad_norm": 5.715349697784795, + "learning_rate": 3.532738709548539e-07, + "loss": 0.5632, + "step": 23083 + }, + { + "epoch": 1.6677082016363538, + "grad_norm": 7.846028882177959, + "learning_rate": 3.531239797560229e-07, + "loss": 0.5898, + "step": 23084 + }, + { + "epoch": 1.6677804468365633, + "grad_norm": 6.943763723282597, + "learning_rate": 3.529741179462576e-07, + "loss": 0.6325, + "step": 23085 + }, + { + "epoch": 1.6678526920367727, + "grad_norm": 6.553968715244997, + "learning_rate": 3.528242855276101e-07, + "loss": 0.5604, + "step": 23086 + }, + { + "epoch": 1.6679249372369824, + "grad_norm": 6.942787973147955, + "learning_rate": 3.526744825021303e-07, + "loss": 0.6223, + "step": 23087 + }, + { + "epoch": 1.6679971824371917, + "grad_norm": 6.830314093275138, + "learning_rate": 3.525247088718697e-07, + "loss": 0.5942, + "step": 23088 + }, + { + "epoch": 1.6680694276374013, + "grad_norm": 7.451805397789413, + "learning_rate": 3.5237496463887855e-07, + "loss": 0.5927, + "step": 23089 + }, + { + "epoch": 1.6681416728376108, + "grad_norm": 6.49885901193673, + "learning_rate": 3.52225249805207e-07, + "loss": 0.6768, + "step": 23090 + }, + { + "epoch": 1.6682139180378204, + "grad_norm": 7.089170616289008, + "learning_rate": 3.5207556437290346e-07, + "loss": 0.5882, + "step": 23091 + }, + { + "epoch": 1.66828616323803, + "grad_norm": 7.415981675422687, + "learning_rate": 3.5192590834401797e-07, + "loss": 0.5219, + "step": 23092 + }, + { + "epoch": 1.6683584084382392, + "grad_norm": 7.025211436935018, + "learning_rate": 3.517762817205989e-07, + "loss": 0.5396, + "step": 23093 + }, + { + "epoch": 1.668430653638449, + "grad_norm": 6.304967030280943, + "learning_rate": 3.5162668450469423e-07, + "loss": 0.5819, + "step": 23094 + }, + { + "epoch": 1.6685028988386583, + "grad_norm": 6.4220701313305915, + "learning_rate": 3.5147711669835245e-07, + "loss": 0.5906, + "step": 23095 + }, + { + "epoch": 1.6685751440388679, + "grad_norm": 6.059151894926469, + "learning_rate": 3.5132757830362045e-07, + "loss": 0.6317, + "step": 23096 + }, + { + "epoch": 1.6686473892390774, + "grad_norm": 6.560885276141433, + "learning_rate": 3.5117806932254637e-07, + "loss": 0.6248, + "step": 23097 + }, + { + "epoch": 1.668719634439287, + "grad_norm": 8.377404166788807, + "learning_rate": 3.510285897571755e-07, + "loss": 0.6296, + "step": 23098 + }, + { + "epoch": 1.6687918796394965, + "grad_norm": 7.6904499627708915, + "learning_rate": 3.5087913960955453e-07, + "loss": 0.5555, + "step": 23099 + }, + { + "epoch": 1.6688641248397058, + "grad_norm": 7.417866670489711, + "learning_rate": 3.5072971888173017e-07, + "loss": 0.67, + "step": 23100 + }, + { + "epoch": 1.6689363700399156, + "grad_norm": 7.019497556610845, + "learning_rate": 3.505803275757458e-07, + "loss": 0.6754, + "step": 23101 + }, + { + "epoch": 1.669008615240125, + "grad_norm": 6.409431374691128, + "learning_rate": 3.5043096569364857e-07, + "loss": 0.5688, + "step": 23102 + }, + { + "epoch": 1.6690808604403347, + "grad_norm": 6.8992868248396695, + "learning_rate": 3.5028163323748255e-07, + "loss": 0.6092, + "step": 23103 + }, + { + "epoch": 1.669153105640544, + "grad_norm": 8.47341891593942, + "learning_rate": 3.501323302092921e-07, + "loss": 0.626, + "step": 23104 + }, + { + "epoch": 1.6692253508407535, + "grad_norm": 5.981572144701226, + "learning_rate": 3.499830566111204e-07, + "loss": 0.6439, + "step": 23105 + }, + { + "epoch": 1.669297596040963, + "grad_norm": 7.748221408656221, + "learning_rate": 3.4983381244501095e-07, + "loss": 0.6517, + "step": 23106 + }, + { + "epoch": 1.6693698412411724, + "grad_norm": 8.772895286701294, + "learning_rate": 3.4968459771300813e-07, + "loss": 0.6378, + "step": 23107 + }, + { + "epoch": 1.6694420864413821, + "grad_norm": 6.829242536283564, + "learning_rate": 3.495354124171527e-07, + "loss": 0.5259, + "step": 23108 + }, + { + "epoch": 1.6695143316415915, + "grad_norm": 9.005456330102263, + "learning_rate": 3.493862565594877e-07, + "loss": 0.6104, + "step": 23109 + }, + { + "epoch": 1.6695865768418012, + "grad_norm": 6.959066325734837, + "learning_rate": 3.4923713014205525e-07, + "loss": 0.5491, + "step": 23110 + }, + { + "epoch": 1.6696588220420105, + "grad_norm": 7.35098131337111, + "learning_rate": 3.490880331668964e-07, + "loss": 0.6013, + "step": 23111 + }, + { + "epoch": 1.66973106724222, + "grad_norm": 8.478989157978283, + "learning_rate": 3.489389656360523e-07, + "loss": 0.6069, + "step": 23112 + }, + { + "epoch": 1.6698033124424296, + "grad_norm": 7.174592211894863, + "learning_rate": 3.4878992755156354e-07, + "loss": 0.5619, + "step": 23113 + }, + { + "epoch": 1.669875557642639, + "grad_norm": 7.691282504797462, + "learning_rate": 3.486409189154705e-07, + "loss": 0.6225, + "step": 23114 + }, + { + "epoch": 1.6699478028428487, + "grad_norm": 7.551334682495575, + "learning_rate": 3.484919397298134e-07, + "loss": 0.5486, + "step": 23115 + }, + { + "epoch": 1.670020048043058, + "grad_norm": 6.297951428814534, + "learning_rate": 3.4834298999663035e-07, + "loss": 0.5734, + "step": 23116 + }, + { + "epoch": 1.6700922932432678, + "grad_norm": 6.980927779395287, + "learning_rate": 3.4819406971796115e-07, + "loss": 0.5931, + "step": 23117 + }, + { + "epoch": 1.6701645384434771, + "grad_norm": 7.2002344517153025, + "learning_rate": 3.48045178895845e-07, + "loss": 0.6149, + "step": 23118 + }, + { + "epoch": 1.6702367836436867, + "grad_norm": 7.73483170210968, + "learning_rate": 3.478963175323191e-07, + "loss": 0.5564, + "step": 23119 + }, + { + "epoch": 1.6703090288438962, + "grad_norm": 8.219903785161033, + "learning_rate": 3.4774748562942135e-07, + "loss": 0.6231, + "step": 23120 + }, + { + "epoch": 1.6703812740441057, + "grad_norm": 7.643692867286994, + "learning_rate": 3.4759868318918894e-07, + "loss": 0.5963, + "step": 23121 + }, + { + "epoch": 1.6704535192443153, + "grad_norm": 7.899459878347943, + "learning_rate": 3.474499102136605e-07, + "loss": 0.6814, + "step": 23122 + }, + { + "epoch": 1.6705257644445246, + "grad_norm": 7.229583405750509, + "learning_rate": 3.4730116670487063e-07, + "loss": 0.5472, + "step": 23123 + }, + { + "epoch": 1.6705980096447344, + "grad_norm": 6.082461812894699, + "learning_rate": 3.4715245266485647e-07, + "loss": 0.6402, + "step": 23124 + }, + { + "epoch": 1.6706702548449437, + "grad_norm": 8.20199071398909, + "learning_rate": 3.470037680956545e-07, + "loss": 0.6595, + "step": 23125 + }, + { + "epoch": 1.6707425000451532, + "grad_norm": 8.32142148932665, + "learning_rate": 3.4685511299929837e-07, + "loss": 0.578, + "step": 23126 + }, + { + "epoch": 1.6708147452453628, + "grad_norm": 8.349303950919726, + "learning_rate": 3.4670648737782394e-07, + "loss": 0.6888, + "step": 23127 + }, + { + "epoch": 1.6708869904455723, + "grad_norm": 7.220982854417331, + "learning_rate": 3.4655789123326597e-07, + "loss": 0.5938, + "step": 23128 + }, + { + "epoch": 1.6709592356457819, + "grad_norm": 6.682805490300567, + "learning_rate": 3.4640932456765805e-07, + "loss": 0.5768, + "step": 23129 + }, + { + "epoch": 1.6710314808459912, + "grad_norm": 7.5208158906715, + "learning_rate": 3.4626078738303484e-07, + "loss": 0.5892, + "step": 23130 + }, + { + "epoch": 1.671103726046201, + "grad_norm": 6.7177532077622795, + "learning_rate": 3.4611227968142866e-07, + "loss": 0.571, + "step": 23131 + }, + { + "epoch": 1.6711759712464103, + "grad_norm": 7.3638676822177445, + "learning_rate": 3.459638014648739e-07, + "loss": 0.5778, + "step": 23132 + }, + { + "epoch": 1.6712482164466198, + "grad_norm": 8.20800728965272, + "learning_rate": 3.458153527354016e-07, + "loss": 0.6546, + "step": 23133 + }, + { + "epoch": 1.6713204616468293, + "grad_norm": 6.700314517254168, + "learning_rate": 3.4566693349504437e-07, + "loss": 0.5788, + "step": 23134 + }, + { + "epoch": 1.671392706847039, + "grad_norm": 7.503024699324391, + "learning_rate": 3.4551854374583416e-07, + "loss": 0.6244, + "step": 23135 + }, + { + "epoch": 1.6714649520472484, + "grad_norm": 8.606561384475572, + "learning_rate": 3.453701834898027e-07, + "loss": 0.5718, + "step": 23136 + }, + { + "epoch": 1.6715371972474578, + "grad_norm": 5.986874210410579, + "learning_rate": 3.4522185272897997e-07, + "loss": 0.602, + "step": 23137 + }, + { + "epoch": 1.6716094424476675, + "grad_norm": 7.515874973509847, + "learning_rate": 3.450735514653972e-07, + "loss": 0.6068, + "step": 23138 + }, + { + "epoch": 1.6716816876478768, + "grad_norm": 7.381804349982854, + "learning_rate": 3.4492527970108407e-07, + "loss": 0.6565, + "step": 23139 + }, + { + "epoch": 1.6717539328480864, + "grad_norm": 7.762759134632751, + "learning_rate": 3.447770374380707e-07, + "loss": 0.6655, + "step": 23140 + }, + { + "epoch": 1.671826178048296, + "grad_norm": 6.403539465638381, + "learning_rate": 3.4462882467838605e-07, + "loss": 0.6111, + "step": 23141 + }, + { + "epoch": 1.6718984232485055, + "grad_norm": 6.364053041267063, + "learning_rate": 3.444806414240595e-07, + "loss": 0.6138, + "step": 23142 + }, + { + "epoch": 1.671970668448715, + "grad_norm": 6.765208312983895, + "learning_rate": 3.443324876771198e-07, + "loss": 0.61, + "step": 23143 + }, + { + "epoch": 1.6720429136489243, + "grad_norm": 6.433813131884981, + "learning_rate": 3.441843634395939e-07, + "loss": 0.5776, + "step": 23144 + }, + { + "epoch": 1.672115158849134, + "grad_norm": 6.613020520554389, + "learning_rate": 3.4403626871351045e-07, + "loss": 0.5806, + "step": 23145 + }, + { + "epoch": 1.6721874040493434, + "grad_norm": 6.039086187595038, + "learning_rate": 3.4388820350089615e-07, + "loss": 0.6599, + "step": 23146 + }, + { + "epoch": 1.672259649249553, + "grad_norm": 6.069567464032297, + "learning_rate": 3.437401678037791e-07, + "loss": 0.6612, + "step": 23147 + }, + { + "epoch": 1.6723318944497625, + "grad_norm": 7.899461085613071, + "learning_rate": 3.435921616241841e-07, + "loss": 0.5481, + "step": 23148 + }, + { + "epoch": 1.672404139649972, + "grad_norm": 7.530425093811495, + "learning_rate": 3.4344418496413763e-07, + "loss": 0.6142, + "step": 23149 + }, + { + "epoch": 1.6724763848501816, + "grad_norm": 6.1584927109401635, + "learning_rate": 3.432962378256668e-07, + "loss": 0.5826, + "step": 23150 + }, + { + "epoch": 1.672548630050391, + "grad_norm": 6.466307542929434, + "learning_rate": 3.4314832021079547e-07, + "loss": 0.5773, + "step": 23151 + }, + { + "epoch": 1.6726208752506007, + "grad_norm": 6.453221946154349, + "learning_rate": 3.430004321215491e-07, + "loss": 0.6419, + "step": 23152 + }, + { + "epoch": 1.67269312045081, + "grad_norm": 8.418053037571061, + "learning_rate": 3.4285257355995166e-07, + "loss": 0.6616, + "step": 23153 + }, + { + "epoch": 1.6727653656510195, + "grad_norm": 7.893669885514254, + "learning_rate": 3.427047445280285e-07, + "loss": 0.6795, + "step": 23154 + }, + { + "epoch": 1.672837610851229, + "grad_norm": 7.246792313867704, + "learning_rate": 3.425569450278016e-07, + "loss": 0.6499, + "step": 23155 + }, + { + "epoch": 1.6729098560514386, + "grad_norm": 7.974344122532606, + "learning_rate": 3.4240917506129483e-07, + "loss": 0.6173, + "step": 23156 + }, + { + "epoch": 1.6729821012516481, + "grad_norm": 7.257815653591887, + "learning_rate": 3.422614346305314e-07, + "loss": 0.5672, + "step": 23157 + }, + { + "epoch": 1.6730543464518575, + "grad_norm": 7.861617327754935, + "learning_rate": 3.4211372373753355e-07, + "loss": 0.6181, + "step": 23158 + }, + { + "epoch": 1.6731265916520672, + "grad_norm": 8.686973061807116, + "learning_rate": 3.4196604238432323e-07, + "loss": 0.6527, + "step": 23159 + }, + { + "epoch": 1.6731988368522765, + "grad_norm": 7.057314969595598, + "learning_rate": 3.418183905729222e-07, + "loss": 0.6295, + "step": 23160 + }, + { + "epoch": 1.673271082052486, + "grad_norm": 7.275369389650217, + "learning_rate": 3.4167076830535246e-07, + "loss": 0.6078, + "step": 23161 + }, + { + "epoch": 1.6733433272526956, + "grad_norm": 6.938748633938289, + "learning_rate": 3.4152317558363316e-07, + "loss": 0.6267, + "step": 23162 + }, + { + "epoch": 1.6734155724529052, + "grad_norm": 6.589943605219297, + "learning_rate": 3.413756124097858e-07, + "loss": 0.5781, + "step": 23163 + }, + { + "epoch": 1.6734878176531147, + "grad_norm": 6.7386098764973, + "learning_rate": 3.412280787858305e-07, + "loss": 0.5358, + "step": 23164 + }, + { + "epoch": 1.673560062853324, + "grad_norm": 7.261771806725217, + "learning_rate": 3.410805747137869e-07, + "loss": 0.5812, + "step": 23165 + }, + { + "epoch": 1.6736323080535338, + "grad_norm": 7.514600148581853, + "learning_rate": 3.409331001956734e-07, + "loss": 0.6335, + "step": 23166 + }, + { + "epoch": 1.6737045532537431, + "grad_norm": 7.470515724773746, + "learning_rate": 3.407856552335093e-07, + "loss": 0.6112, + "step": 23167 + }, + { + "epoch": 1.6737767984539527, + "grad_norm": 6.798049018577637, + "learning_rate": 3.4063823982931315e-07, + "loss": 0.5733, + "step": 23168 + }, + { + "epoch": 1.6738490436541622, + "grad_norm": 7.958408002519745, + "learning_rate": 3.404908539851029e-07, + "loss": 0.6277, + "step": 23169 + }, + { + "epoch": 1.6739212888543717, + "grad_norm": 6.849652257849796, + "learning_rate": 3.40343497702896e-07, + "loss": 0.6272, + "step": 23170 + }, + { + "epoch": 1.6739935340545813, + "grad_norm": 7.589269250162682, + "learning_rate": 3.4019617098470986e-07, + "loss": 0.6486, + "step": 23171 + }, + { + "epoch": 1.6740657792547906, + "grad_norm": 6.99149214201276, + "learning_rate": 3.400488738325616e-07, + "loss": 0.5357, + "step": 23172 + }, + { + "epoch": 1.6741380244550004, + "grad_norm": 6.376694996211363, + "learning_rate": 3.399016062484667e-07, + "loss": 0.5771, + "step": 23173 + }, + { + "epoch": 1.6742102696552097, + "grad_norm": 7.163786937942552, + "learning_rate": 3.397543682344415e-07, + "loss": 0.6664, + "step": 23174 + }, + { + "epoch": 1.6742825148554195, + "grad_norm": 7.319286360009712, + "learning_rate": 3.396071597925024e-07, + "loss": 0.6084, + "step": 23175 + }, + { + "epoch": 1.6743547600556288, + "grad_norm": 7.232614916914694, + "learning_rate": 3.394599809246632e-07, + "loss": 0.6027, + "step": 23176 + }, + { + "epoch": 1.6744270052558383, + "grad_norm": 8.527451113043778, + "learning_rate": 3.3931283163293916e-07, + "loss": 0.6672, + "step": 23177 + }, + { + "epoch": 1.6744992504560479, + "grad_norm": 7.386651097819496, + "learning_rate": 3.391657119193445e-07, + "loss": 0.609, + "step": 23178 + }, + { + "epoch": 1.6745714956562572, + "grad_norm": 5.816435435307066, + "learning_rate": 3.3901862178589427e-07, + "loss": 0.5988, + "step": 23179 + }, + { + "epoch": 1.674643740856467, + "grad_norm": 8.39044132422332, + "learning_rate": 3.388715612346011e-07, + "loss": 0.6311, + "step": 23180 + }, + { + "epoch": 1.6747159860566763, + "grad_norm": 7.658350317780178, + "learning_rate": 3.38724530267478e-07, + "loss": 0.6095, + "step": 23181 + }, + { + "epoch": 1.674788231256886, + "grad_norm": 7.500131733055333, + "learning_rate": 3.385775288865384e-07, + "loss": 0.6215, + "step": 23182 + }, + { + "epoch": 1.6748604764570953, + "grad_norm": 8.533079070038397, + "learning_rate": 3.3843055709379376e-07, + "loss": 0.6041, + "step": 23183 + }, + { + "epoch": 1.674932721657305, + "grad_norm": 6.359859663656601, + "learning_rate": 3.3828361489125634e-07, + "loss": 0.6153, + "step": 23184 + }, + { + "epoch": 1.6750049668575144, + "grad_norm": 6.499785493098893, + "learning_rate": 3.381367022809379e-07, + "loss": 0.5367, + "step": 23185 + }, + { + "epoch": 1.6750772120577238, + "grad_norm": 5.776952847661262, + "learning_rate": 3.3798981926484926e-07, + "loss": 0.5916, + "step": 23186 + }, + { + "epoch": 1.6751494572579335, + "grad_norm": 7.586049889341882, + "learning_rate": 3.3784296584500136e-07, + "loss": 0.5471, + "step": 23187 + }, + { + "epoch": 1.6752217024581428, + "grad_norm": 7.509300822742125, + "learning_rate": 3.3769614202340457e-07, + "loss": 0.6175, + "step": 23188 + }, + { + "epoch": 1.6752939476583526, + "grad_norm": 8.306322139659223, + "learning_rate": 3.3754934780206917e-07, + "loss": 0.5598, + "step": 23189 + }, + { + "epoch": 1.675366192858562, + "grad_norm": 7.049277562129982, + "learning_rate": 3.374025831830036e-07, + "loss": 0.6131, + "step": 23190 + }, + { + "epoch": 1.6754384380587715, + "grad_norm": 7.975705690163317, + "learning_rate": 3.3725584816821753e-07, + "loss": 0.5625, + "step": 23191 + }, + { + "epoch": 1.675510683258981, + "grad_norm": 7.6634549863544486, + "learning_rate": 3.371091427597198e-07, + "loss": 0.5965, + "step": 23192 + }, + { + "epoch": 1.6755829284591905, + "grad_norm": 7.214776601101977, + "learning_rate": 3.3696246695951894e-07, + "loss": 0.5939, + "step": 23193 + }, + { + "epoch": 1.6756551736594, + "grad_norm": 7.111950083090783, + "learning_rate": 3.3681582076962176e-07, + "loss": 0.5985, + "step": 23194 + }, + { + "epoch": 1.6757274188596094, + "grad_norm": 9.541410040146017, + "learning_rate": 3.366692041920364e-07, + "loss": 0.6569, + "step": 23195 + }, + { + "epoch": 1.6757996640598192, + "grad_norm": 7.583324921432133, + "learning_rate": 3.3652261722877e-07, + "loss": 0.5921, + "step": 23196 + }, + { + "epoch": 1.6758719092600285, + "grad_norm": 6.797712322705752, + "learning_rate": 3.3637605988182916e-07, + "loss": 0.5718, + "step": 23197 + }, + { + "epoch": 1.675944154460238, + "grad_norm": 7.570721909482139, + "learning_rate": 3.3622953215322e-07, + "loss": 0.6017, + "step": 23198 + }, + { + "epoch": 1.6760163996604476, + "grad_norm": 7.429644114210206, + "learning_rate": 3.360830340449486e-07, + "loss": 0.6414, + "step": 23199 + }, + { + "epoch": 1.6760886448606571, + "grad_norm": 5.964474409918469, + "learning_rate": 3.359365655590208e-07, + "loss": 0.5843, + "step": 23200 + }, + { + "epoch": 1.6761608900608667, + "grad_norm": 8.264459134755224, + "learning_rate": 3.357901266974406e-07, + "loss": 0.5896, + "step": 23201 + }, + { + "epoch": 1.676233135261076, + "grad_norm": 7.6485267766431555, + "learning_rate": 3.356437174622132e-07, + "loss": 0.5961, + "step": 23202 + }, + { + "epoch": 1.6763053804612857, + "grad_norm": 7.582966750230903, + "learning_rate": 3.354973378553428e-07, + "loss": 0.6352, + "step": 23203 + }, + { + "epoch": 1.676377625661495, + "grad_norm": 6.547921108527947, + "learning_rate": 3.3535098787883356e-07, + "loss": 0.5908, + "step": 23204 + }, + { + "epoch": 1.6764498708617046, + "grad_norm": 7.740425840979747, + "learning_rate": 3.3520466753468814e-07, + "loss": 0.6144, + "step": 23205 + }, + { + "epoch": 1.6765221160619141, + "grad_norm": 6.471236630964405, + "learning_rate": 3.3505837682490963e-07, + "loss": 0.6542, + "step": 23206 + }, + { + "epoch": 1.6765943612621237, + "grad_norm": 6.595693871217968, + "learning_rate": 3.3491211575150173e-07, + "loss": 0.5763, + "step": 23207 + }, + { + "epoch": 1.6766666064623332, + "grad_norm": 7.6854739116612825, + "learning_rate": 3.3476588431646556e-07, + "loss": 0.6179, + "step": 23208 + }, + { + "epoch": 1.6767388516625426, + "grad_norm": 7.766601009937268, + "learning_rate": 3.3461968252180313e-07, + "loss": 0.5578, + "step": 23209 + }, + { + "epoch": 1.6768110968627523, + "grad_norm": 7.520378909203198, + "learning_rate": 3.34473510369516e-07, + "loss": 0.5811, + "step": 23210 + }, + { + "epoch": 1.6768833420629616, + "grad_norm": 7.73713280418147, + "learning_rate": 3.3432736786160544e-07, + "loss": 0.5592, + "step": 23211 + }, + { + "epoch": 1.6769555872631712, + "grad_norm": 6.815978955536354, + "learning_rate": 3.341812550000714e-07, + "loss": 0.5631, + "step": 23212 + }, + { + "epoch": 1.6770278324633807, + "grad_norm": 6.252419575597627, + "learning_rate": 3.340351717869142e-07, + "loss": 0.4967, + "step": 23213 + }, + { + "epoch": 1.6771000776635903, + "grad_norm": 9.356395113322307, + "learning_rate": 3.338891182241338e-07, + "loss": 0.612, + "step": 23214 + }, + { + "epoch": 1.6771723228637998, + "grad_norm": 5.744411240331499, + "learning_rate": 3.337430943137296e-07, + "loss": 0.5842, + "step": 23215 + }, + { + "epoch": 1.6772445680640091, + "grad_norm": 6.881340570919195, + "learning_rate": 3.3359710005770013e-07, + "loss": 0.5971, + "step": 23216 + }, + { + "epoch": 1.6773168132642189, + "grad_norm": 8.817830522970825, + "learning_rate": 3.334511354580444e-07, + "loss": 0.6515, + "step": 23217 + }, + { + "epoch": 1.6773890584644282, + "grad_norm": 8.131669344340287, + "learning_rate": 3.333052005167611e-07, + "loss": 0.6203, + "step": 23218 + }, + { + "epoch": 1.6774613036646377, + "grad_norm": 8.013853475961467, + "learning_rate": 3.3315929523584646e-07, + "loss": 0.6855, + "step": 23219 + }, + { + "epoch": 1.6775335488648473, + "grad_norm": 7.478248334378497, + "learning_rate": 3.3301341961729877e-07, + "loss": 0.6259, + "step": 23220 + }, + { + "epoch": 1.6776057940650568, + "grad_norm": 6.261193284935257, + "learning_rate": 3.3286757366311486e-07, + "loss": 0.656, + "step": 23221 + }, + { + "epoch": 1.6776780392652664, + "grad_norm": 7.644552208389768, + "learning_rate": 3.3272175737529166e-07, + "loss": 0.5948, + "step": 23222 + }, + { + "epoch": 1.6777502844654757, + "grad_norm": 7.2793817701985395, + "learning_rate": 3.325759707558243e-07, + "loss": 0.6163, + "step": 23223 + }, + { + "epoch": 1.6778225296656855, + "grad_norm": 7.799444902670633, + "learning_rate": 3.3243021380670925e-07, + "loss": 0.5211, + "step": 23224 + }, + { + "epoch": 1.6778947748658948, + "grad_norm": 7.823711925437716, + "learning_rate": 3.3228448652994127e-07, + "loss": 0.63, + "step": 23225 + }, + { + "epoch": 1.6779670200661043, + "grad_norm": 6.640164563541193, + "learning_rate": 3.3213878892751543e-07, + "loss": 0.5869, + "step": 23226 + }, + { + "epoch": 1.6780392652663139, + "grad_norm": 6.671210012417924, + "learning_rate": 3.3199312100142655e-07, + "loss": 0.6338, + "step": 23227 + }, + { + "epoch": 1.6781115104665234, + "grad_norm": 7.781205142707677, + "learning_rate": 3.3184748275366855e-07, + "loss": 0.6156, + "step": 23228 + }, + { + "epoch": 1.678183755666733, + "grad_norm": 7.5801314696332165, + "learning_rate": 3.3170187418623545e-07, + "loss": 0.6303, + "step": 23229 + }, + { + "epoch": 1.6782560008669423, + "grad_norm": 6.286530206412183, + "learning_rate": 3.315562953011198e-07, + "loss": 0.6137, + "step": 23230 + }, + { + "epoch": 1.678328246067152, + "grad_norm": 7.227351888895941, + "learning_rate": 3.3141074610031436e-07, + "loss": 0.5704, + "step": 23231 + }, + { + "epoch": 1.6784004912673613, + "grad_norm": 7.336401586508565, + "learning_rate": 3.3126522658581293e-07, + "loss": 0.5976, + "step": 23232 + }, + { + "epoch": 1.678472736467571, + "grad_norm": 7.22572867866931, + "learning_rate": 3.3111973675960587e-07, + "loss": 0.6571, + "step": 23233 + }, + { + "epoch": 1.6785449816677804, + "grad_norm": 7.701963387393629, + "learning_rate": 3.309742766236851e-07, + "loss": 0.6117, + "step": 23234 + }, + { + "epoch": 1.67861722686799, + "grad_norm": 6.717412771177524, + "learning_rate": 3.3082884618004294e-07, + "loss": 0.5911, + "step": 23235 + }, + { + "epoch": 1.6786894720681995, + "grad_norm": 8.163961572342652, + "learning_rate": 3.306834454306701e-07, + "loss": 0.6058, + "step": 23236 + }, + { + "epoch": 1.6787617172684088, + "grad_norm": 7.663918652287922, + "learning_rate": 3.3053807437755604e-07, + "loss": 0.6504, + "step": 23237 + }, + { + "epoch": 1.6788339624686186, + "grad_norm": 8.921252441063839, + "learning_rate": 3.303927330226914e-07, + "loss": 0.6249, + "step": 23238 + }, + { + "epoch": 1.678906207668828, + "grad_norm": 7.36553062666591, + "learning_rate": 3.3024742136806594e-07, + "loss": 0.6387, + "step": 23239 + }, + { + "epoch": 1.6789784528690375, + "grad_norm": 6.94956457297191, + "learning_rate": 3.301021394156681e-07, + "loss": 0.6211, + "step": 23240 + }, + { + "epoch": 1.679050698069247, + "grad_norm": 7.318555362501137, + "learning_rate": 3.2995688716748743e-07, + "loss": 0.5691, + "step": 23241 + }, + { + "epoch": 1.6791229432694565, + "grad_norm": 7.658569980931952, + "learning_rate": 3.298116646255117e-07, + "loss": 0.6073, + "step": 23242 + }, + { + "epoch": 1.679195188469666, + "grad_norm": 7.631676924227155, + "learning_rate": 3.2966647179172945e-07, + "loss": 0.5802, + "step": 23243 + }, + { + "epoch": 1.6792674336698754, + "grad_norm": 7.241134748103058, + "learning_rate": 3.2952130866812787e-07, + "loss": 0.5908, + "step": 23244 + }, + { + "epoch": 1.6793396788700852, + "grad_norm": 6.699995114908288, + "learning_rate": 3.293761752566943e-07, + "loss": 0.6638, + "step": 23245 + }, + { + "epoch": 1.6794119240702945, + "grad_norm": 7.150925166956245, + "learning_rate": 3.2923107155941546e-07, + "loss": 0.6557, + "step": 23246 + }, + { + "epoch": 1.6794841692705043, + "grad_norm": 7.687150900752974, + "learning_rate": 3.290859975782787e-07, + "loss": 0.5949, + "step": 23247 + }, + { + "epoch": 1.6795564144707136, + "grad_norm": 7.259623750127013, + "learning_rate": 3.289409533152682e-07, + "loss": 0.602, + "step": 23248 + }, + { + "epoch": 1.6796286596709231, + "grad_norm": 8.8197389995684, + "learning_rate": 3.287959387723702e-07, + "loss": 0.6083, + "step": 23249 + }, + { + "epoch": 1.6797009048711327, + "grad_norm": 8.304185882663521, + "learning_rate": 3.286509539515706e-07, + "loss": 0.6322, + "step": 23250 + }, + { + "epoch": 1.679773150071342, + "grad_norm": 7.389901317304053, + "learning_rate": 3.285059988548531e-07, + "loss": 0.6251, + "step": 23251 + }, + { + "epoch": 1.6798453952715517, + "grad_norm": 6.001364552779753, + "learning_rate": 3.2836107348420245e-07, + "loss": 0.5261, + "step": 23252 + }, + { + "epoch": 1.679917640471761, + "grad_norm": 8.48995243198472, + "learning_rate": 3.2821617784160237e-07, + "loss": 0.5818, + "step": 23253 + }, + { + "epoch": 1.6799898856719708, + "grad_norm": 6.4949887838993625, + "learning_rate": 3.280713119290366e-07, + "loss": 0.5286, + "step": 23254 + }, + { + "epoch": 1.6800621308721801, + "grad_norm": 8.017525549842844, + "learning_rate": 3.279264757484882e-07, + "loss": 0.5754, + "step": 23255 + }, + { + "epoch": 1.6801343760723897, + "grad_norm": 6.123821845893636, + "learning_rate": 3.277816693019398e-07, + "loss": 0.5544, + "step": 23256 + }, + { + "epoch": 1.6802066212725992, + "grad_norm": 6.546670478047947, + "learning_rate": 3.276368925913742e-07, + "loss": 0.6478, + "step": 23257 + }, + { + "epoch": 1.6802788664728086, + "grad_norm": 7.202147311970251, + "learning_rate": 3.2749214561877215e-07, + "loss": 0.62, + "step": 23258 + }, + { + "epoch": 1.6803511116730183, + "grad_norm": 9.624535487472077, + "learning_rate": 3.273474283861161e-07, + "loss": 0.6471, + "step": 23259 + }, + { + "epoch": 1.6804233568732276, + "grad_norm": 6.61406222315733, + "learning_rate": 3.272027408953865e-07, + "loss": 0.6548, + "step": 23260 + }, + { + "epoch": 1.6804956020734374, + "grad_norm": 8.145385053418718, + "learning_rate": 3.270580831485651e-07, + "loss": 0.5282, + "step": 23261 + }, + { + "epoch": 1.6805678472736467, + "grad_norm": 6.862570329038245, + "learning_rate": 3.269134551476308e-07, + "loss": 0.5875, + "step": 23262 + }, + { + "epoch": 1.6806400924738563, + "grad_norm": 7.1139411096502805, + "learning_rate": 3.2676885689456345e-07, + "loss": 0.6045, + "step": 23263 + }, + { + "epoch": 1.6807123376740658, + "grad_norm": 7.221449574279212, + "learning_rate": 3.26624288391344e-07, + "loss": 0.5926, + "step": 23264 + }, + { + "epoch": 1.6807845828742753, + "grad_norm": 7.401122044219223, + "learning_rate": 3.2647974963994995e-07, + "loss": 0.595, + "step": 23265 + }, + { + "epoch": 1.6808568280744849, + "grad_norm": 7.344538999337737, + "learning_rate": 3.263352406423606e-07, + "loss": 0.558, + "step": 23266 + }, + { + "epoch": 1.6809290732746942, + "grad_norm": 6.221423930472529, + "learning_rate": 3.261907614005541e-07, + "loss": 0.6218, + "step": 23267 + }, + { + "epoch": 1.681001318474904, + "grad_norm": 8.19395227480829, + "learning_rate": 3.2604631191650886e-07, + "loss": 0.5558, + "step": 23268 + }, + { + "epoch": 1.6810735636751133, + "grad_norm": 7.533593537329536, + "learning_rate": 3.259018921922011e-07, + "loss": 0.6199, + "step": 23269 + }, + { + "epoch": 1.6811458088753228, + "grad_norm": 7.442183358373296, + "learning_rate": 3.2575750222960804e-07, + "loss": 0.6028, + "step": 23270 + }, + { + "epoch": 1.6812180540755324, + "grad_norm": 7.7215242535457795, + "learning_rate": 3.2561314203070683e-07, + "loss": 0.612, + "step": 23271 + }, + { + "epoch": 1.681290299275742, + "grad_norm": 7.93478419182894, + "learning_rate": 3.254688115974733e-07, + "loss": 0.6113, + "step": 23272 + }, + { + "epoch": 1.6813625444759515, + "grad_norm": 7.570051978070137, + "learning_rate": 3.2532451093188337e-07, + "loss": 0.564, + "step": 23273 + }, + { + "epoch": 1.6814347896761608, + "grad_norm": 6.361412078237849, + "learning_rate": 3.251802400359125e-07, + "loss": 0.5166, + "step": 23274 + }, + { + "epoch": 1.6815070348763705, + "grad_norm": 7.022064583990133, + "learning_rate": 3.2503599891153593e-07, + "loss": 0.6596, + "step": 23275 + }, + { + "epoch": 1.6815792800765799, + "grad_norm": 6.72827155081626, + "learning_rate": 3.2489178756072715e-07, + "loss": 0.6248, + "step": 23276 + }, + { + "epoch": 1.6816515252767894, + "grad_norm": 8.603999126240975, + "learning_rate": 3.2474760598546094e-07, + "loss": 0.6538, + "step": 23277 + }, + { + "epoch": 1.681723770476999, + "grad_norm": 6.408427840562403, + "learning_rate": 3.2460345418771104e-07, + "loss": 0.5554, + "step": 23278 + }, + { + "epoch": 1.6817960156772085, + "grad_norm": 8.861432087292398, + "learning_rate": 3.244593321694514e-07, + "loss": 0.7173, + "step": 23279 + }, + { + "epoch": 1.681868260877418, + "grad_norm": 6.736902032761307, + "learning_rate": 3.243152399326538e-07, + "loss": 0.5663, + "step": 23280 + }, + { + "epoch": 1.6819405060776274, + "grad_norm": 7.556795853681556, + "learning_rate": 3.241711774792913e-07, + "loss": 0.5746, + "step": 23281 + }, + { + "epoch": 1.6820127512778371, + "grad_norm": 6.4913780543263275, + "learning_rate": 3.240271448113358e-07, + "loss": 0.6379, + "step": 23282 + }, + { + "epoch": 1.6820849964780464, + "grad_norm": 7.85826779523956, + "learning_rate": 3.238831419307592e-07, + "loss": 0.6986, + "step": 23283 + }, + { + "epoch": 1.682157241678256, + "grad_norm": 8.024809039449407, + "learning_rate": 3.2373916883953275e-07, + "loss": 0.6651, + "step": 23284 + }, + { + "epoch": 1.6822294868784655, + "grad_norm": 6.9637871420559145, + "learning_rate": 3.2359522553962743e-07, + "loss": 0.6026, + "step": 23285 + }, + { + "epoch": 1.682301732078675, + "grad_norm": 6.4752356409033185, + "learning_rate": 3.234513120330138e-07, + "loss": 0.5119, + "step": 23286 + }, + { + "epoch": 1.6823739772788846, + "grad_norm": 8.652635170741293, + "learning_rate": 3.233074283216614e-07, + "loss": 0.6436, + "step": 23287 + }, + { + "epoch": 1.682446222479094, + "grad_norm": 6.644000750427387, + "learning_rate": 3.2316357440754036e-07, + "loss": 0.5793, + "step": 23288 + }, + { + "epoch": 1.6825184676793037, + "grad_norm": 7.30660877766382, + "learning_rate": 3.230197502926202e-07, + "loss": 0.5388, + "step": 23289 + }, + { + "epoch": 1.682590712879513, + "grad_norm": 6.769440524312702, + "learning_rate": 3.2287595597886884e-07, + "loss": 0.6955, + "step": 23290 + }, + { + "epoch": 1.6826629580797225, + "grad_norm": 7.695309525697395, + "learning_rate": 3.227321914682546e-07, + "loss": 0.6292, + "step": 23291 + }, + { + "epoch": 1.682735203279932, + "grad_norm": 8.714441472253965, + "learning_rate": 3.225884567627466e-07, + "loss": 0.6561, + "step": 23292 + }, + { + "epoch": 1.6828074484801416, + "grad_norm": 6.624842228000485, + "learning_rate": 3.224447518643126e-07, + "loss": 0.5871, + "step": 23293 + }, + { + "epoch": 1.6828796936803512, + "grad_norm": 7.386417150770414, + "learning_rate": 3.223010767749188e-07, + "loss": 0.5778, + "step": 23294 + }, + { + "epoch": 1.6829519388805605, + "grad_norm": 7.226268205578525, + "learning_rate": 3.22157431496532e-07, + "loss": 0.6672, + "step": 23295 + }, + { + "epoch": 1.6830241840807703, + "grad_norm": 8.02943013868673, + "learning_rate": 3.2201381603111923e-07, + "loss": 0.5917, + "step": 23296 + }, + { + "epoch": 1.6830964292809796, + "grad_norm": 6.792045754304402, + "learning_rate": 3.2187023038064676e-07, + "loss": 0.6174, + "step": 23297 + }, + { + "epoch": 1.6831686744811891, + "grad_norm": 8.393278302695789, + "learning_rate": 3.2172667454707927e-07, + "loss": 0.6316, + "step": 23298 + }, + { + "epoch": 1.6832409196813987, + "grad_norm": 7.699300253643311, + "learning_rate": 3.21583148532382e-07, + "loss": 0.6118, + "step": 23299 + }, + { + "epoch": 1.6833131648816082, + "grad_norm": 8.041915757928404, + "learning_rate": 3.214396523385199e-07, + "loss": 0.6365, + "step": 23300 + }, + { + "epoch": 1.6833854100818177, + "grad_norm": 7.262460720986858, + "learning_rate": 3.2129618596745766e-07, + "loss": 0.6016, + "step": 23301 + }, + { + "epoch": 1.683457655282027, + "grad_norm": 7.030318948773546, + "learning_rate": 3.2115274942115887e-07, + "loss": 0.6366, + "step": 23302 + }, + { + "epoch": 1.6835299004822368, + "grad_norm": 6.373383560195565, + "learning_rate": 3.210093427015873e-07, + "loss": 0.5941, + "step": 23303 + }, + { + "epoch": 1.6836021456824461, + "grad_norm": 5.563022374688437, + "learning_rate": 3.208659658107063e-07, + "loss": 0.5539, + "step": 23304 + }, + { + "epoch": 1.6836743908826557, + "grad_norm": 7.822191742394529, + "learning_rate": 3.2072261875047807e-07, + "loss": 0.6374, + "step": 23305 + }, + { + "epoch": 1.6837466360828652, + "grad_norm": 6.34976737730207, + "learning_rate": 3.2057930152286474e-07, + "loss": 0.6128, + "step": 23306 + }, + { + "epoch": 1.6838188812830748, + "grad_norm": 7.772897918223237, + "learning_rate": 3.204360141298293e-07, + "loss": 0.5757, + "step": 23307 + }, + { + "epoch": 1.6838911264832843, + "grad_norm": 7.391384738962991, + "learning_rate": 3.202927565733319e-07, + "loss": 0.6037, + "step": 23308 + }, + { + "epoch": 1.6839633716834936, + "grad_norm": 6.900719270672929, + "learning_rate": 3.2014952885533413e-07, + "loss": 0.6612, + "step": 23309 + }, + { + "epoch": 1.6840356168837034, + "grad_norm": 7.298023729757764, + "learning_rate": 3.200063309777968e-07, + "loss": 0.5875, + "step": 23310 + }, + { + "epoch": 1.6841078620839127, + "grad_norm": 6.867995144130832, + "learning_rate": 3.1986316294268016e-07, + "loss": 0.6132, + "step": 23311 + }, + { + "epoch": 1.6841801072841223, + "grad_norm": 6.03569033336643, + "learning_rate": 3.197200247519441e-07, + "loss": 0.5789, + "step": 23312 + }, + { + "epoch": 1.6842523524843318, + "grad_norm": 6.993317275172811, + "learning_rate": 3.1957691640754793e-07, + "loss": 0.5509, + "step": 23313 + }, + { + "epoch": 1.6843245976845413, + "grad_norm": 6.155799190842608, + "learning_rate": 3.1943383791145144e-07, + "loss": 0.6311, + "step": 23314 + }, + { + "epoch": 1.6843968428847509, + "grad_norm": 7.405186315197813, + "learning_rate": 3.192907892656119e-07, + "loss": 0.5778, + "step": 23315 + }, + { + "epoch": 1.6844690880849602, + "grad_norm": 6.652653971957768, + "learning_rate": 3.1914777047198836e-07, + "loss": 0.614, + "step": 23316 + }, + { + "epoch": 1.68454133328517, + "grad_norm": 7.741116508368969, + "learning_rate": 3.1900478153253837e-07, + "loss": 0.6276, + "step": 23317 + }, + { + "epoch": 1.6846135784853793, + "grad_norm": 6.064778951593327, + "learning_rate": 3.188618224492204e-07, + "loss": 0.588, + "step": 23318 + }, + { + "epoch": 1.6846858236855888, + "grad_norm": 7.8059679724123985, + "learning_rate": 3.18718893223989e-07, + "loss": 0.6431, + "step": 23319 + }, + { + "epoch": 1.6847580688857984, + "grad_norm": 6.20451650698499, + "learning_rate": 3.1857599385880317e-07, + "loss": 0.6099, + "step": 23320 + }, + { + "epoch": 1.684830314086008, + "grad_norm": 7.324196093714958, + "learning_rate": 3.184331243556185e-07, + "loss": 0.6884, + "step": 23321 + }, + { + "epoch": 1.6849025592862175, + "grad_norm": 7.006640690332831, + "learning_rate": 3.182902847163902e-07, + "loss": 0.6288, + "step": 23322 + }, + { + "epoch": 1.6849748044864268, + "grad_norm": 8.090681635900408, + "learning_rate": 3.181474749430738e-07, + "loss": 0.6351, + "step": 23323 + }, + { + "epoch": 1.6850470496866365, + "grad_norm": 6.575157819239048, + "learning_rate": 3.180046950376245e-07, + "loss": 0.6234, + "step": 23324 + }, + { + "epoch": 1.6851192948868459, + "grad_norm": 6.958529380122872, + "learning_rate": 3.1786194500199707e-07, + "loss": 0.6555, + "step": 23325 + }, + { + "epoch": 1.6851915400870556, + "grad_norm": 6.888820558981607, + "learning_rate": 3.177192248381447e-07, + "loss": 0.5601, + "step": 23326 + }, + { + "epoch": 1.685263785287265, + "grad_norm": 6.2127171683779245, + "learning_rate": 3.1757653454802194e-07, + "loss": 0.6012, + "step": 23327 + }, + { + "epoch": 1.6853360304874745, + "grad_norm": 6.528577339858932, + "learning_rate": 3.174338741335817e-07, + "loss": 0.6119, + "step": 23328 + }, + { + "epoch": 1.685408275687684, + "grad_norm": 7.348205513428304, + "learning_rate": 3.17291243596777e-07, + "loss": 0.6461, + "step": 23329 + }, + { + "epoch": 1.6854805208878934, + "grad_norm": 6.827260955682185, + "learning_rate": 3.1714864293956066e-07, + "loss": 0.5554, + "step": 23330 + }, + { + "epoch": 1.6855527660881031, + "grad_norm": 6.000611274098814, + "learning_rate": 3.170060721638843e-07, + "loss": 0.6064, + "step": 23331 + }, + { + "epoch": 1.6856250112883124, + "grad_norm": 6.756503610147452, + "learning_rate": 3.168635312717006e-07, + "loss": 0.5542, + "step": 23332 + }, + { + "epoch": 1.6856972564885222, + "grad_norm": 6.697329134012818, + "learning_rate": 3.1672102026495925e-07, + "loss": 0.6537, + "step": 23333 + }, + { + "epoch": 1.6857695016887315, + "grad_norm": 6.306051756632213, + "learning_rate": 3.165785391456122e-07, + "loss": 0.5766, + "step": 23334 + }, + { + "epoch": 1.685841746888941, + "grad_norm": 6.970172275072508, + "learning_rate": 3.1643608791560946e-07, + "loss": 0.634, + "step": 23335 + }, + { + "epoch": 1.6859139920891506, + "grad_norm": 7.489604102801377, + "learning_rate": 3.1629366657690175e-07, + "loss": 0.608, + "step": 23336 + }, + { + "epoch": 1.68598623728936, + "grad_norm": 6.473118583084496, + "learning_rate": 3.161512751314377e-07, + "loss": 0.6067, + "step": 23337 + }, + { + "epoch": 1.6860584824895697, + "grad_norm": 6.379461186057492, + "learning_rate": 3.1600891358116726e-07, + "loss": 0.6413, + "step": 23338 + }, + { + "epoch": 1.686130727689779, + "grad_norm": 7.80109665815161, + "learning_rate": 3.1586658192803875e-07, + "loss": 0.6393, + "step": 23339 + }, + { + "epoch": 1.6862029728899888, + "grad_norm": 7.7056684956810875, + "learning_rate": 3.15724280174001e-07, + "loss": 0.6044, + "step": 23340 + }, + { + "epoch": 1.686275218090198, + "grad_norm": 6.957519514116123, + "learning_rate": 3.1558200832100177e-07, + "loss": 0.6669, + "step": 23341 + }, + { + "epoch": 1.6863474632904076, + "grad_norm": 6.8911273799112225, + "learning_rate": 3.1543976637098907e-07, + "loss": 0.572, + "step": 23342 + }, + { + "epoch": 1.6864197084906172, + "grad_norm": 7.494863149705437, + "learning_rate": 3.1529755432590984e-07, + "loss": 0.6371, + "step": 23343 + }, + { + "epoch": 1.6864919536908267, + "grad_norm": 7.186321924937533, + "learning_rate": 3.1515537218771067e-07, + "loss": 0.5873, + "step": 23344 + }, + { + "epoch": 1.6865641988910363, + "grad_norm": 6.932327679302433, + "learning_rate": 3.150132199583378e-07, + "loss": 0.5836, + "step": 23345 + }, + { + "epoch": 1.6866364440912456, + "grad_norm": 7.1612832305771095, + "learning_rate": 3.148710976397379e-07, + "loss": 0.6079, + "step": 23346 + }, + { + "epoch": 1.6867086892914553, + "grad_norm": 7.911354317063985, + "learning_rate": 3.14729005233855e-07, + "loss": 0.5691, + "step": 23347 + }, + { + "epoch": 1.6867809344916647, + "grad_norm": 7.804607830080103, + "learning_rate": 3.1458694274263596e-07, + "loss": 0.5886, + "step": 23348 + }, + { + "epoch": 1.6868531796918742, + "grad_norm": 6.663044644937455, + "learning_rate": 3.144449101680247e-07, + "loss": 0.6339, + "step": 23349 + }, + { + "epoch": 1.6869254248920837, + "grad_norm": 6.206041395171642, + "learning_rate": 3.1430290751196605e-07, + "loss": 0.6176, + "step": 23350 + }, + { + "epoch": 1.6869976700922933, + "grad_norm": 6.022421903377029, + "learning_rate": 3.1416093477640327e-07, + "loss": 0.675, + "step": 23351 + }, + { + "epoch": 1.6870699152925028, + "grad_norm": 5.031423814507807, + "learning_rate": 3.1401899196327996e-07, + "loss": 0.5533, + "step": 23352 + }, + { + "epoch": 1.6871421604927122, + "grad_norm": 6.987586595138592, + "learning_rate": 3.1387707907453905e-07, + "loss": 0.5611, + "step": 23353 + }, + { + "epoch": 1.687214405692922, + "grad_norm": 6.994036450285337, + "learning_rate": 3.137351961121246e-07, + "loss": 0.6041, + "step": 23354 + }, + { + "epoch": 1.6872866508931312, + "grad_norm": 7.273135136958407, + "learning_rate": 3.1359334307797695e-07, + "loss": 0.628, + "step": 23355 + }, + { + "epoch": 1.6873588960933408, + "grad_norm": 6.549675317535726, + "learning_rate": 3.134515199740387e-07, + "loss": 0.5174, + "step": 23356 + }, + { + "epoch": 1.6874311412935503, + "grad_norm": 7.774069050192954, + "learning_rate": 3.1330972680225143e-07, + "loss": 0.5414, + "step": 23357 + }, + { + "epoch": 1.6875033864937599, + "grad_norm": 7.890529035230182, + "learning_rate": 3.1316796356455626e-07, + "loss": 0.4858, + "step": 23358 + }, + { + "epoch": 1.6875756316939694, + "grad_norm": 6.713694955177207, + "learning_rate": 3.1302623026289364e-07, + "loss": 0.5335, + "step": 23359 + }, + { + "epoch": 1.6876478768941787, + "grad_norm": 7.804187961053047, + "learning_rate": 3.1288452689920397e-07, + "loss": 0.6136, + "step": 23360 + }, + { + "epoch": 1.6877201220943885, + "grad_norm": 7.529742458037658, + "learning_rate": 3.127428534754273e-07, + "loss": 0.5927, + "step": 23361 + }, + { + "epoch": 1.6877923672945978, + "grad_norm": 6.613028307976582, + "learning_rate": 3.126012099935022e-07, + "loss": 0.6113, + "step": 23362 + }, + { + "epoch": 1.6878646124948073, + "grad_norm": 7.4181168527869366, + "learning_rate": 3.124595964553684e-07, + "loss": 0.5261, + "step": 23363 + }, + { + "epoch": 1.687936857695017, + "grad_norm": 6.829062670260517, + "learning_rate": 3.123180128629644e-07, + "loss": 0.6038, + "step": 23364 + }, + { + "epoch": 1.6880091028952264, + "grad_norm": 6.413579850617293, + "learning_rate": 3.1217645921822777e-07, + "loss": 0.5571, + "step": 23365 + }, + { + "epoch": 1.688081348095436, + "grad_norm": 6.741713346957067, + "learning_rate": 3.120349355230967e-07, + "loss": 0.5849, + "step": 23366 + }, + { + "epoch": 1.6881535932956453, + "grad_norm": 7.543269652691905, + "learning_rate": 3.118934417795083e-07, + "loss": 0.527, + "step": 23367 + }, + { + "epoch": 1.688225838495855, + "grad_norm": 7.9230936832264955, + "learning_rate": 3.1175197798939986e-07, + "loss": 0.7093, + "step": 23368 + }, + { + "epoch": 1.6882980836960644, + "grad_norm": 7.0599809053070945, + "learning_rate": 3.116105441547079e-07, + "loss": 0.5942, + "step": 23369 + }, + { + "epoch": 1.688370328896274, + "grad_norm": 7.12276838724179, + "learning_rate": 3.1146914027736843e-07, + "loss": 0.5323, + "step": 23370 + }, + { + "epoch": 1.6884425740964835, + "grad_norm": 6.572115293600155, + "learning_rate": 3.1132776635931754e-07, + "loss": 0.5759, + "step": 23371 + }, + { + "epoch": 1.688514819296693, + "grad_norm": 7.5572032191549185, + "learning_rate": 3.1118642240248964e-07, + "loss": 0.6123, + "step": 23372 + }, + { + "epoch": 1.6885870644969025, + "grad_norm": 8.046541729293992, + "learning_rate": 3.110451084088201e-07, + "loss": 0.6585, + "step": 23373 + }, + { + "epoch": 1.6886593096971119, + "grad_norm": 8.663849397176357, + "learning_rate": 3.109038243802434e-07, + "loss": 0.5752, + "step": 23374 + }, + { + "epoch": 1.6887315548973216, + "grad_norm": 7.938093028105149, + "learning_rate": 3.107625703186942e-07, + "loss": 0.5996, + "step": 23375 + }, + { + "epoch": 1.688803800097531, + "grad_norm": 7.036890099015687, + "learning_rate": 3.1062134622610456e-07, + "loss": 0.5666, + "step": 23376 + }, + { + "epoch": 1.6888760452977405, + "grad_norm": 7.153703661336466, + "learning_rate": 3.10480152104409e-07, + "loss": 0.6144, + "step": 23377 + }, + { + "epoch": 1.68894829049795, + "grad_norm": 8.08549732724021, + "learning_rate": 3.1033898795554045e-07, + "loss": 0.6733, + "step": 23378 + }, + { + "epoch": 1.6890205356981596, + "grad_norm": 7.298422802416425, + "learning_rate": 3.1019785378143126e-07, + "loss": 0.6305, + "step": 23379 + }, + { + "epoch": 1.6890927808983691, + "grad_norm": 6.9484985046934105, + "learning_rate": 3.100567495840129e-07, + "loss": 0.6044, + "step": 23380 + }, + { + "epoch": 1.6891650260985784, + "grad_norm": 7.675583201563961, + "learning_rate": 3.0991567536521715e-07, + "loss": 0.5538, + "step": 23381 + }, + { + "epoch": 1.6892372712987882, + "grad_norm": 6.526654100311827, + "learning_rate": 3.0977463112697587e-07, + "loss": 0.6111, + "step": 23382 + }, + { + "epoch": 1.6893095164989975, + "grad_norm": 7.447444390393127, + "learning_rate": 3.096336168712186e-07, + "loss": 0.6186, + "step": 23383 + }, + { + "epoch": 1.689381761699207, + "grad_norm": 6.826576179461003, + "learning_rate": 3.094926325998765e-07, + "loss": 0.5739, + "step": 23384 + }, + { + "epoch": 1.6894540068994166, + "grad_norm": 7.201975699729359, + "learning_rate": 3.093516783148795e-07, + "loss": 0.5641, + "step": 23385 + }, + { + "epoch": 1.6895262520996261, + "grad_norm": 6.6250538373954875, + "learning_rate": 3.0921075401815714e-07, + "loss": 0.5637, + "step": 23386 + }, + { + "epoch": 1.6895984972998357, + "grad_norm": 7.59287913159956, + "learning_rate": 3.0906985971163846e-07, + "loss": 0.6944, + "step": 23387 + }, + { + "epoch": 1.689670742500045, + "grad_norm": 7.481726191637905, + "learning_rate": 3.089289953972521e-07, + "loss": 0.5937, + "step": 23388 + }, + { + "epoch": 1.6897429877002548, + "grad_norm": 7.36800039424336, + "learning_rate": 3.087881610769272e-07, + "loss": 0.615, + "step": 23389 + }, + { + "epoch": 1.689815232900464, + "grad_norm": 7.5588478538521695, + "learning_rate": 3.086473567525905e-07, + "loss": 0.6124, + "step": 23390 + }, + { + "epoch": 1.6898874781006736, + "grad_norm": 6.592372370711556, + "learning_rate": 3.0850658242617017e-07, + "loss": 0.5579, + "step": 23391 + }, + { + "epoch": 1.6899597233008832, + "grad_norm": 6.504969531141112, + "learning_rate": 3.083658380995927e-07, + "loss": 0.556, + "step": 23392 + }, + { + "epoch": 1.6900319685010927, + "grad_norm": 7.10918531898404, + "learning_rate": 3.08225123774786e-07, + "loss": 0.5831, + "step": 23393 + }, + { + "epoch": 1.6901042137013023, + "grad_norm": 6.973764574029179, + "learning_rate": 3.080844394536753e-07, + "loss": 0.6673, + "step": 23394 + }, + { + "epoch": 1.6901764589015116, + "grad_norm": 7.05335555065288, + "learning_rate": 3.079437851381864e-07, + "loss": 0.6079, + "step": 23395 + }, + { + "epoch": 1.6902487041017213, + "grad_norm": 7.982285914932788, + "learning_rate": 3.078031608302451e-07, + "loss": 0.6477, + "step": 23396 + }, + { + "epoch": 1.6903209493019307, + "grad_norm": 7.72287433106027, + "learning_rate": 3.0766256653177616e-07, + "loss": 0.651, + "step": 23397 + }, + { + "epoch": 1.6903931945021404, + "grad_norm": 6.9320781244012775, + "learning_rate": 3.075220022447048e-07, + "loss": 0.6053, + "step": 23398 + }, + { + "epoch": 1.6904654397023497, + "grad_norm": 8.587893512266401, + "learning_rate": 3.073814679709547e-07, + "loss": 0.6605, + "step": 23399 + }, + { + "epoch": 1.6905376849025593, + "grad_norm": 8.365658688967233, + "learning_rate": 3.072409637124504e-07, + "loss": 0.5467, + "step": 23400 + }, + { + "epoch": 1.6906099301027688, + "grad_norm": 9.057794862222995, + "learning_rate": 3.071004894711144e-07, + "loss": 0.6428, + "step": 23401 + }, + { + "epoch": 1.6906821753029782, + "grad_norm": 7.593480737252927, + "learning_rate": 3.069600452488697e-07, + "loss": 0.5923, + "step": 23402 + }, + { + "epoch": 1.690754420503188, + "grad_norm": 7.070352965181165, + "learning_rate": 3.0681963104763954e-07, + "loss": 0.643, + "step": 23403 + }, + { + "epoch": 1.6908266657033972, + "grad_norm": 5.5336495060194, + "learning_rate": 3.0667924686934555e-07, + "loss": 0.6381, + "step": 23404 + }, + { + "epoch": 1.690898910903607, + "grad_norm": 8.592126089467303, + "learning_rate": 3.0653889271590965e-07, + "loss": 0.6201, + "step": 23405 + }, + { + "epoch": 1.6909711561038163, + "grad_norm": 7.687531013736348, + "learning_rate": 3.0639856858925317e-07, + "loss": 0.6279, + "step": 23406 + }, + { + "epoch": 1.6910434013040259, + "grad_norm": 7.280219664141618, + "learning_rate": 3.0625827449129756e-07, + "loss": 0.6042, + "step": 23407 + }, + { + "epoch": 1.6911156465042354, + "grad_norm": 6.925604166605955, + "learning_rate": 3.061180104239625e-07, + "loss": 0.617, + "step": 23408 + }, + { + "epoch": 1.6911878917044447, + "grad_norm": 7.396987466799195, + "learning_rate": 3.059777763891683e-07, + "loss": 0.5859, + "step": 23409 + }, + { + "epoch": 1.6912601369046545, + "grad_norm": 6.664065934568282, + "learning_rate": 3.058375723888349e-07, + "loss": 0.6519, + "step": 23410 + }, + { + "epoch": 1.6913323821048638, + "grad_norm": 9.038664100986868, + "learning_rate": 3.0569739842488176e-07, + "loss": 0.6456, + "step": 23411 + }, + { + "epoch": 1.6914046273050736, + "grad_norm": 8.219552932796228, + "learning_rate": 3.0555725449922717e-07, + "loss": 0.5616, + "step": 23412 + }, + { + "epoch": 1.691476872505283, + "grad_norm": 8.404532218047548, + "learning_rate": 3.0541714061378987e-07, + "loss": 0.6097, + "step": 23413 + }, + { + "epoch": 1.6915491177054924, + "grad_norm": 6.818399377299586, + "learning_rate": 3.0527705677048796e-07, + "loss": 0.576, + "step": 23414 + }, + { + "epoch": 1.691621362905702, + "grad_norm": 6.383563703691069, + "learning_rate": 3.0513700297123886e-07, + "loss": 0.5352, + "step": 23415 + }, + { + "epoch": 1.6916936081059115, + "grad_norm": 8.329704664474201, + "learning_rate": 3.0499697921796026e-07, + "loss": 0.5761, + "step": 23416 + }, + { + "epoch": 1.691765853306121, + "grad_norm": 6.528550169499644, + "learning_rate": 3.0485698551256855e-07, + "loss": 0.6211, + "step": 23417 + }, + { + "epoch": 1.6918380985063304, + "grad_norm": 6.68273470790735, + "learning_rate": 3.0471702185698086e-07, + "loss": 0.6039, + "step": 23418 + }, + { + "epoch": 1.6919103437065401, + "grad_norm": 7.694978627354299, + "learning_rate": 3.045770882531121e-07, + "loss": 0.559, + "step": 23419 + }, + { + "epoch": 1.6919825889067495, + "grad_norm": 6.659831484933748, + "learning_rate": 3.044371847028782e-07, + "loss": 0.5621, + "step": 23420 + }, + { + "epoch": 1.692054834106959, + "grad_norm": 7.083271370411045, + "learning_rate": 3.042973112081951e-07, + "loss": 0.6291, + "step": 23421 + }, + { + "epoch": 1.6921270793071685, + "grad_norm": 7.254964509925821, + "learning_rate": 3.041574677709766e-07, + "loss": 0.6075, + "step": 23422 + }, + { + "epoch": 1.692199324507378, + "grad_norm": 7.040447272484246, + "learning_rate": 3.040176543931372e-07, + "loss": 0.5049, + "step": 23423 + }, + { + "epoch": 1.6922715697075876, + "grad_norm": 7.236266974269288, + "learning_rate": 3.038778710765905e-07, + "loss": 0.5841, + "step": 23424 + }, + { + "epoch": 1.692343814907797, + "grad_norm": 6.188430870240225, + "learning_rate": 3.037381178232518e-07, + "loss": 0.63, + "step": 23425 + }, + { + "epoch": 1.6924160601080067, + "grad_norm": 6.577459523661122, + "learning_rate": 3.035983946350324e-07, + "loss": 0.5947, + "step": 23426 + }, + { + "epoch": 1.692488305308216, + "grad_norm": 6.657646784838463, + "learning_rate": 3.0345870151384544e-07, + "loss": 0.5949, + "step": 23427 + }, + { + "epoch": 1.6925605505084256, + "grad_norm": 6.6278219870476445, + "learning_rate": 3.033190384616036e-07, + "loss": 0.6101, + "step": 23428 + }, + { + "epoch": 1.6926327957086351, + "grad_norm": 7.898880611256804, + "learning_rate": 3.03179405480219e-07, + "loss": 0.6148, + "step": 23429 + }, + { + "epoch": 1.6927050409088447, + "grad_norm": 7.042759135295837, + "learning_rate": 3.0303980257160194e-07, + "loss": 0.5937, + "step": 23430 + }, + { + "epoch": 1.6927772861090542, + "grad_norm": 8.173020000000621, + "learning_rate": 3.0290022973766403e-07, + "loss": 0.5625, + "step": 23431 + }, + { + "epoch": 1.6928495313092635, + "grad_norm": 6.321256399696434, + "learning_rate": 3.027606869803162e-07, + "loss": 0.5771, + "step": 23432 + }, + { + "epoch": 1.6929217765094733, + "grad_norm": 6.530426411514867, + "learning_rate": 3.026211743014687e-07, + "loss": 0.6135, + "step": 23433 + }, + { + "epoch": 1.6929940217096826, + "grad_norm": 6.199764542569402, + "learning_rate": 3.02481691703031e-07, + "loss": 0.5599, + "step": 23434 + }, + { + "epoch": 1.6930662669098921, + "grad_norm": 8.197333800273444, + "learning_rate": 3.023422391869127e-07, + "loss": 0.6372, + "step": 23435 + }, + { + "epoch": 1.6931385121101017, + "grad_norm": 5.893229022821842, + "learning_rate": 3.022028167550231e-07, + "loss": 0.6052, + "step": 23436 + }, + { + "epoch": 1.6932107573103112, + "grad_norm": 9.863376778621548, + "learning_rate": 3.0206342440927015e-07, + "loss": 0.6185, + "step": 23437 + }, + { + "epoch": 1.6932830025105208, + "grad_norm": 7.104756022270316, + "learning_rate": 3.019240621515623e-07, + "loss": 0.6686, + "step": 23438 + }, + { + "epoch": 1.69335524771073, + "grad_norm": 8.80914848932254, + "learning_rate": 3.017847299838078e-07, + "loss": 0.6643, + "step": 23439 + }, + { + "epoch": 1.6934274929109399, + "grad_norm": 8.965824939496818, + "learning_rate": 3.016454279079131e-07, + "loss": 0.6287, + "step": 23440 + }, + { + "epoch": 1.6934997381111492, + "grad_norm": 6.562407284036268, + "learning_rate": 3.0150615592578566e-07, + "loss": 0.5537, + "step": 23441 + }, + { + "epoch": 1.6935719833113587, + "grad_norm": 6.5941807032207205, + "learning_rate": 3.013669140393319e-07, + "loss": 0.6044, + "step": 23442 + }, + { + "epoch": 1.6936442285115683, + "grad_norm": 7.774058990937751, + "learning_rate": 3.0122770225045783e-07, + "loss": 0.6086, + "step": 23443 + }, + { + "epoch": 1.6937164737117778, + "grad_norm": 7.748334890348897, + "learning_rate": 3.010885205610692e-07, + "loss": 0.6272, + "step": 23444 + }, + { + "epoch": 1.6937887189119873, + "grad_norm": 8.017696358369998, + "learning_rate": 3.0094936897307165e-07, + "loss": 0.6082, + "step": 23445 + }, + { + "epoch": 1.6938609641121967, + "grad_norm": 5.852227085403544, + "learning_rate": 3.008102474883701e-07, + "loss": 0.5285, + "step": 23446 + }, + { + "epoch": 1.6939332093124064, + "grad_norm": 6.850457514499231, + "learning_rate": 3.006711561088682e-07, + "loss": 0.6454, + "step": 23447 + }, + { + "epoch": 1.6940054545126157, + "grad_norm": 6.921605503183436, + "learning_rate": 3.005320948364707e-07, + "loss": 0.5307, + "step": 23448 + }, + { + "epoch": 1.6940776997128253, + "grad_norm": 7.649087849010168, + "learning_rate": 3.003930636730809e-07, + "loss": 0.611, + "step": 23449 + }, + { + "epoch": 1.6941499449130348, + "grad_norm": 7.426743366295121, + "learning_rate": 3.002540626206027e-07, + "loss": 0.6266, + "step": 23450 + }, + { + "epoch": 1.6942221901132444, + "grad_norm": 7.939134084149449, + "learning_rate": 3.0011509168093783e-07, + "loss": 0.5761, + "step": 23451 + }, + { + "epoch": 1.694294435313454, + "grad_norm": 7.614487201659629, + "learning_rate": 2.9997615085598924e-07, + "loss": 0.6244, + "step": 23452 + }, + { + "epoch": 1.6943666805136632, + "grad_norm": 8.531938126866814, + "learning_rate": 2.9983724014765903e-07, + "loss": 0.6179, + "step": 23453 + }, + { + "epoch": 1.694438925713873, + "grad_norm": 8.994668865120275, + "learning_rate": 2.996983595578487e-07, + "loss": 0.5856, + "step": 23454 + }, + { + "epoch": 1.6945111709140823, + "grad_norm": 7.800452869668775, + "learning_rate": 2.995595090884593e-07, + "loss": 0.6572, + "step": 23455 + }, + { + "epoch": 1.6945834161142919, + "grad_norm": 6.674306243390874, + "learning_rate": 2.994206887413917e-07, + "loss": 0.5869, + "step": 23456 + }, + { + "epoch": 1.6946556613145014, + "grad_norm": 7.180579527522428, + "learning_rate": 2.9928189851854666e-07, + "loss": 0.537, + "step": 23457 + }, + { + "epoch": 1.694727906514711, + "grad_norm": 6.6963485218894725, + "learning_rate": 2.991431384218232e-07, + "loss": 0.5944, + "step": 23458 + }, + { + "epoch": 1.6948001517149205, + "grad_norm": 7.382244366780955, + "learning_rate": 2.9900440845312157e-07, + "loss": 0.6478, + "step": 23459 + }, + { + "epoch": 1.6948723969151298, + "grad_norm": 7.034901937900692, + "learning_rate": 2.9886570861434037e-07, + "loss": 0.549, + "step": 23460 + }, + { + "epoch": 1.6949446421153396, + "grad_norm": 7.435774659139029, + "learning_rate": 2.9872703890737873e-07, + "loss": 0.5682, + "step": 23461 + }, + { + "epoch": 1.695016887315549, + "grad_norm": 7.132396138501679, + "learning_rate": 2.985883993341346e-07, + "loss": 0.6148, + "step": 23462 + }, + { + "epoch": 1.6950891325157584, + "grad_norm": 6.8875246577159, + "learning_rate": 2.984497898965061e-07, + "loss": 0.6115, + "step": 23463 + }, + { + "epoch": 1.695161377715968, + "grad_norm": 7.373623477039788, + "learning_rate": 2.9831121059639116e-07, + "loss": 0.5662, + "step": 23464 + }, + { + "epoch": 1.6952336229161775, + "grad_norm": 7.037721902116707, + "learning_rate": 2.981726614356856e-07, + "loss": 0.6169, + "step": 23465 + }, + { + "epoch": 1.695305868116387, + "grad_norm": 7.597587212491609, + "learning_rate": 2.9803414241628664e-07, + "loss": 0.5812, + "step": 23466 + }, + { + "epoch": 1.6953781133165964, + "grad_norm": 6.8096164585280325, + "learning_rate": 2.9789565354009065e-07, + "loss": 0.6696, + "step": 23467 + }, + { + "epoch": 1.6954503585168061, + "grad_norm": 15.739820308992572, + "learning_rate": 2.9775719480899384e-07, + "loss": 0.5895, + "step": 23468 + }, + { + "epoch": 1.6955226037170155, + "grad_norm": 8.23464090978386, + "learning_rate": 2.9761876622489054e-07, + "loss": 0.6363, + "step": 23469 + }, + { + "epoch": 1.6955948489172252, + "grad_norm": 7.205207573298385, + "learning_rate": 2.974803677896765e-07, + "loss": 0.5924, + "step": 23470 + }, + { + "epoch": 1.6956670941174345, + "grad_norm": 7.546076825189375, + "learning_rate": 2.9734199950524593e-07, + "loss": 0.641, + "step": 23471 + }, + { + "epoch": 1.695739339317644, + "grad_norm": 7.208620569169521, + "learning_rate": 2.97203661373493e-07, + "loss": 0.7198, + "step": 23472 + }, + { + "epoch": 1.6958115845178536, + "grad_norm": 8.912879163212489, + "learning_rate": 2.9706535339631166e-07, + "loss": 0.5949, + "step": 23473 + }, + { + "epoch": 1.695883829718063, + "grad_norm": 7.7909311637594065, + "learning_rate": 2.969270755755954e-07, + "loss": 0.6318, + "step": 23474 + }, + { + "epoch": 1.6959560749182727, + "grad_norm": 8.025044816031647, + "learning_rate": 2.9678882791323723e-07, + "loss": 0.5939, + "step": 23475 + }, + { + "epoch": 1.696028320118482, + "grad_norm": 8.343998708840084, + "learning_rate": 2.966506104111286e-07, + "loss": 0.6448, + "step": 23476 + }, + { + "epoch": 1.6961005653186918, + "grad_norm": 10.252470137527757, + "learning_rate": 2.9651242307116273e-07, + "loss": 0.6797, + "step": 23477 + }, + { + "epoch": 1.6961728105189011, + "grad_norm": 7.393648267160558, + "learning_rate": 2.963742658952312e-07, + "loss": 0.6724, + "step": 23478 + }, + { + "epoch": 1.6962450557191107, + "grad_norm": 7.657523660402176, + "learning_rate": 2.962361388852245e-07, + "loss": 0.6173, + "step": 23479 + }, + { + "epoch": 1.6963173009193202, + "grad_norm": 7.212562981236336, + "learning_rate": 2.960980420430337e-07, + "loss": 0.6272, + "step": 23480 + }, + { + "epoch": 1.6963895461195295, + "grad_norm": 9.728233494139824, + "learning_rate": 2.9595997537054915e-07, + "loss": 0.6232, + "step": 23481 + }, + { + "epoch": 1.6964617913197393, + "grad_norm": 6.4274827368321565, + "learning_rate": 2.9582193886966233e-07, + "loss": 0.5729, + "step": 23482 + }, + { + "epoch": 1.6965340365199486, + "grad_norm": 6.68336858319938, + "learning_rate": 2.9568393254226116e-07, + "loss": 0.5813, + "step": 23483 + }, + { + "epoch": 1.6966062817201584, + "grad_norm": 6.918386988433685, + "learning_rate": 2.9554595639023545e-07, + "loss": 0.6464, + "step": 23484 + }, + { + "epoch": 1.6966785269203677, + "grad_norm": 7.304495840567183, + "learning_rate": 2.9540801041547385e-07, + "loss": 0.5888, + "step": 23485 + }, + { + "epoch": 1.6967507721205772, + "grad_norm": 6.813207467117835, + "learning_rate": 2.9527009461986527e-07, + "loss": 0.5973, + "step": 23486 + }, + { + "epoch": 1.6968230173207868, + "grad_norm": 8.006130730412847, + "learning_rate": 2.9513220900529686e-07, + "loss": 0.6164, + "step": 23487 + }, + { + "epoch": 1.696895262520996, + "grad_norm": 6.717769391557329, + "learning_rate": 2.949943535736566e-07, + "loss": 0.5741, + "step": 23488 + }, + { + "epoch": 1.6969675077212059, + "grad_norm": 6.909125786560951, + "learning_rate": 2.9485652832683146e-07, + "loss": 0.5918, + "step": 23489 + }, + { + "epoch": 1.6970397529214152, + "grad_norm": 7.065031155074121, + "learning_rate": 2.9471873326670843e-07, + "loss": 0.6195, + "step": 23490 + }, + { + "epoch": 1.697111998121625, + "grad_norm": 6.372380335489771, + "learning_rate": 2.945809683951734e-07, + "loss": 0.6108, + "step": 23491 + }, + { + "epoch": 1.6971842433218343, + "grad_norm": 6.601576368328629, + "learning_rate": 2.944432337141126e-07, + "loss": 0.5926, + "step": 23492 + }, + { + "epoch": 1.6972564885220438, + "grad_norm": 8.619825497164227, + "learning_rate": 2.943055292254121e-07, + "loss": 0.6787, + "step": 23493 + }, + { + "epoch": 1.6973287337222533, + "grad_norm": 7.441635136325703, + "learning_rate": 2.941678549309557e-07, + "loss": 0.5994, + "step": 23494 + }, + { + "epoch": 1.6974009789224629, + "grad_norm": 8.381199150869937, + "learning_rate": 2.940302108326284e-07, + "loss": 0.5494, + "step": 23495 + }, + { + "epoch": 1.6974732241226724, + "grad_norm": 6.731350035707745, + "learning_rate": 2.938925969323156e-07, + "loss": 0.6233, + "step": 23496 + }, + { + "epoch": 1.6975454693228818, + "grad_norm": 6.975267313548563, + "learning_rate": 2.937550132318995e-07, + "loss": 0.6515, + "step": 23497 + }, + { + "epoch": 1.6976177145230915, + "grad_norm": 7.219171420920733, + "learning_rate": 2.9361745973326414e-07, + "loss": 0.5905, + "step": 23498 + }, + { + "epoch": 1.6976899597233008, + "grad_norm": 7.296572823976057, + "learning_rate": 2.934799364382926e-07, + "loss": 0.5609, + "step": 23499 + }, + { + "epoch": 1.6977622049235104, + "grad_norm": 6.9044698086546115, + "learning_rate": 2.933424433488677e-07, + "loss": 0.6848, + "step": 23500 + }, + { + "epoch": 1.69783445012372, + "grad_norm": 6.31991323145926, + "learning_rate": 2.932049804668713e-07, + "loss": 0.6285, + "step": 23501 + }, + { + "epoch": 1.6979066953239295, + "grad_norm": 6.030780991418531, + "learning_rate": 2.9306754779418525e-07, + "loss": 0.6784, + "step": 23502 + }, + { + "epoch": 1.697978940524139, + "grad_norm": 6.983571802163045, + "learning_rate": 2.929301453326913e-07, + "loss": 0.615, + "step": 23503 + }, + { + "epoch": 1.6980511857243483, + "grad_norm": 7.170734256619244, + "learning_rate": 2.9279277308426947e-07, + "loss": 0.6149, + "step": 23504 + }, + { + "epoch": 1.698123430924558, + "grad_norm": 7.862993078735985, + "learning_rate": 2.9265543105080093e-07, + "loss": 0.684, + "step": 23505 + }, + { + "epoch": 1.6981956761247674, + "grad_norm": 8.304653049228872, + "learning_rate": 2.925181192341656e-07, + "loss": 0.6009, + "step": 23506 + }, + { + "epoch": 1.698267921324977, + "grad_norm": 7.257161253915475, + "learning_rate": 2.9238083763624387e-07, + "loss": 0.5832, + "step": 23507 + }, + { + "epoch": 1.6983401665251865, + "grad_norm": 7.447473330518818, + "learning_rate": 2.922435862589135e-07, + "loss": 0.6238, + "step": 23508 + }, + { + "epoch": 1.698412411725396, + "grad_norm": 7.137854240966483, + "learning_rate": 2.92106365104054e-07, + "loss": 0.6852, + "step": 23509 + }, + { + "epoch": 1.6984846569256056, + "grad_norm": 8.39576765339216, + "learning_rate": 2.9196917417354446e-07, + "loss": 0.6112, + "step": 23510 + }, + { + "epoch": 1.698556902125815, + "grad_norm": 7.910705517195363, + "learning_rate": 2.9183201346926317e-07, + "loss": 0.714, + "step": 23511 + }, + { + "epoch": 1.6986291473260247, + "grad_norm": 8.08782813041842, + "learning_rate": 2.9169488299308627e-07, + "loss": 0.6651, + "step": 23512 + }, + { + "epoch": 1.698701392526234, + "grad_norm": 8.158553576459552, + "learning_rate": 2.9155778274689184e-07, + "loss": 0.6026, + "step": 23513 + }, + { + "epoch": 1.6987736377264435, + "grad_norm": 6.937688601567832, + "learning_rate": 2.9142071273255744e-07, + "loss": 0.6077, + "step": 23514 + }, + { + "epoch": 1.698845882926653, + "grad_norm": 7.044495174387082, + "learning_rate": 2.9128367295195775e-07, + "loss": 0.5718, + "step": 23515 + }, + { + "epoch": 1.6989181281268626, + "grad_norm": 7.435764655252396, + "learning_rate": 2.911466634069698e-07, + "loss": 0.6339, + "step": 23516 + }, + { + "epoch": 1.6989903733270721, + "grad_norm": 6.609064478151744, + "learning_rate": 2.910096840994689e-07, + "loss": 0.5804, + "step": 23517 + }, + { + "epoch": 1.6990626185272815, + "grad_norm": 7.111698517115292, + "learning_rate": 2.9087273503133035e-07, + "loss": 0.6195, + "step": 23518 + }, + { + "epoch": 1.6991348637274912, + "grad_norm": 7.925082127479495, + "learning_rate": 2.907358162044288e-07, + "loss": 0.5663, + "step": 23519 + }, + { + "epoch": 1.6992071089277005, + "grad_norm": 6.523621406932374, + "learning_rate": 2.9059892762063835e-07, + "loss": 0.6357, + "step": 23520 + }, + { + "epoch": 1.69927935412791, + "grad_norm": 6.93692619081439, + "learning_rate": 2.9046206928183386e-07, + "loss": 0.6371, + "step": 23521 + }, + { + "epoch": 1.6993515993281196, + "grad_norm": 7.76234191635277, + "learning_rate": 2.903252411898874e-07, + "loss": 0.6235, + "step": 23522 + }, + { + "epoch": 1.6994238445283292, + "grad_norm": 7.0412678201193595, + "learning_rate": 2.901884433466726e-07, + "loss": 0.6404, + "step": 23523 + }, + { + "epoch": 1.6994960897285387, + "grad_norm": 8.417865881767062, + "learning_rate": 2.900516757540625e-07, + "loss": 0.6019, + "step": 23524 + }, + { + "epoch": 1.699568334928748, + "grad_norm": 6.312893694696364, + "learning_rate": 2.899149384139291e-07, + "loss": 0.6061, + "step": 23525 + }, + { + "epoch": 1.6996405801289578, + "grad_norm": 7.045271934312966, + "learning_rate": 2.8977823132814414e-07, + "loss": 0.5635, + "step": 23526 + }, + { + "epoch": 1.6997128253291671, + "grad_norm": 6.837290044765543, + "learning_rate": 2.8964155449857864e-07, + "loss": 0.6409, + "step": 23527 + }, + { + "epoch": 1.6997850705293767, + "grad_norm": 8.301356597707672, + "learning_rate": 2.8950490792710425e-07, + "loss": 0.5947, + "step": 23528 + }, + { + "epoch": 1.6998573157295862, + "grad_norm": 8.172268042949907, + "learning_rate": 2.8936829161559125e-07, + "loss": 0.6396, + "step": 23529 + }, + { + "epoch": 1.6999295609297957, + "grad_norm": 7.296570994152327, + "learning_rate": 2.8923170556590996e-07, + "loss": 0.5889, + "step": 23530 + }, + { + "epoch": 1.7000018061300053, + "grad_norm": 6.669490057398123, + "learning_rate": 2.890951497799302e-07, + "loss": 0.5299, + "step": 23531 + }, + { + "epoch": 1.7000740513302146, + "grad_norm": 7.53720401922956, + "learning_rate": 2.889586242595216e-07, + "loss": 0.589, + "step": 23532 + }, + { + "epoch": 1.7001462965304244, + "grad_norm": 7.368113519231051, + "learning_rate": 2.8882212900655207e-07, + "loss": 0.6598, + "step": 23533 + }, + { + "epoch": 1.7002185417306337, + "grad_norm": 6.721800781294266, + "learning_rate": 2.8868566402289077e-07, + "loss": 0.5573, + "step": 23534 + }, + { + "epoch": 1.7002907869308432, + "grad_norm": 7.077464496034364, + "learning_rate": 2.885492293104056e-07, + "loss": 0.6411, + "step": 23535 + }, + { + "epoch": 1.7003630321310528, + "grad_norm": 7.344037249202633, + "learning_rate": 2.884128248709653e-07, + "loss": 0.6214, + "step": 23536 + }, + { + "epoch": 1.7004352773312623, + "grad_norm": 7.83625976873444, + "learning_rate": 2.8827645070643477e-07, + "loss": 0.5274, + "step": 23537 + }, + { + "epoch": 1.7005075225314719, + "grad_norm": 7.929136901682001, + "learning_rate": 2.881401068186832e-07, + "loss": 0.6847, + "step": 23538 + }, + { + "epoch": 1.7005797677316812, + "grad_norm": 7.667236417525324, + "learning_rate": 2.8800379320957637e-07, + "loss": 0.6114, + "step": 23539 + }, + { + "epoch": 1.700652012931891, + "grad_norm": 7.9400318343055245, + "learning_rate": 2.878675098809799e-07, + "loss": 0.6194, + "step": 23540 + }, + { + "epoch": 1.7007242581321003, + "grad_norm": 8.048838314798816, + "learning_rate": 2.8773125683475955e-07, + "loss": 0.6162, + "step": 23541 + }, + { + "epoch": 1.7007965033323098, + "grad_norm": 7.930782569476145, + "learning_rate": 2.875950340727804e-07, + "loss": 0.5077, + "step": 23542 + }, + { + "epoch": 1.7008687485325193, + "grad_norm": 6.829980942795893, + "learning_rate": 2.874588415969082e-07, + "loss": 0.6227, + "step": 23543 + }, + { + "epoch": 1.700940993732729, + "grad_norm": 8.008711363424, + "learning_rate": 2.873226794090056e-07, + "loss": 0.5911, + "step": 23544 + }, + { + "epoch": 1.7010132389329384, + "grad_norm": 7.426600828900539, + "learning_rate": 2.8718654751093783e-07, + "loss": 0.5649, + "step": 23545 + }, + { + "epoch": 1.7010854841331478, + "grad_norm": 6.445303622152976, + "learning_rate": 2.870504459045681e-07, + "loss": 0.5894, + "step": 23546 + }, + { + "epoch": 1.7011577293333575, + "grad_norm": 7.119699748705612, + "learning_rate": 2.8691437459175935e-07, + "loss": 0.6458, + "step": 23547 + }, + { + "epoch": 1.7012299745335668, + "grad_norm": 7.669985536564167, + "learning_rate": 2.8677833357437454e-07, + "loss": 0.5866, + "step": 23548 + }, + { + "epoch": 1.7013022197337766, + "grad_norm": 5.983668671599755, + "learning_rate": 2.8664232285427586e-07, + "loss": 0.5981, + "step": 23549 + }, + { + "epoch": 1.701374464933986, + "grad_norm": 7.087052599259986, + "learning_rate": 2.865063424333256e-07, + "loss": 0.6107, + "step": 23550 + }, + { + "epoch": 1.7014467101341955, + "grad_norm": 7.159226777079652, + "learning_rate": 2.8637039231338454e-07, + "loss": 0.6572, + "step": 23551 + }, + { + "epoch": 1.701518955334405, + "grad_norm": 9.123664235627738, + "learning_rate": 2.86234472496314e-07, + "loss": 0.6681, + "step": 23552 + }, + { + "epoch": 1.7015912005346143, + "grad_norm": 7.9780711989899125, + "learning_rate": 2.860985829839752e-07, + "loss": 0.6721, + "step": 23553 + }, + { + "epoch": 1.701663445734824, + "grad_norm": 6.850014802123178, + "learning_rate": 2.859627237782273e-07, + "loss": 0.5868, + "step": 23554 + }, + { + "epoch": 1.7017356909350334, + "grad_norm": 7.673008603876074, + "learning_rate": 2.8582689488093033e-07, + "loss": 0.589, + "step": 23555 + }, + { + "epoch": 1.7018079361352432, + "grad_norm": 6.905398493268153, + "learning_rate": 2.856910962939444e-07, + "loss": 0.5728, + "step": 23556 + }, + { + "epoch": 1.7018801813354525, + "grad_norm": 7.410585060240425, + "learning_rate": 2.855553280191278e-07, + "loss": 0.5707, + "step": 23557 + }, + { + "epoch": 1.701952426535662, + "grad_norm": 7.322682651418051, + "learning_rate": 2.854195900583395e-07, + "loss": 0.612, + "step": 23558 + }, + { + "epoch": 1.7020246717358716, + "grad_norm": 7.644475111172892, + "learning_rate": 2.852838824134374e-07, + "loss": 0.6058, + "step": 23559 + }, + { + "epoch": 1.702096916936081, + "grad_norm": 7.4082584796409385, + "learning_rate": 2.851482050862797e-07, + "loss": 0.6455, + "step": 23560 + }, + { + "epoch": 1.7021691621362907, + "grad_norm": 6.357567382647418, + "learning_rate": 2.8501255807872303e-07, + "loss": 0.5903, + "step": 23561 + }, + { + "epoch": 1.7022414073365, + "grad_norm": 7.048034978496167, + "learning_rate": 2.848769413926247e-07, + "loss": 0.6173, + "step": 23562 + }, + { + "epoch": 1.7023136525367097, + "grad_norm": 8.561881606889377, + "learning_rate": 2.847413550298411e-07, + "loss": 0.6083, + "step": 23563 + }, + { + "epoch": 1.702385897736919, + "grad_norm": 7.3148844165235865, + "learning_rate": 2.846057989922285e-07, + "loss": 0.6368, + "step": 23564 + }, + { + "epoch": 1.7024581429371286, + "grad_norm": 8.480139474261593, + "learning_rate": 2.8447027328164236e-07, + "loss": 0.647, + "step": 23565 + }, + { + "epoch": 1.7025303881373381, + "grad_norm": 7.050130629174734, + "learning_rate": 2.8433477789993695e-07, + "loss": 0.6014, + "step": 23566 + }, + { + "epoch": 1.7026026333375477, + "grad_norm": 8.826183328369975, + "learning_rate": 2.8419931284896896e-07, + "loss": 0.6156, + "step": 23567 + }, + { + "epoch": 1.7026748785377572, + "grad_norm": 6.61074908607216, + "learning_rate": 2.840638781305921e-07, + "loss": 0.6307, + "step": 23568 + }, + { + "epoch": 1.7027471237379666, + "grad_norm": 6.373694791133895, + "learning_rate": 2.8392847374665996e-07, + "loss": 0.559, + "step": 23569 + }, + { + "epoch": 1.7028193689381763, + "grad_norm": 7.231679723264371, + "learning_rate": 2.837930996990262e-07, + "loss": 0.5281, + "step": 23570 + }, + { + "epoch": 1.7028916141383856, + "grad_norm": 6.7225524078630166, + "learning_rate": 2.8365775598954454e-07, + "loss": 0.6405, + "step": 23571 + }, + { + "epoch": 1.7029638593385952, + "grad_norm": 8.349257348207777, + "learning_rate": 2.8352244262006693e-07, + "loss": 0.6607, + "step": 23572 + }, + { + "epoch": 1.7030361045388047, + "grad_norm": 6.667483883678409, + "learning_rate": 2.8338715959244615e-07, + "loss": 0.6064, + "step": 23573 + }, + { + "epoch": 1.7031083497390143, + "grad_norm": 6.155053656659495, + "learning_rate": 2.8325190690853403e-07, + "loss": 0.5382, + "step": 23574 + }, + { + "epoch": 1.7031805949392238, + "grad_norm": 6.724264747105081, + "learning_rate": 2.8311668457018213e-07, + "loss": 0.5987, + "step": 23575 + }, + { + "epoch": 1.7032528401394331, + "grad_norm": 7.185264438761571, + "learning_rate": 2.829814925792415e-07, + "loss": 0.6669, + "step": 23576 + }, + { + "epoch": 1.7033250853396429, + "grad_norm": 6.151548807769878, + "learning_rate": 2.828463309375626e-07, + "loss": 0.564, + "step": 23577 + }, + { + "epoch": 1.7033973305398522, + "grad_norm": 7.347579152556042, + "learning_rate": 2.8271119964699667e-07, + "loss": 0.5805, + "step": 23578 + }, + { + "epoch": 1.7034695757400617, + "grad_norm": 7.151257235025454, + "learning_rate": 2.825760987093923e-07, + "loss": 0.5512, + "step": 23579 + }, + { + "epoch": 1.7035418209402713, + "grad_norm": 7.3706321066355756, + "learning_rate": 2.8244102812659935e-07, + "loss": 0.6205, + "step": 23580 + }, + { + "epoch": 1.7036140661404808, + "grad_norm": 6.8242875059143655, + "learning_rate": 2.823059879004669e-07, + "loss": 0.6019, + "step": 23581 + }, + { + "epoch": 1.7036863113406904, + "grad_norm": 6.566488452694121, + "learning_rate": 2.821709780328444e-07, + "loss": 0.5533, + "step": 23582 + }, + { + "epoch": 1.7037585565408997, + "grad_norm": 7.232604104593936, + "learning_rate": 2.8203599852557827e-07, + "loss": 0.7156, + "step": 23583 + }, + { + "epoch": 1.7038308017411095, + "grad_norm": 7.077567981755947, + "learning_rate": 2.8190104938051716e-07, + "loss": 0.6513, + "step": 23584 + }, + { + "epoch": 1.7039030469413188, + "grad_norm": 6.0224326714362215, + "learning_rate": 2.817661305995087e-07, + "loss": 0.5713, + "step": 23585 + }, + { + "epoch": 1.7039752921415283, + "grad_norm": 7.023372866255716, + "learning_rate": 2.8163124218439946e-07, + "loss": 0.6126, + "step": 23586 + }, + { + "epoch": 1.7040475373417379, + "grad_norm": 6.3905617260773475, + "learning_rate": 2.814963841370358e-07, + "loss": 0.5499, + "step": 23587 + }, + { + "epoch": 1.7041197825419474, + "grad_norm": 7.747373812775078, + "learning_rate": 2.813615564592645e-07, + "loss": 0.6309, + "step": 23588 + }, + { + "epoch": 1.704192027742157, + "grad_norm": 7.070634866183423, + "learning_rate": 2.812267591529311e-07, + "loss": 0.5525, + "step": 23589 + }, + { + "epoch": 1.7042642729423663, + "grad_norm": 6.694995423045456, + "learning_rate": 2.8109199221988013e-07, + "loss": 0.6037, + "step": 23590 + }, + { + "epoch": 1.704336518142576, + "grad_norm": 7.369463119102935, + "learning_rate": 2.8095725566195693e-07, + "loss": 0.5465, + "step": 23591 + }, + { + "epoch": 1.7044087633427853, + "grad_norm": 7.791744156994417, + "learning_rate": 2.80822549481006e-07, + "loss": 0.5929, + "step": 23592 + }, + { + "epoch": 1.704481008542995, + "grad_norm": 7.895670329670178, + "learning_rate": 2.806878736788715e-07, + "loss": 0.6688, + "step": 23593 + }, + { + "epoch": 1.7045532537432044, + "grad_norm": 8.527485111059177, + "learning_rate": 2.805532282573961e-07, + "loss": 0.6799, + "step": 23594 + }, + { + "epoch": 1.704625498943414, + "grad_norm": 7.4735447005986195, + "learning_rate": 2.804186132184242e-07, + "loss": 0.593, + "step": 23595 + }, + { + "epoch": 1.7046977441436235, + "grad_norm": 7.333158086358922, + "learning_rate": 2.8028402856379865e-07, + "loss": 0.6782, + "step": 23596 + }, + { + "epoch": 1.7047699893438328, + "grad_norm": 8.348368191739254, + "learning_rate": 2.801494742953606e-07, + "loss": 0.5644, + "step": 23597 + }, + { + "epoch": 1.7048422345440426, + "grad_norm": 7.186869187246948, + "learning_rate": 2.800149504149527e-07, + "loss": 0.5822, + "step": 23598 + }, + { + "epoch": 1.704914479744252, + "grad_norm": 9.045313039816879, + "learning_rate": 2.7988045692441624e-07, + "loss": 0.5438, + "step": 23599 + }, + { + "epoch": 1.7049867249444615, + "grad_norm": 7.660268546394497, + "learning_rate": 2.7974599382559335e-07, + "loss": 0.6459, + "step": 23600 + }, + { + "epoch": 1.705058970144671, + "grad_norm": 7.248200818140848, + "learning_rate": 2.7961156112032315e-07, + "loss": 0.5689, + "step": 23601 + }, + { + "epoch": 1.7051312153448805, + "grad_norm": 6.634972030937158, + "learning_rate": 2.7947715881044666e-07, + "loss": 0.6144, + "step": 23602 + }, + { + "epoch": 1.70520346054509, + "grad_norm": 6.708187156964341, + "learning_rate": 2.793427868978038e-07, + "loss": 0.5752, + "step": 23603 + }, + { + "epoch": 1.7052757057452994, + "grad_norm": 6.8749151051221515, + "learning_rate": 2.792084453842342e-07, + "loss": 0.6754, + "step": 23604 + }, + { + "epoch": 1.7053479509455092, + "grad_norm": 8.035753939091771, + "learning_rate": 2.790741342715764e-07, + "loss": 0.6876, + "step": 23605 + }, + { + "epoch": 1.7054201961457185, + "grad_norm": 9.71449112875322, + "learning_rate": 2.789398535616691e-07, + "loss": 0.6164, + "step": 23606 + }, + { + "epoch": 1.705492441345928, + "grad_norm": 8.372134956974413, + "learning_rate": 2.788056032563516e-07, + "loss": 0.5638, + "step": 23607 + }, + { + "epoch": 1.7055646865461376, + "grad_norm": 7.833801039131842, + "learning_rate": 2.7867138335745974e-07, + "loss": 0.5661, + "step": 23608 + }, + { + "epoch": 1.7056369317463471, + "grad_norm": 7.7348647453846615, + "learning_rate": 2.7853719386683217e-07, + "loss": 0.603, + "step": 23609 + }, + { + "epoch": 1.7057091769465567, + "grad_norm": 8.170855427552782, + "learning_rate": 2.784030347863062e-07, + "loss": 0.5758, + "step": 23610 + }, + { + "epoch": 1.705781422146766, + "grad_norm": 6.534329942997605, + "learning_rate": 2.782689061177171e-07, + "loss": 0.5475, + "step": 23611 + }, + { + "epoch": 1.7058536673469757, + "grad_norm": 6.273953427279008, + "learning_rate": 2.7813480786290146e-07, + "loss": 0.5949, + "step": 23612 + }, + { + "epoch": 1.705925912547185, + "grad_norm": 7.112284775412364, + "learning_rate": 2.7800074002369534e-07, + "loss": 0.6553, + "step": 23613 + }, + { + "epoch": 1.7059981577473946, + "grad_norm": 7.557537878448224, + "learning_rate": 2.778667026019335e-07, + "loss": 0.602, + "step": 23614 + }, + { + "epoch": 1.7060704029476041, + "grad_norm": 7.507026432737293, + "learning_rate": 2.777326955994514e-07, + "loss": 0.5897, + "step": 23615 + }, + { + "epoch": 1.7061426481478137, + "grad_norm": 7.191893188633904, + "learning_rate": 2.7759871901808316e-07, + "loss": 0.6171, + "step": 23616 + }, + { + "epoch": 1.7062148933480232, + "grad_norm": 7.77221351073459, + "learning_rate": 2.77464772859663e-07, + "loss": 0.568, + "step": 23617 + }, + { + "epoch": 1.7062871385482326, + "grad_norm": 6.946003968496641, + "learning_rate": 2.773308571260247e-07, + "loss": 0.6194, + "step": 23618 + }, + { + "epoch": 1.7063593837484423, + "grad_norm": 7.691796071571361, + "learning_rate": 2.7719697181900097e-07, + "loss": 0.5326, + "step": 23619 + }, + { + "epoch": 1.7064316289486516, + "grad_norm": 8.420884790755641, + "learning_rate": 2.7706311694042447e-07, + "loss": 0.6449, + "step": 23620 + }, + { + "epoch": 1.7065038741488614, + "grad_norm": 6.819257831631503, + "learning_rate": 2.7692929249212853e-07, + "loss": 0.6195, + "step": 23621 + }, + { + "epoch": 1.7065761193490707, + "grad_norm": 7.139927268632866, + "learning_rate": 2.7679549847594354e-07, + "loss": 0.6254, + "step": 23622 + }, + { + "epoch": 1.7066483645492803, + "grad_norm": 7.403132430363119, + "learning_rate": 2.7666173489370227e-07, + "loss": 0.522, + "step": 23623 + }, + { + "epoch": 1.7067206097494898, + "grad_norm": 7.350500761866181, + "learning_rate": 2.7652800174723574e-07, + "loss": 0.6316, + "step": 23624 + }, + { + "epoch": 1.7067928549496991, + "grad_norm": 7.013152164589858, + "learning_rate": 2.76394299038375e-07, + "loss": 0.5689, + "step": 23625 + }, + { + "epoch": 1.7068651001499089, + "grad_norm": 7.003673679385999, + "learning_rate": 2.762606267689491e-07, + "loss": 0.5777, + "step": 23626 + }, + { + "epoch": 1.7069373453501182, + "grad_norm": 8.693992768222564, + "learning_rate": 2.761269849407885e-07, + "loss": 0.6415, + "step": 23627 + }, + { + "epoch": 1.707009590550328, + "grad_norm": 7.081068896194026, + "learning_rate": 2.759933735557235e-07, + "loss": 0.6397, + "step": 23628 + }, + { + "epoch": 1.7070818357505373, + "grad_norm": 7.335024580859061, + "learning_rate": 2.7585979261558175e-07, + "loss": 0.6432, + "step": 23629 + }, + { + "epoch": 1.7071540809507468, + "grad_norm": 5.687390672240777, + "learning_rate": 2.757262421221926e-07, + "loss": 0.5991, + "step": 23630 + }, + { + "epoch": 1.7072263261509564, + "grad_norm": 7.4382003326229675, + "learning_rate": 2.7559272207738405e-07, + "loss": 0.6422, + "step": 23631 + }, + { + "epoch": 1.7072985713511657, + "grad_norm": 9.872815626928334, + "learning_rate": 2.7545923248298414e-07, + "loss": 0.664, + "step": 23632 + }, + { + "epoch": 1.7073708165513755, + "grad_norm": 7.116273320902114, + "learning_rate": 2.7532577334082015e-07, + "loss": 0.5912, + "step": 23633 + }, + { + "epoch": 1.7074430617515848, + "grad_norm": 7.351977864610891, + "learning_rate": 2.751923446527188e-07, + "loss": 0.6446, + "step": 23634 + }, + { + "epoch": 1.7075153069517945, + "grad_norm": 7.267967847233363, + "learning_rate": 2.7505894642050725e-07, + "loss": 0.6273, + "step": 23635 + }, + { + "epoch": 1.7075875521520039, + "grad_norm": 6.34915517158167, + "learning_rate": 2.749255786460109e-07, + "loss": 0.6256, + "step": 23636 + }, + { + "epoch": 1.7076597973522134, + "grad_norm": 8.486223613064736, + "learning_rate": 2.7479224133105554e-07, + "loss": 0.6535, + "step": 23637 + }, + { + "epoch": 1.707732042552423, + "grad_norm": 6.640184670603431, + "learning_rate": 2.7465893447746665e-07, + "loss": 0.5508, + "step": 23638 + }, + { + "epoch": 1.7078042877526325, + "grad_norm": 8.19539907753363, + "learning_rate": 2.745256580870698e-07, + "loss": 0.5873, + "step": 23639 + }, + { + "epoch": 1.707876532952842, + "grad_norm": 7.0169877595744445, + "learning_rate": 2.743924121616878e-07, + "loss": 0.6009, + "step": 23640 + }, + { + "epoch": 1.7079487781530514, + "grad_norm": 8.838171395747283, + "learning_rate": 2.742591967031458e-07, + "loss": 0.6634, + "step": 23641 + }, + { + "epoch": 1.7080210233532611, + "grad_norm": 5.883759481901783, + "learning_rate": 2.741260117132674e-07, + "loss": 0.585, + "step": 23642 + }, + { + "epoch": 1.7080932685534704, + "grad_norm": 7.476566581764821, + "learning_rate": 2.739928571938755e-07, + "loss": 0.6029, + "step": 23643 + }, + { + "epoch": 1.70816551375368, + "grad_norm": 7.66679285470802, + "learning_rate": 2.738597331467929e-07, + "loss": 0.6621, + "step": 23644 + }, + { + "epoch": 1.7082377589538895, + "grad_norm": 7.411063002557788, + "learning_rate": 2.737266395738422e-07, + "loss": 0.6521, + "step": 23645 + }, + { + "epoch": 1.708310004154099, + "grad_norm": 6.870841415829259, + "learning_rate": 2.7359357647684546e-07, + "loss": 0.5844, + "step": 23646 + }, + { + "epoch": 1.7083822493543086, + "grad_norm": 6.949486078156304, + "learning_rate": 2.734605438576238e-07, + "loss": 0.5783, + "step": 23647 + }, + { + "epoch": 1.708454494554518, + "grad_norm": 6.555049190613097, + "learning_rate": 2.7332754171799844e-07, + "loss": 0.5829, + "step": 23648 + }, + { + "epoch": 1.7085267397547277, + "grad_norm": 7.263969642995849, + "learning_rate": 2.7319457005978987e-07, + "loss": 0.5754, + "step": 23649 + }, + { + "epoch": 1.708598984954937, + "grad_norm": 8.693754071934704, + "learning_rate": 2.730616288848195e-07, + "loss": 0.6762, + "step": 23650 + }, + { + "epoch": 1.7086712301551465, + "grad_norm": 7.724207381290941, + "learning_rate": 2.7292871819490504e-07, + "loss": 0.5489, + "step": 23651 + }, + { + "epoch": 1.708743475355356, + "grad_norm": 6.6095749845077085, + "learning_rate": 2.727958379918677e-07, + "loss": 0.6151, + "step": 23652 + }, + { + "epoch": 1.7088157205555656, + "grad_norm": 6.220329045856071, + "learning_rate": 2.726629882775267e-07, + "loss": 0.5553, + "step": 23653 + }, + { + "epoch": 1.7088879657557752, + "grad_norm": 7.983753874446179, + "learning_rate": 2.725301690536994e-07, + "loss": 0.5851, + "step": 23654 + }, + { + "epoch": 1.7089602109559845, + "grad_norm": 7.342761423654389, + "learning_rate": 2.7239738032220467e-07, + "loss": 0.6202, + "step": 23655 + }, + { + "epoch": 1.7090324561561943, + "grad_norm": 7.266469992446336, + "learning_rate": 2.7226462208486017e-07, + "loss": 0.582, + "step": 23656 + }, + { + "epoch": 1.7091047013564036, + "grad_norm": 8.609840671248065, + "learning_rate": 2.7213189434348364e-07, + "loss": 0.6112, + "step": 23657 + }, + { + "epoch": 1.7091769465566131, + "grad_norm": 7.299779015945663, + "learning_rate": 2.719991970998911e-07, + "loss": 0.5099, + "step": 23658 + }, + { + "epoch": 1.7092491917568227, + "grad_norm": 6.629288634835195, + "learning_rate": 2.718665303558998e-07, + "loss": 0.6419, + "step": 23659 + }, + { + "epoch": 1.7093214369570322, + "grad_norm": 6.93035878539395, + "learning_rate": 2.7173389411332576e-07, + "loss": 0.5471, + "step": 23660 + }, + { + "epoch": 1.7093936821572417, + "grad_norm": 7.043118780466628, + "learning_rate": 2.7160128837398435e-07, + "loss": 0.5778, + "step": 23661 + }, + { + "epoch": 1.709465927357451, + "grad_norm": 7.569431879653379, + "learning_rate": 2.714687131396912e-07, + "loss": 0.6243, + "step": 23662 + }, + { + "epoch": 1.7095381725576608, + "grad_norm": 8.063949572935321, + "learning_rate": 2.713361684122609e-07, + "loss": 0.5881, + "step": 23663 + }, + { + "epoch": 1.7096104177578701, + "grad_norm": 6.490392774267041, + "learning_rate": 2.712036541935084e-07, + "loss": 0.5627, + "step": 23664 + }, + { + "epoch": 1.7096826629580797, + "grad_norm": 6.927231621170871, + "learning_rate": 2.71071170485247e-07, + "loss": 0.5985, + "step": 23665 + }, + { + "epoch": 1.7097549081582892, + "grad_norm": 8.005371674505355, + "learning_rate": 2.709387172892908e-07, + "loss": 0.6886, + "step": 23666 + }, + { + "epoch": 1.7098271533584988, + "grad_norm": 6.692914820552105, + "learning_rate": 2.7080629460745246e-07, + "loss": 0.66, + "step": 23667 + }, + { + "epoch": 1.7098993985587083, + "grad_norm": 6.647151549217282, + "learning_rate": 2.7067390244154583e-07, + "loss": 0.6075, + "step": 23668 + }, + { + "epoch": 1.7099716437589176, + "grad_norm": 8.877508104557515, + "learning_rate": 2.7054154079338174e-07, + "loss": 0.6432, + "step": 23669 + }, + { + "epoch": 1.7100438889591274, + "grad_norm": 7.830846378187948, + "learning_rate": 2.7040920966477315e-07, + "loss": 0.5759, + "step": 23670 + }, + { + "epoch": 1.7101161341593367, + "grad_norm": 7.454720956043271, + "learning_rate": 2.7027690905753134e-07, + "loss": 0.606, + "step": 23671 + }, + { + "epoch": 1.7101883793595463, + "grad_norm": 7.870950535685323, + "learning_rate": 2.7014463897346717e-07, + "loss": 0.608, + "step": 23672 + }, + { + "epoch": 1.7102606245597558, + "grad_norm": 7.036516582286506, + "learning_rate": 2.700123994143916e-07, + "loss": 0.6389, + "step": 23673 + }, + { + "epoch": 1.7103328697599653, + "grad_norm": 6.9954437686617, + "learning_rate": 2.698801903821149e-07, + "loss": 0.5661, + "step": 23674 + }, + { + "epoch": 1.7104051149601749, + "grad_norm": 6.205490007379711, + "learning_rate": 2.6974801187844736e-07, + "loss": 0.5599, + "step": 23675 + }, + { + "epoch": 1.7104773601603842, + "grad_norm": 7.353759956100237, + "learning_rate": 2.6961586390519714e-07, + "loss": 0.5685, + "step": 23676 + }, + { + "epoch": 1.710549605360594, + "grad_norm": 6.352631163460901, + "learning_rate": 2.694837464641742e-07, + "loss": 0.6343, + "step": 23677 + }, + { + "epoch": 1.7106218505608033, + "grad_norm": 6.955991552715207, + "learning_rate": 2.693516595571871e-07, + "loss": 0.6584, + "step": 23678 + }, + { + "epoch": 1.7106940957610128, + "grad_norm": 6.859264329438929, + "learning_rate": 2.6921960318604307e-07, + "loss": 0.5758, + "step": 23679 + }, + { + "epoch": 1.7107663409612224, + "grad_norm": 7.361571937707528, + "learning_rate": 2.6908757735255087e-07, + "loss": 0.5484, + "step": 23680 + }, + { + "epoch": 1.710838586161432, + "grad_norm": 6.862545870712758, + "learning_rate": 2.689555820585177e-07, + "loss": 0.5745, + "step": 23681 + }, + { + "epoch": 1.7109108313616415, + "grad_norm": 8.010448308583745, + "learning_rate": 2.6882361730575066e-07, + "loss": 0.5865, + "step": 23682 + }, + { + "epoch": 1.7109830765618508, + "grad_norm": 6.393763734386978, + "learning_rate": 2.686916830960556e-07, + "loss": 0.5548, + "step": 23683 + }, + { + "epoch": 1.7110553217620605, + "grad_norm": 8.782715155932925, + "learning_rate": 2.685597794312389e-07, + "loss": 0.5482, + "step": 23684 + }, + { + "epoch": 1.7111275669622699, + "grad_norm": 7.871897010424146, + "learning_rate": 2.684279063131065e-07, + "loss": 0.5987, + "step": 23685 + }, + { + "epoch": 1.7111998121624794, + "grad_norm": 7.8119499318070735, + "learning_rate": 2.682960637434631e-07, + "loss": 0.5938, + "step": 23686 + }, + { + "epoch": 1.711272057362689, + "grad_norm": 6.853665071664269, + "learning_rate": 2.681642517241137e-07, + "loss": 0.5865, + "step": 23687 + }, + { + "epoch": 1.7113443025628985, + "grad_norm": 7.089639018652143, + "learning_rate": 2.6803247025686265e-07, + "loss": 0.6783, + "step": 23688 + }, + { + "epoch": 1.711416547763108, + "grad_norm": 6.600488482807156, + "learning_rate": 2.679007193435143e-07, + "loss": 0.5571, + "step": 23689 + }, + { + "epoch": 1.7114887929633174, + "grad_norm": 7.438204691865231, + "learning_rate": 2.6776899898587175e-07, + "loss": 0.6164, + "step": 23690 + }, + { + "epoch": 1.7115610381635271, + "grad_norm": 7.326118243272101, + "learning_rate": 2.6763730918573846e-07, + "loss": 0.6427, + "step": 23691 + }, + { + "epoch": 1.7116332833637364, + "grad_norm": 6.9094987365602245, + "learning_rate": 2.6750564994491775e-07, + "loss": 0.636, + "step": 23692 + }, + { + "epoch": 1.7117055285639462, + "grad_norm": 6.9221607433437615, + "learning_rate": 2.673740212652107e-07, + "loss": 0.6794, + "step": 23693 + }, + { + "epoch": 1.7117777737641555, + "grad_norm": 7.424322170567682, + "learning_rate": 2.672424231484197e-07, + "loss": 0.6727, + "step": 23694 + }, + { + "epoch": 1.711850018964365, + "grad_norm": 7.692223067184936, + "learning_rate": 2.6711085559634613e-07, + "loss": 0.5888, + "step": 23695 + }, + { + "epoch": 1.7119222641645746, + "grad_norm": 6.53042144630478, + "learning_rate": 2.669793186107919e-07, + "loss": 0.6661, + "step": 23696 + }, + { + "epoch": 1.711994509364784, + "grad_norm": 6.567945564295343, + "learning_rate": 2.6684781219355665e-07, + "loss": 0.5707, + "step": 23697 + }, + { + "epoch": 1.7120667545649937, + "grad_norm": 6.899469006880478, + "learning_rate": 2.667163363464406e-07, + "loss": 0.5477, + "step": 23698 + }, + { + "epoch": 1.712138999765203, + "grad_norm": 7.054163781345932, + "learning_rate": 2.6658489107124373e-07, + "loss": 0.5737, + "step": 23699 + }, + { + "epoch": 1.7122112449654128, + "grad_norm": 7.272536929847527, + "learning_rate": 2.6645347636976627e-07, + "loss": 0.6047, + "step": 23700 + }, + { + "epoch": 1.712283490165622, + "grad_norm": 6.494831965081057, + "learning_rate": 2.663220922438062e-07, + "loss": 0.5599, + "step": 23701 + }, + { + "epoch": 1.7123557353658316, + "grad_norm": 7.830006995654727, + "learning_rate": 2.6619073869516206e-07, + "loss": 0.6623, + "step": 23702 + }, + { + "epoch": 1.7124279805660412, + "grad_norm": 5.752340752537247, + "learning_rate": 2.66059415725633e-07, + "loss": 0.6371, + "step": 23703 + }, + { + "epoch": 1.7125002257662505, + "grad_norm": 7.170763249547059, + "learning_rate": 2.659281233370153e-07, + "loss": 0.6395, + "step": 23704 + }, + { + "epoch": 1.7125724709664603, + "grad_norm": 6.838485295299917, + "learning_rate": 2.6579686153110713e-07, + "loss": 0.5245, + "step": 23705 + }, + { + "epoch": 1.7126447161666696, + "grad_norm": 6.686597246219806, + "learning_rate": 2.656656303097052e-07, + "loss": 0.5818, + "step": 23706 + }, + { + "epoch": 1.7127169613668793, + "grad_norm": 7.148073014102571, + "learning_rate": 2.6553442967460594e-07, + "loss": 0.651, + "step": 23707 + }, + { + "epoch": 1.7127892065670887, + "grad_norm": 6.778415351160739, + "learning_rate": 2.6540325962760514e-07, + "loss": 0.607, + "step": 23708 + }, + { + "epoch": 1.7128614517672982, + "grad_norm": 8.02197299803795, + "learning_rate": 2.6527212017049884e-07, + "loss": 0.5863, + "step": 23709 + }, + { + "epoch": 1.7129336969675077, + "grad_norm": 8.35711901874426, + "learning_rate": 2.651410113050826e-07, + "loss": 0.6503, + "step": 23710 + }, + { + "epoch": 1.713005942167717, + "grad_norm": 7.627161688889701, + "learning_rate": 2.6500993303315015e-07, + "loss": 0.6224, + "step": 23711 + }, + { + "epoch": 1.7130781873679268, + "grad_norm": 6.082026545939896, + "learning_rate": 2.6487888535649627e-07, + "loss": 0.6056, + "step": 23712 + }, + { + "epoch": 1.7131504325681362, + "grad_norm": 8.241770020528907, + "learning_rate": 2.64747868276915e-07, + "loss": 0.5937, + "step": 23713 + }, + { + "epoch": 1.713222677768346, + "grad_norm": 7.824981728431893, + "learning_rate": 2.6461688179620055e-07, + "loss": 0.6439, + "step": 23714 + }, + { + "epoch": 1.7132949229685552, + "grad_norm": 7.183122977284122, + "learning_rate": 2.6448592591614474e-07, + "loss": 0.6031, + "step": 23715 + }, + { + "epoch": 1.7133671681687648, + "grad_norm": 8.309656560344651, + "learning_rate": 2.6435500063854064e-07, + "loss": 0.6077, + "step": 23716 + }, + { + "epoch": 1.7134394133689743, + "grad_norm": 6.4708779193679185, + "learning_rate": 2.6422410596518095e-07, + "loss": 0.5597, + "step": 23717 + }, + { + "epoch": 1.7135116585691839, + "grad_norm": 7.894265722974597, + "learning_rate": 2.64093241897857e-07, + "loss": 0.6673, + "step": 23718 + }, + { + "epoch": 1.7135839037693934, + "grad_norm": 7.548614112021294, + "learning_rate": 2.639624084383607e-07, + "loss": 0.6506, + "step": 23719 + }, + { + "epoch": 1.7136561489696027, + "grad_norm": 7.480929734490414, + "learning_rate": 2.638316055884829e-07, + "loss": 0.5446, + "step": 23720 + }, + { + "epoch": 1.7137283941698125, + "grad_norm": 7.972976580945333, + "learning_rate": 2.637008333500146e-07, + "loss": 0.5676, + "step": 23721 + }, + { + "epoch": 1.7138006393700218, + "grad_norm": 7.51689356132278, + "learning_rate": 2.63570091724745e-07, + "loss": 0.5818, + "step": 23722 + }, + { + "epoch": 1.7138728845702313, + "grad_norm": 9.290709533204183, + "learning_rate": 2.6343938071446447e-07, + "loss": 0.6239, + "step": 23723 + }, + { + "epoch": 1.713945129770441, + "grad_norm": 7.36700704602365, + "learning_rate": 2.633087003209622e-07, + "loss": 0.5468, + "step": 23724 + }, + { + "epoch": 1.7140173749706504, + "grad_norm": 8.27447163257811, + "learning_rate": 2.6317805054602743e-07, + "loss": 0.5864, + "step": 23725 + }, + { + "epoch": 1.71408962017086, + "grad_norm": 7.492911485809751, + "learning_rate": 2.63047431391448e-07, + "loss": 0.6629, + "step": 23726 + }, + { + "epoch": 1.7141618653710693, + "grad_norm": 8.285363438887465, + "learning_rate": 2.62916842859012e-07, + "loss": 0.6426, + "step": 23727 + }, + { + "epoch": 1.714234110571279, + "grad_norm": 7.17753401080889, + "learning_rate": 2.6278628495050835e-07, + "loss": 0.5389, + "step": 23728 + }, + { + "epoch": 1.7143063557714884, + "grad_norm": 7.3710702027478945, + "learning_rate": 2.626557576677227e-07, + "loss": 0.5675, + "step": 23729 + }, + { + "epoch": 1.714378600971698, + "grad_norm": 6.444350958105884, + "learning_rate": 2.625252610124426e-07, + "loss": 0.5481, + "step": 23730 + }, + { + "epoch": 1.7144508461719075, + "grad_norm": 6.64756502014245, + "learning_rate": 2.623947949864547e-07, + "loss": 0.5722, + "step": 23731 + }, + { + "epoch": 1.714523091372117, + "grad_norm": 7.28255703045678, + "learning_rate": 2.62264359591545e-07, + "loss": 0.5571, + "step": 23732 + }, + { + "epoch": 1.7145953365723265, + "grad_norm": 7.818119560945476, + "learning_rate": 2.6213395482949796e-07, + "loss": 0.6594, + "step": 23733 + }, + { + "epoch": 1.7146675817725359, + "grad_norm": 8.196147050721502, + "learning_rate": 2.6200358070209985e-07, + "loss": 0.5632, + "step": 23734 + }, + { + "epoch": 1.7147398269727456, + "grad_norm": 9.539920252820554, + "learning_rate": 2.6187323721113486e-07, + "loss": 0.6742, + "step": 23735 + }, + { + "epoch": 1.714812072172955, + "grad_norm": 7.2314766336467216, + "learning_rate": 2.6174292435838745e-07, + "loss": 0.5792, + "step": 23736 + }, + { + "epoch": 1.7148843173731645, + "grad_norm": 6.861536332860737, + "learning_rate": 2.616126421456416e-07, + "loss": 0.6052, + "step": 23737 + }, + { + "epoch": 1.714956562573374, + "grad_norm": 6.358302970337105, + "learning_rate": 2.614823905746805e-07, + "loss": 0.6101, + "step": 23738 + }, + { + "epoch": 1.7150288077735836, + "grad_norm": 7.51098972061951, + "learning_rate": 2.6135216964728815e-07, + "loss": 0.6094, + "step": 23739 + }, + { + "epoch": 1.7151010529737931, + "grad_norm": 7.538128382868095, + "learning_rate": 2.612219793652457e-07, + "loss": 0.5231, + "step": 23740 + }, + { + "epoch": 1.7151732981740024, + "grad_norm": 8.162860632227643, + "learning_rate": 2.610918197303361e-07, + "loss": 0.5899, + "step": 23741 + }, + { + "epoch": 1.7152455433742122, + "grad_norm": 8.770081500693628, + "learning_rate": 2.609616907443416e-07, + "loss": 0.5904, + "step": 23742 + }, + { + "epoch": 1.7153177885744215, + "grad_norm": 7.310360326483733, + "learning_rate": 2.6083159240904237e-07, + "loss": 0.6471, + "step": 23743 + }, + { + "epoch": 1.715390033774631, + "grad_norm": 7.353209551233101, + "learning_rate": 2.607015247262204e-07, + "loss": 0.573, + "step": 23744 + }, + { + "epoch": 1.7154622789748406, + "grad_norm": 7.453463052884404, + "learning_rate": 2.605714876976556e-07, + "loss": 0.6015, + "step": 23745 + }, + { + "epoch": 1.7155345241750501, + "grad_norm": 8.165060495221391, + "learning_rate": 2.6044148132512824e-07, + "loss": 0.5949, + "step": 23746 + }, + { + "epoch": 1.7156067693752597, + "grad_norm": 6.66337612324738, + "learning_rate": 2.603115056104183e-07, + "loss": 0.6183, + "step": 23747 + }, + { + "epoch": 1.715679014575469, + "grad_norm": 8.708878154838333, + "learning_rate": 2.6018156055530483e-07, + "loss": 0.6893, + "step": 23748 + }, + { + "epoch": 1.7157512597756788, + "grad_norm": 6.702302895663894, + "learning_rate": 2.6005164616156655e-07, + "loss": 0.6045, + "step": 23749 + }, + { + "epoch": 1.715823504975888, + "grad_norm": 7.731809865035644, + "learning_rate": 2.599217624309827e-07, + "loss": 0.6054, + "step": 23750 + }, + { + "epoch": 1.7158957501760976, + "grad_norm": 8.027757649761282, + "learning_rate": 2.5979190936533004e-07, + "loss": 0.6963, + "step": 23751 + }, + { + "epoch": 1.7159679953763072, + "grad_norm": 8.46037841527715, + "learning_rate": 2.5966208696638686e-07, + "loss": 0.6431, + "step": 23752 + }, + { + "epoch": 1.7160402405765167, + "grad_norm": 7.298588226924518, + "learning_rate": 2.5953229523593055e-07, + "loss": 0.5812, + "step": 23753 + }, + { + "epoch": 1.7161124857767263, + "grad_norm": 7.024033568890762, + "learning_rate": 2.59402534175737e-07, + "loss": 0.5862, + "step": 23754 + }, + { + "epoch": 1.7161847309769356, + "grad_norm": 7.353455967553491, + "learning_rate": 2.59272803787583e-07, + "loss": 0.5289, + "step": 23755 + }, + { + "epoch": 1.7162569761771453, + "grad_norm": 6.728684572443521, + "learning_rate": 2.591431040732442e-07, + "loss": 0.5524, + "step": 23756 + }, + { + "epoch": 1.7163292213773547, + "grad_norm": 6.857300325129667, + "learning_rate": 2.590134350344972e-07, + "loss": 0.6682, + "step": 23757 + }, + { + "epoch": 1.7164014665775642, + "grad_norm": 6.392653611970284, + "learning_rate": 2.5888379667311586e-07, + "loss": 0.5921, + "step": 23758 + }, + { + "epoch": 1.7164737117777737, + "grad_norm": 6.667298223462479, + "learning_rate": 2.5875418899087514e-07, + "loss": 0.5587, + "step": 23759 + }, + { + "epoch": 1.7165459569779833, + "grad_norm": 9.036372543645962, + "learning_rate": 2.5862461198954997e-07, + "loss": 0.6537, + "step": 23760 + }, + { + "epoch": 1.7166182021781928, + "grad_norm": 7.374215553073653, + "learning_rate": 2.5849506567091286e-07, + "loss": 0.585, + "step": 23761 + }, + { + "epoch": 1.7166904473784022, + "grad_norm": 7.620480307913079, + "learning_rate": 2.5836555003673794e-07, + "loss": 0.6742, + "step": 23762 + }, + { + "epoch": 1.716762692578612, + "grad_norm": 6.682437584924831, + "learning_rate": 2.5823606508879826e-07, + "loss": 0.5677, + "step": 23763 + }, + { + "epoch": 1.7168349377788212, + "grad_norm": 7.415322456988184, + "learning_rate": 2.5810661082886627e-07, + "loss": 0.617, + "step": 23764 + }, + { + "epoch": 1.7169071829790308, + "grad_norm": 7.886330582247376, + "learning_rate": 2.579771872587139e-07, + "loss": 0.5852, + "step": 23765 + }, + { + "epoch": 1.7169794281792403, + "grad_norm": 7.822906155500872, + "learning_rate": 2.57847794380113e-07, + "loss": 0.6346, + "step": 23766 + }, + { + "epoch": 1.7170516733794499, + "grad_norm": 7.503913112877523, + "learning_rate": 2.577184321948356e-07, + "loss": 0.5855, + "step": 23767 + }, + { + "epoch": 1.7171239185796594, + "grad_norm": 6.9832462363020085, + "learning_rate": 2.575891007046513e-07, + "loss": 0.5242, + "step": 23768 + }, + { + "epoch": 1.7171961637798687, + "grad_norm": 8.910558254165611, + "learning_rate": 2.5745979991133094e-07, + "loss": 0.6498, + "step": 23769 + }, + { + "epoch": 1.7172684089800785, + "grad_norm": 7.11895093625889, + "learning_rate": 2.573305298166448e-07, + "loss": 0.6013, + "step": 23770 + }, + { + "epoch": 1.7173406541802878, + "grad_norm": 7.856040049852889, + "learning_rate": 2.572012904223628e-07, + "loss": 0.5664, + "step": 23771 + }, + { + "epoch": 1.7174128993804976, + "grad_norm": 6.994879348292433, + "learning_rate": 2.570720817302533e-07, + "loss": 0.5805, + "step": 23772 + }, + { + "epoch": 1.717485144580707, + "grad_norm": 7.738205333108172, + "learning_rate": 2.5694290374208537e-07, + "loss": 0.6417, + "step": 23773 + }, + { + "epoch": 1.7175573897809164, + "grad_norm": 6.311965768699508, + "learning_rate": 2.5681375645962734e-07, + "loss": 0.5628, + "step": 23774 + }, + { + "epoch": 1.717629634981126, + "grad_norm": 7.78169341145316, + "learning_rate": 2.566846398846476e-07, + "loss": 0.592, + "step": 23775 + }, + { + "epoch": 1.7177018801813353, + "grad_norm": 7.49039072522607, + "learning_rate": 2.565555540189132e-07, + "loss": 0.6562, + "step": 23776 + }, + { + "epoch": 1.717774125381545, + "grad_norm": 7.141285003944904, + "learning_rate": 2.564264988641912e-07, + "loss": 0.547, + "step": 23777 + }, + { + "epoch": 1.7178463705817544, + "grad_norm": 8.036640658796017, + "learning_rate": 2.562974744222491e-07, + "loss": 0.6505, + "step": 23778 + }, + { + "epoch": 1.7179186157819641, + "grad_norm": 7.188540972086968, + "learning_rate": 2.561684806948519e-07, + "loss": 0.5809, + "step": 23779 + }, + { + "epoch": 1.7179908609821735, + "grad_norm": 7.295411577857142, + "learning_rate": 2.56039517683766e-07, + "loss": 0.6329, + "step": 23780 + }, + { + "epoch": 1.718063106182383, + "grad_norm": 8.354112226985835, + "learning_rate": 2.5591058539075676e-07, + "loss": 0.6253, + "step": 23781 + }, + { + "epoch": 1.7181353513825925, + "grad_norm": 6.699868716192055, + "learning_rate": 2.5578168381758944e-07, + "loss": 0.6536, + "step": 23782 + }, + { + "epoch": 1.7182075965828019, + "grad_norm": 7.982308853876294, + "learning_rate": 2.5565281296602824e-07, + "loss": 0.6186, + "step": 23783 + }, + { + "epoch": 1.7182798417830116, + "grad_norm": 6.72508104693844, + "learning_rate": 2.555239728378367e-07, + "loss": 0.5963, + "step": 23784 + }, + { + "epoch": 1.718352086983221, + "grad_norm": 6.647337198261615, + "learning_rate": 2.553951634347801e-07, + "loss": 0.5427, + "step": 23785 + }, + { + "epoch": 1.7184243321834307, + "grad_norm": 6.8953545011325605, + "learning_rate": 2.5526638475862065e-07, + "loss": 0.5719, + "step": 23786 + }, + { + "epoch": 1.71849657738364, + "grad_norm": 7.144784596468367, + "learning_rate": 2.5513763681112134e-07, + "loss": 0.5496, + "step": 23787 + }, + { + "epoch": 1.7185688225838496, + "grad_norm": 7.702880607891039, + "learning_rate": 2.55008919594045e-07, + "loss": 0.622, + "step": 23788 + }, + { + "epoch": 1.7186410677840591, + "grad_norm": 8.98867084797754, + "learning_rate": 2.548802331091535e-07, + "loss": 0.5957, + "step": 23789 + }, + { + "epoch": 1.7187133129842687, + "grad_norm": 7.432301218997526, + "learning_rate": 2.547515773582082e-07, + "loss": 0.5171, + "step": 23790 + }, + { + "epoch": 1.7187855581844782, + "grad_norm": 8.219780338814376, + "learning_rate": 2.546229523429705e-07, + "loss": 0.6384, + "step": 23791 + }, + { + "epoch": 1.7188578033846875, + "grad_norm": 7.339524632166029, + "learning_rate": 2.54494358065201e-07, + "loss": 0.5404, + "step": 23792 + }, + { + "epoch": 1.7189300485848973, + "grad_norm": 7.811868382670936, + "learning_rate": 2.543657945266603e-07, + "loss": 0.5802, + "step": 23793 + }, + { + "epoch": 1.7190022937851066, + "grad_norm": 8.017767725757539, + "learning_rate": 2.5423726172910806e-07, + "loss": 0.6785, + "step": 23794 + }, + { + "epoch": 1.7190745389853161, + "grad_norm": 6.958580637056804, + "learning_rate": 2.541087596743041e-07, + "loss": 0.5001, + "step": 23795 + }, + { + "epoch": 1.7191467841855257, + "grad_norm": 7.240294438481643, + "learning_rate": 2.53980288364008e-07, + "loss": 0.5967, + "step": 23796 + }, + { + "epoch": 1.7192190293857352, + "grad_norm": 8.17191887984156, + "learning_rate": 2.538518477999774e-07, + "loss": 0.6383, + "step": 23797 + }, + { + "epoch": 1.7192912745859448, + "grad_norm": 7.119896382638705, + "learning_rate": 2.537234379839709e-07, + "loss": 0.5684, + "step": 23798 + }, + { + "epoch": 1.719363519786154, + "grad_norm": 7.534720606493642, + "learning_rate": 2.5359505891774634e-07, + "loss": 0.6806, + "step": 23799 + }, + { + "epoch": 1.7194357649863639, + "grad_norm": 7.205218956166345, + "learning_rate": 2.5346671060306166e-07, + "loss": 0.5433, + "step": 23800 + }, + { + "epoch": 1.7195080101865732, + "grad_norm": 7.3251663449508895, + "learning_rate": 2.53338393041673e-07, + "loss": 0.5385, + "step": 23801 + }, + { + "epoch": 1.7195802553867827, + "grad_norm": 7.668301311321869, + "learning_rate": 2.532101062353373e-07, + "loss": 0.5633, + "step": 23802 + }, + { + "epoch": 1.7196525005869923, + "grad_norm": 8.491109349683423, + "learning_rate": 2.530818501858107e-07, + "loss": 0.6716, + "step": 23803 + }, + { + "epoch": 1.7197247457872018, + "grad_norm": 7.937368076459675, + "learning_rate": 2.5295362489484914e-07, + "loss": 0.5747, + "step": 23804 + }, + { + "epoch": 1.7197969909874113, + "grad_norm": 7.385655867805935, + "learning_rate": 2.528254303642075e-07, + "loss": 0.5421, + "step": 23805 + }, + { + "epoch": 1.7198692361876207, + "grad_norm": 7.954716791066431, + "learning_rate": 2.5269726659564094e-07, + "loss": 0.6098, + "step": 23806 + }, + { + "epoch": 1.7199414813878304, + "grad_norm": 5.878602242451106, + "learning_rate": 2.5256913359090445e-07, + "loss": 0.5966, + "step": 23807 + }, + { + "epoch": 1.7200137265880397, + "grad_norm": 8.029285235107714, + "learning_rate": 2.5244103135175085e-07, + "loss": 0.7017, + "step": 23808 + }, + { + "epoch": 1.7200859717882493, + "grad_norm": 8.449246290552665, + "learning_rate": 2.523129598799345e-07, + "loss": 0.5757, + "step": 23809 + }, + { + "epoch": 1.7201582169884588, + "grad_norm": 6.958447971275298, + "learning_rate": 2.521849191772091e-07, + "loss": 0.6101, + "step": 23810 + }, + { + "epoch": 1.7202304621886684, + "grad_norm": 7.832402627183222, + "learning_rate": 2.520569092453262e-07, + "loss": 0.6215, + "step": 23811 + }, + { + "epoch": 1.720302707388878, + "grad_norm": 8.06301665314094, + "learning_rate": 2.5192893008603837e-07, + "loss": 0.5898, + "step": 23812 + }, + { + "epoch": 1.7203749525890872, + "grad_norm": 6.680778762285537, + "learning_rate": 2.5180098170109857e-07, + "loss": 0.6478, + "step": 23813 + }, + { + "epoch": 1.720447197789297, + "grad_norm": 9.026683669344255, + "learning_rate": 2.5167306409225796e-07, + "loss": 0.6061, + "step": 23814 + }, + { + "epoch": 1.7205194429895063, + "grad_norm": 6.92264099704899, + "learning_rate": 2.51545177261267e-07, + "loss": 0.5302, + "step": 23815 + }, + { + "epoch": 1.7205916881897159, + "grad_norm": 7.788746359635569, + "learning_rate": 2.5141732120987696e-07, + "loss": 0.5587, + "step": 23816 + }, + { + "epoch": 1.7206639333899254, + "grad_norm": 7.3752718083353495, + "learning_rate": 2.512894959398382e-07, + "loss": 0.5771, + "step": 23817 + }, + { + "epoch": 1.720736178590135, + "grad_norm": 7.068203943614346, + "learning_rate": 2.511617014528997e-07, + "loss": 0.608, + "step": 23818 + }, + { + "epoch": 1.7208084237903445, + "grad_norm": 8.557766907942959, + "learning_rate": 2.510339377508114e-07, + "loss": 0.5563, + "step": 23819 + }, + { + "epoch": 1.7208806689905538, + "grad_norm": 6.538714801837256, + "learning_rate": 2.509062048353225e-07, + "loss": 0.6801, + "step": 23820 + }, + { + "epoch": 1.7209529141907636, + "grad_norm": 6.607525505856703, + "learning_rate": 2.5077850270818105e-07, + "loss": 0.5584, + "step": 23821 + }, + { + "epoch": 1.721025159390973, + "grad_norm": 6.031818580509828, + "learning_rate": 2.506508313711356e-07, + "loss": 0.5582, + "step": 23822 + }, + { + "epoch": 1.7210974045911824, + "grad_norm": 6.431763991566613, + "learning_rate": 2.5052319082593397e-07, + "loss": 0.5942, + "step": 23823 + }, + { + "epoch": 1.721169649791392, + "grad_norm": 8.480985129373146, + "learning_rate": 2.503955810743236e-07, + "loss": 0.6433, + "step": 23824 + }, + { + "epoch": 1.7212418949916015, + "grad_norm": 6.658289350699376, + "learning_rate": 2.502680021180504e-07, + "loss": 0.5605, + "step": 23825 + }, + { + "epoch": 1.721314140191811, + "grad_norm": 8.011986812159806, + "learning_rate": 2.5014045395886146e-07, + "loss": 0.5739, + "step": 23826 + }, + { + "epoch": 1.7213863853920204, + "grad_norm": 6.850347813595769, + "learning_rate": 2.5001293659850296e-07, + "loss": 0.578, + "step": 23827 + }, + { + "epoch": 1.7214586305922301, + "grad_norm": 7.712826575686834, + "learning_rate": 2.498854500387207e-07, + "loss": 0.5858, + "step": 23828 + }, + { + "epoch": 1.7215308757924395, + "grad_norm": 7.059550251670384, + "learning_rate": 2.497579942812592e-07, + "loss": 0.6088, + "step": 23829 + }, + { + "epoch": 1.721603120992649, + "grad_norm": 8.103976707553057, + "learning_rate": 2.496305693278633e-07, + "loss": 0.6077, + "step": 23830 + }, + { + "epoch": 1.7216753661928585, + "grad_norm": 7.6483370001856095, + "learning_rate": 2.4950317518027787e-07, + "loss": 0.5955, + "step": 23831 + }, + { + "epoch": 1.721747611393068, + "grad_norm": 6.977288596033831, + "learning_rate": 2.4937581184024644e-07, + "loss": 0.5831, + "step": 23832 + }, + { + "epoch": 1.7218198565932776, + "grad_norm": 6.905512291358514, + "learning_rate": 2.4924847930951264e-07, + "loss": 0.6138, + "step": 23833 + }, + { + "epoch": 1.721892101793487, + "grad_norm": 7.843727628516404, + "learning_rate": 2.491211775898197e-07, + "loss": 0.5898, + "step": 23834 + }, + { + "epoch": 1.7219643469936967, + "grad_norm": 9.31595542344746, + "learning_rate": 2.4899390668291056e-07, + "loss": 0.5503, + "step": 23835 + }, + { + "epoch": 1.722036592193906, + "grad_norm": 8.010591171870622, + "learning_rate": 2.488666665905265e-07, + "loss": 0.6202, + "step": 23836 + }, + { + "epoch": 1.7221088373941156, + "grad_norm": 6.913383838514451, + "learning_rate": 2.487394573144097e-07, + "loss": 0.58, + "step": 23837 + }, + { + "epoch": 1.7221810825943251, + "grad_norm": 6.816047794568477, + "learning_rate": 2.4861227885630225e-07, + "loss": 0.6255, + "step": 23838 + }, + { + "epoch": 1.7222533277945347, + "grad_norm": 8.43261622620507, + "learning_rate": 2.4848513121794453e-07, + "loss": 0.6235, + "step": 23839 + }, + { + "epoch": 1.7223255729947442, + "grad_norm": 6.631043179061506, + "learning_rate": 2.483580144010772e-07, + "loss": 0.6254, + "step": 23840 + }, + { + "epoch": 1.7223978181949535, + "grad_norm": 8.076721423639553, + "learning_rate": 2.482309284074394e-07, + "loss": 0.738, + "step": 23841 + }, + { + "epoch": 1.7224700633951633, + "grad_norm": 7.196832055292112, + "learning_rate": 2.4810387323877305e-07, + "loss": 0.5792, + "step": 23842 + }, + { + "epoch": 1.7225423085953726, + "grad_norm": 8.391026890398495, + "learning_rate": 2.4797684889681564e-07, + "loss": 0.6412, + "step": 23843 + }, + { + "epoch": 1.7226145537955824, + "grad_norm": 6.912986542339737, + "learning_rate": 2.478498553833067e-07, + "loss": 0.6567, + "step": 23844 + }, + { + "epoch": 1.7226867989957917, + "grad_norm": 7.22338929487214, + "learning_rate": 2.4772289269998443e-07, + "loss": 0.6197, + "step": 23845 + }, + { + "epoch": 1.7227590441960012, + "grad_norm": 8.026992084754472, + "learning_rate": 2.4759596084858755e-07, + "loss": 0.6813, + "step": 23846 + }, + { + "epoch": 1.7228312893962108, + "grad_norm": 8.280961632656163, + "learning_rate": 2.4746905983085267e-07, + "loss": 0.6045, + "step": 23847 + }, + { + "epoch": 1.72290353459642, + "grad_norm": 7.354159375872434, + "learning_rate": 2.473421896485173e-07, + "loss": 0.6, + "step": 23848 + }, + { + "epoch": 1.7229757797966299, + "grad_norm": 7.027215910889031, + "learning_rate": 2.4721535030331814e-07, + "loss": 0.6328, + "step": 23849 + }, + { + "epoch": 1.7230480249968392, + "grad_norm": 7.360766107243653, + "learning_rate": 2.4708854179699175e-07, + "loss": 0.6687, + "step": 23850 + }, + { + "epoch": 1.723120270197049, + "grad_norm": 8.75368051778347, + "learning_rate": 2.469617641312741e-07, + "loss": 0.617, + "step": 23851 + }, + { + "epoch": 1.7231925153972583, + "grad_norm": 7.439310318349095, + "learning_rate": 2.468350173079004e-07, + "loss": 0.5487, + "step": 23852 + }, + { + "epoch": 1.7232647605974678, + "grad_norm": 8.522363303071, + "learning_rate": 2.4670830132860616e-07, + "loss": 0.6347, + "step": 23853 + }, + { + "epoch": 1.7233370057976773, + "grad_norm": 7.7799536915947884, + "learning_rate": 2.465816161951254e-07, + "loss": 0.5677, + "step": 23854 + }, + { + "epoch": 1.7234092509978867, + "grad_norm": 7.209865105963592, + "learning_rate": 2.464549619091927e-07, + "loss": 0.5818, + "step": 23855 + }, + { + "epoch": 1.7234814961980964, + "grad_norm": 6.842778106378634, + "learning_rate": 2.463283384725418e-07, + "loss": 0.5939, + "step": 23856 + }, + { + "epoch": 1.7235537413983057, + "grad_norm": 8.324975769190994, + "learning_rate": 2.462017458869065e-07, + "loss": 0.6409, + "step": 23857 + }, + { + "epoch": 1.7236259865985155, + "grad_norm": 6.63276735252229, + "learning_rate": 2.4607518415401874e-07, + "loss": 0.5835, + "step": 23858 + }, + { + "epoch": 1.7236982317987248, + "grad_norm": 6.825681759944358, + "learning_rate": 2.4594865327561196e-07, + "loss": 0.6172, + "step": 23859 + }, + { + "epoch": 1.7237704769989344, + "grad_norm": 6.6393924062867535, + "learning_rate": 2.4582215325341774e-07, + "loss": 0.6259, + "step": 23860 + }, + { + "epoch": 1.723842722199144, + "grad_norm": 6.377876380810956, + "learning_rate": 2.4569568408916806e-07, + "loss": 0.5855, + "step": 23861 + }, + { + "epoch": 1.7239149673993535, + "grad_norm": 6.810941465393718, + "learning_rate": 2.4556924578459403e-07, + "loss": 0.6246, + "step": 23862 + }, + { + "epoch": 1.723987212599563, + "grad_norm": 7.734744161407773, + "learning_rate": 2.4544283834142647e-07, + "loss": 0.6718, + "step": 23863 + }, + { + "epoch": 1.7240594577997723, + "grad_norm": 7.277586182975422, + "learning_rate": 2.453164617613965e-07, + "loss": 0.6188, + "step": 23864 + }, + { + "epoch": 1.724131702999982, + "grad_norm": 6.822664569730414, + "learning_rate": 2.4519011604623305e-07, + "loss": 0.6467, + "step": 23865 + }, + { + "epoch": 1.7242039482001914, + "grad_norm": 9.356342926277422, + "learning_rate": 2.450638011976664e-07, + "loss": 0.6069, + "step": 23866 + }, + { + "epoch": 1.724276193400401, + "grad_norm": 7.318157127075932, + "learning_rate": 2.4493751721742564e-07, + "loss": 0.5584, + "step": 23867 + }, + { + "epoch": 1.7243484386006105, + "grad_norm": 7.7374959178414615, + "learning_rate": 2.4481126410723894e-07, + "loss": 0.6139, + "step": 23868 + }, + { + "epoch": 1.72442068380082, + "grad_norm": 6.503353134325868, + "learning_rate": 2.446850418688346e-07, + "loss": 0.5873, + "step": 23869 + }, + { + "epoch": 1.7244929290010296, + "grad_norm": 8.561509569343308, + "learning_rate": 2.4455885050394116e-07, + "loss": 0.6568, + "step": 23870 + }, + { + "epoch": 1.724565174201239, + "grad_norm": 8.652054083046234, + "learning_rate": 2.4443269001428626e-07, + "loss": 0.6191, + "step": 23871 + }, + { + "epoch": 1.7246374194014487, + "grad_norm": 6.599144053586816, + "learning_rate": 2.443065604015962e-07, + "loss": 0.6575, + "step": 23872 + }, + { + "epoch": 1.724709664601658, + "grad_norm": 7.136089328514674, + "learning_rate": 2.44180461667598e-07, + "loss": 0.6475, + "step": 23873 + }, + { + "epoch": 1.7247819098018675, + "grad_norm": 7.755350050561114, + "learning_rate": 2.44054393814018e-07, + "loss": 0.6086, + "step": 23874 + }, + { + "epoch": 1.724854155002077, + "grad_norm": 7.616058610257856, + "learning_rate": 2.4392835684258153e-07, + "loss": 0.6133, + "step": 23875 + }, + { + "epoch": 1.7249264002022866, + "grad_norm": 8.10531485170162, + "learning_rate": 2.4380235075501383e-07, + "loss": 0.6036, + "step": 23876 + }, + { + "epoch": 1.7249986454024961, + "grad_norm": 7.077994844364166, + "learning_rate": 2.4367637555304025e-07, + "loss": 0.6134, + "step": 23877 + }, + { + "epoch": 1.7250708906027055, + "grad_norm": 7.8482198555547, + "learning_rate": 2.435504312383852e-07, + "loss": 0.6724, + "step": 23878 + }, + { + "epoch": 1.7251431358029152, + "grad_norm": 7.602068579392784, + "learning_rate": 2.4342451781277255e-07, + "loss": 0.6132, + "step": 23879 + }, + { + "epoch": 1.7252153810031245, + "grad_norm": 6.84994240618252, + "learning_rate": 2.4329863527792627e-07, + "loss": 0.5938, + "step": 23880 + }, + { + "epoch": 1.725287626203334, + "grad_norm": 7.357280068566387, + "learning_rate": 2.4317278363556965e-07, + "loss": 0.6697, + "step": 23881 + }, + { + "epoch": 1.7253598714035436, + "grad_norm": 6.377174754475613, + "learning_rate": 2.4304696288742545e-07, + "loss": 0.5941, + "step": 23882 + }, + { + "epoch": 1.7254321166037532, + "grad_norm": 6.716698053758359, + "learning_rate": 2.4292117303521574e-07, + "loss": 0.5394, + "step": 23883 + }, + { + "epoch": 1.7255043618039627, + "grad_norm": 6.040529060687302, + "learning_rate": 2.427954140806624e-07, + "loss": 0.5434, + "step": 23884 + }, + { + "epoch": 1.725576607004172, + "grad_norm": 6.812973146019211, + "learning_rate": 2.426696860254879e-07, + "loss": 0.6461, + "step": 23885 + }, + { + "epoch": 1.7256488522043818, + "grad_norm": 7.489827187025339, + "learning_rate": 2.4254398887141205e-07, + "loss": 0.6211, + "step": 23886 + }, + { + "epoch": 1.7257210974045911, + "grad_norm": 6.770640991608982, + "learning_rate": 2.4241832262015625e-07, + "loss": 0.5114, + "step": 23887 + }, + { + "epoch": 1.7257933426048007, + "grad_norm": 7.70211494431747, + "learning_rate": 2.4229268727344075e-07, + "loss": 0.6, + "step": 23888 + }, + { + "epoch": 1.7258655878050102, + "grad_norm": 7.799820276781873, + "learning_rate": 2.421670828329853e-07, + "loss": 0.6073, + "step": 23889 + }, + { + "epoch": 1.7259378330052197, + "grad_norm": 7.662638587382913, + "learning_rate": 2.4204150930050964e-07, + "loss": 0.6023, + "step": 23890 + }, + { + "epoch": 1.7260100782054293, + "grad_norm": 7.731372720643328, + "learning_rate": 2.419159666777321e-07, + "loss": 0.634, + "step": 23891 + }, + { + "epoch": 1.7260823234056386, + "grad_norm": 7.309544675243116, + "learning_rate": 2.4179045496637243e-07, + "loss": 0.5417, + "step": 23892 + }, + { + "epoch": 1.7261545686058484, + "grad_norm": 7.656189213725533, + "learning_rate": 2.416649741681476e-07, + "loss": 0.557, + "step": 23893 + }, + { + "epoch": 1.7262268138060577, + "grad_norm": 7.481306812959796, + "learning_rate": 2.4153952428477565e-07, + "loss": 0.5632, + "step": 23894 + }, + { + "epoch": 1.7262990590062672, + "grad_norm": 6.726445390875893, + "learning_rate": 2.4141410531797416e-07, + "loss": 0.583, + "step": 23895 + }, + { + "epoch": 1.7263713042064768, + "grad_norm": 8.899615041572506, + "learning_rate": 2.412887172694603e-07, + "loss": 0.6637, + "step": 23896 + }, + { + "epoch": 1.7264435494066863, + "grad_norm": 6.193608207118934, + "learning_rate": 2.411633601409491e-07, + "loss": 0.5968, + "step": 23897 + }, + { + "epoch": 1.7265157946068959, + "grad_norm": 7.38973303285545, + "learning_rate": 2.410380339341584e-07, + "loss": 0.6014, + "step": 23898 + }, + { + "epoch": 1.7265880398071052, + "grad_norm": 7.545195398549657, + "learning_rate": 2.4091273865080314e-07, + "loss": 0.543, + "step": 23899 + }, + { + "epoch": 1.726660285007315, + "grad_norm": 7.848165416766873, + "learning_rate": 2.407874742925981e-07, + "loss": 0.5848, + "step": 23900 + }, + { + "epoch": 1.7267325302075243, + "grad_norm": 7.452902864548714, + "learning_rate": 2.4066224086125856e-07, + "loss": 0.6379, + "step": 23901 + }, + { + "epoch": 1.7268047754077338, + "grad_norm": 7.392790463017204, + "learning_rate": 2.4053703835849845e-07, + "loss": 0.6166, + "step": 23902 + }, + { + "epoch": 1.7268770206079433, + "grad_norm": 7.682323418675555, + "learning_rate": 2.4041186678603224e-07, + "loss": 0.6284, + "step": 23903 + }, + { + "epoch": 1.726949265808153, + "grad_norm": 7.274508650399444, + "learning_rate": 2.4028672614557296e-07, + "loss": 0.6316, + "step": 23904 + }, + { + "epoch": 1.7270215110083624, + "grad_norm": 7.068685877717427, + "learning_rate": 2.4016161643883373e-07, + "loss": 0.6124, + "step": 23905 + }, + { + "epoch": 1.7270937562085718, + "grad_norm": 7.444590034136681, + "learning_rate": 2.4003653766752735e-07, + "loss": 0.5533, + "step": 23906 + }, + { + "epoch": 1.7271660014087815, + "grad_norm": 7.663560016736801, + "learning_rate": 2.39911489833366e-07, + "loss": 0.6912, + "step": 23907 + }, + { + "epoch": 1.7272382466089908, + "grad_norm": 6.631131483739308, + "learning_rate": 2.3978647293806137e-07, + "loss": 0.5918, + "step": 23908 + }, + { + "epoch": 1.7273104918092004, + "grad_norm": 7.676249140876209, + "learning_rate": 2.3966148698332527e-07, + "loss": 0.5943, + "step": 23909 + }, + { + "epoch": 1.72738273700941, + "grad_norm": 7.500409941596005, + "learning_rate": 2.395365319708687e-07, + "loss": 0.6703, + "step": 23910 + }, + { + "epoch": 1.7274549822096195, + "grad_norm": 6.841925112093961, + "learning_rate": 2.3941160790240143e-07, + "loss": 0.5813, + "step": 23911 + }, + { + "epoch": 1.727527227409829, + "grad_norm": 7.055731306448597, + "learning_rate": 2.3928671477963407e-07, + "loss": 0.6322, + "step": 23912 + }, + { + "epoch": 1.7275994726100383, + "grad_norm": 6.672632536216789, + "learning_rate": 2.3916185260427603e-07, + "loss": 0.5981, + "step": 23913 + }, + { + "epoch": 1.727671717810248, + "grad_norm": 8.233165789712876, + "learning_rate": 2.390370213780374e-07, + "loss": 0.6771, + "step": 23914 + }, + { + "epoch": 1.7277439630104574, + "grad_norm": 6.682641376750938, + "learning_rate": 2.3891222110262595e-07, + "loss": 0.5595, + "step": 23915 + }, + { + "epoch": 1.727816208210667, + "grad_norm": 7.99567940388049, + "learning_rate": 2.387874517797506e-07, + "loss": 0.6104, + "step": 23916 + }, + { + "epoch": 1.7278884534108765, + "grad_norm": 6.556691823787293, + "learning_rate": 2.386627134111194e-07, + "loss": 0.5755, + "step": 23917 + }, + { + "epoch": 1.727960698611086, + "grad_norm": 8.590547165946507, + "learning_rate": 2.3853800599843964e-07, + "loss": 0.5527, + "step": 23918 + }, + { + "epoch": 1.7280329438112956, + "grad_norm": 7.402494742028644, + "learning_rate": 2.3841332954341879e-07, + "loss": 0.6586, + "step": 23919 + }, + { + "epoch": 1.728105189011505, + "grad_norm": 7.352922919755598, + "learning_rate": 2.3828868404776328e-07, + "loss": 0.6105, + "step": 23920 + }, + { + "epoch": 1.7281774342117147, + "grad_norm": 7.9879710839081985, + "learning_rate": 2.381640695131804e-07, + "loss": 0.6254, + "step": 23921 + }, + { + "epoch": 1.728249679411924, + "grad_norm": 7.734800631438762, + "learning_rate": 2.3803948594137428e-07, + "loss": 0.5787, + "step": 23922 + }, + { + "epoch": 1.7283219246121337, + "grad_norm": 7.24771424139042, + "learning_rate": 2.3791493333405163e-07, + "loss": 0.6041, + "step": 23923 + }, + { + "epoch": 1.728394169812343, + "grad_norm": 7.252381558429132, + "learning_rate": 2.377904116929175e-07, + "loss": 0.57, + "step": 23924 + }, + { + "epoch": 1.7284664150125526, + "grad_norm": 7.0842231939513844, + "learning_rate": 2.3766592101967524e-07, + "loss": 0.5735, + "step": 23925 + }, + { + "epoch": 1.7285386602127621, + "grad_norm": 7.7908444982303084, + "learning_rate": 2.375414613160304e-07, + "loss": 0.6431, + "step": 23926 + }, + { + "epoch": 1.7286109054129715, + "grad_norm": 7.85233278054589, + "learning_rate": 2.3741703258368638e-07, + "loss": 0.5417, + "step": 23927 + }, + { + "epoch": 1.7286831506131812, + "grad_norm": 7.2992535457702425, + "learning_rate": 2.3729263482434679e-07, + "loss": 0.5801, + "step": 23928 + }, + { + "epoch": 1.7287553958133905, + "grad_norm": 7.632868474717991, + "learning_rate": 2.371682680397136e-07, + "loss": 0.6244, + "step": 23929 + }, + { + "epoch": 1.7288276410136003, + "grad_norm": 7.618456174396567, + "learning_rate": 2.370439322314899e-07, + "loss": 0.6389, + "step": 23930 + }, + { + "epoch": 1.7288998862138096, + "grad_norm": 7.45737168347784, + "learning_rate": 2.3691962740137824e-07, + "loss": 0.6718, + "step": 23931 + }, + { + "epoch": 1.7289721314140192, + "grad_norm": 7.63161919127998, + "learning_rate": 2.367953535510789e-07, + "loss": 0.5956, + "step": 23932 + }, + { + "epoch": 1.7290443766142287, + "grad_norm": 6.148623627265343, + "learning_rate": 2.3667111068229443e-07, + "loss": 0.5003, + "step": 23933 + }, + { + "epoch": 1.729116621814438, + "grad_norm": 10.499967665849582, + "learning_rate": 2.365468987967248e-07, + "loss": 0.6556, + "step": 23934 + }, + { + "epoch": 1.7291888670146478, + "grad_norm": 6.770371109272769, + "learning_rate": 2.3642271789607068e-07, + "loss": 0.5858, + "step": 23935 + }, + { + "epoch": 1.7292611122148571, + "grad_norm": 7.361067721001082, + "learning_rate": 2.3629856798203203e-07, + "loss": 0.6059, + "step": 23936 + }, + { + "epoch": 1.7293333574150669, + "grad_norm": 7.450872884728659, + "learning_rate": 2.361744490563081e-07, + "loss": 0.6312, + "step": 23937 + }, + { + "epoch": 1.7294056026152762, + "grad_norm": 7.005572417108928, + "learning_rate": 2.360503611205986e-07, + "loss": 0.6098, + "step": 23938 + }, + { + "epoch": 1.7294778478154857, + "grad_norm": 8.05827516563064, + "learning_rate": 2.3592630417660217e-07, + "loss": 0.6494, + "step": 23939 + }, + { + "epoch": 1.7295500930156953, + "grad_norm": 8.71771999824343, + "learning_rate": 2.358022782260161e-07, + "loss": 0.61, + "step": 23940 + }, + { + "epoch": 1.7296223382159048, + "grad_norm": 8.350035853508823, + "learning_rate": 2.3567828327053898e-07, + "loss": 0.5527, + "step": 23941 + }, + { + "epoch": 1.7296945834161144, + "grad_norm": 7.256124606104584, + "learning_rate": 2.3555431931186838e-07, + "loss": 0.5944, + "step": 23942 + }, + { + "epoch": 1.7297668286163237, + "grad_norm": 11.298624032566837, + "learning_rate": 2.3543038635170045e-07, + "loss": 0.666, + "step": 23943 + }, + { + "epoch": 1.7298390738165335, + "grad_norm": 7.33061153301674, + "learning_rate": 2.3530648439173216e-07, + "loss": 0.6032, + "step": 23944 + }, + { + "epoch": 1.7299113190167428, + "grad_norm": 6.680674554721516, + "learning_rate": 2.351826134336596e-07, + "loss": 0.6305, + "step": 23945 + }, + { + "epoch": 1.7299835642169523, + "grad_norm": 7.418649329873914, + "learning_rate": 2.3505877347917873e-07, + "loss": 0.6756, + "step": 23946 + }, + { + "epoch": 1.7300558094171619, + "grad_norm": 7.016333461409828, + "learning_rate": 2.3493496452998476e-07, + "loss": 0.6266, + "step": 23947 + }, + { + "epoch": 1.7301280546173714, + "grad_norm": 9.024964782138913, + "learning_rate": 2.3481118658777224e-07, + "loss": 0.591, + "step": 23948 + }, + { + "epoch": 1.730200299817581, + "grad_norm": 8.094200092305437, + "learning_rate": 2.3468743965423617e-07, + "loss": 0.6331, + "step": 23949 + }, + { + "epoch": 1.7302725450177903, + "grad_norm": 7.789905315014717, + "learning_rate": 2.3456372373106967e-07, + "loss": 0.6583, + "step": 23950 + }, + { + "epoch": 1.730344790218, + "grad_norm": 6.53513611165083, + "learning_rate": 2.3444003881996692e-07, + "loss": 0.5476, + "step": 23951 + }, + { + "epoch": 1.7304170354182093, + "grad_norm": 6.219494952338027, + "learning_rate": 2.3431638492262098e-07, + "loss": 0.5417, + "step": 23952 + }, + { + "epoch": 1.730489280618419, + "grad_norm": 8.353188880022529, + "learning_rate": 2.34192762040725e-07, + "loss": 0.577, + "step": 23953 + }, + { + "epoch": 1.7305615258186284, + "grad_norm": 8.285479462401376, + "learning_rate": 2.340691701759698e-07, + "loss": 0.6063, + "step": 23954 + }, + { + "epoch": 1.730633771018838, + "grad_norm": 8.122491067709433, + "learning_rate": 2.3394560933004878e-07, + "loss": 0.6209, + "step": 23955 + }, + { + "epoch": 1.7307060162190475, + "grad_norm": 6.720199925309723, + "learning_rate": 2.3382207950465335e-07, + "loss": 0.5294, + "step": 23956 + }, + { + "epoch": 1.7307782614192568, + "grad_norm": 7.251420506716899, + "learning_rate": 2.3369858070147356e-07, + "loss": 0.602, + "step": 23957 + }, + { + "epoch": 1.7308505066194666, + "grad_norm": 6.99302952986867, + "learning_rate": 2.3357511292220054e-07, + "loss": 0.5879, + "step": 23958 + }, + { + "epoch": 1.730922751819676, + "grad_norm": 7.557091662763795, + "learning_rate": 2.334516761685246e-07, + "loss": 0.6112, + "step": 23959 + }, + { + "epoch": 1.7309949970198855, + "grad_norm": 7.657120386590908, + "learning_rate": 2.333282704421355e-07, + "loss": 0.5936, + "step": 23960 + }, + { + "epoch": 1.731067242220095, + "grad_norm": 6.113629432975963, + "learning_rate": 2.332048957447222e-07, + "loss": 0.609, + "step": 23961 + }, + { + "epoch": 1.7311394874203045, + "grad_norm": 9.440799363769633, + "learning_rate": 2.330815520779739e-07, + "loss": 0.546, + "step": 23962 + }, + { + "epoch": 1.731211732620514, + "grad_norm": 7.626893855803639, + "learning_rate": 2.3295823944357893e-07, + "loss": 0.6789, + "step": 23963 + }, + { + "epoch": 1.7312839778207234, + "grad_norm": 7.284300856701847, + "learning_rate": 2.328349578432254e-07, + "loss": 0.5257, + "step": 23964 + }, + { + "epoch": 1.7313562230209332, + "grad_norm": 8.074491823817189, + "learning_rate": 2.3271170727860088e-07, + "loss": 0.6025, + "step": 23965 + }, + { + "epoch": 1.7314284682211425, + "grad_norm": 6.478383432948136, + "learning_rate": 2.3258848775139285e-07, + "loss": 0.5806, + "step": 23966 + }, + { + "epoch": 1.731500713421352, + "grad_norm": 8.546371082465306, + "learning_rate": 2.3246529926328808e-07, + "loss": 0.615, + "step": 23967 + }, + { + "epoch": 1.7315729586215616, + "grad_norm": 6.785318446265903, + "learning_rate": 2.3234214181597242e-07, + "loss": 0.5408, + "step": 23968 + }, + { + "epoch": 1.7316452038217711, + "grad_norm": 7.793667487080131, + "learning_rate": 2.322190154111323e-07, + "loss": 0.5696, + "step": 23969 + }, + { + "epoch": 1.7317174490219807, + "grad_norm": 6.217693603348205, + "learning_rate": 2.3209592005045274e-07, + "loss": 0.5994, + "step": 23970 + }, + { + "epoch": 1.73178969422219, + "grad_norm": 6.764980787657349, + "learning_rate": 2.3197285573561962e-07, + "loss": 0.6238, + "step": 23971 + }, + { + "epoch": 1.7318619394223997, + "grad_norm": 7.340921319832723, + "learning_rate": 2.3184982246831688e-07, + "loss": 0.6309, + "step": 23972 + }, + { + "epoch": 1.731934184622609, + "grad_norm": 6.372695207026628, + "learning_rate": 2.3172682025022876e-07, + "loss": 0.6243, + "step": 23973 + }, + { + "epoch": 1.7320064298228186, + "grad_norm": 6.619398231930215, + "learning_rate": 2.3160384908303914e-07, + "loss": 0.5679, + "step": 23974 + }, + { + "epoch": 1.7320786750230281, + "grad_norm": 7.4835232626326, + "learning_rate": 2.3148090896843169e-07, + "loss": 0.6527, + "step": 23975 + }, + { + "epoch": 1.7321509202232377, + "grad_norm": 7.126884077495069, + "learning_rate": 2.3135799990808922e-07, + "loss": 0.5846, + "step": 23976 + }, + { + "epoch": 1.7322231654234472, + "grad_norm": 8.834962610620074, + "learning_rate": 2.312351219036943e-07, + "loss": 0.5999, + "step": 23977 + }, + { + "epoch": 1.7322954106236566, + "grad_norm": 7.392288633697719, + "learning_rate": 2.3111227495692944e-07, + "loss": 0.6651, + "step": 23978 + }, + { + "epoch": 1.7323676558238663, + "grad_norm": 7.05402831655237, + "learning_rate": 2.3098945906947529e-07, + "loss": 0.602, + "step": 23979 + }, + { + "epoch": 1.7324399010240756, + "grad_norm": 5.99107428086302, + "learning_rate": 2.3086667424301379e-07, + "loss": 0.5533, + "step": 23980 + }, + { + "epoch": 1.7325121462242852, + "grad_norm": 8.925803473714128, + "learning_rate": 2.3074392047922582e-07, + "loss": 0.6256, + "step": 23981 + }, + { + "epoch": 1.7325843914244947, + "grad_norm": 8.131779116774133, + "learning_rate": 2.306211977797912e-07, + "loss": 0.6266, + "step": 23982 + }, + { + "epoch": 1.7326566366247043, + "grad_norm": 6.290273381585882, + "learning_rate": 2.3049850614639047e-07, + "loss": 0.5913, + "step": 23983 + }, + { + "epoch": 1.7327288818249138, + "grad_norm": 7.107001342240853, + "learning_rate": 2.3037584558070286e-07, + "loss": 0.6296, + "step": 23984 + }, + { + "epoch": 1.7328011270251231, + "grad_norm": 7.716960745934689, + "learning_rate": 2.3025321608440843e-07, + "loss": 0.5788, + "step": 23985 + }, + { + "epoch": 1.7328733722253329, + "grad_norm": 6.555271199986397, + "learning_rate": 2.3013061765918444e-07, + "loss": 0.5597, + "step": 23986 + }, + { + "epoch": 1.7329456174255422, + "grad_norm": 8.802318995704963, + "learning_rate": 2.3000805030671007e-07, + "loss": 0.5875, + "step": 23987 + }, + { + "epoch": 1.7330178626257517, + "grad_norm": 7.541622632426767, + "learning_rate": 2.2988551402866294e-07, + "loss": 0.6347, + "step": 23988 + }, + { + "epoch": 1.7330901078259613, + "grad_norm": 7.417687192277031, + "learning_rate": 2.297630088267208e-07, + "loss": 0.5621, + "step": 23989 + }, + { + "epoch": 1.7331623530261708, + "grad_norm": 7.586452415583073, + "learning_rate": 2.2964053470256014e-07, + "loss": 0.6022, + "step": 23990 + }, + { + "epoch": 1.7332345982263804, + "grad_norm": 7.024651852054885, + "learning_rate": 2.2951809165785737e-07, + "loss": 0.6099, + "step": 23991 + }, + { + "epoch": 1.7333068434265897, + "grad_norm": 7.431593914110757, + "learning_rate": 2.2939567969428921e-07, + "loss": 0.5389, + "step": 23992 + }, + { + "epoch": 1.7333790886267995, + "grad_norm": 8.318839273181526, + "learning_rate": 2.2927329881353127e-07, + "loss": 0.6008, + "step": 23993 + }, + { + "epoch": 1.7334513338270088, + "grad_norm": 7.02940052716021, + "learning_rate": 2.291509490172586e-07, + "loss": 0.6276, + "step": 23994 + }, + { + "epoch": 1.7335235790272185, + "grad_norm": 6.666733201012796, + "learning_rate": 2.290286303071465e-07, + "loss": 0.6239, + "step": 23995 + }, + { + "epoch": 1.7335958242274279, + "grad_norm": 8.233365020485055, + "learning_rate": 2.2890634268486923e-07, + "loss": 0.5775, + "step": 23996 + }, + { + "epoch": 1.7336680694276374, + "grad_norm": 7.924454910353185, + "learning_rate": 2.287840861521007e-07, + "loss": 0.634, + "step": 23997 + }, + { + "epoch": 1.733740314627847, + "grad_norm": 6.38599121429575, + "learning_rate": 2.2866186071051427e-07, + "loss": 0.5752, + "step": 23998 + }, + { + "epoch": 1.7338125598280563, + "grad_norm": 6.762593706930761, + "learning_rate": 2.2853966636178393e-07, + "loss": 0.5595, + "step": 23999 + }, + { + "epoch": 1.733884805028266, + "grad_norm": 6.975738715974454, + "learning_rate": 2.2841750310758136e-07, + "loss": 0.6589, + "step": 24000 + }, + { + "epoch": 1.7339570502284753, + "grad_norm": 7.775556449971247, + "learning_rate": 2.2829537094957943e-07, + "loss": 0.6432, + "step": 24001 + }, + { + "epoch": 1.7340292954286851, + "grad_norm": 7.066102046122792, + "learning_rate": 2.2817326988944954e-07, + "loss": 0.5708, + "step": 24002 + }, + { + "epoch": 1.7341015406288944, + "grad_norm": 7.073401215875018, + "learning_rate": 2.2805119992886455e-07, + "loss": 0.6227, + "step": 24003 + }, + { + "epoch": 1.734173785829104, + "grad_norm": 6.367892023905949, + "learning_rate": 2.279291610694942e-07, + "loss": 0.5435, + "step": 24004 + }, + { + "epoch": 1.7342460310293135, + "grad_norm": 7.498463791238551, + "learning_rate": 2.278071533130094e-07, + "loss": 0.6075, + "step": 24005 + }, + { + "epoch": 1.7343182762295228, + "grad_norm": 8.449911751370959, + "learning_rate": 2.2768517666108075e-07, + "loss": 0.5412, + "step": 24006 + }, + { + "epoch": 1.7343905214297326, + "grad_norm": 7.1956753882601525, + "learning_rate": 2.2756323111537748e-07, + "loss": 0.6179, + "step": 24007 + }, + { + "epoch": 1.734462766629942, + "grad_norm": 6.24099143246156, + "learning_rate": 2.274413166775691e-07, + "loss": 0.6345, + "step": 24008 + }, + { + "epoch": 1.7345350118301517, + "grad_norm": 7.745680066487565, + "learning_rate": 2.273194333493245e-07, + "loss": 0.6256, + "step": 24009 + }, + { + "epoch": 1.734607257030361, + "grad_norm": 7.845434729819216, + "learning_rate": 2.2719758113231211e-07, + "loss": 0.617, + "step": 24010 + }, + { + "epoch": 1.7346795022305705, + "grad_norm": 7.153870032738399, + "learning_rate": 2.270757600282003e-07, + "loss": 0.5938, + "step": 24011 + }, + { + "epoch": 1.73475174743078, + "grad_norm": 7.456883920100377, + "learning_rate": 2.2695397003865666e-07, + "loss": 0.6281, + "step": 24012 + }, + { + "epoch": 1.7348239926309896, + "grad_norm": 7.8828091125178545, + "learning_rate": 2.2683221116534787e-07, + "loss": 0.6444, + "step": 24013 + }, + { + "epoch": 1.7348962378311992, + "grad_norm": 7.272138797186621, + "learning_rate": 2.267104834099418e-07, + "loss": 0.6142, + "step": 24014 + }, + { + "epoch": 1.7349684830314085, + "grad_norm": 8.336864587914725, + "learning_rate": 2.2658878677410374e-07, + "loss": 0.5971, + "step": 24015 + }, + { + "epoch": 1.7350407282316183, + "grad_norm": 6.569682232794261, + "learning_rate": 2.2646712125949987e-07, + "loss": 0.5838, + "step": 24016 + }, + { + "epoch": 1.7351129734318276, + "grad_norm": 7.397263108431478, + "learning_rate": 2.263454868677964e-07, + "loss": 0.5855, + "step": 24017 + }, + { + "epoch": 1.7351852186320371, + "grad_norm": 7.033396753444059, + "learning_rate": 2.2622388360065722e-07, + "loss": 0.6487, + "step": 24018 + }, + { + "epoch": 1.7352574638322467, + "grad_norm": 7.599245445539493, + "learning_rate": 2.261023114597477e-07, + "loss": 0.6076, + "step": 24019 + }, + { + "epoch": 1.7353297090324562, + "grad_norm": 6.692824481198712, + "learning_rate": 2.2598077044673206e-07, + "loss": 0.5836, + "step": 24020 + }, + { + "epoch": 1.7354019542326657, + "grad_norm": 6.761232990025573, + "learning_rate": 2.2585926056327368e-07, + "loss": 0.5584, + "step": 24021 + }, + { + "epoch": 1.735474199432875, + "grad_norm": 9.038567452816906, + "learning_rate": 2.2573778181103651e-07, + "loss": 0.5663, + "step": 24022 + }, + { + "epoch": 1.7355464446330848, + "grad_norm": 7.717572450585047, + "learning_rate": 2.256163341916834e-07, + "loss": 0.6217, + "step": 24023 + }, + { + "epoch": 1.7356186898332941, + "grad_norm": 7.5030976574203745, + "learning_rate": 2.2549491770687688e-07, + "loss": 0.676, + "step": 24024 + }, + { + "epoch": 1.7356909350335037, + "grad_norm": 7.752532053133562, + "learning_rate": 2.253735323582787e-07, + "loss": 0.6032, + "step": 24025 + }, + { + "epoch": 1.7357631802337132, + "grad_norm": 7.224521460079445, + "learning_rate": 2.2525217814755057e-07, + "loss": 0.5615, + "step": 24026 + }, + { + "epoch": 1.7358354254339228, + "grad_norm": 8.324818597140299, + "learning_rate": 2.2513085507635397e-07, + "loss": 0.6157, + "step": 24027 + }, + { + "epoch": 1.7359076706341323, + "grad_norm": 6.89379865087944, + "learning_rate": 2.2500956314635004e-07, + "loss": 0.595, + "step": 24028 + }, + { + "epoch": 1.7359799158343416, + "grad_norm": 7.236702925516408, + "learning_rate": 2.2488830235919828e-07, + "loss": 0.6554, + "step": 24029 + }, + { + "epoch": 1.7360521610345514, + "grad_norm": 6.694280306684423, + "learning_rate": 2.2476707271655908e-07, + "loss": 0.5359, + "step": 24030 + }, + { + "epoch": 1.7361244062347607, + "grad_norm": 7.282547078005014, + "learning_rate": 2.2464587422009215e-07, + "loss": 0.6147, + "step": 24031 + }, + { + "epoch": 1.7361966514349703, + "grad_norm": 9.47621147845356, + "learning_rate": 2.245247068714565e-07, + "loss": 0.6238, + "step": 24032 + }, + { + "epoch": 1.7362688966351798, + "grad_norm": 8.010259248917183, + "learning_rate": 2.2440357067231104e-07, + "loss": 0.6502, + "step": 24033 + }, + { + "epoch": 1.7363411418353893, + "grad_norm": 7.682523031252977, + "learning_rate": 2.2428246562431367e-07, + "loss": 0.6032, + "step": 24034 + }, + { + "epoch": 1.7364133870355989, + "grad_norm": 8.96531053046769, + "learning_rate": 2.241613917291227e-07, + "loss": 0.606, + "step": 24035 + }, + { + "epoch": 1.7364856322358082, + "grad_norm": 5.449686359864982, + "learning_rate": 2.2404034898839465e-07, + "loss": 0.6137, + "step": 24036 + }, + { + "epoch": 1.736557877436018, + "grad_norm": 6.203068115588516, + "learning_rate": 2.2391933740378734e-07, + "loss": 0.6031, + "step": 24037 + }, + { + "epoch": 1.7366301226362273, + "grad_norm": 7.013384964804854, + "learning_rate": 2.2379835697695695e-07, + "loss": 0.6713, + "step": 24038 + }, + { + "epoch": 1.7367023678364368, + "grad_norm": 8.147163093954372, + "learning_rate": 2.2367740770955965e-07, + "loss": 0.6364, + "step": 24039 + }, + { + "epoch": 1.7367746130366464, + "grad_norm": 7.205878072968639, + "learning_rate": 2.2355648960325104e-07, + "loss": 0.5591, + "step": 24040 + }, + { + "epoch": 1.736846858236856, + "grad_norm": 6.888935184665017, + "learning_rate": 2.2343560265968678e-07, + "loss": 0.6333, + "step": 24041 + }, + { + "epoch": 1.7369191034370655, + "grad_norm": 6.839424333942628, + "learning_rate": 2.233147468805219e-07, + "loss": 0.627, + "step": 24042 + }, + { + "epoch": 1.7369913486372748, + "grad_norm": 6.989727384520236, + "learning_rate": 2.2319392226740983e-07, + "loss": 0.6846, + "step": 24043 + }, + { + "epoch": 1.7370635938374845, + "grad_norm": 7.025321122293122, + "learning_rate": 2.2307312882200534e-07, + "loss": 0.5714, + "step": 24044 + }, + { + "epoch": 1.7371358390376939, + "grad_norm": 7.490768855197188, + "learning_rate": 2.2295236654596152e-07, + "loss": 0.6072, + "step": 24045 + }, + { + "epoch": 1.7372080842379034, + "grad_norm": 6.918982458990792, + "learning_rate": 2.2283163544093268e-07, + "loss": 0.6291, + "step": 24046 + }, + { + "epoch": 1.737280329438113, + "grad_norm": 7.542016655263463, + "learning_rate": 2.2271093550856964e-07, + "loss": 0.6684, + "step": 24047 + }, + { + "epoch": 1.7373525746383225, + "grad_norm": 7.228471031762613, + "learning_rate": 2.2259026675052614e-07, + "loss": 0.6562, + "step": 24048 + }, + { + "epoch": 1.737424819838532, + "grad_norm": 7.203422622986351, + "learning_rate": 2.2246962916845332e-07, + "loss": 0.5521, + "step": 24049 + }, + { + "epoch": 1.7374970650387414, + "grad_norm": 8.238642417403241, + "learning_rate": 2.2234902276400294e-07, + "loss": 0.6509, + "step": 24050 + }, + { + "epoch": 1.7375693102389511, + "grad_norm": 7.591539854071573, + "learning_rate": 2.2222844753882617e-07, + "loss": 0.6772, + "step": 24051 + }, + { + "epoch": 1.7376415554391604, + "grad_norm": 8.115702312396476, + "learning_rate": 2.2210790349457307e-07, + "loss": 0.5699, + "step": 24052 + }, + { + "epoch": 1.73771380063937, + "grad_norm": 7.048860595831279, + "learning_rate": 2.219873906328948e-07, + "loss": 0.6036, + "step": 24053 + }, + { + "epoch": 1.7377860458395795, + "grad_norm": 7.0546737113133515, + "learning_rate": 2.2186690895543982e-07, + "loss": 0.6088, + "step": 24054 + }, + { + "epoch": 1.737858291039789, + "grad_norm": 8.138014947914538, + "learning_rate": 2.217464584638579e-07, + "loss": 0.551, + "step": 24055 + }, + { + "epoch": 1.7379305362399986, + "grad_norm": 7.058975285413466, + "learning_rate": 2.2162603915979852e-07, + "loss": 0.5947, + "step": 24056 + }, + { + "epoch": 1.738002781440208, + "grad_norm": 6.869170335537824, + "learning_rate": 2.215056510449093e-07, + "loss": 0.5743, + "step": 24057 + }, + { + "epoch": 1.7380750266404177, + "grad_norm": 8.120778557044797, + "learning_rate": 2.213852941208386e-07, + "loss": 0.6139, + "step": 24058 + }, + { + "epoch": 1.738147271840627, + "grad_norm": 8.474011352848374, + "learning_rate": 2.2126496838923346e-07, + "loss": 0.6349, + "step": 24059 + }, + { + "epoch": 1.7382195170408365, + "grad_norm": 8.040371120105306, + "learning_rate": 2.2114467385174255e-07, + "loss": 0.6636, + "step": 24060 + }, + { + "epoch": 1.738291762241046, + "grad_norm": 7.364630181479647, + "learning_rate": 2.2102441051001094e-07, + "loss": 0.651, + "step": 24061 + }, + { + "epoch": 1.7383640074412556, + "grad_norm": 6.773382870850925, + "learning_rate": 2.2090417836568595e-07, + "loss": 0.5823, + "step": 24062 + }, + { + "epoch": 1.7384362526414652, + "grad_norm": 8.162687720671133, + "learning_rate": 2.2078397742041347e-07, + "loss": 0.5575, + "step": 24063 + }, + { + "epoch": 1.7385084978416745, + "grad_norm": 7.942516513863729, + "learning_rate": 2.20663807675838e-07, + "loss": 0.5934, + "step": 24064 + }, + { + "epoch": 1.7385807430418843, + "grad_norm": 8.803464328995405, + "learning_rate": 2.2054366913360548e-07, + "loss": 0.5687, + "step": 24065 + }, + { + "epoch": 1.7386529882420936, + "grad_norm": 6.397347866334954, + "learning_rate": 2.204235617953601e-07, + "loss": 0.628, + "step": 24066 + }, + { + "epoch": 1.7387252334423033, + "grad_norm": 8.2057338671407, + "learning_rate": 2.2030348566274618e-07, + "loss": 0.6891, + "step": 24067 + }, + { + "epoch": 1.7387974786425127, + "grad_norm": 6.263195908307503, + "learning_rate": 2.201834407374076e-07, + "loss": 0.563, + "step": 24068 + }, + { + "epoch": 1.7388697238427222, + "grad_norm": 6.0009494666037995, + "learning_rate": 2.2006342702098753e-07, + "loss": 0.521, + "step": 24069 + }, + { + "epoch": 1.7389419690429317, + "grad_norm": 7.8698723463008164, + "learning_rate": 2.1994344451512883e-07, + "loss": 0.6292, + "step": 24070 + }, + { + "epoch": 1.739014214243141, + "grad_norm": 8.060076682054921, + "learning_rate": 2.1982349322147462e-07, + "loss": 0.6051, + "step": 24071 + }, + { + "epoch": 1.7390864594433508, + "grad_norm": 7.0970849083815635, + "learning_rate": 2.1970357314166553e-07, + "loss": 0.6226, + "step": 24072 + }, + { + "epoch": 1.7391587046435601, + "grad_norm": 7.138710349588521, + "learning_rate": 2.1958368427734438e-07, + "loss": 0.647, + "step": 24073 + }, + { + "epoch": 1.73923094984377, + "grad_norm": 6.976387252400699, + "learning_rate": 2.1946382663015214e-07, + "loss": 0.5747, + "step": 24074 + }, + { + "epoch": 1.7393031950439792, + "grad_norm": 7.130160420500391, + "learning_rate": 2.1934400020172914e-07, + "loss": 0.5777, + "step": 24075 + }, + { + "epoch": 1.7393754402441888, + "grad_norm": 7.404391413683588, + "learning_rate": 2.19224204993716e-07, + "loss": 0.6754, + "step": 24076 + }, + { + "epoch": 1.7394476854443983, + "grad_norm": 7.882869845012551, + "learning_rate": 2.1910444100775224e-07, + "loss": 0.5663, + "step": 24077 + }, + { + "epoch": 1.7395199306446076, + "grad_norm": 7.511098406603501, + "learning_rate": 2.1898470824547795e-07, + "loss": 0.6404, + "step": 24078 + }, + { + "epoch": 1.7395921758448174, + "grad_norm": 8.39663452667268, + "learning_rate": 2.1886500670853183e-07, + "loss": 0.6786, + "step": 24079 + }, + { + "epoch": 1.7396644210450267, + "grad_norm": 10.107340541641358, + "learning_rate": 2.1874533639855284e-07, + "loss": 0.5822, + "step": 24080 + }, + { + "epoch": 1.7397366662452365, + "grad_norm": 6.72094662265954, + "learning_rate": 2.1862569731717908e-07, + "loss": 0.5758, + "step": 24081 + }, + { + "epoch": 1.7398089114454458, + "grad_norm": 6.839144337098895, + "learning_rate": 2.1850608946604763e-07, + "loss": 0.6499, + "step": 24082 + }, + { + "epoch": 1.7398811566456553, + "grad_norm": 7.222154747411373, + "learning_rate": 2.1838651284679658e-07, + "loss": 0.5866, + "step": 24083 + }, + { + "epoch": 1.7399534018458649, + "grad_norm": 7.030892460527604, + "learning_rate": 2.1826696746106242e-07, + "loss": 0.5602, + "step": 24084 + }, + { + "epoch": 1.7400256470460744, + "grad_norm": 6.505729497634121, + "learning_rate": 2.1814745331048242e-07, + "loss": 0.6019, + "step": 24085 + }, + { + "epoch": 1.740097892246284, + "grad_norm": 6.986127458590759, + "learning_rate": 2.180279703966917e-07, + "loss": 0.6161, + "step": 24086 + }, + { + "epoch": 1.7401701374464933, + "grad_norm": 7.9608919780653835, + "learning_rate": 2.1790851872132563e-07, + "loss": 0.5676, + "step": 24087 + }, + { + "epoch": 1.740242382646703, + "grad_norm": 6.699934477983896, + "learning_rate": 2.177890982860209e-07, + "loss": 0.6165, + "step": 24088 + }, + { + "epoch": 1.7403146278469124, + "grad_norm": 7.707690512857414, + "learning_rate": 2.176697090924107e-07, + "loss": 0.542, + "step": 24089 + }, + { + "epoch": 1.740386873047122, + "grad_norm": 7.4853734124966484, + "learning_rate": 2.1755035114213034e-07, + "loss": 0.654, + "step": 24090 + }, + { + "epoch": 1.7404591182473315, + "grad_norm": 6.780588425389883, + "learning_rate": 2.1743102443681358e-07, + "loss": 0.6145, + "step": 24091 + }, + { + "epoch": 1.740531363447541, + "grad_norm": 6.961906322871452, + "learning_rate": 2.1731172897809404e-07, + "loss": 0.6113, + "step": 24092 + }, + { + "epoch": 1.7406036086477505, + "grad_norm": 6.741207471501536, + "learning_rate": 2.171924647676041e-07, + "loss": 0.5443, + "step": 24093 + }, + { + "epoch": 1.7406758538479599, + "grad_norm": 6.739308402238208, + "learning_rate": 2.170732318069768e-07, + "loss": 0.619, + "step": 24094 + }, + { + "epoch": 1.7407480990481696, + "grad_norm": 10.966041860976079, + "learning_rate": 2.1695403009784454e-07, + "loss": 0.562, + "step": 24095 + }, + { + "epoch": 1.740820344248379, + "grad_norm": 7.932837611292612, + "learning_rate": 2.1683485964183875e-07, + "loss": 0.579, + "step": 24096 + }, + { + "epoch": 1.7408925894485885, + "grad_norm": 7.168815941079134, + "learning_rate": 2.1671572044059119e-07, + "loss": 0.5375, + "step": 24097 + }, + { + "epoch": 1.740964834648798, + "grad_norm": 6.518339254984079, + "learning_rate": 2.165966124957325e-07, + "loss": 0.5439, + "step": 24098 + }, + { + "epoch": 1.7410370798490076, + "grad_norm": 6.991631546508219, + "learning_rate": 2.164775358088936e-07, + "loss": 0.5931, + "step": 24099 + }, + { + "epoch": 1.7411093250492171, + "grad_norm": 7.163886780492203, + "learning_rate": 2.163584903817037e-07, + "loss": 0.5768, + "step": 24100 + }, + { + "epoch": 1.7411815702494264, + "grad_norm": 7.658428022472407, + "learning_rate": 2.1623947621579293e-07, + "loss": 0.6712, + "step": 24101 + }, + { + "epoch": 1.7412538154496362, + "grad_norm": 6.290276717025875, + "learning_rate": 2.1612049331279084e-07, + "loss": 0.5527, + "step": 24102 + }, + { + "epoch": 1.7413260606498455, + "grad_norm": 7.046855512029595, + "learning_rate": 2.1600154167432609e-07, + "loss": 0.5702, + "step": 24103 + }, + { + "epoch": 1.741398305850055, + "grad_norm": 8.618173302491506, + "learning_rate": 2.1588262130202625e-07, + "loss": 0.6135, + "step": 24104 + }, + { + "epoch": 1.7414705510502646, + "grad_norm": 6.674351681431811, + "learning_rate": 2.1576373219751978e-07, + "loss": 0.6353, + "step": 24105 + }, + { + "epoch": 1.7415427962504741, + "grad_norm": 7.931184914805103, + "learning_rate": 2.1564487436243453e-07, + "loss": 0.567, + "step": 24106 + }, + { + "epoch": 1.7416150414506837, + "grad_norm": 8.315367240021681, + "learning_rate": 2.1552604779839698e-07, + "loss": 0.6508, + "step": 24107 + }, + { + "epoch": 1.741687286650893, + "grad_norm": 8.324872668326384, + "learning_rate": 2.1540725250703414e-07, + "loss": 0.6164, + "step": 24108 + }, + { + "epoch": 1.7417595318511028, + "grad_norm": 7.77432911422391, + "learning_rate": 2.1528848848997226e-07, + "loss": 0.6089, + "step": 24109 + }, + { + "epoch": 1.741831777051312, + "grad_norm": 7.892131518080628, + "learning_rate": 2.1516975574883747e-07, + "loss": 0.6213, + "step": 24110 + }, + { + "epoch": 1.7419040222515216, + "grad_norm": 7.304175439522134, + "learning_rate": 2.150510542852541e-07, + "loss": 0.5873, + "step": 24111 + }, + { + "epoch": 1.7419762674517312, + "grad_norm": 8.823983144126695, + "learning_rate": 2.1493238410084771e-07, + "loss": 0.6945, + "step": 24112 + }, + { + "epoch": 1.7420485126519407, + "grad_norm": 7.298038626771686, + "learning_rate": 2.1481374519724317e-07, + "loss": 0.6064, + "step": 24113 + }, + { + "epoch": 1.7421207578521503, + "grad_norm": 8.427150227800265, + "learning_rate": 2.1469513757606364e-07, + "loss": 0.6179, + "step": 24114 + }, + { + "epoch": 1.7421930030523596, + "grad_norm": 6.854573648384197, + "learning_rate": 2.1457656123893277e-07, + "loss": 0.625, + "step": 24115 + }, + { + "epoch": 1.7422652482525693, + "grad_norm": 7.180485495301984, + "learning_rate": 2.1445801618747487e-07, + "loss": 0.5947, + "step": 24116 + }, + { + "epoch": 1.7423374934527787, + "grad_norm": 5.559014246162541, + "learning_rate": 2.1433950242331246e-07, + "loss": 0.5434, + "step": 24117 + }, + { + "epoch": 1.7424097386529882, + "grad_norm": 6.1947112014117724, + "learning_rate": 2.142210199480671e-07, + "loss": 0.5767, + "step": 24118 + }, + { + "epoch": 1.7424819838531977, + "grad_norm": 8.290222604521517, + "learning_rate": 2.1410256876336106e-07, + "loss": 0.6144, + "step": 24119 + }, + { + "epoch": 1.7425542290534073, + "grad_norm": 7.740820586005502, + "learning_rate": 2.139841488708161e-07, + "loss": 0.636, + "step": 24120 + }, + { + "epoch": 1.7426264742536168, + "grad_norm": 8.072109912226084, + "learning_rate": 2.138657602720537e-07, + "loss": 0.5902, + "step": 24121 + }, + { + "epoch": 1.7426987194538262, + "grad_norm": 7.077481474264323, + "learning_rate": 2.137474029686934e-07, + "loss": 0.565, + "step": 24122 + }, + { + "epoch": 1.742770964654036, + "grad_norm": 7.014192906537926, + "learning_rate": 2.1362907696235614e-07, + "loss": 0.6327, + "step": 24123 + }, + { + "epoch": 1.7428432098542452, + "grad_norm": 8.416578789683443, + "learning_rate": 2.1351078225466142e-07, + "loss": 0.6082, + "step": 24124 + }, + { + "epoch": 1.7429154550544548, + "grad_norm": 7.822851784373983, + "learning_rate": 2.1339251884722883e-07, + "loss": 0.6395, + "step": 24125 + }, + { + "epoch": 1.7429877002546643, + "grad_norm": 6.9615011104776565, + "learning_rate": 2.1327428674167704e-07, + "loss": 0.5172, + "step": 24126 + }, + { + "epoch": 1.7430599454548739, + "grad_norm": 7.516416511968751, + "learning_rate": 2.1315608593962505e-07, + "loss": 0.578, + "step": 24127 + }, + { + "epoch": 1.7431321906550834, + "grad_norm": 6.842414621162648, + "learning_rate": 2.1303791644269072e-07, + "loss": 0.624, + "step": 24128 + }, + { + "epoch": 1.7432044358552927, + "grad_norm": 6.397259912431808, + "learning_rate": 2.1291977825249138e-07, + "loss": 0.5998, + "step": 24129 + }, + { + "epoch": 1.7432766810555025, + "grad_norm": 7.7649600935952625, + "learning_rate": 2.1280167137064433e-07, + "loss": 0.6783, + "step": 24130 + }, + { + "epoch": 1.7433489262557118, + "grad_norm": 7.392427188436023, + "learning_rate": 2.126835957987672e-07, + "loss": 0.6504, + "step": 24131 + }, + { + "epoch": 1.7434211714559213, + "grad_norm": 6.173705165257675, + "learning_rate": 2.1256555153847503e-07, + "loss": 0.6032, + "step": 24132 + }, + { + "epoch": 1.743493416656131, + "grad_norm": 7.816546559444701, + "learning_rate": 2.1244753859138434e-07, + "loss": 0.6012, + "step": 24133 + }, + { + "epoch": 1.7435656618563404, + "grad_norm": 7.67277816770373, + "learning_rate": 2.1232955695911079e-07, + "loss": 0.6533, + "step": 24134 + }, + { + "epoch": 1.74363790705655, + "grad_norm": 7.405630608906724, + "learning_rate": 2.122116066432692e-07, + "loss": 0.7342, + "step": 24135 + }, + { + "epoch": 1.7437101522567593, + "grad_norm": 7.518692627748949, + "learning_rate": 2.120936876454746e-07, + "loss": 0.5494, + "step": 24136 + }, + { + "epoch": 1.743782397456969, + "grad_norm": 8.613009314017916, + "learning_rate": 2.119757999673408e-07, + "loss": 0.6224, + "step": 24137 + }, + { + "epoch": 1.7438546426571784, + "grad_norm": 6.021959492081259, + "learning_rate": 2.118579436104823e-07, + "loss": 0.5758, + "step": 24138 + }, + { + "epoch": 1.743926887857388, + "grad_norm": 7.091830762794276, + "learning_rate": 2.117401185765114e-07, + "loss": 0.5804, + "step": 24139 + }, + { + "epoch": 1.7439991330575975, + "grad_norm": 8.199169433212809, + "learning_rate": 2.1162232486704153e-07, + "loss": 0.6541, + "step": 24140 + }, + { + "epoch": 1.744071378257807, + "grad_norm": 7.105735031431587, + "learning_rate": 2.1150456248368534e-07, + "loss": 0.6058, + "step": 24141 + }, + { + "epoch": 1.7441436234580165, + "grad_norm": 6.868856841237843, + "learning_rate": 2.113868314280551e-07, + "loss": 0.6477, + "step": 24142 + }, + { + "epoch": 1.7442158686582259, + "grad_norm": 9.566387508275843, + "learning_rate": 2.1126913170176178e-07, + "loss": 0.5912, + "step": 24143 + }, + { + "epoch": 1.7442881138584356, + "grad_norm": 6.788473334170226, + "learning_rate": 2.1115146330641657e-07, + "loss": 0.6095, + "step": 24144 + }, + { + "epoch": 1.744360359058645, + "grad_norm": 7.117539900630031, + "learning_rate": 2.1103382624363093e-07, + "loss": 0.579, + "step": 24145 + }, + { + "epoch": 1.7444326042588547, + "grad_norm": 8.365810989860242, + "learning_rate": 2.1091622051501558e-07, + "loss": 0.5796, + "step": 24146 + }, + { + "epoch": 1.744504849459064, + "grad_norm": 7.213491927616952, + "learning_rate": 2.107986461221792e-07, + "loss": 0.5624, + "step": 24147 + }, + { + "epoch": 1.7445770946592736, + "grad_norm": 7.140131893702851, + "learning_rate": 2.1068110306673188e-07, + "loss": 0.5777, + "step": 24148 + }, + { + "epoch": 1.7446493398594831, + "grad_norm": 8.567390771486243, + "learning_rate": 2.105635913502832e-07, + "loss": 0.6359, + "step": 24149 + }, + { + "epoch": 1.7447215850596924, + "grad_norm": 6.584100252058013, + "learning_rate": 2.10446110974441e-07, + "loss": 0.6399, + "step": 24150 + }, + { + "epoch": 1.7447938302599022, + "grad_norm": 8.017821964547217, + "learning_rate": 2.1032866194081375e-07, + "loss": 0.6374, + "step": 24151 + }, + { + "epoch": 1.7448660754601115, + "grad_norm": 7.997798139828813, + "learning_rate": 2.102112442510093e-07, + "loss": 0.5681, + "step": 24152 + }, + { + "epoch": 1.7449383206603213, + "grad_norm": 8.211929333976911, + "learning_rate": 2.10093857906635e-07, + "loss": 0.5781, + "step": 24153 + }, + { + "epoch": 1.7450105658605306, + "grad_norm": 7.15985443095124, + "learning_rate": 2.0997650290929788e-07, + "loss": 0.553, + "step": 24154 + }, + { + "epoch": 1.7450828110607401, + "grad_norm": 8.618932829114806, + "learning_rate": 2.0985917926060418e-07, + "loss": 0.6243, + "step": 24155 + }, + { + "epoch": 1.7451550562609497, + "grad_norm": 6.817477866158892, + "learning_rate": 2.0974188696216064e-07, + "loss": 0.6375, + "step": 24156 + }, + { + "epoch": 1.745227301461159, + "grad_norm": 8.04595669441602, + "learning_rate": 2.0962462601557208e-07, + "loss": 0.6679, + "step": 24157 + }, + { + "epoch": 1.7452995466613688, + "grad_norm": 6.889039841232015, + "learning_rate": 2.0950739642244395e-07, + "loss": 0.5909, + "step": 24158 + }, + { + "epoch": 1.745371791861578, + "grad_norm": 7.048467688727331, + "learning_rate": 2.09390198184381e-07, + "loss": 0.6771, + "step": 24159 + }, + { + "epoch": 1.7454440370617879, + "grad_norm": 6.628586572009674, + "learning_rate": 2.0927303130298837e-07, + "loss": 0.587, + "step": 24160 + }, + { + "epoch": 1.7455162822619972, + "grad_norm": 8.083817090637492, + "learning_rate": 2.0915589577986867e-07, + "loss": 0.5614, + "step": 24161 + }, + { + "epoch": 1.7455885274622067, + "grad_norm": 5.826736243826981, + "learning_rate": 2.0903879161662615e-07, + "loss": 0.5868, + "step": 24162 + }, + { + "epoch": 1.7456607726624163, + "grad_norm": 7.046477921964726, + "learning_rate": 2.0892171881486373e-07, + "loss": 0.5947, + "step": 24163 + }, + { + "epoch": 1.7457330178626258, + "grad_norm": 8.375533357404535, + "learning_rate": 2.08804677376184e-07, + "loss": 0.578, + "step": 24164 + }, + { + "epoch": 1.7458052630628353, + "grad_norm": 8.093998849938917, + "learning_rate": 2.0868766730218925e-07, + "loss": 0.6194, + "step": 24165 + }, + { + "epoch": 1.7458775082630447, + "grad_norm": 7.6843374306472825, + "learning_rate": 2.0857068859448131e-07, + "loss": 0.5782, + "step": 24166 + }, + { + "epoch": 1.7459497534632544, + "grad_norm": 6.652968479822061, + "learning_rate": 2.0845374125466166e-07, + "loss": 0.508, + "step": 24167 + }, + { + "epoch": 1.7460219986634637, + "grad_norm": 8.663447393488697, + "learning_rate": 2.0833682528433096e-07, + "loss": 0.6016, + "step": 24168 + }, + { + "epoch": 1.7460942438636733, + "grad_norm": 7.532641523660515, + "learning_rate": 2.082199406850896e-07, + "loss": 0.6836, + "step": 24169 + }, + { + "epoch": 1.7461664890638828, + "grad_norm": 7.194492024084754, + "learning_rate": 2.0810308745853768e-07, + "loss": 0.5921, + "step": 24170 + }, + { + "epoch": 1.7462387342640924, + "grad_norm": 5.738945659608178, + "learning_rate": 2.079862656062756e-07, + "loss": 0.614, + "step": 24171 + }, + { + "epoch": 1.746310979464302, + "grad_norm": 7.5965827085378494, + "learning_rate": 2.0786947512990068e-07, + "loss": 0.6616, + "step": 24172 + }, + { + "epoch": 1.7463832246645112, + "grad_norm": 7.436984517161246, + "learning_rate": 2.077527160310136e-07, + "loss": 0.5962, + "step": 24173 + }, + { + "epoch": 1.746455469864721, + "grad_norm": 7.4947376863410335, + "learning_rate": 2.076359883112125e-07, + "loss": 0.6665, + "step": 24174 + }, + { + "epoch": 1.7465277150649303, + "grad_norm": 7.90687745510754, + "learning_rate": 2.0751929197209447e-07, + "loss": 0.6067, + "step": 24175 + }, + { + "epoch": 1.7465999602651399, + "grad_norm": 6.22925818470693, + "learning_rate": 2.074026270152571e-07, + "loss": 0.553, + "step": 24176 + }, + { + "epoch": 1.7466722054653494, + "grad_norm": 7.584462958518594, + "learning_rate": 2.0728599344229773e-07, + "loss": 0.5854, + "step": 24177 + }, + { + "epoch": 1.746744450665559, + "grad_norm": 6.754706543280755, + "learning_rate": 2.0716939125481367e-07, + "loss": 0.5486, + "step": 24178 + }, + { + "epoch": 1.7468166958657685, + "grad_norm": 6.869288065751315, + "learning_rate": 2.070528204543995e-07, + "loss": 0.6209, + "step": 24179 + }, + { + "epoch": 1.7468889410659778, + "grad_norm": 7.001954214609399, + "learning_rate": 2.06936281042652e-07, + "loss": 0.5727, + "step": 24180 + }, + { + "epoch": 1.7469611862661876, + "grad_norm": 7.259035728037694, + "learning_rate": 2.0681977302116658e-07, + "loss": 0.5746, + "step": 24181 + }, + { + "epoch": 1.747033431466397, + "grad_norm": 6.780412050852492, + "learning_rate": 2.0670329639153773e-07, + "loss": 0.6886, + "step": 24182 + }, + { + "epoch": 1.7471056766666064, + "grad_norm": 7.618285678048788, + "learning_rate": 2.065868511553601e-07, + "loss": 0.6579, + "step": 24183 + }, + { + "epoch": 1.747177921866816, + "grad_norm": 8.53614884460012, + "learning_rate": 2.0647043731422788e-07, + "loss": 0.6284, + "step": 24184 + }, + { + "epoch": 1.7472501670670255, + "grad_norm": 7.568094753438377, + "learning_rate": 2.0635405486973487e-07, + "loss": 0.6877, + "step": 24185 + }, + { + "epoch": 1.747322412267235, + "grad_norm": 6.762045391265512, + "learning_rate": 2.0623770382347363e-07, + "loss": 0.5846, + "step": 24186 + }, + { + "epoch": 1.7473946574674444, + "grad_norm": 7.085233844782676, + "learning_rate": 2.061213841770371e-07, + "loss": 0.6384, + "step": 24187 + }, + { + "epoch": 1.7474669026676541, + "grad_norm": 7.972413421476552, + "learning_rate": 2.060050959320184e-07, + "loss": 0.6067, + "step": 24188 + }, + { + "epoch": 1.7475391478678635, + "grad_norm": 6.512419353432405, + "learning_rate": 2.0588883909000794e-07, + "loss": 0.6544, + "step": 24189 + }, + { + "epoch": 1.747611393068073, + "grad_norm": 7.134481178459371, + "learning_rate": 2.0577261365259839e-07, + "loss": 0.5602, + "step": 24190 + }, + { + "epoch": 1.7476836382682825, + "grad_norm": 5.888991556932068, + "learning_rate": 2.0565641962138034e-07, + "loss": 0.7043, + "step": 24191 + }, + { + "epoch": 1.747755883468492, + "grad_norm": 7.927651129767425, + "learning_rate": 2.0554025699794423e-07, + "loss": 0.6877, + "step": 24192 + }, + { + "epoch": 1.7478281286687016, + "grad_norm": 7.685817627457671, + "learning_rate": 2.0542412578388072e-07, + "loss": 0.5615, + "step": 24193 + }, + { + "epoch": 1.747900373868911, + "grad_norm": 7.201424288276948, + "learning_rate": 2.053080259807791e-07, + "loss": 0.5714, + "step": 24194 + }, + { + "epoch": 1.7479726190691207, + "grad_norm": 7.252943559486701, + "learning_rate": 2.0519195759022948e-07, + "loss": 0.5879, + "step": 24195 + }, + { + "epoch": 1.74804486426933, + "grad_norm": 10.321801631896568, + "learning_rate": 2.0507592061381977e-07, + "loss": 0.7136, + "step": 24196 + }, + { + "epoch": 1.7481171094695396, + "grad_norm": 7.600655386924934, + "learning_rate": 2.0495991505313868e-07, + "loss": 0.6169, + "step": 24197 + }, + { + "epoch": 1.7481893546697491, + "grad_norm": 7.763212938479391, + "learning_rate": 2.048439409097744e-07, + "loss": 0.6125, + "step": 24198 + }, + { + "epoch": 1.7482615998699587, + "grad_norm": 7.008773210520077, + "learning_rate": 2.0472799818531508e-07, + "loss": 0.5901, + "step": 24199 + }, + { + "epoch": 1.7483338450701682, + "grad_norm": 8.188485821495213, + "learning_rate": 2.0461208688134615e-07, + "loss": 0.6475, + "step": 24200 + }, + { + "epoch": 1.7484060902703775, + "grad_norm": 7.399693415708814, + "learning_rate": 2.0449620699945604e-07, + "loss": 0.6199, + "step": 24201 + }, + { + "epoch": 1.7484783354705873, + "grad_norm": 6.60044224731248, + "learning_rate": 2.0438035854123072e-07, + "loss": 0.5958, + "step": 24202 + }, + { + "epoch": 1.7485505806707966, + "grad_norm": 8.007201291460872, + "learning_rate": 2.0426454150825615e-07, + "loss": 0.604, + "step": 24203 + }, + { + "epoch": 1.7486228258710061, + "grad_norm": 8.140495665359351, + "learning_rate": 2.0414875590211684e-07, + "loss": 0.5755, + "step": 24204 + }, + { + "epoch": 1.7486950710712157, + "grad_norm": 6.238307850111573, + "learning_rate": 2.0403300172439882e-07, + "loss": 0.5689, + "step": 24205 + }, + { + "epoch": 1.7487673162714252, + "grad_norm": 6.54740492103178, + "learning_rate": 2.0391727897668634e-07, + "loss": 0.5395, + "step": 24206 + }, + { + "epoch": 1.7488395614716348, + "grad_norm": 6.080303047056369, + "learning_rate": 2.038015876605634e-07, + "loss": 0.6005, + "step": 24207 + }, + { + "epoch": 1.748911806671844, + "grad_norm": 7.745192236290821, + "learning_rate": 2.0368592777761377e-07, + "loss": 0.5737, + "step": 24208 + }, + { + "epoch": 1.7489840518720539, + "grad_norm": 6.138906456800882, + "learning_rate": 2.0357029932942084e-07, + "loss": 0.5625, + "step": 24209 + }, + { + "epoch": 1.7490562970722632, + "grad_norm": 7.151716770460962, + "learning_rate": 2.034547023175673e-07, + "loss": 0.5605, + "step": 24210 + }, + { + "epoch": 1.7491285422724727, + "grad_norm": 6.0451960635514, + "learning_rate": 2.0333913674363575e-07, + "loss": 0.6035, + "step": 24211 + }, + { + "epoch": 1.7492007874726823, + "grad_norm": 6.745183285767241, + "learning_rate": 2.0322360260920826e-07, + "loss": 0.5861, + "step": 24212 + }, + { + "epoch": 1.7492730326728918, + "grad_norm": 7.447645688407097, + "learning_rate": 2.0310809991586688e-07, + "loss": 0.6147, + "step": 24213 + }, + { + "epoch": 1.7493452778731013, + "grad_norm": 7.766130212203526, + "learning_rate": 2.029926286651915e-07, + "loss": 0.5978, + "step": 24214 + }, + { + "epoch": 1.7494175230733107, + "grad_norm": 6.304256079376493, + "learning_rate": 2.0287718885876355e-07, + "loss": 0.5351, + "step": 24215 + }, + { + "epoch": 1.7494897682735204, + "grad_norm": 7.687299524189842, + "learning_rate": 2.0276178049816352e-07, + "loss": 0.5822, + "step": 24216 + }, + { + "epoch": 1.7495620134737297, + "grad_norm": 7.245504629859027, + "learning_rate": 2.0264640358497123e-07, + "loss": 0.5805, + "step": 24217 + }, + { + "epoch": 1.7496342586739395, + "grad_norm": 6.00828711094671, + "learning_rate": 2.0253105812076567e-07, + "loss": 0.5754, + "step": 24218 + }, + { + "epoch": 1.7497065038741488, + "grad_norm": 7.101956766764508, + "learning_rate": 2.024157441071259e-07, + "loss": 0.5633, + "step": 24219 + }, + { + "epoch": 1.7497787490743584, + "grad_norm": 6.090493295418932, + "learning_rate": 2.023004615456306e-07, + "loss": 0.5863, + "step": 24220 + }, + { + "epoch": 1.749850994274568, + "grad_norm": 8.23861834006894, + "learning_rate": 2.0218521043785799e-07, + "loss": 0.5865, + "step": 24221 + }, + { + "epoch": 1.7499232394747772, + "grad_norm": 7.602730922936328, + "learning_rate": 2.020699907853857e-07, + "loss": 0.6596, + "step": 24222 + }, + { + "epoch": 1.749995484674987, + "grad_norm": 7.575076404349878, + "learning_rate": 2.0195480258979106e-07, + "loss": 0.6248, + "step": 24223 + }, + { + "epoch": 1.7500677298751963, + "grad_norm": 7.067072105772897, + "learning_rate": 2.0183964585265115e-07, + "loss": 0.6663, + "step": 24224 + }, + { + "epoch": 1.750139975075406, + "grad_norm": 7.565978960331163, + "learning_rate": 2.0172452057554191e-07, + "loss": 0.6453, + "step": 24225 + }, + { + "epoch": 1.7502122202756154, + "grad_norm": 8.451490318200264, + "learning_rate": 2.0160942676003935e-07, + "loss": 0.5912, + "step": 24226 + }, + { + "epoch": 1.750284465475825, + "grad_norm": 8.760317577719428, + "learning_rate": 2.0149436440771914e-07, + "loss": 0.5726, + "step": 24227 + }, + { + "epoch": 1.7503567106760345, + "grad_norm": 6.6515666980474775, + "learning_rate": 2.0137933352015664e-07, + "loss": 0.6274, + "step": 24228 + }, + { + "epoch": 1.7504289558762438, + "grad_norm": 7.990377838405909, + "learning_rate": 2.0126433409892565e-07, + "loss": 0.5615, + "step": 24229 + }, + { + "epoch": 1.7505012010764536, + "grad_norm": 5.468673618328198, + "learning_rate": 2.0114936614560155e-07, + "loss": 0.5739, + "step": 24230 + }, + { + "epoch": 1.750573446276663, + "grad_norm": 5.952701102956756, + "learning_rate": 2.0103442966175806e-07, + "loss": 0.5771, + "step": 24231 + }, + { + "epoch": 1.7506456914768727, + "grad_norm": 7.488048695120648, + "learning_rate": 2.0091952464896787e-07, + "loss": 0.614, + "step": 24232 + }, + { + "epoch": 1.750717936677082, + "grad_norm": 8.57397449467013, + "learning_rate": 2.008046511088041e-07, + "loss": 0.6122, + "step": 24233 + }, + { + "epoch": 1.7507901818772915, + "grad_norm": 7.66394254413642, + "learning_rate": 2.006898090428394e-07, + "loss": 0.5729, + "step": 24234 + }, + { + "epoch": 1.750862427077501, + "grad_norm": 6.828126117348306, + "learning_rate": 2.0057499845264644e-07, + "loss": 0.6809, + "step": 24235 + }, + { + "epoch": 1.7509346722777106, + "grad_norm": 6.3231660981184525, + "learning_rate": 2.0046021933979614e-07, + "loss": 0.5767, + "step": 24236 + }, + { + "epoch": 1.7510069174779201, + "grad_norm": 7.4851693068361485, + "learning_rate": 2.0034547170585977e-07, + "loss": 0.5097, + "step": 24237 + }, + { + "epoch": 1.7510791626781295, + "grad_norm": 6.72063501218754, + "learning_rate": 2.0023075555240829e-07, + "loss": 0.4878, + "step": 24238 + }, + { + "epoch": 1.7511514078783392, + "grad_norm": 7.2527613149678745, + "learning_rate": 2.001160708810121e-07, + "loss": 0.6124, + "step": 24239 + }, + { + "epoch": 1.7512236530785485, + "grad_norm": 6.4628386831170515, + "learning_rate": 2.0000141769324106e-07, + "loss": 0.6099, + "step": 24240 + }, + { + "epoch": 1.751295898278758, + "grad_norm": 5.744845858399299, + "learning_rate": 1.99886795990665e-07, + "loss": 0.6332, + "step": 24241 + }, + { + "epoch": 1.7513681434789676, + "grad_norm": 7.995388132184537, + "learning_rate": 1.99772205774853e-07, + "loss": 0.6384, + "step": 24242 + }, + { + "epoch": 1.7514403886791772, + "grad_norm": 7.647422466003507, + "learning_rate": 1.9965764704737322e-07, + "loss": 0.6133, + "step": 24243 + }, + { + "epoch": 1.7515126338793867, + "grad_norm": 8.028159173912766, + "learning_rate": 1.9954311980979413e-07, + "loss": 0.6186, + "step": 24244 + }, + { + "epoch": 1.751584879079596, + "grad_norm": 7.346506453680866, + "learning_rate": 1.994286240636839e-07, + "loss": 0.6112, + "step": 24245 + }, + { + "epoch": 1.7516571242798058, + "grad_norm": 7.363062362048288, + "learning_rate": 1.9931415981060937e-07, + "loss": 0.5846, + "step": 24246 + }, + { + "epoch": 1.7517293694800151, + "grad_norm": 6.683272977764151, + "learning_rate": 1.991997270521373e-07, + "loss": 0.6126, + "step": 24247 + }, + { + "epoch": 1.7518016146802247, + "grad_norm": 7.951931067522953, + "learning_rate": 1.9908532578983453e-07, + "loss": 0.6571, + "step": 24248 + }, + { + "epoch": 1.7518738598804342, + "grad_norm": 6.412994853477013, + "learning_rate": 1.9897095602526727e-07, + "loss": 0.6205, + "step": 24249 + }, + { + "epoch": 1.7519461050806437, + "grad_norm": 9.370969592627917, + "learning_rate": 1.9885661776000098e-07, + "loss": 0.6117, + "step": 24250 + }, + { + "epoch": 1.7520183502808533, + "grad_norm": 7.82487593969896, + "learning_rate": 1.9874231099560076e-07, + "loss": 0.6285, + "step": 24251 + }, + { + "epoch": 1.7520905954810626, + "grad_norm": 8.045236957822905, + "learning_rate": 1.9862803573363148e-07, + "loss": 0.6198, + "step": 24252 + }, + { + "epoch": 1.7521628406812724, + "grad_norm": 6.935598576792648, + "learning_rate": 1.98513791975658e-07, + "loss": 0.5977, + "step": 24253 + }, + { + "epoch": 1.7522350858814817, + "grad_norm": 7.2378920374816476, + "learning_rate": 1.9839957972324325e-07, + "loss": 0.6184, + "step": 24254 + }, + { + "epoch": 1.7523073310816912, + "grad_norm": 6.9931978145786875, + "learning_rate": 1.9828539897795123e-07, + "loss": 0.627, + "step": 24255 + }, + { + "epoch": 1.7523795762819008, + "grad_norm": 6.718454469790391, + "learning_rate": 1.9817124974134516e-07, + "loss": 0.5909, + "step": 24256 + }, + { + "epoch": 1.7524518214821103, + "grad_norm": 6.164258366362931, + "learning_rate": 1.9805713201498683e-07, + "loss": 0.5352, + "step": 24257 + }, + { + "epoch": 1.7525240666823199, + "grad_norm": 7.27730050435017, + "learning_rate": 1.9794304580043943e-07, + "loss": 0.5909, + "step": 24258 + }, + { + "epoch": 1.7525963118825292, + "grad_norm": 8.283861432296668, + "learning_rate": 1.9782899109926423e-07, + "loss": 0.6143, + "step": 24259 + }, + { + "epoch": 1.752668557082739, + "grad_norm": 6.729628729137384, + "learning_rate": 1.9771496791302301e-07, + "loss": 0.6267, + "step": 24260 + }, + { + "epoch": 1.7527408022829483, + "grad_norm": 7.117404034164906, + "learning_rate": 1.9760097624327595e-07, + "loss": 0.5889, + "step": 24261 + }, + { + "epoch": 1.7528130474831578, + "grad_norm": 8.000825362544392, + "learning_rate": 1.9748701609158372e-07, + "loss": 0.5686, + "step": 24262 + }, + { + "epoch": 1.7528852926833673, + "grad_norm": 7.360361604100388, + "learning_rate": 1.9737308745950672e-07, + "loss": 0.5762, + "step": 24263 + }, + { + "epoch": 1.7529575378835769, + "grad_norm": 6.449286180371292, + "learning_rate": 1.9725919034860401e-07, + "loss": 0.5843, + "step": 24264 + }, + { + "epoch": 1.7530297830837864, + "grad_norm": 8.025589547531647, + "learning_rate": 1.9714532476043518e-07, + "loss": 0.5435, + "step": 24265 + }, + { + "epoch": 1.7531020282839958, + "grad_norm": 7.531673182077862, + "learning_rate": 1.970314906965587e-07, + "loss": 0.5837, + "step": 24266 + }, + { + "epoch": 1.7531742734842055, + "grad_norm": 7.819119266172682, + "learning_rate": 1.96917688158533e-07, + "loss": 0.5605, + "step": 24267 + }, + { + "epoch": 1.7532465186844148, + "grad_norm": 7.59595623855078, + "learning_rate": 1.968039171479158e-07, + "loss": 0.5913, + "step": 24268 + }, + { + "epoch": 1.7533187638846244, + "grad_norm": 7.712985090583824, + "learning_rate": 1.9669017766626468e-07, + "loss": 0.5834, + "step": 24269 + }, + { + "epoch": 1.753391009084834, + "grad_norm": 7.071584614494896, + "learning_rate": 1.9657646971513706e-07, + "loss": 0.571, + "step": 24270 + }, + { + "epoch": 1.7534632542850435, + "grad_norm": 6.831957267591855, + "learning_rate": 1.9646279329608886e-07, + "loss": 0.616, + "step": 24271 + }, + { + "epoch": 1.753535499485253, + "grad_norm": 6.996539214077278, + "learning_rate": 1.963491484106761e-07, + "loss": 0.5444, + "step": 24272 + }, + { + "epoch": 1.7536077446854623, + "grad_norm": 7.517080059088508, + "learning_rate": 1.96235535060455e-07, + "loss": 0.6566, + "step": 24273 + }, + { + "epoch": 1.753679989885672, + "grad_norm": 8.135047217140025, + "learning_rate": 1.9612195324698102e-07, + "loss": 0.6021, + "step": 24274 + }, + { + "epoch": 1.7537522350858814, + "grad_norm": 6.5913865613038585, + "learning_rate": 1.9600840297180846e-07, + "loss": 0.5568, + "step": 24275 + }, + { + "epoch": 1.753824480286091, + "grad_norm": 7.365001819452396, + "learning_rate": 1.9589488423649162e-07, + "loss": 0.5943, + "step": 24276 + }, + { + "epoch": 1.7538967254863005, + "grad_norm": 9.433437660458143, + "learning_rate": 1.9578139704258454e-07, + "loss": 0.644, + "step": 24277 + }, + { + "epoch": 1.75396897068651, + "grad_norm": 6.777841581589743, + "learning_rate": 1.956679413916418e-07, + "loss": 0.5365, + "step": 24278 + }, + { + "epoch": 1.7540412158867196, + "grad_norm": 8.936636076033798, + "learning_rate": 1.9555451728521553e-07, + "loss": 0.6076, + "step": 24279 + }, + { + "epoch": 1.754113461086929, + "grad_norm": 7.316615584882355, + "learning_rate": 1.9544112472485833e-07, + "loss": 0.6527, + "step": 24280 + }, + { + "epoch": 1.7541857062871387, + "grad_norm": 7.876789434667891, + "learning_rate": 1.9532776371212342e-07, + "loss": 0.6186, + "step": 24281 + }, + { + "epoch": 1.754257951487348, + "grad_norm": 6.782464687706839, + "learning_rate": 1.952144342485615e-07, + "loss": 0.6457, + "step": 24282 + }, + { + "epoch": 1.7543301966875575, + "grad_norm": 6.798139081896357, + "learning_rate": 1.951011363357244e-07, + "loss": 0.6207, + "step": 24283 + }, + { + "epoch": 1.754402441887767, + "grad_norm": 8.232231657425794, + "learning_rate": 1.9498786997516312e-07, + "loss": 0.6712, + "step": 24284 + }, + { + "epoch": 1.7544746870879766, + "grad_norm": 6.749136127717309, + "learning_rate": 1.9487463516842803e-07, + "loss": 0.5946, + "step": 24285 + }, + { + "epoch": 1.7545469322881861, + "grad_norm": 8.302006343419155, + "learning_rate": 1.9476143191706932e-07, + "loss": 0.5501, + "step": 24286 + }, + { + "epoch": 1.7546191774883955, + "grad_norm": 6.947827324776582, + "learning_rate": 1.9464826022263684e-07, + "loss": 0.6331, + "step": 24287 + }, + { + "epoch": 1.7546914226886052, + "grad_norm": 6.38626509522233, + "learning_rate": 1.945351200866802e-07, + "loss": 0.6389, + "step": 24288 + }, + { + "epoch": 1.7547636678888145, + "grad_norm": 6.931198970238776, + "learning_rate": 1.944220115107473e-07, + "loss": 0.5415, + "step": 24289 + }, + { + "epoch": 1.7548359130890243, + "grad_norm": 7.038708609132597, + "learning_rate": 1.9430893449638666e-07, + "loss": 0.6084, + "step": 24290 + }, + { + "epoch": 1.7549081582892336, + "grad_norm": 6.544494341732205, + "learning_rate": 1.9419588904514675e-07, + "loss": 0.5291, + "step": 24291 + }, + { + "epoch": 1.7549804034894432, + "grad_norm": 6.340970337075693, + "learning_rate": 1.9408287515857495e-07, + "loss": 0.5537, + "step": 24292 + }, + { + "epoch": 1.7550526486896527, + "grad_norm": 6.992788141974212, + "learning_rate": 1.9396989283821776e-07, + "loss": 0.6015, + "step": 24293 + }, + { + "epoch": 1.755124893889862, + "grad_norm": 8.343846924179555, + "learning_rate": 1.9385694208562234e-07, + "loss": 0.6262, + "step": 24294 + }, + { + "epoch": 1.7551971390900718, + "grad_norm": 6.846343084833969, + "learning_rate": 1.9374402290233463e-07, + "loss": 0.5897, + "step": 24295 + }, + { + "epoch": 1.7552693842902811, + "grad_norm": 6.423406784886124, + "learning_rate": 1.9363113528990063e-07, + "loss": 0.5699, + "step": 24296 + }, + { + "epoch": 1.7553416294904909, + "grad_norm": 6.526213679947807, + "learning_rate": 1.935182792498655e-07, + "loss": 0.6387, + "step": 24297 + }, + { + "epoch": 1.7554138746907002, + "grad_norm": 6.240106610585641, + "learning_rate": 1.9340545478377437e-07, + "loss": 0.5534, + "step": 24298 + }, + { + "epoch": 1.7554861198909097, + "grad_norm": 7.59326019591269, + "learning_rate": 1.9329266189317215e-07, + "loss": 0.5531, + "step": 24299 + }, + { + "epoch": 1.7555583650911193, + "grad_norm": 7.449590776712403, + "learning_rate": 1.9317990057960174e-07, + "loss": 0.6579, + "step": 24300 + }, + { + "epoch": 1.7556306102913286, + "grad_norm": 7.668370955845516, + "learning_rate": 1.9306717084460747e-07, + "loss": 0.7066, + "step": 24301 + }, + { + "epoch": 1.7557028554915384, + "grad_norm": 8.815055747112835, + "learning_rate": 1.9295447268973283e-07, + "loss": 0.6472, + "step": 24302 + }, + { + "epoch": 1.7557751006917477, + "grad_norm": 6.402304723130746, + "learning_rate": 1.9284180611651964e-07, + "loss": 0.546, + "step": 24303 + }, + { + "epoch": 1.7558473458919575, + "grad_norm": 7.189746804056824, + "learning_rate": 1.9272917112651085e-07, + "loss": 0.6186, + "step": 24304 + }, + { + "epoch": 1.7559195910921668, + "grad_norm": 7.349717592795498, + "learning_rate": 1.9261656772124826e-07, + "loss": 0.5557, + "step": 24305 + }, + { + "epoch": 1.7559918362923763, + "grad_norm": 6.983487680894768, + "learning_rate": 1.9250399590227315e-07, + "loss": 0.6568, + "step": 24306 + }, + { + "epoch": 1.7560640814925859, + "grad_norm": 8.13413933475971, + "learning_rate": 1.9239145567112676e-07, + "loss": 0.5796, + "step": 24307 + }, + { + "epoch": 1.7561363266927954, + "grad_norm": 7.629646011480352, + "learning_rate": 1.9227894702934958e-07, + "loss": 0.6333, + "step": 24308 + }, + { + "epoch": 1.756208571893005, + "grad_norm": 7.839453261248691, + "learning_rate": 1.921664699784817e-07, + "loss": 0.5738, + "step": 24309 + }, + { + "epoch": 1.7562808170932143, + "grad_norm": 7.47067376423969, + "learning_rate": 1.9205402452006334e-07, + "loss": 0.6041, + "step": 24310 + }, + { + "epoch": 1.756353062293424, + "grad_norm": 7.19245143010802, + "learning_rate": 1.9194161065563323e-07, + "loss": 0.529, + "step": 24311 + }, + { + "epoch": 1.7564253074936333, + "grad_norm": 7.157701370119362, + "learning_rate": 1.9182922838673012e-07, + "loss": 0.5733, + "step": 24312 + }, + { + "epoch": 1.756497552693843, + "grad_norm": 6.595816482776252, + "learning_rate": 1.9171687771489284e-07, + "loss": 0.6201, + "step": 24313 + }, + { + "epoch": 1.7565697978940524, + "grad_norm": 6.3471934927474045, + "learning_rate": 1.91604558641659e-07, + "loss": 0.6128, + "step": 24314 + }, + { + "epoch": 1.756642043094262, + "grad_norm": 7.914200596108204, + "learning_rate": 1.9149227116856655e-07, + "loss": 0.6011, + "step": 24315 + }, + { + "epoch": 1.7567142882944715, + "grad_norm": 8.498360419620795, + "learning_rate": 1.9138001529715262e-07, + "loss": 0.6097, + "step": 24316 + }, + { + "epoch": 1.7567865334946808, + "grad_norm": 7.6652179123173365, + "learning_rate": 1.91267791028954e-07, + "loss": 0.6874, + "step": 24317 + }, + { + "epoch": 1.7568587786948906, + "grad_norm": 8.103898097068829, + "learning_rate": 1.9115559836550612e-07, + "loss": 0.5738, + "step": 24318 + }, + { + "epoch": 1.7569310238951, + "grad_norm": 7.38649849081647, + "learning_rate": 1.9104343730834585e-07, + "loss": 0.5954, + "step": 24319 + }, + { + "epoch": 1.7570032690953095, + "grad_norm": 6.503576615145123, + "learning_rate": 1.9093130785900833e-07, + "loss": 0.6431, + "step": 24320 + }, + { + "epoch": 1.757075514295519, + "grad_norm": 7.532204975031459, + "learning_rate": 1.908192100190276e-07, + "loss": 0.5262, + "step": 24321 + }, + { + "epoch": 1.7571477594957285, + "grad_norm": 6.609816516170581, + "learning_rate": 1.9070714378993938e-07, + "loss": 0.6693, + "step": 24322 + }, + { + "epoch": 1.757220004695938, + "grad_norm": 9.237738240516071, + "learning_rate": 1.9059510917327718e-07, + "loss": 0.6914, + "step": 24323 + }, + { + "epoch": 1.7572922498961474, + "grad_norm": 8.806344554168788, + "learning_rate": 1.9048310617057474e-07, + "loss": 0.6974, + "step": 24324 + }, + { + "epoch": 1.7573644950963572, + "grad_norm": 7.69299740120147, + "learning_rate": 1.9037113478336533e-07, + "loss": 0.5648, + "step": 24325 + }, + { + "epoch": 1.7574367402965665, + "grad_norm": 6.601248721278102, + "learning_rate": 1.9025919501318185e-07, + "loss": 0.561, + "step": 24326 + }, + { + "epoch": 1.757508985496776, + "grad_norm": 7.7016508537327715, + "learning_rate": 1.90147286861557e-07, + "loss": 0.6315, + "step": 24327 + }, + { + "epoch": 1.7575812306969856, + "grad_norm": 6.744976010742962, + "learning_rate": 1.9003541033002172e-07, + "loss": 0.5586, + "step": 24328 + }, + { + "epoch": 1.7576534758971951, + "grad_norm": 6.634062991754333, + "learning_rate": 1.8992356542010816e-07, + "loss": 0.5808, + "step": 24329 + }, + { + "epoch": 1.7577257210974047, + "grad_norm": 8.560331863672747, + "learning_rate": 1.8981175213334758e-07, + "loss": 0.6426, + "step": 24330 + }, + { + "epoch": 1.757797966297614, + "grad_norm": 7.536234239165364, + "learning_rate": 1.8969997047127043e-07, + "loss": 0.5825, + "step": 24331 + }, + { + "epoch": 1.7578702114978237, + "grad_norm": 6.727007666199978, + "learning_rate": 1.895882204354066e-07, + "loss": 0.5973, + "step": 24332 + }, + { + "epoch": 1.757942456698033, + "grad_norm": 8.815171290057096, + "learning_rate": 1.8947650202728623e-07, + "loss": 0.5912, + "step": 24333 + }, + { + "epoch": 1.7580147018982426, + "grad_norm": 7.224747185425283, + "learning_rate": 1.8936481524843814e-07, + "loss": 0.6401, + "step": 24334 + }, + { + "epoch": 1.7580869470984521, + "grad_norm": 6.861638349626454, + "learning_rate": 1.8925316010039218e-07, + "loss": 0.5646, + "step": 24335 + }, + { + "epoch": 1.7581591922986617, + "grad_norm": 7.466539127532163, + "learning_rate": 1.8914153658467606e-07, + "loss": 0.6089, + "step": 24336 + }, + { + "epoch": 1.7582314374988712, + "grad_norm": 8.073094703597977, + "learning_rate": 1.8902994470281794e-07, + "loss": 0.6025, + "step": 24337 + }, + { + "epoch": 1.7583036826990806, + "grad_norm": 7.8832734259600095, + "learning_rate": 1.8891838445634608e-07, + "loss": 0.5619, + "step": 24338 + }, + { + "epoch": 1.7583759278992903, + "grad_norm": 8.587151230749672, + "learning_rate": 1.8880685584678676e-07, + "loss": 0.7014, + "step": 24339 + }, + { + "epoch": 1.7584481730994996, + "grad_norm": 6.491732401223251, + "learning_rate": 1.886953588756668e-07, + "loss": 0.5781, + "step": 24340 + }, + { + "epoch": 1.7585204182997092, + "grad_norm": 7.221406258029377, + "learning_rate": 1.8858389354451306e-07, + "loss": 0.5637, + "step": 24341 + }, + { + "epoch": 1.7585926634999187, + "grad_norm": 8.307574426189614, + "learning_rate": 1.8847245985485068e-07, + "loss": 0.5855, + "step": 24342 + }, + { + "epoch": 1.7586649087001283, + "grad_norm": 7.396057068212089, + "learning_rate": 1.8836105780820596e-07, + "loss": 0.5368, + "step": 24343 + }, + { + "epoch": 1.7587371539003378, + "grad_norm": 7.05573509101359, + "learning_rate": 1.882496874061032e-07, + "loss": 0.5779, + "step": 24344 + }, + { + "epoch": 1.7588093991005471, + "grad_norm": 7.944556996482644, + "learning_rate": 1.881383486500679e-07, + "loss": 0.5716, + "step": 24345 + }, + { + "epoch": 1.7588816443007569, + "grad_norm": 6.795063908762737, + "learning_rate": 1.88027041541623e-07, + "loss": 0.6712, + "step": 24346 + }, + { + "epoch": 1.7589538895009662, + "grad_norm": 6.530304324666187, + "learning_rate": 1.879157660822928e-07, + "loss": 0.5822, + "step": 24347 + }, + { + "epoch": 1.7590261347011757, + "grad_norm": 6.734382930317256, + "learning_rate": 1.878045222736008e-07, + "loss": 0.5889, + "step": 24348 + }, + { + "epoch": 1.7590983799013853, + "grad_norm": 6.747060912955712, + "learning_rate": 1.876933101170697e-07, + "loss": 0.5902, + "step": 24349 + }, + { + "epoch": 1.7591706251015948, + "grad_norm": 7.639776701534961, + "learning_rate": 1.8758212961422135e-07, + "loss": 0.6595, + "step": 24350 + }, + { + "epoch": 1.7592428703018044, + "grad_norm": 7.6108439631422495, + "learning_rate": 1.8747098076657837e-07, + "loss": 0.5534, + "step": 24351 + }, + { + "epoch": 1.7593151155020137, + "grad_norm": 8.53016605407868, + "learning_rate": 1.8735986357566184e-07, + "loss": 0.6059, + "step": 24352 + }, + { + "epoch": 1.7593873607022235, + "grad_norm": 8.852254168833433, + "learning_rate": 1.8724877804299325e-07, + "loss": 0.6279, + "step": 24353 + }, + { + "epoch": 1.7594596059024328, + "grad_norm": 6.765170533595795, + "learning_rate": 1.8713772417009334e-07, + "loss": 0.6289, + "step": 24354 + }, + { + "epoch": 1.7595318511026423, + "grad_norm": 7.293110237731037, + "learning_rate": 1.8702670195848205e-07, + "loss": 0.6175, + "step": 24355 + }, + { + "epoch": 1.7596040963028519, + "grad_norm": 7.900277396052009, + "learning_rate": 1.8691571140967952e-07, + "loss": 0.7258, + "step": 24356 + }, + { + "epoch": 1.7596763415030614, + "grad_norm": 7.404331393351351, + "learning_rate": 1.868047525252048e-07, + "loss": 0.6417, + "step": 24357 + }, + { + "epoch": 1.759748586703271, + "grad_norm": 8.166039682830503, + "learning_rate": 1.866938253065767e-07, + "loss": 0.5984, + "step": 24358 + }, + { + "epoch": 1.7598208319034803, + "grad_norm": 7.088394899738493, + "learning_rate": 1.86582929755314e-07, + "loss": 0.622, + "step": 24359 + }, + { + "epoch": 1.75989307710369, + "grad_norm": 5.871122338586417, + "learning_rate": 1.8647206587293522e-07, + "loss": 0.6005, + "step": 24360 + }, + { + "epoch": 1.7599653223038993, + "grad_norm": 8.052937359117061, + "learning_rate": 1.8636123366095715e-07, + "loss": 0.6072, + "step": 24361 + }, + { + "epoch": 1.760037567504109, + "grad_norm": 6.3893617906483335, + "learning_rate": 1.8625043312089696e-07, + "loss": 0.5473, + "step": 24362 + }, + { + "epoch": 1.7601098127043184, + "grad_norm": 6.383411617479624, + "learning_rate": 1.861396642542726e-07, + "loss": 0.6254, + "step": 24363 + }, + { + "epoch": 1.760182057904528, + "grad_norm": 7.272621379428165, + "learning_rate": 1.8602892706259923e-07, + "loss": 0.553, + "step": 24364 + }, + { + "epoch": 1.7602543031047375, + "grad_norm": 8.66328051024661, + "learning_rate": 1.8591822154739313e-07, + "loss": 0.6579, + "step": 24365 + }, + { + "epoch": 1.7603265483049468, + "grad_norm": 8.488142381946206, + "learning_rate": 1.8580754771016978e-07, + "loss": 0.6685, + "step": 24366 + }, + { + "epoch": 1.7603987935051566, + "grad_norm": 8.92583082590171, + "learning_rate": 1.8569690555244492e-07, + "loss": 0.6565, + "step": 24367 + }, + { + "epoch": 1.760471038705366, + "grad_norm": 6.929753282787399, + "learning_rate": 1.8558629507573172e-07, + "loss": 0.5935, + "step": 24368 + }, + { + "epoch": 1.7605432839055757, + "grad_norm": 7.7645427479325955, + "learning_rate": 1.8547571628154514e-07, + "loss": 0.5558, + "step": 24369 + }, + { + "epoch": 1.760615529105785, + "grad_norm": 7.454245045311875, + "learning_rate": 1.8536516917139923e-07, + "loss": 0.6132, + "step": 24370 + }, + { + "epoch": 1.7606877743059945, + "grad_norm": 6.926406926757846, + "learning_rate": 1.8525465374680667e-07, + "loss": 0.5872, + "step": 24371 + }, + { + "epoch": 1.760760019506204, + "grad_norm": 7.778474488918496, + "learning_rate": 1.851441700092807e-07, + "loss": 0.6242, + "step": 24372 + }, + { + "epoch": 1.7608322647064134, + "grad_norm": 7.325039797730117, + "learning_rate": 1.850337179603337e-07, + "loss": 0.5722, + "step": 24373 + }, + { + "epoch": 1.7609045099066232, + "grad_norm": 7.238729991065071, + "learning_rate": 1.8492329760147782e-07, + "loss": 0.5823, + "step": 24374 + }, + { + "epoch": 1.7609767551068325, + "grad_norm": 6.979203257298748, + "learning_rate": 1.8481290893422433e-07, + "loss": 0.6154, + "step": 24375 + }, + { + "epoch": 1.7610490003070423, + "grad_norm": 7.768442176373346, + "learning_rate": 1.8470255196008452e-07, + "loss": 0.63, + "step": 24376 + }, + { + "epoch": 1.7611212455072516, + "grad_norm": 7.3618465735944625, + "learning_rate": 1.8459222668056915e-07, + "loss": 0.5395, + "step": 24377 + }, + { + "epoch": 1.7611934907074611, + "grad_norm": 7.202918192445375, + "learning_rate": 1.8448193309718837e-07, + "loss": 0.5659, + "step": 24378 + }, + { + "epoch": 1.7612657359076707, + "grad_norm": 6.7087785403286215, + "learning_rate": 1.8437167121145183e-07, + "loss": 0.6065, + "step": 24379 + }, + { + "epoch": 1.76133798110788, + "grad_norm": 7.646270850307342, + "learning_rate": 1.8426144102486915e-07, + "loss": 0.6215, + "step": 24380 + }, + { + "epoch": 1.7614102263080897, + "grad_norm": 6.314855457140731, + "learning_rate": 1.841512425389494e-07, + "loss": 0.544, + "step": 24381 + }, + { + "epoch": 1.761482471508299, + "grad_norm": 6.474050810182313, + "learning_rate": 1.840410757552008e-07, + "loss": 0.5489, + "step": 24382 + }, + { + "epoch": 1.7615547167085088, + "grad_norm": 7.457879874937911, + "learning_rate": 1.839309406751319e-07, + "loss": 0.6008, + "step": 24383 + }, + { + "epoch": 1.7616269619087181, + "grad_norm": 7.180850460933242, + "learning_rate": 1.838208373002498e-07, + "loss": 0.5526, + "step": 24384 + }, + { + "epoch": 1.7616992071089277, + "grad_norm": 6.2944064543294855, + "learning_rate": 1.8371076563206275e-07, + "loss": 0.5175, + "step": 24385 + }, + { + "epoch": 1.7617714523091372, + "grad_norm": 6.596127049968502, + "learning_rate": 1.836007256720762e-07, + "loss": 0.5847, + "step": 24386 + }, + { + "epoch": 1.7618436975093468, + "grad_norm": 8.229852292113145, + "learning_rate": 1.834907174217973e-07, + "loss": 0.5694, + "step": 24387 + }, + { + "epoch": 1.7619159427095563, + "grad_norm": 6.309713825199064, + "learning_rate": 1.8338074088273205e-07, + "loss": 0.583, + "step": 24388 + }, + { + "epoch": 1.7619881879097656, + "grad_norm": 7.953595999989859, + "learning_rate": 1.8327079605638563e-07, + "loss": 0.6426, + "step": 24389 + }, + { + "epoch": 1.7620604331099754, + "grad_norm": 7.470167209127532, + "learning_rate": 1.8316088294426243e-07, + "loss": 0.6097, + "step": 24390 + }, + { + "epoch": 1.7621326783101847, + "grad_norm": 6.806604756899756, + "learning_rate": 1.8305100154786842e-07, + "loss": 0.638, + "step": 24391 + }, + { + "epoch": 1.7622049235103943, + "grad_norm": 8.27240600495647, + "learning_rate": 1.829411518687077e-07, + "loss": 0.6068, + "step": 24392 + }, + { + "epoch": 1.7622771687106038, + "grad_norm": 7.381187042045351, + "learning_rate": 1.8283133390828295e-07, + "loss": 0.6771, + "step": 24393 + }, + { + "epoch": 1.7623494139108133, + "grad_norm": 8.03082488007269, + "learning_rate": 1.8272154766809825e-07, + "loss": 0.6262, + "step": 24394 + }, + { + "epoch": 1.7624216591110229, + "grad_norm": 7.457588059112934, + "learning_rate": 1.8261179314965655e-07, + "loss": 0.5077, + "step": 24395 + }, + { + "epoch": 1.7624939043112322, + "grad_norm": 6.611060682829253, + "learning_rate": 1.8250207035445972e-07, + "loss": 0.5721, + "step": 24396 + }, + { + "epoch": 1.762566149511442, + "grad_norm": 7.570113456006879, + "learning_rate": 1.8239237928401016e-07, + "loss": 0.5844, + "step": 24397 + }, + { + "epoch": 1.7626383947116513, + "grad_norm": 6.854071093726803, + "learning_rate": 1.8228271993980916e-07, + "loss": 0.6249, + "step": 24398 + }, + { + "epoch": 1.7627106399118608, + "grad_norm": 7.199168411550816, + "learning_rate": 1.8217309232335834e-07, + "loss": 0.5602, + "step": 24399 + }, + { + "epoch": 1.7627828851120704, + "grad_norm": 7.303313133157257, + "learning_rate": 1.820634964361584e-07, + "loss": 0.6207, + "step": 24400 + }, + { + "epoch": 1.76285513031228, + "grad_norm": 6.282652740989843, + "learning_rate": 1.8195393227970927e-07, + "loss": 0.595, + "step": 24401 + }, + { + "epoch": 1.7629273755124895, + "grad_norm": 6.166639826260828, + "learning_rate": 1.818443998555114e-07, + "loss": 0.5772, + "step": 24402 + }, + { + "epoch": 1.7629996207126988, + "grad_norm": 6.557955957951803, + "learning_rate": 1.817348991650633e-07, + "loss": 0.5953, + "step": 24403 + }, + { + "epoch": 1.7630718659129085, + "grad_norm": 7.081408549568259, + "learning_rate": 1.816254302098644e-07, + "loss": 0.6441, + "step": 24404 + }, + { + "epoch": 1.7631441111131179, + "grad_norm": 7.3144027973068315, + "learning_rate": 1.8151599299141315e-07, + "loss": 0.5653, + "step": 24405 + }, + { + "epoch": 1.7632163563133274, + "grad_norm": 6.743008984571771, + "learning_rate": 1.8140658751120838e-07, + "loss": 0.6074, + "step": 24406 + }, + { + "epoch": 1.763288601513537, + "grad_norm": 7.476545662696559, + "learning_rate": 1.8129721377074666e-07, + "loss": 0.6403, + "step": 24407 + }, + { + "epoch": 1.7633608467137465, + "grad_norm": 7.345815295108741, + "learning_rate": 1.8118787177152568e-07, + "loss": 0.5993, + "step": 24408 + }, + { + "epoch": 1.763433091913956, + "grad_norm": 8.165618777354833, + "learning_rate": 1.8107856151504262e-07, + "loss": 0.5896, + "step": 24409 + }, + { + "epoch": 1.7635053371141653, + "grad_norm": 7.17392623256833, + "learning_rate": 1.8096928300279315e-07, + "loss": 0.6423, + "step": 24410 + }, + { + "epoch": 1.7635775823143751, + "grad_norm": 7.167113208905492, + "learning_rate": 1.8086003623627364e-07, + "loss": 0.5774, + "step": 24411 + }, + { + "epoch": 1.7636498275145844, + "grad_norm": 6.80608464820453, + "learning_rate": 1.8075082121697952e-07, + "loss": 0.5503, + "step": 24412 + }, + { + "epoch": 1.763722072714794, + "grad_norm": 8.818482444808433, + "learning_rate": 1.8064163794640655e-07, + "loss": 0.59, + "step": 24413 + }, + { + "epoch": 1.7637943179150035, + "grad_norm": 9.113646250191676, + "learning_rate": 1.8053248642604797e-07, + "loss": 0.5409, + "step": 24414 + }, + { + "epoch": 1.763866563115213, + "grad_norm": 7.192092357191003, + "learning_rate": 1.80423366657399e-07, + "loss": 0.6316, + "step": 24415 + }, + { + "epoch": 1.7639388083154226, + "grad_norm": 7.69908918468163, + "learning_rate": 1.803142786419529e-07, + "loss": 0.6624, + "step": 24416 + }, + { + "epoch": 1.764011053515632, + "grad_norm": 7.294619356595462, + "learning_rate": 1.8020522238120369e-07, + "loss": 0.6457, + "step": 24417 + }, + { + "epoch": 1.7640832987158417, + "grad_norm": 7.544277968494929, + "learning_rate": 1.800961978766433e-07, + "loss": 0.6134, + "step": 24418 + }, + { + "epoch": 1.764155543916051, + "grad_norm": 7.0007506376792765, + "learning_rate": 1.7998720512976408e-07, + "loss": 0.5812, + "step": 24419 + }, + { + "epoch": 1.7642277891162605, + "grad_norm": 9.95317252490616, + "learning_rate": 1.798782441420599e-07, + "loss": 0.5439, + "step": 24420 + }, + { + "epoch": 1.76430003431647, + "grad_norm": 7.216059195637284, + "learning_rate": 1.7976931491502037e-07, + "loss": 0.5984, + "step": 24421 + }, + { + "epoch": 1.7643722795166796, + "grad_norm": 7.955608467314431, + "learning_rate": 1.7966041745013762e-07, + "loss": 0.6191, + "step": 24422 + }, + { + "epoch": 1.7644445247168892, + "grad_norm": 7.508336837829833, + "learning_rate": 1.7955155174890188e-07, + "loss": 0.6001, + "step": 24423 + }, + { + "epoch": 1.7645167699170985, + "grad_norm": 6.766743838366594, + "learning_rate": 1.7944271781280414e-07, + "loss": 0.5843, + "step": 24424 + }, + { + "epoch": 1.7645890151173083, + "grad_norm": 7.192642097435189, + "learning_rate": 1.7933391564333353e-07, + "loss": 0.5862, + "step": 24425 + }, + { + "epoch": 1.7646612603175176, + "grad_norm": 7.121980261432791, + "learning_rate": 1.7922514524197965e-07, + "loss": 0.6707, + "step": 24426 + }, + { + "epoch": 1.7647335055177271, + "grad_norm": 7.173957605437139, + "learning_rate": 1.7911640661023162e-07, + "loss": 0.6352, + "step": 24427 + }, + { + "epoch": 1.7648057507179367, + "grad_norm": 7.7609043318007265, + "learning_rate": 1.7900769974957765e-07, + "loss": 0.5678, + "step": 24428 + }, + { + "epoch": 1.7648779959181462, + "grad_norm": 7.468812056407334, + "learning_rate": 1.788990246615066e-07, + "loss": 0.6093, + "step": 24429 + }, + { + "epoch": 1.7649502411183557, + "grad_norm": 7.022159379498177, + "learning_rate": 1.787903813475053e-07, + "loss": 0.6176, + "step": 24430 + }, + { + "epoch": 1.765022486318565, + "grad_norm": 6.4344679719674875, + "learning_rate": 1.7868176980906204e-07, + "loss": 0.5946, + "step": 24431 + }, + { + "epoch": 1.7650947315187748, + "grad_norm": 7.670340390367403, + "learning_rate": 1.7857319004766254e-07, + "loss": 0.577, + "step": 24432 + }, + { + "epoch": 1.7651669767189841, + "grad_norm": 6.414912326656648, + "learning_rate": 1.7846464206479342e-07, + "loss": 0.5675, + "step": 24433 + }, + { + "epoch": 1.7652392219191937, + "grad_norm": 7.910211226660804, + "learning_rate": 1.7835612586194122e-07, + "loss": 0.6642, + "step": 24434 + }, + { + "epoch": 1.7653114671194032, + "grad_norm": 7.436065278996419, + "learning_rate": 1.7824764144059064e-07, + "loss": 0.609, + "step": 24435 + }, + { + "epoch": 1.7653837123196128, + "grad_norm": 7.101059966147467, + "learning_rate": 1.7813918880222713e-07, + "loss": 0.6975, + "step": 24436 + }, + { + "epoch": 1.7654559575198223, + "grad_norm": 7.574476610489437, + "learning_rate": 1.7803076794833535e-07, + "loss": 0.6234, + "step": 24437 + }, + { + "epoch": 1.7655282027200316, + "grad_norm": 7.350052876184963, + "learning_rate": 1.7792237888039966e-07, + "loss": 0.643, + "step": 24438 + }, + { + "epoch": 1.7656004479202414, + "grad_norm": 7.221952710461931, + "learning_rate": 1.778140215999033e-07, + "loss": 0.6388, + "step": 24439 + }, + { + "epoch": 1.7656726931204507, + "grad_norm": 7.434081758646341, + "learning_rate": 1.777056961083301e-07, + "loss": 0.6311, + "step": 24440 + }, + { + "epoch": 1.7657449383206605, + "grad_norm": 8.311489416659459, + "learning_rate": 1.7759740240716278e-07, + "loss": 0.6258, + "step": 24441 + }, + { + "epoch": 1.7658171835208698, + "grad_norm": 8.408416582293405, + "learning_rate": 1.7748914049788402e-07, + "loss": 0.6368, + "step": 24442 + }, + { + "epoch": 1.7658894287210793, + "grad_norm": 6.144033325740714, + "learning_rate": 1.7738091038197542e-07, + "loss": 0.6049, + "step": 24443 + }, + { + "epoch": 1.7659616739212889, + "grad_norm": 6.527256671135195, + "learning_rate": 1.7727271206091857e-07, + "loss": 0.5859, + "step": 24444 + }, + { + "epoch": 1.7660339191214982, + "grad_norm": 6.807709575765461, + "learning_rate": 1.7716454553619534e-07, + "loss": 0.6447, + "step": 24445 + }, + { + "epoch": 1.766106164321708, + "grad_norm": 6.9856354602285915, + "learning_rate": 1.7705641080928564e-07, + "loss": 0.6412, + "step": 24446 + }, + { + "epoch": 1.7661784095219173, + "grad_norm": 7.024156306637002, + "learning_rate": 1.7694830788166945e-07, + "loss": 0.575, + "step": 24447 + }, + { + "epoch": 1.766250654722127, + "grad_norm": 6.70651109796873, + "learning_rate": 1.7684023675482748e-07, + "loss": 0.6008, + "step": 24448 + }, + { + "epoch": 1.7663228999223364, + "grad_norm": 6.809399660511363, + "learning_rate": 1.767321974302394e-07, + "loss": 0.6474, + "step": 24449 + }, + { + "epoch": 1.766395145122546, + "grad_norm": 7.426063271373953, + "learning_rate": 1.7662418990938347e-07, + "loss": 0.6351, + "step": 24450 + }, + { + "epoch": 1.7664673903227555, + "grad_norm": 7.4345109847765825, + "learning_rate": 1.7651621419373793e-07, + "loss": 0.5565, + "step": 24451 + }, + { + "epoch": 1.7665396355229648, + "grad_norm": 6.940297155696937, + "learning_rate": 1.764082702847822e-07, + "loss": 0.5079, + "step": 24452 + }, + { + "epoch": 1.7666118807231745, + "grad_norm": 7.610716652529372, + "learning_rate": 1.7630035818399227e-07, + "loss": 0.4867, + "step": 24453 + }, + { + "epoch": 1.7666841259233839, + "grad_norm": 7.038590460899791, + "learning_rate": 1.7619247789284645e-07, + "loss": 0.5928, + "step": 24454 + }, + { + "epoch": 1.7667563711235936, + "grad_norm": 8.392769253627131, + "learning_rate": 1.760846294128213e-07, + "loss": 0.6262, + "step": 24455 + }, + { + "epoch": 1.766828616323803, + "grad_norm": 8.038385331203658, + "learning_rate": 1.7597681274539314e-07, + "loss": 0.5871, + "step": 24456 + }, + { + "epoch": 1.7669008615240125, + "grad_norm": 8.269426535266394, + "learning_rate": 1.7586902789203776e-07, + "loss": 0.6244, + "step": 24457 + }, + { + "epoch": 1.766973106724222, + "grad_norm": 6.62252012659444, + "learning_rate": 1.7576127485423116e-07, + "loss": 0.5934, + "step": 24458 + }, + { + "epoch": 1.7670453519244316, + "grad_norm": 6.485288256942383, + "learning_rate": 1.7565355363344804e-07, + "loss": 0.5969, + "step": 24459 + }, + { + "epoch": 1.7671175971246411, + "grad_norm": 9.075202826132825, + "learning_rate": 1.7554586423116303e-07, + "loss": 0.5614, + "step": 24460 + }, + { + "epoch": 1.7671898423248504, + "grad_norm": 6.985682968796061, + "learning_rate": 1.7543820664884997e-07, + "loss": 0.6054, + "step": 24461 + }, + { + "epoch": 1.7672620875250602, + "grad_norm": 7.565393067893334, + "learning_rate": 1.7533058088798321e-07, + "loss": 0.5641, + "step": 24462 + }, + { + "epoch": 1.7673343327252695, + "grad_norm": 7.190737318664558, + "learning_rate": 1.7522298695003603e-07, + "loss": 0.5913, + "step": 24463 + }, + { + "epoch": 1.767406577925479, + "grad_norm": 7.521179557573125, + "learning_rate": 1.7511542483648087e-07, + "loss": 0.6288, + "step": 24464 + }, + { + "epoch": 1.7674788231256886, + "grad_norm": 7.509218272884616, + "learning_rate": 1.7500789454879015e-07, + "loss": 0.5834, + "step": 24465 + }, + { + "epoch": 1.7675510683258981, + "grad_norm": 7.454863467485347, + "learning_rate": 1.749003960884363e-07, + "loss": 0.5614, + "step": 24466 + }, + { + "epoch": 1.7676233135261077, + "grad_norm": 7.478489100403471, + "learning_rate": 1.7479292945689063e-07, + "loss": 0.615, + "step": 24467 + }, + { + "epoch": 1.767695558726317, + "grad_norm": 8.056298998438805, + "learning_rate": 1.7468549465562452e-07, + "loss": 0.574, + "step": 24468 + }, + { + "epoch": 1.7677678039265268, + "grad_norm": 5.817473222107808, + "learning_rate": 1.7457809168610812e-07, + "loss": 0.6202, + "step": 24469 + }, + { + "epoch": 1.767840049126736, + "grad_norm": 6.986935549408054, + "learning_rate": 1.7447072054981278e-07, + "loss": 0.5378, + "step": 24470 + }, + { + "epoch": 1.7679122943269456, + "grad_norm": 7.07208465791321, + "learning_rate": 1.7436338124820706e-07, + "loss": 0.5527, + "step": 24471 + }, + { + "epoch": 1.7679845395271552, + "grad_norm": 5.824298001011782, + "learning_rate": 1.7425607378276117e-07, + "loss": 0.587, + "step": 24472 + }, + { + "epoch": 1.7680567847273647, + "grad_norm": 6.153605091435255, + "learning_rate": 1.7414879815494362e-07, + "loss": 0.6203, + "step": 24473 + }, + { + "epoch": 1.7681290299275743, + "grad_norm": 7.790193251831363, + "learning_rate": 1.7404155436622355e-07, + "loss": 0.5874, + "step": 24474 + }, + { + "epoch": 1.7682012751277836, + "grad_norm": 6.846800520547101, + "learning_rate": 1.7393434241806756e-07, + "loss": 0.5426, + "step": 24475 + }, + { + "epoch": 1.7682735203279933, + "grad_norm": 7.024425127393795, + "learning_rate": 1.7382716231194502e-07, + "loss": 0.5953, + "step": 24476 + }, + { + "epoch": 1.7683457655282027, + "grad_norm": 6.287115745861171, + "learning_rate": 1.7372001404932283e-07, + "loss": 0.621, + "step": 24477 + }, + { + "epoch": 1.7684180107284122, + "grad_norm": 6.2334439531326264, + "learning_rate": 1.73612897631667e-07, + "loss": 0.591, + "step": 24478 + }, + { + "epoch": 1.7684902559286217, + "grad_norm": 6.34418835558287, + "learning_rate": 1.7350581306044446e-07, + "loss": 0.621, + "step": 24479 + }, + { + "epoch": 1.7685625011288313, + "grad_norm": 6.852948699923928, + "learning_rate": 1.7339876033712065e-07, + "loss": 0.5429, + "step": 24480 + }, + { + "epoch": 1.7686347463290408, + "grad_norm": 7.381596347005636, + "learning_rate": 1.7329173946316169e-07, + "loss": 0.6648, + "step": 24481 + }, + { + "epoch": 1.7687069915292501, + "grad_norm": 9.623435153332235, + "learning_rate": 1.7318475044003218e-07, + "loss": 0.6577, + "step": 24482 + }, + { + "epoch": 1.76877923672946, + "grad_norm": 7.097209607747864, + "learning_rate": 1.7307779326919655e-07, + "loss": 0.5455, + "step": 24483 + }, + { + "epoch": 1.7688514819296692, + "grad_norm": 6.825101064974726, + "learning_rate": 1.7297086795211915e-07, + "loss": 0.5796, + "step": 24484 + }, + { + "epoch": 1.7689237271298788, + "grad_norm": 8.122429074171187, + "learning_rate": 1.7286397449026382e-07, + "loss": 0.6818, + "step": 24485 + }, + { + "epoch": 1.7689959723300883, + "grad_norm": 8.50551246960496, + "learning_rate": 1.7275711288509384e-07, + "loss": 0.592, + "step": 24486 + }, + { + "epoch": 1.7690682175302979, + "grad_norm": 7.329791369624158, + "learning_rate": 1.7265028313807192e-07, + "loss": 0.5656, + "step": 24487 + }, + { + "epoch": 1.7691404627305074, + "grad_norm": 8.645824116774707, + "learning_rate": 1.7254348525066105e-07, + "loss": 0.6462, + "step": 24488 + }, + { + "epoch": 1.7692127079307167, + "grad_norm": 8.019718664091146, + "learning_rate": 1.7243671922432205e-07, + "loss": 0.6174, + "step": 24489 + }, + { + "epoch": 1.7692849531309265, + "grad_norm": 8.38682654870753, + "learning_rate": 1.7232998506051728e-07, + "loss": 0.5948, + "step": 24490 + }, + { + "epoch": 1.7693571983311358, + "grad_norm": 6.686999721175117, + "learning_rate": 1.7222328276070789e-07, + "loss": 0.5585, + "step": 24491 + }, + { + "epoch": 1.7694294435313453, + "grad_norm": 7.788301145635511, + "learning_rate": 1.7211661232635457e-07, + "loss": 0.6406, + "step": 24492 + }, + { + "epoch": 1.769501688731555, + "grad_norm": 8.328277529237598, + "learning_rate": 1.7200997375891677e-07, + "loss": 0.6607, + "step": 24493 + }, + { + "epoch": 1.7695739339317644, + "grad_norm": 23.3413117346925, + "learning_rate": 1.7190336705985522e-07, + "loss": 0.6257, + "step": 24494 + }, + { + "epoch": 1.769646179131974, + "grad_norm": 7.380263180058933, + "learning_rate": 1.7179679223062878e-07, + "loss": 0.5935, + "step": 24495 + }, + { + "epoch": 1.7697184243321833, + "grad_norm": 6.555888305811157, + "learning_rate": 1.7169024927269628e-07, + "loss": 0.6472, + "step": 24496 + }, + { + "epoch": 1.769790669532393, + "grad_norm": 7.548042791458555, + "learning_rate": 1.7158373818751683e-07, + "loss": 0.616, + "step": 24497 + }, + { + "epoch": 1.7698629147326024, + "grad_norm": 8.68510788292168, + "learning_rate": 1.714772589765479e-07, + "loss": 0.5741, + "step": 24498 + }, + { + "epoch": 1.769935159932812, + "grad_norm": 7.208586965813839, + "learning_rate": 1.713708116412477e-07, + "loss": 0.5749, + "step": 24499 + }, + { + "epoch": 1.7700074051330215, + "grad_norm": 8.290535036498323, + "learning_rate": 1.7126439618307288e-07, + "loss": 0.6294, + "step": 24500 + }, + { + "epoch": 1.770079650333231, + "grad_norm": 6.682467554700802, + "learning_rate": 1.7115801260348008e-07, + "loss": 0.5598, + "step": 24501 + }, + { + "epoch": 1.7701518955334405, + "grad_norm": 6.656618493137758, + "learning_rate": 1.7105166090392616e-07, + "loss": 0.543, + "step": 24502 + }, + { + "epoch": 1.7702241407336499, + "grad_norm": 8.525721061394117, + "learning_rate": 1.7094534108586608e-07, + "loss": 0.6269, + "step": 24503 + }, + { + "epoch": 1.7702963859338596, + "grad_norm": 7.1125894175508435, + "learning_rate": 1.7083905315075616e-07, + "loss": 0.5688, + "step": 24504 + }, + { + "epoch": 1.770368631134069, + "grad_norm": 6.700456563851638, + "learning_rate": 1.7073279710005135e-07, + "loss": 0.5411, + "step": 24505 + }, + { + "epoch": 1.7704408763342785, + "grad_norm": 6.159660827414855, + "learning_rate": 1.7062657293520634e-07, + "loss": 0.6047, + "step": 24506 + }, + { + "epoch": 1.770513121534488, + "grad_norm": 6.48629048562551, + "learning_rate": 1.705203806576744e-07, + "loss": 0.586, + "step": 24507 + }, + { + "epoch": 1.7705853667346976, + "grad_norm": 8.520180475893506, + "learning_rate": 1.7041422026890962e-07, + "loss": 0.6443, + "step": 24508 + }, + { + "epoch": 1.7706576119349071, + "grad_norm": 6.415675825762204, + "learning_rate": 1.7030809177036588e-07, + "loss": 0.6128, + "step": 24509 + }, + { + "epoch": 1.7707298571351164, + "grad_norm": 7.707641515530779, + "learning_rate": 1.7020199516349505e-07, + "loss": 0.5516, + "step": 24510 + }, + { + "epoch": 1.7708021023353262, + "grad_norm": 6.480044028521123, + "learning_rate": 1.7009593044975014e-07, + "loss": 0.5607, + "step": 24511 + }, + { + "epoch": 1.7708743475355355, + "grad_norm": 7.421089339583777, + "learning_rate": 1.699898976305825e-07, + "loss": 0.641, + "step": 24512 + }, + { + "epoch": 1.7709465927357453, + "grad_norm": 7.113506751385139, + "learning_rate": 1.6988389670744425e-07, + "loss": 0.5748, + "step": 24513 + }, + { + "epoch": 1.7710188379359546, + "grad_norm": 8.144201975508166, + "learning_rate": 1.6977792768178624e-07, + "loss": 0.6848, + "step": 24514 + }, + { + "epoch": 1.7710910831361641, + "grad_norm": 6.509716401141529, + "learning_rate": 1.6967199055505924e-07, + "loss": 0.648, + "step": 24515 + }, + { + "epoch": 1.7711633283363737, + "grad_norm": 7.774932625530782, + "learning_rate": 1.6956608532871315e-07, + "loss": 0.7069, + "step": 24516 + }, + { + "epoch": 1.771235573536583, + "grad_norm": 9.058428671954248, + "learning_rate": 1.6946021200419828e-07, + "loss": 0.6075, + "step": 24517 + }, + { + "epoch": 1.7713078187367928, + "grad_norm": 6.178282374228546, + "learning_rate": 1.693543705829631e-07, + "loss": 0.5759, + "step": 24518 + }, + { + "epoch": 1.771380063937002, + "grad_norm": 7.94821885027367, + "learning_rate": 1.692485610664571e-07, + "loss": 0.5919, + "step": 24519 + }, + { + "epoch": 1.7714523091372119, + "grad_norm": 8.347708345356864, + "learning_rate": 1.6914278345612878e-07, + "loss": 0.6183, + "step": 24520 + }, + { + "epoch": 1.7715245543374212, + "grad_norm": 6.179239944088682, + "learning_rate": 1.6903703775342562e-07, + "loss": 0.5641, + "step": 24521 + }, + { + "epoch": 1.7715967995376307, + "grad_norm": 6.531245035416026, + "learning_rate": 1.689313239597956e-07, + "loss": 0.5678, + "step": 24522 + }, + { + "epoch": 1.7716690447378403, + "grad_norm": 8.777659703787124, + "learning_rate": 1.6882564207668595e-07, + "loss": 0.6555, + "step": 24523 + }, + { + "epoch": 1.7717412899380496, + "grad_norm": 7.2401047626313355, + "learning_rate": 1.6871999210554297e-07, + "loss": 0.5629, + "step": 24524 + }, + { + "epoch": 1.7718135351382593, + "grad_norm": 8.149933572510154, + "learning_rate": 1.6861437404781327e-07, + "loss": 0.6324, + "step": 24525 + }, + { + "epoch": 1.7718857803384687, + "grad_norm": 7.10147735913965, + "learning_rate": 1.685087879049427e-07, + "loss": 0.6429, + "step": 24526 + }, + { + "epoch": 1.7719580255386784, + "grad_norm": 7.827596010772486, + "learning_rate": 1.684032336783767e-07, + "loss": 0.6461, + "step": 24527 + }, + { + "epoch": 1.7720302707388877, + "grad_norm": 6.762971917289967, + "learning_rate": 1.6829771136955997e-07, + "loss": 0.6308, + "step": 24528 + }, + { + "epoch": 1.7721025159390973, + "grad_norm": 6.72530907129867, + "learning_rate": 1.6819222097993692e-07, + "loss": 0.6051, + "step": 24529 + }, + { + "epoch": 1.7721747611393068, + "grad_norm": 7.7860085517999, + "learning_rate": 1.6808676251095169e-07, + "loss": 0.5912, + "step": 24530 + }, + { + "epoch": 1.7722470063395164, + "grad_norm": 7.186285297747038, + "learning_rate": 1.6798133596404864e-07, + "loss": 0.6975, + "step": 24531 + }, + { + "epoch": 1.772319251539726, + "grad_norm": 6.8900514394226855, + "learning_rate": 1.678759413406697e-07, + "loss": 0.5976, + "step": 24532 + }, + { + "epoch": 1.7723914967399352, + "grad_norm": 6.899776687325535, + "learning_rate": 1.6777057864225871e-07, + "loss": 0.5437, + "step": 24533 + }, + { + "epoch": 1.772463741940145, + "grad_norm": 6.239812864774259, + "learning_rate": 1.6766524787025783e-07, + "loss": 0.616, + "step": 24534 + }, + { + "epoch": 1.7725359871403543, + "grad_norm": 7.527975676478008, + "learning_rate": 1.6755994902610873e-07, + "loss": 0.5834, + "step": 24535 + }, + { + "epoch": 1.7726082323405639, + "grad_norm": 6.882528725705128, + "learning_rate": 1.6745468211125243e-07, + "loss": 0.5713, + "step": 24536 + }, + { + "epoch": 1.7726804775407734, + "grad_norm": 7.47788819106676, + "learning_rate": 1.6734944712713086e-07, + "loss": 0.6018, + "step": 24537 + }, + { + "epoch": 1.772752722740983, + "grad_norm": 8.65183010255837, + "learning_rate": 1.6724424407518426e-07, + "loss": 0.5984, + "step": 24538 + }, + { + "epoch": 1.7728249679411925, + "grad_norm": 6.6215846056989776, + "learning_rate": 1.6713907295685256e-07, + "loss": 0.567, + "step": 24539 + }, + { + "epoch": 1.7728972131414018, + "grad_norm": 8.988529525172755, + "learning_rate": 1.6703393377357545e-07, + "loss": 0.6286, + "step": 24540 + }, + { + "epoch": 1.7729694583416116, + "grad_norm": 7.183780138793192, + "learning_rate": 1.6692882652679237e-07, + "loss": 0.6677, + "step": 24541 + }, + { + "epoch": 1.773041703541821, + "grad_norm": 7.221001344176419, + "learning_rate": 1.6682375121794214e-07, + "loss": 0.6098, + "step": 24542 + }, + { + "epoch": 1.7731139487420304, + "grad_norm": 8.459781416285418, + "learning_rate": 1.6671870784846305e-07, + "loss": 0.6237, + "step": 24543 + }, + { + "epoch": 1.77318619394224, + "grad_norm": 7.1313737168141165, + "learning_rate": 1.6661369641979341e-07, + "loss": 0.6363, + "step": 24544 + }, + { + "epoch": 1.7732584391424495, + "grad_norm": 8.054167944415827, + "learning_rate": 1.6650871693337067e-07, + "loss": 0.6178, + "step": 24545 + }, + { + "epoch": 1.773330684342659, + "grad_norm": 6.631283641158188, + "learning_rate": 1.6640376939063147e-07, + "loss": 0.537, + "step": 24546 + }, + { + "epoch": 1.7734029295428684, + "grad_norm": 8.721767108595714, + "learning_rate": 1.6629885379301297e-07, + "loss": 0.6329, + "step": 24547 + }, + { + "epoch": 1.7734751747430781, + "grad_norm": 7.962336094485272, + "learning_rate": 1.6619397014195098e-07, + "loss": 0.6251, + "step": 24548 + }, + { + "epoch": 1.7735474199432875, + "grad_norm": 7.97469117334977, + "learning_rate": 1.660891184388816e-07, + "loss": 0.6076, + "step": 24549 + }, + { + "epoch": 1.773619665143497, + "grad_norm": 8.036306963665451, + "learning_rate": 1.6598429868524003e-07, + "loss": 0.6211, + "step": 24550 + }, + { + "epoch": 1.7736919103437065, + "grad_norm": 7.216073204582703, + "learning_rate": 1.6587951088246097e-07, + "loss": 0.5847, + "step": 24551 + }, + { + "epoch": 1.773764155543916, + "grad_norm": 7.998178274637958, + "learning_rate": 1.657747550319791e-07, + "loss": 0.6732, + "step": 24552 + }, + { + "epoch": 1.7738364007441256, + "grad_norm": 8.106395843064483, + "learning_rate": 1.656700311352283e-07, + "loss": 0.6282, + "step": 24553 + }, + { + "epoch": 1.773908645944335, + "grad_norm": 7.2692121940999606, + "learning_rate": 1.6556533919364237e-07, + "loss": 0.574, + "step": 24554 + }, + { + "epoch": 1.7739808911445447, + "grad_norm": 7.52446216169328, + "learning_rate": 1.654606792086544e-07, + "loss": 0.5897, + "step": 24555 + }, + { + "epoch": 1.774053136344754, + "grad_norm": 6.728027467778185, + "learning_rate": 1.653560511816976e-07, + "loss": 0.6765, + "step": 24556 + }, + { + "epoch": 1.7741253815449636, + "grad_norm": 8.14803253952368, + "learning_rate": 1.6525145511420316e-07, + "loss": 0.6173, + "step": 24557 + }, + { + "epoch": 1.7741976267451731, + "grad_norm": 9.207726874310213, + "learning_rate": 1.6514689100760346e-07, + "loss": 0.5488, + "step": 24558 + }, + { + "epoch": 1.7742698719453827, + "grad_norm": 7.700080316298098, + "learning_rate": 1.6504235886333042e-07, + "loss": 0.6099, + "step": 24559 + }, + { + "epoch": 1.7743421171455922, + "grad_norm": 6.850268738758567, + "learning_rate": 1.6493785868281375e-07, + "loss": 0.6116, + "step": 24560 + }, + { + "epoch": 1.7744143623458015, + "grad_norm": 6.416538517981044, + "learning_rate": 1.648333904674851e-07, + "loss": 0.5215, + "step": 24561 + }, + { + "epoch": 1.7744866075460113, + "grad_norm": 6.098376899279353, + "learning_rate": 1.6472895421877412e-07, + "loss": 0.537, + "step": 24562 + }, + { + "epoch": 1.7745588527462206, + "grad_norm": 7.606597205381778, + "learning_rate": 1.6462454993811105e-07, + "loss": 0.5572, + "step": 24563 + }, + { + "epoch": 1.7746310979464301, + "grad_norm": 6.079144468347583, + "learning_rate": 1.6452017762692397e-07, + "loss": 0.5809, + "step": 24564 + }, + { + "epoch": 1.7747033431466397, + "grad_norm": 7.567303353745469, + "learning_rate": 1.644158372866425e-07, + "loss": 0.6368, + "step": 24565 + }, + { + "epoch": 1.7747755883468492, + "grad_norm": 8.584849794614566, + "learning_rate": 1.64311528918695e-07, + "loss": 0.5789, + "step": 24566 + }, + { + "epoch": 1.7748478335470588, + "grad_norm": 8.101426891343355, + "learning_rate": 1.6420725252450892e-07, + "loss": 0.645, + "step": 24567 + }, + { + "epoch": 1.774920078747268, + "grad_norm": 7.496483296206478, + "learning_rate": 1.6410300810551172e-07, + "loss": 0.6109, + "step": 24568 + }, + { + "epoch": 1.7749923239474779, + "grad_norm": 8.12572816373511, + "learning_rate": 1.6399879566313088e-07, + "loss": 0.616, + "step": 24569 + }, + { + "epoch": 1.7750645691476872, + "grad_norm": 7.715008898548821, + "learning_rate": 1.6389461519879247e-07, + "loss": 0.6265, + "step": 24570 + }, + { + "epoch": 1.7751368143478967, + "grad_norm": 10.285475213436223, + "learning_rate": 1.6379046671392313e-07, + "loss": 0.5913, + "step": 24571 + }, + { + "epoch": 1.7752090595481063, + "grad_norm": 6.484495010184631, + "learning_rate": 1.636863502099481e-07, + "loss": 0.5913, + "step": 24572 + }, + { + "epoch": 1.7752813047483158, + "grad_norm": 9.055035483229155, + "learning_rate": 1.6358226568829293e-07, + "loss": 0.624, + "step": 24573 + }, + { + "epoch": 1.7753535499485253, + "grad_norm": 6.8276213366336505, + "learning_rate": 1.6347821315038314e-07, + "loss": 0.5862, + "step": 24574 + }, + { + "epoch": 1.7754257951487347, + "grad_norm": 8.463756257295584, + "learning_rate": 1.6337419259764175e-07, + "loss": 0.6458, + "step": 24575 + }, + { + "epoch": 1.7754980403489444, + "grad_norm": 6.503720905767497, + "learning_rate": 1.6327020403149346e-07, + "loss": 0.519, + "step": 24576 + }, + { + "epoch": 1.7755702855491537, + "grad_norm": 8.269065328454602, + "learning_rate": 1.6316624745336212e-07, + "loss": 0.6257, + "step": 24577 + }, + { + "epoch": 1.7756425307493633, + "grad_norm": 6.43920816517323, + "learning_rate": 1.6306232286466993e-07, + "loss": 0.5757, + "step": 24578 + }, + { + "epoch": 1.7757147759495728, + "grad_norm": 8.199014967393758, + "learning_rate": 1.6295843026684023e-07, + "loss": 0.6475, + "step": 24579 + }, + { + "epoch": 1.7757870211497824, + "grad_norm": 8.78686344017084, + "learning_rate": 1.628545696612946e-07, + "loss": 0.5742, + "step": 24580 + }, + { + "epoch": 1.775859266349992, + "grad_norm": 8.29389190207473, + "learning_rate": 1.6275074104945583e-07, + "loss": 0.6569, + "step": 24581 + }, + { + "epoch": 1.7759315115502012, + "grad_norm": 5.760426894369123, + "learning_rate": 1.6264694443274442e-07, + "loss": 0.5875, + "step": 24582 + }, + { + "epoch": 1.776003756750411, + "grad_norm": 6.539765423875417, + "learning_rate": 1.6254317981258149e-07, + "loss": 0.6085, + "step": 24583 + }, + { + "epoch": 1.7760760019506203, + "grad_norm": 8.511522449262346, + "learning_rate": 1.6243944719038786e-07, + "loss": 0.5908, + "step": 24584 + }, + { + "epoch": 1.7761482471508299, + "grad_norm": 7.041672234467181, + "learning_rate": 1.623357465675829e-07, + "loss": 0.508, + "step": 24585 + }, + { + "epoch": 1.7762204923510394, + "grad_norm": 7.173278804179455, + "learning_rate": 1.6223207794558632e-07, + "loss": 0.6488, + "step": 24586 + }, + { + "epoch": 1.776292737551249, + "grad_norm": 9.341937957916993, + "learning_rate": 1.6212844132581757e-07, + "loss": 0.6252, + "step": 24587 + }, + { + "epoch": 1.7763649827514585, + "grad_norm": 7.272867903650204, + "learning_rate": 1.6202483670969522e-07, + "loss": 0.5483, + "step": 24588 + }, + { + "epoch": 1.7764372279516678, + "grad_norm": 7.008100999147901, + "learning_rate": 1.6192126409863756e-07, + "loss": 0.5887, + "step": 24589 + }, + { + "epoch": 1.7765094731518776, + "grad_norm": 6.996773657463097, + "learning_rate": 1.6181772349406238e-07, + "loss": 0.6489, + "step": 24590 + }, + { + "epoch": 1.776581718352087, + "grad_norm": 7.791042309665382, + "learning_rate": 1.617142148973874e-07, + "loss": 0.5531, + "step": 24591 + }, + { + "epoch": 1.7766539635522967, + "grad_norm": 7.480898629129819, + "learning_rate": 1.6161073831002878e-07, + "loss": 0.6388, + "step": 24592 + }, + { + "epoch": 1.776726208752506, + "grad_norm": 6.224773614282873, + "learning_rate": 1.6150729373340363e-07, + "loss": 0.6096, + "step": 24593 + }, + { + "epoch": 1.7767984539527155, + "grad_norm": 7.242576218330071, + "learning_rate": 1.614038811689278e-07, + "loss": 0.6035, + "step": 24594 + }, + { + "epoch": 1.776870699152925, + "grad_norm": 7.6199429110894075, + "learning_rate": 1.6130050061801767e-07, + "loss": 0.5364, + "step": 24595 + }, + { + "epoch": 1.7769429443531344, + "grad_norm": 8.6294659095894, + "learning_rate": 1.6119715208208737e-07, + "loss": 0.5953, + "step": 24596 + }, + { + "epoch": 1.7770151895533441, + "grad_norm": 7.4262191747678346, + "learning_rate": 1.6109383556255186e-07, + "loss": 0.6194, + "step": 24597 + }, + { + "epoch": 1.7770874347535535, + "grad_norm": 8.734639663644456, + "learning_rate": 1.609905510608259e-07, + "loss": 0.6416, + "step": 24598 + }, + { + "epoch": 1.7771596799537632, + "grad_norm": 7.4901770797189355, + "learning_rate": 1.6088729857832302e-07, + "loss": 0.6692, + "step": 24599 + }, + { + "epoch": 1.7772319251539725, + "grad_norm": 6.315251723200951, + "learning_rate": 1.6078407811645685e-07, + "loss": 0.5594, + "step": 24600 + }, + { + "epoch": 1.777304170354182, + "grad_norm": 8.521906728374194, + "learning_rate": 1.6068088967664041e-07, + "loss": 0.6047, + "step": 24601 + }, + { + "epoch": 1.7773764155543916, + "grad_norm": 6.269858891847698, + "learning_rate": 1.6057773326028675e-07, + "loss": 0.5365, + "step": 24602 + }, + { + "epoch": 1.777448660754601, + "grad_norm": 7.002959307064156, + "learning_rate": 1.6047460886880695e-07, + "loss": 0.6386, + "step": 24603 + }, + { + "epoch": 1.7775209059548107, + "grad_norm": 7.324725503302528, + "learning_rate": 1.603715165036132e-07, + "loss": 0.5806, + "step": 24604 + }, + { + "epoch": 1.77759315115502, + "grad_norm": 7.593225531657108, + "learning_rate": 1.6026845616611693e-07, + "loss": 0.5828, + "step": 24605 + }, + { + "epoch": 1.7776653963552298, + "grad_norm": 7.4447878225411, + "learning_rate": 1.6016542785772887e-07, + "loss": 0.6824, + "step": 24606 + }, + { + "epoch": 1.7777376415554391, + "grad_norm": 6.584173253557452, + "learning_rate": 1.6006243157985935e-07, + "loss": 0.5585, + "step": 24607 + }, + { + "epoch": 1.7778098867556487, + "grad_norm": 7.231774144806663, + "learning_rate": 1.5995946733391803e-07, + "loss": 0.6067, + "step": 24608 + }, + { + "epoch": 1.7778821319558582, + "grad_norm": 7.657492525132554, + "learning_rate": 1.5985653512131466e-07, + "loss": 0.591, + "step": 24609 + }, + { + "epoch": 1.7779543771560677, + "grad_norm": 8.447074374641357, + "learning_rate": 1.5975363494345863e-07, + "loss": 0.5866, + "step": 24610 + }, + { + "epoch": 1.7780266223562773, + "grad_norm": 6.527382905889439, + "learning_rate": 1.59650766801758e-07, + "loss": 0.6123, + "step": 24611 + }, + { + "epoch": 1.7780988675564866, + "grad_norm": 6.368511712854706, + "learning_rate": 1.5954793069762137e-07, + "loss": 0.5439, + "step": 24612 + }, + { + "epoch": 1.7781711127566964, + "grad_norm": 7.755417929627235, + "learning_rate": 1.594451266324565e-07, + "loss": 0.5744, + "step": 24613 + }, + { + "epoch": 1.7782433579569057, + "grad_norm": 7.162496580752211, + "learning_rate": 1.593423546076703e-07, + "loss": 0.5862, + "step": 24614 + }, + { + "epoch": 1.7783156031571152, + "grad_norm": 7.806594691977703, + "learning_rate": 1.592396146246697e-07, + "loss": 0.5818, + "step": 24615 + }, + { + "epoch": 1.7783878483573248, + "grad_norm": 8.087719648111284, + "learning_rate": 1.5913690668486193e-07, + "loss": 0.6442, + "step": 24616 + }, + { + "epoch": 1.7784600935575343, + "grad_norm": 6.656947139714065, + "learning_rate": 1.5903423078965142e-07, + "loss": 0.5665, + "step": 24617 + }, + { + "epoch": 1.7785323387577439, + "grad_norm": 8.89294339005213, + "learning_rate": 1.5893158694044482e-07, + "loss": 0.5914, + "step": 24618 + }, + { + "epoch": 1.7786045839579532, + "grad_norm": 7.824186571638773, + "learning_rate": 1.5882897513864737e-07, + "loss": 0.6175, + "step": 24619 + }, + { + "epoch": 1.778676829158163, + "grad_norm": 7.173345012064412, + "learning_rate": 1.587263953856638e-07, + "loss": 0.5844, + "step": 24620 + }, + { + "epoch": 1.7787490743583723, + "grad_norm": 7.72514492256987, + "learning_rate": 1.5862384768289745e-07, + "loss": 0.5681, + "step": 24621 + }, + { + "epoch": 1.7788213195585818, + "grad_norm": 7.0859256563434645, + "learning_rate": 1.5852133203175273e-07, + "loss": 0.6176, + "step": 24622 + }, + { + "epoch": 1.7788935647587913, + "grad_norm": 8.096953829429804, + "learning_rate": 1.5841884843363326e-07, + "loss": 0.6197, + "step": 24623 + }, + { + "epoch": 1.7789658099590009, + "grad_norm": 7.2330771944369285, + "learning_rate": 1.583163968899415e-07, + "loss": 0.5698, + "step": 24624 + }, + { + "epoch": 1.7790380551592104, + "grad_norm": 6.428012708574229, + "learning_rate": 1.5821397740207993e-07, + "loss": 0.5397, + "step": 24625 + }, + { + "epoch": 1.7791103003594197, + "grad_norm": 6.9544854708355315, + "learning_rate": 1.581115899714508e-07, + "loss": 0.5892, + "step": 24626 + }, + { + "epoch": 1.7791825455596295, + "grad_norm": 8.217188001372632, + "learning_rate": 1.5800923459945548e-07, + "loss": 0.5655, + "step": 24627 + }, + { + "epoch": 1.7792547907598388, + "grad_norm": 8.175093008089753, + "learning_rate": 1.5790691128749564e-07, + "loss": 0.6187, + "step": 24628 + }, + { + "epoch": 1.7793270359600484, + "grad_norm": 8.471056409438821, + "learning_rate": 1.5780462003697122e-07, + "loss": 0.6277, + "step": 24629 + }, + { + "epoch": 1.779399281160258, + "grad_norm": 7.316541027939732, + "learning_rate": 1.577023608492831e-07, + "loss": 0.58, + "step": 24630 + }, + { + "epoch": 1.7794715263604675, + "grad_norm": 7.8690938448752235, + "learning_rate": 1.576001337258315e-07, + "loss": 0.5781, + "step": 24631 + }, + { + "epoch": 1.779543771560677, + "grad_norm": 7.845683919490434, + "learning_rate": 1.5749793866801477e-07, + "loss": 0.6196, + "step": 24632 + }, + { + "epoch": 1.7796160167608863, + "grad_norm": 6.860767961673517, + "learning_rate": 1.5739577567723264e-07, + "loss": 0.5619, + "step": 24633 + }, + { + "epoch": 1.779688261961096, + "grad_norm": 8.201049309559846, + "learning_rate": 1.572936447548837e-07, + "loss": 0.5988, + "step": 24634 + }, + { + "epoch": 1.7797605071613054, + "grad_norm": 7.575496886420842, + "learning_rate": 1.5719154590236541e-07, + "loss": 0.6237, + "step": 24635 + }, + { + "epoch": 1.779832752361515, + "grad_norm": 7.889830173458464, + "learning_rate": 1.5708947912107558e-07, + "loss": 0.6111, + "step": 24636 + }, + { + "epoch": 1.7799049975617245, + "grad_norm": 7.5499536778121605, + "learning_rate": 1.5698744441241142e-07, + "loss": 0.5856, + "step": 24637 + }, + { + "epoch": 1.779977242761934, + "grad_norm": 8.666338498675945, + "learning_rate": 1.568854417777707e-07, + "loss": 0.6337, + "step": 24638 + }, + { + "epoch": 1.7800494879621436, + "grad_norm": 5.949950966512771, + "learning_rate": 1.5678347121854842e-07, + "loss": 0.6108, + "step": 24639 + }, + { + "epoch": 1.780121733162353, + "grad_norm": 7.937091967089897, + "learning_rate": 1.5668153273614122e-07, + "loss": 0.6351, + "step": 24640 + }, + { + "epoch": 1.7801939783625627, + "grad_norm": 7.236224537467078, + "learning_rate": 1.5657962633194468e-07, + "loss": 0.5739, + "step": 24641 + }, + { + "epoch": 1.780266223562772, + "grad_norm": 7.407205125731176, + "learning_rate": 1.564777520073532e-07, + "loss": 0.5782, + "step": 24642 + }, + { + "epoch": 1.7803384687629815, + "grad_norm": 6.256767271381637, + "learning_rate": 1.5637590976376155e-07, + "loss": 0.6009, + "step": 24643 + }, + { + "epoch": 1.780410713963191, + "grad_norm": 7.382143085144111, + "learning_rate": 1.5627409960256413e-07, + "loss": 0.5725, + "step": 24644 + }, + { + "epoch": 1.7804829591634006, + "grad_norm": 7.917070425345037, + "learning_rate": 1.5617232152515426e-07, + "loss": 0.667, + "step": 24645 + }, + { + "epoch": 1.7805552043636101, + "grad_norm": 7.526714051161966, + "learning_rate": 1.560705755329256e-07, + "loss": 0.5493, + "step": 24646 + }, + { + "epoch": 1.7806274495638195, + "grad_norm": 9.549507439977802, + "learning_rate": 1.5596886162727087e-07, + "loss": 0.5666, + "step": 24647 + }, + { + "epoch": 1.7806996947640292, + "grad_norm": 8.305842666454883, + "learning_rate": 1.5586717980958233e-07, + "loss": 0.5795, + "step": 24648 + }, + { + "epoch": 1.7807719399642385, + "grad_norm": 5.764167580767402, + "learning_rate": 1.5576553008125244e-07, + "loss": 0.5788, + "step": 24649 + }, + { + "epoch": 1.780844185164448, + "grad_norm": 7.297457359322685, + "learning_rate": 1.556639124436718e-07, + "loss": 0.5996, + "step": 24650 + }, + { + "epoch": 1.7809164303646576, + "grad_norm": 8.061040701873774, + "learning_rate": 1.5556232689823203e-07, + "loss": 0.5274, + "step": 24651 + }, + { + "epoch": 1.7809886755648672, + "grad_norm": 8.439995742724461, + "learning_rate": 1.55460773446324e-07, + "loss": 0.6273, + "step": 24652 + }, + { + "epoch": 1.7810609207650767, + "grad_norm": 6.055535159917581, + "learning_rate": 1.5535925208933712e-07, + "loss": 0.5868, + "step": 24653 + }, + { + "epoch": 1.781133165965286, + "grad_norm": 7.590023931935493, + "learning_rate": 1.5525776282866169e-07, + "loss": 0.6073, + "step": 24654 + }, + { + "epoch": 1.7812054111654958, + "grad_norm": 6.265234423709396, + "learning_rate": 1.5515630566568686e-07, + "loss": 0.6005, + "step": 24655 + }, + { + "epoch": 1.7812776563657051, + "grad_norm": 7.3289182839550495, + "learning_rate": 1.5505488060180153e-07, + "loss": 0.5931, + "step": 24656 + }, + { + "epoch": 1.7813499015659147, + "grad_norm": 7.988037463807575, + "learning_rate": 1.54953487638394e-07, + "loss": 0.6015, + "step": 24657 + }, + { + "epoch": 1.7814221467661242, + "grad_norm": 8.230662947845401, + "learning_rate": 1.548521267768527e-07, + "loss": 0.6451, + "step": 24658 + }, + { + "epoch": 1.7814943919663337, + "grad_norm": 7.556524769320582, + "learning_rate": 1.5475079801856502e-07, + "loss": 0.5851, + "step": 24659 + }, + { + "epoch": 1.7815666371665433, + "grad_norm": 7.764018024909093, + "learning_rate": 1.546495013649177e-07, + "loss": 0.6074, + "step": 24660 + }, + { + "epoch": 1.7816388823667526, + "grad_norm": 7.841295565000424, + "learning_rate": 1.5454823681729764e-07, + "loss": 0.5267, + "step": 24661 + }, + { + "epoch": 1.7817111275669624, + "grad_norm": 6.3967133785342565, + "learning_rate": 1.5444700437709125e-07, + "loss": 0.6336, + "step": 24662 + }, + { + "epoch": 1.7817833727671717, + "grad_norm": 6.366157233077091, + "learning_rate": 1.5434580404568438e-07, + "loss": 0.582, + "step": 24663 + }, + { + "epoch": 1.7818556179673815, + "grad_norm": 7.509083651170323, + "learning_rate": 1.5424463582446176e-07, + "loss": 0.6086, + "step": 24664 + }, + { + "epoch": 1.7819278631675908, + "grad_norm": 8.61187585969247, + "learning_rate": 1.5414349971480834e-07, + "loss": 0.6153, + "step": 24665 + }, + { + "epoch": 1.7820001083678003, + "grad_norm": 8.587115692023902, + "learning_rate": 1.5404239571810974e-07, + "loss": 0.6464, + "step": 24666 + }, + { + "epoch": 1.7820723535680099, + "grad_norm": 7.725635006184236, + "learning_rate": 1.53941323835749e-07, + "loss": 0.6097, + "step": 24667 + }, + { + "epoch": 1.7821445987682192, + "grad_norm": 6.8811412778437075, + "learning_rate": 1.5384028406910971e-07, + "loss": 0.571, + "step": 24668 + }, + { + "epoch": 1.782216843968429, + "grad_norm": 7.498226719075922, + "learning_rate": 1.5373927641957554e-07, + "loss": 0.5881, + "step": 24669 + }, + { + "epoch": 1.7822890891686383, + "grad_norm": 6.771655348739285, + "learning_rate": 1.5363830088852922e-07, + "loss": 0.545, + "step": 24670 + }, + { + "epoch": 1.782361334368848, + "grad_norm": 7.0948598388419555, + "learning_rate": 1.5353735747735215e-07, + "loss": 0.5592, + "step": 24671 + }, + { + "epoch": 1.7824335795690573, + "grad_norm": 7.003974875918674, + "learning_rate": 1.5343644618742687e-07, + "loss": 0.585, + "step": 24672 + }, + { + "epoch": 1.7825058247692669, + "grad_norm": 8.231809502357637, + "learning_rate": 1.5333556702013475e-07, + "loss": 0.5392, + "step": 24673 + }, + { + "epoch": 1.7825780699694764, + "grad_norm": 6.724588669460571, + "learning_rate": 1.5323471997685636e-07, + "loss": 0.5494, + "step": 24674 + }, + { + "epoch": 1.7826503151696858, + "grad_norm": 7.987693380722222, + "learning_rate": 1.531339050589728e-07, + "loss": 0.5776, + "step": 24675 + }, + { + "epoch": 1.7827225603698955, + "grad_norm": 7.337019283007815, + "learning_rate": 1.5303312226786383e-07, + "loss": 0.5682, + "step": 24676 + }, + { + "epoch": 1.7827948055701048, + "grad_norm": 7.390747585449402, + "learning_rate": 1.5293237160490914e-07, + "loss": 0.641, + "step": 24677 + }, + { + "epoch": 1.7828670507703146, + "grad_norm": 8.62642580493038, + "learning_rate": 1.5283165307148795e-07, + "loss": 0.5647, + "step": 24678 + }, + { + "epoch": 1.782939295970524, + "grad_norm": 7.025767990977008, + "learning_rate": 1.5273096666897884e-07, + "loss": 0.6079, + "step": 24679 + }, + { + "epoch": 1.7830115411707335, + "grad_norm": 6.636000802654565, + "learning_rate": 1.526303123987602e-07, + "loss": 0.5911, + "step": 24680 + }, + { + "epoch": 1.783083786370943, + "grad_norm": 7.352702426400201, + "learning_rate": 1.5252969026221032e-07, + "loss": 0.6051, + "step": 24681 + }, + { + "epoch": 1.7831560315711525, + "grad_norm": 6.470435176405858, + "learning_rate": 1.524291002607059e-07, + "loss": 0.6181, + "step": 24682 + }, + { + "epoch": 1.783228276771362, + "grad_norm": 7.086050821258275, + "learning_rate": 1.5232854239562444e-07, + "loss": 0.541, + "step": 24683 + }, + { + "epoch": 1.7833005219715714, + "grad_norm": 6.870675408757844, + "learning_rate": 1.5222801666834236e-07, + "loss": 0.5494, + "step": 24684 + }, + { + "epoch": 1.7833727671717812, + "grad_norm": 7.894916357040647, + "learning_rate": 1.5212752308023577e-07, + "loss": 0.5144, + "step": 24685 + }, + { + "epoch": 1.7834450123719905, + "grad_norm": 8.095642104338788, + "learning_rate": 1.5202706163268049e-07, + "loss": 0.5911, + "step": 24686 + }, + { + "epoch": 1.7835172575722, + "grad_norm": 8.95537694600943, + "learning_rate": 1.5192663232705157e-07, + "loss": 0.6563, + "step": 24687 + }, + { + "epoch": 1.7835895027724096, + "grad_norm": 7.175266369916036, + "learning_rate": 1.5182623516472428e-07, + "loss": 0.5867, + "step": 24688 + }, + { + "epoch": 1.7836617479726191, + "grad_norm": 6.822012882700275, + "learning_rate": 1.5172587014707223e-07, + "loss": 0.5707, + "step": 24689 + }, + { + "epoch": 1.7837339931728287, + "grad_norm": 6.666772968716293, + "learning_rate": 1.5162553727546964e-07, + "loss": 0.5572, + "step": 24690 + }, + { + "epoch": 1.783806238373038, + "grad_norm": 7.85478230971786, + "learning_rate": 1.5152523655129037e-07, + "loss": 0.5807, + "step": 24691 + }, + { + "epoch": 1.7838784835732477, + "grad_norm": 8.351741549764487, + "learning_rate": 1.5142496797590695e-07, + "loss": 0.6317, + "step": 24692 + }, + { + "epoch": 1.783950728773457, + "grad_norm": 7.86144045921293, + "learning_rate": 1.513247315506916e-07, + "loss": 0.6299, + "step": 24693 + }, + { + "epoch": 1.7840229739736666, + "grad_norm": 6.484902381132831, + "learning_rate": 1.5122452727701742e-07, + "loss": 0.5606, + "step": 24694 + }, + { + "epoch": 1.7840952191738761, + "grad_norm": 7.997267494840547, + "learning_rate": 1.5112435515625634e-07, + "loss": 0.6997, + "step": 24695 + }, + { + "epoch": 1.7841674643740857, + "grad_norm": 7.959070647538553, + "learning_rate": 1.510242151897784e-07, + "loss": 0.6139, + "step": 24696 + }, + { + "epoch": 1.7842397095742952, + "grad_norm": 7.53199580761615, + "learning_rate": 1.5092410737895523e-07, + "loss": 0.5838, + "step": 24697 + }, + { + "epoch": 1.7843119547745045, + "grad_norm": 8.284056220438083, + "learning_rate": 1.5082403172515775e-07, + "loss": 0.6175, + "step": 24698 + }, + { + "epoch": 1.7843841999747143, + "grad_norm": 7.8009873670033905, + "learning_rate": 1.5072398822975454e-07, + "loss": 0.5748, + "step": 24699 + }, + { + "epoch": 1.7844564451749236, + "grad_norm": 6.99642607870303, + "learning_rate": 1.506239768941159e-07, + "loss": 0.5945, + "step": 24700 + }, + { + "epoch": 1.7845286903751332, + "grad_norm": 7.373481206164172, + "learning_rate": 1.5052399771961073e-07, + "loss": 0.6732, + "step": 24701 + }, + { + "epoch": 1.7846009355753427, + "grad_norm": 7.387879843717349, + "learning_rate": 1.5042405070760796e-07, + "loss": 0.6287, + "step": 24702 + }, + { + "epoch": 1.7846731807755523, + "grad_norm": 8.04300432164912, + "learning_rate": 1.5032413585947563e-07, + "loss": 0.6382, + "step": 24703 + }, + { + "epoch": 1.7847454259757618, + "grad_norm": 6.610794383920484, + "learning_rate": 1.502242531765813e-07, + "loss": 0.6012, + "step": 24704 + }, + { + "epoch": 1.7848176711759711, + "grad_norm": 7.44517441708588, + "learning_rate": 1.501244026602927e-07, + "loss": 0.6056, + "step": 24705 + }, + { + "epoch": 1.7848899163761809, + "grad_norm": 7.989305261765495, + "learning_rate": 1.5002458431197658e-07, + "loss": 0.6261, + "step": 24706 + }, + { + "epoch": 1.7849621615763902, + "grad_norm": 6.730343490382579, + "learning_rate": 1.4992479813299904e-07, + "loss": 0.6163, + "step": 24707 + }, + { + "epoch": 1.7850344067765997, + "grad_norm": 8.101837948095916, + "learning_rate": 1.4982504412472622e-07, + "loss": 0.6272, + "step": 24708 + }, + { + "epoch": 1.7851066519768093, + "grad_norm": 7.334445204396774, + "learning_rate": 1.497253222885242e-07, + "loss": 0.6678, + "step": 24709 + }, + { + "epoch": 1.7851788971770188, + "grad_norm": 7.8416652872676895, + "learning_rate": 1.4962563262575724e-07, + "loss": 0.6251, + "step": 24710 + }, + { + "epoch": 1.7852511423772284, + "grad_norm": 7.648489869106975, + "learning_rate": 1.495259751377906e-07, + "loss": 0.6421, + "step": 24711 + }, + { + "epoch": 1.7853233875774377, + "grad_norm": 6.338851465591541, + "learning_rate": 1.494263498259882e-07, + "loss": 0.5683, + "step": 24712 + }, + { + "epoch": 1.7853956327776475, + "grad_norm": 7.806840723683361, + "learning_rate": 1.4932675669171393e-07, + "loss": 0.5339, + "step": 24713 + }, + { + "epoch": 1.7854678779778568, + "grad_norm": 6.2765205759856695, + "learning_rate": 1.4922719573633115e-07, + "loss": 0.6402, + "step": 24714 + }, + { + "epoch": 1.7855401231780663, + "grad_norm": 7.870053872453623, + "learning_rate": 1.4912766696120268e-07, + "loss": 0.6518, + "step": 24715 + }, + { + "epoch": 1.7856123683782759, + "grad_norm": 7.096314626227762, + "learning_rate": 1.4902817036769185e-07, + "loss": 0.5655, + "step": 24716 + }, + { + "epoch": 1.7856846135784854, + "grad_norm": 7.506460077933628, + "learning_rate": 1.4892870595715926e-07, + "loss": 0.5894, + "step": 24717 + }, + { + "epoch": 1.785756858778695, + "grad_norm": 6.996966385851538, + "learning_rate": 1.4882927373096738e-07, + "loss": 0.6044, + "step": 24718 + }, + { + "epoch": 1.7858291039789043, + "grad_norm": 6.3087839198310744, + "learning_rate": 1.4872987369047714e-07, + "loss": 0.6565, + "step": 24719 + }, + { + "epoch": 1.785901349179114, + "grad_norm": 7.980060045977245, + "learning_rate": 1.4863050583704964e-07, + "loss": 0.5847, + "step": 24720 + }, + { + "epoch": 1.7859735943793233, + "grad_norm": 7.89702589813967, + "learning_rate": 1.4853117017204433e-07, + "loss": 0.6308, + "step": 24721 + }, + { + "epoch": 1.786045839579533, + "grad_norm": 7.89141418740629, + "learning_rate": 1.4843186669682124e-07, + "loss": 0.6281, + "step": 24722 + }, + { + "epoch": 1.7861180847797424, + "grad_norm": 8.309757554686886, + "learning_rate": 1.483325954127407e-07, + "loss": 0.6608, + "step": 24723 + }, + { + "epoch": 1.786190329979952, + "grad_norm": 8.84461825256983, + "learning_rate": 1.4823335632116077e-07, + "loss": 0.6287, + "step": 24724 + }, + { + "epoch": 1.7862625751801615, + "grad_norm": 7.309351316546577, + "learning_rate": 1.481341494234398e-07, + "loss": 0.5809, + "step": 24725 + }, + { + "epoch": 1.7863348203803708, + "grad_norm": 7.244704844989589, + "learning_rate": 1.480349747209367e-07, + "loss": 0.5545, + "step": 24726 + }, + { + "epoch": 1.7864070655805806, + "grad_norm": 7.192773890950409, + "learning_rate": 1.4793583221500873e-07, + "loss": 0.6018, + "step": 24727 + }, + { + "epoch": 1.78647931078079, + "grad_norm": 7.75979192553119, + "learning_rate": 1.478367219070126e-07, + "loss": 0.6299, + "step": 24728 + }, + { + "epoch": 1.7865515559809995, + "grad_norm": 8.807592012746065, + "learning_rate": 1.4773764379830523e-07, + "loss": 0.5342, + "step": 24729 + }, + { + "epoch": 1.786623801181209, + "grad_norm": 6.664128615128508, + "learning_rate": 1.4763859789024336e-07, + "loss": 0.6318, + "step": 24730 + }, + { + "epoch": 1.7866960463814185, + "grad_norm": 7.054987600033121, + "learning_rate": 1.4753958418418256e-07, + "loss": 0.6143, + "step": 24731 + }, + { + "epoch": 1.786768291581628, + "grad_norm": 7.776479216059331, + "learning_rate": 1.4744060268147815e-07, + "loss": 0.5996, + "step": 24732 + }, + { + "epoch": 1.7868405367818374, + "grad_norm": 7.4322229465514615, + "learning_rate": 1.473416533834854e-07, + "loss": 0.5765, + "step": 24733 + }, + { + "epoch": 1.7869127819820472, + "grad_norm": 7.187397035607601, + "learning_rate": 1.4724273629155905e-07, + "loss": 0.6039, + "step": 24734 + }, + { + "epoch": 1.7869850271822565, + "grad_norm": 6.590636761339671, + "learning_rate": 1.4714385140705222e-07, + "loss": 0.6399, + "step": 24735 + }, + { + "epoch": 1.7870572723824663, + "grad_norm": 7.249444348319049, + "learning_rate": 1.4704499873131938e-07, + "loss": 0.585, + "step": 24736 + }, + { + "epoch": 1.7871295175826756, + "grad_norm": 6.6654648015354345, + "learning_rate": 1.469461782657136e-07, + "loss": 0.619, + "step": 24737 + }, + { + "epoch": 1.7872017627828851, + "grad_norm": 8.549228155979245, + "learning_rate": 1.46847390011588e-07, + "loss": 0.5942, + "step": 24738 + }, + { + "epoch": 1.7872740079830947, + "grad_norm": 7.9808886656075, + "learning_rate": 1.4674863397029422e-07, + "loss": 0.6057, + "step": 24739 + }, + { + "epoch": 1.787346253183304, + "grad_norm": 8.874652694568884, + "learning_rate": 1.4664991014318425e-07, + "loss": 0.6127, + "step": 24740 + }, + { + "epoch": 1.7874184983835137, + "grad_norm": 8.212888999330993, + "learning_rate": 1.465512185316098e-07, + "loss": 0.6727, + "step": 24741 + }, + { + "epoch": 1.787490743583723, + "grad_norm": 7.263425038660228, + "learning_rate": 1.46452559136922e-07, + "loss": 0.5504, + "step": 24742 + }, + { + "epoch": 1.7875629887839328, + "grad_norm": 6.479561584147025, + "learning_rate": 1.463539319604712e-07, + "loss": 0.5502, + "step": 24743 + }, + { + "epoch": 1.7876352339841421, + "grad_norm": 7.888853452122336, + "learning_rate": 1.462553370036074e-07, + "loss": 0.644, + "step": 24744 + }, + { + "epoch": 1.7877074791843517, + "grad_norm": 7.0301057011565335, + "learning_rate": 1.461567742676809e-07, + "loss": 0.5616, + "step": 24745 + }, + { + "epoch": 1.7877797243845612, + "grad_norm": 8.66148554158635, + "learning_rate": 1.4605824375404033e-07, + "loss": 0.6095, + "step": 24746 + }, + { + "epoch": 1.7878519695847706, + "grad_norm": 7.891857934693006, + "learning_rate": 1.4595974546403434e-07, + "loss": 0.4933, + "step": 24747 + }, + { + "epoch": 1.7879242147849803, + "grad_norm": 8.576487900904162, + "learning_rate": 1.4586127939901246e-07, + "loss": 0.6426, + "step": 24748 + }, + { + "epoch": 1.7879964599851896, + "grad_norm": 7.575594072369269, + "learning_rate": 1.4576284556032106e-07, + "loss": 0.6582, + "step": 24749 + }, + { + "epoch": 1.7880687051853994, + "grad_norm": 6.919018020194017, + "learning_rate": 1.4566444394930822e-07, + "loss": 0.6122, + "step": 24750 + }, + { + "epoch": 1.7881409503856087, + "grad_norm": 6.190641588230236, + "learning_rate": 1.4556607456732125e-07, + "loss": 0.5111, + "step": 24751 + }, + { + "epoch": 1.7882131955858183, + "grad_norm": 7.075538684839767, + "learning_rate": 1.4546773741570735e-07, + "loss": 0.6034, + "step": 24752 + }, + { + "epoch": 1.7882854407860278, + "grad_norm": 7.76665037205287, + "learning_rate": 1.453694324958113e-07, + "loss": 0.6022, + "step": 24753 + }, + { + "epoch": 1.7883576859862373, + "grad_norm": 6.84700192743416, + "learning_rate": 1.4527115980897954e-07, + "loss": 0.5624, + "step": 24754 + }, + { + "epoch": 1.7884299311864469, + "grad_norm": 7.080012122537181, + "learning_rate": 1.4517291935655709e-07, + "loss": 0.5473, + "step": 24755 + }, + { + "epoch": 1.7885021763866562, + "grad_norm": 6.649785444197235, + "learning_rate": 1.4507471113988953e-07, + "loss": 0.5845, + "step": 24756 + }, + { + "epoch": 1.788574421586866, + "grad_norm": 6.730096081618915, + "learning_rate": 1.4497653516032052e-07, + "loss": 0.6585, + "step": 24757 + }, + { + "epoch": 1.7886466667870753, + "grad_norm": 7.173834771896594, + "learning_rate": 1.448783914191937e-07, + "loss": 0.6484, + "step": 24758 + }, + { + "epoch": 1.7887189119872848, + "grad_norm": 7.585773815899492, + "learning_rate": 1.4478027991785354e-07, + "loss": 0.6472, + "step": 24759 + }, + { + "epoch": 1.7887911571874944, + "grad_norm": 8.22473437219489, + "learning_rate": 1.4468220065764232e-07, + "loss": 0.5725, + "step": 24760 + }, + { + "epoch": 1.788863402387704, + "grad_norm": 7.472167442275393, + "learning_rate": 1.4458415363990313e-07, + "loss": 0.5999, + "step": 24761 + }, + { + "epoch": 1.7889356475879135, + "grad_norm": 8.2548943075788, + "learning_rate": 1.4448613886597796e-07, + "loss": 0.5907, + "step": 24762 + }, + { + "epoch": 1.7890078927881228, + "grad_norm": 9.12224255011159, + "learning_rate": 1.4438815633720905e-07, + "loss": 0.6641, + "step": 24763 + }, + { + "epoch": 1.7890801379883325, + "grad_norm": 8.271077367864196, + "learning_rate": 1.4429020605493671e-07, + "loss": 0.657, + "step": 24764 + }, + { + "epoch": 1.7891523831885419, + "grad_norm": 9.303322180214291, + "learning_rate": 1.4419228802050268e-07, + "loss": 0.7069, + "step": 24765 + }, + { + "epoch": 1.7892246283887514, + "grad_norm": 8.246226979620078, + "learning_rate": 1.4409440223524722e-07, + "loss": 0.6148, + "step": 24766 + }, + { + "epoch": 1.789296873588961, + "grad_norm": 6.402062512889523, + "learning_rate": 1.439965487005099e-07, + "loss": 0.5482, + "step": 24767 + }, + { + "epoch": 1.7893691187891705, + "grad_norm": 7.218865992287127, + "learning_rate": 1.438987274176304e-07, + "loss": 0.534, + "step": 24768 + }, + { + "epoch": 1.78944136398938, + "grad_norm": 8.067292910560838, + "learning_rate": 1.4380093838794796e-07, + "loss": 0.6057, + "step": 24769 + }, + { + "epoch": 1.7895136091895893, + "grad_norm": 8.420569040818524, + "learning_rate": 1.4370318161280094e-07, + "loss": 0.5716, + "step": 24770 + }, + { + "epoch": 1.7895858543897991, + "grad_norm": 6.548028885206153, + "learning_rate": 1.436054570935283e-07, + "loss": 0.552, + "step": 24771 + }, + { + "epoch": 1.7896580995900084, + "grad_norm": 6.786481254657985, + "learning_rate": 1.43507764831467e-07, + "loss": 0.5829, + "step": 24772 + }, + { + "epoch": 1.789730344790218, + "grad_norm": 7.360655201482894, + "learning_rate": 1.4341010482795514e-07, + "loss": 0.6097, + "step": 24773 + }, + { + "epoch": 1.7898025899904275, + "grad_norm": 8.448455705547243, + "learning_rate": 1.4331247708432887e-07, + "loss": 0.6263, + "step": 24774 + }, + { + "epoch": 1.789874835190637, + "grad_norm": 6.891043514076125, + "learning_rate": 1.4321488160192493e-07, + "loss": 0.6436, + "step": 24775 + }, + { + "epoch": 1.7899470803908466, + "grad_norm": 8.698729358708356, + "learning_rate": 1.4311731838207916e-07, + "loss": 0.5867, + "step": 24776 + }, + { + "epoch": 1.790019325591056, + "grad_norm": 8.746365037384816, + "learning_rate": 1.43019787426128e-07, + "loss": 0.6477, + "step": 24777 + }, + { + "epoch": 1.7900915707912657, + "grad_norm": 6.163646301993974, + "learning_rate": 1.429222887354048e-07, + "loss": 0.5003, + "step": 24778 + }, + { + "epoch": 1.790163815991475, + "grad_norm": 6.826369699386715, + "learning_rate": 1.4282482231124578e-07, + "loss": 0.5788, + "step": 24779 + }, + { + "epoch": 1.7902360611916845, + "grad_norm": 7.844532961731712, + "learning_rate": 1.4272738815498483e-07, + "loss": 0.608, + "step": 24780 + }, + { + "epoch": 1.790308306391894, + "grad_norm": 7.359335346479556, + "learning_rate": 1.4262998626795588e-07, + "loss": 0.5688, + "step": 24781 + }, + { + "epoch": 1.7903805515921036, + "grad_norm": 7.295724520769612, + "learning_rate": 1.4253261665149177e-07, + "loss": 0.633, + "step": 24782 + }, + { + "epoch": 1.7904527967923132, + "grad_norm": 8.314605675647156, + "learning_rate": 1.4243527930692585e-07, + "loss": 0.6409, + "step": 24783 + }, + { + "epoch": 1.7905250419925225, + "grad_norm": 8.006167895143992, + "learning_rate": 1.4233797423559043e-07, + "loss": 0.6337, + "step": 24784 + }, + { + "epoch": 1.7905972871927323, + "grad_norm": 7.847732809404429, + "learning_rate": 1.4224070143881747e-07, + "loss": 0.6177, + "step": 24785 + }, + { + "epoch": 1.7906695323929416, + "grad_norm": 6.354070677318854, + "learning_rate": 1.421434609179384e-07, + "loss": 0.583, + "step": 24786 + }, + { + "epoch": 1.7907417775931511, + "grad_norm": 7.065677974321813, + "learning_rate": 1.4204625267428467e-07, + "loss": 0.575, + "step": 24787 + }, + { + "epoch": 1.7908140227933607, + "grad_norm": 6.735550567620631, + "learning_rate": 1.419490767091869e-07, + "loss": 0.6065, + "step": 24788 + }, + { + "epoch": 1.7908862679935702, + "grad_norm": 8.097632224548295, + "learning_rate": 1.418519330239754e-07, + "loss": 0.6419, + "step": 24789 + }, + { + "epoch": 1.7909585131937797, + "grad_norm": 7.375220602583192, + "learning_rate": 1.417548216199799e-07, + "loss": 0.5636, + "step": 24790 + }, + { + "epoch": 1.791030758393989, + "grad_norm": 7.026439055074607, + "learning_rate": 1.4165774249853022e-07, + "loss": 0.5204, + "step": 24791 + }, + { + "epoch": 1.7911030035941988, + "grad_norm": 7.071346717349719, + "learning_rate": 1.4156069566095443e-07, + "loss": 0.5902, + "step": 24792 + }, + { + "epoch": 1.7911752487944081, + "grad_norm": 8.12868240246214, + "learning_rate": 1.4146368110858177e-07, + "loss": 0.6632, + "step": 24793 + }, + { + "epoch": 1.7912474939946177, + "grad_norm": 6.939226365296648, + "learning_rate": 1.413666988427398e-07, + "loss": 0.5566, + "step": 24794 + }, + { + "epoch": 1.7913197391948272, + "grad_norm": 5.412562571245388, + "learning_rate": 1.4126974886475685e-07, + "loss": 0.5554, + "step": 24795 + }, + { + "epoch": 1.7913919843950368, + "grad_norm": 6.385897429032281, + "learning_rate": 1.4117283117595914e-07, + "loss": 0.5739, + "step": 24796 + }, + { + "epoch": 1.7914642295952463, + "grad_norm": 7.594455262388599, + "learning_rate": 1.4107594577767392e-07, + "loss": 0.5703, + "step": 24797 + }, + { + "epoch": 1.7915364747954556, + "grad_norm": 6.885706662194735, + "learning_rate": 1.4097909267122734e-07, + "loss": 0.5794, + "step": 24798 + }, + { + "epoch": 1.7916087199956654, + "grad_norm": 6.729077727337296, + "learning_rate": 1.4088227185794557e-07, + "loss": 0.5733, + "step": 24799 + }, + { + "epoch": 1.7916809651958747, + "grad_norm": 8.557508809884451, + "learning_rate": 1.4078548333915364e-07, + "loss": 0.5564, + "step": 24800 + }, + { + "epoch": 1.7917532103960843, + "grad_norm": 8.2673481116864, + "learning_rate": 1.4068872711617638e-07, + "loss": 0.6012, + "step": 24801 + }, + { + "epoch": 1.7918254555962938, + "grad_norm": 6.205717145743009, + "learning_rate": 1.4059200319033934e-07, + "loss": 0.5659, + "step": 24802 + }, + { + "epoch": 1.7918977007965033, + "grad_norm": 7.46556706619618, + "learning_rate": 1.4049531156296507e-07, + "loss": 0.6342, + "step": 24803 + }, + { + "epoch": 1.7919699459967129, + "grad_norm": 7.101671274692938, + "learning_rate": 1.4039865223537814e-07, + "loss": 0.6059, + "step": 24804 + }, + { + "epoch": 1.7920421911969222, + "grad_norm": 7.549569922317621, + "learning_rate": 1.4030202520890185e-07, + "loss": 0.6402, + "step": 24805 + }, + { + "epoch": 1.792114436397132, + "grad_norm": 8.336434918822418, + "learning_rate": 1.4020543048485823e-07, + "loss": 0.632, + "step": 24806 + }, + { + "epoch": 1.7921866815973413, + "grad_norm": 6.534949610077342, + "learning_rate": 1.4010886806456985e-07, + "loss": 0.6055, + "step": 24807 + }, + { + "epoch": 1.7922589267975508, + "grad_norm": 7.310822123207998, + "learning_rate": 1.4001233794935898e-07, + "loss": 0.5766, + "step": 24808 + }, + { + "epoch": 1.7923311719977604, + "grad_norm": 9.11682306000942, + "learning_rate": 1.3991584014054703e-07, + "loss": 0.6496, + "step": 24809 + }, + { + "epoch": 1.79240341719797, + "grad_norm": 7.190255076588469, + "learning_rate": 1.3981937463945466e-07, + "loss": 0.6388, + "step": 24810 + }, + { + "epoch": 1.7924756623981795, + "grad_norm": 5.982739097959882, + "learning_rate": 1.3972294144740218e-07, + "loss": 0.5286, + "step": 24811 + }, + { + "epoch": 1.7925479075983888, + "grad_norm": 7.325209047634937, + "learning_rate": 1.3962654056571018e-07, + "loss": 0.5398, + "step": 24812 + }, + { + "epoch": 1.7926201527985985, + "grad_norm": 7.232911326437199, + "learning_rate": 1.3953017199569847e-07, + "loss": 0.5383, + "step": 24813 + }, + { + "epoch": 1.7926923979988079, + "grad_norm": 9.103701046371153, + "learning_rate": 1.394338357386857e-07, + "loss": 0.6265, + "step": 24814 + }, + { + "epoch": 1.7927646431990176, + "grad_norm": 8.18131261091994, + "learning_rate": 1.3933753179599052e-07, + "loss": 0.6793, + "step": 24815 + }, + { + "epoch": 1.792836888399227, + "grad_norm": 7.502162367314298, + "learning_rate": 1.3924126016893192e-07, + "loss": 0.6429, + "step": 24816 + }, + { + "epoch": 1.7929091335994365, + "grad_norm": 7.793106544442794, + "learning_rate": 1.3914502085882715e-07, + "loss": 0.5852, + "step": 24817 + }, + { + "epoch": 1.792981378799646, + "grad_norm": 7.129064103560563, + "learning_rate": 1.3904881386699405e-07, + "loss": 0.5616, + "step": 24818 + }, + { + "epoch": 1.7930536239998554, + "grad_norm": 6.36850422543127, + "learning_rate": 1.389526391947496e-07, + "loss": 0.6398, + "step": 24819 + }, + { + "epoch": 1.7931258692000651, + "grad_norm": 6.450973780261632, + "learning_rate": 1.3885649684341086e-07, + "loss": 0.6152, + "step": 24820 + }, + { + "epoch": 1.7931981144002744, + "grad_norm": 7.181771558076902, + "learning_rate": 1.3876038681429256e-07, + "loss": 0.5884, + "step": 24821 + }, + { + "epoch": 1.7932703596004842, + "grad_norm": 7.994542644652536, + "learning_rate": 1.3866430910871142e-07, + "loss": 0.5775, + "step": 24822 + }, + { + "epoch": 1.7933426048006935, + "grad_norm": 6.232586214438606, + "learning_rate": 1.385682637279828e-07, + "loss": 0.6015, + "step": 24823 + }, + { + "epoch": 1.793414850000903, + "grad_norm": 6.974068156244906, + "learning_rate": 1.3847225067342068e-07, + "loss": 0.5688, + "step": 24824 + }, + { + "epoch": 1.7934870952011126, + "grad_norm": 7.098322936235014, + "learning_rate": 1.3837626994634006e-07, + "loss": 0.6356, + "step": 24825 + }, + { + "epoch": 1.793559340401322, + "grad_norm": 7.056104076353192, + "learning_rate": 1.3828032154805437e-07, + "loss": 0.594, + "step": 24826 + }, + { + "epoch": 1.7936315856015317, + "grad_norm": 8.0829077015404, + "learning_rate": 1.3818440547987754e-07, + "loss": 0.611, + "step": 24827 + }, + { + "epoch": 1.793703830801741, + "grad_norm": 7.064250357799883, + "learning_rate": 1.3808852174312214e-07, + "loss": 0.645, + "step": 24828 + }, + { + "epoch": 1.7937760760019508, + "grad_norm": 6.840734644917732, + "learning_rate": 1.3799267033910102e-07, + "loss": 0.5506, + "step": 24829 + }, + { + "epoch": 1.79384832120216, + "grad_norm": 8.277434078081939, + "learning_rate": 1.3789685126912672e-07, + "loss": 0.6337, + "step": 24830 + }, + { + "epoch": 1.7939205664023696, + "grad_norm": 8.139728050094314, + "learning_rate": 1.3780106453451013e-07, + "loss": 0.6057, + "step": 24831 + }, + { + "epoch": 1.7939928116025792, + "grad_norm": 6.9005653149767205, + "learning_rate": 1.3770531013656274e-07, + "loss": 0.5271, + "step": 24832 + }, + { + "epoch": 1.7940650568027887, + "grad_norm": 7.239449025470296, + "learning_rate": 1.376095880765954e-07, + "loss": 0.5788, + "step": 24833 + }, + { + "epoch": 1.7941373020029983, + "grad_norm": 8.147934690562982, + "learning_rate": 1.3751389835591906e-07, + "loss": 0.6606, + "step": 24834 + }, + { + "epoch": 1.7942095472032076, + "grad_norm": 7.678636853519162, + "learning_rate": 1.3741824097584206e-07, + "loss": 0.5855, + "step": 24835 + }, + { + "epoch": 1.7942817924034173, + "grad_norm": 6.334157522012843, + "learning_rate": 1.373226159376756e-07, + "loss": 0.5753, + "step": 24836 + }, + { + "epoch": 1.7943540376036267, + "grad_norm": 7.501417916097805, + "learning_rate": 1.372270232427278e-07, + "loss": 0.669, + "step": 24837 + }, + { + "epoch": 1.7944262828038362, + "grad_norm": 7.984458848247966, + "learning_rate": 1.3713146289230818e-07, + "loss": 0.6043, + "step": 24838 + }, + { + "epoch": 1.7944985280040457, + "grad_norm": 8.11377962479908, + "learning_rate": 1.3703593488772344e-07, + "loss": 0.6308, + "step": 24839 + }, + { + "epoch": 1.7945707732042553, + "grad_norm": 8.45262497076999, + "learning_rate": 1.3694043923028228e-07, + "loss": 0.6311, + "step": 24840 + }, + { + "epoch": 1.7946430184044648, + "grad_norm": 7.586965787831303, + "learning_rate": 1.3684497592129198e-07, + "loss": 0.6772, + "step": 24841 + }, + { + "epoch": 1.7947152636046741, + "grad_norm": 6.205166343899956, + "learning_rate": 1.3674954496205872e-07, + "loss": 0.5575, + "step": 24842 + }, + { + "epoch": 1.794787508804884, + "grad_norm": 6.2283802705581275, + "learning_rate": 1.3665414635388923e-07, + "loss": 0.5444, + "step": 24843 + }, + { + "epoch": 1.7948597540050932, + "grad_norm": 8.483366904601377, + "learning_rate": 1.3655878009808943e-07, + "loss": 0.5735, + "step": 24844 + }, + { + "epoch": 1.7949319992053028, + "grad_norm": 7.886943902726572, + "learning_rate": 1.364634461959649e-07, + "loss": 0.5946, + "step": 24845 + }, + { + "epoch": 1.7950042444055123, + "grad_norm": 7.683704710926676, + "learning_rate": 1.3636814464882077e-07, + "loss": 0.5713, + "step": 24846 + }, + { + "epoch": 1.7950764896057219, + "grad_norm": 6.893303383049503, + "learning_rate": 1.362728754579612e-07, + "loss": 0.5518, + "step": 24847 + }, + { + "epoch": 1.7951487348059314, + "grad_norm": 7.153570348125888, + "learning_rate": 1.3617763862469103e-07, + "loss": 0.5138, + "step": 24848 + }, + { + "epoch": 1.7952209800061407, + "grad_norm": 7.878064240611345, + "learning_rate": 1.3608243415031336e-07, + "loss": 0.5736, + "step": 24849 + }, + { + "epoch": 1.7952932252063505, + "grad_norm": 9.249743896238213, + "learning_rate": 1.359872620361319e-07, + "loss": 0.6575, + "step": 24850 + }, + { + "epoch": 1.7953654704065598, + "grad_norm": 6.663221836171831, + "learning_rate": 1.3589212228344888e-07, + "loss": 0.5849, + "step": 24851 + }, + { + "epoch": 1.7954377156067693, + "grad_norm": 6.191889275322319, + "learning_rate": 1.3579701489356778e-07, + "loss": 0.5612, + "step": 24852 + }, + { + "epoch": 1.7955099608069789, + "grad_norm": 7.0340348217807, + "learning_rate": 1.3570193986778946e-07, + "loss": 0.6236, + "step": 24853 + }, + { + "epoch": 1.7955822060071884, + "grad_norm": 7.524853534477068, + "learning_rate": 1.3560689720741566e-07, + "loss": 0.6023, + "step": 24854 + }, + { + "epoch": 1.795654451207398, + "grad_norm": 7.211373131055051, + "learning_rate": 1.3551188691374755e-07, + "loss": 0.5668, + "step": 24855 + }, + { + "epoch": 1.7957266964076073, + "grad_norm": 8.68145981075768, + "learning_rate": 1.3541690898808607e-07, + "loss": 0.5949, + "step": 24856 + }, + { + "epoch": 1.795798941607817, + "grad_norm": 8.080858251483342, + "learning_rate": 1.35321963431731e-07, + "loss": 0.5586, + "step": 24857 + }, + { + "epoch": 1.7958711868080264, + "grad_norm": 8.438687615348348, + "learning_rate": 1.352270502459821e-07, + "loss": 0.5855, + "step": 24858 + }, + { + "epoch": 1.795943432008236, + "grad_norm": 83.2586331859946, + "learning_rate": 1.351321694321392e-07, + "loss": 0.7373, + "step": 24859 + }, + { + "epoch": 1.7960156772084455, + "grad_norm": 6.841428876950278, + "learning_rate": 1.3503732099150041e-07, + "loss": 0.5413, + "step": 24860 + }, + { + "epoch": 1.796087922408655, + "grad_norm": 8.596176415267566, + "learning_rate": 1.3494250492536443e-07, + "loss": 0.6455, + "step": 24861 + }, + { + "epoch": 1.7961601676088645, + "grad_norm": 8.866509042739452, + "learning_rate": 1.348477212350291e-07, + "loss": 0.599, + "step": 24862 + }, + { + "epoch": 1.7962324128090739, + "grad_norm": 6.568279809211816, + "learning_rate": 1.3475296992179198e-07, + "loss": 0.5446, + "step": 24863 + }, + { + "epoch": 1.7963046580092836, + "grad_norm": 7.806985602679145, + "learning_rate": 1.346582509869504e-07, + "loss": 0.606, + "step": 24864 + }, + { + "epoch": 1.796376903209493, + "grad_norm": 7.109259904202268, + "learning_rate": 1.345635644318008e-07, + "loss": 0.6177, + "step": 24865 + }, + { + "epoch": 1.7964491484097025, + "grad_norm": 7.205480757166348, + "learning_rate": 1.3446891025763937e-07, + "loss": 0.6359, + "step": 24866 + }, + { + "epoch": 1.796521393609912, + "grad_norm": 7.297339741122328, + "learning_rate": 1.3437428846576173e-07, + "loss": 0.559, + "step": 24867 + }, + { + "epoch": 1.7965936388101216, + "grad_norm": 7.5268840877101475, + "learning_rate": 1.342796990574635e-07, + "loss": 0.5532, + "step": 24868 + }, + { + "epoch": 1.7966658840103311, + "grad_norm": 6.797599806450958, + "learning_rate": 1.34185142034039e-07, + "loss": 0.5853, + "step": 24869 + }, + { + "epoch": 1.7967381292105404, + "grad_norm": 5.744042004955004, + "learning_rate": 1.3409061739678347e-07, + "loss": 0.6124, + "step": 24870 + }, + { + "epoch": 1.7968103744107502, + "grad_norm": 7.464696063263027, + "learning_rate": 1.339961251469901e-07, + "loss": 0.5007, + "step": 24871 + }, + { + "epoch": 1.7968826196109595, + "grad_norm": 8.144887207449242, + "learning_rate": 1.339016652859526e-07, + "loss": 0.6695, + "step": 24872 + }, + { + "epoch": 1.796954864811169, + "grad_norm": 8.147537196338698, + "learning_rate": 1.3380723781496408e-07, + "loss": 0.6448, + "step": 24873 + }, + { + "epoch": 1.7970271100113786, + "grad_norm": 8.961143537844897, + "learning_rate": 1.3371284273531738e-07, + "loss": 0.6098, + "step": 24874 + }, + { + "epoch": 1.7970993552115881, + "grad_norm": 6.374270229082631, + "learning_rate": 1.3361848004830424e-07, + "loss": 0.6349, + "step": 24875 + }, + { + "epoch": 1.7971716004117977, + "grad_norm": 6.770145166100086, + "learning_rate": 1.33524149755217e-07, + "loss": 0.5948, + "step": 24876 + }, + { + "epoch": 1.797243845612007, + "grad_norm": 8.05842144144433, + "learning_rate": 1.3342985185734707e-07, + "loss": 0.5951, + "step": 24877 + }, + { + "epoch": 1.7973160908122168, + "grad_norm": 7.3208252555210604, + "learning_rate": 1.3333558635598458e-07, + "loss": 0.6024, + "step": 24878 + }, + { + "epoch": 1.797388336012426, + "grad_norm": 8.923929649304851, + "learning_rate": 1.3324135325242043e-07, + "loss": 0.6224, + "step": 24879 + }, + { + "epoch": 1.7974605812126356, + "grad_norm": 7.4256772230187185, + "learning_rate": 1.331471525479447e-07, + "loss": 0.6345, + "step": 24880 + }, + { + "epoch": 1.7975328264128452, + "grad_norm": 8.347037021516929, + "learning_rate": 1.3305298424384632e-07, + "loss": 0.6362, + "step": 24881 + }, + { + "epoch": 1.7976050716130547, + "grad_norm": 6.756627820202355, + "learning_rate": 1.329588483414146e-07, + "loss": 0.6172, + "step": 24882 + }, + { + "epoch": 1.7976773168132643, + "grad_norm": 7.373110771660214, + "learning_rate": 1.3286474484193872e-07, + "loss": 0.57, + "step": 24883 + }, + { + "epoch": 1.7977495620134736, + "grad_norm": 7.36058938278802, + "learning_rate": 1.327706737467063e-07, + "loss": 0.6219, + "step": 24884 + }, + { + "epoch": 1.7978218072136833, + "grad_norm": 6.854333228161988, + "learning_rate": 1.3267663505700523e-07, + "loss": 0.5581, + "step": 24885 + }, + { + "epoch": 1.7978940524138927, + "grad_norm": 7.663568976607995, + "learning_rate": 1.325826287741233e-07, + "loss": 0.6222, + "step": 24886 + }, + { + "epoch": 1.7979662976141024, + "grad_norm": 7.133888455951153, + "learning_rate": 1.3248865489934677e-07, + "loss": 0.6298, + "step": 24887 + }, + { + "epoch": 1.7980385428143117, + "grad_norm": 8.771301063757544, + "learning_rate": 1.323947134339626e-07, + "loss": 0.6498, + "step": 24888 + }, + { + "epoch": 1.7981107880145213, + "grad_norm": 6.372886157661663, + "learning_rate": 1.323008043792562e-07, + "loss": 0.615, + "step": 24889 + }, + { + "epoch": 1.7981830332147308, + "grad_norm": 7.9115710540302135, + "learning_rate": 1.3220692773651316e-07, + "loss": 0.6637, + "step": 24890 + }, + { + "epoch": 1.7982552784149402, + "grad_norm": 8.087480510234196, + "learning_rate": 1.3211308350701918e-07, + "loss": 0.5994, + "step": 24891 + }, + { + "epoch": 1.79832752361515, + "grad_norm": 7.944409104322719, + "learning_rate": 1.320192716920582e-07, + "loss": 0.5702, + "step": 24892 + }, + { + "epoch": 1.7983997688153592, + "grad_norm": 6.81425131588891, + "learning_rate": 1.31925492292915e-07, + "loss": 0.5723, + "step": 24893 + }, + { + "epoch": 1.798472014015569, + "grad_norm": 8.852682933280029, + "learning_rate": 1.3183174531087307e-07, + "loss": 0.5983, + "step": 24894 + }, + { + "epoch": 1.7985442592157783, + "grad_norm": 8.547875159287806, + "learning_rate": 1.317380307472163e-07, + "loss": 0.5894, + "step": 24895 + }, + { + "epoch": 1.7986165044159879, + "grad_norm": 8.739632268017676, + "learning_rate": 1.316443486032265e-07, + "loss": 0.5934, + "step": 24896 + }, + { + "epoch": 1.7986887496161974, + "grad_norm": 7.677778097846758, + "learning_rate": 1.315506988801865e-07, + "loss": 0.6269, + "step": 24897 + }, + { + "epoch": 1.7987609948164067, + "grad_norm": 7.237224240229754, + "learning_rate": 1.314570815793792e-07, + "loss": 0.579, + "step": 24898 + }, + { + "epoch": 1.7988332400166165, + "grad_norm": 7.706778814606374, + "learning_rate": 1.3136349670208465e-07, + "loss": 0.6487, + "step": 24899 + }, + { + "epoch": 1.7989054852168258, + "grad_norm": 7.355014424364998, + "learning_rate": 1.312699442495849e-07, + "loss": 0.5885, + "step": 24900 + }, + { + "epoch": 1.7989777304170356, + "grad_norm": 7.394235128447406, + "learning_rate": 1.3117642422316058e-07, + "loss": 0.6567, + "step": 24901 + }, + { + "epoch": 1.799049975617245, + "grad_norm": 7.493819423987088, + "learning_rate": 1.3108293662409148e-07, + "loss": 0.5762, + "step": 24902 + }, + { + "epoch": 1.7991222208174544, + "grad_norm": 7.019089962478554, + "learning_rate": 1.3098948145365775e-07, + "loss": 0.5605, + "step": 24903 + }, + { + "epoch": 1.799194466017664, + "grad_norm": 7.998101724476149, + "learning_rate": 1.308960587131386e-07, + "loss": 0.6109, + "step": 24904 + }, + { + "epoch": 1.7992667112178735, + "grad_norm": 8.742002047156044, + "learning_rate": 1.30802668403813e-07, + "loss": 0.6814, + "step": 24905 + }, + { + "epoch": 1.799338956418083, + "grad_norm": 7.526290336821378, + "learning_rate": 1.307093105269594e-07, + "loss": 0.6115, + "step": 24906 + }, + { + "epoch": 1.7994112016182924, + "grad_norm": 7.18641189963468, + "learning_rate": 1.3061598508385537e-07, + "loss": 0.5648, + "step": 24907 + }, + { + "epoch": 1.7994834468185021, + "grad_norm": 6.435154756168638, + "learning_rate": 1.3052269207577907e-07, + "loss": 0.5862, + "step": 24908 + }, + { + "epoch": 1.7995556920187115, + "grad_norm": 6.846367601043146, + "learning_rate": 1.3042943150400748e-07, + "loss": 0.6055, + "step": 24909 + }, + { + "epoch": 1.799627937218921, + "grad_norm": 9.894282576202981, + "learning_rate": 1.303362033698169e-07, + "loss": 0.6279, + "step": 24910 + }, + { + "epoch": 1.7997001824191305, + "grad_norm": 8.43469689042237, + "learning_rate": 1.3024300767448345e-07, + "loss": 0.5734, + "step": 24911 + }, + { + "epoch": 1.79977242761934, + "grad_norm": 6.951009432809724, + "learning_rate": 1.3014984441928364e-07, + "loss": 0.5705, + "step": 24912 + }, + { + "epoch": 1.7998446728195496, + "grad_norm": 7.078145479757899, + "learning_rate": 1.3005671360549227e-07, + "loss": 0.6211, + "step": 24913 + }, + { + "epoch": 1.799916918019759, + "grad_norm": 6.456725217717522, + "learning_rate": 1.2996361523438417e-07, + "loss": 0.5558, + "step": 24914 + }, + { + "epoch": 1.7999891632199687, + "grad_norm": 7.674935594717173, + "learning_rate": 1.2987054930723414e-07, + "loss": 0.5523, + "step": 24915 + }, + { + "epoch": 1.800061408420178, + "grad_norm": 6.607637506180703, + "learning_rate": 1.2977751582531616e-07, + "loss": 0.5378, + "step": 24916 + }, + { + "epoch": 1.8001336536203876, + "grad_norm": 7.013954422479875, + "learning_rate": 1.296845147899034e-07, + "loss": 0.5944, + "step": 24917 + }, + { + "epoch": 1.8002058988205971, + "grad_norm": 5.505603536701924, + "learning_rate": 1.2959154620226928e-07, + "loss": 0.5057, + "step": 24918 + }, + { + "epoch": 1.8002781440208067, + "grad_norm": 8.159630787294295, + "learning_rate": 1.2949861006368637e-07, + "loss": 0.6721, + "step": 24919 + }, + { + "epoch": 1.8003503892210162, + "grad_norm": 7.552541938975926, + "learning_rate": 1.2940570637542698e-07, + "loss": 0.624, + "step": 24920 + }, + { + "epoch": 1.8004226344212255, + "grad_norm": 7.395641342321693, + "learning_rate": 1.2931283513876263e-07, + "loss": 0.6328, + "step": 24921 + }, + { + "epoch": 1.8004948796214353, + "grad_norm": 6.902966024934952, + "learning_rate": 1.2921999635496507e-07, + "loss": 0.5797, + "step": 24922 + }, + { + "epoch": 1.8005671248216446, + "grad_norm": 7.778278810015612, + "learning_rate": 1.291271900253052e-07, + "loss": 0.5455, + "step": 24923 + }, + { + "epoch": 1.8006393700218541, + "grad_norm": 7.233697913104118, + "learning_rate": 1.2903441615105288e-07, + "loss": 0.642, + "step": 24924 + }, + { + "epoch": 1.8007116152220637, + "grad_norm": 6.893717030976382, + "learning_rate": 1.2894167473347847e-07, + "loss": 0.5879, + "step": 24925 + }, + { + "epoch": 1.8007838604222732, + "grad_norm": 6.584854272201687, + "learning_rate": 1.2884896577385147e-07, + "loss": 0.5743, + "step": 24926 + }, + { + "epoch": 1.8008561056224828, + "grad_norm": 7.5311180514731415, + "learning_rate": 1.2875628927344146e-07, + "loss": 0.6688, + "step": 24927 + }, + { + "epoch": 1.800928350822692, + "grad_norm": 9.843548097507789, + "learning_rate": 1.2866364523351632e-07, + "loss": 0.6704, + "step": 24928 + }, + { + "epoch": 1.8010005960229019, + "grad_norm": 8.040449402808346, + "learning_rate": 1.2857103365534445e-07, + "loss": 0.5795, + "step": 24929 + }, + { + "epoch": 1.8010728412231112, + "grad_norm": 6.400242812795788, + "learning_rate": 1.2847845454019402e-07, + "loss": 0.5891, + "step": 24930 + }, + { + "epoch": 1.8011450864233207, + "grad_norm": 8.332082069553175, + "learning_rate": 1.2838590788933237e-07, + "loss": 0.5627, + "step": 24931 + }, + { + "epoch": 1.8012173316235303, + "grad_norm": 7.371949162904558, + "learning_rate": 1.2829339370402593e-07, + "loss": 0.6196, + "step": 24932 + }, + { + "epoch": 1.8012895768237398, + "grad_norm": 6.89668626686926, + "learning_rate": 1.2820091198554151e-07, + "loss": 0.5366, + "step": 24933 + }, + { + "epoch": 1.8013618220239493, + "grad_norm": 8.360845190517804, + "learning_rate": 1.281084627351456e-07, + "loss": 0.6465, + "step": 24934 + }, + { + "epoch": 1.8014340672241587, + "grad_norm": 8.172772622636005, + "learning_rate": 1.2801604595410244e-07, + "loss": 0.6137, + "step": 24935 + }, + { + "epoch": 1.8015063124243684, + "grad_norm": 5.386062947298854, + "learning_rate": 1.2792366164367825e-07, + "loss": 0.5603, + "step": 24936 + }, + { + "epoch": 1.8015785576245777, + "grad_norm": 6.9666476649461755, + "learning_rate": 1.278313098051373e-07, + "loss": 0.5861, + "step": 24937 + }, + { + "epoch": 1.8016508028247873, + "grad_norm": 6.637407021311314, + "learning_rate": 1.2773899043974387e-07, + "loss": 0.6009, + "step": 24938 + }, + { + "epoch": 1.8017230480249968, + "grad_norm": 8.307136354020974, + "learning_rate": 1.2764670354876135e-07, + "loss": 0.6068, + "step": 24939 + }, + { + "epoch": 1.8017952932252064, + "grad_norm": 7.244547404914318, + "learning_rate": 1.275544491334535e-07, + "loss": 0.6455, + "step": 24940 + }, + { + "epoch": 1.801867538425416, + "grad_norm": 7.163514560390981, + "learning_rate": 1.2746222719508345e-07, + "loss": 0.6151, + "step": 24941 + }, + { + "epoch": 1.8019397836256252, + "grad_norm": 7.629864001180733, + "learning_rate": 1.2737003773491324e-07, + "loss": 0.641, + "step": 24942 + }, + { + "epoch": 1.802012028825835, + "grad_norm": 7.337043979387663, + "learning_rate": 1.2727788075420494e-07, + "loss": 0.616, + "step": 24943 + }, + { + "epoch": 1.8020842740260443, + "grad_norm": 7.538856811879988, + "learning_rate": 1.2718575625422032e-07, + "loss": 0.6344, + "step": 24944 + }, + { + "epoch": 1.8021565192262539, + "grad_norm": 7.921853089913611, + "learning_rate": 1.2709366423622027e-07, + "loss": 0.5521, + "step": 24945 + }, + { + "epoch": 1.8022287644264634, + "grad_norm": 7.025547546551615, + "learning_rate": 1.270016047014655e-07, + "loss": 0.6253, + "step": 24946 + }, + { + "epoch": 1.802301009626673, + "grad_norm": 6.749155910131843, + "learning_rate": 1.269095776512161e-07, + "loss": 0.5938, + "step": 24947 + }, + { + "epoch": 1.8023732548268825, + "grad_norm": 8.33236775208326, + "learning_rate": 1.2681758308673214e-07, + "loss": 0.5988, + "step": 24948 + }, + { + "epoch": 1.8024455000270918, + "grad_norm": 7.105319230748862, + "learning_rate": 1.2672562100927265e-07, + "loss": 0.6156, + "step": 24949 + }, + { + "epoch": 1.8025177452273016, + "grad_norm": 6.112121120950399, + "learning_rate": 1.266336914200969e-07, + "loss": 0.5377, + "step": 24950 + }, + { + "epoch": 1.802589990427511, + "grad_norm": 7.12634344987499, + "learning_rate": 1.2654179432046276e-07, + "loss": 0.5743, + "step": 24951 + }, + { + "epoch": 1.8026622356277204, + "grad_norm": 8.432491822407, + "learning_rate": 1.2644992971162923e-07, + "loss": 0.6466, + "step": 24952 + }, + { + "epoch": 1.80273448082793, + "grad_norm": 6.4087793334742, + "learning_rate": 1.263580975948528e-07, + "loss": 0.5989, + "step": 24953 + }, + { + "epoch": 1.8028067260281395, + "grad_norm": 9.453333223429878, + "learning_rate": 1.2626629797139106e-07, + "loss": 0.5836, + "step": 24954 + }, + { + "epoch": 1.802878971228349, + "grad_norm": 6.63783661586852, + "learning_rate": 1.261745308425008e-07, + "loss": 0.5854, + "step": 24955 + }, + { + "epoch": 1.8029512164285584, + "grad_norm": 8.492688344102744, + "learning_rate": 1.260827962094377e-07, + "loss": 0.6249, + "step": 24956 + }, + { + "epoch": 1.8030234616287681, + "grad_norm": 7.612405349819526, + "learning_rate": 1.2599109407345822e-07, + "loss": 0.5361, + "step": 24957 + }, + { + "epoch": 1.8030957068289775, + "grad_norm": 7.625261145950695, + "learning_rate": 1.2589942443581694e-07, + "loss": 0.5344, + "step": 24958 + }, + { + "epoch": 1.8031679520291872, + "grad_norm": 7.0134295658331345, + "learning_rate": 1.258077872977695e-07, + "loss": 0.5801, + "step": 24959 + }, + { + "epoch": 1.8032401972293965, + "grad_norm": 7.768104326296007, + "learning_rate": 1.257161826605699e-07, + "loss": 0.5664, + "step": 24960 + }, + { + "epoch": 1.803312442429606, + "grad_norm": 6.969328851992622, + "learning_rate": 1.2562461052547214e-07, + "loss": 0.5992, + "step": 24961 + }, + { + "epoch": 1.8033846876298156, + "grad_norm": 6.17442435233955, + "learning_rate": 1.2553307089373022e-07, + "loss": 0.5201, + "step": 24962 + }, + { + "epoch": 1.803456932830025, + "grad_norm": 8.365549705488576, + "learning_rate": 1.2544156376659676e-07, + "loss": 0.5714, + "step": 24963 + }, + { + "epoch": 1.8035291780302347, + "grad_norm": 7.456879571777866, + "learning_rate": 1.2535008914532438e-07, + "loss": 0.5993, + "step": 24964 + }, + { + "epoch": 1.803601423230444, + "grad_norm": 8.912227297940523, + "learning_rate": 1.2525864703116564e-07, + "loss": 0.5891, + "step": 24965 + }, + { + "epoch": 1.8036736684306538, + "grad_norm": 6.545735480801156, + "learning_rate": 1.2516723742537233e-07, + "loss": 0.6251, + "step": 24966 + }, + { + "epoch": 1.8037459136308631, + "grad_norm": 6.746407011762585, + "learning_rate": 1.2507586032919517e-07, + "loss": 0.572, + "step": 24967 + }, + { + "epoch": 1.8038181588310727, + "grad_norm": 8.222614496627958, + "learning_rate": 1.2498451574388532e-07, + "loss": 0.5667, + "step": 24968 + }, + { + "epoch": 1.8038904040312822, + "grad_norm": 7.703646690546217, + "learning_rate": 1.2489320367069374e-07, + "loss": 0.5794, + "step": 24969 + }, + { + "epoch": 1.8039626492314915, + "grad_norm": 8.279101744501077, + "learning_rate": 1.2480192411087028e-07, + "loss": 0.5804, + "step": 24970 + }, + { + "epoch": 1.8040348944317013, + "grad_norm": 7.2062245483478335, + "learning_rate": 1.247106770656642e-07, + "loss": 0.6236, + "step": 24971 + }, + { + "epoch": 1.8041071396319106, + "grad_norm": 7.965130631021583, + "learning_rate": 1.2461946253632424e-07, + "loss": 0.6729, + "step": 24972 + }, + { + "epoch": 1.8041793848321204, + "grad_norm": 7.876425477848786, + "learning_rate": 1.245282805241002e-07, + "loss": 0.5935, + "step": 24973 + }, + { + "epoch": 1.8042516300323297, + "grad_norm": 6.935928579154124, + "learning_rate": 1.244371310302389e-07, + "loss": 0.6069, + "step": 24974 + }, + { + "epoch": 1.8043238752325392, + "grad_norm": 7.053602978047731, + "learning_rate": 1.243460140559888e-07, + "loss": 0.6131, + "step": 24975 + }, + { + "epoch": 1.8043961204327488, + "grad_norm": 7.132958501552023, + "learning_rate": 1.2425492960259745e-07, + "loss": 0.5497, + "step": 24976 + }, + { + "epoch": 1.8044683656329583, + "grad_norm": 6.464955566914419, + "learning_rate": 1.2416387767131139e-07, + "loss": 0.5361, + "step": 24977 + }, + { + "epoch": 1.8045406108331679, + "grad_norm": 7.149221908807083, + "learning_rate": 1.2407285826337684e-07, + "loss": 0.6015, + "step": 24978 + }, + { + "epoch": 1.8046128560333772, + "grad_norm": 7.312806473528963, + "learning_rate": 1.239818713800403e-07, + "loss": 0.5968, + "step": 24979 + }, + { + "epoch": 1.804685101233587, + "grad_norm": 8.987065663672658, + "learning_rate": 1.2389091702254746e-07, + "loss": 0.6617, + "step": 24980 + }, + { + "epoch": 1.8047573464337963, + "grad_norm": 7.605114631265429, + "learning_rate": 1.237999951921426e-07, + "loss": 0.5474, + "step": 24981 + }, + { + "epoch": 1.8048295916340058, + "grad_norm": 8.055161559309784, + "learning_rate": 1.237091058900708e-07, + "loss": 0.5586, + "step": 24982 + }, + { + "epoch": 1.8049018368342153, + "grad_norm": 6.415673150106241, + "learning_rate": 1.2361824911757637e-07, + "loss": 0.5773, + "step": 24983 + }, + { + "epoch": 1.8049740820344249, + "grad_norm": 7.362006945850564, + "learning_rate": 1.235274248759033e-07, + "loss": 0.6121, + "step": 24984 + }, + { + "epoch": 1.8050463272346344, + "grad_norm": 7.278290110148828, + "learning_rate": 1.2343663316629423e-07, + "loss": 0.6262, + "step": 24985 + }, + { + "epoch": 1.8051185724348437, + "grad_norm": 7.349449510784618, + "learning_rate": 1.2334587398999232e-07, + "loss": 0.5718, + "step": 24986 + }, + { + "epoch": 1.8051908176350535, + "grad_norm": 7.523549048368271, + "learning_rate": 1.232551473482399e-07, + "loss": 0.6473, + "step": 24987 + }, + { + "epoch": 1.8052630628352628, + "grad_norm": 7.255754751937504, + "learning_rate": 1.2316445324227933e-07, + "loss": 0.6005, + "step": 24988 + }, + { + "epoch": 1.8053353080354724, + "grad_norm": 7.239758064484986, + "learning_rate": 1.2307379167335182e-07, + "loss": 0.6394, + "step": 24989 + }, + { + "epoch": 1.805407553235682, + "grad_norm": 6.237989252813192, + "learning_rate": 1.229831626426986e-07, + "loss": 0.5525, + "step": 24990 + }, + { + "epoch": 1.8054797984358915, + "grad_norm": 8.34367136257419, + "learning_rate": 1.2289256615156037e-07, + "loss": 0.6095, + "step": 24991 + }, + { + "epoch": 1.805552043636101, + "grad_norm": 10.41149440273788, + "learning_rate": 1.2280200220117694e-07, + "loss": 0.6243, + "step": 24992 + }, + { + "epoch": 1.8056242888363103, + "grad_norm": 8.531105571264865, + "learning_rate": 1.2271147079278846e-07, + "loss": 0.5978, + "step": 24993 + }, + { + "epoch": 1.80569653403652, + "grad_norm": 7.468274983259919, + "learning_rate": 1.2262097192763416e-07, + "loss": 0.5259, + "step": 24994 + }, + { + "epoch": 1.8057687792367294, + "grad_norm": 7.868730016899641, + "learning_rate": 1.2253050560695285e-07, + "loss": 0.5197, + "step": 24995 + }, + { + "epoch": 1.805841024436939, + "grad_norm": 8.067753462709343, + "learning_rate": 1.2244007183198291e-07, + "loss": 0.6529, + "step": 24996 + }, + { + "epoch": 1.8059132696371485, + "grad_norm": 6.5272046570308815, + "learning_rate": 1.2234967060396176e-07, + "loss": 0.6194, + "step": 24997 + }, + { + "epoch": 1.805985514837358, + "grad_norm": 6.6918611655783025, + "learning_rate": 1.2225930192412831e-07, + "loss": 0.6226, + "step": 24998 + }, + { + "epoch": 1.8060577600375676, + "grad_norm": 7.489481862052389, + "learning_rate": 1.2216896579371862e-07, + "loss": 0.5915, + "step": 24999 + }, + { + "epoch": 1.806130005237777, + "grad_norm": 6.567868607148858, + "learning_rate": 1.2207866221396913e-07, + "loss": 0.5053, + "step": 25000 + }, + { + "epoch": 1.8062022504379867, + "grad_norm": 7.604163796062886, + "learning_rate": 1.2198839118611665e-07, + "loss": 0.5794, + "step": 25001 + }, + { + "epoch": 1.806274495638196, + "grad_norm": 7.3374638051272685, + "learning_rate": 1.2189815271139716e-07, + "loss": 0.6531, + "step": 25002 + }, + { + "epoch": 1.8063467408384055, + "grad_norm": 8.453872732207605, + "learning_rate": 1.218079467910449e-07, + "loss": 0.6725, + "step": 25003 + }, + { + "epoch": 1.806418986038615, + "grad_norm": 7.957368748978934, + "learning_rate": 1.217177734262956e-07, + "loss": 0.5778, + "step": 25004 + }, + { + "epoch": 1.8064912312388246, + "grad_norm": 6.648894494887883, + "learning_rate": 1.2162763261838294e-07, + "loss": 0.6191, + "step": 25005 + }, + { + "epoch": 1.8065634764390341, + "grad_norm": 8.929426481448719, + "learning_rate": 1.2153752436854155e-07, + "loss": 0.6571, + "step": 25006 + }, + { + "epoch": 1.8066357216392435, + "grad_norm": 8.13245835939448, + "learning_rate": 1.2144744867800485e-07, + "loss": 0.6255, + "step": 25007 + }, + { + "epoch": 1.8067079668394532, + "grad_norm": 7.960313108027349, + "learning_rate": 1.2135740554800547e-07, + "loss": 0.6497, + "step": 25008 + }, + { + "epoch": 1.8067802120396625, + "grad_norm": 7.3542646738353294, + "learning_rate": 1.2126739497977685e-07, + "loss": 0.5519, + "step": 25009 + }, + { + "epoch": 1.806852457239872, + "grad_norm": 7.746724328610534, + "learning_rate": 1.2117741697455026e-07, + "loss": 0.6594, + "step": 25010 + }, + { + "epoch": 1.8069247024400816, + "grad_norm": 8.153936416059942, + "learning_rate": 1.2108747153355777e-07, + "loss": 0.6095, + "step": 25011 + }, + { + "epoch": 1.8069969476402912, + "grad_norm": 7.178399203904394, + "learning_rate": 1.2099755865803088e-07, + "loss": 0.604, + "step": 25012 + }, + { + "epoch": 1.8070691928405007, + "grad_norm": 6.404879688618951, + "learning_rate": 1.2090767834919998e-07, + "loss": 0.6677, + "step": 25013 + }, + { + "epoch": 1.80714143804071, + "grad_norm": 8.200094306217421, + "learning_rate": 1.2081783060829577e-07, + "loss": 0.6417, + "step": 25014 + }, + { + "epoch": 1.8072136832409198, + "grad_norm": 8.51801276371849, + "learning_rate": 1.207280154365481e-07, + "loss": 0.6431, + "step": 25015 + }, + { + "epoch": 1.8072859284411291, + "grad_norm": 7.408131034614965, + "learning_rate": 1.2063823283518655e-07, + "loss": 0.6262, + "step": 25016 + }, + { + "epoch": 1.8073581736413387, + "grad_norm": 7.506764730803331, + "learning_rate": 1.2054848280544014e-07, + "loss": 0.5416, + "step": 25017 + }, + { + "epoch": 1.8074304188415482, + "grad_norm": 6.50447134134819, + "learning_rate": 1.2045876534853757e-07, + "loss": 0.637, + "step": 25018 + }, + { + "epoch": 1.8075026640417577, + "grad_norm": 8.529702294220318, + "learning_rate": 1.2036908046570677e-07, + "loss": 0.666, + "step": 25019 + }, + { + "epoch": 1.8075749092419673, + "grad_norm": 6.706871426277179, + "learning_rate": 1.2027942815817594e-07, + "loss": 0.5423, + "step": 25020 + }, + { + "epoch": 1.8076471544421766, + "grad_norm": 6.594439574427932, + "learning_rate": 1.201898084271716e-07, + "loss": 0.578, + "step": 25021 + }, + { + "epoch": 1.8077193996423864, + "grad_norm": 6.886944469056872, + "learning_rate": 1.2010022127392106e-07, + "loss": 0.5895, + "step": 25022 + }, + { + "epoch": 1.8077916448425957, + "grad_norm": 6.977270280534283, + "learning_rate": 1.200106666996509e-07, + "loss": 0.5291, + "step": 25023 + }, + { + "epoch": 1.8078638900428052, + "grad_norm": 9.063054350636857, + "learning_rate": 1.1992114470558646e-07, + "loss": 0.5763, + "step": 25024 + }, + { + "epoch": 1.8079361352430148, + "grad_norm": 7.105711410095029, + "learning_rate": 1.198316552929532e-07, + "loss": 0.6285, + "step": 25025 + }, + { + "epoch": 1.8080083804432243, + "grad_norm": 7.1413496388502535, + "learning_rate": 1.1974219846297708e-07, + "loss": 0.6078, + "step": 25026 + }, + { + "epoch": 1.8080806256434339, + "grad_norm": 6.485733514976924, + "learning_rate": 1.196527742168821e-07, + "loss": 0.5689, + "step": 25027 + }, + { + "epoch": 1.8081528708436432, + "grad_norm": 7.975345769101096, + "learning_rate": 1.1956338255589227e-07, + "loss": 0.5621, + "step": 25028 + }, + { + "epoch": 1.808225116043853, + "grad_norm": 7.209071685667427, + "learning_rate": 1.1947402348123137e-07, + "loss": 0.5176, + "step": 25029 + }, + { + "epoch": 1.8082973612440623, + "grad_norm": 7.620262300076859, + "learning_rate": 1.1938469699412315e-07, + "loss": 0.6336, + "step": 25030 + }, + { + "epoch": 1.8083696064442718, + "grad_norm": 7.625532788282782, + "learning_rate": 1.1929540309578962e-07, + "loss": 0.6649, + "step": 25031 + }, + { + "epoch": 1.8084418516444813, + "grad_norm": 8.442722992802922, + "learning_rate": 1.192061417874535e-07, + "loss": 0.6356, + "step": 25032 + }, + { + "epoch": 1.8085140968446909, + "grad_norm": 8.95002008361268, + "learning_rate": 1.1911691307033679e-07, + "loss": 0.6151, + "step": 25033 + }, + { + "epoch": 1.8085863420449004, + "grad_norm": 6.521031613326373, + "learning_rate": 1.1902771694566079e-07, + "loss": 0.5581, + "step": 25034 + }, + { + "epoch": 1.8086585872451098, + "grad_norm": 7.9395026136219, + "learning_rate": 1.1893855341464671e-07, + "loss": 0.6243, + "step": 25035 + }, + { + "epoch": 1.8087308324453195, + "grad_norm": 7.627708329287995, + "learning_rate": 1.18849422478515e-07, + "loss": 0.5831, + "step": 25036 + }, + { + "epoch": 1.8088030776455288, + "grad_norm": 6.88733385166049, + "learning_rate": 1.1876032413848631e-07, + "loss": 0.5733, + "step": 25037 + }, + { + "epoch": 1.8088753228457386, + "grad_norm": 9.121983277690056, + "learning_rate": 1.1867125839577941e-07, + "loss": 0.6067, + "step": 25038 + }, + { + "epoch": 1.808947568045948, + "grad_norm": 7.798357805361922, + "learning_rate": 1.185822252516139e-07, + "loss": 0.5763, + "step": 25039 + }, + { + "epoch": 1.8090198132461575, + "grad_norm": 7.745533794530047, + "learning_rate": 1.1849322470720903e-07, + "loss": 0.6337, + "step": 25040 + }, + { + "epoch": 1.809092058446367, + "grad_norm": 7.437030424793665, + "learning_rate": 1.1840425676378276e-07, + "loss": 0.5916, + "step": 25041 + }, + { + "epoch": 1.8091643036465763, + "grad_norm": 7.624323830563011, + "learning_rate": 1.1831532142255297e-07, + "loss": 0.5334, + "step": 25042 + }, + { + "epoch": 1.809236548846786, + "grad_norm": 6.761315927125891, + "learning_rate": 1.1822641868473705e-07, + "loss": 0.6041, + "step": 25043 + }, + { + "epoch": 1.8093087940469954, + "grad_norm": 7.817106552055652, + "learning_rate": 1.1813754855155208e-07, + "loss": 0.5946, + "step": 25044 + }, + { + "epoch": 1.8093810392472052, + "grad_norm": 8.901856526679254, + "learning_rate": 1.1804871102421483e-07, + "loss": 0.5736, + "step": 25045 + }, + { + "epoch": 1.8094532844474145, + "grad_norm": 7.349799079484354, + "learning_rate": 1.1795990610394104e-07, + "loss": 0.5911, + "step": 25046 + }, + { + "epoch": 1.809525529647624, + "grad_norm": 6.482021704468234, + "learning_rate": 1.1787113379194665e-07, + "loss": 0.5621, + "step": 25047 + }, + { + "epoch": 1.8095977748478336, + "grad_norm": 7.214959805086489, + "learning_rate": 1.1778239408944736e-07, + "loss": 0.6893, + "step": 25048 + }, + { + "epoch": 1.809670020048043, + "grad_norm": 6.889109611393403, + "learning_rate": 1.1769368699765693e-07, + "loss": 0.5745, + "step": 25049 + }, + { + "epoch": 1.8097422652482527, + "grad_norm": 6.930384655690482, + "learning_rate": 1.1760501251779021e-07, + "loss": 0.5753, + "step": 25050 + }, + { + "epoch": 1.809814510448462, + "grad_norm": 7.969676483748587, + "learning_rate": 1.1751637065106126e-07, + "loss": 0.6002, + "step": 25051 + }, + { + "epoch": 1.8098867556486717, + "grad_norm": 7.708770354658347, + "learning_rate": 1.1742776139868323e-07, + "loss": 0.5851, + "step": 25052 + }, + { + "epoch": 1.809959000848881, + "grad_norm": 10.438668374094293, + "learning_rate": 1.1733918476186878e-07, + "loss": 0.6705, + "step": 25053 + }, + { + "epoch": 1.8100312460490906, + "grad_norm": 6.887846717484678, + "learning_rate": 1.1725064074183112e-07, + "loss": 0.611, + "step": 25054 + }, + { + "epoch": 1.8101034912493001, + "grad_norm": 7.518880856612738, + "learning_rate": 1.171621293397826e-07, + "loss": 0.598, + "step": 25055 + }, + { + "epoch": 1.8101757364495097, + "grad_norm": 7.726821941343829, + "learning_rate": 1.1707365055693392e-07, + "loss": 0.5617, + "step": 25056 + }, + { + "epoch": 1.8102479816497192, + "grad_norm": 7.390325108708165, + "learning_rate": 1.169852043944969e-07, + "loss": 0.5983, + "step": 25057 + }, + { + "epoch": 1.8103202268499285, + "grad_norm": 8.097336846993082, + "learning_rate": 1.1689679085368194e-07, + "loss": 0.6278, + "step": 25058 + }, + { + "epoch": 1.8103924720501383, + "grad_norm": 8.564155362911052, + "learning_rate": 1.1680840993570002e-07, + "loss": 0.59, + "step": 25059 + }, + { + "epoch": 1.8104647172503476, + "grad_norm": 6.496148949329518, + "learning_rate": 1.167200616417602e-07, + "loss": 0.5916, + "step": 25060 + }, + { + "epoch": 1.8105369624505572, + "grad_norm": 7.603309924490868, + "learning_rate": 1.1663174597307203e-07, + "loss": 0.677, + "step": 25061 + }, + { + "epoch": 1.8106092076507667, + "grad_norm": 6.946015776137679, + "learning_rate": 1.1654346293084484e-07, + "loss": 0.6025, + "step": 25062 + }, + { + "epoch": 1.8106814528509763, + "grad_norm": 7.3416269683053255, + "learning_rate": 1.164552125162871e-07, + "loss": 0.6186, + "step": 25063 + }, + { + "epoch": 1.8107536980511858, + "grad_norm": 7.458452987421465, + "learning_rate": 1.1636699473060675e-07, + "loss": 0.609, + "step": 25064 + }, + { + "epoch": 1.8108259432513951, + "grad_norm": 7.255876461537755, + "learning_rate": 1.1627880957501141e-07, + "loss": 0.5679, + "step": 25065 + }, + { + "epoch": 1.8108981884516049, + "grad_norm": 6.647751231472705, + "learning_rate": 1.1619065705070847e-07, + "loss": 0.5853, + "step": 25066 + }, + { + "epoch": 1.8109704336518142, + "grad_norm": 5.783578764501726, + "learning_rate": 1.1610253715890446e-07, + "loss": 0.5881, + "step": 25067 + }, + { + "epoch": 1.8110426788520237, + "grad_norm": 7.365298339436513, + "learning_rate": 1.1601444990080563e-07, + "loss": 0.6533, + "step": 25068 + }, + { + "epoch": 1.8111149240522333, + "grad_norm": 6.786876682162197, + "learning_rate": 1.1592639527761851e-07, + "loss": 0.6517, + "step": 25069 + }, + { + "epoch": 1.8111871692524428, + "grad_norm": 7.426064041909483, + "learning_rate": 1.1583837329054743e-07, + "loss": 0.6899, + "step": 25070 + }, + { + "epoch": 1.8112594144526524, + "grad_norm": 6.668405242560166, + "learning_rate": 1.157503839407978e-07, + "loss": 0.5586, + "step": 25071 + }, + { + "epoch": 1.8113316596528617, + "grad_norm": 7.8130217110959475, + "learning_rate": 1.1566242722957422e-07, + "loss": 0.654, + "step": 25072 + }, + { + "epoch": 1.8114039048530715, + "grad_norm": 8.081763151063189, + "learning_rate": 1.1557450315808045e-07, + "loss": 0.5857, + "step": 25073 + }, + { + "epoch": 1.8114761500532808, + "grad_norm": 6.941815657770296, + "learning_rate": 1.1548661172752051e-07, + "loss": 0.6141, + "step": 25074 + }, + { + "epoch": 1.8115483952534903, + "grad_norm": 8.307584528210313, + "learning_rate": 1.1539875293909763e-07, + "loss": 0.5975, + "step": 25075 + }, + { + "epoch": 1.8116206404536999, + "grad_norm": 5.719365321340741, + "learning_rate": 1.153109267940139e-07, + "loss": 0.5262, + "step": 25076 + }, + { + "epoch": 1.8116928856539094, + "grad_norm": 7.944998495823194, + "learning_rate": 1.1522313329347252e-07, + "loss": 0.534, + "step": 25077 + }, + { + "epoch": 1.811765130854119, + "grad_norm": 8.146417647248294, + "learning_rate": 1.1513537243867445e-07, + "loss": 0.6206, + "step": 25078 + }, + { + "epoch": 1.8118373760543283, + "grad_norm": 7.1029772466622445, + "learning_rate": 1.1504764423082154e-07, + "loss": 0.6442, + "step": 25079 + }, + { + "epoch": 1.811909621254538, + "grad_norm": 6.464547824137171, + "learning_rate": 1.1495994867111477e-07, + "loss": 0.569, + "step": 25080 + }, + { + "epoch": 1.8119818664547473, + "grad_norm": 6.294921875, + "learning_rate": 1.1487228576075371e-07, + "loss": 0.6494, + "step": 25081 + }, + { + "epoch": 1.812054111654957, + "grad_norm": 6.729468024939389, + "learning_rate": 1.1478465550093965e-07, + "loss": 0.5288, + "step": 25082 + }, + { + "epoch": 1.8121263568551664, + "grad_norm": 7.4126080612874485, + "learning_rate": 1.1469705789287161e-07, + "loss": 0.6152, + "step": 25083 + }, + { + "epoch": 1.812198602055376, + "grad_norm": 7.171884308187723, + "learning_rate": 1.1460949293774892e-07, + "loss": 0.6161, + "step": 25084 + }, + { + "epoch": 1.8122708472555855, + "grad_norm": 6.3342617090649, + "learning_rate": 1.1452196063677007e-07, + "loss": 0.5204, + "step": 25085 + }, + { + "epoch": 1.8123430924557948, + "grad_norm": 6.735513188238558, + "learning_rate": 1.1443446099113325e-07, + "loss": 0.581, + "step": 25086 + }, + { + "epoch": 1.8124153376560046, + "grad_norm": 7.467722802173207, + "learning_rate": 1.143469940020367e-07, + "loss": 0.6091, + "step": 25087 + }, + { + "epoch": 1.812487582856214, + "grad_norm": 7.883825533813354, + "learning_rate": 1.1425955967067692e-07, + "loss": 0.561, + "step": 25088 + }, + { + "epoch": 1.8125598280564235, + "grad_norm": 6.959059747785054, + "learning_rate": 1.141721579982516e-07, + "loss": 0.6626, + "step": 25089 + }, + { + "epoch": 1.812632073256633, + "grad_norm": 6.822725793240988, + "learning_rate": 1.1408478898595698e-07, + "loss": 0.5707, + "step": 25090 + }, + { + "epoch": 1.8127043184568425, + "grad_norm": 7.211299337481813, + "learning_rate": 1.1399745263498907e-07, + "loss": 0.5875, + "step": 25091 + }, + { + "epoch": 1.812776563657052, + "grad_norm": 8.405949129867833, + "learning_rate": 1.1391014894654329e-07, + "loss": 0.6017, + "step": 25092 + }, + { + "epoch": 1.8128488088572614, + "grad_norm": 6.845155336499716, + "learning_rate": 1.1382287792181507e-07, + "loss": 0.5918, + "step": 25093 + }, + { + "epoch": 1.8129210540574712, + "grad_norm": 7.921691772027352, + "learning_rate": 1.1373563956199901e-07, + "loss": 0.5666, + "step": 25094 + }, + { + "epoch": 1.8129932992576805, + "grad_norm": 7.628036644786076, + "learning_rate": 1.1364843386828917e-07, + "loss": 0.645, + "step": 25095 + }, + { + "epoch": 1.81306554445789, + "grad_norm": 6.413395167400975, + "learning_rate": 1.1356126084187929e-07, + "loss": 0.5118, + "step": 25096 + }, + { + "epoch": 1.8131377896580996, + "grad_norm": 8.076436617108797, + "learning_rate": 1.1347412048396261e-07, + "loss": 0.5867, + "step": 25097 + }, + { + "epoch": 1.8132100348583091, + "grad_norm": 6.849000632345221, + "learning_rate": 1.1338701279573261e-07, + "loss": 0.5946, + "step": 25098 + }, + { + "epoch": 1.8132822800585187, + "grad_norm": 7.629107010207247, + "learning_rate": 1.1329993777838111e-07, + "loss": 0.6617, + "step": 25099 + }, + { + "epoch": 1.813354525258728, + "grad_norm": 7.752156111524162, + "learning_rate": 1.1321289543310021e-07, + "loss": 0.5922, + "step": 25100 + }, + { + "epoch": 1.8134267704589377, + "grad_norm": 7.739354605565851, + "learning_rate": 1.1312588576108174e-07, + "loss": 0.6142, + "step": 25101 + }, + { + "epoch": 1.813499015659147, + "grad_norm": 8.23257548233028, + "learning_rate": 1.1303890876351642e-07, + "loss": 0.5914, + "step": 25102 + }, + { + "epoch": 1.8135712608593566, + "grad_norm": 6.73998025455469, + "learning_rate": 1.1295196444159496e-07, + "loss": 0.5831, + "step": 25103 + }, + { + "epoch": 1.8136435060595661, + "grad_norm": 8.062320677663017, + "learning_rate": 1.1286505279650806e-07, + "loss": 0.5789, + "step": 25104 + }, + { + "epoch": 1.8137157512597757, + "grad_norm": 7.280139494596996, + "learning_rate": 1.1277817382944506e-07, + "loss": 0.5747, + "step": 25105 + }, + { + "epoch": 1.8137879964599852, + "grad_norm": 6.786653255806634, + "learning_rate": 1.1269132754159528e-07, + "loss": 0.6094, + "step": 25106 + }, + { + "epoch": 1.8138602416601945, + "grad_norm": 6.309485291588799, + "learning_rate": 1.1260451393414723e-07, + "loss": 0.5647, + "step": 25107 + }, + { + "epoch": 1.8139324868604043, + "grad_norm": 7.254422384623154, + "learning_rate": 1.1251773300828994e-07, + "loss": 0.6137, + "step": 25108 + }, + { + "epoch": 1.8140047320606136, + "grad_norm": 8.067556289084017, + "learning_rate": 1.1243098476521136e-07, + "loss": 0.6637, + "step": 25109 + }, + { + "epoch": 1.8140769772608234, + "grad_norm": 8.094567217128855, + "learning_rate": 1.1234426920609831e-07, + "loss": 0.5431, + "step": 25110 + }, + { + "epoch": 1.8141492224610327, + "grad_norm": 7.3014779776398, + "learning_rate": 1.1225758633213846e-07, + "loss": 0.5488, + "step": 25111 + }, + { + "epoch": 1.8142214676612423, + "grad_norm": 5.562275742714332, + "learning_rate": 1.1217093614451863e-07, + "loss": 0.5339, + "step": 25112 + }, + { + "epoch": 1.8142937128614518, + "grad_norm": 7.23924668075918, + "learning_rate": 1.1208431864442453e-07, + "loss": 0.5875, + "step": 25113 + }, + { + "epoch": 1.8143659580616611, + "grad_norm": 6.745765770834393, + "learning_rate": 1.1199773383304164e-07, + "loss": 0.6326, + "step": 25114 + }, + { + "epoch": 1.8144382032618709, + "grad_norm": 6.478267137041892, + "learning_rate": 1.119111817115559e-07, + "loss": 0.5087, + "step": 25115 + }, + { + "epoch": 1.8145104484620802, + "grad_norm": 7.666779918110611, + "learning_rate": 1.1182466228115225e-07, + "loss": 0.5632, + "step": 25116 + }, + { + "epoch": 1.81458269366229, + "grad_norm": 6.878527204651222, + "learning_rate": 1.1173817554301414e-07, + "loss": 0.666, + "step": 25117 + }, + { + "epoch": 1.8146549388624993, + "grad_norm": 7.182335631909339, + "learning_rate": 1.1165172149832593e-07, + "loss": 0.6149, + "step": 25118 + }, + { + "epoch": 1.8147271840627088, + "grad_norm": 5.0776002583449715, + "learning_rate": 1.1156530014827138e-07, + "loss": 0.5125, + "step": 25119 + }, + { + "epoch": 1.8147994292629184, + "grad_norm": 7.959063697840913, + "learning_rate": 1.1147891149403345e-07, + "loss": 0.6167, + "step": 25120 + }, + { + "epoch": 1.8148716744631277, + "grad_norm": 8.537819146241374, + "learning_rate": 1.1139255553679451e-07, + "loss": 0.6997, + "step": 25121 + }, + { + "epoch": 1.8149439196633375, + "grad_norm": 7.899646036451298, + "learning_rate": 1.1130623227773695e-07, + "loss": 0.554, + "step": 25122 + }, + { + "epoch": 1.8150161648635468, + "grad_norm": 8.565304484836995, + "learning_rate": 1.1121994171804262e-07, + "loss": 0.5573, + "step": 25123 + }, + { + "epoch": 1.8150884100637565, + "grad_norm": 7.048516668011667, + "learning_rate": 1.1113368385889223e-07, + "loss": 0.5764, + "step": 25124 + }, + { + "epoch": 1.8151606552639659, + "grad_norm": 7.411103666124825, + "learning_rate": 1.1104745870146705e-07, + "loss": 0.6462, + "step": 25125 + }, + { + "epoch": 1.8152329004641754, + "grad_norm": 8.209992939693933, + "learning_rate": 1.1096126624694697e-07, + "loss": 0.559, + "step": 25126 + }, + { + "epoch": 1.815305145664385, + "grad_norm": 7.97768962791336, + "learning_rate": 1.1087510649651273e-07, + "loss": 0.6396, + "step": 25127 + }, + { + "epoch": 1.8153773908645945, + "grad_norm": 7.781556395354468, + "learning_rate": 1.1078897945134282e-07, + "loss": 0.6306, + "step": 25128 + }, + { + "epoch": 1.815449636064804, + "grad_norm": 6.923618760057427, + "learning_rate": 1.1070288511261657e-07, + "loss": 0.573, + "step": 25129 + }, + { + "epoch": 1.8155218812650133, + "grad_norm": 7.957460312284443, + "learning_rate": 1.1061682348151276e-07, + "loss": 0.58, + "step": 25130 + }, + { + "epoch": 1.8155941264652231, + "grad_norm": 7.155568719276551, + "learning_rate": 1.1053079455920962e-07, + "loss": 0.6324, + "step": 25131 + }, + { + "epoch": 1.8156663716654324, + "grad_norm": 7.336704461871137, + "learning_rate": 1.1044479834688427e-07, + "loss": 0.6049, + "step": 25132 + }, + { + "epoch": 1.815738616865642, + "grad_norm": 7.29788573492016, + "learning_rate": 1.1035883484571436e-07, + "loss": 0.6423, + "step": 25133 + }, + { + "epoch": 1.8158108620658515, + "grad_norm": 9.009160360319335, + "learning_rate": 1.1027290405687701e-07, + "loss": 0.5796, + "step": 25134 + }, + { + "epoch": 1.815883107266061, + "grad_norm": 7.151653296052833, + "learning_rate": 1.1018700598154769e-07, + "loss": 0.6233, + "step": 25135 + }, + { + "epoch": 1.8159553524662706, + "grad_norm": 6.899588708091252, + "learning_rate": 1.1010114062090266e-07, + "loss": 0.5725, + "step": 25136 + }, + { + "epoch": 1.81602759766648, + "grad_norm": 9.315494747076642, + "learning_rate": 1.1001530797611764e-07, + "loss": 0.6318, + "step": 25137 + }, + { + "epoch": 1.8160998428666897, + "grad_norm": 6.523337211523266, + "learning_rate": 1.0992950804836671e-07, + "loss": 0.5255, + "step": 25138 + }, + { + "epoch": 1.816172088066899, + "grad_norm": 7.41034929489584, + "learning_rate": 1.0984374083882559e-07, + "loss": 0.5458, + "step": 25139 + }, + { + "epoch": 1.8162443332671085, + "grad_norm": 7.40580136513688, + "learning_rate": 1.0975800634866751e-07, + "loss": 0.6515, + "step": 25140 + }, + { + "epoch": 1.816316578467318, + "grad_norm": 6.391731297026547, + "learning_rate": 1.0967230457906708e-07, + "loss": 0.6003, + "step": 25141 + }, + { + "epoch": 1.8163888236675276, + "grad_norm": 6.9691821592983505, + "learning_rate": 1.0958663553119615e-07, + "loss": 0.5461, + "step": 25142 + }, + { + "epoch": 1.8164610688677372, + "grad_norm": 8.872245683548122, + "learning_rate": 1.0950099920622848e-07, + "loss": 0.6196, + "step": 25143 + }, + { + "epoch": 1.8165333140679465, + "grad_norm": 8.553714003542234, + "learning_rate": 1.094153956053362e-07, + "loss": 0.7032, + "step": 25144 + }, + { + "epoch": 1.8166055592681563, + "grad_norm": 7.613565595238413, + "learning_rate": 1.0932982472969061e-07, + "loss": 0.6885, + "step": 25145 + }, + { + "epoch": 1.8166778044683656, + "grad_norm": 8.428932191457207, + "learning_rate": 1.0924428658046354e-07, + "loss": 0.5687, + "step": 25146 + }, + { + "epoch": 1.8167500496685751, + "grad_norm": 7.324499994600207, + "learning_rate": 1.0915878115882572e-07, + "loss": 0.6724, + "step": 25147 + }, + { + "epoch": 1.8168222948687847, + "grad_norm": 7.048197349426649, + "learning_rate": 1.0907330846594816e-07, + "loss": 0.5688, + "step": 25148 + }, + { + "epoch": 1.8168945400689942, + "grad_norm": 7.490643577886849, + "learning_rate": 1.089878685030002e-07, + "loss": 0.5694, + "step": 25149 + }, + { + "epoch": 1.8169667852692037, + "grad_norm": 6.887432716996395, + "learning_rate": 1.0890246127115201e-07, + "loss": 0.5664, + "step": 25150 + }, + { + "epoch": 1.817039030469413, + "grad_norm": 6.500928372328712, + "learning_rate": 1.0881708677157266e-07, + "loss": 0.5411, + "step": 25151 + }, + { + "epoch": 1.8171112756696228, + "grad_norm": 7.358755810519698, + "learning_rate": 1.0873174500543094e-07, + "loss": 0.5737, + "step": 25152 + }, + { + "epoch": 1.8171835208698321, + "grad_norm": 6.930648582240179, + "learning_rate": 1.0864643597389451e-07, + "loss": 0.5423, + "step": 25153 + }, + { + "epoch": 1.8172557660700417, + "grad_norm": 6.586817563808526, + "learning_rate": 1.0856115967813191e-07, + "loss": 0.6123, + "step": 25154 + }, + { + "epoch": 1.8173280112702512, + "grad_norm": 7.0272441388333435, + "learning_rate": 1.0847591611931024e-07, + "loss": 0.6427, + "step": 25155 + }, + { + "epoch": 1.8174002564704608, + "grad_norm": 7.4138919340397145, + "learning_rate": 1.0839070529859608e-07, + "loss": 0.6082, + "step": 25156 + }, + { + "epoch": 1.8174725016706703, + "grad_norm": 6.065993099059051, + "learning_rate": 1.0830552721715653e-07, + "loss": 0.5418, + "step": 25157 + }, + { + "epoch": 1.8175447468708796, + "grad_norm": 6.777916998901222, + "learning_rate": 1.0822038187615653e-07, + "loss": 0.6131, + "step": 25158 + }, + { + "epoch": 1.8176169920710894, + "grad_norm": 6.89341682742605, + "learning_rate": 1.0813526927676343e-07, + "loss": 0.5534, + "step": 25159 + }, + { + "epoch": 1.8176892372712987, + "grad_norm": 6.923946854147748, + "learning_rate": 1.0805018942014078e-07, + "loss": 0.6089, + "step": 25160 + }, + { + "epoch": 1.8177614824715083, + "grad_norm": 6.886705455886204, + "learning_rate": 1.0796514230745376e-07, + "loss": 0.6019, + "step": 25161 + }, + { + "epoch": 1.8178337276717178, + "grad_norm": 7.218822660534614, + "learning_rate": 1.0788012793986696e-07, + "loss": 0.6282, + "step": 25162 + }, + { + "epoch": 1.8179059728719273, + "grad_norm": 7.169470955498619, + "learning_rate": 1.0779514631854365e-07, + "loss": 0.633, + "step": 25163 + }, + { + "epoch": 1.8179782180721369, + "grad_norm": 8.997945339059747, + "learning_rate": 1.0771019744464734e-07, + "loss": 0.6468, + "step": 25164 + }, + { + "epoch": 1.8180504632723462, + "grad_norm": 7.20498732449901, + "learning_rate": 1.0762528131934096e-07, + "loss": 0.5631, + "step": 25165 + }, + { + "epoch": 1.818122708472556, + "grad_norm": 6.721032610983982, + "learning_rate": 1.0754039794378696e-07, + "loss": 0.5985, + "step": 25166 + }, + { + "epoch": 1.8181949536727653, + "grad_norm": 6.288272405242102, + "learning_rate": 1.0745554731914714e-07, + "loss": 0.6598, + "step": 25167 + }, + { + "epoch": 1.8182671988729748, + "grad_norm": 8.18871781705199, + "learning_rate": 1.0737072944658311e-07, + "loss": 0.6288, + "step": 25168 + }, + { + "epoch": 1.8183394440731844, + "grad_norm": 7.770013133544819, + "learning_rate": 1.0728594432725669e-07, + "loss": 0.6091, + "step": 25169 + }, + { + "epoch": 1.818411689273394, + "grad_norm": 7.8285852475884665, + "learning_rate": 1.0720119196232726e-07, + "loss": 0.6116, + "step": 25170 + }, + { + "epoch": 1.8184839344736035, + "grad_norm": 8.234492668454806, + "learning_rate": 1.071164723529558e-07, + "loss": 0.5737, + "step": 25171 + }, + { + "epoch": 1.8185561796738128, + "grad_norm": 7.710037565424303, + "learning_rate": 1.070317855003017e-07, + "loss": 0.6, + "step": 25172 + }, + { + "epoch": 1.8186284248740225, + "grad_norm": 7.291868922743648, + "learning_rate": 1.0694713140552482e-07, + "loss": 0.5337, + "step": 25173 + }, + { + "epoch": 1.8187006700742319, + "grad_norm": 7.641298978134685, + "learning_rate": 1.0686251006978344e-07, + "loss": 0.5623, + "step": 25174 + }, + { + "epoch": 1.8187729152744414, + "grad_norm": 6.985662764042485, + "learning_rate": 1.0677792149423604e-07, + "loss": 0.6196, + "step": 25175 + }, + { + "epoch": 1.818845160474651, + "grad_norm": 7.499547308929198, + "learning_rate": 1.0669336568004063e-07, + "loss": 0.5778, + "step": 25176 + }, + { + "epoch": 1.8189174056748605, + "grad_norm": 7.699106278510056, + "learning_rate": 1.0660884262835486e-07, + "loss": 0.5243, + "step": 25177 + }, + { + "epoch": 1.81898965087507, + "grad_norm": 8.020222852849852, + "learning_rate": 1.0652435234033559e-07, + "loss": 0.6319, + "step": 25178 + }, + { + "epoch": 1.8190618960752793, + "grad_norm": 7.29258690249686, + "learning_rate": 1.0643989481713968e-07, + "loss": 0.5631, + "step": 25179 + }, + { + "epoch": 1.8191341412754891, + "grad_norm": 6.724324597331929, + "learning_rate": 1.0635547005992341e-07, + "loss": 0.555, + "step": 25180 + }, + { + "epoch": 1.8192063864756984, + "grad_norm": 6.7530513506981436, + "learning_rate": 1.0627107806984172e-07, + "loss": 0.5831, + "step": 25181 + }, + { + "epoch": 1.8192786316759082, + "grad_norm": 6.812315212158396, + "learning_rate": 1.061867188480506e-07, + "loss": 0.5643, + "step": 25182 + }, + { + "epoch": 1.8193508768761175, + "grad_norm": 8.61266561717121, + "learning_rate": 1.0610239239570441e-07, + "loss": 0.5451, + "step": 25183 + }, + { + "epoch": 1.819423122076327, + "grad_norm": 7.117572325971103, + "learning_rate": 1.0601809871395808e-07, + "loss": 0.5934, + "step": 25184 + }, + { + "epoch": 1.8194953672765366, + "grad_norm": 7.3657388243783855, + "learning_rate": 1.0593383780396482e-07, + "loss": 0.6156, + "step": 25185 + }, + { + "epoch": 1.819567612476746, + "grad_norm": 6.399135304688055, + "learning_rate": 1.0584960966687874e-07, + "loss": 0.5228, + "step": 25186 + }, + { + "epoch": 1.8196398576769557, + "grad_norm": 6.66052681795335, + "learning_rate": 1.0576541430385223e-07, + "loss": 0.6017, + "step": 25187 + }, + { + "epoch": 1.819712102877165, + "grad_norm": 6.992882243330842, + "learning_rate": 1.0568125171603827e-07, + "loss": 0.583, + "step": 25188 + }, + { + "epoch": 1.8197843480773748, + "grad_norm": 7.71249148277171, + "learning_rate": 1.0559712190458899e-07, + "loss": 0.5963, + "step": 25189 + }, + { + "epoch": 1.819856593277584, + "grad_norm": 8.344665248549612, + "learning_rate": 1.0551302487065595e-07, + "loss": 0.6137, + "step": 25190 + }, + { + "epoch": 1.8199288384777936, + "grad_norm": 6.040603894997566, + "learning_rate": 1.0542896061539077e-07, + "loss": 0.596, + "step": 25191 + }, + { + "epoch": 1.8200010836780032, + "grad_norm": 6.067571661942093, + "learning_rate": 1.053449291399436e-07, + "loss": 0.5216, + "step": 25192 + }, + { + "epoch": 1.8200733288782125, + "grad_norm": 7.713942045801636, + "learning_rate": 1.0526093044546465e-07, + "loss": 0.5662, + "step": 25193 + }, + { + "epoch": 1.8201455740784223, + "grad_norm": 7.337091552179812, + "learning_rate": 1.0517696453310494e-07, + "loss": 0.5736, + "step": 25194 + }, + { + "epoch": 1.8202178192786316, + "grad_norm": 7.775198548164472, + "learning_rate": 1.0509303140401217e-07, + "loss": 0.6611, + "step": 25195 + }, + { + "epoch": 1.8202900644788413, + "grad_norm": 6.1640194890174, + "learning_rate": 1.050091310593368e-07, + "loss": 0.5929, + "step": 25196 + }, + { + "epoch": 1.8203623096790507, + "grad_norm": 7.9777886086298615, + "learning_rate": 1.049252635002268e-07, + "loss": 0.5952, + "step": 25197 + }, + { + "epoch": 1.8204345548792602, + "grad_norm": 7.215908003023976, + "learning_rate": 1.0484142872783043e-07, + "loss": 0.6556, + "step": 25198 + }, + { + "epoch": 1.8205068000794697, + "grad_norm": 6.97511664398995, + "learning_rate": 1.047576267432951e-07, + "loss": 0.5666, + "step": 25199 + }, + { + "epoch": 1.8205790452796793, + "grad_norm": 7.322115322960597, + "learning_rate": 1.0467385754776793e-07, + "loss": 0.584, + "step": 25200 + }, + { + "epoch": 1.8206512904798888, + "grad_norm": 7.866443890579947, + "learning_rate": 1.0459012114239609e-07, + "loss": 0.5695, + "step": 25201 + }, + { + "epoch": 1.8207235356800981, + "grad_norm": 7.283581536411781, + "learning_rate": 1.045064175283253e-07, + "loss": 0.6129, + "step": 25202 + }, + { + "epoch": 1.820795780880308, + "grad_norm": 7.493721941083312, + "learning_rate": 1.0442274670670161e-07, + "loss": 0.6079, + "step": 25203 + }, + { + "epoch": 1.8208680260805172, + "grad_norm": 7.608443498875833, + "learning_rate": 1.0433910867867047e-07, + "loss": 0.5896, + "step": 25204 + }, + { + "epoch": 1.8209402712807268, + "grad_norm": 6.566375750505462, + "learning_rate": 1.042555034453771e-07, + "loss": 0.5837, + "step": 25205 + }, + { + "epoch": 1.8210125164809363, + "grad_norm": 7.898455369817817, + "learning_rate": 1.0417193100796558e-07, + "loss": 0.655, + "step": 25206 + }, + { + "epoch": 1.8210847616811459, + "grad_norm": 8.071230870271924, + "learning_rate": 1.0408839136757998e-07, + "loss": 0.6788, + "step": 25207 + }, + { + "epoch": 1.8211570068813554, + "grad_norm": 7.186187093488935, + "learning_rate": 1.0400488452536383e-07, + "loss": 0.685, + "step": 25208 + }, + { + "epoch": 1.8212292520815647, + "grad_norm": 9.301665077047572, + "learning_rate": 1.0392141048246097e-07, + "loss": 0.5516, + "step": 25209 + }, + { + "epoch": 1.8213014972817745, + "grad_norm": 7.447188022068139, + "learning_rate": 1.0383796924001322e-07, + "loss": 0.5798, + "step": 25210 + }, + { + "epoch": 1.8213737424819838, + "grad_norm": 8.012924721064664, + "learning_rate": 1.037545607991633e-07, + "loss": 0.6768, + "step": 25211 + }, + { + "epoch": 1.8214459876821933, + "grad_norm": 7.470453682518636, + "learning_rate": 1.0367118516105307e-07, + "loss": 0.5288, + "step": 25212 + }, + { + "epoch": 1.8215182328824029, + "grad_norm": 7.219286349629665, + "learning_rate": 1.0358784232682328e-07, + "loss": 0.6183, + "step": 25213 + }, + { + "epoch": 1.8215904780826124, + "grad_norm": 8.53999925171181, + "learning_rate": 1.0350453229761553e-07, + "loss": 0.6481, + "step": 25214 + }, + { + "epoch": 1.821662723282822, + "grad_norm": 7.104746894596548, + "learning_rate": 1.0342125507456945e-07, + "loss": 0.5249, + "step": 25215 + }, + { + "epoch": 1.8217349684830313, + "grad_norm": 6.5688521387233365, + "learning_rate": 1.033380106588261e-07, + "loss": 0.5606, + "step": 25216 + }, + { + "epoch": 1.821807213683241, + "grad_norm": 7.241813773478046, + "learning_rate": 1.0325479905152425e-07, + "loss": 0.5942, + "step": 25217 + }, + { + "epoch": 1.8218794588834504, + "grad_norm": 7.164467703405783, + "learning_rate": 1.0317162025380329e-07, + "loss": 0.6246, + "step": 25218 + }, + { + "epoch": 1.82195170408366, + "grad_norm": 7.961734810417027, + "learning_rate": 1.0308847426680229e-07, + "loss": 0.6107, + "step": 25219 + }, + { + "epoch": 1.8220239492838695, + "grad_norm": 7.319757495469401, + "learning_rate": 1.0300536109165843e-07, + "loss": 0.6205, + "step": 25220 + }, + { + "epoch": 1.822096194484079, + "grad_norm": 8.012718580957227, + "learning_rate": 1.0292228072951022e-07, + "loss": 0.6137, + "step": 25221 + }, + { + "epoch": 1.8221684396842885, + "grad_norm": 7.512019381009614, + "learning_rate": 1.028392331814948e-07, + "loss": 0.5341, + "step": 25222 + }, + { + "epoch": 1.8222406848844979, + "grad_norm": 6.633407727819548, + "learning_rate": 1.0275621844874878e-07, + "loss": 0.5538, + "step": 25223 + }, + { + "epoch": 1.8223129300847076, + "grad_norm": 7.393637174388794, + "learning_rate": 1.0267323653240901e-07, + "loss": 0.6285, + "step": 25224 + }, + { + "epoch": 1.822385175284917, + "grad_norm": 7.233436342317487, + "learning_rate": 1.0259028743361099e-07, + "loss": 0.6217, + "step": 25225 + }, + { + "epoch": 1.8224574204851265, + "grad_norm": 5.962969312451661, + "learning_rate": 1.0250737115349101e-07, + "loss": 0.6108, + "step": 25226 + }, + { + "epoch": 1.822529665685336, + "grad_norm": 6.206586587773766, + "learning_rate": 1.024244876931832e-07, + "loss": 0.5508, + "step": 25227 + }, + { + "epoch": 1.8226019108855456, + "grad_norm": 8.913534404547653, + "learning_rate": 1.0234163705382244e-07, + "loss": 0.6325, + "step": 25228 + }, + { + "epoch": 1.8226741560857551, + "grad_norm": 6.4345497852391125, + "learning_rate": 1.0225881923654313e-07, + "loss": 0.5772, + "step": 25229 + }, + { + "epoch": 1.8227464012859644, + "grad_norm": 6.9987298630569335, + "learning_rate": 1.0217603424247907e-07, + "loss": 0.608, + "step": 25230 + }, + { + "epoch": 1.8228186464861742, + "grad_norm": 6.167901954989816, + "learning_rate": 1.0209328207276326e-07, + "loss": 0.5904, + "step": 25231 + }, + { + "epoch": 1.8228908916863835, + "grad_norm": 6.930122645670364, + "learning_rate": 1.0201056272852838e-07, + "loss": 0.5694, + "step": 25232 + }, + { + "epoch": 1.822963136886593, + "grad_norm": 5.880024161775733, + "learning_rate": 1.0192787621090689e-07, + "loss": 0.5595, + "step": 25233 + }, + { + "epoch": 1.8230353820868026, + "grad_norm": 6.487176423281345, + "learning_rate": 1.0184522252103091e-07, + "loss": 0.559, + "step": 25234 + }, + { + "epoch": 1.8231076272870121, + "grad_norm": 5.969047818568014, + "learning_rate": 1.0176260166003177e-07, + "loss": 0.6131, + "step": 25235 + }, + { + "epoch": 1.8231798724872217, + "grad_norm": 8.411315526232263, + "learning_rate": 1.0168001362904051e-07, + "loss": 0.6219, + "step": 25236 + }, + { + "epoch": 1.823252117687431, + "grad_norm": 6.852572672200069, + "learning_rate": 1.0159745842918789e-07, + "loss": 0.6152, + "step": 25237 + }, + { + "epoch": 1.8233243628876408, + "grad_norm": 7.990205968462705, + "learning_rate": 1.0151493606160357e-07, + "loss": 0.6343, + "step": 25238 + }, + { + "epoch": 1.82339660808785, + "grad_norm": 7.056415198447499, + "learning_rate": 1.0143244652741746e-07, + "loss": 0.5907, + "step": 25239 + }, + { + "epoch": 1.8234688532880596, + "grad_norm": 7.293625951658412, + "learning_rate": 1.0134998982775895e-07, + "loss": 0.6455, + "step": 25240 + }, + { + "epoch": 1.8235410984882692, + "grad_norm": 7.269736424932357, + "learning_rate": 1.0126756596375687e-07, + "loss": 0.604, + "step": 25241 + }, + { + "epoch": 1.8236133436884787, + "grad_norm": 7.916943461615351, + "learning_rate": 1.011851749365389e-07, + "loss": 0.6991, + "step": 25242 + }, + { + "epoch": 1.8236855888886883, + "grad_norm": 6.95829145529322, + "learning_rate": 1.0110281674723305e-07, + "loss": 0.5854, + "step": 25243 + }, + { + "epoch": 1.8237578340888976, + "grad_norm": 8.184071063049188, + "learning_rate": 1.0102049139696785e-07, + "loss": 0.5954, + "step": 25244 + }, + { + "epoch": 1.8238300792891073, + "grad_norm": 7.232595929413852, + "learning_rate": 1.0093819888686879e-07, + "loss": 0.6496, + "step": 25245 + }, + { + "epoch": 1.8239023244893167, + "grad_norm": 9.313793937344986, + "learning_rate": 1.008559392180633e-07, + "loss": 0.6633, + "step": 25246 + }, + { + "epoch": 1.8239745696895262, + "grad_norm": 7.021539790288974, + "learning_rate": 1.0077371239167715e-07, + "loss": 0.5851, + "step": 25247 + }, + { + "epoch": 1.8240468148897357, + "grad_norm": 8.359231708760213, + "learning_rate": 1.0069151840883612e-07, + "loss": 0.6465, + "step": 25248 + }, + { + "epoch": 1.8241190600899453, + "grad_norm": 7.508753436744885, + "learning_rate": 1.0060935727066513e-07, + "loss": 0.6394, + "step": 25249 + }, + { + "epoch": 1.8241913052901548, + "grad_norm": 7.57777413411331, + "learning_rate": 1.0052722897828882e-07, + "loss": 0.6293, + "step": 25250 + }, + { + "epoch": 1.8242635504903641, + "grad_norm": 6.645357631132752, + "learning_rate": 1.0044513353283186e-07, + "loss": 0.5499, + "step": 25251 + }, + { + "epoch": 1.824335795690574, + "grad_norm": 7.9503418375093755, + "learning_rate": 1.0036307093541808e-07, + "loss": 0.6141, + "step": 25252 + }, + { + "epoch": 1.8244080408907832, + "grad_norm": 7.453820793894245, + "learning_rate": 1.002810411871702e-07, + "loss": 0.5697, + "step": 25253 + }, + { + "epoch": 1.8244802860909928, + "grad_norm": 6.930760039339148, + "learning_rate": 1.0019904428921201e-07, + "loss": 0.5951, + "step": 25254 + }, + { + "epoch": 1.8245525312912023, + "grad_norm": 6.9786654012404625, + "learning_rate": 1.001170802426657e-07, + "loss": 0.6146, + "step": 25255 + }, + { + "epoch": 1.8246247764914119, + "grad_norm": 6.861209980213266, + "learning_rate": 1.0003514904865285e-07, + "loss": 0.5555, + "step": 25256 + }, + { + "epoch": 1.8246970216916214, + "grad_norm": 7.297160957826927, + "learning_rate": 9.995325070829537e-08, + "loss": 0.5559, + "step": 25257 + }, + { + "epoch": 1.8247692668918307, + "grad_norm": 7.522666249895991, + "learning_rate": 9.987138522271456e-08, + "loss": 0.5745, + "step": 25258 + }, + { + "epoch": 1.8248415120920405, + "grad_norm": 7.329322440563257, + "learning_rate": 9.978955259303091e-08, + "loss": 0.6422, + "step": 25259 + }, + { + "epoch": 1.8249137572922498, + "grad_norm": 7.316714906062799, + "learning_rate": 9.970775282036438e-08, + "loss": 0.5839, + "step": 25260 + }, + { + "epoch": 1.8249860024924596, + "grad_norm": 6.828558796735847, + "learning_rate": 9.962598590583516e-08, + "loss": 0.6263, + "step": 25261 + }, + { + "epoch": 1.825058247692669, + "grad_norm": 8.572538630948335, + "learning_rate": 9.954425185056238e-08, + "loss": 0.6079, + "step": 25262 + }, + { + "epoch": 1.8251304928928784, + "grad_norm": 7.340020973944776, + "learning_rate": 9.946255065566513e-08, + "loss": 0.5827, + "step": 25263 + }, + { + "epoch": 1.825202738093088, + "grad_norm": 7.676553267076042, + "learning_rate": 9.938088232226139e-08, + "loss": 0.6435, + "step": 25264 + }, + { + "epoch": 1.8252749832932973, + "grad_norm": 7.916530273333357, + "learning_rate": 9.929924685146947e-08, + "loss": 0.5977, + "step": 25265 + }, + { + "epoch": 1.825347228493507, + "grad_norm": 7.343725066447035, + "learning_rate": 9.921764424440706e-08, + "loss": 0.6158, + "step": 25266 + }, + { + "epoch": 1.8254194736937164, + "grad_norm": 6.941966500586392, + "learning_rate": 9.913607450219104e-08, + "loss": 0.5441, + "step": 25267 + }, + { + "epoch": 1.8254917188939261, + "grad_norm": 7.588684401853047, + "learning_rate": 9.905453762593776e-08, + "loss": 0.6259, + "step": 25268 + }, + { + "epoch": 1.8255639640941355, + "grad_norm": 6.83852880573056, + "learning_rate": 9.897303361676381e-08, + "loss": 0.5809, + "step": 25269 + }, + { + "epoch": 1.825636209294345, + "grad_norm": 9.198099470788067, + "learning_rate": 9.889156247578469e-08, + "loss": 0.5998, + "step": 25270 + }, + { + "epoch": 1.8257084544945545, + "grad_norm": 6.652469044713334, + "learning_rate": 9.881012420411535e-08, + "loss": 0.6175, + "step": 25271 + }, + { + "epoch": 1.8257806996947639, + "grad_norm": 7.500104267667144, + "learning_rate": 9.872871880287128e-08, + "loss": 0.6078, + "step": 25272 + }, + { + "epoch": 1.8258529448949736, + "grad_norm": 6.79685283087261, + "learning_rate": 9.864734627316713e-08, + "loss": 0.63, + "step": 25273 + }, + { + "epoch": 1.825925190095183, + "grad_norm": 7.993435550616235, + "learning_rate": 9.856600661611565e-08, + "loss": 0.5893, + "step": 25274 + }, + { + "epoch": 1.8259974352953927, + "grad_norm": 8.12799251907485, + "learning_rate": 9.848469983283094e-08, + "loss": 0.5704, + "step": 25275 + }, + { + "epoch": 1.826069680495602, + "grad_norm": 7.3525465206372385, + "learning_rate": 9.840342592442654e-08, + "loss": 0.5964, + "step": 25276 + }, + { + "epoch": 1.8261419256958116, + "grad_norm": 7.769120532994785, + "learning_rate": 9.832218489201435e-08, + "loss": 0.6519, + "step": 25277 + }, + { + "epoch": 1.8262141708960211, + "grad_norm": 6.777082859261706, + "learning_rate": 9.824097673670624e-08, + "loss": 0.5162, + "step": 25278 + }, + { + "epoch": 1.8262864160962307, + "grad_norm": 7.469068959340566, + "learning_rate": 9.815980145961468e-08, + "loss": 0.6027, + "step": 25279 + }, + { + "epoch": 1.8263586612964402, + "grad_norm": 8.151050766684882, + "learning_rate": 9.807865906185043e-08, + "loss": 0.6013, + "step": 25280 + }, + { + "epoch": 1.8264309064966495, + "grad_norm": 7.332030209443187, + "learning_rate": 9.799754954452428e-08, + "loss": 0.6384, + "step": 25281 + }, + { + "epoch": 1.8265031516968593, + "grad_norm": 8.36892707757019, + "learning_rate": 9.791647290874673e-08, + "loss": 0.6531, + "step": 25282 + }, + { + "epoch": 1.8265753968970686, + "grad_norm": 7.184641394004938, + "learning_rate": 9.783542915562772e-08, + "loss": 0.6006, + "step": 25283 + }, + { + "epoch": 1.8266476420972781, + "grad_norm": 7.8047170705301925, + "learning_rate": 9.775441828627635e-08, + "loss": 0.5597, + "step": 25284 + }, + { + "epoch": 1.8267198872974877, + "grad_norm": 8.670569299224605, + "learning_rate": 9.767344030180176e-08, + "loss": 0.6668, + "step": 25285 + }, + { + "epoch": 1.8267921324976972, + "grad_norm": 7.795739100702428, + "learning_rate": 9.759249520331248e-08, + "loss": 0.5665, + "step": 25286 + }, + { + "epoch": 1.8268643776979068, + "grad_norm": 7.764996938809236, + "learning_rate": 9.751158299191682e-08, + "loss": 0.558, + "step": 25287 + }, + { + "epoch": 1.826936622898116, + "grad_norm": 8.565051958468759, + "learning_rate": 9.743070366872193e-08, + "loss": 0.6324, + "step": 25288 + }, + { + "epoch": 1.8270088680983259, + "grad_norm": 8.199760284641615, + "learning_rate": 9.734985723483526e-08, + "loss": 0.6036, + "step": 25289 + }, + { + "epoch": 1.8270811132985352, + "grad_norm": 7.940038080004029, + "learning_rate": 9.726904369136341e-08, + "loss": 0.6127, + "step": 25290 + }, + { + "epoch": 1.8271533584987447, + "grad_norm": 7.904641451440418, + "learning_rate": 9.718826303941304e-08, + "loss": 0.6089, + "step": 25291 + }, + { + "epoch": 1.8272256036989543, + "grad_norm": 7.131911288733974, + "learning_rate": 9.710751528008932e-08, + "loss": 0.5394, + "step": 25292 + }, + { + "epoch": 1.8272978488991638, + "grad_norm": 7.158080820493357, + "learning_rate": 9.702680041449808e-08, + "loss": 0.5851, + "step": 25293 + }, + { + "epoch": 1.8273700940993733, + "grad_norm": 9.023959586492635, + "learning_rate": 9.69461184437448e-08, + "loss": 0.5862, + "step": 25294 + }, + { + "epoch": 1.8274423392995827, + "grad_norm": 7.021764978311688, + "learning_rate": 9.686546936893249e-08, + "loss": 0.6478, + "step": 25295 + }, + { + "epoch": 1.8275145844997924, + "grad_norm": 9.16864445475422, + "learning_rate": 9.678485319116638e-08, + "loss": 0.6186, + "step": 25296 + }, + { + "epoch": 1.8275868297000017, + "grad_norm": 6.809841652260163, + "learning_rate": 9.670426991154946e-08, + "loss": 0.5811, + "step": 25297 + }, + { + "epoch": 1.8276590749002113, + "grad_norm": 6.807948840835512, + "learning_rate": 9.662371953118532e-08, + "loss": 0.5063, + "step": 25298 + }, + { + "epoch": 1.8277313201004208, + "grad_norm": 6.8210812339715945, + "learning_rate": 9.654320205117613e-08, + "loss": 0.6093, + "step": 25299 + }, + { + "epoch": 1.8278035653006304, + "grad_norm": 6.887150517785906, + "learning_rate": 9.646271747262403e-08, + "loss": 0.6074, + "step": 25300 + }, + { + "epoch": 1.82787581050084, + "grad_norm": 6.408588559442613, + "learning_rate": 9.638226579663179e-08, + "loss": 0.5817, + "step": 25301 + }, + { + "epoch": 1.8279480557010492, + "grad_norm": 8.196537533560283, + "learning_rate": 9.630184702429963e-08, + "loss": 0.6265, + "step": 25302 + }, + { + "epoch": 1.828020300901259, + "grad_norm": 7.901570378926268, + "learning_rate": 9.622146115672887e-08, + "loss": 0.5984, + "step": 25303 + }, + { + "epoch": 1.8280925461014683, + "grad_norm": 7.365879950078256, + "learning_rate": 9.614110819502004e-08, + "loss": 0.5383, + "step": 25304 + }, + { + "epoch": 1.8281647913016779, + "grad_norm": 7.446455236363017, + "learning_rate": 9.606078814027309e-08, + "loss": 0.5621, + "step": 25305 + }, + { + "epoch": 1.8282370365018874, + "grad_norm": 8.358940555299531, + "learning_rate": 9.598050099358713e-08, + "loss": 0.6485, + "step": 25306 + }, + { + "epoch": 1.828309281702097, + "grad_norm": 8.380143137872436, + "learning_rate": 9.590024675606158e-08, + "loss": 0.6034, + "step": 25307 + }, + { + "epoch": 1.8283815269023065, + "grad_norm": 7.3187917393798045, + "learning_rate": 9.582002542879471e-08, + "loss": 0.5876, + "step": 25308 + }, + { + "epoch": 1.8284537721025158, + "grad_norm": 7.829467414681448, + "learning_rate": 9.573983701288536e-08, + "loss": 0.6389, + "step": 25309 + }, + { + "epoch": 1.8285260173027256, + "grad_norm": 8.00318177846171, + "learning_rate": 9.565968150943073e-08, + "loss": 0.607, + "step": 25310 + }, + { + "epoch": 1.828598262502935, + "grad_norm": 7.949268417765596, + "learning_rate": 9.557955891952797e-08, + "loss": 0.6366, + "step": 25311 + }, + { + "epoch": 1.8286705077031444, + "grad_norm": 8.073073440178023, + "learning_rate": 9.549946924427483e-08, + "loss": 0.5911, + "step": 25312 + }, + { + "epoch": 1.828742752903354, + "grad_norm": 7.365135967089861, + "learning_rate": 9.541941248476627e-08, + "loss": 0.5858, + "step": 25313 + }, + { + "epoch": 1.8288149981035635, + "grad_norm": 7.716950859400768, + "learning_rate": 9.533938864209918e-08, + "loss": 0.6195, + "step": 25314 + }, + { + "epoch": 1.828887243303773, + "grad_norm": 7.730768275999937, + "learning_rate": 9.52593977173688e-08, + "loss": 0.619, + "step": 25315 + }, + { + "epoch": 1.8289594885039824, + "grad_norm": 8.035203250196766, + "learning_rate": 9.517943971167037e-08, + "loss": 0.5486, + "step": 25316 + }, + { + "epoch": 1.8290317337041921, + "grad_norm": 6.272575803795078, + "learning_rate": 9.509951462609774e-08, + "loss": 0.5845, + "step": 25317 + }, + { + "epoch": 1.8291039789044015, + "grad_norm": 7.9364825219852495, + "learning_rate": 9.501962246174556e-08, + "loss": 0.5898, + "step": 25318 + }, + { + "epoch": 1.829176224104611, + "grad_norm": 8.087880484402602, + "learning_rate": 9.493976321970743e-08, + "loss": 0.5723, + "step": 25319 + }, + { + "epoch": 1.8292484693048205, + "grad_norm": 7.425927399025371, + "learning_rate": 9.485993690107637e-08, + "loss": 0.5756, + "step": 25320 + }, + { + "epoch": 1.82932071450503, + "grad_norm": 7.052915838585618, + "learning_rate": 9.478014350694536e-08, + "loss": 0.6309, + "step": 25321 + }, + { + "epoch": 1.8293929597052396, + "grad_norm": 7.934917600572854, + "learning_rate": 9.470038303840689e-08, + "loss": 0.5352, + "step": 25322 + }, + { + "epoch": 1.829465204905449, + "grad_norm": 7.830428159658591, + "learning_rate": 9.462065549655258e-08, + "loss": 0.6382, + "step": 25323 + }, + { + "epoch": 1.8295374501056587, + "grad_norm": 7.711671618422987, + "learning_rate": 9.454096088247377e-08, + "loss": 0.6543, + "step": 25324 + }, + { + "epoch": 1.829609695305868, + "grad_norm": 8.144632418436409, + "learning_rate": 9.446129919726155e-08, + "loss": 0.6521, + "step": 25325 + }, + { + "epoch": 1.8296819405060776, + "grad_norm": 8.727916643200945, + "learning_rate": 9.438167044200641e-08, + "loss": 0.5873, + "step": 25326 + }, + { + "epoch": 1.8297541857062871, + "grad_norm": 7.613013429662699, + "learning_rate": 9.430207461779806e-08, + "loss": 0.6095, + "step": 25327 + }, + { + "epoch": 1.8298264309064967, + "grad_norm": 7.740580833780312, + "learning_rate": 9.422251172572616e-08, + "loss": 0.6317, + "step": 25328 + }, + { + "epoch": 1.8298986761067062, + "grad_norm": 7.619344646476945, + "learning_rate": 9.414298176688069e-08, + "loss": 0.599, + "step": 25329 + }, + { + "epoch": 1.8299709213069155, + "grad_norm": 6.277661565344528, + "learning_rate": 9.406348474234966e-08, + "loss": 0.6079, + "step": 25330 + }, + { + "epoch": 1.8300431665071253, + "grad_norm": 7.662809839254029, + "learning_rate": 9.398402065322138e-08, + "loss": 0.6261, + "step": 25331 + }, + { + "epoch": 1.8301154117073346, + "grad_norm": 6.839455567939623, + "learning_rate": 9.390458950058357e-08, + "loss": 0.5834, + "step": 25332 + }, + { + "epoch": 1.8301876569075444, + "grad_norm": 6.116515218884979, + "learning_rate": 9.382519128552426e-08, + "loss": 0.5453, + "step": 25333 + }, + { + "epoch": 1.8302599021077537, + "grad_norm": 7.0376208117128, + "learning_rate": 9.374582600912924e-08, + "loss": 0.6443, + "step": 25334 + }, + { + "epoch": 1.8303321473079632, + "grad_norm": 7.11980529947564, + "learning_rate": 9.36664936724857e-08, + "loss": 0.5573, + "step": 25335 + }, + { + "epoch": 1.8304043925081728, + "grad_norm": 7.023963238109415, + "learning_rate": 9.358719427667917e-08, + "loss": 0.6254, + "step": 25336 + }, + { + "epoch": 1.830476637708382, + "grad_norm": 6.547322770058176, + "learning_rate": 9.350792782279571e-08, + "loss": 0.5945, + "step": 25337 + }, + { + "epoch": 1.8305488829085919, + "grad_norm": 7.742240466755625, + "learning_rate": 9.34286943119203e-08, + "loss": 0.6357, + "step": 25338 + }, + { + "epoch": 1.8306211281088012, + "grad_norm": 7.1217397625926875, + "learning_rate": 9.334949374513736e-08, + "loss": 0.5125, + "step": 25339 + }, + { + "epoch": 1.830693373309011, + "grad_norm": 6.299379518016971, + "learning_rate": 9.327032612353126e-08, + "loss": 0.5288, + "step": 25340 + }, + { + "epoch": 1.8307656185092203, + "grad_norm": 9.109673994074669, + "learning_rate": 9.319119144818589e-08, + "loss": 0.6179, + "step": 25341 + }, + { + "epoch": 1.8308378637094298, + "grad_norm": 8.445988207242241, + "learning_rate": 9.311208972018426e-08, + "loss": 0.6106, + "step": 25342 + }, + { + "epoch": 1.8309101089096393, + "grad_norm": 8.279686891836882, + "learning_rate": 9.303302094060911e-08, + "loss": 0.6921, + "step": 25343 + }, + { + "epoch": 1.8309823541098487, + "grad_norm": 6.582169474160648, + "learning_rate": 9.295398511054321e-08, + "loss": 0.5809, + "step": 25344 + }, + { + "epoch": 1.8310545993100584, + "grad_norm": 7.475291942394185, + "learning_rate": 9.287498223106817e-08, + "loss": 0.6346, + "step": 25345 + }, + { + "epoch": 1.8311268445102677, + "grad_norm": 8.023999930841022, + "learning_rate": 9.279601230326562e-08, + "loss": 0.6278, + "step": 25346 + }, + { + "epoch": 1.8311990897104775, + "grad_norm": 8.201889787954359, + "learning_rate": 9.271707532821667e-08, + "loss": 0.6148, + "step": 25347 + }, + { + "epoch": 1.8312713349106868, + "grad_norm": 6.23689819368323, + "learning_rate": 9.263817130700153e-08, + "loss": 0.5219, + "step": 25348 + }, + { + "epoch": 1.8313435801108964, + "grad_norm": 7.6739596081093975, + "learning_rate": 9.255930024070076e-08, + "loss": 0.6456, + "step": 25349 + }, + { + "epoch": 1.831415825311106, + "grad_norm": 6.355735243337189, + "learning_rate": 9.248046213039403e-08, + "loss": 0.5583, + "step": 25350 + }, + { + "epoch": 1.8314880705113155, + "grad_norm": 7.202908659559622, + "learning_rate": 9.240165697716074e-08, + "loss": 0.627, + "step": 25351 + }, + { + "epoch": 1.831560315711525, + "grad_norm": 6.2010396147312905, + "learning_rate": 9.232288478207896e-08, + "loss": 0.5701, + "step": 25352 + }, + { + "epoch": 1.8316325609117343, + "grad_norm": 7.256718909385563, + "learning_rate": 9.224414554622751e-08, + "loss": 0.5875, + "step": 25353 + }, + { + "epoch": 1.831704806111944, + "grad_norm": 5.679786565827065, + "learning_rate": 9.216543927068417e-08, + "loss": 0.591, + "step": 25354 + }, + { + "epoch": 1.8317770513121534, + "grad_norm": 6.996426351320588, + "learning_rate": 9.208676595652694e-08, + "loss": 0.609, + "step": 25355 + }, + { + "epoch": 1.831849296512363, + "grad_norm": 7.317846707340139, + "learning_rate": 9.200812560483136e-08, + "loss": 0.6353, + "step": 25356 + }, + { + "epoch": 1.8319215417125725, + "grad_norm": 8.429348094708025, + "learning_rate": 9.192951821667517e-08, + "loss": 0.5691, + "step": 25357 + }, + { + "epoch": 1.831993786912782, + "grad_norm": 9.259560670256032, + "learning_rate": 9.185094379313448e-08, + "loss": 0.6911, + "step": 25358 + }, + { + "epoch": 1.8320660321129916, + "grad_norm": 7.881847734646883, + "learning_rate": 9.17724023352845e-08, + "loss": 0.5888, + "step": 25359 + }, + { + "epoch": 1.832138277313201, + "grad_norm": 7.982902615050302, + "learning_rate": 9.169389384420024e-08, + "loss": 0.6078, + "step": 25360 + }, + { + "epoch": 1.8322105225134107, + "grad_norm": 6.9345680433429955, + "learning_rate": 9.161541832095666e-08, + "loss": 0.6359, + "step": 25361 + }, + { + "epoch": 1.83228276771362, + "grad_norm": 6.86615225460718, + "learning_rate": 9.153697576662818e-08, + "loss": 0.5736, + "step": 25362 + }, + { + "epoch": 1.8323550129138295, + "grad_norm": 8.036049206732581, + "learning_rate": 9.145856618228837e-08, + "loss": 0.558, + "step": 25363 + }, + { + "epoch": 1.832427258114039, + "grad_norm": 7.149196563609575, + "learning_rate": 9.138018956901057e-08, + "loss": 0.5257, + "step": 25364 + }, + { + "epoch": 1.8324995033142486, + "grad_norm": 7.79865912625244, + "learning_rate": 9.130184592786779e-08, + "loss": 0.618, + "step": 25365 + }, + { + "epoch": 1.8325717485144581, + "grad_norm": 6.951639149526849, + "learning_rate": 9.12235352599325e-08, + "loss": 0.6024, + "step": 25366 + }, + { + "epoch": 1.8326439937146675, + "grad_norm": 8.236309830353763, + "learning_rate": 9.114525756627691e-08, + "loss": 0.6126, + "step": 25367 + }, + { + "epoch": 1.8327162389148772, + "grad_norm": 7.669574959925613, + "learning_rate": 9.106701284797237e-08, + "loss": 0.6242, + "step": 25368 + }, + { + "epoch": 1.8327884841150865, + "grad_norm": 7.025517682858553, + "learning_rate": 9.098880110608999e-08, + "loss": 0.5862, + "step": 25369 + }, + { + "epoch": 1.832860729315296, + "grad_norm": 7.399679238868044, + "learning_rate": 9.091062234170028e-08, + "loss": 0.5907, + "step": 25370 + }, + { + "epoch": 1.8329329745155056, + "grad_norm": 7.304190323989546, + "learning_rate": 9.08324765558738e-08, + "loss": 0.5511, + "step": 25371 + }, + { + "epoch": 1.8330052197157152, + "grad_norm": 6.4839417094505665, + "learning_rate": 9.075436374967994e-08, + "loss": 0.6311, + "step": 25372 + }, + { + "epoch": 1.8330774649159247, + "grad_norm": 7.001953125, + "learning_rate": 9.067628392418871e-08, + "loss": 0.5771, + "step": 25373 + }, + { + "epoch": 1.833149710116134, + "grad_norm": 5.9707107918253035, + "learning_rate": 9.059823708046783e-08, + "loss": 0.6182, + "step": 25374 + }, + { + "epoch": 1.8332219553163438, + "grad_norm": 8.020397884554264, + "learning_rate": 9.052022321958648e-08, + "loss": 0.5518, + "step": 25375 + }, + { + "epoch": 1.8332942005165531, + "grad_norm": 6.7126681457713895, + "learning_rate": 9.04422423426124e-08, + "loss": 0.6126, + "step": 25376 + }, + { + "epoch": 1.8333664457167627, + "grad_norm": 6.714318805833336, + "learning_rate": 9.036429445061307e-08, + "loss": 0.6198, + "step": 25377 + }, + { + "epoch": 1.8334386909169722, + "grad_norm": 8.731095737413426, + "learning_rate": 9.028637954465569e-08, + "loss": 0.6651, + "step": 25378 + }, + { + "epoch": 1.8335109361171817, + "grad_norm": 6.414920057243139, + "learning_rate": 9.020849762580663e-08, + "loss": 0.6509, + "step": 25379 + }, + { + "epoch": 1.8335831813173913, + "grad_norm": 8.145664639364439, + "learning_rate": 9.013064869513255e-08, + "loss": 0.5839, + "step": 25380 + }, + { + "epoch": 1.8336554265176006, + "grad_norm": 8.557304198119882, + "learning_rate": 9.005283275369842e-08, + "loss": 0.6196, + "step": 25381 + }, + { + "epoch": 1.8337276717178104, + "grad_norm": 6.862013324650119, + "learning_rate": 8.997504980256977e-08, + "loss": 0.6268, + "step": 25382 + }, + { + "epoch": 1.8337999169180197, + "grad_norm": 6.788588811330114, + "learning_rate": 8.989729984281159e-08, + "loss": 0.5585, + "step": 25383 + }, + { + "epoch": 1.8338721621182292, + "grad_norm": 8.229151420740097, + "learning_rate": 8.981958287548803e-08, + "loss": 0.6068, + "step": 25384 + }, + { + "epoch": 1.8339444073184388, + "grad_norm": 8.067478742261175, + "learning_rate": 8.974189890166268e-08, + "loss": 0.5929, + "step": 25385 + }, + { + "epoch": 1.8340166525186483, + "grad_norm": 8.140771201902112, + "learning_rate": 8.966424792239942e-08, + "loss": 0.5482, + "step": 25386 + }, + { + "epoch": 1.8340888977188579, + "grad_norm": 6.985820304964332, + "learning_rate": 8.958662993876154e-08, + "loss": 0.5416, + "step": 25387 + }, + { + "epoch": 1.8341611429190672, + "grad_norm": 7.487047327588285, + "learning_rate": 8.950904495181074e-08, + "loss": 0.5409, + "step": 25388 + }, + { + "epoch": 1.834233388119277, + "grad_norm": 8.159904742713376, + "learning_rate": 8.943149296260944e-08, + "loss": 0.7174, + "step": 25389 + }, + { + "epoch": 1.8343056333194863, + "grad_norm": 7.587151065954033, + "learning_rate": 8.935397397221935e-08, + "loss": 0.607, + "step": 25390 + }, + { + "epoch": 1.8343778785196958, + "grad_norm": 9.194572373569773, + "learning_rate": 8.927648798170207e-08, + "loss": 0.6774, + "step": 25391 + }, + { + "epoch": 1.8344501237199053, + "grad_norm": 7.057826293364316, + "learning_rate": 8.919903499211735e-08, + "loss": 0.5775, + "step": 25392 + }, + { + "epoch": 1.8345223689201149, + "grad_norm": 8.200513907355232, + "learning_rate": 8.912161500452599e-08, + "loss": 0.6679, + "step": 25393 + }, + { + "epoch": 1.8345946141203244, + "grad_norm": 7.357857646953495, + "learning_rate": 8.904422801998797e-08, + "loss": 0.5661, + "step": 25394 + }, + { + "epoch": 1.8346668593205337, + "grad_norm": 7.359195650349105, + "learning_rate": 8.896687403956245e-08, + "loss": 0.6269, + "step": 25395 + }, + { + "epoch": 1.8347391045207435, + "grad_norm": 8.033119310664755, + "learning_rate": 8.88895530643083e-08, + "loss": 0.5695, + "step": 25396 + }, + { + "epoch": 1.8348113497209528, + "grad_norm": 8.029209219069694, + "learning_rate": 8.881226509528384e-08, + "loss": 0.6892, + "step": 25397 + }, + { + "epoch": 1.8348835949211624, + "grad_norm": 6.3300103287213325, + "learning_rate": 8.873501013354796e-08, + "loss": 0.487, + "step": 25398 + }, + { + "epoch": 1.834955840121372, + "grad_norm": 8.181374158274133, + "learning_rate": 8.865778818015702e-08, + "loss": 0.6695, + "step": 25399 + }, + { + "epoch": 1.8350280853215815, + "grad_norm": 7.556837499858494, + "learning_rate": 8.858059923616879e-08, + "loss": 0.6964, + "step": 25400 + }, + { + "epoch": 1.835100330521791, + "grad_norm": 7.383244190008015, + "learning_rate": 8.850344330264021e-08, + "loss": 0.562, + "step": 25401 + }, + { + "epoch": 1.8351725757220003, + "grad_norm": 6.165567695362975, + "learning_rate": 8.842632038062681e-08, + "loss": 0.6287, + "step": 25402 + }, + { + "epoch": 1.83524482092221, + "grad_norm": 7.320741883084665, + "learning_rate": 8.834923047118444e-08, + "loss": 0.6429, + "step": 25403 + }, + { + "epoch": 1.8353170661224194, + "grad_norm": 5.624030305143185, + "learning_rate": 8.82721735753686e-08, + "loss": 0.5599, + "step": 25404 + }, + { + "epoch": 1.8353893113226292, + "grad_norm": 8.002749447425035, + "learning_rate": 8.819514969423404e-08, + "loss": 0.6006, + "step": 25405 + }, + { + "epoch": 1.8354615565228385, + "grad_norm": 7.38863857422115, + "learning_rate": 8.811815882883545e-08, + "loss": 0.5715, + "step": 25406 + }, + { + "epoch": 1.835533801723048, + "grad_norm": 7.459631808606287, + "learning_rate": 8.804120098022645e-08, + "loss": 0.6859, + "step": 25407 + }, + { + "epoch": 1.8356060469232576, + "grad_norm": 7.404022268054683, + "learning_rate": 8.79642761494609e-08, + "loss": 0.6115, + "step": 25408 + }, + { + "epoch": 1.835678292123467, + "grad_norm": 9.550987344402133, + "learning_rate": 8.788738433759131e-08, + "loss": 0.6523, + "step": 25409 + }, + { + "epoch": 1.8357505373236767, + "grad_norm": 8.330871930597867, + "learning_rate": 8.781052554567071e-08, + "loss": 0.6136, + "step": 25410 + }, + { + "epoch": 1.835822782523886, + "grad_norm": 8.525010059160579, + "learning_rate": 8.773369977475105e-08, + "loss": 0.5324, + "step": 25411 + }, + { + "epoch": 1.8358950277240957, + "grad_norm": 7.414080765503712, + "learning_rate": 8.765690702588397e-08, + "loss": 0.5843, + "step": 25412 + }, + { + "epoch": 1.835967272924305, + "grad_norm": 7.266955575828468, + "learning_rate": 8.758014730012059e-08, + "loss": 0.6507, + "step": 25413 + }, + { + "epoch": 1.8360395181245146, + "grad_norm": 6.646004542075714, + "learning_rate": 8.750342059851203e-08, + "loss": 0.6025, + "step": 25414 + }, + { + "epoch": 1.8361117633247241, + "grad_norm": 7.049471290188464, + "learning_rate": 8.742672692210851e-08, + "loss": 0.6799, + "step": 25415 + }, + { + "epoch": 1.8361840085249335, + "grad_norm": 6.95892517253094, + "learning_rate": 8.73500662719598e-08, + "loss": 0.6106, + "step": 25416 + }, + { + "epoch": 1.8362562537251432, + "grad_norm": 7.15060668718907, + "learning_rate": 8.727343864911558e-08, + "loss": 0.5477, + "step": 25417 + }, + { + "epoch": 1.8363284989253525, + "grad_norm": 7.808980896871568, + "learning_rate": 8.71968440546242e-08, + "loss": 0.6202, + "step": 25418 + }, + { + "epoch": 1.8364007441255623, + "grad_norm": 6.031893839096707, + "learning_rate": 8.712028248953507e-08, + "loss": 0.5669, + "step": 25419 + }, + { + "epoch": 1.8364729893257716, + "grad_norm": 7.019789379470648, + "learning_rate": 8.704375395489572e-08, + "loss": 0.6112, + "step": 25420 + }, + { + "epoch": 1.8365452345259812, + "grad_norm": 8.63804414086834, + "learning_rate": 8.696725845175364e-08, + "loss": 0.6725, + "step": 25421 + }, + { + "epoch": 1.8366174797261907, + "grad_norm": 6.465455031523183, + "learning_rate": 8.689079598115601e-08, + "loss": 0.5856, + "step": 25422 + }, + { + "epoch": 1.8366897249264, + "grad_norm": 7.540123488328177, + "learning_rate": 8.68143665441501e-08, + "loss": 0.6125, + "step": 25423 + }, + { + "epoch": 1.8367619701266098, + "grad_norm": 7.621077482283381, + "learning_rate": 8.67379701417817e-08, + "loss": 0.5533, + "step": 25424 + }, + { + "epoch": 1.8368342153268191, + "grad_norm": 6.8600784670954775, + "learning_rate": 8.666160677509638e-08, + "loss": 0.581, + "step": 25425 + }, + { + "epoch": 1.8369064605270289, + "grad_norm": 7.22399130779094, + "learning_rate": 8.658527644514054e-08, + "loss": 0.5951, + "step": 25426 + }, + { + "epoch": 1.8369787057272382, + "grad_norm": 7.727542209516136, + "learning_rate": 8.650897915295775e-08, + "loss": 0.6514, + "step": 25427 + }, + { + "epoch": 1.8370509509274477, + "grad_norm": 7.280978349194259, + "learning_rate": 8.643271489959331e-08, + "loss": 0.6354, + "step": 25428 + }, + { + "epoch": 1.8371231961276573, + "grad_norm": 6.3599814235967775, + "learning_rate": 8.635648368609084e-08, + "loss": 0.6123, + "step": 25429 + }, + { + "epoch": 1.8371954413278668, + "grad_norm": 7.785232444041968, + "learning_rate": 8.628028551349448e-08, + "loss": 0.5568, + "step": 25430 + }, + { + "epoch": 1.8372676865280764, + "grad_norm": 6.375662974870976, + "learning_rate": 8.620412038284649e-08, + "loss": 0.6061, + "step": 25431 + }, + { + "epoch": 1.8373399317282857, + "grad_norm": 7.218991230158374, + "learning_rate": 8.612798829518987e-08, + "loss": 0.637, + "step": 25432 + }, + { + "epoch": 1.8374121769284955, + "grad_norm": 7.456174854242036, + "learning_rate": 8.605188925156688e-08, + "loss": 0.5941, + "step": 25433 + }, + { + "epoch": 1.8374844221287048, + "grad_norm": 6.797825678844739, + "learning_rate": 8.597582325301918e-08, + "loss": 0.6226, + "step": 25434 + }, + { + "epoch": 1.8375566673289143, + "grad_norm": 6.770799591762882, + "learning_rate": 8.589979030058814e-08, + "loss": 0.6089, + "step": 25435 + }, + { + "epoch": 1.8376289125291239, + "grad_norm": 7.497119604765824, + "learning_rate": 8.582379039531464e-08, + "loss": 0.6159, + "step": 25436 + }, + { + "epoch": 1.8377011577293334, + "grad_norm": 7.375902540940806, + "learning_rate": 8.574782353823919e-08, + "loss": 0.6649, + "step": 25437 + }, + { + "epoch": 1.837773402929543, + "grad_norm": 7.70749427421523, + "learning_rate": 8.567188973040097e-08, + "loss": 0.6136, + "step": 25438 + }, + { + "epoch": 1.8378456481297523, + "grad_norm": 7.556621442047247, + "learning_rate": 8.559598897284027e-08, + "loss": 0.5495, + "step": 25439 + }, + { + "epoch": 1.837917893329962, + "grad_norm": 6.0088502461384055, + "learning_rate": 8.552012126659597e-08, + "loss": 0.6045, + "step": 25440 + }, + { + "epoch": 1.8379901385301713, + "grad_norm": 7.718985488326438, + "learning_rate": 8.544428661270587e-08, + "loss": 0.567, + "step": 25441 + }, + { + "epoch": 1.8380623837303809, + "grad_norm": 8.434942917644777, + "learning_rate": 8.536848501220912e-08, + "loss": 0.5687, + "step": 25442 + }, + { + "epoch": 1.8381346289305904, + "grad_norm": 6.2027194181207035, + "learning_rate": 8.529271646614295e-08, + "loss": 0.5868, + "step": 25443 + }, + { + "epoch": 1.8382068741308, + "grad_norm": 7.111064198789562, + "learning_rate": 8.521698097554487e-08, + "loss": 0.6388, + "step": 25444 + }, + { + "epoch": 1.8382791193310095, + "grad_norm": 7.539786285285758, + "learning_rate": 8.514127854145099e-08, + "loss": 0.6277, + "step": 25445 + }, + { + "epoch": 1.8383513645312188, + "grad_norm": 7.928860986790698, + "learning_rate": 8.506560916489798e-08, + "loss": 0.565, + "step": 25446 + }, + { + "epoch": 1.8384236097314286, + "grad_norm": 8.001681627914955, + "learning_rate": 8.498997284692196e-08, + "loss": 0.6098, + "step": 25447 + }, + { + "epoch": 1.838495854931638, + "grad_norm": 6.33902718767124, + "learning_rate": 8.49143695885582e-08, + "loss": 0.5162, + "step": 25448 + }, + { + "epoch": 1.8385681001318475, + "grad_norm": 8.685793043439213, + "learning_rate": 8.483879939084117e-08, + "loss": 0.5803, + "step": 25449 + }, + { + "epoch": 1.838640345332057, + "grad_norm": 7.993908708919136, + "learning_rate": 8.476326225480558e-08, + "loss": 0.5992, + "step": 25450 + }, + { + "epoch": 1.8387125905322665, + "grad_norm": 9.153690748615503, + "learning_rate": 8.46877581814859e-08, + "loss": 0.6046, + "step": 25451 + }, + { + "epoch": 1.838784835732476, + "grad_norm": 6.245469854804792, + "learning_rate": 8.461228717191545e-08, + "loss": 0.5941, + "step": 25452 + }, + { + "epoch": 1.8388570809326854, + "grad_norm": 8.21370643800392, + "learning_rate": 8.45368492271273e-08, + "loss": 0.6393, + "step": 25453 + }, + { + "epoch": 1.8389293261328952, + "grad_norm": 8.072711953469627, + "learning_rate": 8.446144434815395e-08, + "loss": 0.5808, + "step": 25454 + }, + { + "epoch": 1.8390015713331045, + "grad_norm": 6.719117620302459, + "learning_rate": 8.438607253602849e-08, + "loss": 0.5491, + "step": 25455 + }, + { + "epoch": 1.839073816533314, + "grad_norm": 8.083921850220273, + "learning_rate": 8.431073379178173e-08, + "loss": 0.6168, + "step": 25456 + }, + { + "epoch": 1.8391460617335236, + "grad_norm": 8.625709393947151, + "learning_rate": 8.423542811644536e-08, + "loss": 0.585, + "step": 25457 + }, + { + "epoch": 1.8392183069337331, + "grad_norm": 7.370106512203247, + "learning_rate": 8.41601555110505e-08, + "loss": 0.6205, + "step": 25458 + }, + { + "epoch": 1.8392905521339427, + "grad_norm": 7.809757331077099, + "learning_rate": 8.408491597662688e-08, + "loss": 0.6217, + "step": 25459 + }, + { + "epoch": 1.839362797334152, + "grad_norm": 7.554647104338743, + "learning_rate": 8.400970951420534e-08, + "loss": 0.5719, + "step": 25460 + }, + { + "epoch": 1.8394350425343617, + "grad_norm": 6.340257406695275, + "learning_rate": 8.393453612481478e-08, + "loss": 0.6058, + "step": 25461 + }, + { + "epoch": 1.839507287734571, + "grad_norm": 6.726935364139494, + "learning_rate": 8.385939580948437e-08, + "loss": 0.6066, + "step": 25462 + }, + { + "epoch": 1.8395795329347806, + "grad_norm": 6.724177949147157, + "learning_rate": 8.378428856924275e-08, + "loss": 0.5919, + "step": 25463 + }, + { + "epoch": 1.8396517781349901, + "grad_norm": 8.430742284420074, + "learning_rate": 8.370921440511825e-08, + "loss": 0.6138, + "step": 25464 + }, + { + "epoch": 1.8397240233351997, + "grad_norm": 6.420580805475364, + "learning_rate": 8.363417331813894e-08, + "loss": 0.5884, + "step": 25465 + }, + { + "epoch": 1.8397962685354092, + "grad_norm": 7.576599345055673, + "learning_rate": 8.355916530933122e-08, + "loss": 0.6134, + "step": 25466 + }, + { + "epoch": 1.8398685137356185, + "grad_norm": 6.2955935856908205, + "learning_rate": 8.348419037972205e-08, + "loss": 0.7014, + "step": 25467 + }, + { + "epoch": 1.8399407589358283, + "grad_norm": 8.199214097485578, + "learning_rate": 8.340924853033838e-08, + "loss": 0.5317, + "step": 25468 + }, + { + "epoch": 1.8400130041360376, + "grad_norm": 6.814435211534208, + "learning_rate": 8.33343397622055e-08, + "loss": 0.5008, + "step": 25469 + }, + { + "epoch": 1.8400852493362472, + "grad_norm": 7.604025336916158, + "learning_rate": 8.325946407634927e-08, + "loss": 0.6152, + "step": 25470 + }, + { + "epoch": 1.8401574945364567, + "grad_norm": 6.895809515444218, + "learning_rate": 8.318462147379414e-08, + "loss": 0.6262, + "step": 25471 + }, + { + "epoch": 1.8402297397366663, + "grad_norm": 7.2451947829321295, + "learning_rate": 8.31098119555654e-08, + "loss": 0.6379, + "step": 25472 + }, + { + "epoch": 1.8403019849368758, + "grad_norm": 5.803755550607808, + "learning_rate": 8.303503552268693e-08, + "loss": 0.5308, + "step": 25473 + }, + { + "epoch": 1.8403742301370851, + "grad_norm": 7.09889740240229, + "learning_rate": 8.296029217618184e-08, + "loss": 0.6588, + "step": 25474 + }, + { + "epoch": 1.8404464753372949, + "grad_norm": 8.098132504536782, + "learning_rate": 8.288558191707346e-08, + "loss": 0.6319, + "step": 25475 + }, + { + "epoch": 1.8405187205375042, + "grad_norm": 7.743439143039925, + "learning_rate": 8.281090474638514e-08, + "loss": 0.6368, + "step": 25476 + }, + { + "epoch": 1.8405909657377137, + "grad_norm": 7.855506627519323, + "learning_rate": 8.273626066513856e-08, + "loss": 0.5732, + "step": 25477 + }, + { + "epoch": 1.8406632109379233, + "grad_norm": 6.476599606912698, + "learning_rate": 8.266164967435542e-08, + "loss": 0.5892, + "step": 25478 + }, + { + "epoch": 1.8407354561381328, + "grad_norm": 7.823582227681321, + "learning_rate": 8.258707177505765e-08, + "loss": 0.5726, + "step": 25479 + }, + { + "epoch": 1.8408077013383424, + "grad_norm": 7.7499913861626775, + "learning_rate": 8.251252696826556e-08, + "loss": 0.601, + "step": 25480 + }, + { + "epoch": 1.8408799465385517, + "grad_norm": 7.093795977876191, + "learning_rate": 8.2438015255e-08, + "loss": 0.5755, + "step": 25481 + }, + { + "epoch": 1.8409521917387615, + "grad_norm": 6.600355554887004, + "learning_rate": 8.236353663628099e-08, + "loss": 0.528, + "step": 25482 + }, + { + "epoch": 1.8410244369389708, + "grad_norm": 7.077833695975122, + "learning_rate": 8.228909111312827e-08, + "loss": 0.6148, + "step": 25483 + }, + { + "epoch": 1.8410966821391805, + "grad_norm": 6.7833006961921525, + "learning_rate": 8.221467868656047e-08, + "loss": 0.6293, + "step": 25484 + }, + { + "epoch": 1.8411689273393899, + "grad_norm": 6.976712591558082, + "learning_rate": 8.21402993575962e-08, + "loss": 0.5844, + "step": 25485 + }, + { + "epoch": 1.8412411725395994, + "grad_norm": 7.48980172112569, + "learning_rate": 8.206595312725413e-08, + "loss": 0.617, + "step": 25486 + }, + { + "epoch": 1.841313417739809, + "grad_norm": 7.344963070936963, + "learning_rate": 8.199163999655174e-08, + "loss": 0.5928, + "step": 25487 + }, + { + "epoch": 1.8413856629400183, + "grad_norm": 7.26181540750626, + "learning_rate": 8.191735996650629e-08, + "loss": 0.5937, + "step": 25488 + }, + { + "epoch": 1.841457908140228, + "grad_norm": 6.372292336290029, + "learning_rate": 8.184311303813447e-08, + "loss": 0.5553, + "step": 25489 + }, + { + "epoch": 1.8415301533404373, + "grad_norm": 7.816604634617003, + "learning_rate": 8.176889921245296e-08, + "loss": 0.5859, + "step": 25490 + }, + { + "epoch": 1.8416023985406471, + "grad_norm": 7.0963960661511525, + "learning_rate": 8.16947184904776e-08, + "loss": 0.5687, + "step": 25491 + }, + { + "epoch": 1.8416746437408564, + "grad_norm": 7.467911549514827, + "learning_rate": 8.1620570873224e-08, + "loss": 0.532, + "step": 25492 + }, + { + "epoch": 1.841746888941066, + "grad_norm": 7.629332265370914, + "learning_rate": 8.154645636170689e-08, + "loss": 0.6988, + "step": 25493 + }, + { + "epoch": 1.8418191341412755, + "grad_norm": 7.411700983067405, + "learning_rate": 8.147237495694127e-08, + "loss": 0.6226, + "step": 25494 + }, + { + "epoch": 1.8418913793414848, + "grad_norm": 6.704382007175943, + "learning_rate": 8.139832665994079e-08, + "loss": 0.608, + "step": 25495 + }, + { + "epoch": 1.8419636245416946, + "grad_norm": 7.3972955968527305, + "learning_rate": 8.132431147171909e-08, + "loss": 0.5189, + "step": 25496 + }, + { + "epoch": 1.842035869741904, + "grad_norm": 7.724601473605314, + "learning_rate": 8.125032939328953e-08, + "loss": 0.6295, + "step": 25497 + }, + { + "epoch": 1.8421081149421137, + "grad_norm": 8.144688154214041, + "learning_rate": 8.117638042566517e-08, + "loss": 0.5299, + "step": 25498 + }, + { + "epoch": 1.842180360142323, + "grad_norm": 7.529178065291536, + "learning_rate": 8.110246456985798e-08, + "loss": 0.601, + "step": 25499 + }, + { + "epoch": 1.8422526053425325, + "grad_norm": 7.377938929614496, + "learning_rate": 8.102858182687995e-08, + "loss": 0.6081, + "step": 25500 + }, + { + "epoch": 1.842324850542742, + "grad_norm": 8.6304503095949, + "learning_rate": 8.095473219774275e-08, + "loss": 0.6274, + "step": 25501 + }, + { + "epoch": 1.8423970957429516, + "grad_norm": 7.997891148132913, + "learning_rate": 8.088091568345641e-08, + "loss": 0.6257, + "step": 25502 + }, + { + "epoch": 1.8424693409431612, + "grad_norm": 7.485601718850778, + "learning_rate": 8.080713228503234e-08, + "loss": 0.6073, + "step": 25503 + }, + { + "epoch": 1.8425415861433705, + "grad_norm": 8.490528888741082, + "learning_rate": 8.073338200348002e-08, + "loss": 0.5996, + "step": 25504 + }, + { + "epoch": 1.8426138313435803, + "grad_norm": 8.93482386931514, + "learning_rate": 8.065966483980975e-08, + "loss": 0.607, + "step": 25505 + }, + { + "epoch": 1.8426860765437896, + "grad_norm": 7.061005560826963, + "learning_rate": 8.058598079502961e-08, + "loss": 0.5515, + "step": 25506 + }, + { + "epoch": 1.8427583217439991, + "grad_norm": 7.34091768229374, + "learning_rate": 8.051232987014906e-08, + "loss": 0.5293, + "step": 25507 + }, + { + "epoch": 1.8428305669442087, + "grad_norm": 6.043178009769667, + "learning_rate": 8.043871206617621e-08, + "loss": 0.6012, + "step": 25508 + }, + { + "epoch": 1.8429028121444182, + "grad_norm": 6.438969120421392, + "learning_rate": 8.036512738411855e-08, + "loss": 0.6002, + "step": 25509 + }, + { + "epoch": 1.8429750573446277, + "grad_norm": 7.413418805203687, + "learning_rate": 8.029157582498364e-08, + "loss": 0.5442, + "step": 25510 + }, + { + "epoch": 1.843047302544837, + "grad_norm": 6.953533717797792, + "learning_rate": 8.021805738977816e-08, + "loss": 0.5554, + "step": 25511 + }, + { + "epoch": 1.8431195477450468, + "grad_norm": 8.080600027334468, + "learning_rate": 8.014457207950882e-08, + "loss": 0.6267, + "step": 25512 + }, + { + "epoch": 1.8431917929452561, + "grad_norm": 7.554761978934188, + "learning_rate": 8.007111989518146e-08, + "loss": 0.6095, + "step": 25513 + }, + { + "epoch": 1.8432640381454657, + "grad_norm": 6.526082161800321, + "learning_rate": 7.99977008378014e-08, + "loss": 0.5571, + "step": 25514 + }, + { + "epoch": 1.8433362833456752, + "grad_norm": 8.683929368233077, + "learning_rate": 7.992431490837393e-08, + "loss": 0.5579, + "step": 25515 + }, + { + "epoch": 1.8434085285458848, + "grad_norm": 6.558917421007502, + "learning_rate": 7.985096210790327e-08, + "loss": 0.6563, + "step": 25516 + }, + { + "epoch": 1.8434807737460943, + "grad_norm": 6.692730720769545, + "learning_rate": 7.977764243739389e-08, + "loss": 0.6616, + "step": 25517 + }, + { + "epoch": 1.8435530189463036, + "grad_norm": 6.859428110112587, + "learning_rate": 7.970435589784914e-08, + "loss": 0.5034, + "step": 25518 + }, + { + "epoch": 1.8436252641465134, + "grad_norm": 7.390913782264358, + "learning_rate": 7.963110249027295e-08, + "loss": 0.5696, + "step": 25519 + }, + { + "epoch": 1.8436975093467227, + "grad_norm": 7.848910028873909, + "learning_rate": 7.955788221566757e-08, + "loss": 0.6221, + "step": 25520 + }, + { + "epoch": 1.8437697545469323, + "grad_norm": 7.178571379769331, + "learning_rate": 7.948469507503525e-08, + "loss": 0.6322, + "step": 25521 + }, + { + "epoch": 1.8438419997471418, + "grad_norm": 7.417832986514344, + "learning_rate": 7.941154106937826e-08, + "loss": 0.6576, + "step": 25522 + }, + { + "epoch": 1.8439142449473513, + "grad_norm": 7.905673643070046, + "learning_rate": 7.9338420199698e-08, + "loss": 0.6358, + "step": 25523 + }, + { + "epoch": 1.8439864901475609, + "grad_norm": 5.919607407723275, + "learning_rate": 7.926533246699481e-08, + "loss": 0.6087, + "step": 25524 + }, + { + "epoch": 1.8440587353477702, + "grad_norm": 7.066084500690494, + "learning_rate": 7.91922778722698e-08, + "loss": 0.5118, + "step": 25525 + }, + { + "epoch": 1.84413098054798, + "grad_norm": 7.120649649310283, + "learning_rate": 7.911925641652273e-08, + "loss": 0.6069, + "step": 25526 + }, + { + "epoch": 1.8442032257481893, + "grad_norm": 6.067015234746207, + "learning_rate": 7.904626810075367e-08, + "loss": 0.6009, + "step": 25527 + }, + { + "epoch": 1.8442754709483988, + "grad_norm": 6.622424218740369, + "learning_rate": 7.897331292596122e-08, + "loss": 0.5879, + "step": 25528 + }, + { + "epoch": 1.8443477161486084, + "grad_norm": 6.6613056880193, + "learning_rate": 7.890039089314433e-08, + "loss": 0.5679, + "step": 25529 + }, + { + "epoch": 1.844419961348818, + "grad_norm": 7.9265010050067515, + "learning_rate": 7.882750200330135e-08, + "loss": 0.6209, + "step": 25530 + }, + { + "epoch": 1.8444922065490275, + "grad_norm": 7.475265406359015, + "learning_rate": 7.875464625742985e-08, + "loss": 0.5356, + "step": 25531 + }, + { + "epoch": 1.8445644517492368, + "grad_norm": 7.208574265292161, + "learning_rate": 7.868182365652732e-08, + "loss": 0.5749, + "step": 25532 + }, + { + "epoch": 1.8446366969494465, + "grad_norm": 7.026047880275081, + "learning_rate": 7.860903420159077e-08, + "loss": 0.6596, + "step": 25533 + }, + { + "epoch": 1.8447089421496559, + "grad_norm": 8.679862857561416, + "learning_rate": 7.853627789361606e-08, + "loss": 0.6267, + "step": 25534 + }, + { + "epoch": 1.8447811873498654, + "grad_norm": 8.017902846271342, + "learning_rate": 7.846355473359962e-08, + "loss": 0.7103, + "step": 25535 + }, + { + "epoch": 1.844853432550075, + "grad_norm": 7.115549613007137, + "learning_rate": 7.839086472253704e-08, + "loss": 0.5496, + "step": 25536 + }, + { + "epoch": 1.8449256777502845, + "grad_norm": 7.7835371340659005, + "learning_rate": 7.831820786142336e-08, + "loss": 0.64, + "step": 25537 + }, + { + "epoch": 1.844997922950494, + "grad_norm": 6.123241561398528, + "learning_rate": 7.824558415125278e-08, + "loss": 0.6377, + "step": 25538 + }, + { + "epoch": 1.8450701681507033, + "grad_norm": 8.302673957010825, + "learning_rate": 7.817299359302006e-08, + "loss": 0.5793, + "step": 25539 + }, + { + "epoch": 1.8451424133509131, + "grad_norm": 6.501047196763073, + "learning_rate": 7.810043618771884e-08, + "loss": 0.6285, + "step": 25540 + }, + { + "epoch": 1.8452146585511224, + "grad_norm": 8.352231633186099, + "learning_rate": 7.802791193634168e-08, + "loss": 0.5759, + "step": 25541 + }, + { + "epoch": 1.845286903751332, + "grad_norm": 6.595897740595629, + "learning_rate": 7.795542083988223e-08, + "loss": 0.5581, + "step": 25542 + }, + { + "epoch": 1.8453591489515415, + "grad_norm": 7.065873412228436, + "learning_rate": 7.788296289933217e-08, + "loss": 0.5919, + "step": 25543 + }, + { + "epoch": 1.845431394151751, + "grad_norm": 6.922977402778718, + "learning_rate": 7.781053811568407e-08, + "loss": 0.5663, + "step": 25544 + }, + { + "epoch": 1.8455036393519606, + "grad_norm": 6.736681479026117, + "learning_rate": 7.773814648992878e-08, + "loss": 0.5701, + "step": 25545 + }, + { + "epoch": 1.84557588455217, + "grad_norm": 7.827260226607347, + "learning_rate": 7.766578802305719e-08, + "loss": 0.575, + "step": 25546 + }, + { + "epoch": 1.8456481297523797, + "grad_norm": 8.851378476821642, + "learning_rate": 7.759346271606072e-08, + "loss": 0.6155, + "step": 25547 + }, + { + "epoch": 1.845720374952589, + "grad_norm": 6.745649277844757, + "learning_rate": 7.752117056992831e-08, + "loss": 0.5647, + "step": 25548 + }, + { + "epoch": 1.8457926201527985, + "grad_norm": 7.251263738526138, + "learning_rate": 7.744891158565055e-08, + "loss": 0.5695, + "step": 25549 + }, + { + "epoch": 1.845864865353008, + "grad_norm": 6.990641195988726, + "learning_rate": 7.737668576421609e-08, + "loss": 0.5808, + "step": 25550 + }, + { + "epoch": 1.8459371105532176, + "grad_norm": 5.936004450320815, + "learning_rate": 7.730449310661387e-08, + "loss": 0.5834, + "step": 25551 + }, + { + "epoch": 1.8460093557534272, + "grad_norm": 8.079556661094948, + "learning_rate": 7.72323336138317e-08, + "loss": 0.5957, + "step": 25552 + }, + { + "epoch": 1.8460816009536365, + "grad_norm": 8.81560012721921, + "learning_rate": 7.716020728685797e-08, + "loss": 0.5855, + "step": 25553 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 6.975298486154357, + "learning_rate": 7.708811412667965e-08, + "loss": 0.5613, + "step": 25554 + }, + { + "epoch": 1.8462260913540556, + "grad_norm": 6.93203079875393, + "learning_rate": 7.701605413428404e-08, + "loss": 0.5982, + "step": 25555 + }, + { + "epoch": 1.8462983365542653, + "grad_norm": 7.3682766025409725, + "learning_rate": 7.694402731065698e-08, + "loss": 0.5664, + "step": 25556 + }, + { + "epoch": 1.8463705817544747, + "grad_norm": 7.584194120522211, + "learning_rate": 7.687203365678492e-08, + "loss": 0.5431, + "step": 25557 + }, + { + "epoch": 1.8464428269546842, + "grad_norm": 7.584515769353137, + "learning_rate": 7.680007317365373e-08, + "loss": 0.5611, + "step": 25558 + }, + { + "epoch": 1.8465150721548937, + "grad_norm": 7.629084509326099, + "learning_rate": 7.672814586224736e-08, + "loss": 0.5443, + "step": 25559 + }, + { + "epoch": 1.846587317355103, + "grad_norm": 7.11484808217448, + "learning_rate": 7.66562517235514e-08, + "loss": 0.5586, + "step": 25560 + }, + { + "epoch": 1.8466595625553128, + "grad_norm": 7.335692835293698, + "learning_rate": 7.658439075854979e-08, + "loss": 0.5734, + "step": 25561 + }, + { + "epoch": 1.8467318077555221, + "grad_norm": 6.6976062313140154, + "learning_rate": 7.651256296822645e-08, + "loss": 0.5748, + "step": 25562 + }, + { + "epoch": 1.846804052955732, + "grad_norm": 7.514898887659637, + "learning_rate": 7.644076835356423e-08, + "loss": 0.5773, + "step": 25563 + }, + { + "epoch": 1.8468762981559412, + "grad_norm": 6.27311612608889, + "learning_rate": 7.636900691554594e-08, + "loss": 0.635, + "step": 25564 + }, + { + "epoch": 1.8469485433561508, + "grad_norm": 7.641519630651026, + "learning_rate": 7.629727865515412e-08, + "loss": 0.5976, + "step": 25565 + }, + { + "epoch": 1.8470207885563603, + "grad_norm": 8.005286377472798, + "learning_rate": 7.622558357337078e-08, + "loss": 0.5889, + "step": 25566 + }, + { + "epoch": 1.8470930337565696, + "grad_norm": 5.993495912705308, + "learning_rate": 7.615392167117707e-08, + "loss": 0.5805, + "step": 25567 + }, + { + "epoch": 1.8471652789567794, + "grad_norm": 7.102971607570946, + "learning_rate": 7.608229294955443e-08, + "loss": 0.5604, + "step": 25568 + }, + { + "epoch": 1.8472375241569887, + "grad_norm": 7.144604665338356, + "learning_rate": 7.601069740948319e-08, + "loss": 0.6082, + "step": 25569 + }, + { + "epoch": 1.8473097693571985, + "grad_norm": 7.192945192393133, + "learning_rate": 7.593913505194338e-08, + "loss": 0.5664, + "step": 25570 + }, + { + "epoch": 1.8473820145574078, + "grad_norm": 7.48169891361803, + "learning_rate": 7.586760587791454e-08, + "loss": 0.6563, + "step": 25571 + }, + { + "epoch": 1.8474542597576173, + "grad_norm": 7.666533372245497, + "learning_rate": 7.579610988837611e-08, + "loss": 0.6079, + "step": 25572 + }, + { + "epoch": 1.8475265049578269, + "grad_norm": 8.166895908589645, + "learning_rate": 7.572464708430649e-08, + "loss": 0.6222, + "step": 25573 + }, + { + "epoch": 1.8475987501580364, + "grad_norm": 6.88177905080167, + "learning_rate": 7.565321746668408e-08, + "loss": 0.5736, + "step": 25574 + }, + { + "epoch": 1.847670995358246, + "grad_norm": 5.909991013056962, + "learning_rate": 7.558182103648643e-08, + "loss": 0.5647, + "step": 25575 + }, + { + "epoch": 1.8477432405584553, + "grad_norm": 7.669237231109183, + "learning_rate": 7.55104577946919e-08, + "loss": 0.59, + "step": 25576 + }, + { + "epoch": 1.847815485758665, + "grad_norm": 7.464418823348629, + "learning_rate": 7.543912774227641e-08, + "loss": 0.6034, + "step": 25577 + }, + { + "epoch": 1.8478877309588744, + "grad_norm": 5.497411812512552, + "learning_rate": 7.536783088021665e-08, + "loss": 0.4974, + "step": 25578 + }, + { + "epoch": 1.847959976159084, + "grad_norm": 8.096409658931197, + "learning_rate": 7.52965672094888e-08, + "loss": 0.5671, + "step": 25579 + }, + { + "epoch": 1.8480322213592935, + "grad_norm": 9.755064993621783, + "learning_rate": 7.522533673106846e-08, + "loss": 0.6393, + "step": 25580 + }, + { + "epoch": 1.848104466559503, + "grad_norm": 7.708298004344363, + "learning_rate": 7.515413944593014e-08, + "loss": 0.6077, + "step": 25581 + }, + { + "epoch": 1.8481767117597125, + "grad_norm": 7.661064782700661, + "learning_rate": 7.508297535504888e-08, + "loss": 0.6521, + "step": 25582 + }, + { + "epoch": 1.8482489569599219, + "grad_norm": 7.8637724266769355, + "learning_rate": 7.501184445939918e-08, + "loss": 0.5953, + "step": 25583 + }, + { + "epoch": 1.8483212021601316, + "grad_norm": 7.116138234348833, + "learning_rate": 7.494074675995416e-08, + "loss": 0.589, + "step": 25584 + }, + { + "epoch": 1.848393447360341, + "grad_norm": 8.599421956352558, + "learning_rate": 7.486968225768748e-08, + "loss": 0.6021, + "step": 25585 + }, + { + "epoch": 1.8484656925605505, + "grad_norm": 7.974439078667839, + "learning_rate": 7.479865095357198e-08, + "loss": 0.637, + "step": 25586 + }, + { + "epoch": 1.84853793776076, + "grad_norm": 7.4777252026631045, + "learning_rate": 7.472765284857991e-08, + "loss": 0.5624, + "step": 25587 + }, + { + "epoch": 1.8486101829609696, + "grad_norm": 8.400575218487413, + "learning_rate": 7.465668794368303e-08, + "loss": 0.5921, + "step": 25588 + }, + { + "epoch": 1.8486824281611791, + "grad_norm": 7.846234296728601, + "learning_rate": 7.458575623985304e-08, + "loss": 0.6001, + "step": 25589 + }, + { + "epoch": 1.8487546733613884, + "grad_norm": 6.75336683158939, + "learning_rate": 7.451485773806084e-08, + "loss": 0.5504, + "step": 25590 + }, + { + "epoch": 1.8488269185615982, + "grad_norm": 8.234010402418065, + "learning_rate": 7.444399243927703e-08, + "loss": 0.556, + "step": 25591 + }, + { + "epoch": 1.8488991637618075, + "grad_norm": 6.959857552923222, + "learning_rate": 7.43731603444714e-08, + "loss": 0.6144, + "step": 25592 + }, + { + "epoch": 1.848971408962017, + "grad_norm": 8.23976980610014, + "learning_rate": 7.430236145461373e-08, + "loss": 0.5549, + "step": 25593 + }, + { + "epoch": 1.8490436541622266, + "grad_norm": 7.331918088581324, + "learning_rate": 7.423159577067351e-08, + "loss": 0.5737, + "step": 25594 + }, + { + "epoch": 1.8491158993624361, + "grad_norm": 7.55140010154755, + "learning_rate": 7.416086329361916e-08, + "loss": 0.5425, + "step": 25595 + }, + { + "epoch": 1.8491881445626457, + "grad_norm": 7.893823077542403, + "learning_rate": 7.409016402441932e-08, + "loss": 0.5856, + "step": 25596 + }, + { + "epoch": 1.849260389762855, + "grad_norm": 6.195248770656613, + "learning_rate": 7.401949796404156e-08, + "loss": 0.555, + "step": 25597 + }, + { + "epoch": 1.8493326349630648, + "grad_norm": 7.204567986057364, + "learning_rate": 7.39488651134529e-08, + "loss": 0.5727, + "step": 25598 + }, + { + "epoch": 1.849404880163274, + "grad_norm": 6.664087400579945, + "learning_rate": 7.387826547362059e-08, + "loss": 0.5993, + "step": 25599 + }, + { + "epoch": 1.8494771253634836, + "grad_norm": 7.766883180189901, + "learning_rate": 7.380769904551111e-08, + "loss": 0.5818, + "step": 25600 + }, + { + "epoch": 1.8495493705636932, + "grad_norm": 8.331603620941431, + "learning_rate": 7.373716583009088e-08, + "loss": 0.5641, + "step": 25601 + }, + { + "epoch": 1.8496216157639027, + "grad_norm": 8.165254847140412, + "learning_rate": 7.366666582832443e-08, + "loss": 0.6395, + "step": 25602 + }, + { + "epoch": 1.8496938609641123, + "grad_norm": 8.491804772309337, + "learning_rate": 7.359619904117709e-08, + "loss": 0.5921, + "step": 25603 + }, + { + "epoch": 1.8497661061643216, + "grad_norm": 6.608217972289285, + "learning_rate": 7.35257654696142e-08, + "loss": 0.6059, + "step": 25604 + }, + { + "epoch": 1.8498383513645313, + "grad_norm": 7.37196209942187, + "learning_rate": 7.34553651146e-08, + "loss": 0.6119, + "step": 25605 + }, + { + "epoch": 1.8499105965647407, + "grad_norm": 8.699502900051693, + "learning_rate": 7.33849979770973e-08, + "loss": 0.6536, + "step": 25606 + }, + { + "epoch": 1.8499828417649502, + "grad_norm": 6.953307691595699, + "learning_rate": 7.33146640580698e-08, + "loss": 0.5946, + "step": 25607 + }, + { + "epoch": 1.8500550869651597, + "grad_norm": 8.716309127136594, + "learning_rate": 7.324436335848062e-08, + "loss": 0.5842, + "step": 25608 + }, + { + "epoch": 1.8501273321653693, + "grad_norm": 8.25145685452987, + "learning_rate": 7.317409587929175e-08, + "loss": 0.6047, + "step": 25609 + }, + { + "epoch": 1.8501995773655788, + "grad_norm": 8.234100742586437, + "learning_rate": 7.310386162146494e-08, + "loss": 0.6289, + "step": 25610 + }, + { + "epoch": 1.8502718225657881, + "grad_norm": 6.262580112698659, + "learning_rate": 7.303366058596217e-08, + "loss": 0.6819, + "step": 25611 + }, + { + "epoch": 1.850344067765998, + "grad_norm": 8.490487554095466, + "learning_rate": 7.29634927737441e-08, + "loss": 0.5778, + "step": 25612 + }, + { + "epoch": 1.8504163129662072, + "grad_norm": 7.521257411451652, + "learning_rate": 7.289335818577103e-08, + "loss": 0.6185, + "step": 25613 + }, + { + "epoch": 1.8504885581664168, + "grad_norm": 7.308707533521018, + "learning_rate": 7.282325682300362e-08, + "loss": 0.6062, + "step": 25614 + }, + { + "epoch": 1.8505608033666263, + "grad_norm": 8.329238902992579, + "learning_rate": 7.275318868640163e-08, + "loss": 0.6347, + "step": 25615 + }, + { + "epoch": 1.8506330485668359, + "grad_norm": 6.812257814914614, + "learning_rate": 7.268315377692348e-08, + "loss": 0.5297, + "step": 25616 + }, + { + "epoch": 1.8507052937670454, + "grad_norm": 6.253499386554437, + "learning_rate": 7.261315209552811e-08, + "loss": 0.6257, + "step": 25617 + }, + { + "epoch": 1.8507775389672547, + "grad_norm": 7.74740212490991, + "learning_rate": 7.25431836431742e-08, + "loss": 0.562, + "step": 25618 + }, + { + "epoch": 1.8508497841674645, + "grad_norm": 8.189960605028212, + "learning_rate": 7.247324842081933e-08, + "loss": 0.6263, + "step": 25619 + }, + { + "epoch": 1.8509220293676738, + "grad_norm": 7.179743022517224, + "learning_rate": 7.240334642942076e-08, + "loss": 0.6007, + "step": 25620 + }, + { + "epoch": 1.8509942745678833, + "grad_norm": 8.211024841947449, + "learning_rate": 7.233347766993554e-08, + "loss": 0.6502, + "step": 25621 + }, + { + "epoch": 1.8510665197680929, + "grad_norm": 7.904007544664497, + "learning_rate": 7.226364214331982e-08, + "loss": 0.674, + "step": 25622 + }, + { + "epoch": 1.8511387649683024, + "grad_norm": 6.915028787968112, + "learning_rate": 7.21938398505298e-08, + "loss": 0.524, + "step": 25623 + }, + { + "epoch": 1.851211010168512, + "grad_norm": 6.831876863160567, + "learning_rate": 7.212407079252109e-08, + "loss": 0.5803, + "step": 25624 + }, + { + "epoch": 1.8512832553687213, + "grad_norm": 7.4571645094333245, + "learning_rate": 7.205433497024878e-08, + "loss": 0.594, + "step": 25625 + }, + { + "epoch": 1.851355500568931, + "grad_norm": 8.987419660737064, + "learning_rate": 7.198463238466791e-08, + "loss": 0.5765, + "step": 25626 + }, + { + "epoch": 1.8514277457691404, + "grad_norm": 6.858838592618987, + "learning_rate": 7.191496303673163e-08, + "loss": 0.6177, + "step": 25627 + }, + { + "epoch": 1.8514999909693501, + "grad_norm": 6.030315425587626, + "learning_rate": 7.184532692739443e-08, + "loss": 0.5508, + "step": 25628 + }, + { + "epoch": 1.8515722361695595, + "grad_norm": 7.317109831276211, + "learning_rate": 7.177572405760918e-08, + "loss": 0.5783, + "step": 25629 + }, + { + "epoch": 1.851644481369769, + "grad_norm": 6.572084530380212, + "learning_rate": 7.170615442832928e-08, + "loss": 0.497, + "step": 25630 + }, + { + "epoch": 1.8517167265699785, + "grad_norm": 6.781377975732945, + "learning_rate": 7.163661804050592e-08, + "loss": 0.5935, + "step": 25631 + }, + { + "epoch": 1.8517889717701879, + "grad_norm": 7.85220161228124, + "learning_rate": 7.156711489509222e-08, + "loss": 0.6532, + "step": 25632 + }, + { + "epoch": 1.8518612169703976, + "grad_norm": 7.98470083269845, + "learning_rate": 7.149764499303935e-08, + "loss": 0.5756, + "step": 25633 + }, + { + "epoch": 1.851933462170607, + "grad_norm": 6.944515770439914, + "learning_rate": 7.142820833529795e-08, + "loss": 0.5602, + "step": 25634 + }, + { + "epoch": 1.8520057073708167, + "grad_norm": 7.320128545414125, + "learning_rate": 7.135880492281865e-08, + "loss": 0.5779, + "step": 25635 + }, + { + "epoch": 1.852077952571026, + "grad_norm": 7.387412020035146, + "learning_rate": 7.128943475655153e-08, + "loss": 0.5903, + "step": 25636 + }, + { + "epoch": 1.8521501977712356, + "grad_norm": 7.94751450174022, + "learning_rate": 7.122009783744666e-08, + "loss": 0.583, + "step": 25637 + }, + { + "epoch": 1.8522224429714451, + "grad_norm": 6.019381373357226, + "learning_rate": 7.115079416645271e-08, + "loss": 0.5604, + "step": 25638 + }, + { + "epoch": 1.8522946881716544, + "grad_norm": 6.794709380210712, + "learning_rate": 7.10815237445181e-08, + "loss": 0.6254, + "step": 25639 + }, + { + "epoch": 1.8523669333718642, + "grad_norm": 7.3164886290744, + "learning_rate": 7.101228657259179e-08, + "loss": 0.6225, + "step": 25640 + }, + { + "epoch": 1.8524391785720735, + "grad_norm": 7.554558990553191, + "learning_rate": 7.094308265162109e-08, + "loss": 0.5966, + "step": 25641 + }, + { + "epoch": 1.8525114237722833, + "grad_norm": 7.420801731608503, + "learning_rate": 7.087391198255384e-08, + "loss": 0.6102, + "step": 25642 + }, + { + "epoch": 1.8525836689724926, + "grad_norm": 8.158800917393926, + "learning_rate": 7.08047745663365e-08, + "loss": 0.6044, + "step": 25643 + }, + { + "epoch": 1.8526559141727021, + "grad_norm": 6.817006991346595, + "learning_rate": 7.073567040391583e-08, + "loss": 0.5549, + "step": 25644 + }, + { + "epoch": 1.8527281593729117, + "grad_norm": 6.97014627880572, + "learning_rate": 7.066659949623744e-08, + "loss": 0.5695, + "step": 25645 + }, + { + "epoch": 1.852800404573121, + "grad_norm": 7.3089387487051924, + "learning_rate": 7.059756184424699e-08, + "loss": 0.6451, + "step": 25646 + }, + { + "epoch": 1.8528726497733308, + "grad_norm": 8.472265886149081, + "learning_rate": 7.052855744888981e-08, + "loss": 0.5861, + "step": 25647 + }, + { + "epoch": 1.85294489497354, + "grad_norm": 6.769580556928771, + "learning_rate": 7.045958631111016e-08, + "loss": 0.6213, + "step": 25648 + }, + { + "epoch": 1.8530171401737499, + "grad_norm": 8.071162338785129, + "learning_rate": 7.039064843185228e-08, + "loss": 0.613, + "step": 25649 + }, + { + "epoch": 1.8530893853739592, + "grad_norm": 6.947735632997659, + "learning_rate": 7.032174381206014e-08, + "loss": 0.6043, + "step": 25650 + }, + { + "epoch": 1.8531616305741687, + "grad_norm": 7.692045775135969, + "learning_rate": 7.025287245267659e-08, + "loss": 0.6137, + "step": 25651 + }, + { + "epoch": 1.8532338757743783, + "grad_norm": 7.420362458951503, + "learning_rate": 7.018403435464477e-08, + "loss": 0.5825, + "step": 25652 + }, + { + "epoch": 1.8533061209745878, + "grad_norm": 8.63909114861184, + "learning_rate": 7.01152295189067e-08, + "loss": 0.6178, + "step": 25653 + }, + { + "epoch": 1.8533783661747973, + "grad_norm": 7.861064873309647, + "learning_rate": 7.004645794640497e-08, + "loss": 0.6321, + "step": 25654 + }, + { + "epoch": 1.8534506113750067, + "grad_norm": 7.368944947541173, + "learning_rate": 6.99777196380802e-08, + "loss": 0.6126, + "step": 25655 + }, + { + "epoch": 1.8535228565752164, + "grad_norm": 7.514942288865595, + "learning_rate": 6.990901459487359e-08, + "loss": 0.5754, + "step": 25656 + }, + { + "epoch": 1.8535951017754257, + "grad_norm": 6.912551973502763, + "learning_rate": 6.98403428177255e-08, + "loss": 0.5752, + "step": 25657 + }, + { + "epoch": 1.8536673469756353, + "grad_norm": 7.105162997218362, + "learning_rate": 6.977170430757685e-08, + "loss": 0.5892, + "step": 25658 + }, + { + "epoch": 1.8537395921758448, + "grad_norm": 7.476273454362427, + "learning_rate": 6.970309906536576e-08, + "loss": 0.5679, + "step": 25659 + }, + { + "epoch": 1.8538118373760544, + "grad_norm": 6.773051989011276, + "learning_rate": 6.963452709203289e-08, + "loss": 0.5492, + "step": 25660 + }, + { + "epoch": 1.853884082576264, + "grad_norm": 7.218583274336135, + "learning_rate": 6.956598838851608e-08, + "loss": 0.5911, + "step": 25661 + }, + { + "epoch": 1.8539563277764732, + "grad_norm": 6.641423435455945, + "learning_rate": 6.949748295575404e-08, + "loss": 0.6976, + "step": 25662 + }, + { + "epoch": 1.854028572976683, + "grad_norm": 7.237129362693023, + "learning_rate": 6.942901079468406e-08, + "loss": 0.6258, + "step": 25663 + }, + { + "epoch": 1.8541008181768923, + "grad_norm": 8.05802473906059, + "learning_rate": 6.936057190624346e-08, + "loss": 0.5831, + "step": 25664 + }, + { + "epoch": 1.8541730633771019, + "grad_norm": 8.034933588765927, + "learning_rate": 6.929216629136981e-08, + "loss": 0.6405, + "step": 25665 + }, + { + "epoch": 1.8542453085773114, + "grad_norm": 7.380748771167848, + "learning_rate": 6.922379395099904e-08, + "loss": 0.5652, + "step": 25666 + }, + { + "epoch": 1.854317553777521, + "grad_norm": 7.617299428264192, + "learning_rate": 6.915545488606679e-08, + "loss": 0.5509, + "step": 25667 + }, + { + "epoch": 1.8543897989777305, + "grad_norm": 6.989686725456266, + "learning_rate": 6.908714909750924e-08, + "loss": 0.5547, + "step": 25668 + }, + { + "epoch": 1.8544620441779398, + "grad_norm": 7.114464985508699, + "learning_rate": 6.901887658626094e-08, + "loss": 0.6401, + "step": 25669 + }, + { + "epoch": 1.8545342893781496, + "grad_norm": 8.070126260859302, + "learning_rate": 6.895063735325669e-08, + "loss": 0.6115, + "step": 25670 + }, + { + "epoch": 1.854606534578359, + "grad_norm": 7.799534896286359, + "learning_rate": 6.888243139943074e-08, + "loss": 0.6176, + "step": 25671 + }, + { + "epoch": 1.8546787797785684, + "grad_norm": 6.861220821819648, + "learning_rate": 6.881425872571679e-08, + "loss": 0.5889, + "step": 25672 + }, + { + "epoch": 1.854751024978778, + "grad_norm": 7.6835401307229265, + "learning_rate": 6.874611933304798e-08, + "loss": 0.5517, + "step": 25673 + }, + { + "epoch": 1.8548232701789875, + "grad_norm": 7.792829239822669, + "learning_rate": 6.867801322235663e-08, + "loss": 0.5728, + "step": 25674 + }, + { + "epoch": 1.854895515379197, + "grad_norm": 6.764421386785277, + "learning_rate": 6.860994039457586e-08, + "loss": 0.579, + "step": 25675 + }, + { + "epoch": 1.8549677605794064, + "grad_norm": 7.187743075033728, + "learning_rate": 6.854190085063717e-08, + "loss": 0.5842, + "step": 25676 + }, + { + "epoch": 1.8550400057796161, + "grad_norm": 7.032175503325433, + "learning_rate": 6.847389459147175e-08, + "loss": 0.5881, + "step": 25677 + }, + { + "epoch": 1.8551122509798255, + "grad_norm": 6.4658677318766, + "learning_rate": 6.840592161801079e-08, + "loss": 0.6121, + "step": 25678 + }, + { + "epoch": 1.855184496180035, + "grad_norm": 7.299596111673213, + "learning_rate": 6.833798193118469e-08, + "loss": 0.62, + "step": 25679 + }, + { + "epoch": 1.8552567413802445, + "grad_norm": 7.183956168213072, + "learning_rate": 6.827007553192349e-08, + "loss": 0.5595, + "step": 25680 + }, + { + "epoch": 1.855328986580454, + "grad_norm": 6.3570687425925225, + "learning_rate": 6.820220242115705e-08, + "loss": 0.5299, + "step": 25681 + }, + { + "epoch": 1.8554012317806636, + "grad_norm": 8.102466499708978, + "learning_rate": 6.813436259981432e-08, + "loss": 0.5617, + "step": 25682 + }, + { + "epoch": 1.855473476980873, + "grad_norm": 8.975636460202844, + "learning_rate": 6.806655606882401e-08, + "loss": 0.6443, + "step": 25683 + }, + { + "epoch": 1.8555457221810827, + "grad_norm": 7.415924578055801, + "learning_rate": 6.799878282911398e-08, + "loss": 0.6236, + "step": 25684 + }, + { + "epoch": 1.855617967381292, + "grad_norm": 6.781045515080097, + "learning_rate": 6.79310428816124e-08, + "loss": 0.5772, + "step": 25685 + }, + { + "epoch": 1.8556902125815016, + "grad_norm": 6.518447228163884, + "learning_rate": 6.786333622724656e-08, + "loss": 0.5779, + "step": 25686 + }, + { + "epoch": 1.8557624577817111, + "grad_norm": 7.549880919875284, + "learning_rate": 6.779566286694322e-08, + "loss": 0.5915, + "step": 25687 + }, + { + "epoch": 1.8558347029819207, + "grad_norm": 7.585114769365927, + "learning_rate": 6.772802280162832e-08, + "loss": 0.6752, + "step": 25688 + }, + { + "epoch": 1.8559069481821302, + "grad_norm": 7.3811113284761785, + "learning_rate": 6.766041603222861e-08, + "loss": 0.6034, + "step": 25689 + }, + { + "epoch": 1.8559791933823395, + "grad_norm": 7.997026129155787, + "learning_rate": 6.759284255966947e-08, + "loss": 0.5699, + "step": 25690 + }, + { + "epoch": 1.8560514385825493, + "grad_norm": 8.486162029085158, + "learning_rate": 6.752530238487543e-08, + "loss": 0.6176, + "step": 25691 + }, + { + "epoch": 1.8561236837827586, + "grad_norm": 6.657745050152644, + "learning_rate": 6.74577955087713e-08, + "loss": 0.5861, + "step": 25692 + }, + { + "epoch": 1.8561959289829681, + "grad_norm": 8.266878375552157, + "learning_rate": 6.739032193228107e-08, + "loss": 0.6391, + "step": 25693 + }, + { + "epoch": 1.8562681741831777, + "grad_norm": 8.538441515903695, + "learning_rate": 6.7322881656329e-08, + "loss": 0.5697, + "step": 25694 + }, + { + "epoch": 1.8563404193833872, + "grad_norm": 7.798826168450794, + "learning_rate": 6.725547468183741e-08, + "loss": 0.6065, + "step": 25695 + }, + { + "epoch": 1.8564126645835968, + "grad_norm": 6.269602134042435, + "learning_rate": 6.718810100972945e-08, + "loss": 0.5484, + "step": 25696 + }, + { + "epoch": 1.856484909783806, + "grad_norm": 7.364947952532038, + "learning_rate": 6.712076064092742e-08, + "loss": 0.5649, + "step": 25697 + }, + { + "epoch": 1.8565571549840159, + "grad_norm": 7.325746455256897, + "learning_rate": 6.70534535763534e-08, + "loss": 0.6071, + "step": 25698 + }, + { + "epoch": 1.8566294001842252, + "grad_norm": 8.752544360120599, + "learning_rate": 6.698617981692829e-08, + "loss": 0.5687, + "step": 25699 + }, + { + "epoch": 1.8567016453844347, + "grad_norm": 7.00595221180126, + "learning_rate": 6.69189393635733e-08, + "loss": 0.6052, + "step": 25700 + }, + { + "epoch": 1.8567738905846443, + "grad_norm": 6.93278411866303, + "learning_rate": 6.685173221720908e-08, + "loss": 0.5492, + "step": 25701 + }, + { + "epoch": 1.8568461357848538, + "grad_norm": 6.732979118673992, + "learning_rate": 6.678455837875546e-08, + "loss": 0.6033, + "step": 25702 + }, + { + "epoch": 1.8569183809850633, + "grad_norm": 8.098451405122637, + "learning_rate": 6.671741784913171e-08, + "loss": 0.5921, + "step": 25703 + }, + { + "epoch": 1.8569906261852727, + "grad_norm": 8.140462394159355, + "learning_rate": 6.665031062925737e-08, + "loss": 0.6309, + "step": 25704 + }, + { + "epoch": 1.8570628713854824, + "grad_norm": 7.957147266888626, + "learning_rate": 6.658323672005085e-08, + "loss": 0.651, + "step": 25705 + }, + { + "epoch": 1.8571351165856917, + "grad_norm": 8.687593528189216, + "learning_rate": 6.651619612243032e-08, + "loss": 0.6297, + "step": 25706 + }, + { + "epoch": 1.8572073617859015, + "grad_norm": 7.676876760642684, + "learning_rate": 6.644918883731366e-08, + "loss": 0.6573, + "step": 25707 + }, + { + "epoch": 1.8572796069861108, + "grad_norm": 8.883378063707179, + "learning_rate": 6.638221486561791e-08, + "loss": 0.6107, + "step": 25708 + }, + { + "epoch": 1.8573518521863204, + "grad_norm": 7.097034114261685, + "learning_rate": 6.631527420826012e-08, + "loss": 0.5372, + "step": 25709 + }, + { + "epoch": 1.85742409738653, + "grad_norm": 6.4331107870293485, + "learning_rate": 6.624836686615677e-08, + "loss": 0.5627, + "step": 25710 + }, + { + "epoch": 1.8574963425867392, + "grad_norm": 6.950588492307993, + "learning_rate": 6.618149284022352e-08, + "loss": 0.6015, + "step": 25711 + }, + { + "epoch": 1.857568587786949, + "grad_norm": 8.290950980681439, + "learning_rate": 6.611465213137602e-08, + "loss": 0.6313, + "step": 25712 + }, + { + "epoch": 1.8576408329871583, + "grad_norm": 7.082369779984624, + "learning_rate": 6.604784474052884e-08, + "loss": 0.629, + "step": 25713 + }, + { + "epoch": 1.857713078187368, + "grad_norm": 7.056044337178759, + "learning_rate": 6.598107066859704e-08, + "loss": 0.6453, + "step": 25714 + }, + { + "epoch": 1.8577853233875774, + "grad_norm": 6.409804535026485, + "learning_rate": 6.591432991649466e-08, + "loss": 0.5772, + "step": 25715 + }, + { + "epoch": 1.857857568587787, + "grad_norm": 6.782539772397959, + "learning_rate": 6.584762248513455e-08, + "loss": 0.5442, + "step": 25716 + }, + { + "epoch": 1.8579298137879965, + "grad_norm": 8.336029938861877, + "learning_rate": 6.57809483754307e-08, + "loss": 0.6195, + "step": 25717 + }, + { + "epoch": 1.8580020589882058, + "grad_norm": 7.56083774425953, + "learning_rate": 6.571430758829573e-08, + "loss": 0.5822, + "step": 25718 + }, + { + "epoch": 1.8580743041884156, + "grad_norm": 7.742407987132232, + "learning_rate": 6.564770012464223e-08, + "loss": 0.585, + "step": 25719 + }, + { + "epoch": 1.858146549388625, + "grad_norm": 7.561384891234454, + "learning_rate": 6.558112598538113e-08, + "loss": 0.6541, + "step": 25720 + }, + { + "epoch": 1.8582187945888347, + "grad_norm": 7.498982932747265, + "learning_rate": 6.551458517142423e-08, + "loss": 0.6892, + "step": 25721 + }, + { + "epoch": 1.858291039789044, + "grad_norm": 6.276592292743426, + "learning_rate": 6.544807768368244e-08, + "loss": 0.6366, + "step": 25722 + }, + { + "epoch": 1.8583632849892535, + "grad_norm": 8.072227583158002, + "learning_rate": 6.538160352306616e-08, + "loss": 0.5712, + "step": 25723 + }, + { + "epoch": 1.858435530189463, + "grad_norm": 7.552134069513481, + "learning_rate": 6.531516269048549e-08, + "loss": 0.706, + "step": 25724 + }, + { + "epoch": 1.8585077753896726, + "grad_norm": 8.792791743553796, + "learning_rate": 6.524875518684942e-08, + "loss": 0.6681, + "step": 25725 + }, + { + "epoch": 1.8585800205898821, + "grad_norm": 7.307127454294251, + "learning_rate": 6.518238101306779e-08, + "loss": 0.5158, + "step": 25726 + }, + { + "epoch": 1.8586522657900915, + "grad_norm": 7.3020074670816655, + "learning_rate": 6.511604017004874e-08, + "loss": 0.557, + "step": 25727 + }, + { + "epoch": 1.8587245109903012, + "grad_norm": 6.595731464626709, + "learning_rate": 6.504973265870046e-08, + "loss": 0.6053, + "step": 25728 + }, + { + "epoch": 1.8587967561905105, + "grad_norm": 6.28899395828333, + "learning_rate": 6.498345847993109e-08, + "loss": 0.5785, + "step": 25729 + }, + { + "epoch": 1.85886900139072, + "grad_norm": 7.339151963973288, + "learning_rate": 6.491721763464714e-08, + "loss": 0.5861, + "step": 25730 + }, + { + "epoch": 1.8589412465909296, + "grad_norm": 8.134403831615492, + "learning_rate": 6.485101012375595e-08, + "loss": 0.6091, + "step": 25731 + }, + { + "epoch": 1.8590134917911392, + "grad_norm": 7.598205013119001, + "learning_rate": 6.478483594816343e-08, + "loss": 0.5926, + "step": 25732 + }, + { + "epoch": 1.8590857369913487, + "grad_norm": 7.3207137446730774, + "learning_rate": 6.47186951087761e-08, + "loss": 0.6526, + "step": 25733 + }, + { + "epoch": 1.859157982191558, + "grad_norm": 6.54799218307709, + "learning_rate": 6.46525876064985e-08, + "loss": 0.5511, + "step": 25734 + }, + { + "epoch": 1.8592302273917678, + "grad_norm": 7.082993742338328, + "learning_rate": 6.458651344223633e-08, + "loss": 0.5898, + "step": 25735 + }, + { + "epoch": 1.8593024725919771, + "grad_norm": 6.821638225338, + "learning_rate": 6.452047261689326e-08, + "loss": 0.6255, + "step": 25736 + }, + { + "epoch": 1.8593747177921867, + "grad_norm": 7.951489953107231, + "learning_rate": 6.445446513137471e-08, + "loss": 0.6809, + "step": 25737 + }, + { + "epoch": 1.8594469629923962, + "grad_norm": 10.025863009095449, + "learning_rate": 6.438849098658301e-08, + "loss": 0.575, + "step": 25738 + }, + { + "epoch": 1.8595192081926057, + "grad_norm": 8.151566487114465, + "learning_rate": 6.432255018342159e-08, + "loss": 0.5452, + "step": 25739 + }, + { + "epoch": 1.8595914533928153, + "grad_norm": 6.4442207290287135, + "learning_rate": 6.425664272279392e-08, + "loss": 0.5809, + "step": 25740 + }, + { + "epoch": 1.8596636985930246, + "grad_norm": 7.342491788823827, + "learning_rate": 6.419076860560091e-08, + "loss": 0.6431, + "step": 25741 + }, + { + "epoch": 1.8597359437932344, + "grad_norm": 7.258623979123901, + "learning_rate": 6.412492783274521e-08, + "loss": 0.6489, + "step": 25742 + }, + { + "epoch": 1.8598081889934437, + "grad_norm": 8.575138760436602, + "learning_rate": 6.405912040512801e-08, + "loss": 0.6619, + "step": 25743 + }, + { + "epoch": 1.8598804341936532, + "grad_norm": 7.104882197735665, + "learning_rate": 6.399334632365001e-08, + "loss": 0.6212, + "step": 25744 + }, + { + "epoch": 1.8599526793938628, + "grad_norm": 7.020663145211794, + "learning_rate": 6.392760558921157e-08, + "loss": 0.5653, + "step": 25745 + }, + { + "epoch": 1.8600249245940723, + "grad_norm": 9.973529877519464, + "learning_rate": 6.386189820271282e-08, + "loss": 0.5408, + "step": 25746 + }, + { + "epoch": 1.8600971697942819, + "grad_norm": 8.182968166721542, + "learning_rate": 6.379622416505332e-08, + "loss": 0.601, + "step": 25747 + }, + { + "epoch": 1.8601694149944912, + "grad_norm": 7.218024675664794, + "learning_rate": 6.373058347713179e-08, + "loss": 0.6519, + "step": 25748 + }, + { + "epoch": 1.860241660194701, + "grad_norm": 9.496279339249794, + "learning_rate": 6.366497613984668e-08, + "loss": 0.7144, + "step": 25749 + }, + { + "epoch": 1.8603139053949103, + "grad_norm": 6.9135540619022935, + "learning_rate": 6.359940215409643e-08, + "loss": 0.5788, + "step": 25750 + }, + { + "epoch": 1.8603861505951198, + "grad_norm": 7.653394516934034, + "learning_rate": 6.353386152077895e-08, + "loss": 0.6189, + "step": 25751 + }, + { + "epoch": 1.8604583957953293, + "grad_norm": 7.56822101688049, + "learning_rate": 6.3468354240791e-08, + "loss": 0.5779, + "step": 25752 + }, + { + "epoch": 1.8605306409955389, + "grad_norm": 7.357290438416499, + "learning_rate": 6.34028803150294e-08, + "loss": 0.5468, + "step": 25753 + }, + { + "epoch": 1.8606028861957484, + "grad_norm": 6.963635676302673, + "learning_rate": 6.333743974439033e-08, + "loss": 0.5736, + "step": 25754 + }, + { + "epoch": 1.8606751313959577, + "grad_norm": 8.087568713785084, + "learning_rate": 6.327203252976976e-08, + "loss": 0.5833, + "step": 25755 + }, + { + "epoch": 1.8607473765961675, + "grad_norm": 7.066950923072913, + "learning_rate": 6.32066586720631e-08, + "loss": 0.6764, + "step": 25756 + }, + { + "epoch": 1.8608196217963768, + "grad_norm": 7.084529311994536, + "learning_rate": 6.314131817216518e-08, + "loss": 0.6427, + "step": 25757 + }, + { + "epoch": 1.8608918669965864, + "grad_norm": 6.896421317150119, + "learning_rate": 6.307601103097084e-08, + "loss": 0.5483, + "step": 25758 + }, + { + "epoch": 1.860964112196796, + "grad_norm": 9.102520570551032, + "learning_rate": 6.301073724937351e-08, + "loss": 0.6029, + "step": 25759 + }, + { + "epoch": 1.8610363573970055, + "grad_norm": 6.939964887546684, + "learning_rate": 6.294549682826694e-08, + "loss": 0.5693, + "step": 25760 + }, + { + "epoch": 1.861108602597215, + "grad_norm": 6.790229723363188, + "learning_rate": 6.288028976854433e-08, + "loss": 0.5543, + "step": 25761 + }, + { + "epoch": 1.8611808477974243, + "grad_norm": 7.619618752621599, + "learning_rate": 6.281511607109852e-08, + "loss": 0.6482, + "step": 25762 + }, + { + "epoch": 1.861253092997634, + "grad_norm": 8.051584350864246, + "learning_rate": 6.274997573682106e-08, + "loss": 0.5943, + "step": 25763 + }, + { + "epoch": 1.8613253381978434, + "grad_norm": 7.3775601873266305, + "learning_rate": 6.268486876660429e-08, + "loss": 0.6064, + "step": 25764 + }, + { + "epoch": 1.861397583398053, + "grad_norm": 8.031080904713068, + "learning_rate": 6.261979516133886e-08, + "loss": 0.6285, + "step": 25765 + }, + { + "epoch": 1.8614698285982625, + "grad_norm": 6.976924737323974, + "learning_rate": 6.255475492191603e-08, + "loss": 0.5495, + "step": 25766 + }, + { + "epoch": 1.861542073798472, + "grad_norm": 7.4953339367071505, + "learning_rate": 6.248974804922619e-08, + "loss": 0.6099, + "step": 25767 + }, + { + "epoch": 1.8616143189986816, + "grad_norm": 6.665772982142648, + "learning_rate": 6.242477454415918e-08, + "loss": 0.5869, + "step": 25768 + }, + { + "epoch": 1.861686564198891, + "grad_norm": 6.437710360451035, + "learning_rate": 6.235983440760429e-08, + "loss": 0.6027, + "step": 25769 + }, + { + "epoch": 1.8617588093991007, + "grad_norm": 8.212122576676231, + "learning_rate": 6.229492764045053e-08, + "loss": 0.5995, + "step": 25770 + }, + { + "epoch": 1.86183105459931, + "grad_norm": 7.592474936355054, + "learning_rate": 6.223005424358635e-08, + "loss": 0.5644, + "step": 25771 + }, + { + "epoch": 1.8619032997995195, + "grad_norm": 7.324766125379288, + "learning_rate": 6.216521421790023e-08, + "loss": 0.5388, + "step": 25772 + }, + { + "epoch": 1.861975544999729, + "grad_norm": 6.363712565473617, + "learning_rate": 6.210040756427892e-08, + "loss": 0.5893, + "step": 25773 + }, + { + "epoch": 1.8620477901999386, + "grad_norm": 8.91331913462126, + "learning_rate": 6.203563428361037e-08, + "loss": 0.5894, + "step": 25774 + }, + { + "epoch": 1.8621200354001481, + "grad_norm": 7.4919270618348595, + "learning_rate": 6.197089437678105e-08, + "loss": 0.5548, + "step": 25775 + }, + { + "epoch": 1.8621922806003575, + "grad_norm": 6.475528721965791, + "learning_rate": 6.19061878446775e-08, + "loss": 0.6508, + "step": 25776 + }, + { + "epoch": 1.8622645258005672, + "grad_norm": 6.269026520279117, + "learning_rate": 6.184151468818484e-08, + "loss": 0.592, + "step": 25777 + }, + { + "epoch": 1.8623367710007765, + "grad_norm": 8.191377842553269, + "learning_rate": 6.177687490818873e-08, + "loss": 0.6437, + "step": 25778 + }, + { + "epoch": 1.8624090162009863, + "grad_norm": 7.6005570157366025, + "learning_rate": 6.171226850557432e-08, + "loss": 0.5798, + "step": 25779 + }, + { + "epoch": 1.8624812614011956, + "grad_norm": 21.767745572623113, + "learning_rate": 6.164769548122562e-08, + "loss": 0.6306, + "step": 25780 + }, + { + "epoch": 1.8625535066014052, + "grad_norm": 8.001971002008954, + "learning_rate": 6.158315583602664e-08, + "loss": 0.6616, + "step": 25781 + }, + { + "epoch": 1.8626257518016147, + "grad_norm": 7.100958434412629, + "learning_rate": 6.151864957086084e-08, + "loss": 0.5913, + "step": 25782 + }, + { + "epoch": 1.862697997001824, + "grad_norm": 8.570577333960784, + "learning_rate": 6.145417668661141e-08, + "loss": 0.6013, + "step": 25783 + }, + { + "epoch": 1.8627702422020338, + "grad_norm": 6.982514203935019, + "learning_rate": 6.138973718416097e-08, + "loss": 0.595, + "step": 25784 + }, + { + "epoch": 1.8628424874022431, + "grad_norm": 7.635564520750061, + "learning_rate": 6.132533106439159e-08, + "loss": 0.5644, + "step": 25785 + }, + { + "epoch": 1.8629147326024529, + "grad_norm": 6.44750465798182, + "learning_rate": 6.126095832818507e-08, + "loss": 0.6119, + "step": 25786 + }, + { + "epoch": 1.8629869778026622, + "grad_norm": 7.100146128265573, + "learning_rate": 6.119661897642209e-08, + "loss": 0.6187, + "step": 25787 + }, + { + "epoch": 1.8630592230028717, + "grad_norm": 7.872651491715467, + "learning_rate": 6.113231300998418e-08, + "loss": 0.6117, + "step": 25788 + }, + { + "epoch": 1.8631314682030813, + "grad_norm": 7.428243566970214, + "learning_rate": 6.10680404297509e-08, + "loss": 0.6528, + "step": 25789 + }, + { + "epoch": 1.8632037134032906, + "grad_norm": 7.333189818394016, + "learning_rate": 6.100380123660293e-08, + "loss": 0.5947, + "step": 25790 + }, + { + "epoch": 1.8632759586035004, + "grad_norm": 7.0556272301158005, + "learning_rate": 6.093959543141875e-08, + "loss": 0.5389, + "step": 25791 + }, + { + "epoch": 1.8633482038037097, + "grad_norm": 6.974828421283756, + "learning_rate": 6.087542301507766e-08, + "loss": 0.6341, + "step": 25792 + }, + { + "epoch": 1.8634204490039195, + "grad_norm": 9.104304425370684, + "learning_rate": 6.081128398845809e-08, + "loss": 0.6491, + "step": 25793 + }, + { + "epoch": 1.8634926942041288, + "grad_norm": 6.604892662729776, + "learning_rate": 6.074717835243854e-08, + "loss": 0.5865, + "step": 25794 + }, + { + "epoch": 1.8635649394043383, + "grad_norm": 8.042474050625247, + "learning_rate": 6.06831061078958e-08, + "loss": 0.6267, + "step": 25795 + }, + { + "epoch": 1.8636371846045479, + "grad_norm": 7.392005065718067, + "learning_rate": 6.061906725570748e-08, + "loss": 0.6004, + "step": 25796 + }, + { + "epoch": 1.8637094298047574, + "grad_norm": 6.854535526758736, + "learning_rate": 6.055506179675041e-08, + "loss": 0.6724, + "step": 25797 + }, + { + "epoch": 1.863781675004967, + "grad_norm": 7.1671339666322895, + "learning_rate": 6.049108973189998e-08, + "loss": 0.595, + "step": 25798 + }, + { + "epoch": 1.8638539202051763, + "grad_norm": 7.9822400368480135, + "learning_rate": 6.042715106203245e-08, + "loss": 0.5855, + "step": 25799 + }, + { + "epoch": 1.863926165405386, + "grad_norm": 8.011889682324531, + "learning_rate": 6.036324578802294e-08, + "loss": 0.6031, + "step": 25800 + }, + { + "epoch": 1.8639984106055953, + "grad_norm": 6.217382232691873, + "learning_rate": 6.029937391074659e-08, + "loss": 0.642, + "step": 25801 + }, + { + "epoch": 1.8640706558058049, + "grad_norm": 6.82631744971741, + "learning_rate": 6.023553543107713e-08, + "loss": 0.6629, + "step": 25802 + }, + { + "epoch": 1.8641429010060144, + "grad_norm": 6.851200594418539, + "learning_rate": 6.017173034988916e-08, + "loss": 0.5898, + "step": 25803 + }, + { + "epoch": 1.864215146206224, + "grad_norm": 8.685561588207124, + "learning_rate": 6.010795866805586e-08, + "loss": 0.5291, + "step": 25804 + }, + { + "epoch": 1.8642873914064335, + "grad_norm": 9.514865688352243, + "learning_rate": 6.004422038645014e-08, + "loss": 0.6449, + "step": 25805 + }, + { + "epoch": 1.8643596366066428, + "grad_norm": 7.98686164609463, + "learning_rate": 5.998051550594436e-08, + "loss": 0.5605, + "step": 25806 + }, + { + "epoch": 1.8644318818068526, + "grad_norm": 7.07553895440915, + "learning_rate": 5.99168440274106e-08, + "loss": 0.5985, + "step": 25807 + }, + { + "epoch": 1.864504127007062, + "grad_norm": 7.2769948944329705, + "learning_rate": 5.985320595172123e-08, + "loss": 0.6237, + "step": 25808 + }, + { + "epoch": 1.8645763722072715, + "grad_norm": 6.418945535357709, + "learning_rate": 5.978960127974637e-08, + "loss": 0.584, + "step": 25809 + }, + { + "epoch": 1.864648617407481, + "grad_norm": 7.991942162351226, + "learning_rate": 5.972603001235728e-08, + "loss": 0.5942, + "step": 25810 + }, + { + "epoch": 1.8647208626076905, + "grad_norm": 6.940807756802844, + "learning_rate": 5.966249215042381e-08, + "loss": 0.5161, + "step": 25811 + }, + { + "epoch": 1.8647931078079, + "grad_norm": 7.59072074594179, + "learning_rate": 5.959898769481637e-08, + "loss": 0.5421, + "step": 25812 + }, + { + "epoch": 1.8648653530081094, + "grad_norm": 7.555135119963653, + "learning_rate": 5.9535516646403727e-08, + "loss": 0.5891, + "step": 25813 + }, + { + "epoch": 1.8649375982083192, + "grad_norm": 6.85680436820235, + "learning_rate": 5.9472079006055174e-08, + "loss": 0.5911, + "step": 25814 + }, + { + "epoch": 1.8650098434085285, + "grad_norm": 7.365215729391673, + "learning_rate": 5.940867477463891e-08, + "loss": 0.6705, + "step": 25815 + }, + { + "epoch": 1.865082088608738, + "grad_norm": 8.43195437690799, + "learning_rate": 5.934530395302285e-08, + "loss": 0.6534, + "step": 25816 + }, + { + "epoch": 1.8651543338089476, + "grad_norm": 7.6929634342928725, + "learning_rate": 5.9281966542074355e-08, + "loss": 0.6471, + "step": 25817 + }, + { + "epoch": 1.8652265790091571, + "grad_norm": 7.78221963869453, + "learning_rate": 5.921866254266079e-08, + "loss": 0.5822, + "step": 25818 + }, + { + "epoch": 1.8652988242093667, + "grad_norm": 6.130264199865798, + "learning_rate": 5.9155391955648675e-08, + "loss": 0.588, + "step": 25819 + }, + { + "epoch": 1.865371069409576, + "grad_norm": 8.201645606926764, + "learning_rate": 5.9092154781904e-08, + "loss": 0.6539, + "step": 25820 + }, + { + "epoch": 1.8654433146097857, + "grad_norm": 6.992476916583642, + "learning_rate": 5.9028951022292166e-08, + "loss": 0.5983, + "step": 25821 + }, + { + "epoch": 1.865515559809995, + "grad_norm": 8.218086709065245, + "learning_rate": 5.896578067767916e-08, + "loss": 0.6593, + "step": 25822 + }, + { + "epoch": 1.8655878050102046, + "grad_norm": 5.965125776117579, + "learning_rate": 5.890264374892901e-08, + "loss": 0.57, + "step": 25823 + }, + { + "epoch": 1.8656600502104141, + "grad_norm": 7.3677599007982435, + "learning_rate": 5.88395402369063e-08, + "loss": 0.6193, + "step": 25824 + }, + { + "epoch": 1.8657322954106237, + "grad_norm": 7.234411647366006, + "learning_rate": 5.877647014247479e-08, + "loss": 0.5728, + "step": 25825 + }, + { + "epoch": 1.8658045406108332, + "grad_norm": 7.642950971610596, + "learning_rate": 5.8713433466498214e-08, + "loss": 0.6143, + "step": 25826 + }, + { + "epoch": 1.8658767858110425, + "grad_norm": 6.068521557013656, + "learning_rate": 5.865043020983896e-08, + "loss": 0.5725, + "step": 25827 + }, + { + "epoch": 1.8659490310112523, + "grad_norm": 8.471336504666366, + "learning_rate": 5.858746037335994e-08, + "loss": 0.5978, + "step": 25828 + }, + { + "epoch": 1.8660212762114616, + "grad_norm": 8.27180789103161, + "learning_rate": 5.852452395792268e-08, + "loss": 0.6086, + "step": 25829 + }, + { + "epoch": 1.8660935214116712, + "grad_norm": 6.61499938457368, + "learning_rate": 5.846162096438901e-08, + "loss": 0.5526, + "step": 25830 + }, + { + "epoch": 1.8661657666118807, + "grad_norm": 7.746705370142644, + "learning_rate": 5.839875139362017e-08, + "loss": 0.6787, + "step": 25831 + }, + { + "epoch": 1.8662380118120903, + "grad_norm": 6.854970156370235, + "learning_rate": 5.8335915246476585e-08, + "loss": 0.5775, + "step": 25832 + }, + { + "epoch": 1.8663102570122998, + "grad_norm": 7.166071785774071, + "learning_rate": 5.827311252381868e-08, + "loss": 0.6369, + "step": 25833 + }, + { + "epoch": 1.8663825022125091, + "grad_norm": 7.594799832587447, + "learning_rate": 5.821034322650576e-08, + "loss": 0.5814, + "step": 25834 + }, + { + "epoch": 1.8664547474127189, + "grad_norm": 6.456959174204507, + "learning_rate": 5.8147607355397154e-08, + "loss": 0.6055, + "step": 25835 + }, + { + "epoch": 1.8665269926129282, + "grad_norm": 7.910125144614767, + "learning_rate": 5.8084904911352437e-08, + "loss": 0.5964, + "step": 25836 + }, + { + "epoch": 1.8665992378131377, + "grad_norm": 5.723028531512818, + "learning_rate": 5.8022235895228706e-08, + "loss": 0.6016, + "step": 25837 + }, + { + "epoch": 1.8666714830133473, + "grad_norm": 7.148366792256176, + "learning_rate": 5.795960030788472e-08, + "loss": 0.5582, + "step": 25838 + }, + { + "epoch": 1.8667437282135568, + "grad_norm": 7.258823156052215, + "learning_rate": 5.7896998150177577e-08, + "loss": 0.6122, + "step": 25839 + }, + { + "epoch": 1.8668159734137664, + "grad_norm": 8.453696748079865, + "learning_rate": 5.783442942296463e-08, + "loss": 0.6051, + "step": 25840 + }, + { + "epoch": 1.8668882186139757, + "grad_norm": 6.698267459335071, + "learning_rate": 5.777189412710188e-08, + "loss": 0.5521, + "step": 25841 + }, + { + "epoch": 1.8669604638141855, + "grad_norm": 6.666515761892748, + "learning_rate": 5.7709392263445574e-08, + "loss": 0.6254, + "step": 25842 + }, + { + "epoch": 1.8670327090143948, + "grad_norm": 7.266313286451078, + "learning_rate": 5.7646923832851695e-08, + "loss": 0.5785, + "step": 25843 + }, + { + "epoch": 1.8671049542146043, + "grad_norm": 7.897759141477227, + "learning_rate": 5.758448883617484e-08, + "loss": 0.6353, + "step": 25844 + }, + { + "epoch": 1.8671771994148139, + "grad_norm": 8.05046993803123, + "learning_rate": 5.7522087274270163e-08, + "loss": 0.5433, + "step": 25845 + }, + { + "epoch": 1.8672494446150234, + "grad_norm": 9.290998175369886, + "learning_rate": 5.745971914799142e-08, + "loss": 0.5859, + "step": 25846 + }, + { + "epoch": 1.867321689815233, + "grad_norm": 7.1932334257063575, + "learning_rate": 5.739738445819265e-08, + "loss": 0.5602, + "step": 25847 + }, + { + "epoch": 1.8673939350154423, + "grad_norm": 7.645342846624449, + "learning_rate": 5.733508320572706e-08, + "loss": 0.6117, + "step": 25848 + }, + { + "epoch": 1.867466180215652, + "grad_norm": 6.821293466466903, + "learning_rate": 5.727281539144758e-08, + "loss": 0.5966, + "step": 25849 + }, + { + "epoch": 1.8675384254158613, + "grad_norm": 8.51240352626362, + "learning_rate": 5.7210581016206577e-08, + "loss": 0.5462, + "step": 25850 + }, + { + "epoch": 1.8676106706160711, + "grad_norm": 9.154523354689882, + "learning_rate": 5.7148380080856156e-08, + "loss": 0.6554, + "step": 25851 + }, + { + "epoch": 1.8676829158162804, + "grad_norm": 7.345185614331633, + "learning_rate": 5.708621258624758e-08, + "loss": 0.59, + "step": 25852 + }, + { + "epoch": 1.86775516101649, + "grad_norm": 7.400674129400651, + "learning_rate": 5.702407853323211e-08, + "loss": 0.5325, + "step": 25853 + }, + { + "epoch": 1.8678274062166995, + "grad_norm": 8.240778074842726, + "learning_rate": 5.696197792266017e-08, + "loss": 0.599, + "step": 25854 + }, + { + "epoch": 1.8678996514169088, + "grad_norm": 9.683481478146053, + "learning_rate": 5.689991075538165e-08, + "loss": 0.6977, + "step": 25855 + }, + { + "epoch": 1.8679718966171186, + "grad_norm": 7.058498362924248, + "learning_rate": 5.683787703224641e-08, + "loss": 0.5372, + "step": 25856 + }, + { + "epoch": 1.868044141817328, + "grad_norm": 7.9557297793490624, + "learning_rate": 5.677587675410351e-08, + "loss": 0.5798, + "step": 25857 + }, + { + "epoch": 1.8681163870175377, + "grad_norm": 7.548202745359891, + "learning_rate": 5.6713909921801976e-08, + "loss": 0.5992, + "step": 25858 + }, + { + "epoch": 1.868188632217747, + "grad_norm": 7.580125179012032, + "learning_rate": 5.6651976536190036e-08, + "loss": 0.6154, + "step": 25859 + }, + { + "epoch": 1.8682608774179565, + "grad_norm": 8.35875800877252, + "learning_rate": 5.6590076598115326e-08, + "loss": 0.6272, + "step": 25860 + }, + { + "epoch": 1.868333122618166, + "grad_norm": 9.600457546138873, + "learning_rate": 5.652821010842552e-08, + "loss": 0.5751, + "step": 25861 + }, + { + "epoch": 1.8684053678183754, + "grad_norm": 8.444058956025357, + "learning_rate": 5.646637706796687e-08, + "loss": 0.6277, + "step": 25862 + }, + { + "epoch": 1.8684776130185852, + "grad_norm": 6.052666308305561, + "learning_rate": 5.6404577477586486e-08, + "loss": 0.5952, + "step": 25863 + }, + { + "epoch": 1.8685498582187945, + "grad_norm": 6.886149572781636, + "learning_rate": 5.63428113381298e-08, + "loss": 0.5418, + "step": 25864 + }, + { + "epoch": 1.8686221034190043, + "grad_norm": 7.8333841957984145, + "learning_rate": 5.6281078650443077e-08, + "loss": 0.5929, + "step": 25865 + }, + { + "epoch": 1.8686943486192136, + "grad_norm": 7.92971107216324, + "learning_rate": 5.6219379415370644e-08, + "loss": 0.6142, + "step": 25866 + }, + { + "epoch": 1.8687665938194231, + "grad_norm": 7.415516139025947, + "learning_rate": 5.615771363375766e-08, + "loss": 0.6483, + "step": 25867 + }, + { + "epoch": 1.8688388390196327, + "grad_norm": 7.059269529165209, + "learning_rate": 5.609608130644789e-08, + "loss": 0.6504, + "step": 25868 + }, + { + "epoch": 1.868911084219842, + "grad_norm": 7.2239480067818524, + "learning_rate": 5.603448243428511e-08, + "loss": 0.5954, + "step": 25869 + }, + { + "epoch": 1.8689833294200517, + "grad_norm": 9.301966092238368, + "learning_rate": 5.5972917018112805e-08, + "loss": 0.6188, + "step": 25870 + }, + { + "epoch": 1.869055574620261, + "grad_norm": 8.648181666834834, + "learning_rate": 5.5911385058773635e-08, + "loss": 0.606, + "step": 25871 + }, + { + "epoch": 1.8691278198204708, + "grad_norm": 7.187033198581301, + "learning_rate": 5.584988655710999e-08, + "loss": 0.6176, + "step": 25872 + }, + { + "epoch": 1.8692000650206801, + "grad_norm": 7.678157432776434, + "learning_rate": 5.578842151396341e-08, + "loss": 0.5996, + "step": 25873 + }, + { + "epoch": 1.8692723102208897, + "grad_norm": 7.43501996918224, + "learning_rate": 5.572698993017572e-08, + "loss": 0.5915, + "step": 25874 + }, + { + "epoch": 1.8693445554210992, + "grad_norm": 6.561362322330809, + "learning_rate": 5.566559180658737e-08, + "loss": 0.5606, + "step": 25875 + }, + { + "epoch": 1.8694168006213088, + "grad_norm": 7.329416905378733, + "learning_rate": 5.560422714403962e-08, + "loss": 0.6795, + "step": 25876 + }, + { + "epoch": 1.8694890458215183, + "grad_norm": 8.16546134098575, + "learning_rate": 5.5542895943371814e-08, + "loss": 0.5778, + "step": 25877 + }, + { + "epoch": 1.8695612910217276, + "grad_norm": 6.555236284200533, + "learning_rate": 5.548159820542354e-08, + "loss": 0.6259, + "step": 25878 + }, + { + "epoch": 1.8696335362219374, + "grad_norm": 5.785750003818818, + "learning_rate": 5.542033393103441e-08, + "loss": 0.4748, + "step": 25879 + }, + { + "epoch": 1.8697057814221467, + "grad_norm": 6.867977925748904, + "learning_rate": 5.535910312104292e-08, + "loss": 0.5997, + "step": 25880 + }, + { + "epoch": 1.8697780266223563, + "grad_norm": 8.721839712817143, + "learning_rate": 5.529790577628674e-08, + "loss": 0.664, + "step": 25881 + }, + { + "epoch": 1.8698502718225658, + "grad_norm": 8.516607777291764, + "learning_rate": 5.523674189760436e-08, + "loss": 0.5841, + "step": 25882 + }, + { + "epoch": 1.8699225170227753, + "grad_norm": 6.295193355338938, + "learning_rate": 5.5175611485832895e-08, + "loss": 0.5656, + "step": 25883 + }, + { + "epoch": 1.8699947622229849, + "grad_norm": 6.390674245779848, + "learning_rate": 5.5114514541808884e-08, + "loss": 0.6585, + "step": 25884 + }, + { + "epoch": 1.8700670074231942, + "grad_norm": 7.3316222997966145, + "learning_rate": 5.505345106636861e-08, + "loss": 0.6396, + "step": 25885 + }, + { + "epoch": 1.870139252623404, + "grad_norm": 6.522339213309636, + "learning_rate": 5.499242106034836e-08, + "loss": 0.6163, + "step": 25886 + }, + { + "epoch": 1.8702114978236133, + "grad_norm": 6.534636719113102, + "learning_rate": 5.493142452458355e-08, + "loss": 0.5621, + "step": 25887 + }, + { + "epoch": 1.8702837430238228, + "grad_norm": 6.176361539289428, + "learning_rate": 5.487046145990882e-08, + "loss": 0.5608, + "step": 25888 + }, + { + "epoch": 1.8703559882240324, + "grad_norm": 7.043902732459165, + "learning_rate": 5.480953186715904e-08, + "loss": 0.5791, + "step": 25889 + }, + { + "epoch": 1.870428233424242, + "grad_norm": 8.09210023304461, + "learning_rate": 5.4748635747168546e-08, + "loss": 0.5916, + "step": 25890 + }, + { + "epoch": 1.8705004786244515, + "grad_norm": 7.269143973093565, + "learning_rate": 5.468777310077028e-08, + "loss": 0.6709, + "step": 25891 + }, + { + "epoch": 1.8705727238246608, + "grad_norm": 6.393432895705915, + "learning_rate": 5.462694392879775e-08, + "loss": 0.5727, + "step": 25892 + }, + { + "epoch": 1.8706449690248705, + "grad_norm": 7.434964557247687, + "learning_rate": 5.4566148232083614e-08, + "loss": 0.5753, + "step": 25893 + }, + { + "epoch": 1.8707172142250799, + "grad_norm": 7.873435455400732, + "learning_rate": 5.450538601146055e-08, + "loss": 0.6072, + "step": 25894 + }, + { + "epoch": 1.8707894594252894, + "grad_norm": 8.854843676246448, + "learning_rate": 5.444465726775955e-08, + "loss": 0.692, + "step": 25895 + }, + { + "epoch": 1.870861704625499, + "grad_norm": 8.236805392488673, + "learning_rate": 5.4383962001812174e-08, + "loss": 0.6341, + "step": 25896 + }, + { + "epoch": 1.8709339498257085, + "grad_norm": 6.298545294037737, + "learning_rate": 5.43233002144497e-08, + "loss": 0.5918, + "step": 25897 + }, + { + "epoch": 1.871006195025918, + "grad_norm": 6.912461745309777, + "learning_rate": 5.426267190650231e-08, + "loss": 0.5586, + "step": 25898 + }, + { + "epoch": 1.8710784402261273, + "grad_norm": 6.335101466401277, + "learning_rate": 5.420207707879988e-08, + "loss": 0.5875, + "step": 25899 + }, + { + "epoch": 1.8711506854263371, + "grad_norm": 8.009804916006411, + "learning_rate": 5.414151573217202e-08, + "loss": 0.6368, + "step": 25900 + }, + { + "epoch": 1.8712229306265464, + "grad_norm": 7.151355652237897, + "learning_rate": 5.408098786744809e-08, + "loss": 0.6041, + "step": 25901 + }, + { + "epoch": 1.871295175826756, + "grad_norm": 8.057617128321567, + "learning_rate": 5.4020493485455736e-08, + "loss": 0.6735, + "step": 25902 + }, + { + "epoch": 1.8713674210269655, + "grad_norm": 7.966356344310805, + "learning_rate": 5.396003258702403e-08, + "loss": 0.5505, + "step": 25903 + }, + { + "epoch": 1.871439666227175, + "grad_norm": 7.174484011076707, + "learning_rate": 5.38996051729801e-08, + "loss": 0.6258, + "step": 25904 + }, + { + "epoch": 1.8715119114273846, + "grad_norm": 8.310746775283834, + "learning_rate": 5.383921124415131e-08, + "loss": 0.6262, + "step": 25905 + }, + { + "epoch": 1.871584156627594, + "grad_norm": 6.721072057404985, + "learning_rate": 5.377885080136397e-08, + "loss": 0.5943, + "step": 25906 + }, + { + "epoch": 1.8716564018278037, + "grad_norm": 7.7322361310255845, + "learning_rate": 5.371852384544518e-08, + "loss": 0.6352, + "step": 25907 + }, + { + "epoch": 1.871728647028013, + "grad_norm": 6.754174248677194, + "learning_rate": 5.3658230377220674e-08, + "loss": 0.604, + "step": 25908 + }, + { + "epoch": 1.8718008922282225, + "grad_norm": 6.34534573333762, + "learning_rate": 5.359797039751535e-08, + "loss": 0.6106, + "step": 25909 + }, + { + "epoch": 1.871873137428432, + "grad_norm": 7.280308478390689, + "learning_rate": 5.35377439071541e-08, + "loss": 0.5627, + "step": 25910 + }, + { + "epoch": 1.8719453826286416, + "grad_norm": 7.278044293727201, + "learning_rate": 5.3477550906961825e-08, + "loss": 0.6059, + "step": 25911 + }, + { + "epoch": 1.8720176278288512, + "grad_norm": 8.196921947837817, + "learning_rate": 5.341739139776203e-08, + "loss": 0.5823, + "step": 25912 + }, + { + "epoch": 1.8720898730290605, + "grad_norm": 6.990812266556744, + "learning_rate": 5.3357265380378774e-08, + "loss": 0.5523, + "step": 25913 + }, + { + "epoch": 1.8721621182292703, + "grad_norm": 7.895561139758841, + "learning_rate": 5.329717285563446e-08, + "loss": 0.6169, + "step": 25914 + }, + { + "epoch": 1.8722343634294796, + "grad_norm": 6.86168699468913, + "learning_rate": 5.323711382435232e-08, + "loss": 0.6159, + "step": 25915 + }, + { + "epoch": 1.8723066086296891, + "grad_norm": 6.51927730369709, + "learning_rate": 5.317708828735446e-08, + "loss": 0.522, + "step": 25916 + }, + { + "epoch": 1.8723788538298987, + "grad_norm": 7.360938681576269, + "learning_rate": 5.311709624546218e-08, + "loss": 0.5991, + "step": 25917 + }, + { + "epoch": 1.8724510990301082, + "grad_norm": 6.713096901835079, + "learning_rate": 5.305713769949733e-08, + "loss": 0.5794, + "step": 25918 + }, + { + "epoch": 1.8725233442303177, + "grad_norm": 7.989722325700492, + "learning_rate": 5.2997212650280074e-08, + "loss": 0.6428, + "step": 25919 + }, + { + "epoch": 1.872595589430527, + "grad_norm": 8.07655139109799, + "learning_rate": 5.2937321098630865e-08, + "loss": 0.6118, + "step": 25920 + }, + { + "epoch": 1.8726678346307368, + "grad_norm": 6.546266659534607, + "learning_rate": 5.2877463045369893e-08, + "loss": 0.5725, + "step": 25921 + }, + { + "epoch": 1.8727400798309461, + "grad_norm": 7.815800328291948, + "learning_rate": 5.281763849131649e-08, + "loss": 0.6413, + "step": 25922 + }, + { + "epoch": 1.8728123250311557, + "grad_norm": 7.885427202010245, + "learning_rate": 5.275784743728946e-08, + "loss": 0.6026, + "step": 25923 + }, + { + "epoch": 1.8728845702313652, + "grad_norm": 5.863966300128521, + "learning_rate": 5.2698089884107015e-08, + "loss": 0.5863, + "step": 25924 + }, + { + "epoch": 1.8729568154315748, + "grad_norm": 8.390517250093769, + "learning_rate": 5.263836583258769e-08, + "loss": 0.5678, + "step": 25925 + }, + { + "epoch": 1.8730290606317843, + "grad_norm": 6.279285332415519, + "learning_rate": 5.257867528354915e-08, + "loss": 0.5915, + "step": 25926 + }, + { + "epoch": 1.8731013058319936, + "grad_norm": 8.610934966005116, + "learning_rate": 5.2519018237807975e-08, + "loss": 0.6939, + "step": 25927 + }, + { + "epoch": 1.8731735510322034, + "grad_norm": 6.963117435822035, + "learning_rate": 5.2459394696181e-08, + "loss": 0.5478, + "step": 25928 + }, + { + "epoch": 1.8732457962324127, + "grad_norm": 6.403026830647685, + "learning_rate": 5.23998046594848e-08, + "loss": 0.5801, + "step": 25929 + }, + { + "epoch": 1.8733180414326225, + "grad_norm": 7.111407247311138, + "learning_rate": 5.234024812853483e-08, + "loss": 0.6137, + "step": 25930 + }, + { + "epoch": 1.8733902866328318, + "grad_norm": 9.271705934984361, + "learning_rate": 5.2280725104146e-08, + "loss": 0.6116, + "step": 25931 + }, + { + "epoch": 1.8734625318330413, + "grad_norm": 7.037643577506851, + "learning_rate": 5.2221235587133765e-08, + "loss": 0.5618, + "step": 25932 + }, + { + "epoch": 1.8735347770332509, + "grad_norm": 7.064194467548426, + "learning_rate": 5.216177957831248e-08, + "loss": 0.6063, + "step": 25933 + }, + { + "epoch": 1.8736070222334602, + "grad_norm": 8.501200871627821, + "learning_rate": 5.210235707849537e-08, + "loss": 0.6986, + "step": 25934 + }, + { + "epoch": 1.87367926743367, + "grad_norm": 8.482689677803961, + "learning_rate": 5.2042968088496515e-08, + "loss": 0.5674, + "step": 25935 + }, + { + "epoch": 1.8737515126338793, + "grad_norm": 7.299344480163126, + "learning_rate": 5.1983612609128874e-08, + "loss": 0.5725, + "step": 25936 + }, + { + "epoch": 1.873823757834089, + "grad_norm": 6.755096771879803, + "learning_rate": 5.192429064120458e-08, + "loss": 0.5105, + "step": 25937 + }, + { + "epoch": 1.8738960030342984, + "grad_norm": 6.733186196460125, + "learning_rate": 5.186500218553603e-08, + "loss": 0.575, + "step": 25938 + }, + { + "epoch": 1.873968248234508, + "grad_norm": 7.90511075583459, + "learning_rate": 5.1805747242934525e-08, + "loss": 0.6445, + "step": 25939 + }, + { + "epoch": 1.8740404934347175, + "grad_norm": 7.095988857187427, + "learning_rate": 5.1746525814211914e-08, + "loss": 0.6155, + "step": 25940 + }, + { + "epoch": 1.8741127386349268, + "grad_norm": 7.273302710049385, + "learning_rate": 5.16873379001781e-08, + "loss": 0.6364, + "step": 25941 + }, + { + "epoch": 1.8741849838351365, + "grad_norm": 6.830317444245191, + "learning_rate": 5.1628183501643556e-08, + "loss": 0.5772, + "step": 25942 + }, + { + "epoch": 1.8742572290353459, + "grad_norm": 6.415186162136922, + "learning_rate": 5.156906261941846e-08, + "loss": 0.6149, + "step": 25943 + }, + { + "epoch": 1.8743294742355556, + "grad_norm": 6.833210672657754, + "learning_rate": 5.1509975254311615e-08, + "loss": 0.5432, + "step": 25944 + }, + { + "epoch": 1.874401719435765, + "grad_norm": 6.887230276929266, + "learning_rate": 5.145092140713181e-08, + "loss": 0.6061, + "step": 25945 + }, + { + "epoch": 1.8744739646359745, + "grad_norm": 7.797463801900164, + "learning_rate": 5.1391901078688136e-08, + "loss": 0.5425, + "step": 25946 + }, + { + "epoch": 1.874546209836184, + "grad_norm": 6.598823800907244, + "learning_rate": 5.133291426978798e-08, + "loss": 0.5659, + "step": 25947 + }, + { + "epoch": 1.8746184550363936, + "grad_norm": 8.017719196003128, + "learning_rate": 5.1273960981239045e-08, + "loss": 0.5865, + "step": 25948 + }, + { + "epoch": 1.8746907002366031, + "grad_norm": 6.897362146773231, + "learning_rate": 5.121504121384818e-08, + "loss": 0.5073, + "step": 25949 + }, + { + "epoch": 1.8747629454368124, + "grad_norm": 7.663074674728088, + "learning_rate": 5.115615496842197e-08, + "loss": 0.5824, + "step": 25950 + }, + { + "epoch": 1.8748351906370222, + "grad_norm": 7.571112402512236, + "learning_rate": 5.1097302245766975e-08, + "loss": 0.5971, + "step": 25951 + }, + { + "epoch": 1.8749074358372315, + "grad_norm": 7.938089423935244, + "learning_rate": 5.103848304668813e-08, + "loss": 0.6491, + "step": 25952 + }, + { + "epoch": 1.874979681037441, + "grad_norm": 8.454649274828114, + "learning_rate": 5.097969737199088e-08, + "loss": 0.5592, + "step": 25953 + }, + { + "epoch": 1.8750519262376506, + "grad_norm": 7.891869293909298, + "learning_rate": 5.092094522248015e-08, + "loss": 0.6093, + "step": 25954 + }, + { + "epoch": 1.8751241714378601, + "grad_norm": 8.88007134623691, + "learning_rate": 5.086222659896001e-08, + "loss": 0.6023, + "step": 25955 + }, + { + "epoch": 1.8751964166380697, + "grad_norm": 7.170227792438604, + "learning_rate": 5.080354150223427e-08, + "loss": 0.5793, + "step": 25956 + }, + { + "epoch": 1.875268661838279, + "grad_norm": 7.146421654804712, + "learning_rate": 5.0744889933106447e-08, + "loss": 0.58, + "step": 25957 + }, + { + "epoch": 1.8753409070384888, + "grad_norm": 7.067583533253195, + "learning_rate": 5.068627189237979e-08, + "loss": 0.5272, + "step": 25958 + }, + { + "epoch": 1.875413152238698, + "grad_norm": 7.530006654288099, + "learning_rate": 5.062768738085588e-08, + "loss": 0.6239, + "step": 25959 + }, + { + "epoch": 1.8754853974389076, + "grad_norm": 6.429332403525991, + "learning_rate": 5.0569136399337135e-08, + "loss": 0.579, + "step": 25960 + }, + { + "epoch": 1.8755576426391172, + "grad_norm": 7.5882669127258255, + "learning_rate": 5.051061894862513e-08, + "loss": 0.614, + "step": 25961 + }, + { + "epoch": 1.8756298878393267, + "grad_norm": 8.155290686071, + "learning_rate": 5.04521350295209e-08, + "loss": 0.5928, + "step": 25962 + }, + { + "epoch": 1.8757021330395363, + "grad_norm": 7.91894067596439, + "learning_rate": 5.0393684642824626e-08, + "loss": 0.6474, + "step": 25963 + }, + { + "epoch": 1.8757743782397456, + "grad_norm": 8.460522698742524, + "learning_rate": 5.033526778933706e-08, + "loss": 0.6047, + "step": 25964 + }, + { + "epoch": 1.8758466234399553, + "grad_norm": 7.997930020513877, + "learning_rate": 5.027688446985785e-08, + "loss": 0.6053, + "step": 25965 + }, + { + "epoch": 1.8759188686401647, + "grad_norm": 6.776984917103652, + "learning_rate": 5.021853468518578e-08, + "loss": 0.5939, + "step": 25966 + }, + { + "epoch": 1.8759911138403742, + "grad_norm": 9.014421987680223, + "learning_rate": 5.016021843611968e-08, + "loss": 0.6804, + "step": 25967 + }, + { + "epoch": 1.8760633590405837, + "grad_norm": 9.089480312083305, + "learning_rate": 5.010193572345834e-08, + "loss": 0.5669, + "step": 25968 + }, + { + "epoch": 1.8761356042407933, + "grad_norm": 7.753925682831442, + "learning_rate": 5.004368654799918e-08, + "loss": 0.5757, + "step": 25969 + }, + { + "epoch": 1.8762078494410028, + "grad_norm": 6.4574043179998455, + "learning_rate": 4.9985470910539624e-08, + "loss": 0.5743, + "step": 25970 + }, + { + "epoch": 1.8762800946412121, + "grad_norm": 7.111734992051375, + "learning_rate": 4.9927288811876805e-08, + "loss": 0.5556, + "step": 25971 + }, + { + "epoch": 1.876352339841422, + "grad_norm": 6.987264765019598, + "learning_rate": 4.986914025280676e-08, + "loss": 0.5807, + "step": 25972 + }, + { + "epoch": 1.8764245850416312, + "grad_norm": 8.257881792223472, + "learning_rate": 4.981102523412579e-08, + "loss": 0.5702, + "step": 25973 + }, + { + "epoch": 1.8764968302418408, + "grad_norm": 8.544676558978614, + "learning_rate": 4.9752943756629644e-08, + "loss": 0.6299, + "step": 25974 + }, + { + "epoch": 1.8765690754420503, + "grad_norm": 5.923822651353389, + "learning_rate": 4.9694895821113255e-08, + "loss": 0.5559, + "step": 25975 + }, + { + "epoch": 1.8766413206422599, + "grad_norm": 8.032824885105216, + "learning_rate": 4.963688142837125e-08, + "loss": 0.606, + "step": 25976 + }, + { + "epoch": 1.8767135658424694, + "grad_norm": 7.664261094999584, + "learning_rate": 4.9578900579197455e-08, + "loss": 0.6687, + "step": 25977 + }, + { + "epoch": 1.8767858110426787, + "grad_norm": 7.516384792188215, + "learning_rate": 4.952095327438594e-08, + "loss": 0.6841, + "step": 25978 + }, + { + "epoch": 1.8768580562428885, + "grad_norm": 7.335056824883334, + "learning_rate": 4.946303951473025e-08, + "loss": 0.616, + "step": 25979 + }, + { + "epoch": 1.8769303014430978, + "grad_norm": 8.035036611894945, + "learning_rate": 4.940515930102252e-08, + "loss": 0.5979, + "step": 25980 + }, + { + "epoch": 1.8770025466433073, + "grad_norm": 6.822183149283173, + "learning_rate": 4.934731263405546e-08, + "loss": 0.6068, + "step": 25981 + }, + { + "epoch": 1.8770747918435169, + "grad_norm": 7.678026021553097, + "learning_rate": 4.928949951462064e-08, + "loss": 0.5474, + "step": 25982 + }, + { + "epoch": 1.8771470370437264, + "grad_norm": 7.529399977001612, + "learning_rate": 4.923171994350995e-08, + "loss": 0.5843, + "step": 25983 + }, + { + "epoch": 1.877219282243936, + "grad_norm": 6.771940952573104, + "learning_rate": 4.917397392151413e-08, + "loss": 0.5141, + "step": 25984 + }, + { + "epoch": 1.8772915274441453, + "grad_norm": 7.721046754701156, + "learning_rate": 4.9116261449423666e-08, + "loss": 0.6687, + "step": 25985 + }, + { + "epoch": 1.877363772644355, + "grad_norm": 6.751737017893178, + "learning_rate": 4.9058582528028754e-08, + "loss": 0.5366, + "step": 25986 + }, + { + "epoch": 1.8774360178445644, + "grad_norm": 7.5434458901232135, + "learning_rate": 4.900093715811849e-08, + "loss": 0.6233, + "step": 25987 + }, + { + "epoch": 1.877508263044774, + "grad_norm": 6.934892318978722, + "learning_rate": 4.894332534048252e-08, + "loss": 0.5792, + "step": 25988 + }, + { + "epoch": 1.8775805082449835, + "grad_norm": 8.409653212284018, + "learning_rate": 4.8885747075909095e-08, + "loss": 0.5978, + "step": 25989 + }, + { + "epoch": 1.877652753445193, + "grad_norm": 6.614509770342253, + "learning_rate": 4.8828202365187036e-08, + "loss": 0.5572, + "step": 25990 + }, + { + "epoch": 1.8777249986454025, + "grad_norm": 7.277863726055817, + "learning_rate": 4.8770691209103217e-08, + "loss": 0.5756, + "step": 25991 + }, + { + "epoch": 1.8777972438456119, + "grad_norm": 7.270790282142946, + "learning_rate": 4.871321360844533e-08, + "loss": 0.5969, + "step": 25992 + }, + { + "epoch": 1.8778694890458216, + "grad_norm": 8.781139658170177, + "learning_rate": 4.865576956400081e-08, + "loss": 0.5209, + "step": 25993 + }, + { + "epoch": 1.877941734246031, + "grad_norm": 7.243087103646872, + "learning_rate": 4.859835907655514e-08, + "loss": 0.5663, + "step": 25994 + }, + { + "epoch": 1.8780139794462405, + "grad_norm": 7.359317722624109, + "learning_rate": 4.8540982146894345e-08, + "loss": 0.5516, + "step": 25995 + }, + { + "epoch": 1.87808622464645, + "grad_norm": 6.607168419983654, + "learning_rate": 4.8483638775804196e-08, + "loss": 0.5813, + "step": 25996 + }, + { + "epoch": 1.8781584698466596, + "grad_norm": 6.787513196170125, + "learning_rate": 4.84263289640699e-08, + "loss": 0.5878, + "step": 25997 + }, + { + "epoch": 1.8782307150468691, + "grad_norm": 7.322588881151479, + "learning_rate": 4.836905271247527e-08, + "loss": 0.6521, + "step": 25998 + }, + { + "epoch": 1.8783029602470784, + "grad_norm": 7.886567596604403, + "learning_rate": 4.831181002180496e-08, + "loss": 0.6425, + "step": 25999 + }, + { + "epoch": 1.8783752054472882, + "grad_norm": 7.618174765747673, + "learning_rate": 4.8254600892841954e-08, + "loss": 0.6556, + "step": 26000 + }, + { + "epoch": 1.8784474506474975, + "grad_norm": 6.6554180932784295, + "learning_rate": 4.8197425326370064e-08, + "loss": 0.6325, + "step": 26001 + }, + { + "epoch": 1.8785196958477073, + "grad_norm": 6.575943900117323, + "learning_rate": 4.8140283323171723e-08, + "loss": 0.5753, + "step": 26002 + }, + { + "epoch": 1.8785919410479166, + "grad_norm": 6.9551263907690615, + "learning_rate": 4.808317488402908e-08, + "loss": 0.665, + "step": 26003 + }, + { + "epoch": 1.8786641862481261, + "grad_norm": 7.165916344450587, + "learning_rate": 4.80261000097243e-08, + "loss": 0.5421, + "step": 26004 + }, + { + "epoch": 1.8787364314483357, + "grad_norm": 7.061287564883235, + "learning_rate": 4.7969058701038126e-08, + "loss": 0.5826, + "step": 26005 + }, + { + "epoch": 1.878808676648545, + "grad_norm": 7.659580549503735, + "learning_rate": 4.791205095875162e-08, + "loss": 0.6022, + "step": 26006 + }, + { + "epoch": 1.8788809218487548, + "grad_norm": 6.832033762596022, + "learning_rate": 4.785507678364526e-08, + "loss": 0.6105, + "step": 26007 + }, + { + "epoch": 1.878953167048964, + "grad_norm": 7.394536150866706, + "learning_rate": 4.779813617649925e-08, + "loss": 0.5751, + "step": 26008 + }, + { + "epoch": 1.8790254122491739, + "grad_norm": 7.4801099885412, + "learning_rate": 4.7741229138092424e-08, + "loss": 0.5151, + "step": 26009 + }, + { + "epoch": 1.8790976574493832, + "grad_norm": 7.80327678222675, + "learning_rate": 4.7684355669203875e-08, + "loss": 0.5608, + "step": 26010 + }, + { + "epoch": 1.8791699026495927, + "grad_norm": 6.508577702518178, + "learning_rate": 4.76275157706127e-08, + "loss": 0.5832, + "step": 26011 + }, + { + "epoch": 1.8792421478498023, + "grad_norm": 7.028075726625991, + "learning_rate": 4.7570709443096606e-08, + "loss": 0.6003, + "step": 26012 + }, + { + "epoch": 1.8793143930500116, + "grad_norm": 6.057914021851299, + "learning_rate": 4.7513936687433316e-08, + "loss": 0.5589, + "step": 26013 + }, + { + "epoch": 1.8793866382502213, + "grad_norm": 7.795295264797919, + "learning_rate": 4.7457197504399973e-08, + "loss": 0.547, + "step": 26014 + }, + { + "epoch": 1.8794588834504307, + "grad_norm": 7.626227608338194, + "learning_rate": 4.740049189477347e-08, + "loss": 0.5812, + "step": 26015 + }, + { + "epoch": 1.8795311286506404, + "grad_norm": 7.728683196076322, + "learning_rate": 4.734381985932984e-08, + "loss": 0.6365, + "step": 26016 + }, + { + "epoch": 1.8796033738508497, + "grad_norm": 6.968148107152494, + "learning_rate": 4.728718139884486e-08, + "loss": 0.5688, + "step": 26017 + }, + { + "epoch": 1.8796756190510593, + "grad_norm": 7.627494529039489, + "learning_rate": 4.72305765140943e-08, + "loss": 0.592, + "step": 26018 + }, + { + "epoch": 1.8797478642512688, + "grad_norm": 8.85635308291168, + "learning_rate": 4.717400520585225e-08, + "loss": 0.6124, + "step": 26019 + }, + { + "epoch": 1.8798201094514784, + "grad_norm": 7.4278083294043835, + "learning_rate": 4.711746747489365e-08, + "loss": 0.6461, + "step": 26020 + }, + { + "epoch": 1.879892354651688, + "grad_norm": 7.689879506609442, + "learning_rate": 4.7060963321992334e-08, + "loss": 0.5847, + "step": 26021 + }, + { + "epoch": 1.8799645998518972, + "grad_norm": 6.70603669779242, + "learning_rate": 4.7004492747922125e-08, + "loss": 0.5685, + "step": 26022 + }, + { + "epoch": 1.880036845052107, + "grad_norm": 7.103229121507817, + "learning_rate": 4.694805575345574e-08, + "loss": 0.5678, + "step": 26023 + }, + { + "epoch": 1.8801090902523163, + "grad_norm": 7.674229029598378, + "learning_rate": 4.68916523393656e-08, + "loss": 0.6684, + "step": 26024 + }, + { + "epoch": 1.8801813354525259, + "grad_norm": 7.917157877356019, + "learning_rate": 4.683528250642444e-08, + "loss": 0.6069, + "step": 26025 + }, + { + "epoch": 1.8802535806527354, + "grad_norm": 7.992258856521019, + "learning_rate": 4.6778946255403026e-08, + "loss": 0.5932, + "step": 26026 + }, + { + "epoch": 1.880325825852945, + "grad_norm": 7.549504487406514, + "learning_rate": 4.672264358707324e-08, + "loss": 0.5626, + "step": 26027 + }, + { + "epoch": 1.8803980710531545, + "grad_norm": 7.487762894894978, + "learning_rate": 4.666637450220557e-08, + "loss": 0.6479, + "step": 26028 + }, + { + "epoch": 1.8804703162533638, + "grad_norm": 7.571247432834895, + "learning_rate": 4.661013900157024e-08, + "loss": 0.628, + "step": 26029 + }, + { + "epoch": 1.8805425614535736, + "grad_norm": 6.889614039713033, + "learning_rate": 4.6553937085937194e-08, + "loss": 0.6148, + "step": 26030 + }, + { + "epoch": 1.8806148066537829, + "grad_norm": 7.480873387799523, + "learning_rate": 4.649776875607581e-08, + "loss": 0.5803, + "step": 26031 + }, + { + "epoch": 1.8806870518539924, + "grad_norm": 9.165150557476453, + "learning_rate": 4.64416340127552e-08, + "loss": 0.614, + "step": 26032 + }, + { + "epoch": 1.880759297054202, + "grad_norm": 8.252777094587568, + "learning_rate": 4.638553285674335e-08, + "loss": 0.6635, + "step": 26033 + }, + { + "epoch": 1.8808315422544115, + "grad_norm": 7.843947936216445, + "learning_rate": 4.632946528880855e-08, + "loss": 0.5733, + "step": 26034 + }, + { + "epoch": 1.880903787454621, + "grad_norm": 7.198013974832556, + "learning_rate": 4.6273431309717956e-08, + "loss": 0.5688, + "step": 26035 + }, + { + "epoch": 1.8809760326548304, + "grad_norm": 6.900788922749895, + "learning_rate": 4.621743092023928e-08, + "loss": 0.662, + "step": 26036 + }, + { + "epoch": 1.8810482778550401, + "grad_norm": 7.089513109972688, + "learning_rate": 4.616146412113859e-08, + "loss": 0.6223, + "step": 26037 + }, + { + "epoch": 1.8811205230552495, + "grad_norm": 6.222628663427551, + "learning_rate": 4.610553091318193e-08, + "loss": 0.6076, + "step": 26038 + }, + { + "epoch": 1.881192768255459, + "grad_norm": 8.281860329071838, + "learning_rate": 4.6049631297135356e-08, + "loss": 0.6216, + "step": 26039 + }, + { + "epoch": 1.8812650134556685, + "grad_norm": 8.286607384115442, + "learning_rate": 4.599376527376409e-08, + "loss": 0.609, + "step": 26040 + }, + { + "epoch": 1.881337258655878, + "grad_norm": 6.617968012188602, + "learning_rate": 4.593793284383252e-08, + "loss": 0.6142, + "step": 26041 + }, + { + "epoch": 1.8814095038560876, + "grad_norm": 8.19510117250172, + "learning_rate": 4.5882134008105314e-08, + "loss": 0.5258, + "step": 26042 + }, + { + "epoch": 1.881481749056297, + "grad_norm": 8.658599865078369, + "learning_rate": 4.5826368767346305e-08, + "loss": 0.5603, + "step": 26043 + }, + { + "epoch": 1.8815539942565067, + "grad_norm": 7.499763485040243, + "learning_rate": 4.577063712231877e-08, + "loss": 0.5629, + "step": 26044 + }, + { + "epoch": 1.881626239456716, + "grad_norm": 7.2380778185794945, + "learning_rate": 4.571493907378544e-08, + "loss": 0.5841, + "step": 26045 + }, + { + "epoch": 1.8816984846569256, + "grad_norm": 8.231556013555206, + "learning_rate": 4.565927462250902e-08, + "loss": 0.6048, + "step": 26046 + }, + { + "epoch": 1.8817707298571351, + "grad_norm": 8.30287473600776, + "learning_rate": 4.560364376925142e-08, + "loss": 0.579, + "step": 26047 + }, + { + "epoch": 1.8818429750573447, + "grad_norm": 8.178890448123287, + "learning_rate": 4.5548046514774235e-08, + "loss": 0.6039, + "step": 26048 + }, + { + "epoch": 1.8819152202575542, + "grad_norm": 9.39839731716784, + "learning_rate": 4.549248285983854e-08, + "loss": 0.6454, + "step": 26049 + }, + { + "epoch": 1.8819874654577635, + "grad_norm": 5.7026437216662735, + "learning_rate": 4.543695280520538e-08, + "loss": 0.5507, + "step": 26050 + }, + { + "epoch": 1.8820597106579733, + "grad_norm": 7.345688064993868, + "learning_rate": 4.5381456351634155e-08, + "loss": 0.534, + "step": 26051 + }, + { + "epoch": 1.8821319558581826, + "grad_norm": 8.216514370366397, + "learning_rate": 4.532599349988481e-08, + "loss": 0.6219, + "step": 26052 + }, + { + "epoch": 1.8822042010583921, + "grad_norm": 6.70740377805709, + "learning_rate": 4.5270564250716745e-08, + "loss": 0.5367, + "step": 26053 + }, + { + "epoch": 1.8822764462586017, + "grad_norm": 7.632533619841305, + "learning_rate": 4.521516860488878e-08, + "loss": 0.5758, + "step": 26054 + }, + { + "epoch": 1.8823486914588112, + "grad_norm": 7.059301951968925, + "learning_rate": 4.5159806563158934e-08, + "loss": 0.5722, + "step": 26055 + }, + { + "epoch": 1.8824209366590208, + "grad_norm": 7.75686144881632, + "learning_rate": 4.510447812628549e-08, + "loss": 0.5562, + "step": 26056 + }, + { + "epoch": 1.88249318185923, + "grad_norm": 7.438976750303279, + "learning_rate": 4.504918329502533e-08, + "loss": 0.5987, + "step": 26057 + }, + { + "epoch": 1.8825654270594399, + "grad_norm": 8.021493172855536, + "learning_rate": 4.4993922070135924e-08, + "loss": 0.6047, + "step": 26058 + }, + { + "epoch": 1.8826376722596492, + "grad_norm": 7.040194235202944, + "learning_rate": 4.493869445237359e-08, + "loss": 0.5718, + "step": 26059 + }, + { + "epoch": 1.8827099174598587, + "grad_norm": 6.767251187088476, + "learning_rate": 4.488350044249412e-08, + "loss": 0.5097, + "step": 26060 + }, + { + "epoch": 1.8827821626600683, + "grad_norm": 6.408482306856665, + "learning_rate": 4.482834004125358e-08, + "loss": 0.5455, + "step": 26061 + }, + { + "epoch": 1.8828544078602778, + "grad_norm": 6.921419494658948, + "learning_rate": 4.477321324940637e-08, + "loss": 0.6119, + "step": 26062 + }, + { + "epoch": 1.8829266530604873, + "grad_norm": 7.034226800415771, + "learning_rate": 4.471812006770771e-08, + "loss": 0.597, + "step": 26063 + }, + { + "epoch": 1.8829988982606967, + "grad_norm": 7.813002425250658, + "learning_rate": 4.4663060496911435e-08, + "loss": 0.5808, + "step": 26064 + }, + { + "epoch": 1.8830711434609064, + "grad_norm": 8.434254113794218, + "learning_rate": 4.4608034537771685e-08, + "loss": 0.6299, + "step": 26065 + }, + { + "epoch": 1.8831433886611157, + "grad_norm": 7.795729558740948, + "learning_rate": 4.4553042191041176e-08, + "loss": 0.5746, + "step": 26066 + }, + { + "epoch": 1.8832156338613253, + "grad_norm": 6.863527470505156, + "learning_rate": 4.449808345747319e-08, + "loss": 0.547, + "step": 26067 + }, + { + "epoch": 1.8832878790615348, + "grad_norm": 7.923176494881518, + "learning_rate": 4.444315833781965e-08, + "loss": 0.6334, + "step": 26068 + }, + { + "epoch": 1.8833601242617444, + "grad_norm": 8.59742508421058, + "learning_rate": 4.438826683283298e-08, + "loss": 0.621, + "step": 26069 + }, + { + "epoch": 1.883432369461954, + "grad_norm": 8.511935661531675, + "learning_rate": 4.433340894326399e-08, + "loss": 0.6287, + "step": 26070 + }, + { + "epoch": 1.8835046146621632, + "grad_norm": 7.981837875474628, + "learning_rate": 4.427858466986401e-08, + "loss": 0.5789, + "step": 26071 + }, + { + "epoch": 1.883576859862373, + "grad_norm": 6.862286551513853, + "learning_rate": 4.422379401338356e-08, + "loss": 0.5645, + "step": 26072 + }, + { + "epoch": 1.8836491050625823, + "grad_norm": 7.058729937811825, + "learning_rate": 4.4169036974572586e-08, + "loss": 0.6337, + "step": 26073 + }, + { + "epoch": 1.8837213502627919, + "grad_norm": 7.6736799863214715, + "learning_rate": 4.41143135541805e-08, + "loss": 0.6166, + "step": 26074 + }, + { + "epoch": 1.8837935954630014, + "grad_norm": 8.12796952195348, + "learning_rate": 4.405962375295669e-08, + "loss": 0.6533, + "step": 26075 + }, + { + "epoch": 1.883865840663211, + "grad_norm": 6.439186245701854, + "learning_rate": 4.400496757164946e-08, + "loss": 0.5824, + "step": 26076 + }, + { + "epoch": 1.8839380858634205, + "grad_norm": 8.097673209099462, + "learning_rate": 4.3950345011007644e-08, + "loss": 0.5599, + "step": 26077 + }, + { + "epoch": 1.8840103310636298, + "grad_norm": 7.726163570338451, + "learning_rate": 4.3895756071778425e-08, + "loss": 0.6394, + "step": 26078 + }, + { + "epoch": 1.8840825762638396, + "grad_norm": 8.234526949499362, + "learning_rate": 4.384120075470927e-08, + "loss": 0.6049, + "step": 26079 + }, + { + "epoch": 1.884154821464049, + "grad_norm": 6.707513257559898, + "learning_rate": 4.378667906054707e-08, + "loss": 0.6175, + "step": 26080 + }, + { + "epoch": 1.8842270666642587, + "grad_norm": 7.588926188183494, + "learning_rate": 4.3732190990037895e-08, + "loss": 0.6767, + "step": 26081 + }, + { + "epoch": 1.884299311864468, + "grad_norm": 6.509822759368986, + "learning_rate": 4.3677736543928094e-08, + "loss": 0.6387, + "step": 26082 + }, + { + "epoch": 1.8843715570646775, + "grad_norm": 9.109507747880311, + "learning_rate": 4.3623315722962914e-08, + "loss": 0.5898, + "step": 26083 + }, + { + "epoch": 1.884443802264887, + "grad_norm": 7.814518538060387, + "learning_rate": 4.35689285278873e-08, + "loss": 0.6337, + "step": 26084 + }, + { + "epoch": 1.8845160474650964, + "grad_norm": 8.125658683486979, + "learning_rate": 4.351457495944539e-08, + "loss": 0.6023, + "step": 26085 + }, + { + "epoch": 1.8845882926653061, + "grad_norm": 6.800964668513084, + "learning_rate": 4.346025501838186e-08, + "loss": 0.5978, + "step": 26086 + }, + { + "epoch": 1.8846605378655155, + "grad_norm": 7.01958722414383, + "learning_rate": 4.340596870544e-08, + "loss": 0.6424, + "step": 26087 + }, + { + "epoch": 1.8847327830657252, + "grad_norm": 7.528020522491988, + "learning_rate": 4.335171602136312e-08, + "loss": 0.6064, + "step": 26088 + }, + { + "epoch": 1.8848050282659345, + "grad_norm": 6.857745348483599, + "learning_rate": 4.329749696689367e-08, + "loss": 0.5989, + "step": 26089 + }, + { + "epoch": 1.884877273466144, + "grad_norm": 8.690332184336862, + "learning_rate": 4.3243311542774116e-08, + "loss": 0.5441, + "step": 26090 + }, + { + "epoch": 1.8849495186663536, + "grad_norm": 7.049925556195639, + "learning_rate": 4.3189159749746076e-08, + "loss": 0.505, + "step": 26091 + }, + { + "epoch": 1.885021763866563, + "grad_norm": 8.638162493107213, + "learning_rate": 4.3135041588550645e-08, + "loss": 0.648, + "step": 26092 + }, + { + "epoch": 1.8850940090667727, + "grad_norm": 7.180570761859008, + "learning_rate": 4.308095705992915e-08, + "loss": 0.5537, + "step": 26093 + }, + { + "epoch": 1.885166254266982, + "grad_norm": 7.078934983145803, + "learning_rate": 4.3026906164621576e-08, + "loss": 0.6486, + "step": 26094 + }, + { + "epoch": 1.8852384994671918, + "grad_norm": 6.882739617679532, + "learning_rate": 4.297288890336787e-08, + "loss": 0.6269, + "step": 26095 + }, + { + "epoch": 1.8853107446674011, + "grad_norm": 8.184257972000033, + "learning_rate": 4.2918905276907175e-08, + "loss": 0.6006, + "step": 26096 + }, + { + "epoch": 1.8853829898676107, + "grad_norm": 7.6976667956387725, + "learning_rate": 4.286495528597945e-08, + "loss": 0.5914, + "step": 26097 + }, + { + "epoch": 1.8854552350678202, + "grad_norm": 6.603059533476638, + "learning_rate": 4.281103893132244e-08, + "loss": 0.6143, + "step": 26098 + }, + { + "epoch": 1.8855274802680297, + "grad_norm": 7.766144702491647, + "learning_rate": 4.275715621367443e-08, + "loss": 0.5654, + "step": 26099 + }, + { + "epoch": 1.8855997254682393, + "grad_norm": 6.895981002409715, + "learning_rate": 4.270330713377319e-08, + "loss": 0.6837, + "step": 26100 + }, + { + "epoch": 1.8856719706684486, + "grad_norm": 7.023454608832749, + "learning_rate": 4.264949169235561e-08, + "loss": 0.6549, + "step": 26101 + }, + { + "epoch": 1.8857442158686584, + "grad_norm": 6.628577076388053, + "learning_rate": 4.259570989015832e-08, + "loss": 0.6169, + "step": 26102 + }, + { + "epoch": 1.8858164610688677, + "grad_norm": 7.97807621954033, + "learning_rate": 4.254196172791769e-08, + "loss": 0.6812, + "step": 26103 + }, + { + "epoch": 1.8858887062690772, + "grad_norm": 6.243049418845135, + "learning_rate": 4.248824720636952e-08, + "loss": 0.5784, + "step": 26104 + }, + { + "epoch": 1.8859609514692868, + "grad_norm": 6.568849525459901, + "learning_rate": 4.2434566326249314e-08, + "loss": 0.6502, + "step": 26105 + }, + { + "epoch": 1.8860331966694963, + "grad_norm": 7.268777930221624, + "learning_rate": 4.23809190882915e-08, + "loss": 0.539, + "step": 26106 + }, + { + "epoch": 1.8861054418697059, + "grad_norm": 7.804893758223006, + "learning_rate": 4.232730549323077e-08, + "loss": 0.5877, + "step": 26107 + }, + { + "epoch": 1.8861776870699152, + "grad_norm": 6.348481617013058, + "learning_rate": 4.2273725541801257e-08, + "loss": 0.5855, + "step": 26108 + }, + { + "epoch": 1.886249932270125, + "grad_norm": 7.2713159730701165, + "learning_rate": 4.2220179234735704e-08, + "loss": 0.6114, + "step": 26109 + }, + { + "epoch": 1.8863221774703343, + "grad_norm": 6.996228155552177, + "learning_rate": 4.216666657276769e-08, + "loss": 0.5198, + "step": 26110 + }, + { + "epoch": 1.8863944226705438, + "grad_norm": 7.181931171033247, + "learning_rate": 4.211318755662996e-08, + "loss": 0.6594, + "step": 26111 + }, + { + "epoch": 1.8864666678707533, + "grad_norm": 8.250902531125003, + "learning_rate": 4.205974218705389e-08, + "loss": 0.5834, + "step": 26112 + }, + { + "epoch": 1.8865389130709629, + "grad_norm": 7.218981454284578, + "learning_rate": 4.200633046477165e-08, + "loss": 0.5864, + "step": 26113 + }, + { + "epoch": 1.8866111582711724, + "grad_norm": 6.5801398664178175, + "learning_rate": 4.1952952390514055e-08, + "loss": 0.6172, + "step": 26114 + }, + { + "epoch": 1.8866834034713817, + "grad_norm": 7.913317992729266, + "learning_rate": 4.189960796501219e-08, + "loss": 0.548, + "step": 26115 + }, + { + "epoch": 1.8867556486715915, + "grad_norm": 8.217615085465557, + "learning_rate": 4.1846297188995743e-08, + "loss": 0.5947, + "step": 26116 + }, + { + "epoch": 1.8868278938718008, + "grad_norm": 8.334904128486103, + "learning_rate": 4.1793020063195244e-08, + "loss": 0.594, + "step": 26117 + }, + { + "epoch": 1.8869001390720104, + "grad_norm": 7.176440944260939, + "learning_rate": 4.173977658833955e-08, + "loss": 0.6045, + "step": 26118 + }, + { + "epoch": 1.88697238427222, + "grad_norm": 7.272354914062827, + "learning_rate": 4.1686566765157534e-08, + "loss": 0.5846, + "step": 26119 + }, + { + "epoch": 1.8870446294724295, + "grad_norm": 6.728901136452399, + "learning_rate": 4.163339059437777e-08, + "loss": 0.6123, + "step": 26120 + }, + { + "epoch": 1.887116874672639, + "grad_norm": 8.24435694197666, + "learning_rate": 4.1580248076728e-08, + "loss": 0.5548, + "step": 26121 + }, + { + "epoch": 1.8871891198728483, + "grad_norm": 6.666086584921172, + "learning_rate": 4.1527139212935996e-08, + "loss": 0.5689, + "step": 26122 + }, + { + "epoch": 1.887261365073058, + "grad_norm": 7.3889935166576635, + "learning_rate": 4.147406400372811e-08, + "loss": 0.5521, + "step": 26123 + }, + { + "epoch": 1.8873336102732674, + "grad_norm": 7.174293658748095, + "learning_rate": 4.1421022449831535e-08, + "loss": 0.537, + "step": 26124 + }, + { + "epoch": 1.887405855473477, + "grad_norm": 6.45145747551277, + "learning_rate": 4.136801455197237e-08, + "loss": 0.5165, + "step": 26125 + }, + { + "epoch": 1.8874781006736865, + "grad_norm": 7.541708874101361, + "learning_rate": 4.131504031087613e-08, + "loss": 0.5822, + "step": 26126 + }, + { + "epoch": 1.887550345873896, + "grad_norm": 9.14616552168175, + "learning_rate": 4.126209972726752e-08, + "loss": 0.5535, + "step": 26127 + }, + { + "epoch": 1.8876225910741056, + "grad_norm": 6.833765558979024, + "learning_rate": 4.120919280187208e-08, + "loss": 0.6242, + "step": 26128 + }, + { + "epoch": 1.887694836274315, + "grad_norm": 7.009988742443846, + "learning_rate": 4.1156319535413656e-08, + "loss": 0.587, + "step": 26129 + }, + { + "epoch": 1.8877670814745247, + "grad_norm": 7.780760722882937, + "learning_rate": 4.110347992861585e-08, + "loss": 0.5504, + "step": 26130 + }, + { + "epoch": 1.887839326674734, + "grad_norm": 7.086872278904832, + "learning_rate": 4.1050673982202246e-08, + "loss": 0.5873, + "step": 26131 + }, + { + "epoch": 1.8879115718749435, + "grad_norm": 6.6971764835311385, + "learning_rate": 4.099790169689588e-08, + "loss": 0.6461, + "step": 26132 + }, + { + "epoch": 1.887983817075153, + "grad_norm": 7.71348608504796, + "learning_rate": 4.094516307341867e-08, + "loss": 0.5721, + "step": 26133 + }, + { + "epoch": 1.8880560622753626, + "grad_norm": 7.784649332389448, + "learning_rate": 4.089245811249282e-08, + "loss": 0.5984, + "step": 26134 + }, + { + "epoch": 1.8881283074755721, + "grad_norm": 8.624146350135454, + "learning_rate": 4.083978681483996e-08, + "loss": 0.6677, + "step": 26135 + }, + { + "epoch": 1.8882005526757815, + "grad_norm": 7.68931000033488, + "learning_rate": 4.0787149181181205e-08, + "loss": 0.6851, + "step": 26136 + }, + { + "epoch": 1.8882727978759912, + "grad_norm": 6.589720448424114, + "learning_rate": 4.0734545212236795e-08, + "loss": 0.6091, + "step": 26137 + }, + { + "epoch": 1.8883450430762005, + "grad_norm": 7.472318044408115, + "learning_rate": 4.0681974908726994e-08, + "loss": 0.6148, + "step": 26138 + }, + { + "epoch": 1.88841728827641, + "grad_norm": 7.493580932237156, + "learning_rate": 4.0629438271371225e-08, + "loss": 0.615, + "step": 26139 + }, + { + "epoch": 1.8884895334766196, + "grad_norm": 7.162621206486311, + "learning_rate": 4.057693530088919e-08, + "loss": 0.6332, + "step": 26140 + }, + { + "epoch": 1.8885617786768292, + "grad_norm": 6.796249466096718, + "learning_rate": 4.052446599799892e-08, + "loss": 0.4996, + "step": 26141 + }, + { + "epoch": 1.8886340238770387, + "grad_norm": 6.110043384254379, + "learning_rate": 4.047203036341929e-08, + "loss": 0.5965, + "step": 26142 + }, + { + "epoch": 1.888706269077248, + "grad_norm": 7.378064828035157, + "learning_rate": 4.041962839786778e-08, + "loss": 0.6181, + "step": 26143 + }, + { + "epoch": 1.8887785142774578, + "grad_norm": 7.1993204855806745, + "learning_rate": 4.03672601020616e-08, + "loss": 0.5249, + "step": 26144 + }, + { + "epoch": 1.8888507594776671, + "grad_norm": 7.849776790871439, + "learning_rate": 4.031492547671822e-08, + "loss": 0.623, + "step": 26145 + }, + { + "epoch": 1.8889230046778767, + "grad_norm": 7.815456471843551, + "learning_rate": 4.026262452255347e-08, + "loss": 0.5974, + "step": 26146 + }, + { + "epoch": 1.8889952498780862, + "grad_norm": 6.7470088971956965, + "learning_rate": 4.021035724028372e-08, + "loss": 0.6012, + "step": 26147 + }, + { + "epoch": 1.8890674950782957, + "grad_norm": 7.595960256158188, + "learning_rate": 4.015812363062421e-08, + "loss": 0.5866, + "step": 26148 + }, + { + "epoch": 1.8891397402785053, + "grad_norm": 7.952561394107611, + "learning_rate": 4.010592369428995e-08, + "loss": 0.6109, + "step": 26149 + }, + { + "epoch": 1.8892119854787146, + "grad_norm": 7.679387968630958, + "learning_rate": 4.005375743199563e-08, + "loss": 0.561, + "step": 26150 + }, + { + "epoch": 1.8892842306789244, + "grad_norm": 7.442150809617855, + "learning_rate": 4.000162484445541e-08, + "loss": 0.5916, + "step": 26151 + }, + { + "epoch": 1.8893564758791337, + "grad_norm": 8.590992544116565, + "learning_rate": 3.994952593238288e-08, + "loss": 0.6318, + "step": 26152 + }, + { + "epoch": 1.8894287210793435, + "grad_norm": 7.289555736343056, + "learning_rate": 3.989746069649081e-08, + "loss": 0.6133, + "step": 26153 + }, + { + "epoch": 1.8895009662795528, + "grad_norm": 7.298507475144907, + "learning_rate": 3.9845429137493066e-08, + "loss": 0.5876, + "step": 26154 + }, + { + "epoch": 1.8895732114797623, + "grad_norm": 7.958331069812819, + "learning_rate": 3.979343125610074e-08, + "loss": 0.6616, + "step": 26155 + }, + { + "epoch": 1.8896454566799719, + "grad_norm": 9.254228604822366, + "learning_rate": 3.974146705302606e-08, + "loss": 0.5797, + "step": 26156 + }, + { + "epoch": 1.8897177018801812, + "grad_norm": 7.971902380836087, + "learning_rate": 3.968953652898094e-08, + "loss": 0.5907, + "step": 26157 + }, + { + "epoch": 1.889789947080391, + "grad_norm": 6.75996595701415, + "learning_rate": 3.963763968467538e-08, + "loss": 0.5442, + "step": 26158 + }, + { + "epoch": 1.8898621922806003, + "grad_norm": 7.1084794035822, + "learning_rate": 3.958577652082019e-08, + "loss": 0.6522, + "step": 26159 + }, + { + "epoch": 1.88993443748081, + "grad_norm": 8.466361859859868, + "learning_rate": 3.953394703812535e-08, + "loss": 0.5859, + "step": 26160 + }, + { + "epoch": 1.8900066826810193, + "grad_norm": 7.43093017711236, + "learning_rate": 3.948215123730032e-08, + "loss": 0.6095, + "step": 26161 + }, + { + "epoch": 1.8900789278812289, + "grad_norm": 9.298789768023704, + "learning_rate": 3.9430389119054224e-08, + "loss": 0.6661, + "step": 26162 + }, + { + "epoch": 1.8901511730814384, + "grad_norm": 7.094523786783047, + "learning_rate": 3.937866068409568e-08, + "loss": 0.6069, + "step": 26163 + }, + { + "epoch": 1.8902234182816477, + "grad_norm": 5.776991476857279, + "learning_rate": 3.9326965933132454e-08, + "loss": 0.5775, + "step": 26164 + }, + { + "epoch": 1.8902956634818575, + "grad_norm": 7.534256331127496, + "learning_rate": 3.9275304866872865e-08, + "loss": 0.6851, + "step": 26165 + }, + { + "epoch": 1.8903679086820668, + "grad_norm": 6.082683824676927, + "learning_rate": 3.922367748602357e-08, + "loss": 0.6123, + "step": 26166 + }, + { + "epoch": 1.8904401538822766, + "grad_norm": 8.929913909257905, + "learning_rate": 3.917208379129123e-08, + "loss": 0.5693, + "step": 26167 + }, + { + "epoch": 1.890512399082486, + "grad_norm": 7.787708956883221, + "learning_rate": 3.912052378338277e-08, + "loss": 0.583, + "step": 26168 + }, + { + "epoch": 1.8905846442826955, + "grad_norm": 7.66468713608344, + "learning_rate": 3.906899746300319e-08, + "loss": 0.6308, + "step": 26169 + }, + { + "epoch": 1.890656889482905, + "grad_norm": 7.78494432374306, + "learning_rate": 3.901750483085859e-08, + "loss": 0.5676, + "step": 26170 + }, + { + "epoch": 1.8907291346831145, + "grad_norm": 7.405956664895795, + "learning_rate": 3.8966045887653124e-08, + "loss": 0.5433, + "step": 26171 + }, + { + "epoch": 1.890801379883324, + "grad_norm": 8.173137618309877, + "learning_rate": 3.8914620634091795e-08, + "loss": 0.6224, + "step": 26172 + }, + { + "epoch": 1.8908736250835334, + "grad_norm": 8.649965724987414, + "learning_rate": 3.886322907087847e-08, + "loss": 0.5754, + "step": 26173 + }, + { + "epoch": 1.8909458702837432, + "grad_norm": 6.73247570329511, + "learning_rate": 3.881187119871649e-08, + "loss": 0.5724, + "step": 26174 + }, + { + "epoch": 1.8910181154839525, + "grad_norm": 9.29943340585446, + "learning_rate": 3.876054701830917e-08, + "loss": 0.5916, + "step": 26175 + }, + { + "epoch": 1.891090360684162, + "grad_norm": 8.369507767982554, + "learning_rate": 3.870925653035901e-08, + "loss": 0.6404, + "step": 26176 + }, + { + "epoch": 1.8911626058843716, + "grad_norm": 7.603752173706125, + "learning_rate": 3.8657999735567664e-08, + "loss": 0.6702, + "step": 26177 + }, + { + "epoch": 1.8912348510845811, + "grad_norm": 7.1487480721468595, + "learning_rate": 3.8606776634637355e-08, + "loss": 0.6043, + "step": 26178 + }, + { + "epoch": 1.8913070962847907, + "grad_norm": 7.174443601507432, + "learning_rate": 3.8555587228269465e-08, + "loss": 0.5266, + "step": 26179 + }, + { + "epoch": 1.891379341485, + "grad_norm": 8.358523887016428, + "learning_rate": 3.8504431517163987e-08, + "loss": 0.58, + "step": 26180 + }, + { + "epoch": 1.8914515866852097, + "grad_norm": 7.4579456022554345, + "learning_rate": 3.845330950202175e-08, + "loss": 0.5557, + "step": 26181 + }, + { + "epoch": 1.891523831885419, + "grad_norm": 6.760799105210539, + "learning_rate": 3.8402221183542474e-08, + "loss": 0.5582, + "step": 26182 + }, + { + "epoch": 1.8915960770856286, + "grad_norm": 8.53383544199642, + "learning_rate": 3.835116656242532e-08, + "loss": 0.5552, + "step": 26183 + }, + { + "epoch": 1.8916683222858381, + "grad_norm": 8.088465316683951, + "learning_rate": 3.8300145639369455e-08, + "loss": 0.6569, + "step": 26184 + }, + { + "epoch": 1.8917405674860477, + "grad_norm": 7.639266129243581, + "learning_rate": 3.824915841507321e-08, + "loss": 0.5997, + "step": 26185 + }, + { + "epoch": 1.8918128126862572, + "grad_norm": 7.398818524870363, + "learning_rate": 3.8198204890234904e-08, + "loss": 0.5639, + "step": 26186 + }, + { + "epoch": 1.8918850578864665, + "grad_norm": 7.538616708808336, + "learning_rate": 3.814728506555121e-08, + "loss": 0.5895, + "step": 26187 + }, + { + "epoch": 1.8919573030866763, + "grad_norm": 7.9134069323739284, + "learning_rate": 3.80963989417199e-08, + "loss": 0.6053, + "step": 26188 + }, + { + "epoch": 1.8920295482868856, + "grad_norm": 6.842014597859626, + "learning_rate": 3.8045546519437095e-08, + "loss": 0.5306, + "step": 26189 + }, + { + "epoch": 1.8921017934870952, + "grad_norm": 8.049606541086874, + "learning_rate": 3.799472779939945e-08, + "loss": 0.5385, + "step": 26190 + }, + { + "epoch": 1.8921740386873047, + "grad_norm": 6.722710156487823, + "learning_rate": 3.794394278230223e-08, + "loss": 0.6055, + "step": 26191 + }, + { + "epoch": 1.8922462838875143, + "grad_norm": 6.214170677235261, + "learning_rate": 3.789319146884074e-08, + "loss": 0.611, + "step": 26192 + }, + { + "epoch": 1.8923185290877238, + "grad_norm": 8.545351105290836, + "learning_rate": 3.784247385970996e-08, + "loss": 0.6002, + "step": 26193 + }, + { + "epoch": 1.8923907742879331, + "grad_norm": 7.052078254581428, + "learning_rate": 3.7791789955603785e-08, + "loss": 0.6212, + "step": 26194 + }, + { + "epoch": 1.8924630194881429, + "grad_norm": 7.21926759127568, + "learning_rate": 3.774113975721638e-08, + "loss": 0.6228, + "step": 26195 + }, + { + "epoch": 1.8925352646883522, + "grad_norm": 7.316042311089805, + "learning_rate": 3.769052326524081e-08, + "loss": 0.5735, + "step": 26196 + }, + { + "epoch": 1.8926075098885617, + "grad_norm": 7.929384663629254, + "learning_rate": 3.7639940480370396e-08, + "loss": 0.5786, + "step": 26197 + }, + { + "epoch": 1.8926797550887713, + "grad_norm": 7.221313285688993, + "learning_rate": 3.7589391403296813e-08, + "loss": 0.651, + "step": 26198 + }, + { + "epoch": 1.8927520002889808, + "grad_norm": 6.262636456554292, + "learning_rate": 3.753887603471284e-08, + "loss": 0.6021, + "step": 26199 + }, + { + "epoch": 1.8928242454891904, + "grad_norm": 7.44695315942316, + "learning_rate": 3.74883943753096e-08, + "loss": 0.5379, + "step": 26200 + }, + { + "epoch": 1.8928964906893997, + "grad_norm": 8.340756683053309, + "learning_rate": 3.7437946425778194e-08, + "loss": 0.5501, + "step": 26201 + }, + { + "epoch": 1.8929687358896095, + "grad_norm": 8.308476212732335, + "learning_rate": 3.7387532186809186e-08, + "loss": 0.5803, + "step": 26202 + }, + { + "epoch": 1.8930409810898188, + "grad_norm": 7.188420510433043, + "learning_rate": 3.733715165909285e-08, + "loss": 0.6315, + "step": 26203 + }, + { + "epoch": 1.8931132262900283, + "grad_norm": 8.02418201116115, + "learning_rate": 3.7286804843318925e-08, + "loss": 0.6463, + "step": 26204 + }, + { + "epoch": 1.8931854714902379, + "grad_norm": 7.459465352962689, + "learning_rate": 3.72364917401763e-08, + "loss": 0.5559, + "step": 26205 + }, + { + "epoch": 1.8932577166904474, + "grad_norm": 8.28828747260347, + "learning_rate": 3.718621235035386e-08, + "loss": 0.691, + "step": 26206 + }, + { + "epoch": 1.893329961890657, + "grad_norm": 7.207579321555446, + "learning_rate": 3.7135966674540225e-08, + "loss": 0.5882, + "step": 26207 + }, + { + "epoch": 1.8934022070908663, + "grad_norm": 7.281158839625157, + "learning_rate": 3.7085754713422625e-08, + "loss": 0.5789, + "step": 26208 + }, + { + "epoch": 1.893474452291076, + "grad_norm": 8.301493535616046, + "learning_rate": 3.703557646768857e-08, + "loss": 0.6026, + "step": 26209 + }, + { + "epoch": 1.8935466974912853, + "grad_norm": 8.85289450687753, + "learning_rate": 3.698543193802528e-08, + "loss": 0.6308, + "step": 26210 + }, + { + "epoch": 1.8936189426914949, + "grad_norm": 7.625413727089, + "learning_rate": 3.693532112511916e-08, + "loss": 0.5876, + "step": 26211 + }, + { + "epoch": 1.8936911878917044, + "grad_norm": 6.275735588577197, + "learning_rate": 3.688524402965604e-08, + "loss": 0.5114, + "step": 26212 + }, + { + "epoch": 1.893763433091914, + "grad_norm": 7.436509779719073, + "learning_rate": 3.683520065232121e-08, + "loss": 0.5723, + "step": 26213 + }, + { + "epoch": 1.8938356782921235, + "grad_norm": 6.707417427547154, + "learning_rate": 3.6785190993799957e-08, + "loss": 0.583, + "step": 26214 + }, + { + "epoch": 1.8939079234923328, + "grad_norm": 9.086783031111295, + "learning_rate": 3.6735215054777287e-08, + "loss": 0.6529, + "step": 26215 + }, + { + "epoch": 1.8939801686925426, + "grad_norm": 8.44094620015471, + "learning_rate": 3.668527283593654e-08, + "loss": 0.5674, + "step": 26216 + }, + { + "epoch": 1.894052413892752, + "grad_norm": 7.146602874632469, + "learning_rate": 3.6635364337961885e-08, + "loss": 0.6069, + "step": 26217 + }, + { + "epoch": 1.8941246590929615, + "grad_norm": 7.148051133684315, + "learning_rate": 3.65854895615364e-08, + "loss": 0.5653, + "step": 26218 + }, + { + "epoch": 1.894196904293171, + "grad_norm": 9.014605221517174, + "learning_rate": 3.653564850734287e-08, + "loss": 0.6121, + "step": 26219 + }, + { + "epoch": 1.8942691494933805, + "grad_norm": 7.000245771180723, + "learning_rate": 3.648584117606324e-08, + "loss": 0.6287, + "step": 26220 + }, + { + "epoch": 1.89434139469359, + "grad_norm": 7.1408547128669015, + "learning_rate": 3.6436067568380026e-08, + "loss": 0.6092, + "step": 26221 + }, + { + "epoch": 1.8944136398937994, + "grad_norm": 7.8140691783489755, + "learning_rate": 3.6386327684974356e-08, + "loss": 0.5939, + "step": 26222 + }, + { + "epoch": 1.8944858850940092, + "grad_norm": 9.114050579668186, + "learning_rate": 3.6336621526526504e-08, + "loss": 0.6457, + "step": 26223 + }, + { + "epoch": 1.8945581302942185, + "grad_norm": 8.752468959706455, + "learning_rate": 3.628694909371788e-08, + "loss": 0.5947, + "step": 26224 + }, + { + "epoch": 1.8946303754944283, + "grad_norm": 8.661504039234739, + "learning_rate": 3.6237310387227665e-08, + "loss": 0.6282, + "step": 26225 + }, + { + "epoch": 1.8947026206946376, + "grad_norm": 7.506392488548588, + "learning_rate": 3.618770540773586e-08, + "loss": 0.5378, + "step": 26226 + }, + { + "epoch": 1.8947748658948471, + "grad_norm": 8.011345447787722, + "learning_rate": 3.61381341559211e-08, + "loss": 0.5953, + "step": 26227 + }, + { + "epoch": 1.8948471110950567, + "grad_norm": 6.803454632323349, + "learning_rate": 3.608859663246228e-08, + "loss": 0.6266, + "step": 26228 + }, + { + "epoch": 1.894919356295266, + "grad_norm": 7.418634417950332, + "learning_rate": 3.603909283803747e-08, + "loss": 0.5923, + "step": 26229 + }, + { + "epoch": 1.8949916014954757, + "grad_norm": 8.320743918539385, + "learning_rate": 3.598962277332446e-08, + "loss": 0.5399, + "step": 26230 + }, + { + "epoch": 1.895063846695685, + "grad_norm": 7.252330273993055, + "learning_rate": 3.594018643900021e-08, + "loss": 0.616, + "step": 26231 + }, + { + "epoch": 1.8951360918958948, + "grad_norm": 9.251720371584108, + "learning_rate": 3.589078383574196e-08, + "loss": 0.6723, + "step": 26232 + }, + { + "epoch": 1.8952083370961041, + "grad_norm": 6.660390506327489, + "learning_rate": 3.584141496422527e-08, + "loss": 0.6018, + "step": 26233 + }, + { + "epoch": 1.8952805822963137, + "grad_norm": 7.785058985139394, + "learning_rate": 3.579207982512628e-08, + "loss": 0.5345, + "step": 26234 + }, + { + "epoch": 1.8953528274965232, + "grad_norm": 7.5676105980404555, + "learning_rate": 3.574277841912055e-08, + "loss": 0.5929, + "step": 26235 + }, + { + "epoch": 1.8954250726967325, + "grad_norm": 8.645832499921521, + "learning_rate": 3.569351074688282e-08, + "loss": 0.6393, + "step": 26236 + }, + { + "epoch": 1.8954973178969423, + "grad_norm": 8.506408519515091, + "learning_rate": 3.5644276809086996e-08, + "loss": 0.6617, + "step": 26237 + }, + { + "epoch": 1.8955695630971516, + "grad_norm": 8.385092959266872, + "learning_rate": 3.55950766064081e-08, + "loss": 0.6099, + "step": 26238 + }, + { + "epoch": 1.8956418082973614, + "grad_norm": 6.937078239748672, + "learning_rate": 3.554591013951891e-08, + "loss": 0.5898, + "step": 26239 + }, + { + "epoch": 1.8957140534975707, + "grad_norm": 6.866515872126706, + "learning_rate": 3.549677740909307e-08, + "loss": 0.5394, + "step": 26240 + }, + { + "epoch": 1.8957862986977803, + "grad_norm": 7.09739799638514, + "learning_rate": 3.544767841580254e-08, + "loss": 0.6219, + "step": 26241 + }, + { + "epoch": 1.8958585438979898, + "grad_norm": 7.408389526886399, + "learning_rate": 3.5398613160319564e-08, + "loss": 0.5864, + "step": 26242 + }, + { + "epoch": 1.8959307890981993, + "grad_norm": 7.94561400915133, + "learning_rate": 3.534958164331609e-08, + "loss": 0.5461, + "step": 26243 + }, + { + "epoch": 1.8960030342984089, + "grad_norm": 6.181911526449534, + "learning_rate": 3.5300583865463255e-08, + "loss": 0.6008, + "step": 26244 + }, + { + "epoch": 1.8960752794986182, + "grad_norm": 6.446882502911594, + "learning_rate": 3.525161982743136e-08, + "loss": 0.5984, + "step": 26245 + }, + { + "epoch": 1.896147524698828, + "grad_norm": 7.177234251283898, + "learning_rate": 3.5202689529891256e-08, + "loss": 0.5847, + "step": 26246 + }, + { + "epoch": 1.8962197698990373, + "grad_norm": 7.411505914581395, + "learning_rate": 3.51537929735124e-08, + "loss": 0.5606, + "step": 26247 + }, + { + "epoch": 1.8962920150992468, + "grad_norm": 7.058821538850093, + "learning_rate": 3.5104930158964546e-08, + "loss": 0.5578, + "step": 26248 + }, + { + "epoch": 1.8963642602994564, + "grad_norm": 6.837784627677443, + "learning_rate": 3.505610108691604e-08, + "loss": 0.5967, + "step": 26249 + }, + { + "epoch": 1.896436505499666, + "grad_norm": 6.777015313097284, + "learning_rate": 3.500730575803607e-08, + "loss": 0.619, + "step": 26250 + }, + { + "epoch": 1.8965087506998755, + "grad_norm": 7.107231605123558, + "learning_rate": 3.4958544172991606e-08, + "loss": 0.6099, + "step": 26251 + }, + { + "epoch": 1.8965809959000848, + "grad_norm": 8.529031430134987, + "learning_rate": 3.4909816332450996e-08, + "loss": 0.6031, + "step": 26252 + }, + { + "epoch": 1.8966532411002945, + "grad_norm": 6.961094230488637, + "learning_rate": 3.486112223708094e-08, + "loss": 0.6778, + "step": 26253 + }, + { + "epoch": 1.8967254863005039, + "grad_norm": 7.955112651072879, + "learning_rate": 3.481246188754811e-08, + "loss": 0.6076, + "step": 26254 + }, + { + "epoch": 1.8967977315007134, + "grad_norm": 7.353419654088216, + "learning_rate": 3.476383528451838e-08, + "loss": 0.5942, + "step": 26255 + }, + { + "epoch": 1.896869976700923, + "grad_norm": 8.319831082023146, + "learning_rate": 3.471524242865787e-08, + "loss": 0.5852, + "step": 26256 + }, + { + "epoch": 1.8969422219011325, + "grad_norm": 6.7022846824496805, + "learning_rate": 3.4666683320631336e-08, + "loss": 0.5912, + "step": 26257 + }, + { + "epoch": 1.897014467101342, + "grad_norm": 7.538281968581554, + "learning_rate": 3.4618157961103794e-08, + "loss": 0.686, + "step": 26258 + }, + { + "epoch": 1.8970867123015513, + "grad_norm": 8.436602057253912, + "learning_rate": 3.456966635073916e-08, + "loss": 0.5446, + "step": 26259 + }, + { + "epoch": 1.8971589575017611, + "grad_norm": 6.5770741255313805, + "learning_rate": 3.452120849020163e-08, + "loss": 0.5371, + "step": 26260 + }, + { + "epoch": 1.8972312027019704, + "grad_norm": 8.20480591037079, + "learning_rate": 3.4472784380154836e-08, + "loss": 0.5474, + "step": 26261 + }, + { + "epoch": 1.89730344790218, + "grad_norm": 7.156228944172763, + "learning_rate": 3.442439402126102e-08, + "loss": 0.5555, + "step": 26262 + }, + { + "epoch": 1.8973756931023895, + "grad_norm": 7.880584432161846, + "learning_rate": 3.437603741418272e-08, + "loss": 0.524, + "step": 26263 + }, + { + "epoch": 1.897447938302599, + "grad_norm": 7.7873868833014095, + "learning_rate": 3.432771455958189e-08, + "loss": 0.609, + "step": 26264 + }, + { + "epoch": 1.8975201835028086, + "grad_norm": 6.9193648012610005, + "learning_rate": 3.42794254581208e-08, + "loss": 0.5839, + "step": 26265 + }, + { + "epoch": 1.897592428703018, + "grad_norm": 8.139966121668692, + "learning_rate": 3.4231170110459176e-08, + "loss": 0.648, + "step": 26266 + }, + { + "epoch": 1.8976646739032277, + "grad_norm": 8.123245284529867, + "learning_rate": 3.418294851725845e-08, + "loss": 0.6178, + "step": 26267 + }, + { + "epoch": 1.897736919103437, + "grad_norm": 8.622967024333004, + "learning_rate": 3.413476067917893e-08, + "loss": 0.6028, + "step": 26268 + }, + { + "epoch": 1.8978091643036465, + "grad_norm": 7.865252562758335, + "learning_rate": 3.4086606596879514e-08, + "loss": 0.6067, + "step": 26269 + }, + { + "epoch": 1.897881409503856, + "grad_norm": 7.062110552557427, + "learning_rate": 3.4038486271019963e-08, + "loss": 0.5534, + "step": 26270 + }, + { + "epoch": 1.8979536547040656, + "grad_norm": 6.691907624514052, + "learning_rate": 3.399039970225892e-08, + "loss": 0.6144, + "step": 26271 + }, + { + "epoch": 1.8980258999042752, + "grad_norm": 6.930300989701769, + "learning_rate": 3.394234689125475e-08, + "loss": 0.5555, + "step": 26272 + }, + { + "epoch": 1.8980981451044845, + "grad_norm": 7.304353006629607, + "learning_rate": 3.38943278386647e-08, + "loss": 0.6169, + "step": 26273 + }, + { + "epoch": 1.8981703903046943, + "grad_norm": 7.3664149084555115, + "learning_rate": 3.3846342545146574e-08, + "loss": 0.6892, + "step": 26274 + }, + { + "epoch": 1.8982426355049036, + "grad_norm": 6.766197268486503, + "learning_rate": 3.379839101135707e-08, + "loss": 0.5502, + "step": 26275 + }, + { + "epoch": 1.8983148807051131, + "grad_norm": 9.170089729893425, + "learning_rate": 3.3750473237952886e-08, + "loss": 0.6402, + "step": 26276 + }, + { + "epoch": 1.8983871259053227, + "grad_norm": 6.52401522516641, + "learning_rate": 3.3702589225589887e-08, + "loss": 0.6109, + "step": 26277 + }, + { + "epoch": 1.8984593711055322, + "grad_norm": 8.800555853194462, + "learning_rate": 3.365473897492338e-08, + "loss": 0.6201, + "step": 26278 + }, + { + "epoch": 1.8985316163057417, + "grad_norm": 7.3129906122978605, + "learning_rate": 3.360692248660868e-08, + "loss": 0.6414, + "step": 26279 + }, + { + "epoch": 1.898603861505951, + "grad_norm": 8.044168139232676, + "learning_rate": 3.355913976129999e-08, + "loss": 0.6383, + "step": 26280 + }, + { + "epoch": 1.8986761067061608, + "grad_norm": 8.088034714440349, + "learning_rate": 3.3511390799651486e-08, + "loss": 0.5542, + "step": 26281 + }, + { + "epoch": 1.8987483519063701, + "grad_norm": 6.282512480984854, + "learning_rate": 3.3463675602317115e-08, + "loss": 0.569, + "step": 26282 + }, + { + "epoch": 1.8988205971065797, + "grad_norm": 6.497104880342619, + "learning_rate": 3.341599416994995e-08, + "loss": 0.5463, + "step": 26283 + }, + { + "epoch": 1.8988928423067892, + "grad_norm": 7.41683268271956, + "learning_rate": 3.336834650320253e-08, + "loss": 0.5927, + "step": 26284 + }, + { + "epoch": 1.8989650875069988, + "grad_norm": 7.245117121685031, + "learning_rate": 3.332073260272711e-08, + "loss": 0.5647, + "step": 26285 + }, + { + "epoch": 1.8990373327072083, + "grad_norm": 7.43525803057306, + "learning_rate": 3.327315246917567e-08, + "loss": 0.6057, + "step": 26286 + }, + { + "epoch": 1.8991095779074176, + "grad_norm": 8.500851756784158, + "learning_rate": 3.322560610319964e-08, + "loss": 0.5963, + "step": 26287 + }, + { + "epoch": 1.8991818231076274, + "grad_norm": 7.311603458578566, + "learning_rate": 3.3178093505449594e-08, + "loss": 0.6185, + "step": 26288 + }, + { + "epoch": 1.8992540683078367, + "grad_norm": 6.96594893608029, + "learning_rate": 3.313061467657641e-08, + "loss": 0.5951, + "step": 26289 + }, + { + "epoch": 1.8993263135080463, + "grad_norm": 7.996358042469045, + "learning_rate": 3.308316961722929e-08, + "loss": 0.5663, + "step": 26290 + }, + { + "epoch": 1.8993985587082558, + "grad_norm": 7.146326906231789, + "learning_rate": 3.303575832805828e-08, + "loss": 0.5418, + "step": 26291 + }, + { + "epoch": 1.8994708039084653, + "grad_norm": 8.109354302117131, + "learning_rate": 3.2988380809712285e-08, + "loss": 0.5511, + "step": 26292 + }, + { + "epoch": 1.8995430491086749, + "grad_norm": 6.5425276786967945, + "learning_rate": 3.2941037062839964e-08, + "loss": 0.597, + "step": 26293 + }, + { + "epoch": 1.8996152943088842, + "grad_norm": 6.81594761395807, + "learning_rate": 3.289372708808886e-08, + "loss": 0.5635, + "step": 26294 + }, + { + "epoch": 1.899687539509094, + "grad_norm": 9.974780515775302, + "learning_rate": 3.284645088610733e-08, + "loss": 0.6059, + "step": 26295 + }, + { + "epoch": 1.8997597847093033, + "grad_norm": 5.508399964514867, + "learning_rate": 3.2799208457542364e-08, + "loss": 0.5284, + "step": 26296 + }, + { + "epoch": 1.8998320299095128, + "grad_norm": 8.357028639016372, + "learning_rate": 3.275199980304067e-08, + "loss": 0.581, + "step": 26297 + }, + { + "epoch": 1.8999042751097224, + "grad_norm": 8.261872043495103, + "learning_rate": 3.2704824923248116e-08, + "loss": 0.6332, + "step": 26298 + }, + { + "epoch": 1.899976520309932, + "grad_norm": 7.047173538143429, + "learning_rate": 3.265768381881085e-08, + "loss": 0.5715, + "step": 26299 + }, + { + "epoch": 1.9000487655101415, + "grad_norm": 7.415808067200147, + "learning_rate": 3.261057649037447e-08, + "loss": 0.6068, + "step": 26300 + }, + { + "epoch": 1.9001210107103508, + "grad_norm": 7.092104279777221, + "learning_rate": 3.256350293858318e-08, + "loss": 0.5676, + "step": 26301 + }, + { + "epoch": 1.9001932559105605, + "grad_norm": 7.729581453310352, + "learning_rate": 3.251646316408147e-08, + "loss": 0.5919, + "step": 26302 + }, + { + "epoch": 1.9002655011107699, + "grad_norm": 7.121574247537099, + "learning_rate": 3.2469457167513816e-08, + "loss": 0.5668, + "step": 26303 + }, + { + "epoch": 1.9003377463109796, + "grad_norm": 6.705438530311724, + "learning_rate": 3.2422484949523325e-08, + "loss": 0.6237, + "step": 26304 + }, + { + "epoch": 1.900409991511189, + "grad_norm": 7.464508767748808, + "learning_rate": 3.2375546510753084e-08, + "loss": 0.5763, + "step": 26305 + }, + { + "epoch": 1.9004822367113985, + "grad_norm": 8.035512779979074, + "learning_rate": 3.2328641851845364e-08, + "loss": 0.5512, + "step": 26306 + }, + { + "epoch": 1.900554481911608, + "grad_norm": 8.60195808297902, + "learning_rate": 3.228177097344298e-08, + "loss": 0.6085, + "step": 26307 + }, + { + "epoch": 1.9006267271118173, + "grad_norm": 6.674882054804912, + "learning_rate": 3.2234933876186814e-08, + "loss": 0.6523, + "step": 26308 + }, + { + "epoch": 1.9006989723120271, + "grad_norm": 6.750594783998232, + "learning_rate": 3.2188130560718287e-08, + "loss": 0.585, + "step": 26309 + }, + { + "epoch": 1.9007712175122364, + "grad_norm": 7.023353584645385, + "learning_rate": 3.2141361027678286e-08, + "loss": 0.628, + "step": 26310 + }, + { + "epoch": 1.9008434627124462, + "grad_norm": 8.936893042382042, + "learning_rate": 3.2094625277706846e-08, + "loss": 0.5871, + "step": 26311 + }, + { + "epoch": 1.9009157079126555, + "grad_norm": 7.305305528250272, + "learning_rate": 3.204792331144374e-08, + "loss": 0.6328, + "step": 26312 + }, + { + "epoch": 1.900987953112865, + "grad_norm": 6.256603262242502, + "learning_rate": 3.2001255129528174e-08, + "loss": 0.6021, + "step": 26313 + }, + { + "epoch": 1.9010601983130746, + "grad_norm": 9.049106988000165, + "learning_rate": 3.195462073259936e-08, + "loss": 0.606, + "step": 26314 + }, + { + "epoch": 1.901132443513284, + "grad_norm": 8.448392943202506, + "learning_rate": 3.190802012129512e-08, + "loss": 0.5378, + "step": 26315 + }, + { + "epoch": 1.9012046887134937, + "grad_norm": 8.56260736774039, + "learning_rate": 3.186145329625412e-08, + "loss": 0.6376, + "step": 26316 + }, + { + "epoch": 1.901276933913703, + "grad_norm": 6.348662480342347, + "learning_rate": 3.181492025811334e-08, + "loss": 0.5871, + "step": 26317 + }, + { + "epoch": 1.9013491791139128, + "grad_norm": 7.364956757728649, + "learning_rate": 3.176842100751004e-08, + "loss": 0.6263, + "step": 26318 + }, + { + "epoch": 1.901421424314122, + "grad_norm": 7.348072873704737, + "learning_rate": 3.17219555450804e-08, + "loss": 0.5769, + "step": 26319 + }, + { + "epoch": 1.9014936695143316, + "grad_norm": 6.643532509173072, + "learning_rate": 3.167552387146056e-08, + "loss": 0.5459, + "step": 26320 + }, + { + "epoch": 1.9015659147145412, + "grad_norm": 8.092164344555595, + "learning_rate": 3.1629125987286405e-08, + "loss": 0.5901, + "step": 26321 + }, + { + "epoch": 1.9016381599147507, + "grad_norm": 7.141402254436307, + "learning_rate": 3.158276189319298e-08, + "loss": 0.6048, + "step": 26322 + }, + { + "epoch": 1.9017104051149603, + "grad_norm": 8.87635900273912, + "learning_rate": 3.1536431589814783e-08, + "loss": 0.6244, + "step": 26323 + }, + { + "epoch": 1.9017826503151696, + "grad_norm": 7.354653433304892, + "learning_rate": 3.149013507778631e-08, + "loss": 0.6073, + "step": 26324 + }, + { + "epoch": 1.9018548955153793, + "grad_norm": 6.654581496609651, + "learning_rate": 3.144387235774149e-08, + "loss": 0.5941, + "step": 26325 + }, + { + "epoch": 1.9019271407155887, + "grad_norm": 8.165994369964565, + "learning_rate": 3.139764343031315e-08, + "loss": 0.6621, + "step": 26326 + }, + { + "epoch": 1.9019993859157982, + "grad_norm": 6.607283313097828, + "learning_rate": 3.1351448296134125e-08, + "loss": 0.5627, + "step": 26327 + }, + { + "epoch": 1.9020716311160077, + "grad_norm": 6.859976983212486, + "learning_rate": 3.1305286955837234e-08, + "loss": 0.5773, + "step": 26328 + }, + { + "epoch": 1.9021438763162173, + "grad_norm": 6.709370724410608, + "learning_rate": 3.12591594100542e-08, + "loss": 0.6248, + "step": 26329 + }, + { + "epoch": 1.9022161215164268, + "grad_norm": 8.388234627355997, + "learning_rate": 3.1213065659416176e-08, + "loss": 0.6938, + "step": 26330 + }, + { + "epoch": 1.9022883667166361, + "grad_norm": 7.936825941405665, + "learning_rate": 3.116700570455433e-08, + "loss": 0.5967, + "step": 26331 + }, + { + "epoch": 1.902360611916846, + "grad_norm": 7.590446350080938, + "learning_rate": 3.112097954609955e-08, + "loss": 0.6292, + "step": 26332 + }, + { + "epoch": 1.9024328571170552, + "grad_norm": 8.438702984988135, + "learning_rate": 3.1074987184681324e-08, + "loss": 0.6197, + "step": 26333 + }, + { + "epoch": 1.9025051023172648, + "grad_norm": 8.379491711772037, + "learning_rate": 3.102902862092971e-08, + "loss": 0.6033, + "step": 26334 + }, + { + "epoch": 1.9025773475174743, + "grad_norm": 7.041142671790689, + "learning_rate": 3.0983103855473654e-08, + "loss": 0.5574, + "step": 26335 + }, + { + "epoch": 1.9026495927176839, + "grad_norm": 6.265913274131357, + "learning_rate": 3.0937212888941804e-08, + "loss": 0.5659, + "step": 26336 + }, + { + "epoch": 1.9027218379178934, + "grad_norm": 7.225170896416352, + "learning_rate": 3.0891355721962556e-08, + "loss": 0.6478, + "step": 26337 + }, + { + "epoch": 1.9027940831181027, + "grad_norm": 6.623682718851331, + "learning_rate": 3.084553235516319e-08, + "loss": 0.6115, + "step": 26338 + }, + { + "epoch": 1.9028663283183125, + "grad_norm": 8.416852086608838, + "learning_rate": 3.0799742789171526e-08, + "loss": 0.623, + "step": 26339 + }, + { + "epoch": 1.9029385735185218, + "grad_norm": 7.156056230745248, + "learning_rate": 3.07539870246143e-08, + "loss": 0.5793, + "step": 26340 + }, + { + "epoch": 1.9030108187187313, + "grad_norm": 7.780097355172172, + "learning_rate": 3.070826506211738e-08, + "loss": 0.585, + "step": 26341 + }, + { + "epoch": 1.9030830639189409, + "grad_norm": 8.094194436850586, + "learning_rate": 3.0662576902307226e-08, + "loss": 0.5498, + "step": 26342 + }, + { + "epoch": 1.9031553091191504, + "grad_norm": 7.485851930540475, + "learning_rate": 3.061692254580917e-08, + "loss": 0.6218, + "step": 26343 + }, + { + "epoch": 1.90322755431936, + "grad_norm": 5.6153457869382155, + "learning_rate": 3.057130199324798e-08, + "loss": 0.5363, + "step": 26344 + }, + { + "epoch": 1.9032997995195693, + "grad_norm": 7.8970814492608445, + "learning_rate": 3.052571524524817e-08, + "loss": 0.5856, + "step": 26345 + }, + { + "epoch": 1.903372044719779, + "grad_norm": 8.680294424240683, + "learning_rate": 3.0480162302433945e-08, + "loss": 0.6518, + "step": 26346 + }, + { + "epoch": 1.9034442899199884, + "grad_norm": 7.225825553640264, + "learning_rate": 3.043464316542927e-08, + "loss": 0.6098, + "step": 26347 + }, + { + "epoch": 1.903516535120198, + "grad_norm": 8.531387272104174, + "learning_rate": 3.038915783485641e-08, + "loss": 0.6784, + "step": 26348 + }, + { + "epoch": 1.9035887803204075, + "grad_norm": 9.307026528266242, + "learning_rate": 3.034370631133848e-08, + "loss": 0.5689, + "step": 26349 + }, + { + "epoch": 1.903661025520617, + "grad_norm": 7.154434973414627, + "learning_rate": 3.029828859549777e-08, + "loss": 0.5466, + "step": 26350 + }, + { + "epoch": 1.9037332707208265, + "grad_norm": 10.268826570416932, + "learning_rate": 3.025290468795572e-08, + "loss": 0.6037, + "step": 26351 + }, + { + "epoch": 1.9038055159210359, + "grad_norm": 6.187475955800821, + "learning_rate": 3.020755458933378e-08, + "loss": 0.6344, + "step": 26352 + }, + { + "epoch": 1.9038777611212456, + "grad_norm": 8.690638572136336, + "learning_rate": 3.016223830025283e-08, + "loss": 0.5362, + "step": 26353 + }, + { + "epoch": 1.903950006321455, + "grad_norm": 6.818117398044391, + "learning_rate": 3.0116955821333226e-08, + "loss": 0.6008, + "step": 26354 + }, + { + "epoch": 1.9040222515216645, + "grad_norm": 8.575589387173476, + "learning_rate": 3.007170715319446e-08, + "loss": 0.6003, + "step": 26355 + }, + { + "epoch": 1.904094496721874, + "grad_norm": 6.906038997412994, + "learning_rate": 3.00264922964566e-08, + "loss": 0.5991, + "step": 26356 + }, + { + "epoch": 1.9041667419220836, + "grad_norm": 7.892071098537141, + "learning_rate": 2.998131125173831e-08, + "loss": 0.6154, + "step": 26357 + }, + { + "epoch": 1.9042389871222931, + "grad_norm": 8.458699586452262, + "learning_rate": 2.993616401965771e-08, + "loss": 0.5908, + "step": 26358 + }, + { + "epoch": 1.9043112323225024, + "grad_norm": 6.46420674023639, + "learning_rate": 2.9891050600833195e-08, + "loss": 0.5852, + "step": 26359 + }, + { + "epoch": 1.9043834775227122, + "grad_norm": 7.742642016883599, + "learning_rate": 2.9845970995882323e-08, + "loss": 0.6495, + "step": 26360 + }, + { + "epoch": 1.9044557227229215, + "grad_norm": 6.733283642626422, + "learning_rate": 2.9800925205422105e-08, + "loss": 0.5412, + "step": 26361 + }, + { + "epoch": 1.904527967923131, + "grad_norm": 6.77905350371643, + "learning_rate": 2.9755913230069267e-08, + "loss": 0.5924, + "step": 26362 + }, + { + "epoch": 1.9046002131233406, + "grad_norm": 9.587440030980963, + "learning_rate": 2.9710935070439984e-08, + "loss": 0.689, + "step": 26363 + }, + { + "epoch": 1.9046724583235501, + "grad_norm": 7.370738980681509, + "learning_rate": 2.966599072715015e-08, + "loss": 0.571, + "step": 26364 + }, + { + "epoch": 1.9047447035237597, + "grad_norm": 8.378804723306816, + "learning_rate": 2.9621080200814555e-08, + "loss": 0.5934, + "step": 26365 + }, + { + "epoch": 1.904816948723969, + "grad_norm": 7.839033068760776, + "learning_rate": 2.957620349204826e-08, + "loss": 0.6083, + "step": 26366 + }, + { + "epoch": 1.9048891939241788, + "grad_norm": 8.208283813120637, + "learning_rate": 2.9531360601465497e-08, + "loss": 0.6938, + "step": 26367 + }, + { + "epoch": 1.904961439124388, + "grad_norm": 7.622128055214494, + "learning_rate": 2.9486551529680497e-08, + "loss": 0.705, + "step": 26368 + }, + { + "epoch": 1.9050336843245976, + "grad_norm": 7.181054451108497, + "learning_rate": 2.94417762773061e-08, + "loss": 0.5829, + "step": 26369 + }, + { + "epoch": 1.9051059295248072, + "grad_norm": 7.508817702690373, + "learning_rate": 2.9397034844955708e-08, + "loss": 0.6167, + "step": 26370 + }, + { + "epoch": 1.9051781747250167, + "grad_norm": 6.518630105486001, + "learning_rate": 2.935232723324133e-08, + "loss": 0.6231, + "step": 26371 + }, + { + "epoch": 1.9052504199252263, + "grad_norm": 8.63228399117504, + "learning_rate": 2.930765344277553e-08, + "loss": 0.6672, + "step": 26372 + }, + { + "epoch": 1.9053226651254356, + "grad_norm": 7.728915667164231, + "learning_rate": 2.9263013474169487e-08, + "loss": 0.641, + "step": 26373 + }, + { + "epoch": 1.9053949103256453, + "grad_norm": 6.341307221434026, + "learning_rate": 2.921840732803438e-08, + "loss": 0.6197, + "step": 26374 + }, + { + "epoch": 1.9054671555258547, + "grad_norm": 6.4745053842516125, + "learning_rate": 2.917383500498111e-08, + "loss": 0.5652, + "step": 26375 + }, + { + "epoch": 1.9055394007260644, + "grad_norm": 8.973096266284225, + "learning_rate": 2.9129296505619186e-08, + "loss": 0.6358, + "step": 26376 + }, + { + "epoch": 1.9056116459262737, + "grad_norm": 6.949094688676181, + "learning_rate": 2.9084791830558955e-08, + "loss": 0.61, + "step": 26377 + }, + { + "epoch": 1.9056838911264833, + "grad_norm": 8.260397168233082, + "learning_rate": 2.9040320980409098e-08, + "loss": 0.6098, + "step": 26378 + }, + { + "epoch": 1.9057561363266928, + "grad_norm": 6.7632756548145965, + "learning_rate": 2.8995883955778848e-08, + "loss": 0.5814, + "step": 26379 + }, + { + "epoch": 1.9058283815269021, + "grad_norm": 6.878651429568295, + "learning_rate": 2.895148075727633e-08, + "loss": 0.5057, + "step": 26380 + }, + { + "epoch": 1.905900626727112, + "grad_norm": 8.309597340363995, + "learning_rate": 2.8907111385509388e-08, + "loss": 0.632, + "step": 26381 + }, + { + "epoch": 1.9059728719273212, + "grad_norm": 7.048698781370789, + "learning_rate": 2.88627758410856e-08, + "loss": 0.6058, + "step": 26382 + }, + { + "epoch": 1.906045117127531, + "grad_norm": 7.988615996195334, + "learning_rate": 2.8818474124611417e-08, + "loss": 0.6169, + "step": 26383 + }, + { + "epoch": 1.9061173623277403, + "grad_norm": 7.414591409611481, + "learning_rate": 2.877420623669358e-08, + "loss": 0.628, + "step": 26384 + }, + { + "epoch": 1.9061896075279499, + "grad_norm": 7.170882677894469, + "learning_rate": 2.872997217793827e-08, + "loss": 0.6189, + "step": 26385 + }, + { + "epoch": 1.9062618527281594, + "grad_norm": 7.728010668682056, + "learning_rate": 2.8685771948950836e-08, + "loss": 0.6104, + "step": 26386 + }, + { + "epoch": 1.9063340979283687, + "grad_norm": 7.968258592001998, + "learning_rate": 2.8641605550336072e-08, + "loss": 0.6892, + "step": 26387 + }, + { + "epoch": 1.9064063431285785, + "grad_norm": 7.923217659655412, + "learning_rate": 2.8597472982699047e-08, + "loss": 0.6196, + "step": 26388 + }, + { + "epoch": 1.9064785883287878, + "grad_norm": 7.892070615178928, + "learning_rate": 2.8553374246643727e-08, + "loss": 0.5165, + "step": 26389 + }, + { + "epoch": 1.9065508335289976, + "grad_norm": 7.068464613060179, + "learning_rate": 2.8509309342773518e-08, + "loss": 0.5534, + "step": 26390 + }, + { + "epoch": 1.9066230787292069, + "grad_norm": 8.161239325711689, + "learning_rate": 2.8465278271692098e-08, + "loss": 0.601, + "step": 26391 + }, + { + "epoch": 1.9066953239294164, + "grad_norm": 7.3426549216770445, + "learning_rate": 2.842128103400177e-08, + "loss": 0.6907, + "step": 26392 + }, + { + "epoch": 1.906767569129626, + "grad_norm": 6.769735237477672, + "learning_rate": 2.8377317630305378e-08, + "loss": 0.5755, + "step": 26393 + }, + { + "epoch": 1.9068398143298355, + "grad_norm": 7.235490684083086, + "learning_rate": 2.8333388061204114e-08, + "loss": 0.595, + "step": 26394 + }, + { + "epoch": 1.906912059530045, + "grad_norm": 7.168135588572674, + "learning_rate": 2.8289492327299716e-08, + "loss": 0.6293, + "step": 26395 + }, + { + "epoch": 1.9069843047302544, + "grad_norm": 8.819958716055858, + "learning_rate": 2.8245630429193373e-08, + "loss": 0.5932, + "step": 26396 + }, + { + "epoch": 1.9070565499304641, + "grad_norm": 7.282871311411058, + "learning_rate": 2.82018023674846e-08, + "loss": 0.5285, + "step": 26397 + }, + { + "epoch": 1.9071287951306735, + "grad_norm": 7.1568168848716525, + "learning_rate": 2.8158008142774308e-08, + "loss": 0.6208, + "step": 26398 + }, + { + "epoch": 1.907201040330883, + "grad_norm": 7.256563569882786, + "learning_rate": 2.8114247755661183e-08, + "loss": 0.584, + "step": 26399 + }, + { + "epoch": 1.9072732855310925, + "grad_norm": 8.335898602152685, + "learning_rate": 2.8070521206745304e-08, + "loss": 0.6487, + "step": 26400 + }, + { + "epoch": 1.907345530731302, + "grad_norm": 7.982159510613707, + "learning_rate": 2.8026828496624246e-08, + "loss": 0.6266, + "step": 26401 + }, + { + "epoch": 1.9074177759315116, + "grad_norm": 7.761909931748385, + "learning_rate": 2.79831696258967e-08, + "loss": 0.5569, + "step": 26402 + }, + { + "epoch": 1.907490021131721, + "grad_norm": 7.999622097626486, + "learning_rate": 2.7939544595159962e-08, + "loss": 0.6047, + "step": 26403 + }, + { + "epoch": 1.9075622663319307, + "grad_norm": 6.9530868700675015, + "learning_rate": 2.789595340501189e-08, + "loss": 0.5764, + "step": 26404 + }, + { + "epoch": 1.90763451153214, + "grad_norm": 6.833914879344416, + "learning_rate": 2.7852396056048392e-08, + "loss": 0.5605, + "step": 26405 + }, + { + "epoch": 1.9077067567323496, + "grad_norm": 8.802571649192078, + "learning_rate": 2.7808872548865938e-08, + "loss": 0.6004, + "step": 26406 + }, + { + "epoch": 1.9077790019325591, + "grad_norm": 6.877110105422161, + "learning_rate": 2.776538288406072e-08, + "loss": 0.5892, + "step": 26407 + }, + { + "epoch": 1.9078512471327687, + "grad_norm": 6.260884686926304, + "learning_rate": 2.7721927062227816e-08, + "loss": 0.5419, + "step": 26408 + }, + { + "epoch": 1.9079234923329782, + "grad_norm": 8.30757488537264, + "learning_rate": 2.7678505083962303e-08, + "loss": 0.5476, + "step": 26409 + }, + { + "epoch": 1.9079957375331875, + "grad_norm": 8.610047135883107, + "learning_rate": 2.7635116949858154e-08, + "loss": 0.6346, + "step": 26410 + }, + { + "epoch": 1.9080679827333973, + "grad_norm": 8.616182104341561, + "learning_rate": 2.759176266050989e-08, + "loss": 0.6604, + "step": 26411 + }, + { + "epoch": 1.9081402279336066, + "grad_norm": 7.449930270079897, + "learning_rate": 2.7548442216510374e-08, + "loss": 0.5381, + "step": 26412 + }, + { + "epoch": 1.9082124731338161, + "grad_norm": 6.806073158285731, + "learning_rate": 2.7505155618453017e-08, + "loss": 0.6685, + "step": 26413 + }, + { + "epoch": 1.9082847183340257, + "grad_norm": 8.02039836017869, + "learning_rate": 2.74619028669304e-08, + "loss": 0.6585, + "step": 26414 + }, + { + "epoch": 1.9083569635342352, + "grad_norm": 7.881230871432972, + "learning_rate": 2.7418683962534552e-08, + "loss": 0.5215, + "step": 26415 + }, + { + "epoch": 1.9084292087344448, + "grad_norm": 7.107019323410489, + "learning_rate": 2.7375498905856945e-08, + "loss": 0.6471, + "step": 26416 + }, + { + "epoch": 1.908501453934654, + "grad_norm": 7.775699704199782, + "learning_rate": 2.7332347697489046e-08, + "loss": 0.5963, + "step": 26417 + }, + { + "epoch": 1.9085736991348639, + "grad_norm": 6.894402054827054, + "learning_rate": 2.728923033802122e-08, + "loss": 0.6348, + "step": 26418 + }, + { + "epoch": 1.9086459443350732, + "grad_norm": 7.788488246368377, + "learning_rate": 2.7246146828044108e-08, + "loss": 0.5889, + "step": 26419 + }, + { + "epoch": 1.9087181895352827, + "grad_norm": 7.2562378981010465, + "learning_rate": 2.720309716814723e-08, + "loss": 0.6205, + "step": 26420 + }, + { + "epoch": 1.9087904347354923, + "grad_norm": 6.98737531902194, + "learning_rate": 2.7160081358919844e-08, + "loss": 0.596, + "step": 26421 + }, + { + "epoch": 1.9088626799357018, + "grad_norm": 7.196889300711068, + "learning_rate": 2.711709940095092e-08, + "loss": 0.6165, + "step": 26422 + }, + { + "epoch": 1.9089349251359113, + "grad_norm": 7.229076842996838, + "learning_rate": 2.7074151294828875e-08, + "loss": 0.5733, + "step": 26423 + }, + { + "epoch": 1.9090071703361207, + "grad_norm": 6.5183313544372865, + "learning_rate": 2.7031237041141577e-08, + "loss": 0.5488, + "step": 26424 + }, + { + "epoch": 1.9090794155363304, + "grad_norm": 7.3086529907239655, + "learning_rate": 2.698835664047661e-08, + "loss": 0.6871, + "step": 26425 + }, + { + "epoch": 1.9091516607365397, + "grad_norm": 8.949165893558444, + "learning_rate": 2.6945510093420724e-08, + "loss": 0.6741, + "step": 26426 + }, + { + "epoch": 1.9092239059367493, + "grad_norm": 6.942772863326171, + "learning_rate": 2.6902697400560672e-08, + "loss": 0.6243, + "step": 26427 + }, + { + "epoch": 1.9092961511369588, + "grad_norm": 7.33136110083389, + "learning_rate": 2.685991856248238e-08, + "loss": 0.6583, + "step": 26428 + }, + { + "epoch": 1.9093683963371684, + "grad_norm": 6.7501226343388225, + "learning_rate": 2.681717357977176e-08, + "loss": 0.5621, + "step": 26429 + }, + { + "epoch": 1.909440641537378, + "grad_norm": 7.406740067762367, + "learning_rate": 2.677446245301363e-08, + "loss": 0.6152, + "step": 26430 + }, + { + "epoch": 1.9095128867375872, + "grad_norm": 7.783137203688838, + "learning_rate": 2.6731785182792514e-08, + "loss": 0.6889, + "step": 26431 + }, + { + "epoch": 1.909585131937797, + "grad_norm": 8.037654949966848, + "learning_rate": 2.6689141769693228e-08, + "loss": 0.572, + "step": 26432 + }, + { + "epoch": 1.9096573771380063, + "grad_norm": 9.223674118571486, + "learning_rate": 2.6646532214299193e-08, + "loss": 0.633, + "step": 26433 + }, + { + "epoch": 1.9097296223382159, + "grad_norm": 7.63262233282488, + "learning_rate": 2.6603956517193553e-08, + "loss": 0.6132, + "step": 26434 + }, + { + "epoch": 1.9098018675384254, + "grad_norm": 6.455440373723637, + "learning_rate": 2.6561414678959176e-08, + "loss": 0.5806, + "step": 26435 + }, + { + "epoch": 1.909874112738635, + "grad_norm": 7.701349699950505, + "learning_rate": 2.6518906700178647e-08, + "loss": 0.6234, + "step": 26436 + }, + { + "epoch": 1.9099463579388445, + "grad_norm": 7.990643274721232, + "learning_rate": 2.647643258143373e-08, + "loss": 0.6152, + "step": 26437 + }, + { + "epoch": 1.9100186031390538, + "grad_norm": 7.80806123903367, + "learning_rate": 2.6433992323305903e-08, + "loss": 0.6128, + "step": 26438 + }, + { + "epoch": 1.9100908483392636, + "grad_norm": 7.9244296377094425, + "learning_rate": 2.6391585926376084e-08, + "loss": 0.6445, + "step": 26439 + }, + { + "epoch": 1.910163093539473, + "grad_norm": 6.958574606848879, + "learning_rate": 2.6349213391224926e-08, + "loss": 0.5383, + "step": 26440 + }, + { + "epoch": 1.9102353387396824, + "grad_norm": 7.618426882343959, + "learning_rate": 2.630687471843224e-08, + "loss": 0.5555, + "step": 26441 + }, + { + "epoch": 1.910307583939892, + "grad_norm": 7.724944189172507, + "learning_rate": 2.626456990857784e-08, + "loss": 0.603, + "step": 26442 + }, + { + "epoch": 1.9103798291401015, + "grad_norm": 8.00053070215428, + "learning_rate": 2.622229896224071e-08, + "loss": 0.6155, + "step": 26443 + }, + { + "epoch": 1.910452074340311, + "grad_norm": 7.922501939873835, + "learning_rate": 2.6180061879999552e-08, + "loss": 0.5368, + "step": 26444 + }, + { + "epoch": 1.9105243195405204, + "grad_norm": 8.868117121561841, + "learning_rate": 2.6137858662432512e-08, + "loss": 0.6334, + "step": 26445 + }, + { + "epoch": 1.9105965647407301, + "grad_norm": 10.854287955301423, + "learning_rate": 2.6095689310117467e-08, + "loss": 0.6924, + "step": 26446 + }, + { + "epoch": 1.9106688099409395, + "grad_norm": 7.492806608935416, + "learning_rate": 2.605355382363145e-08, + "loss": 0.5548, + "step": 26447 + }, + { + "epoch": 1.9107410551411492, + "grad_norm": 8.237576928018306, + "learning_rate": 2.6011452203551224e-08, + "loss": 0.569, + "step": 26448 + }, + { + "epoch": 1.9108133003413585, + "grad_norm": 6.345380000525911, + "learning_rate": 2.596938445045355e-08, + "loss": 0.6163, + "step": 26449 + }, + { + "epoch": 1.910885545541568, + "grad_norm": 6.678489533017698, + "learning_rate": 2.592735056491408e-08, + "loss": 0.5364, + "step": 26450 + }, + { + "epoch": 1.9109577907417776, + "grad_norm": 7.717530930348687, + "learning_rate": 2.5885350547508183e-08, + "loss": 0.5432, + "step": 26451 + }, + { + "epoch": 1.911030035941987, + "grad_norm": 8.389249152503067, + "learning_rate": 2.584338439881068e-08, + "loss": 0.5828, + "step": 26452 + }, + { + "epoch": 1.9111022811421967, + "grad_norm": 5.886369782187192, + "learning_rate": 2.580145211939611e-08, + "loss": 0.5178, + "step": 26453 + }, + { + "epoch": 1.911174526342406, + "grad_norm": 7.782846064974802, + "learning_rate": 2.5759553709838736e-08, + "loss": 0.5572, + "step": 26454 + }, + { + "epoch": 1.9112467715426158, + "grad_norm": 7.4644172901960495, + "learning_rate": 2.571768917071199e-08, + "loss": 0.6197, + "step": 26455 + }, + { + "epoch": 1.9113190167428251, + "grad_norm": 6.710113794581316, + "learning_rate": 2.5675858502588745e-08, + "loss": 0.5881, + "step": 26456 + }, + { + "epoch": 1.9113912619430347, + "grad_norm": 8.540224825113011, + "learning_rate": 2.563406170604188e-08, + "loss": 0.5947, + "step": 26457 + }, + { + "epoch": 1.9114635071432442, + "grad_norm": 8.578040506471398, + "learning_rate": 2.559229878164343e-08, + "loss": 0.6098, + "step": 26458 + }, + { + "epoch": 1.9115357523434535, + "grad_norm": 6.688388943637844, + "learning_rate": 2.555056972996517e-08, + "loss": 0.5747, + "step": 26459 + }, + { + "epoch": 1.9116079975436633, + "grad_norm": 7.26704560204672, + "learning_rate": 2.5508874551578577e-08, + "loss": 0.6248, + "step": 26460 + }, + { + "epoch": 1.9116802427438726, + "grad_norm": 7.940586241019879, + "learning_rate": 2.5467213247054034e-08, + "loss": 0.5682, + "step": 26461 + }, + { + "epoch": 1.9117524879440824, + "grad_norm": 6.926809235697497, + "learning_rate": 2.5425585816961917e-08, + "loss": 0.5971, + "step": 26462 + }, + { + "epoch": 1.9118247331442917, + "grad_norm": 8.309967343556565, + "learning_rate": 2.538399226187205e-08, + "loss": 0.586, + "step": 26463 + }, + { + "epoch": 1.9118969783445012, + "grad_norm": 8.431855298619366, + "learning_rate": 2.534243258235397e-08, + "loss": 0.5806, + "step": 26464 + }, + { + "epoch": 1.9119692235447108, + "grad_norm": 7.254954782525139, + "learning_rate": 2.5300906778976675e-08, + "loss": 0.6331, + "step": 26465 + }, + { + "epoch": 1.9120414687449203, + "grad_norm": 6.818262305507116, + "learning_rate": 2.5259414852308316e-08, + "loss": 0.5509, + "step": 26466 + }, + { + "epoch": 1.9121137139451299, + "grad_norm": 7.00845779628964, + "learning_rate": 2.5217956802917053e-08, + "loss": 0.6097, + "step": 26467 + }, + { + "epoch": 1.9121859591453392, + "grad_norm": 6.100541150212779, + "learning_rate": 2.5176532631370766e-08, + "loss": 0.6112, + "step": 26468 + }, + { + "epoch": 1.912258204345549, + "grad_norm": 7.076860260126479, + "learning_rate": 2.513514233823594e-08, + "loss": 0.5425, + "step": 26469 + }, + { + "epoch": 1.9123304495457583, + "grad_norm": 7.930345810497312, + "learning_rate": 2.509378592407935e-08, + "loss": 0.5483, + "step": 26470 + }, + { + "epoch": 1.9124026947459678, + "grad_norm": 6.809377812266371, + "learning_rate": 2.505246338946721e-08, + "loss": 0.632, + "step": 26471 + }, + { + "epoch": 1.9124749399461773, + "grad_norm": 7.929416896167865, + "learning_rate": 2.5011174734965172e-08, + "loss": 0.59, + "step": 26472 + }, + { + "epoch": 1.9125471851463869, + "grad_norm": 8.405494400724177, + "learning_rate": 2.4969919961138624e-08, + "loss": 0.6215, + "step": 26473 + }, + { + "epoch": 1.9126194303465964, + "grad_norm": 6.505064238598341, + "learning_rate": 2.492869906855183e-08, + "loss": 0.6287, + "step": 26474 + }, + { + "epoch": 1.9126916755468057, + "grad_norm": 8.263859522435203, + "learning_rate": 2.4887512057769626e-08, + "loss": 0.5746, + "step": 26475 + }, + { + "epoch": 1.9127639207470155, + "grad_norm": 9.70441323787811, + "learning_rate": 2.4846358929355442e-08, + "loss": 0.5902, + "step": 26476 + }, + { + "epoch": 1.9128361659472248, + "grad_norm": 7.73391901078495, + "learning_rate": 2.4805239683872716e-08, + "loss": 0.5491, + "step": 26477 + }, + { + "epoch": 1.9129084111474344, + "grad_norm": 7.769601460058587, + "learning_rate": 2.4764154321884615e-08, + "loss": 0.5709, + "step": 26478 + }, + { + "epoch": 1.912980656347644, + "grad_norm": 7.034711062225933, + "learning_rate": 2.4723102843953184e-08, + "loss": 0.6948, + "step": 26479 + }, + { + "epoch": 1.9130529015478535, + "grad_norm": 7.1484559105808, + "learning_rate": 2.4682085250640476e-08, + "loss": 0.69, + "step": 26480 + }, + { + "epoch": 1.913125146748063, + "grad_norm": 6.249374663540563, + "learning_rate": 2.4641101542508262e-08, + "loss": 0.5856, + "step": 26481 + }, + { + "epoch": 1.9131973919482723, + "grad_norm": 7.136922665309974, + "learning_rate": 2.460015172011748e-08, + "loss": 0.5745, + "step": 26482 + }, + { + "epoch": 1.913269637148482, + "grad_norm": 8.905350060755042, + "learning_rate": 2.4559235784028245e-08, + "loss": 0.6757, + "step": 26483 + }, + { + "epoch": 1.9133418823486914, + "grad_norm": 7.152527753128774, + "learning_rate": 2.4518353734800938e-08, + "loss": 0.5766, + "step": 26484 + }, + { + "epoch": 1.913414127548901, + "grad_norm": 8.059691420486086, + "learning_rate": 2.4477505572995664e-08, + "loss": 0.4955, + "step": 26485 + }, + { + "epoch": 1.9134863727491105, + "grad_norm": 7.8539275142238, + "learning_rate": 2.4436691299171143e-08, + "loss": 0.6386, + "step": 26486 + }, + { + "epoch": 1.91355861794932, + "grad_norm": 8.390638184492667, + "learning_rate": 2.43959109138861e-08, + "loss": 0.5327, + "step": 26487 + }, + { + "epoch": 1.9136308631495296, + "grad_norm": 7.912941012514203, + "learning_rate": 2.4355164417698695e-08, + "loss": 0.6022, + "step": 26488 + }, + { + "epoch": 1.913703108349739, + "grad_norm": 7.961534532016563, + "learning_rate": 2.4314451811167094e-08, + "loss": 0.5591, + "step": 26489 + }, + { + "epoch": 1.9137753535499487, + "grad_norm": 8.962041279693247, + "learning_rate": 2.427377309484835e-08, + "loss": 0.5312, + "step": 26490 + }, + { + "epoch": 1.913847598750158, + "grad_norm": 8.336883805823184, + "learning_rate": 2.423312826929952e-08, + "loss": 0.6446, + "step": 26491 + }, + { + "epoch": 1.9139198439503675, + "grad_norm": 7.860396635419442, + "learning_rate": 2.4192517335076548e-08, + "loss": 0.6286, + "step": 26492 + }, + { + "epoch": 1.913992089150577, + "grad_norm": 9.107446378036414, + "learning_rate": 2.4151940292735653e-08, + "loss": 0.6305, + "step": 26493 + }, + { + "epoch": 1.9140643343507866, + "grad_norm": 7.738713567713013, + "learning_rate": 2.4111397142832503e-08, + "loss": 0.6539, + "step": 26494 + }, + { + "epoch": 1.9141365795509961, + "grad_norm": 7.293748859840179, + "learning_rate": 2.4070887885921935e-08, + "loss": 0.6028, + "step": 26495 + }, + { + "epoch": 1.9142088247512055, + "grad_norm": 5.795991156719404, + "learning_rate": 2.40304125225585e-08, + "loss": 0.5774, + "step": 26496 + }, + { + "epoch": 1.9142810699514152, + "grad_norm": 7.230425751577307, + "learning_rate": 2.3989971053296202e-08, + "loss": 0.6568, + "step": 26497 + }, + { + "epoch": 1.9143533151516245, + "grad_norm": 8.890151723590382, + "learning_rate": 2.3949563478688764e-08, + "loss": 0.6008, + "step": 26498 + }, + { + "epoch": 1.914425560351834, + "grad_norm": 9.286872573545176, + "learning_rate": 2.3909189799289078e-08, + "loss": 0.6034, + "step": 26499 + }, + { + "epoch": 1.9144978055520436, + "grad_norm": 5.647511513180288, + "learning_rate": 2.3868850015650312e-08, + "loss": 0.5467, + "step": 26500 + }, + { + "epoch": 1.9145700507522532, + "grad_norm": 6.537299024362856, + "learning_rate": 2.382854412832425e-08, + "loss": 0.5455, + "step": 26501 + }, + { + "epoch": 1.9146422959524627, + "grad_norm": 6.293633395708721, + "learning_rate": 2.3788272137862945e-08, + "loss": 0.6009, + "step": 26502 + }, + { + "epoch": 1.914714541152672, + "grad_norm": 6.8376019179090095, + "learning_rate": 2.3748034044817347e-08, + "loss": 0.6129, + "step": 26503 + }, + { + "epoch": 1.9147867863528818, + "grad_norm": 7.981356593341031, + "learning_rate": 2.3707829849738405e-08, + "loss": 0.6361, + "step": 26504 + }, + { + "epoch": 1.9148590315530911, + "grad_norm": 8.176813270984894, + "learning_rate": 2.3667659553176513e-08, + "loss": 0.578, + "step": 26505 + }, + { + "epoch": 1.9149312767533007, + "grad_norm": 7.622620007904901, + "learning_rate": 2.3627523155681785e-08, + "loss": 0.5857, + "step": 26506 + }, + { + "epoch": 1.9150035219535102, + "grad_norm": 8.465498521774602, + "learning_rate": 2.3587420657803507e-08, + "loss": 0.6755, + "step": 26507 + }, + { + "epoch": 1.9150757671537197, + "grad_norm": 8.35295734230127, + "learning_rate": 2.3547352060090677e-08, + "loss": 0.6415, + "step": 26508 + }, + { + "epoch": 1.9151480123539293, + "grad_norm": 8.65793237966319, + "learning_rate": 2.3507317363091752e-08, + "loss": 0.6368, + "step": 26509 + }, + { + "epoch": 1.9152202575541386, + "grad_norm": 7.512080064357045, + "learning_rate": 2.3467316567354625e-08, + "loss": 0.5557, + "step": 26510 + }, + { + "epoch": 1.9152925027543484, + "grad_norm": 7.6176755152163675, + "learning_rate": 2.3427349673427193e-08, + "loss": 0.6302, + "step": 26511 + }, + { + "epoch": 1.9153647479545577, + "grad_norm": 7.122385448794608, + "learning_rate": 2.3387416681856235e-08, + "loss": 0.6711, + "step": 26512 + }, + { + "epoch": 1.9154369931547672, + "grad_norm": 8.50893829715695, + "learning_rate": 2.334751759318854e-08, + "loss": 0.6514, + "step": 26513 + }, + { + "epoch": 1.9155092383549768, + "grad_norm": 6.360318500533024, + "learning_rate": 2.3307652407970614e-08, + "loss": 0.5216, + "step": 26514 + }, + { + "epoch": 1.9155814835551863, + "grad_norm": 7.305421712766977, + "learning_rate": 2.326782112674758e-08, + "loss": 0.5204, + "step": 26515 + }, + { + "epoch": 1.9156537287553959, + "grad_norm": 7.35679137280503, + "learning_rate": 2.322802375006539e-08, + "loss": 0.6241, + "step": 26516 + }, + { + "epoch": 1.9157259739556052, + "grad_norm": 8.010290203576137, + "learning_rate": 2.3188260278468043e-08, + "loss": 0.5809, + "step": 26517 + }, + { + "epoch": 1.915798219155815, + "grad_norm": 6.217690535733065, + "learning_rate": 2.3148530712500673e-08, + "loss": 0.6216, + "step": 26518 + }, + { + "epoch": 1.9158704643560243, + "grad_norm": 6.479327855042142, + "learning_rate": 2.3108835052706445e-08, + "loss": 0.5443, + "step": 26519 + }, + { + "epoch": 1.9159427095562338, + "grad_norm": 6.848094101446447, + "learning_rate": 2.3069173299629377e-08, + "loss": 0.556, + "step": 26520 + }, + { + "epoch": 1.9160149547564433, + "grad_norm": 6.945030455553414, + "learning_rate": 2.3029545453811807e-08, + "loss": 0.6154, + "step": 26521 + }, + { + "epoch": 1.9160871999566529, + "grad_norm": 6.016720679437845, + "learning_rate": 2.2989951515796637e-08, + "loss": 0.5437, + "step": 26522 + }, + { + "epoch": 1.9161594451568624, + "grad_norm": 9.211051390172695, + "learning_rate": 2.2950391486125932e-08, + "loss": 0.6082, + "step": 26523 + }, + { + "epoch": 1.9162316903570717, + "grad_norm": 8.542743249052863, + "learning_rate": 2.291086536534093e-08, + "loss": 0.5852, + "step": 26524 + }, + { + "epoch": 1.9163039355572815, + "grad_norm": 6.332270114730435, + "learning_rate": 2.2871373153982857e-08, + "loss": 0.561, + "step": 26525 + }, + { + "epoch": 1.9163761807574908, + "grad_norm": 7.733618373660163, + "learning_rate": 2.2831914852592396e-08, + "loss": 0.5823, + "step": 26526 + }, + { + "epoch": 1.9164484259577006, + "grad_norm": 6.714194949385918, + "learning_rate": 2.2792490461709395e-08, + "loss": 0.5915, + "step": 26527 + }, + { + "epoch": 1.91652067115791, + "grad_norm": 6.936922891343067, + "learning_rate": 2.2753099981873972e-08, + "loss": 0.6035, + "step": 26528 + }, + { + "epoch": 1.9165929163581195, + "grad_norm": 7.197928649921812, + "learning_rate": 2.2713743413625145e-08, + "loss": 0.6102, + "step": 26529 + }, + { + "epoch": 1.916665161558329, + "grad_norm": 7.59561498526581, + "learning_rate": 2.2674420757501647e-08, + "loss": 0.5724, + "step": 26530 + }, + { + "epoch": 1.9167374067585383, + "grad_norm": 6.397490975186123, + "learning_rate": 2.2635132014041662e-08, + "loss": 0.5875, + "step": 26531 + }, + { + "epoch": 1.916809651958748, + "grad_norm": 6.940570873339711, + "learning_rate": 2.2595877183783365e-08, + "loss": 0.6443, + "step": 26532 + }, + { + "epoch": 1.9168818971589574, + "grad_norm": 7.861835921099662, + "learning_rate": 2.2556656267263834e-08, + "loss": 0.5797, + "step": 26533 + }, + { + "epoch": 1.9169541423591672, + "grad_norm": 9.089345592951423, + "learning_rate": 2.251746926502013e-08, + "loss": 0.5453, + "step": 26534 + }, + { + "epoch": 1.9170263875593765, + "grad_norm": 9.494461051024121, + "learning_rate": 2.24783161775885e-08, + "loss": 0.5908, + "step": 26535 + }, + { + "epoch": 1.917098632759586, + "grad_norm": 8.539430601626785, + "learning_rate": 2.2439197005505175e-08, + "loss": 0.6044, + "step": 26536 + }, + { + "epoch": 1.9171708779597956, + "grad_norm": 8.203501200450987, + "learning_rate": 2.2400111749305566e-08, + "loss": 0.5303, + "step": 26537 + }, + { + "epoch": 1.917243123160005, + "grad_norm": 6.876547067144265, + "learning_rate": 2.2361060409524793e-08, + "loss": 0.4996, + "step": 26538 + }, + { + "epoch": 1.9173153683602147, + "grad_norm": 9.765540624635497, + "learning_rate": 2.2322042986697156e-08, + "loss": 0.6038, + "step": 26539 + }, + { + "epoch": 1.917387613560424, + "grad_norm": 7.627837356713797, + "learning_rate": 2.228305948135695e-08, + "loss": 0.6138, + "step": 26540 + }, + { + "epoch": 1.9174598587606337, + "grad_norm": 7.82394596106155, + "learning_rate": 2.2244109894037915e-08, + "loss": 0.5758, + "step": 26541 + }, + { + "epoch": 1.917532103960843, + "grad_norm": 7.269391140005243, + "learning_rate": 2.2205194225272953e-08, + "loss": 0.5922, + "step": 26542 + }, + { + "epoch": 1.9176043491610526, + "grad_norm": 7.415450550057735, + "learning_rate": 2.2166312475595254e-08, + "loss": 0.569, + "step": 26543 + }, + { + "epoch": 1.9176765943612621, + "grad_norm": 10.496982322478111, + "learning_rate": 2.2127464645536888e-08, + "loss": 0.6822, + "step": 26544 + }, + { + "epoch": 1.9177488395614717, + "grad_norm": 7.976040246737608, + "learning_rate": 2.208865073562938e-08, + "loss": 0.6297, + "step": 26545 + }, + { + "epoch": 1.9178210847616812, + "grad_norm": 8.067228128529301, + "learning_rate": 2.2049870746404524e-08, + "loss": 0.6043, + "step": 26546 + }, + { + "epoch": 1.9178933299618905, + "grad_norm": 7.96303697628877, + "learning_rate": 2.2011124678392725e-08, + "loss": 0.6406, + "step": 26547 + }, + { + "epoch": 1.9179655751621003, + "grad_norm": 6.864613958897288, + "learning_rate": 2.19724125321244e-08, + "loss": 0.567, + "step": 26548 + }, + { + "epoch": 1.9180378203623096, + "grad_norm": 7.0386769044116, + "learning_rate": 2.193373430812995e-08, + "loss": 0.6463, + "step": 26549 + }, + { + "epoch": 1.9181100655625192, + "grad_norm": 6.973412839729251, + "learning_rate": 2.18950900069384e-08, + "loss": 0.5613, + "step": 26550 + }, + { + "epoch": 1.9181823107627287, + "grad_norm": 7.876453810420369, + "learning_rate": 2.1856479629079053e-08, + "loss": 0.6486, + "step": 26551 + }, + { + "epoch": 1.9182545559629383, + "grad_norm": 7.566838052744031, + "learning_rate": 2.181790317508037e-08, + "loss": 0.6316, + "step": 26552 + }, + { + "epoch": 1.9183268011631478, + "grad_norm": 8.171415181106129, + "learning_rate": 2.1779360645470537e-08, + "loss": 0.63, + "step": 26553 + }, + { + "epoch": 1.9183990463633571, + "grad_norm": 7.27315113393034, + "learning_rate": 2.1740852040776916e-08, + "loss": 0.5681, + "step": 26554 + }, + { + "epoch": 1.9184712915635669, + "grad_norm": 6.746421713220808, + "learning_rate": 2.170237736152686e-08, + "loss": 0.5675, + "step": 26555 + }, + { + "epoch": 1.9185435367637762, + "grad_norm": 7.223310607020045, + "learning_rate": 2.1663936608246892e-08, + "loss": 0.641, + "step": 26556 + }, + { + "epoch": 1.9186157819639857, + "grad_norm": 7.417692334979661, + "learning_rate": 2.162552978146326e-08, + "loss": 0.6315, + "step": 26557 + }, + { + "epoch": 1.9186880271641953, + "grad_norm": 8.096073715507634, + "learning_rate": 2.1587156881701933e-08, + "loss": 0.6482, + "step": 26558 + }, + { + "epoch": 1.9187602723644048, + "grad_norm": 7.800049405063762, + "learning_rate": 2.154881790948804e-08, + "loss": 0.574, + "step": 26559 + }, + { + "epoch": 1.9188325175646144, + "grad_norm": 7.929932598967656, + "learning_rate": 2.1510512865346445e-08, + "loss": 0.6952, + "step": 26560 + }, + { + "epoch": 1.9189047627648237, + "grad_norm": 7.530521088461909, + "learning_rate": 2.147224174980145e-08, + "loss": 0.6408, + "step": 26561 + }, + { + "epoch": 1.9189770079650335, + "grad_norm": 7.304867926832545, + "learning_rate": 2.1434004563376797e-08, + "loss": 0.634, + "step": 26562 + }, + { + "epoch": 1.9190492531652428, + "grad_norm": 9.030491358166612, + "learning_rate": 2.139580130659652e-08, + "loss": 0.5862, + "step": 26563 + }, + { + "epoch": 1.9191214983654523, + "grad_norm": 6.130892974382286, + "learning_rate": 2.1357631979982974e-08, + "loss": 0.6061, + "step": 26564 + }, + { + "epoch": 1.9191937435656619, + "grad_norm": 7.233816038746208, + "learning_rate": 2.1319496584059075e-08, + "loss": 0.5995, + "step": 26565 + }, + { + "epoch": 1.9192659887658714, + "grad_norm": 7.651606930772573, + "learning_rate": 2.1281395119346625e-08, + "loss": 0.6342, + "step": 26566 + }, + { + "epoch": 1.919338233966081, + "grad_norm": 6.5143511874032605, + "learning_rate": 2.1243327586367156e-08, + "loss": 0.6141, + "step": 26567 + }, + { + "epoch": 1.9194104791662903, + "grad_norm": 7.124209443789157, + "learning_rate": 2.1205293985642194e-08, + "loss": 0.5542, + "step": 26568 + }, + { + "epoch": 1.9194827243665, + "grad_norm": 8.848506874810813, + "learning_rate": 2.11672943176916e-08, + "loss": 0.5988, + "step": 26569 + }, + { + "epoch": 1.9195549695667093, + "grad_norm": 6.399566289988031, + "learning_rate": 2.1129328583036344e-08, + "loss": 0.6212, + "step": 26570 + }, + { + "epoch": 1.9196272147669189, + "grad_norm": 7.75462892177024, + "learning_rate": 2.1091396782196017e-08, + "loss": 0.6131, + "step": 26571 + }, + { + "epoch": 1.9196994599671284, + "grad_norm": 7.229844324485751, + "learning_rate": 2.105349891568964e-08, + "loss": 0.5951, + "step": 26572 + }, + { + "epoch": 1.919771705167338, + "grad_norm": 8.126468467058354, + "learning_rate": 2.101563498403597e-08, + "loss": 0.5838, + "step": 26573 + }, + { + "epoch": 1.9198439503675475, + "grad_norm": 7.46881103490686, + "learning_rate": 2.097780498775348e-08, + "loss": 0.6472, + "step": 26574 + }, + { + "epoch": 1.9199161955677568, + "grad_norm": 7.093543768082161, + "learning_rate": 2.0940008927360367e-08, + "loss": 0.6175, + "step": 26575 + }, + { + "epoch": 1.9199884407679666, + "grad_norm": 7.459295825194472, + "learning_rate": 2.090224680337316e-08, + "loss": 0.5821, + "step": 26576 + }, + { + "epoch": 1.920060685968176, + "grad_norm": 8.019868972735386, + "learning_rate": 2.0864518616309504e-08, + "loss": 0.5522, + "step": 26577 + }, + { + "epoch": 1.9201329311683855, + "grad_norm": 7.478733939259272, + "learning_rate": 2.082682436668565e-08, + "loss": 0.5659, + "step": 26578 + }, + { + "epoch": 1.920205176368595, + "grad_norm": 7.236654429215261, + "learning_rate": 2.0789164055017574e-08, + "loss": 0.5971, + "step": 26579 + }, + { + "epoch": 1.9202774215688045, + "grad_norm": 7.686126089990993, + "learning_rate": 2.07515376818207e-08, + "loss": 0.6194, + "step": 26580 + }, + { + "epoch": 1.920349666769014, + "grad_norm": 8.377702874687248, + "learning_rate": 2.0713945247610447e-08, + "loss": 0.632, + "step": 26581 + }, + { + "epoch": 1.9204219119692234, + "grad_norm": 8.197129970806904, + "learning_rate": 2.06763867529014e-08, + "loss": 0.6108, + "step": 26582 + }, + { + "epoch": 1.9204941571694332, + "grad_norm": 7.349593803963043, + "learning_rate": 2.063886219820732e-08, + "loss": 0.6053, + "step": 26583 + }, + { + "epoch": 1.9205664023696425, + "grad_norm": 7.698746061403035, + "learning_rate": 2.060137158404224e-08, + "loss": 0.6034, + "step": 26584 + }, + { + "epoch": 1.920638647569852, + "grad_norm": 8.073040363635663, + "learning_rate": 2.056391491091936e-08, + "loss": 0.6654, + "step": 26585 + }, + { + "epoch": 1.9207108927700616, + "grad_norm": 8.104334446233462, + "learning_rate": 2.0526492179351043e-08, + "loss": 0.6155, + "step": 26586 + }, + { + "epoch": 1.9207831379702711, + "grad_norm": 7.0214506909831815, + "learning_rate": 2.0489103389850218e-08, + "loss": 0.6005, + "step": 26587 + }, + { + "epoch": 1.9208553831704807, + "grad_norm": 6.624094775272836, + "learning_rate": 2.0451748542927862e-08, + "loss": 0.6737, + "step": 26588 + }, + { + "epoch": 1.92092762837069, + "grad_norm": 7.110013492963089, + "learning_rate": 2.0414427639096067e-08, + "loss": 0.608, + "step": 26589 + }, + { + "epoch": 1.9209998735708997, + "grad_norm": 7.34265310333719, + "learning_rate": 2.0377140678865537e-08, + "loss": 0.563, + "step": 26590 + }, + { + "epoch": 1.921072118771109, + "grad_norm": 7.7726267636263495, + "learning_rate": 2.0339887662746695e-08, + "loss": 0.6294, + "step": 26591 + }, + { + "epoch": 1.9211443639713186, + "grad_norm": 6.990006261121486, + "learning_rate": 2.030266859124941e-08, + "loss": 0.5375, + "step": 26592 + }, + { + "epoch": 1.9212166091715281, + "grad_norm": 6.658964794454845, + "learning_rate": 2.0265483464883275e-08, + "loss": 0.5951, + "step": 26593 + }, + { + "epoch": 1.9212888543717377, + "grad_norm": 7.613351898883334, + "learning_rate": 2.022833228415705e-08, + "loss": 0.6225, + "step": 26594 + }, + { + "epoch": 1.9213610995719472, + "grad_norm": 10.04534739127651, + "learning_rate": 2.019121504957977e-08, + "loss": 0.6776, + "step": 26595 + }, + { + "epoch": 1.9214333447721565, + "grad_norm": 8.149858681793392, + "learning_rate": 2.0154131761659367e-08, + "loss": 0.5765, + "step": 26596 + }, + { + "epoch": 1.9215055899723663, + "grad_norm": 8.625409102758049, + "learning_rate": 2.0117082420902934e-08, + "loss": 0.7393, + "step": 26597 + }, + { + "epoch": 1.9215778351725756, + "grad_norm": 8.478900077309326, + "learning_rate": 2.0080067027818673e-08, + "loss": 0.5691, + "step": 26598 + }, + { + "epoch": 1.9216500803727854, + "grad_norm": 7.6792642781399065, + "learning_rate": 2.004308558291257e-08, + "loss": 0.5921, + "step": 26599 + }, + { + "epoch": 1.9217223255729947, + "grad_norm": 6.622280498364852, + "learning_rate": 2.0006138086691164e-08, + "loss": 0.5671, + "step": 26600 + }, + { + "epoch": 1.9217945707732043, + "grad_norm": 8.29206435689136, + "learning_rate": 1.996922453965988e-08, + "loss": 0.5771, + "step": 26601 + }, + { + "epoch": 1.9218668159734138, + "grad_norm": 7.345012150514861, + "learning_rate": 1.993234494232443e-08, + "loss": 0.5365, + "step": 26602 + }, + { + "epoch": 1.9219390611736231, + "grad_norm": 8.377261183934023, + "learning_rate": 1.9895499295189692e-08, + "loss": 0.6317, + "step": 26603 + }, + { + "epoch": 1.9220113063738329, + "grad_norm": 7.5664947299776335, + "learning_rate": 1.9858687598759695e-08, + "loss": 0.6163, + "step": 26604 + }, + { + "epoch": 1.9220835515740422, + "grad_norm": 6.352944912213326, + "learning_rate": 1.9821909853538767e-08, + "loss": 0.6074, + "step": 26605 + }, + { + "epoch": 1.922155796774252, + "grad_norm": 6.340150610642069, + "learning_rate": 1.9785166060029835e-08, + "loss": 0.5797, + "step": 26606 + }, + { + "epoch": 1.9222280419744613, + "grad_norm": 6.309240424963233, + "learning_rate": 1.9748456218736388e-08, + "loss": 0.5768, + "step": 26607 + }, + { + "epoch": 1.9223002871746708, + "grad_norm": 6.598084963776155, + "learning_rate": 1.97117803301608e-08, + "loss": 0.5888, + "step": 26608 + }, + { + "epoch": 1.9223725323748804, + "grad_norm": 6.474183385307509, + "learning_rate": 1.967513839480517e-08, + "loss": 0.657, + "step": 26609 + }, + { + "epoch": 1.9224447775750897, + "grad_norm": 8.597360303431032, + "learning_rate": 1.9638530413171042e-08, + "loss": 0.6289, + "step": 26610 + }, + { + "epoch": 1.9225170227752995, + "grad_norm": 7.495264974621973, + "learning_rate": 1.960195638575968e-08, + "loss": 0.5707, + "step": 26611 + }, + { + "epoch": 1.9225892679755088, + "grad_norm": 8.248241815198691, + "learning_rate": 1.956541631307124e-08, + "loss": 0.607, + "step": 26612 + }, + { + "epoch": 1.9226615131757185, + "grad_norm": 7.26105025458167, + "learning_rate": 1.9528910195606708e-08, + "loss": 0.6386, + "step": 26613 + }, + { + "epoch": 1.9227337583759279, + "grad_norm": 6.589094932355118, + "learning_rate": 1.9492438033865136e-08, + "loss": 0.5668, + "step": 26614 + }, + { + "epoch": 1.9228060035761374, + "grad_norm": 7.978993970896184, + "learning_rate": 1.9455999828346393e-08, + "loss": 0.6585, + "step": 26615 + }, + { + "epoch": 1.922878248776347, + "grad_norm": 7.211511194422108, + "learning_rate": 1.94195955795487e-08, + "loss": 0.5623, + "step": 26616 + }, + { + "epoch": 1.9229504939765565, + "grad_norm": 8.38802270298088, + "learning_rate": 1.9383225287970818e-08, + "loss": 0.6091, + "step": 26617 + }, + { + "epoch": 1.923022739176766, + "grad_norm": 7.463861501637407, + "learning_rate": 1.934688895411041e-08, + "loss": 0.5707, + "step": 26618 + }, + { + "epoch": 1.9230949843769753, + "grad_norm": 6.67792374375028, + "learning_rate": 1.9310586578465128e-08, + "loss": 0.6417, + "step": 26619 + }, + { + "epoch": 1.9231672295771851, + "grad_norm": 8.021540253107664, + "learning_rate": 1.9274318161531803e-08, + "loss": 0.6599, + "step": 26620 + }, + { + "epoch": 1.9232394747773944, + "grad_norm": 8.066951025792772, + "learning_rate": 1.9238083703806975e-08, + "loss": 0.6132, + "step": 26621 + }, + { + "epoch": 1.923311719977604, + "grad_norm": 8.306664734784551, + "learning_rate": 1.9201883205786365e-08, + "loss": 0.6297, + "step": 26622 + }, + { + "epoch": 1.9233839651778135, + "grad_norm": 7.437747983244381, + "learning_rate": 1.916571666796596e-08, + "loss": 0.6117, + "step": 26623 + }, + { + "epoch": 1.923456210378023, + "grad_norm": 7.893934949337687, + "learning_rate": 1.912958409084037e-08, + "loss": 0.6055, + "step": 26624 + }, + { + "epoch": 1.9235284555782326, + "grad_norm": 7.25521768065921, + "learning_rate": 1.9093485474904748e-08, + "loss": 0.6072, + "step": 26625 + }, + { + "epoch": 1.923600700778442, + "grad_norm": 5.627343431393594, + "learning_rate": 1.9057420820652872e-08, + "loss": 0.5897, + "step": 26626 + }, + { + "epoch": 1.9236729459786517, + "grad_norm": 7.527893078035576, + "learning_rate": 1.902139012857851e-08, + "loss": 0.5992, + "step": 26627 + }, + { + "epoch": 1.923745191178861, + "grad_norm": 6.694631036643981, + "learning_rate": 1.8985393399175434e-08, + "loss": 0.6288, + "step": 26628 + }, + { + "epoch": 1.9238174363790705, + "grad_norm": 8.126691906175022, + "learning_rate": 1.894943063293547e-08, + "loss": 0.5942, + "step": 26629 + }, + { + "epoch": 1.92388968157928, + "grad_norm": 8.078324748950845, + "learning_rate": 1.8913501830351566e-08, + "loss": 0.6093, + "step": 26630 + }, + { + "epoch": 1.9239619267794896, + "grad_norm": 8.209426058409692, + "learning_rate": 1.8877606991915265e-08, + "loss": 0.6066, + "step": 26631 + }, + { + "epoch": 1.9240341719796992, + "grad_norm": 8.550847275898581, + "learning_rate": 1.8841746118118396e-08, + "loss": 0.5806, + "step": 26632 + }, + { + "epoch": 1.9241064171799085, + "grad_norm": 8.292226290068495, + "learning_rate": 1.8805919209451128e-08, + "loss": 0.6034, + "step": 26633 + }, + { + "epoch": 1.9241786623801183, + "grad_norm": 7.462738299119337, + "learning_rate": 1.8770126266404454e-08, + "loss": 0.5722, + "step": 26634 + }, + { + "epoch": 1.9242509075803276, + "grad_norm": 6.275192756275935, + "learning_rate": 1.873436728946798e-08, + "loss": 0.5733, + "step": 26635 + }, + { + "epoch": 1.9243231527805371, + "grad_norm": 7.929702894073728, + "learning_rate": 1.8698642279131872e-08, + "loss": 0.5583, + "step": 26636 + }, + { + "epoch": 1.9243953979807467, + "grad_norm": 8.863334618614116, + "learning_rate": 1.8662951235884353e-08, + "loss": 0.5938, + "step": 26637 + }, + { + "epoch": 1.9244676431809562, + "grad_norm": 6.17399464195323, + "learning_rate": 1.8627294160214472e-08, + "loss": 0.5732, + "step": 26638 + }, + { + "epoch": 1.9245398883811657, + "grad_norm": 6.281206273168694, + "learning_rate": 1.859167105261045e-08, + "loss": 0.5588, + "step": 26639 + }, + { + "epoch": 1.924612133581375, + "grad_norm": 7.433689455587928, + "learning_rate": 1.8556081913559676e-08, + "loss": 0.5834, + "step": 26640 + }, + { + "epoch": 1.9246843787815848, + "grad_norm": 7.025974583476391, + "learning_rate": 1.8520526743549538e-08, + "loss": 0.5633, + "step": 26641 + }, + { + "epoch": 1.9247566239817941, + "grad_norm": 7.486493473829481, + "learning_rate": 1.8485005543066593e-08, + "loss": 0.697, + "step": 26642 + }, + { + "epoch": 1.9248288691820037, + "grad_norm": 6.931493960215779, + "learning_rate": 1.8449518312597114e-08, + "loss": 0.5729, + "step": 26643 + }, + { + "epoch": 1.9249011143822132, + "grad_norm": 8.990859582391076, + "learning_rate": 1.8414065052626828e-08, + "loss": 0.6506, + "step": 26644 + }, + { + "epoch": 1.9249733595824228, + "grad_norm": 7.382808366400959, + "learning_rate": 1.837864576364118e-08, + "loss": 0.6488, + "step": 26645 + }, + { + "epoch": 1.9250456047826323, + "grad_norm": 6.688258333081139, + "learning_rate": 1.8343260446125055e-08, + "loss": 0.5828, + "step": 26646 + }, + { + "epoch": 1.9251178499828416, + "grad_norm": 7.605579095244717, + "learning_rate": 1.8307909100562794e-08, + "loss": 0.658, + "step": 26647 + }, + { + "epoch": 1.9251900951830514, + "grad_norm": 7.6705858192384975, + "learning_rate": 1.827259172743845e-08, + "loss": 0.5888, + "step": 26648 + }, + { + "epoch": 1.9252623403832607, + "grad_norm": 6.834071452896217, + "learning_rate": 1.8237308327235246e-08, + "loss": 0.5027, + "step": 26649 + }, + { + "epoch": 1.9253345855834703, + "grad_norm": 8.043241461956244, + "learning_rate": 1.8202058900436692e-08, + "loss": 0.586, + "step": 26650 + }, + { + "epoch": 1.9254068307836798, + "grad_norm": 6.145044963724934, + "learning_rate": 1.816684344752462e-08, + "loss": 0.5529, + "step": 26651 + }, + { + "epoch": 1.9254790759838893, + "grad_norm": 6.51855110311231, + "learning_rate": 1.8131661968981417e-08, + "loss": 0.5995, + "step": 26652 + }, + { + "epoch": 1.9255513211840989, + "grad_norm": 7.624644536972562, + "learning_rate": 1.809651446528893e-08, + "loss": 0.5747, + "step": 26653 + }, + { + "epoch": 1.9256235663843082, + "grad_norm": 8.464329089972777, + "learning_rate": 1.8061400936927876e-08, + "loss": 0.5917, + "step": 26654 + }, + { + "epoch": 1.925695811584518, + "grad_norm": 6.647872309077015, + "learning_rate": 1.8026321384379265e-08, + "loss": 0.6097, + "step": 26655 + }, + { + "epoch": 1.9257680567847273, + "grad_norm": 6.901097926264387, + "learning_rate": 1.7991275808122987e-08, + "loss": 0.5601, + "step": 26656 + }, + { + "epoch": 1.9258403019849368, + "grad_norm": 7.0860047931272705, + "learning_rate": 1.7956264208639217e-08, + "loss": 0.5728, + "step": 26657 + }, + { + "epoch": 1.9259125471851464, + "grad_norm": 7.402685152499802, + "learning_rate": 1.7921286586407015e-08, + "loss": 0.5586, + "step": 26658 + }, + { + "epoch": 1.925984792385356, + "grad_norm": 6.630422712058121, + "learning_rate": 1.7886342941904888e-08, + "loss": 0.5745, + "step": 26659 + }, + { + "epoch": 1.9260570375855655, + "grad_norm": 8.023674267745594, + "learning_rate": 1.7851433275611896e-08, + "loss": 0.6235, + "step": 26660 + }, + { + "epoch": 1.9261292827857748, + "grad_norm": 7.401733051160689, + "learning_rate": 1.7816557588005436e-08, + "loss": 0.6195, + "step": 26661 + }, + { + "epoch": 1.9262015279859845, + "grad_norm": 7.094522980238958, + "learning_rate": 1.77817158795629e-08, + "loss": 0.613, + "step": 26662 + }, + { + "epoch": 1.9262737731861939, + "grad_norm": 7.908376294720458, + "learning_rate": 1.774690815076113e-08, + "loss": 0.6064, + "step": 26663 + }, + { + "epoch": 1.9263460183864034, + "grad_norm": 6.596199340047463, + "learning_rate": 1.7712134402077252e-08, + "loss": 0.6267, + "step": 26664 + }, + { + "epoch": 1.926418263586613, + "grad_norm": 6.110411417044427, + "learning_rate": 1.767739463398671e-08, + "loss": 0.6549, + "step": 26665 + }, + { + "epoch": 1.9264905087868225, + "grad_norm": 7.1609308517861345, + "learning_rate": 1.764268884696524e-08, + "loss": 0.62, + "step": 26666 + }, + { + "epoch": 1.926562753987032, + "grad_norm": 6.736822475699638, + "learning_rate": 1.7608017041487733e-08, + "loss": 0.5062, + "step": 26667 + }, + { + "epoch": 1.9266349991872413, + "grad_norm": 8.473737648542592, + "learning_rate": 1.7573379218029374e-08, + "loss": 0.7134, + "step": 26668 + }, + { + "epoch": 1.9267072443874511, + "grad_norm": 8.780077438534365, + "learning_rate": 1.7538775377063665e-08, + "loss": 0.6597, + "step": 26669 + }, + { + "epoch": 1.9267794895876604, + "grad_norm": 7.0625658960559665, + "learning_rate": 1.7504205519064955e-08, + "loss": 0.6212, + "step": 26670 + }, + { + "epoch": 1.9268517347878702, + "grad_norm": 7.301691136368871, + "learning_rate": 1.7469669644505914e-08, + "loss": 0.5676, + "step": 26671 + }, + { + "epoch": 1.9269239799880795, + "grad_norm": 8.61083528904136, + "learning_rate": 1.7435167753859506e-08, + "loss": 0.5444, + "step": 26672 + }, + { + "epoch": 1.926996225188289, + "grad_norm": 7.830580153134758, + "learning_rate": 1.7400699847598125e-08, + "loss": 0.5091, + "step": 26673 + }, + { + "epoch": 1.9270684703884986, + "grad_norm": 6.3876420873078725, + "learning_rate": 1.7366265926193616e-08, + "loss": 0.6245, + "step": 26674 + }, + { + "epoch": 1.927140715588708, + "grad_norm": 7.435985763659298, + "learning_rate": 1.733186599011727e-08, + "loss": 0.61, + "step": 26675 + }, + { + "epoch": 1.9272129607889177, + "grad_norm": 8.354035056988835, + "learning_rate": 1.729750003983982e-08, + "loss": 0.6324, + "step": 26676 + }, + { + "epoch": 1.927285205989127, + "grad_norm": 7.110924185176916, + "learning_rate": 1.7263168075832005e-08, + "loss": 0.5463, + "step": 26677 + }, + { + "epoch": 1.9273574511893368, + "grad_norm": 7.6537204845656195, + "learning_rate": 1.7228870098564e-08, + "loss": 0.609, + "step": 26678 + }, + { + "epoch": 1.927429696389546, + "grad_norm": 7.87261151619721, + "learning_rate": 1.7194606108504596e-08, + "loss": 0.6311, + "step": 26679 + }, + { + "epoch": 1.9275019415897556, + "grad_norm": 7.565516098984532, + "learning_rate": 1.716037610612342e-08, + "loss": 0.6598, + "step": 26680 + }, + { + "epoch": 1.9275741867899652, + "grad_norm": 6.9178066310055515, + "learning_rate": 1.7126180091888988e-08, + "loss": 0.5886, + "step": 26681 + }, + { + "epoch": 1.9276464319901745, + "grad_norm": 7.448025988507847, + "learning_rate": 1.7092018066268977e-08, + "loss": 0.5345, + "step": 26682 + }, + { + "epoch": 1.9277186771903843, + "grad_norm": 8.831519414486786, + "learning_rate": 1.7057890029731628e-08, + "loss": 0.602, + "step": 26683 + }, + { + "epoch": 1.9277909223905936, + "grad_norm": 6.97503652255433, + "learning_rate": 1.702379598274406e-08, + "loss": 0.5314, + "step": 26684 + }, + { + "epoch": 1.9278631675908033, + "grad_norm": 6.523833668266012, + "learning_rate": 1.6989735925772578e-08, + "loss": 0.5861, + "step": 26685 + }, + { + "epoch": 1.9279354127910127, + "grad_norm": 8.21028612277145, + "learning_rate": 1.6955709859283743e-08, + "loss": 0.7061, + "step": 26686 + }, + { + "epoch": 1.9280076579912222, + "grad_norm": 6.653168587387311, + "learning_rate": 1.6921717783743297e-08, + "loss": 0.5395, + "step": 26687 + }, + { + "epoch": 1.9280799031914317, + "grad_norm": 7.961250396359554, + "learning_rate": 1.68877596996167e-08, + "loss": 0.5715, + "step": 26688 + }, + { + "epoch": 1.9281521483916413, + "grad_norm": 7.149039421379456, + "learning_rate": 1.6853835607368585e-08, + "loss": 0.6012, + "step": 26689 + }, + { + "epoch": 1.9282243935918508, + "grad_norm": 6.391257107911838, + "learning_rate": 1.68199455074633e-08, + "loss": 0.5218, + "step": 26690 + }, + { + "epoch": 1.9282966387920601, + "grad_norm": 7.548896092433348, + "learning_rate": 1.6786089400364913e-08, + "loss": 0.589, + "step": 26691 + }, + { + "epoch": 1.92836888399227, + "grad_norm": 7.7738394015137295, + "learning_rate": 1.6752267286536674e-08, + "loss": 0.5818, + "step": 26692 + }, + { + "epoch": 1.9284411291924792, + "grad_norm": 9.392589699969474, + "learning_rate": 1.671847916644209e-08, + "loss": 0.6571, + "step": 26693 + }, + { + "epoch": 1.9285133743926888, + "grad_norm": 7.583769845101878, + "learning_rate": 1.6684725040543303e-08, + "loss": 0.6808, + "step": 26694 + }, + { + "epoch": 1.9285856195928983, + "grad_norm": 7.592347317725181, + "learning_rate": 1.6651004909302438e-08, + "loss": 0.5121, + "step": 26695 + }, + { + "epoch": 1.9286578647931079, + "grad_norm": 7.904753894003017, + "learning_rate": 1.661731877318107e-08, + "loss": 0.5939, + "step": 26696 + }, + { + "epoch": 1.9287301099933174, + "grad_norm": 7.754580712952275, + "learning_rate": 1.6583666632640495e-08, + "loss": 0.5999, + "step": 26697 + }, + { + "epoch": 1.9288023551935267, + "grad_norm": 7.512765574402527, + "learning_rate": 1.6550048488141178e-08, + "loss": 0.6137, + "step": 26698 + }, + { + "epoch": 1.9288746003937365, + "grad_norm": 7.225988680765476, + "learning_rate": 1.651646434014359e-08, + "loss": 0.6585, + "step": 26699 + }, + { + "epoch": 1.9289468455939458, + "grad_norm": 7.84956466536782, + "learning_rate": 1.648291418910708e-08, + "loss": 0.6664, + "step": 26700 + }, + { + "epoch": 1.9290190907941553, + "grad_norm": 7.4097879063861845, + "learning_rate": 1.644939803549128e-08, + "loss": 0.5903, + "step": 26701 + }, + { + "epoch": 1.9290913359943649, + "grad_norm": 7.679595853582741, + "learning_rate": 1.641591587975472e-08, + "loss": 0.5779, + "step": 26702 + }, + { + "epoch": 1.9291635811945744, + "grad_norm": 8.486973822889414, + "learning_rate": 1.6382467722356187e-08, + "loss": 0.5718, + "step": 26703 + }, + { + "epoch": 1.929235826394784, + "grad_norm": 8.499072304757968, + "learning_rate": 1.6349053563753105e-08, + "loss": 0.5708, + "step": 26704 + }, + { + "epoch": 1.9293080715949933, + "grad_norm": 6.929580154361638, + "learning_rate": 1.6315673404402878e-08, + "loss": 0.5607, + "step": 26705 + }, + { + "epoch": 1.929380316795203, + "grad_norm": 7.572862067850017, + "learning_rate": 1.628232724476292e-08, + "loss": 0.733, + "step": 26706 + }, + { + "epoch": 1.9294525619954124, + "grad_norm": 8.197255154344635, + "learning_rate": 1.6249015085289253e-08, + "loss": 0.5878, + "step": 26707 + }, + { + "epoch": 1.929524807195622, + "grad_norm": 8.672292008554637, + "learning_rate": 1.6215736926438186e-08, + "loss": 0.6096, + "step": 26708 + }, + { + "epoch": 1.9295970523958315, + "grad_norm": 7.278095134904778, + "learning_rate": 1.6182492768664903e-08, + "loss": 0.6345, + "step": 26709 + }, + { + "epoch": 1.929669297596041, + "grad_norm": 8.431846702731221, + "learning_rate": 1.6149282612424877e-08, + "loss": 0.6843, + "step": 26710 + }, + { + "epoch": 1.9297415427962505, + "grad_norm": 6.845219145249067, + "learning_rate": 1.611610645817274e-08, + "loss": 0.6244, + "step": 26711 + }, + { + "epoch": 1.9298137879964599, + "grad_norm": 8.67185388510943, + "learning_rate": 1.6082964306362302e-08, + "loss": 0.6261, + "step": 26712 + }, + { + "epoch": 1.9298860331966696, + "grad_norm": 7.007679405351914, + "learning_rate": 1.6049856157447363e-08, + "loss": 0.6283, + "step": 26713 + }, + { + "epoch": 1.929958278396879, + "grad_norm": 7.03948085951545, + "learning_rate": 1.6016782011881447e-08, + "loss": 0.5737, + "step": 26714 + }, + { + "epoch": 1.9300305235970885, + "grad_norm": 7.604972928994055, + "learning_rate": 1.5983741870116977e-08, + "loss": 0.6544, + "step": 26715 + }, + { + "epoch": 1.930102768797298, + "grad_norm": 7.3950288003666325, + "learning_rate": 1.595073573260636e-08, + "loss": 0.581, + "step": 26716 + }, + { + "epoch": 1.9301750139975076, + "grad_norm": 9.215486627770728, + "learning_rate": 1.591776359980146e-08, + "loss": 0.5924, + "step": 26717 + }, + { + "epoch": 1.9302472591977171, + "grad_norm": 6.667938715057575, + "learning_rate": 1.5884825472153863e-08, + "loss": 0.6351, + "step": 26718 + }, + { + "epoch": 1.9303195043979264, + "grad_norm": 6.779876991889048, + "learning_rate": 1.5851921350113763e-08, + "loss": 0.5467, + "step": 26719 + }, + { + "epoch": 1.9303917495981362, + "grad_norm": 6.907502073825411, + "learning_rate": 1.5819051234132464e-08, + "loss": 0.5569, + "step": 26720 + }, + { + "epoch": 1.9304639947983455, + "grad_norm": 5.330221619427004, + "learning_rate": 1.5786215124659055e-08, + "loss": 0.5321, + "step": 26721 + }, + { + "epoch": 1.930536239998555, + "grad_norm": 8.594153932268345, + "learning_rate": 1.5753413022143726e-08, + "loss": 0.6135, + "step": 26722 + }, + { + "epoch": 1.9306084851987646, + "grad_norm": 8.458405993921538, + "learning_rate": 1.572064492703529e-08, + "loss": 0.5974, + "step": 26723 + }, + { + "epoch": 1.9306807303989741, + "grad_norm": 7.484424184745094, + "learning_rate": 1.568791083978227e-08, + "loss": 0.5895, + "step": 26724 + }, + { + "epoch": 1.9307529755991837, + "grad_norm": 9.03523815759458, + "learning_rate": 1.5655210760833204e-08, + "loss": 0.5629, + "step": 26725 + }, + { + "epoch": 1.930825220799393, + "grad_norm": 7.666945853194759, + "learning_rate": 1.5622544690634955e-08, + "loss": 0.5933, + "step": 26726 + }, + { + "epoch": 1.9308974659996028, + "grad_norm": 6.631260918438378, + "learning_rate": 1.558991262963494e-08, + "loss": 0.5821, + "step": 26727 + }, + { + "epoch": 1.930969711199812, + "grad_norm": 8.864435489794845, + "learning_rate": 1.55573145782803e-08, + "loss": 0.6071, + "step": 26728 + }, + { + "epoch": 1.9310419564000216, + "grad_norm": 8.086582382949578, + "learning_rate": 1.55247505370168e-08, + "loss": 0.5987, + "step": 26729 + }, + { + "epoch": 1.9311142016002312, + "grad_norm": 7.86933937683356, + "learning_rate": 1.5492220506290457e-08, + "loss": 0.5942, + "step": 26730 + }, + { + "epoch": 1.9311864468004407, + "grad_norm": 7.187811537915406, + "learning_rate": 1.5459724486546202e-08, + "loss": 0.5767, + "step": 26731 + }, + { + "epoch": 1.9312586920006503, + "grad_norm": 6.40212567314682, + "learning_rate": 1.5427262478229787e-08, + "loss": 0.5681, + "step": 26732 + }, + { + "epoch": 1.9313309372008596, + "grad_norm": 7.505680730293399, + "learning_rate": 1.5394834481784472e-08, + "loss": 0.6499, + "step": 26733 + }, + { + "epoch": 1.9314031824010693, + "grad_norm": 6.904356545887831, + "learning_rate": 1.5362440497654895e-08, + "loss": 0.6005, + "step": 26734 + }, + { + "epoch": 1.9314754276012787, + "grad_norm": 8.612221359817006, + "learning_rate": 1.5330080526284597e-08, + "loss": 0.6203, + "step": 26735 + }, + { + "epoch": 1.9315476728014882, + "grad_norm": 7.962128644634608, + "learning_rate": 1.5297754568115996e-08, + "loss": 0.6525, + "step": 26736 + }, + { + "epoch": 1.9316199180016977, + "grad_norm": 8.46454180789357, + "learning_rate": 1.5265462623591797e-08, + "loss": 0.684, + "step": 26737 + }, + { + "epoch": 1.9316921632019073, + "grad_norm": 6.901794929900086, + "learning_rate": 1.5233204693154424e-08, + "loss": 0.5776, + "step": 26738 + }, + { + "epoch": 1.9317644084021168, + "grad_norm": 8.737257043519508, + "learning_rate": 1.5200980777244912e-08, + "loss": 0.6133, + "step": 26739 + }, + { + "epoch": 1.9318366536023261, + "grad_norm": 6.467023093766753, + "learning_rate": 1.5168790876304852e-08, + "loss": 0.621, + "step": 26740 + }, + { + "epoch": 1.931908898802536, + "grad_norm": 8.099312892074886, + "learning_rate": 1.5136634990774446e-08, + "loss": 0.6044, + "step": 26741 + }, + { + "epoch": 1.9319811440027452, + "grad_norm": 8.103834078386571, + "learning_rate": 1.5104513121094456e-08, + "loss": 0.5693, + "step": 26742 + }, + { + "epoch": 1.9320533892029548, + "grad_norm": 8.480435463142872, + "learning_rate": 1.5072425267703973e-08, + "loss": 0.6595, + "step": 26743 + }, + { + "epoch": 1.9321256344031643, + "grad_norm": 8.038545730930547, + "learning_rate": 1.504037143104292e-08, + "loss": 0.6133, + "step": 26744 + }, + { + "epoch": 1.9321978796033739, + "grad_norm": 7.014596162728962, + "learning_rate": 1.5008351611549288e-08, + "loss": 0.6058, + "step": 26745 + }, + { + "epoch": 1.9322701248035834, + "grad_norm": 6.480727748742445, + "learning_rate": 1.497636580966244e-08, + "loss": 0.5945, + "step": 26746 + }, + { + "epoch": 1.9323423700037927, + "grad_norm": 8.141854982205807, + "learning_rate": 1.4944414025819253e-08, + "loss": 0.5412, + "step": 26747 + }, + { + "epoch": 1.9324146152040025, + "grad_norm": 6.540446692094425, + "learning_rate": 1.491249626045743e-08, + "loss": 0.546, + "step": 26748 + }, + { + "epoch": 1.9324868604042118, + "grad_norm": 6.780143120079381, + "learning_rate": 1.4880612514014125e-08, + "loss": 0.544, + "step": 26749 + }, + { + "epoch": 1.9325591056044216, + "grad_norm": 7.469338110589367, + "learning_rate": 1.4848762786925653e-08, + "loss": 0.5835, + "step": 26750 + }, + { + "epoch": 1.9326313508046309, + "grad_norm": 7.392157817064163, + "learning_rate": 1.4816947079628053e-08, + "loss": 0.6163, + "step": 26751 + }, + { + "epoch": 1.9327035960048404, + "grad_norm": 8.24205050241595, + "learning_rate": 1.478516539255681e-08, + "loss": 0.6147, + "step": 26752 + }, + { + "epoch": 1.93277584120505, + "grad_norm": 7.164448269074468, + "learning_rate": 1.4753417726147135e-08, + "loss": 0.6452, + "step": 26753 + }, + { + "epoch": 1.9328480864052593, + "grad_norm": 8.216031977634918, + "learning_rate": 1.4721704080833399e-08, + "loss": 0.6196, + "step": 26754 + }, + { + "epoch": 1.932920331605469, + "grad_norm": 8.299897930942434, + "learning_rate": 1.4690024457049701e-08, + "loss": 0.6552, + "step": 26755 + }, + { + "epoch": 1.9329925768056784, + "grad_norm": 7.232132566199137, + "learning_rate": 1.465837885522986e-08, + "loss": 0.5785, + "step": 26756 + }, + { + "epoch": 1.9330648220058881, + "grad_norm": 6.794773101274445, + "learning_rate": 1.4626767275807418e-08, + "loss": 0.5501, + "step": 26757 + }, + { + "epoch": 1.9331370672060975, + "grad_norm": 7.613443591137304, + "learning_rate": 1.459518971921453e-08, + "loss": 0.5807, + "step": 26758 + }, + { + "epoch": 1.933209312406307, + "grad_norm": 7.331245327400872, + "learning_rate": 1.4563646185883629e-08, + "loss": 0.6286, + "step": 26759 + }, + { + "epoch": 1.9332815576065165, + "grad_norm": 7.840747273701569, + "learning_rate": 1.4532136676246588e-08, + "loss": 0.6268, + "step": 26760 + }, + { + "epoch": 1.9333538028067259, + "grad_norm": 8.090548673868682, + "learning_rate": 1.4500661190734733e-08, + "loss": 0.6318, + "step": 26761 + }, + { + "epoch": 1.9334260480069356, + "grad_norm": 7.352890494676838, + "learning_rate": 1.4469219729779105e-08, + "loss": 0.5626, + "step": 26762 + }, + { + "epoch": 1.933498293207145, + "grad_norm": 6.4648711881026495, + "learning_rate": 1.4437812293809916e-08, + "loss": 0.5893, + "step": 26763 + }, + { + "epoch": 1.9335705384073547, + "grad_norm": 6.680486977009984, + "learning_rate": 1.4406438883257379e-08, + "loss": 0.5767, + "step": 26764 + }, + { + "epoch": 1.933642783607564, + "grad_norm": 7.356807447134184, + "learning_rate": 1.4375099498550316e-08, + "loss": 0.6178, + "step": 26765 + }, + { + "epoch": 1.9337150288077736, + "grad_norm": 8.994028865687302, + "learning_rate": 1.4343794140118384e-08, + "loss": 0.6612, + "step": 26766 + }, + { + "epoch": 1.9337872740079831, + "grad_norm": 7.459263606811882, + "learning_rate": 1.4312522808390128e-08, + "loss": 0.567, + "step": 26767 + }, + { + "epoch": 1.9338595192081927, + "grad_norm": 7.087099966234178, + "learning_rate": 1.4281285503793262e-08, + "loss": 0.5394, + "step": 26768 + }, + { + "epoch": 1.9339317644084022, + "grad_norm": 9.117150261679521, + "learning_rate": 1.4250082226755501e-08, + "loss": 0.6177, + "step": 26769 + }, + { + "epoch": 1.9340040096086115, + "grad_norm": 8.196448175600972, + "learning_rate": 1.4218912977704002e-08, + "loss": 0.6061, + "step": 26770 + }, + { + "epoch": 1.9340762548088213, + "grad_norm": 7.699795202529004, + "learning_rate": 1.4187777757065647e-08, + "loss": 0.5603, + "step": 26771 + }, + { + "epoch": 1.9341485000090306, + "grad_norm": 8.468750900887867, + "learning_rate": 1.4156676565266481e-08, + "loss": 0.6832, + "step": 26772 + }, + { + "epoch": 1.9342207452092401, + "grad_norm": 7.412158267295927, + "learning_rate": 1.4125609402732277e-08, + "loss": 0.5272, + "step": 26773 + }, + { + "epoch": 1.9342929904094497, + "grad_norm": 8.284454992275773, + "learning_rate": 1.4094576269888249e-08, + "loss": 0.6019, + "step": 26774 + }, + { + "epoch": 1.9343652356096592, + "grad_norm": 6.793464313525934, + "learning_rate": 1.4063577167159337e-08, + "loss": 0.5832, + "step": 26775 + }, + { + "epoch": 1.9344374808098688, + "grad_norm": 7.951138531233819, + "learning_rate": 1.4032612094969922e-08, + "loss": 0.6325, + "step": 26776 + }, + { + "epoch": 1.934509726010078, + "grad_norm": 8.966688178319075, + "learning_rate": 1.4001681053743555e-08, + "loss": 0.6369, + "step": 26777 + }, + { + "epoch": 1.9345819712102879, + "grad_norm": 7.096788201080549, + "learning_rate": 1.397078404390434e-08, + "loss": 0.5591, + "step": 26778 + }, + { + "epoch": 1.9346542164104972, + "grad_norm": 7.739115054381293, + "learning_rate": 1.393992106587444e-08, + "loss": 0.5982, + "step": 26779 + }, + { + "epoch": 1.9347264616107067, + "grad_norm": 8.382932180384019, + "learning_rate": 1.390909212007685e-08, + "loss": 0.6212, + "step": 26780 + }, + { + "epoch": 1.9347987068109163, + "grad_norm": 7.5746349989574835, + "learning_rate": 1.3878297206933455e-08, + "loss": 0.6749, + "step": 26781 + }, + { + "epoch": 1.9348709520111258, + "grad_norm": 6.780569578568931, + "learning_rate": 1.3847536326865862e-08, + "loss": 0.6443, + "step": 26782 + }, + { + "epoch": 1.9349431972113353, + "grad_norm": 7.468676450997694, + "learning_rate": 1.3816809480295123e-08, + "loss": 0.5381, + "step": 26783 + }, + { + "epoch": 1.9350154424115447, + "grad_norm": 7.428282595877647, + "learning_rate": 1.3786116667641736e-08, + "loss": 0.5784, + "step": 26784 + }, + { + "epoch": 1.9350876876117544, + "grad_norm": 6.753614799957941, + "learning_rate": 1.3755457889326473e-08, + "loss": 0.5772, + "step": 26785 + }, + { + "epoch": 1.9351599328119637, + "grad_norm": 7.112049044239361, + "learning_rate": 1.372483314576789e-08, + "loss": 0.6091, + "step": 26786 + }, + { + "epoch": 1.9352321780121733, + "grad_norm": 7.876072644039633, + "learning_rate": 1.3694242437386207e-08, + "loss": 0.6122, + "step": 26787 + }, + { + "epoch": 1.9353044232123828, + "grad_norm": 8.901057714124528, + "learning_rate": 1.3663685764599699e-08, + "loss": 0.6955, + "step": 26788 + }, + { + "epoch": 1.9353766684125924, + "grad_norm": 6.476439692272791, + "learning_rate": 1.3633163127827197e-08, + "loss": 0.5876, + "step": 26789 + }, + { + "epoch": 1.935448913612802, + "grad_norm": 8.729856137622988, + "learning_rate": 1.360267452748587e-08, + "loss": 0.6261, + "step": 26790 + }, + { + "epoch": 1.9355211588130112, + "grad_norm": 7.657061599999794, + "learning_rate": 1.357221996399316e-08, + "loss": 0.5482, + "step": 26791 + }, + { + "epoch": 1.935593404013221, + "grad_norm": 6.883331244640018, + "learning_rate": 1.3541799437766511e-08, + "loss": 0.5675, + "step": 26792 + }, + { + "epoch": 1.9356656492134303, + "grad_norm": 7.738438011095165, + "learning_rate": 1.351141294922198e-08, + "loss": 0.6051, + "step": 26793 + }, + { + "epoch": 1.9357378944136399, + "grad_norm": 7.903852136842782, + "learning_rate": 1.3481060498775345e-08, + "loss": 0.5815, + "step": 26794 + }, + { + "epoch": 1.9358101396138494, + "grad_norm": 8.42694335620059, + "learning_rate": 1.3450742086842661e-08, + "loss": 0.5865, + "step": 26795 + }, + { + "epoch": 1.935882384814059, + "grad_norm": 6.990885932019056, + "learning_rate": 1.342045771383832e-08, + "loss": 0.6898, + "step": 26796 + }, + { + "epoch": 1.9359546300142685, + "grad_norm": 7.275938790171363, + "learning_rate": 1.3390207380177266e-08, + "loss": 0.5949, + "step": 26797 + }, + { + "epoch": 1.9360268752144778, + "grad_norm": 6.83951357355562, + "learning_rate": 1.3359991086273617e-08, + "loss": 0.6469, + "step": 26798 + }, + { + "epoch": 1.9360991204146876, + "grad_norm": 7.379021017172604, + "learning_rate": 1.3329808832540924e-08, + "loss": 0.5492, + "step": 26799 + }, + { + "epoch": 1.9361713656148969, + "grad_norm": 9.025340539314067, + "learning_rate": 1.3299660619392475e-08, + "loss": 0.6855, + "step": 26800 + }, + { + "epoch": 1.9362436108151064, + "grad_norm": 6.8700407694977, + "learning_rate": 1.3269546447240712e-08, + "loss": 0.6185, + "step": 26801 + }, + { + "epoch": 1.936315856015316, + "grad_norm": 8.035134411431189, + "learning_rate": 1.3239466316498362e-08, + "loss": 0.5879, + "step": 26802 + }, + { + "epoch": 1.9363881012155255, + "grad_norm": 8.064919751803322, + "learning_rate": 1.3209420227576485e-08, + "loss": 0.6186, + "step": 26803 + }, + { + "epoch": 1.936460346415735, + "grad_norm": 7.870125609271613, + "learning_rate": 1.3179408180886977e-08, + "loss": 0.5915, + "step": 26804 + }, + { + "epoch": 1.9365325916159444, + "grad_norm": 7.581948234186179, + "learning_rate": 1.3149430176840338e-08, + "loss": 0.5634, + "step": 26805 + }, + { + "epoch": 1.9366048368161541, + "grad_norm": 8.078453190111595, + "learning_rate": 1.3119486215847354e-08, + "loss": 0.6115, + "step": 26806 + }, + { + "epoch": 1.9366770820163635, + "grad_norm": 7.087304232684039, + "learning_rate": 1.308957629831742e-08, + "loss": 0.5487, + "step": 26807 + }, + { + "epoch": 1.936749327216573, + "grad_norm": 7.872978314248115, + "learning_rate": 1.3059700424660204e-08, + "loss": 0.6121, + "step": 26808 + }, + { + "epoch": 1.9368215724167825, + "grad_norm": 6.317041859350245, + "learning_rate": 1.3029858595284828e-08, + "loss": 0.5835, + "step": 26809 + }, + { + "epoch": 1.936893817616992, + "grad_norm": 6.814902065858051, + "learning_rate": 1.3000050810599574e-08, + "loss": 0.5691, + "step": 26810 + }, + { + "epoch": 1.9369660628172016, + "grad_norm": 7.771243359859109, + "learning_rate": 1.2970277071012726e-08, + "loss": 0.6556, + "step": 26811 + }, + { + "epoch": 1.937038308017411, + "grad_norm": 7.539906192671571, + "learning_rate": 1.2940537376931739e-08, + "loss": 0.553, + "step": 26812 + }, + { + "epoch": 1.9371105532176207, + "grad_norm": 6.888594901398701, + "learning_rate": 1.2910831728763507e-08, + "loss": 0.5998, + "step": 26813 + }, + { + "epoch": 1.93718279841783, + "grad_norm": 7.661312251423927, + "learning_rate": 1.2881160126915204e-08, + "loss": 0.56, + "step": 26814 + }, + { + "epoch": 1.9372550436180396, + "grad_norm": 5.193059309331483, + "learning_rate": 1.285152257179234e-08, + "loss": 0.5551, + "step": 26815 + }, + { + "epoch": 1.9373272888182491, + "grad_norm": 7.083185471132111, + "learning_rate": 1.2821919063801257e-08, + "loss": 0.5902, + "step": 26816 + }, + { + "epoch": 1.9373995340184587, + "grad_norm": 7.970673691672561, + "learning_rate": 1.279234960334691e-08, + "loss": 0.5814, + "step": 26817 + }, + { + "epoch": 1.9374717792186682, + "grad_norm": 7.983920388614137, + "learning_rate": 1.2762814190833972e-08, + "loss": 0.5956, + "step": 26818 + }, + { + "epoch": 1.9375440244188775, + "grad_norm": 7.823214576490566, + "learning_rate": 1.2733312826666843e-08, + "loss": 0.5657, + "step": 26819 + }, + { + "epoch": 1.9376162696190873, + "grad_norm": 6.768734682018246, + "learning_rate": 1.2703845511249646e-08, + "loss": 0.5901, + "step": 26820 + }, + { + "epoch": 1.9376885148192966, + "grad_norm": 6.160726246318698, + "learning_rate": 1.267441224498539e-08, + "loss": 0.4802, + "step": 26821 + }, + { + "epoch": 1.9377607600195064, + "grad_norm": 7.531481220922184, + "learning_rate": 1.2645013028277364e-08, + "loss": 0.6326, + "step": 26822 + }, + { + "epoch": 1.9378330052197157, + "grad_norm": 8.327543256241322, + "learning_rate": 1.261564786152747e-08, + "loss": 0.6228, + "step": 26823 + }, + { + "epoch": 1.9379052504199252, + "grad_norm": 6.9041115052025, + "learning_rate": 1.258631674513816e-08, + "loss": 0.5628, + "step": 26824 + }, + { + "epoch": 1.9379774956201348, + "grad_norm": 7.711263013962617, + "learning_rate": 1.2557019679510785e-08, + "loss": 0.6775, + "step": 26825 + }, + { + "epoch": 1.938049740820344, + "grad_norm": 6.003518979966855, + "learning_rate": 1.2527756665046686e-08, + "loss": 0.6085, + "step": 26826 + }, + { + "epoch": 1.9381219860205539, + "grad_norm": 9.113423149747172, + "learning_rate": 1.2498527702145824e-08, + "loss": 0.6324, + "step": 26827 + }, + { + "epoch": 1.9381942312207632, + "grad_norm": 9.38798986139713, + "learning_rate": 1.2469332791208988e-08, + "loss": 0.6231, + "step": 26828 + }, + { + "epoch": 1.938266476420973, + "grad_norm": 6.476828722664695, + "learning_rate": 1.244017193263558e-08, + "loss": 0.5637, + "step": 26829 + }, + { + "epoch": 1.9383387216211823, + "grad_norm": 7.260854553707687, + "learning_rate": 1.241104512682445e-08, + "loss": 0.5075, + "step": 26830 + }, + { + "epoch": 1.9384109668213918, + "grad_norm": 8.345448296118285, + "learning_rate": 1.2381952374174721e-08, + "loss": 0.6038, + "step": 26831 + }, + { + "epoch": 1.9384832120216013, + "grad_norm": 6.9730597200318645, + "learning_rate": 1.2352893675084965e-08, + "loss": 0.5593, + "step": 26832 + }, + { + "epoch": 1.9385554572218107, + "grad_norm": 8.554965302995052, + "learning_rate": 1.2323869029952084e-08, + "loss": 0.597, + "step": 26833 + }, + { + "epoch": 1.9386277024220204, + "grad_norm": 8.797247490864818, + "learning_rate": 1.2294878439174095e-08, + "loss": 0.6807, + "step": 26834 + }, + { + "epoch": 1.9386999476222297, + "grad_norm": 6.747647758638428, + "learning_rate": 1.2265921903147348e-08, + "loss": 0.5106, + "step": 26835 + }, + { + "epoch": 1.9387721928224395, + "grad_norm": 6.329517653951955, + "learning_rate": 1.2236999422268746e-08, + "loss": 0.4608, + "step": 26836 + }, + { + "epoch": 1.9388444380226488, + "grad_norm": 6.759160642712536, + "learning_rate": 1.2208110996934086e-08, + "loss": 0.5526, + "step": 26837 + }, + { + "epoch": 1.9389166832228584, + "grad_norm": 8.154968394879551, + "learning_rate": 1.2179256627538605e-08, + "loss": 0.5897, + "step": 26838 + }, + { + "epoch": 1.938988928423068, + "grad_norm": 6.394916315433415, + "learning_rate": 1.2150436314477543e-08, + "loss": 0.5381, + "step": 26839 + }, + { + "epoch": 1.9390611736232775, + "grad_norm": 6.257197393887636, + "learning_rate": 1.2121650058145306e-08, + "loss": 0.5801, + "step": 26840 + }, + { + "epoch": 1.939133418823487, + "grad_norm": 7.037974486269808, + "learning_rate": 1.2092897858935748e-08, + "loss": 0.6117, + "step": 26841 + }, + { + "epoch": 1.9392056640236963, + "grad_norm": 7.3978525190294615, + "learning_rate": 1.2064179717242996e-08, + "loss": 0.591, + "step": 26842 + }, + { + "epoch": 1.939277909223906, + "grad_norm": 7.7128480903803105, + "learning_rate": 1.2035495633459792e-08, + "loss": 0.5991, + "step": 26843 + }, + { + "epoch": 1.9393501544241154, + "grad_norm": 7.931812080685017, + "learning_rate": 1.20068456079786e-08, + "loss": 0.6191, + "step": 26844 + }, + { + "epoch": 1.939422399624325, + "grad_norm": 7.5893526885339195, + "learning_rate": 1.1978229641192162e-08, + "loss": 0.594, + "step": 26845 + }, + { + "epoch": 1.9394946448245345, + "grad_norm": 6.02304198553534, + "learning_rate": 1.1949647733491831e-08, + "loss": 0.5658, + "step": 26846 + }, + { + "epoch": 1.939566890024744, + "grad_norm": 7.989359455047264, + "learning_rate": 1.192109988526896e-08, + "loss": 0.5843, + "step": 26847 + }, + { + "epoch": 1.9396391352249536, + "grad_norm": 8.68716978913803, + "learning_rate": 1.1892586096914626e-08, + "loss": 0.6395, + "step": 26848 + }, + { + "epoch": 1.939711380425163, + "grad_norm": 8.58095505242956, + "learning_rate": 1.1864106368818517e-08, + "loss": 0.6257, + "step": 26849 + }, + { + "epoch": 1.9397836256253727, + "grad_norm": 6.809573043457254, + "learning_rate": 1.1835660701371155e-08, + "loss": 0.6126, + "step": 26850 + }, + { + "epoch": 1.939855870825582, + "grad_norm": 7.771528061261982, + "learning_rate": 1.180724909496167e-08, + "loss": 0.5341, + "step": 26851 + }, + { + "epoch": 1.9399281160257915, + "grad_norm": 7.090865973139207, + "learning_rate": 1.1778871549978643e-08, + "loss": 0.5864, + "step": 26852 + }, + { + "epoch": 1.940000361226001, + "grad_norm": 7.442746659952781, + "learning_rate": 1.1750528066811207e-08, + "loss": 0.5752, + "step": 26853 + }, + { + "epoch": 1.9400726064262106, + "grad_norm": 7.940401282944061, + "learning_rate": 1.1722218645846827e-08, + "loss": 0.5926, + "step": 26854 + }, + { + "epoch": 1.9401448516264201, + "grad_norm": 7.125148570870815, + "learning_rate": 1.169394328747353e-08, + "loss": 0.5926, + "step": 26855 + }, + { + "epoch": 1.9402170968266295, + "grad_norm": 7.724650363355224, + "learning_rate": 1.1665701992077948e-08, + "loss": 0.5709, + "step": 26856 + }, + { + "epoch": 1.9402893420268392, + "grad_norm": 7.462343668618087, + "learning_rate": 1.1637494760046997e-08, + "loss": 0.5864, + "step": 26857 + }, + { + "epoch": 1.9403615872270485, + "grad_norm": 8.573509100046154, + "learning_rate": 1.1609321591766476e-08, + "loss": 0.5919, + "step": 26858 + }, + { + "epoch": 1.940433832427258, + "grad_norm": 8.403029721969379, + "learning_rate": 1.1581182487622188e-08, + "loss": 0.6217, + "step": 26859 + }, + { + "epoch": 1.9405060776274676, + "grad_norm": 6.960857763379593, + "learning_rate": 1.1553077447999384e-08, + "loss": 0.6022, + "step": 26860 + }, + { + "epoch": 1.9405783228276772, + "grad_norm": 8.720807888858406, + "learning_rate": 1.1525006473283028e-08, + "loss": 0.6545, + "step": 26861 + }, + { + "epoch": 1.9406505680278867, + "grad_norm": 7.503198068970793, + "learning_rate": 1.1496969563856708e-08, + "loss": 0.4864, + "step": 26862 + }, + { + "epoch": 1.940722813228096, + "grad_norm": 7.152281614540997, + "learning_rate": 1.1468966720105112e-08, + "loss": 0.5712, + "step": 26863 + }, + { + "epoch": 1.9407950584283058, + "grad_norm": 6.791160829893552, + "learning_rate": 1.1440997942410992e-08, + "loss": 0.6293, + "step": 26864 + }, + { + "epoch": 1.9408673036285151, + "grad_norm": 6.1332939201968335, + "learning_rate": 1.1413063231157374e-08, + "loss": 0.5918, + "step": 26865 + }, + { + "epoch": 1.9409395488287247, + "grad_norm": 7.512799086623227, + "learning_rate": 1.138516258672645e-08, + "loss": 0.5361, + "step": 26866 + }, + { + "epoch": 1.9410117940289342, + "grad_norm": 8.268584156175343, + "learning_rate": 1.1357296009500696e-08, + "loss": 0.6648, + "step": 26867 + }, + { + "epoch": 1.9410840392291437, + "grad_norm": 7.5248162738258255, + "learning_rate": 1.1329463499861193e-08, + "loss": 0.587, + "step": 26868 + }, + { + "epoch": 1.9411562844293533, + "grad_norm": 6.478873913724622, + "learning_rate": 1.1301665058188748e-08, + "loss": 0.5818, + "step": 26869 + }, + { + "epoch": 1.9412285296295626, + "grad_norm": 8.657488682668165, + "learning_rate": 1.1273900684864448e-08, + "loss": 0.6758, + "step": 26870 + }, + { + "epoch": 1.9413007748297724, + "grad_norm": 8.344094258610818, + "learning_rate": 1.1246170380267984e-08, + "loss": 0.6267, + "step": 26871 + }, + { + "epoch": 1.9413730200299817, + "grad_norm": 7.667680700938385, + "learning_rate": 1.1218474144779057e-08, + "loss": 0.6378, + "step": 26872 + }, + { + "epoch": 1.9414452652301912, + "grad_norm": 8.208599828534677, + "learning_rate": 1.1190811978776806e-08, + "loss": 0.5647, + "step": 26873 + }, + { + "epoch": 1.9415175104304008, + "grad_norm": 7.269334203149562, + "learning_rate": 1.1163183882639816e-08, + "loss": 0.598, + "step": 26874 + }, + { + "epoch": 1.9415897556306103, + "grad_norm": 9.667093947164307, + "learning_rate": 1.1135589856746398e-08, + "loss": 0.5755, + "step": 26875 + }, + { + "epoch": 1.9416620008308199, + "grad_norm": 7.767386591647884, + "learning_rate": 1.1108029901474304e-08, + "loss": 0.6035, + "step": 26876 + }, + { + "epoch": 1.9417342460310292, + "grad_norm": 6.84414824977459, + "learning_rate": 1.108050401720101e-08, + "loss": 0.5875, + "step": 26877 + }, + { + "epoch": 1.941806491231239, + "grad_norm": 7.280066398066292, + "learning_rate": 1.1053012204302881e-08, + "loss": 0.5297, + "step": 26878 + }, + { + "epoch": 1.9418787364314483, + "grad_norm": 8.820202646846914, + "learning_rate": 1.1025554463156562e-08, + "loss": 0.6278, + "step": 26879 + }, + { + "epoch": 1.9419509816316578, + "grad_norm": 7.252651650474795, + "learning_rate": 1.0998130794137585e-08, + "loss": 0.5745, + "step": 26880 + }, + { + "epoch": 1.9420232268318673, + "grad_norm": 8.61025492469091, + "learning_rate": 1.097074119762176e-08, + "loss": 0.6231, + "step": 26881 + }, + { + "epoch": 1.9420954720320769, + "grad_norm": 9.591208055957093, + "learning_rate": 1.0943385673984064e-08, + "loss": 0.6597, + "step": 26882 + }, + { + "epoch": 1.9421677172322864, + "grad_norm": 8.141432357847258, + "learning_rate": 1.0916064223598643e-08, + "loss": 0.5415, + "step": 26883 + }, + { + "epoch": 1.9422399624324957, + "grad_norm": 7.675975566083655, + "learning_rate": 1.0888776846839366e-08, + "loss": 0.5889, + "step": 26884 + }, + { + "epoch": 1.9423122076327055, + "grad_norm": 7.650392619510845, + "learning_rate": 1.0861523544080654e-08, + "loss": 0.5718, + "step": 26885 + }, + { + "epoch": 1.9423844528329148, + "grad_norm": 7.602927607785603, + "learning_rate": 1.083430431569471e-08, + "loss": 0.6107, + "step": 26886 + }, + { + "epoch": 1.9424566980331244, + "grad_norm": 7.116322101527273, + "learning_rate": 1.0807119162054291e-08, + "loss": 0.5988, + "step": 26887 + }, + { + "epoch": 1.942528943233334, + "grad_norm": 8.473477441186061, + "learning_rate": 1.07799680835316e-08, + "loss": 0.622, + "step": 26888 + }, + { + "epoch": 1.9426011884335435, + "grad_norm": 6.775386120818624, + "learning_rate": 1.0752851080498839e-08, + "loss": 0.5794, + "step": 26889 + }, + { + "epoch": 1.942673433633753, + "grad_norm": 7.353207476113235, + "learning_rate": 1.0725768153326544e-08, + "loss": 0.5689, + "step": 26890 + }, + { + "epoch": 1.9427456788339623, + "grad_norm": 7.23841036801325, + "learning_rate": 1.069871930238553e-08, + "loss": 0.6043, + "step": 26891 + }, + { + "epoch": 1.942817924034172, + "grad_norm": 7.52882947844017, + "learning_rate": 1.0671704528046334e-08, + "loss": 0.5702, + "step": 26892 + }, + { + "epoch": 1.9428901692343814, + "grad_norm": 7.528261216872252, + "learning_rate": 1.064472383067866e-08, + "loss": 0.561, + "step": 26893 + }, + { + "epoch": 1.9429624144345912, + "grad_norm": 5.899272692836242, + "learning_rate": 1.0617777210651937e-08, + "loss": 0.6437, + "step": 26894 + }, + { + "epoch": 1.9430346596348005, + "grad_norm": 7.408622265199394, + "learning_rate": 1.0590864668334755e-08, + "loss": 0.5673, + "step": 26895 + }, + { + "epoch": 1.94310690483501, + "grad_norm": 7.9096898975182555, + "learning_rate": 1.0563986204095988e-08, + "loss": 0.5593, + "step": 26896 + }, + { + "epoch": 1.9431791500352196, + "grad_norm": 7.790898358277042, + "learning_rate": 1.0537141818303398e-08, + "loss": 0.6401, + "step": 26897 + }, + { + "epoch": 1.943251395235429, + "grad_norm": 7.56999705063877, + "learning_rate": 1.051033151132419e-08, + "loss": 0.6381, + "step": 26898 + }, + { + "epoch": 1.9433236404356387, + "grad_norm": 6.754032484632185, + "learning_rate": 1.048355528352557e-08, + "loss": 0.6365, + "step": 26899 + }, + { + "epoch": 1.943395885635848, + "grad_norm": 7.0341625369645495, + "learning_rate": 1.0456813135274191e-08, + "loss": 0.628, + "step": 26900 + }, + { + "epoch": 1.9434681308360577, + "grad_norm": 9.92117600269077, + "learning_rate": 1.0430105066935869e-08, + "loss": 0.6216, + "step": 26901 + }, + { + "epoch": 1.943540376036267, + "grad_norm": 7.2323620098622845, + "learning_rate": 1.0403431078876425e-08, + "loss": 0.6083, + "step": 26902 + }, + { + "epoch": 1.9436126212364766, + "grad_norm": 6.911260246038819, + "learning_rate": 1.037679117146112e-08, + "loss": 0.5854, + "step": 26903 + }, + { + "epoch": 1.9436848664366861, + "grad_norm": 7.059692364871227, + "learning_rate": 1.0350185345054387e-08, + "loss": 0.585, + "step": 26904 + }, + { + "epoch": 1.9437571116368955, + "grad_norm": 8.50341773042725, + "learning_rate": 1.0323613600020654e-08, + "loss": 0.5712, + "step": 26905 + }, + { + "epoch": 1.9438293568371052, + "grad_norm": 7.042821695376174, + "learning_rate": 1.0297075936723245e-08, + "loss": 0.6068, + "step": 26906 + }, + { + "epoch": 1.9439016020373145, + "grad_norm": 6.3600533988198436, + "learning_rate": 1.0270572355526031e-08, + "loss": 0.5323, + "step": 26907 + }, + { + "epoch": 1.9439738472375243, + "grad_norm": 7.0632164515782545, + "learning_rate": 1.0244102856791505e-08, + "loss": 0.6669, + "step": 26908 + }, + { + "epoch": 1.9440460924377336, + "grad_norm": 7.220828595153463, + "learning_rate": 1.0217667440881874e-08, + "loss": 0.5894, + "step": 26909 + }, + { + "epoch": 1.9441183376379432, + "grad_norm": 7.565241293175956, + "learning_rate": 1.019126610815907e-08, + "loss": 0.6294, + "step": 26910 + }, + { + "epoch": 1.9441905828381527, + "grad_norm": 9.71696391973248, + "learning_rate": 1.0164898858985028e-08, + "loss": 0.6521, + "step": 26911 + }, + { + "epoch": 1.9442628280383623, + "grad_norm": 7.477245144188403, + "learning_rate": 1.0138565693720015e-08, + "loss": 0.6752, + "step": 26912 + }, + { + "epoch": 1.9443350732385718, + "grad_norm": 7.872397098589531, + "learning_rate": 1.0112266612724852e-08, + "loss": 0.5922, + "step": 26913 + }, + { + "epoch": 1.9444073184387811, + "grad_norm": 6.603326433163073, + "learning_rate": 1.008600161635953e-08, + "loss": 0.5539, + "step": 26914 + }, + { + "epoch": 1.9444795636389909, + "grad_norm": 8.148476824569538, + "learning_rate": 1.0059770704983485e-08, + "loss": 0.6667, + "step": 26915 + }, + { + "epoch": 1.9445518088392002, + "grad_norm": 7.1209726830767295, + "learning_rate": 1.003357387895587e-08, + "loss": 0.6498, + "step": 26916 + }, + { + "epoch": 1.9446240540394097, + "grad_norm": 5.930425987446275, + "learning_rate": 1.0007411138635292e-08, + "loss": 0.5546, + "step": 26917 + }, + { + "epoch": 1.9446962992396193, + "grad_norm": 6.875964010449593, + "learning_rate": 9.981282484379795e-09, + "loss": 0.6134, + "step": 26918 + }, + { + "epoch": 1.9447685444398288, + "grad_norm": 5.767450472681493, + "learning_rate": 9.955187916547426e-09, + "loss": 0.5335, + "step": 26919 + }, + { + "epoch": 1.9448407896400384, + "grad_norm": 7.288282411951215, + "learning_rate": 9.929127435494845e-09, + "loss": 0.6251, + "step": 26920 + }, + { + "epoch": 1.9449130348402477, + "grad_norm": 7.035575383995734, + "learning_rate": 9.903101041579266e-09, + "loss": 0.6295, + "step": 26921 + }, + { + "epoch": 1.9449852800404575, + "grad_norm": 7.606725838665355, + "learning_rate": 9.877108735156794e-09, + "loss": 0.6107, + "step": 26922 + }, + { + "epoch": 1.9450575252406668, + "grad_norm": 7.513599273434008, + "learning_rate": 9.851150516583252e-09, + "loss": 0.5782, + "step": 26923 + }, + { + "epoch": 1.9451297704408763, + "grad_norm": 6.246181389609519, + "learning_rate": 9.825226386213915e-09, + "loss": 0.5009, + "step": 26924 + }, + { + "epoch": 1.9452020156410859, + "grad_norm": 8.689483635665136, + "learning_rate": 9.799336344403775e-09, + "loss": 0.6707, + "step": 26925 + }, + { + "epoch": 1.9452742608412954, + "grad_norm": 7.228607976983681, + "learning_rate": 9.773480391507272e-09, + "loss": 0.5655, + "step": 26926 + }, + { + "epoch": 1.945346506041505, + "grad_norm": 6.415336900458675, + "learning_rate": 9.74765852787829e-09, + "loss": 0.5714, + "step": 26927 + }, + { + "epoch": 1.9454187512417143, + "grad_norm": 7.747679578356151, + "learning_rate": 9.721870753870155e-09, + "loss": 0.5784, + "step": 26928 + }, + { + "epoch": 1.945490996441924, + "grad_norm": 7.060376143336868, + "learning_rate": 9.696117069836198e-09, + "loss": 0.5831, + "step": 26929 + }, + { + "epoch": 1.9455632416421333, + "grad_norm": 7.059273852214308, + "learning_rate": 9.670397476128634e-09, + "loss": 0.6406, + "step": 26930 + }, + { + "epoch": 1.9456354868423429, + "grad_norm": 7.422893638978397, + "learning_rate": 9.644711973099685e-09, + "loss": 0.5848, + "step": 26931 + }, + { + "epoch": 1.9457077320425524, + "grad_norm": 8.051624622249673, + "learning_rate": 9.61906056110129e-09, + "loss": 0.6094, + "step": 26932 + }, + { + "epoch": 1.945779977242762, + "grad_norm": 7.699823689609667, + "learning_rate": 9.593443240484002e-09, + "loss": 0.6464, + "step": 26933 + }, + { + "epoch": 1.9458522224429715, + "grad_norm": 5.6553205706516705, + "learning_rate": 9.567860011598928e-09, + "loss": 0.53, + "step": 26934 + }, + { + "epoch": 1.9459244676431808, + "grad_norm": 8.858160672399597, + "learning_rate": 9.542310874796346e-09, + "loss": 0.5883, + "step": 26935 + }, + { + "epoch": 1.9459967128433906, + "grad_norm": 5.860540248717343, + "learning_rate": 9.516795830425418e-09, + "loss": 0.5117, + "step": 26936 + }, + { + "epoch": 1.9460689580436, + "grad_norm": 8.38708307676877, + "learning_rate": 9.491314878836144e-09, + "loss": 0.6776, + "step": 26937 + }, + { + "epoch": 1.9461412032438095, + "grad_norm": 7.274685106311894, + "learning_rate": 9.465868020376856e-09, + "loss": 0.5638, + "step": 26938 + }, + { + "epoch": 1.946213448444019, + "grad_norm": 7.764252385922593, + "learning_rate": 9.440455255396163e-09, + "loss": 0.6026, + "step": 26939 + }, + { + "epoch": 1.9462856936442285, + "grad_norm": 7.2903670833143, + "learning_rate": 9.415076584241567e-09, + "loss": 0.566, + "step": 26940 + }, + { + "epoch": 1.946357938844438, + "grad_norm": 7.561272639781313, + "learning_rate": 9.389732007261121e-09, + "loss": 0.5849, + "step": 26941 + }, + { + "epoch": 1.9464301840446474, + "grad_norm": 7.530944564767268, + "learning_rate": 9.364421524801215e-09, + "loss": 0.5769, + "step": 26942 + }, + { + "epoch": 1.9465024292448572, + "grad_norm": 7.534321898467477, + "learning_rate": 9.339145137208516e-09, + "loss": 0.5197, + "step": 26943 + }, + { + "epoch": 1.9465746744450665, + "grad_norm": 7.603033223442953, + "learning_rate": 9.313902844829136e-09, + "loss": 0.5702, + "step": 26944 + }, + { + "epoch": 1.946646919645276, + "grad_norm": 7.206041651655949, + "learning_rate": 9.288694648008357e-09, + "loss": 0.6363, + "step": 26945 + }, + { + "epoch": 1.9467191648454856, + "grad_norm": 6.490252889399563, + "learning_rate": 9.263520547091732e-09, + "loss": 0.6398, + "step": 26946 + }, + { + "epoch": 1.9467914100456951, + "grad_norm": 7.064838933600315, + "learning_rate": 9.23838054242343e-09, + "loss": 0.5933, + "step": 26947 + }, + { + "epoch": 1.9468636552459047, + "grad_norm": 8.062890331666944, + "learning_rate": 9.213274634347624e-09, + "loss": 0.5472, + "step": 26948 + }, + { + "epoch": 1.946935900446114, + "grad_norm": 7.218670733128109, + "learning_rate": 9.188202823208203e-09, + "loss": 0.6327, + "step": 26949 + }, + { + "epoch": 1.9470081456463237, + "grad_norm": 7.363985532880059, + "learning_rate": 9.163165109348227e-09, + "loss": 0.5861, + "step": 26950 + }, + { + "epoch": 1.947080390846533, + "grad_norm": 8.288452701267731, + "learning_rate": 9.138161493110753e-09, + "loss": 0.649, + "step": 26951 + }, + { + "epoch": 1.9471526360467426, + "grad_norm": 5.89369829788265, + "learning_rate": 9.113191974837454e-09, + "loss": 0.5801, + "step": 26952 + }, + { + "epoch": 1.9472248812469521, + "grad_norm": 7.6291797626021145, + "learning_rate": 9.088256554870833e-09, + "loss": 0.6037, + "step": 26953 + }, + { + "epoch": 1.9472971264471617, + "grad_norm": 7.522551138727182, + "learning_rate": 9.063355233552006e-09, + "loss": 0.6019, + "step": 26954 + }, + { + "epoch": 1.9473693716473712, + "grad_norm": 6.55724538514578, + "learning_rate": 9.038488011221535e-09, + "loss": 0.5801, + "step": 26955 + }, + { + "epoch": 1.9474416168475805, + "grad_norm": 7.101903858940282, + "learning_rate": 9.01365488821998e-09, + "loss": 0.6119, + "step": 26956 + }, + { + "epoch": 1.9475138620477903, + "grad_norm": 7.841189753116855, + "learning_rate": 8.988855864887347e-09, + "loss": 0.5863, + "step": 26957 + }, + { + "epoch": 1.9475861072479996, + "grad_norm": 7.393569069564832, + "learning_rate": 8.964090941563363e-09, + "loss": 0.5915, + "step": 26958 + }, + { + "epoch": 1.9476583524482092, + "grad_norm": 7.148290480472439, + "learning_rate": 8.939360118586648e-09, + "loss": 0.5447, + "step": 26959 + }, + { + "epoch": 1.9477305976484187, + "grad_norm": 6.513136280922363, + "learning_rate": 8.91466339629582e-09, + "loss": 0.5637, + "step": 26960 + }, + { + "epoch": 1.9478028428486283, + "grad_norm": 9.239944353472776, + "learning_rate": 8.890000775029218e-09, + "loss": 0.5795, + "step": 26961 + }, + { + "epoch": 1.9478750880488378, + "grad_norm": 7.449582071550825, + "learning_rate": 8.86537225512435e-09, + "loss": 0.6193, + "step": 26962 + }, + { + "epoch": 1.9479473332490471, + "grad_norm": 7.980438396678082, + "learning_rate": 8.840777836918169e-09, + "loss": 0.6124, + "step": 26963 + }, + { + "epoch": 1.9480195784492569, + "grad_norm": 8.375492366153066, + "learning_rate": 8.816217520747628e-09, + "loss": 0.54, + "step": 26964 + }, + { + "epoch": 1.9480918236494662, + "grad_norm": 8.72551549574602, + "learning_rate": 8.791691306948568e-09, + "loss": 0.6296, + "step": 26965 + }, + { + "epoch": 1.9481640688496757, + "grad_norm": 7.575010434385862, + "learning_rate": 8.767199195857113e-09, + "loss": 0.5813, + "step": 26966 + }, + { + "epoch": 1.9482363140498853, + "grad_norm": 7.186297241416624, + "learning_rate": 8.742741187808267e-09, + "loss": 0.566, + "step": 26967 + }, + { + "epoch": 1.9483085592500948, + "grad_norm": 8.019398535342432, + "learning_rate": 8.718317283136768e-09, + "loss": 0.7046, + "step": 26968 + }, + { + "epoch": 1.9483808044503044, + "grad_norm": 6.976460523383341, + "learning_rate": 8.693927482177623e-09, + "loss": 0.6149, + "step": 26969 + }, + { + "epoch": 1.9484530496505137, + "grad_norm": 8.451679437540292, + "learning_rate": 8.669571785263897e-09, + "loss": 0.536, + "step": 26970 + }, + { + "epoch": 1.9485252948507235, + "grad_norm": 7.9931482537450576, + "learning_rate": 8.645250192729494e-09, + "loss": 0.621, + "step": 26971 + }, + { + "epoch": 1.9485975400509328, + "grad_norm": 7.047659887345254, + "learning_rate": 8.620962704907199e-09, + "loss": 0.596, + "step": 26972 + }, + { + "epoch": 1.9486697852511425, + "grad_norm": 7.941109625009342, + "learning_rate": 8.59670932212925e-09, + "loss": 0.5835, + "step": 26973 + }, + { + "epoch": 1.9487420304513519, + "grad_norm": 6.232606412282881, + "learning_rate": 8.572490044728155e-09, + "loss": 0.5562, + "step": 26974 + }, + { + "epoch": 1.9488142756515614, + "grad_norm": 6.372236962020989, + "learning_rate": 8.548304873035318e-09, + "loss": 0.5949, + "step": 26975 + }, + { + "epoch": 1.948886520851771, + "grad_norm": 7.353051062267717, + "learning_rate": 8.524153807381586e-09, + "loss": 0.5648, + "step": 26976 + }, + { + "epoch": 1.9489587660519803, + "grad_norm": 7.065659078105015, + "learning_rate": 8.500036848097249e-09, + "loss": 0.5129, + "step": 26977 + }, + { + "epoch": 1.94903101125219, + "grad_norm": 6.698681191989515, + "learning_rate": 8.475953995513431e-09, + "loss": 0.656, + "step": 26978 + }, + { + "epoch": 1.9491032564523993, + "grad_norm": 8.240273030026069, + "learning_rate": 8.451905249959035e-09, + "loss": 0.5856, + "step": 26979 + }, + { + "epoch": 1.949175501652609, + "grad_norm": 6.022700916579405, + "learning_rate": 8.427890611763523e-09, + "loss": 0.5384, + "step": 26980 + }, + { + "epoch": 1.9492477468528184, + "grad_norm": 8.50790755513933, + "learning_rate": 8.403910081255794e-09, + "loss": 0.5815, + "step": 26981 + }, + { + "epoch": 1.949319992053028, + "grad_norm": 7.022417955581332, + "learning_rate": 8.379963658763646e-09, + "loss": 0.5622, + "step": 26982 + }, + { + "epoch": 1.9493922372532375, + "grad_norm": 6.606057779769144, + "learning_rate": 8.356051344615423e-09, + "loss": 0.4744, + "step": 26983 + }, + { + "epoch": 1.9494644824534468, + "grad_norm": 8.552102117173215, + "learning_rate": 8.332173139138089e-09, + "loss": 0.6295, + "step": 26984 + }, + { + "epoch": 1.9495367276536566, + "grad_norm": 6.141260167631168, + "learning_rate": 8.308329042658602e-09, + "loss": 0.6432, + "step": 26985 + }, + { + "epoch": 1.949608972853866, + "grad_norm": 6.986454801234834, + "learning_rate": 8.28451905550337e-09, + "loss": 0.5721, + "step": 26986 + }, + { + "epoch": 1.9496812180540757, + "grad_norm": 8.83585590355229, + "learning_rate": 8.26074317799852e-09, + "loss": 0.5645, + "step": 26987 + }, + { + "epoch": 1.949753463254285, + "grad_norm": 7.938993058241618, + "learning_rate": 8.237001410469347e-09, + "loss": 0.5511, + "step": 26988 + }, + { + "epoch": 1.9498257084544945, + "grad_norm": 7.7836988647800425, + "learning_rate": 8.213293753241147e-09, + "loss": 0.5602, + "step": 26989 + }, + { + "epoch": 1.949897953654704, + "grad_norm": 7.832597440959106, + "learning_rate": 8.189620206637827e-09, + "loss": 0.627, + "step": 26990 + }, + { + "epoch": 1.9499701988549136, + "grad_norm": 8.177345093508059, + "learning_rate": 8.16598077098385e-09, + "loss": 0.6133, + "step": 26991 + }, + { + "epoch": 1.9500424440551232, + "grad_norm": 7.666819722955969, + "learning_rate": 8.142375446603124e-09, + "loss": 0.6132, + "step": 26992 + }, + { + "epoch": 1.9501146892553325, + "grad_norm": 8.721335407159827, + "learning_rate": 8.118804233818168e-09, + "loss": 0.64, + "step": 26993 + }, + { + "epoch": 1.9501869344555423, + "grad_norm": 8.221235124372422, + "learning_rate": 8.095267132952056e-09, + "loss": 0.5906, + "step": 26994 + }, + { + "epoch": 1.9502591796557516, + "grad_norm": 7.555754119782057, + "learning_rate": 8.071764144327033e-09, + "loss": 0.5611, + "step": 26995 + }, + { + "epoch": 1.9503314248559611, + "grad_norm": 8.049546829641882, + "learning_rate": 8.048295268264506e-09, + "loss": 0.581, + "step": 26996 + }, + { + "epoch": 1.9504036700561707, + "grad_norm": 7.40653121993785, + "learning_rate": 8.024860505086162e-09, + "loss": 0.5483, + "step": 26997 + }, + { + "epoch": 1.9504759152563802, + "grad_norm": 7.524373440939182, + "learning_rate": 8.001459855112304e-09, + "loss": 0.6133, + "step": 26998 + }, + { + "epoch": 1.9505481604565897, + "grad_norm": 6.505809828410103, + "learning_rate": 7.978093318663782e-09, + "loss": 0.6811, + "step": 26999 + }, + { + "epoch": 1.950620405656799, + "grad_norm": 7.141919575875346, + "learning_rate": 7.954760896060065e-09, + "loss": 0.5112, + "step": 27000 + }, + { + "epoch": 1.9506926508570088, + "grad_norm": 7.58969799300341, + "learning_rate": 7.931462587620897e-09, + "loss": 0.5318, + "step": 27001 + }, + { + "epoch": 1.9507648960572181, + "grad_norm": 8.558824627476968, + "learning_rate": 7.908198393664912e-09, + "loss": 0.621, + "step": 27002 + }, + { + "epoch": 1.9508371412574277, + "grad_norm": 6.623615624079259, + "learning_rate": 7.884968314510744e-09, + "loss": 0.5734, + "step": 27003 + }, + { + "epoch": 1.9509093864576372, + "grad_norm": 9.917174083219221, + "learning_rate": 7.861772350476472e-09, + "loss": 0.5991, + "step": 27004 + }, + { + "epoch": 1.9509816316578468, + "grad_norm": 8.139402330656285, + "learning_rate": 7.838610501879341e-09, + "loss": 0.5768, + "step": 27005 + }, + { + "epoch": 1.9510538768580563, + "grad_norm": 6.410955719778978, + "learning_rate": 7.815482769036597e-09, + "loss": 0.5793, + "step": 27006 + }, + { + "epoch": 1.9511261220582656, + "grad_norm": 7.060401807378998, + "learning_rate": 7.792389152264933e-09, + "loss": 0.5768, + "step": 27007 + }, + { + "epoch": 1.9511983672584754, + "grad_norm": 7.921924598433244, + "learning_rate": 7.769329651880486e-09, + "loss": 0.6178, + "step": 27008 + }, + { + "epoch": 1.9512706124586847, + "grad_norm": 7.830828354087557, + "learning_rate": 7.746304268198556e-09, + "loss": 0.6028, + "step": 27009 + }, + { + "epoch": 1.9513428576588943, + "grad_norm": 6.776194573411282, + "learning_rate": 7.723313001534727e-09, + "loss": 0.6188, + "step": 27010 + }, + { + "epoch": 1.9514151028591038, + "grad_norm": 7.891308805843899, + "learning_rate": 7.700355852203744e-09, + "loss": 0.6382, + "step": 27011 + }, + { + "epoch": 1.9514873480593133, + "grad_norm": 7.296656211170544, + "learning_rate": 7.677432820519526e-09, + "loss": 0.6871, + "step": 27012 + }, + { + "epoch": 1.9515595932595229, + "grad_norm": 7.411564332727221, + "learning_rate": 7.654543906796263e-09, + "loss": 0.6317, + "step": 27013 + }, + { + "epoch": 1.9516318384597322, + "grad_norm": 7.569527631607789, + "learning_rate": 7.63168911134704e-09, + "loss": 0.5809, + "step": 27014 + }, + { + "epoch": 1.951704083659942, + "grad_norm": 8.198122079320447, + "learning_rate": 7.608868434484662e-09, + "loss": 0.6631, + "step": 27015 + }, + { + "epoch": 1.9517763288601513, + "grad_norm": 7.6138008296599455, + "learning_rate": 7.586081876521933e-09, + "loss": 0.5748, + "step": 27016 + }, + { + "epoch": 1.9518485740603608, + "grad_norm": 7.037670408526726, + "learning_rate": 7.563329437770273e-09, + "loss": 0.6162, + "step": 27017 + }, + { + "epoch": 1.9519208192605704, + "grad_norm": 8.837446682565474, + "learning_rate": 7.540611118541652e-09, + "loss": 0.6402, + "step": 27018 + }, + { + "epoch": 1.95199306446078, + "grad_norm": 8.135516124968895, + "learning_rate": 7.517926919146379e-09, + "loss": 0.5974, + "step": 27019 + }, + { + "epoch": 1.9520653096609895, + "grad_norm": 6.038379625646103, + "learning_rate": 7.495276839895593e-09, + "loss": 0.5715, + "step": 27020 + }, + { + "epoch": 1.9521375548611988, + "grad_norm": 8.710524093329216, + "learning_rate": 7.472660881099325e-09, + "loss": 0.624, + "step": 27021 + }, + { + "epoch": 1.9522098000614085, + "grad_norm": 8.269684398676345, + "learning_rate": 7.450079043067049e-09, + "loss": 0.6099, + "step": 27022 + }, + { + "epoch": 1.9522820452616179, + "grad_norm": 8.202858068573665, + "learning_rate": 7.4275313261076845e-09, + "loss": 0.5828, + "step": 27023 + }, + { + "epoch": 1.9523542904618274, + "grad_norm": 7.1392185153223515, + "learning_rate": 7.405017730529873e-09, + "loss": 0.6167, + "step": 27024 + }, + { + "epoch": 1.952426535662037, + "grad_norm": 9.274619665856335, + "learning_rate": 7.382538256642258e-09, + "loss": 0.6691, + "step": 27025 + }, + { + "epoch": 1.9524987808622465, + "grad_norm": 7.3860403926518545, + "learning_rate": 7.360092904752092e-09, + "loss": 0.5898, + "step": 27026 + }, + { + "epoch": 1.952571026062456, + "grad_norm": 6.823568329496998, + "learning_rate": 7.3376816751671856e-09, + "loss": 0.576, + "step": 27027 + }, + { + "epoch": 1.9526432712626653, + "grad_norm": 7.261313457372619, + "learning_rate": 7.315304568193682e-09, + "loss": 0.5306, + "step": 27028 + }, + { + "epoch": 1.9527155164628751, + "grad_norm": 7.202823657437181, + "learning_rate": 7.29296158413828e-09, + "loss": 0.6172, + "step": 27029 + }, + { + "epoch": 1.9527877616630844, + "grad_norm": 7.864885889427191, + "learning_rate": 7.270652723306848e-09, + "loss": 0.5706, + "step": 27030 + }, + { + "epoch": 1.952860006863294, + "grad_norm": 6.937780151209295, + "learning_rate": 7.248377986004696e-09, + "loss": 0.5956, + "step": 27031 + }, + { + "epoch": 1.9529322520635035, + "grad_norm": 6.570547346891592, + "learning_rate": 7.226137372536856e-09, + "loss": 0.5611, + "step": 27032 + }, + { + "epoch": 1.953004497263713, + "grad_norm": 7.542285226426244, + "learning_rate": 7.203930883207533e-09, + "loss": 0.6369, + "step": 27033 + }, + { + "epoch": 1.9530767424639226, + "grad_norm": 6.764968664036118, + "learning_rate": 7.181758518320647e-09, + "loss": 0.6171, + "step": 27034 + }, + { + "epoch": 1.953148987664132, + "grad_norm": 6.924431666660449, + "learning_rate": 7.159620278180401e-09, + "loss": 0.5915, + "step": 27035 + }, + { + "epoch": 1.9532212328643417, + "grad_norm": 8.044184736880096, + "learning_rate": 7.137516163089053e-09, + "loss": 0.6651, + "step": 27036 + }, + { + "epoch": 1.953293478064551, + "grad_norm": 7.656069383631683, + "learning_rate": 7.1154461733496935e-09, + "loss": 0.6148, + "step": 27037 + }, + { + "epoch": 1.9533657232647605, + "grad_norm": 7.818622600880541, + "learning_rate": 7.093410309264026e-09, + "loss": 0.6796, + "step": 27038 + }, + { + "epoch": 1.95343796846497, + "grad_norm": 7.249389359638954, + "learning_rate": 7.071408571134308e-09, + "loss": 0.6079, + "step": 27039 + }, + { + "epoch": 1.9535102136651796, + "grad_norm": 7.255823098832601, + "learning_rate": 7.049440959261134e-09, + "loss": 0.5578, + "step": 27040 + }, + { + "epoch": 1.9535824588653892, + "grad_norm": 7.6367479718910225, + "learning_rate": 7.02750747394565e-09, + "loss": 0.624, + "step": 27041 + }, + { + "epoch": 1.9536547040655985, + "grad_norm": 8.424357258987227, + "learning_rate": 7.005608115487617e-09, + "loss": 0.6214, + "step": 27042 + }, + { + "epoch": 1.9537269492658083, + "grad_norm": 7.011771522259627, + "learning_rate": 6.98374288418735e-09, + "loss": 0.5339, + "step": 27043 + }, + { + "epoch": 1.9537991944660176, + "grad_norm": 7.190700713964704, + "learning_rate": 6.961911780344055e-09, + "loss": 0.6582, + "step": 27044 + }, + { + "epoch": 1.9538714396662273, + "grad_norm": 6.637423975737229, + "learning_rate": 6.940114804256104e-09, + "loss": 0.5792, + "step": 27045 + }, + { + "epoch": 1.9539436848664367, + "grad_norm": 7.619147384081211, + "learning_rate": 6.918351956222702e-09, + "loss": 0.6092, + "step": 27046 + }, + { + "epoch": 1.9540159300666462, + "grad_norm": 7.673471941100676, + "learning_rate": 6.896623236540834e-09, + "loss": 0.6447, + "step": 27047 + }, + { + "epoch": 1.9540881752668557, + "grad_norm": 6.8536422513416175, + "learning_rate": 6.874928645508872e-09, + "loss": 0.517, + "step": 27048 + }, + { + "epoch": 1.954160420467065, + "grad_norm": 7.0956724813541445, + "learning_rate": 6.853268183422968e-09, + "loss": 0.6195, + "step": 27049 + }, + { + "epoch": 1.9542326656672748, + "grad_norm": 6.698503799768839, + "learning_rate": 6.831641850580384e-09, + "loss": 0.5605, + "step": 27050 + }, + { + "epoch": 1.9543049108674841, + "grad_norm": 6.99827118369579, + "learning_rate": 6.8100496472764396e-09, + "loss": 0.6236, + "step": 27051 + }, + { + "epoch": 1.954377156067694, + "grad_norm": 7.529123093000973, + "learning_rate": 6.7884915738072875e-09, + "loss": 0.6035, + "step": 27052 + }, + { + "epoch": 1.9544494012679032, + "grad_norm": 6.052942351823777, + "learning_rate": 6.7669676304676915e-09, + "loss": 0.522, + "step": 27053 + }, + { + "epoch": 1.9545216464681128, + "grad_norm": 7.851308152363473, + "learning_rate": 6.745477817552693e-09, + "loss": 0.6001, + "step": 27054 + }, + { + "epoch": 1.9545938916683223, + "grad_norm": 7.9885825699588775, + "learning_rate": 6.724022135355945e-09, + "loss": 0.571, + "step": 27055 + }, + { + "epoch": 1.9546661368685316, + "grad_norm": 7.931133450372474, + "learning_rate": 6.702600584171659e-09, + "loss": 0.6292, + "step": 27056 + }, + { + "epoch": 1.9547383820687414, + "grad_norm": 6.798283012386577, + "learning_rate": 6.681213164292655e-09, + "loss": 0.603, + "step": 27057 + }, + { + "epoch": 1.9548106272689507, + "grad_norm": 7.591529552953682, + "learning_rate": 6.659859876012032e-09, + "loss": 0.5716, + "step": 27058 + }, + { + "epoch": 1.9548828724691605, + "grad_norm": 8.493612245364712, + "learning_rate": 6.638540719621778e-09, + "loss": 0.5793, + "step": 27059 + }, + { + "epoch": 1.9549551176693698, + "grad_norm": 6.7061754946022925, + "learning_rate": 6.61725569541416e-09, + "loss": 0.5938, + "step": 27060 + }, + { + "epoch": 1.9550273628695793, + "grad_norm": 7.804806025829859, + "learning_rate": 6.596004803680334e-09, + "loss": 0.6279, + "step": 27061 + }, + { + "epoch": 1.9550996080697889, + "grad_norm": 5.85588502835669, + "learning_rate": 6.5747880447109e-09, + "loss": 0.5532, + "step": 27062 + }, + { + "epoch": 1.9551718532699984, + "grad_norm": 6.600211353963765, + "learning_rate": 6.5536054187970155e-09, + "loss": 0.6323, + "step": 27063 + }, + { + "epoch": 1.955244098470208, + "grad_norm": 8.499907549187023, + "learning_rate": 6.532456926227892e-09, + "loss": 0.6442, + "step": 27064 + }, + { + "epoch": 1.9553163436704173, + "grad_norm": 7.746835123923566, + "learning_rate": 6.511342567293577e-09, + "loss": 0.6173, + "step": 27065 + }, + { + "epoch": 1.955388588870627, + "grad_norm": 6.6448171510360705, + "learning_rate": 6.490262342282727e-09, + "loss": 0.599, + "step": 27066 + }, + { + "epoch": 1.9554608340708364, + "grad_norm": 8.131626185541654, + "learning_rate": 6.469216251484278e-09, + "loss": 0.5608, + "step": 27067 + }, + { + "epoch": 1.955533079271046, + "grad_norm": 7.779193407233084, + "learning_rate": 6.448204295186333e-09, + "loss": 0.5781, + "step": 27068 + }, + { + "epoch": 1.9556053244712555, + "grad_norm": 7.001668186322273, + "learning_rate": 6.427226473675885e-09, + "loss": 0.5991, + "step": 27069 + }, + { + "epoch": 1.955677569671465, + "grad_norm": 6.064311907071568, + "learning_rate": 6.4062827872410364e-09, + "loss": 0.543, + "step": 27070 + }, + { + "epoch": 1.9557498148716745, + "grad_norm": 8.981727276293945, + "learning_rate": 6.385373236167669e-09, + "loss": 0.6349, + "step": 27071 + }, + { + "epoch": 1.9558220600718839, + "grad_norm": 9.108695736091711, + "learning_rate": 6.364497820742499e-09, + "loss": 0.5474, + "step": 27072 + }, + { + "epoch": 1.9558943052720936, + "grad_norm": 8.06524043860563, + "learning_rate": 6.343656541251131e-09, + "loss": 0.6441, + "step": 27073 + }, + { + "epoch": 1.955966550472303, + "grad_norm": 7.526149177883329, + "learning_rate": 6.322849397979169e-09, + "loss": 0.6306, + "step": 27074 + }, + { + "epoch": 1.9560387956725125, + "grad_norm": 6.994685198971776, + "learning_rate": 6.302076391210832e-09, + "loss": 0.6344, + "step": 27075 + }, + { + "epoch": 1.956111040872722, + "grad_norm": 7.286144788829622, + "learning_rate": 6.28133752123089e-09, + "loss": 0.5828, + "step": 27076 + }, + { + "epoch": 1.9561832860729316, + "grad_norm": 8.041605051493093, + "learning_rate": 6.2606327883232846e-09, + "loss": 0.5284, + "step": 27077 + }, + { + "epoch": 1.9562555312731411, + "grad_norm": 6.4249667036922995, + "learning_rate": 6.239962192771398e-09, + "loss": 0.6557, + "step": 27078 + }, + { + "epoch": 1.9563277764733504, + "grad_norm": 6.981478028428206, + "learning_rate": 6.219325734858062e-09, + "loss": 0.6708, + "step": 27079 + }, + { + "epoch": 1.9564000216735602, + "grad_norm": 9.667889440962583, + "learning_rate": 6.198723414866103e-09, + "loss": 0.6017, + "step": 27080 + }, + { + "epoch": 1.9564722668737695, + "grad_norm": 5.773264055137423, + "learning_rate": 6.178155233076966e-09, + "loss": 0.5451, + "step": 27081 + }, + { + "epoch": 1.956544512073979, + "grad_norm": 7.176595360306736, + "learning_rate": 6.157621189772644e-09, + "loss": 0.6856, + "step": 27082 + }, + { + "epoch": 1.9566167572741886, + "grad_norm": 7.8695805384761455, + "learning_rate": 6.1371212852343045e-09, + "loss": 0.5827, + "step": 27083 + }, + { + "epoch": 1.9566890024743981, + "grad_norm": 7.443681215982185, + "learning_rate": 6.116655519742276e-09, + "loss": 0.6449, + "step": 27084 + }, + { + "epoch": 1.9567612476746077, + "grad_norm": 7.589090055620269, + "learning_rate": 6.096223893576891e-09, + "loss": 0.5965, + "step": 27085 + }, + { + "epoch": 1.956833492874817, + "grad_norm": 6.785795736182556, + "learning_rate": 6.075826407017648e-09, + "loss": 0.5512, + "step": 27086 + }, + { + "epoch": 1.9569057380750268, + "grad_norm": 7.904906147370155, + "learning_rate": 6.0554630603440464e-09, + "loss": 0.5648, + "step": 27087 + }, + { + "epoch": 1.956977983275236, + "grad_norm": 7.679964667874761, + "learning_rate": 6.035133853835029e-09, + "loss": 0.549, + "step": 27088 + }, + { + "epoch": 1.9570502284754456, + "grad_norm": 7.339136890524087, + "learning_rate": 6.014838787768151e-09, + "loss": 0.5573, + "step": 27089 + }, + { + "epoch": 1.9571224736756552, + "grad_norm": 8.509742090640511, + "learning_rate": 5.994577862421802e-09, + "loss": 0.6568, + "step": 27090 + }, + { + "epoch": 1.9571947188758647, + "grad_norm": 7.410132054685064, + "learning_rate": 5.97435107807326e-09, + "loss": 0.6071, + "step": 27091 + }, + { + "epoch": 1.9572669640760743, + "grad_norm": 7.4876227927607255, + "learning_rate": 5.954158434999524e-09, + "loss": 0.5426, + "step": 27092 + }, + { + "epoch": 1.9573392092762836, + "grad_norm": 7.087229685531471, + "learning_rate": 5.933999933476764e-09, + "loss": 0.5985, + "step": 27093 + }, + { + "epoch": 1.9574114544764933, + "grad_norm": 7.881537252057266, + "learning_rate": 5.9138755737808695e-09, + "loss": 0.5872, + "step": 27094 + }, + { + "epoch": 1.9574836996767027, + "grad_norm": 10.812040991770003, + "learning_rate": 5.893785356187731e-09, + "loss": 0.605, + "step": 27095 + }, + { + "epoch": 1.9575559448769122, + "grad_norm": 7.3393759829658505, + "learning_rate": 5.8737292809718514e-09, + "loss": 0.5268, + "step": 27096 + }, + { + "epoch": 1.9576281900771217, + "grad_norm": 7.227494925065372, + "learning_rate": 5.853707348408289e-09, + "loss": 0.6195, + "step": 27097 + }, + { + "epoch": 1.9577004352773313, + "grad_norm": 8.91507281262439, + "learning_rate": 5.833719558770712e-09, + "loss": 0.6083, + "step": 27098 + }, + { + "epoch": 1.9577726804775408, + "grad_norm": 7.398713603200068, + "learning_rate": 5.813765912333069e-09, + "loss": 0.6226, + "step": 27099 + }, + { + "epoch": 1.9578449256777501, + "grad_norm": 8.509623297165861, + "learning_rate": 5.793846409368198e-09, + "loss": 0.6076, + "step": 27100 + }, + { + "epoch": 1.95791717087796, + "grad_norm": 6.9488838891355424, + "learning_rate": 5.773961050148935e-09, + "loss": 0.5995, + "step": 27101 + }, + { + "epoch": 1.9579894160781692, + "grad_norm": 6.296880149365916, + "learning_rate": 5.754109834947564e-09, + "loss": 0.6197, + "step": 27102 + }, + { + "epoch": 1.9580616612783788, + "grad_norm": 7.368570143403125, + "learning_rate": 5.734292764036087e-09, + "loss": 0.5433, + "step": 27103 + }, + { + "epoch": 1.9581339064785883, + "grad_norm": 7.864133573559961, + "learning_rate": 5.714509837685122e-09, + "loss": 0.6028, + "step": 27104 + }, + { + "epoch": 1.9582061516787979, + "grad_norm": 7.89746208469217, + "learning_rate": 5.694761056165843e-09, + "loss": 0.5758, + "step": 27105 + }, + { + "epoch": 1.9582783968790074, + "grad_norm": 7.972118189447123, + "learning_rate": 5.675046419748587e-09, + "loss": 0.6073, + "step": 27106 + }, + { + "epoch": 1.9583506420792167, + "grad_norm": 6.891376480334039, + "learning_rate": 5.6553659287034154e-09, + "loss": 0.6417, + "step": 27107 + }, + { + "epoch": 1.9584228872794265, + "grad_norm": 8.471348662931145, + "learning_rate": 5.635719583299282e-09, + "loss": 0.5907, + "step": 27108 + }, + { + "epoch": 1.9584951324796358, + "grad_norm": 8.661844917397614, + "learning_rate": 5.616107383805691e-09, + "loss": 0.6007, + "step": 27109 + }, + { + "epoch": 1.9585673776798453, + "grad_norm": 7.818424999334255, + "learning_rate": 5.5965293304904855e-09, + "loss": 0.6236, + "step": 27110 + }, + { + "epoch": 1.9586396228800549, + "grad_norm": 7.240828138588041, + "learning_rate": 5.576985423622339e-09, + "loss": 0.5337, + "step": 27111 + }, + { + "epoch": 1.9587118680802644, + "grad_norm": 7.2669899591513305, + "learning_rate": 5.557475663468259e-09, + "loss": 0.5994, + "step": 27112 + }, + { + "epoch": 1.958784113280474, + "grad_norm": 7.027999465655772, + "learning_rate": 5.538000050295533e-09, + "loss": 0.5824, + "step": 27113 + }, + { + "epoch": 1.9588563584806833, + "grad_norm": 7.511067934056366, + "learning_rate": 5.5185585843708926e-09, + "loss": 0.5697, + "step": 27114 + }, + { + "epoch": 1.958928603680893, + "grad_norm": 6.923883770932949, + "learning_rate": 5.499151265960234e-09, + "loss": 0.5699, + "step": 27115 + }, + { + "epoch": 1.9590008488811024, + "grad_norm": 7.943049375789531, + "learning_rate": 5.4797780953294575e-09, + "loss": 0.5448, + "step": 27116 + }, + { + "epoch": 1.9590730940813121, + "grad_norm": 8.453152076550905, + "learning_rate": 5.460439072743629e-09, + "loss": 0.5836, + "step": 27117 + }, + { + "epoch": 1.9591453392815215, + "grad_norm": 6.521600777637409, + "learning_rate": 5.441134198467535e-09, + "loss": 0.5442, + "step": 27118 + }, + { + "epoch": 1.959217584481731, + "grad_norm": 5.941110256228221, + "learning_rate": 5.421863472765132e-09, + "loss": 0.6315, + "step": 27119 + }, + { + "epoch": 1.9592898296819405, + "grad_norm": 6.786164783639071, + "learning_rate": 5.402626895900653e-09, + "loss": 0.58, + "step": 27120 + }, + { + "epoch": 1.9593620748821499, + "grad_norm": 8.2790114346142, + "learning_rate": 5.383424468137499e-09, + "loss": 0.6147, + "step": 27121 + }, + { + "epoch": 1.9594343200823596, + "grad_norm": 8.099913382361432, + "learning_rate": 5.36425618973796e-09, + "loss": 0.6585, + "step": 27122 + }, + { + "epoch": 1.959506565282569, + "grad_norm": 7.55761283404142, + "learning_rate": 5.3451220609651576e-09, + "loss": 0.6018, + "step": 27123 + }, + { + "epoch": 1.9595788104827787, + "grad_norm": 7.230048515188707, + "learning_rate": 5.326022082080551e-09, + "loss": 0.5193, + "step": 27124 + }, + { + "epoch": 1.959651055682988, + "grad_norm": 8.045840535752484, + "learning_rate": 5.306956253345597e-09, + "loss": 0.6112, + "step": 27125 + }, + { + "epoch": 1.9597233008831976, + "grad_norm": 7.209868809621512, + "learning_rate": 5.287924575021475e-09, + "loss": 0.5348, + "step": 27126 + }, + { + "epoch": 1.9597955460834071, + "grad_norm": 7.660725931774208, + "learning_rate": 5.268927047368533e-09, + "loss": 0.603, + "step": 27127 + }, + { + "epoch": 1.9598677912836164, + "grad_norm": 8.558012071601429, + "learning_rate": 5.249963670647118e-09, + "loss": 0.6738, + "step": 27128 + }, + { + "epoch": 1.9599400364838262, + "grad_norm": 6.821809340059215, + "learning_rate": 5.231034445116467e-09, + "loss": 0.5653, + "step": 27129 + }, + { + "epoch": 1.9600122816840355, + "grad_norm": 7.106144902803395, + "learning_rate": 5.212139371036096e-09, + "loss": 0.5583, + "step": 27130 + }, + { + "epoch": 1.9600845268842453, + "grad_norm": 8.032097797695087, + "learning_rate": 5.193278448664407e-09, + "loss": 0.6686, + "step": 27131 + }, + { + "epoch": 1.9601567720844546, + "grad_norm": 6.871084589515639, + "learning_rate": 5.1744516782595285e-09, + "loss": 0.6443, + "step": 27132 + }, + { + "epoch": 1.9602290172846641, + "grad_norm": 7.6778287762894, + "learning_rate": 5.1556590600793096e-09, + "loss": 0.5417, + "step": 27133 + }, + { + "epoch": 1.9603012624848737, + "grad_norm": 7.602177719515741, + "learning_rate": 5.1369005943810445e-09, + "loss": 0.6198, + "step": 27134 + }, + { + "epoch": 1.9603735076850832, + "grad_norm": 8.762327828668063, + "learning_rate": 5.118176281421472e-09, + "loss": 0.5699, + "step": 27135 + }, + { + "epoch": 1.9604457528852928, + "grad_norm": 7.019070397364886, + "learning_rate": 5.099486121457054e-09, + "loss": 0.6264, + "step": 27136 + }, + { + "epoch": 1.960517998085502, + "grad_norm": 7.133012783418252, + "learning_rate": 5.080830114743418e-09, + "loss": 0.5707, + "step": 27137 + }, + { + "epoch": 1.9605902432857119, + "grad_norm": 6.32684901053436, + "learning_rate": 5.0622082615359166e-09, + "loss": 0.6104, + "step": 27138 + }, + { + "epoch": 1.9606624884859212, + "grad_norm": 8.91267329364528, + "learning_rate": 5.043620562089624e-09, + "loss": 0.616, + "step": 27139 + }, + { + "epoch": 1.9607347336861307, + "grad_norm": 8.142183414590704, + "learning_rate": 5.025067016659058e-09, + "loss": 0.5863, + "step": 27140 + }, + { + "epoch": 1.9608069788863403, + "grad_norm": 7.317071773404864, + "learning_rate": 5.006547625497904e-09, + "loss": 0.5843, + "step": 27141 + }, + { + "epoch": 1.9608792240865498, + "grad_norm": 7.462607695091565, + "learning_rate": 4.988062388860127e-09, + "loss": 0.6244, + "step": 27142 + }, + { + "epoch": 1.9609514692867593, + "grad_norm": 8.505943912826167, + "learning_rate": 4.9696113069985785e-09, + "loss": 0.6143, + "step": 27143 + }, + { + "epoch": 1.9610237144869687, + "grad_norm": 6.842713996124905, + "learning_rate": 4.951194380165558e-09, + "loss": 0.6438, + "step": 27144 + }, + { + "epoch": 1.9610959596871784, + "grad_norm": 8.576879303193147, + "learning_rate": 4.932811608613364e-09, + "loss": 0.5921, + "step": 27145 + }, + { + "epoch": 1.9611682048873877, + "grad_norm": 7.220281529313717, + "learning_rate": 4.914462992594016e-09, + "loss": 0.615, + "step": 27146 + }, + { + "epoch": 1.9612404500875973, + "grad_norm": 8.800726635150113, + "learning_rate": 4.896148532357869e-09, + "loss": 0.642, + "step": 27147 + }, + { + "epoch": 1.9613126952878068, + "grad_norm": 8.973014641705088, + "learning_rate": 4.87786822815639e-09, + "loss": 0.6041, + "step": 27148 + }, + { + "epoch": 1.9613849404880164, + "grad_norm": 7.344408371003642, + "learning_rate": 4.8596220802396565e-09, + "loss": 0.5643, + "step": 27149 + }, + { + "epoch": 1.961457185688226, + "grad_norm": 6.7107902750512665, + "learning_rate": 4.841410088857468e-09, + "loss": 0.6519, + "step": 27150 + }, + { + "epoch": 1.9615294308884352, + "grad_norm": 7.718275047855931, + "learning_rate": 4.823232254258514e-09, + "loss": 0.6505, + "step": 27151 + }, + { + "epoch": 1.961601676088645, + "grad_norm": 6.903745171115118, + "learning_rate": 4.8050885766925955e-09, + "loss": 0.6013, + "step": 27152 + }, + { + "epoch": 1.9616739212888543, + "grad_norm": 10.570320800417104, + "learning_rate": 4.78697905640757e-09, + "loss": 0.6065, + "step": 27153 + }, + { + "epoch": 1.9617461664890639, + "grad_norm": 7.853467294600728, + "learning_rate": 4.768903693651017e-09, + "loss": 0.5866, + "step": 27154 + }, + { + "epoch": 1.9618184116892734, + "grad_norm": 5.494487948106689, + "learning_rate": 4.750862488671071e-09, + "loss": 0.5673, + "step": 27155 + }, + { + "epoch": 1.961890656889483, + "grad_norm": 6.247730606054496, + "learning_rate": 4.732855441714202e-09, + "loss": 0.6078, + "step": 27156 + }, + { + "epoch": 1.9619629020896925, + "grad_norm": 7.778233445156024, + "learning_rate": 4.714882553027433e-09, + "loss": 0.6392, + "step": 27157 + }, + { + "epoch": 1.9620351472899018, + "grad_norm": 7.985504846826618, + "learning_rate": 4.696943822856126e-09, + "loss": 0.5854, + "step": 27158 + }, + { + "epoch": 1.9621073924901116, + "grad_norm": 8.995268107601724, + "learning_rate": 4.679039251446193e-09, + "loss": 0.568, + "step": 27159 + }, + { + "epoch": 1.9621796376903209, + "grad_norm": 7.712666821028087, + "learning_rate": 4.661168839042996e-09, + "loss": 0.6435, + "step": 27160 + }, + { + "epoch": 1.9622518828905304, + "grad_norm": 7.137308297864786, + "learning_rate": 4.643332585890503e-09, + "loss": 0.5522, + "step": 27161 + }, + { + "epoch": 1.96232412809074, + "grad_norm": 7.181921079137733, + "learning_rate": 4.6255304922335206e-09, + "loss": 0.5869, + "step": 27162 + }, + { + "epoch": 1.9623963732909495, + "grad_norm": 7.832527795590717, + "learning_rate": 4.607762558315465e-09, + "loss": 0.5303, + "step": 27163 + }, + { + "epoch": 1.962468618491159, + "grad_norm": 5.567036354139476, + "learning_rate": 4.590028784379752e-09, + "loss": 0.6072, + "step": 27164 + }, + { + "epoch": 1.9625408636913684, + "grad_norm": 8.328142405880028, + "learning_rate": 4.572329170668688e-09, + "loss": 0.5802, + "step": 27165 + }, + { + "epoch": 1.9626131088915781, + "grad_norm": 8.05892841570594, + "learning_rate": 4.554663717424856e-09, + "loss": 0.6229, + "step": 27166 + }, + { + "epoch": 1.9626853540917875, + "grad_norm": 6.841957170991901, + "learning_rate": 4.5370324248902864e-09, + "loss": 0.6019, + "step": 27167 + }, + { + "epoch": 1.962757599291997, + "grad_norm": 8.56483639215113, + "learning_rate": 4.519435293306174e-09, + "loss": 0.596, + "step": 27168 + }, + { + "epoch": 1.9628298444922065, + "grad_norm": 7.498812772243621, + "learning_rate": 4.501872322913159e-09, + "loss": 0.6224, + "step": 27169 + }, + { + "epoch": 1.962902089692416, + "grad_norm": 6.10547187399952, + "learning_rate": 4.4843435139518835e-09, + "loss": 0.4969, + "step": 27170 + }, + { + "epoch": 1.9629743348926256, + "grad_norm": 7.114971397916092, + "learning_rate": 4.466848866662432e-09, + "loss": 0.6462, + "step": 27171 + }, + { + "epoch": 1.963046580092835, + "grad_norm": 7.4953705805090225, + "learning_rate": 4.449388381284337e-09, + "loss": 0.6656, + "step": 27172 + }, + { + "epoch": 1.9631188252930447, + "grad_norm": 9.221453771274989, + "learning_rate": 4.431962058056016e-09, + "loss": 0.7036, + "step": 27173 + }, + { + "epoch": 1.963191070493254, + "grad_norm": 7.247340273509935, + "learning_rate": 4.414569897216725e-09, + "loss": 0.5751, + "step": 27174 + }, + { + "epoch": 1.9632633156934636, + "grad_norm": 7.362487005406879, + "learning_rate": 4.397211899004328e-09, + "loss": 0.6359, + "step": 27175 + }, + { + "epoch": 1.9633355608936731, + "grad_norm": 8.975550608627158, + "learning_rate": 4.379888063656135e-09, + "loss": 0.568, + "step": 27176 + }, + { + "epoch": 1.9634078060938827, + "grad_norm": 7.812310056284718, + "learning_rate": 4.362598391409734e-09, + "loss": 0.5548, + "step": 27177 + }, + { + "epoch": 1.9634800512940922, + "grad_norm": 8.35725504295683, + "learning_rate": 4.345342882501602e-09, + "loss": 0.5982, + "step": 27178 + }, + { + "epoch": 1.9635522964943015, + "grad_norm": 6.991742304262847, + "learning_rate": 4.3281215371679395e-09, + "loss": 0.6352, + "step": 27179 + }, + { + "epoch": 1.9636245416945113, + "grad_norm": 6.8342196502262835, + "learning_rate": 4.310934355644669e-09, + "loss": 0.5579, + "step": 27180 + }, + { + "epoch": 1.9636967868947206, + "grad_norm": 7.886844749256635, + "learning_rate": 4.29378133816688e-09, + "loss": 0.6198, + "step": 27181 + }, + { + "epoch": 1.9637690320949301, + "grad_norm": 7.896409495996696, + "learning_rate": 4.276662484969385e-09, + "loss": 0.5333, + "step": 27182 + }, + { + "epoch": 1.9638412772951397, + "grad_norm": 7.425147561812982, + "learning_rate": 4.2595777962864405e-09, + "loss": 0.5628, + "step": 27183 + }, + { + "epoch": 1.9639135224953492, + "grad_norm": 7.088089486825218, + "learning_rate": 4.242527272352304e-09, + "loss": 0.5782, + "step": 27184 + }, + { + "epoch": 1.9639857676955588, + "grad_norm": 7.739817174679318, + "learning_rate": 4.225510913400121e-09, + "loss": 0.5981, + "step": 27185 + }, + { + "epoch": 1.964058012895768, + "grad_norm": 7.455501793207845, + "learning_rate": 4.208528719662486e-09, + "loss": 0.5261, + "step": 27186 + }, + { + "epoch": 1.9641302580959779, + "grad_norm": 7.551556700837056, + "learning_rate": 4.191580691372543e-09, + "loss": 0.5972, + "step": 27187 + }, + { + "epoch": 1.9642025032961872, + "grad_norm": 7.106903652703377, + "learning_rate": 4.174666828762053e-09, + "loss": 0.5874, + "step": 27188 + }, + { + "epoch": 1.9642747484963967, + "grad_norm": 6.731660300158388, + "learning_rate": 4.1577871320624965e-09, + "loss": 0.6207, + "step": 27189 + }, + { + "epoch": 1.9643469936966063, + "grad_norm": 7.859141776717909, + "learning_rate": 4.1409416015048e-09, + "loss": 0.6445, + "step": 27190 + }, + { + "epoch": 1.9644192388968158, + "grad_norm": 5.869157848872398, + "learning_rate": 4.124130237319613e-09, + "loss": 0.532, + "step": 27191 + }, + { + "epoch": 1.9644914840970253, + "grad_norm": 8.898457648545012, + "learning_rate": 4.107353039737305e-09, + "loss": 0.6086, + "step": 27192 + }, + { + "epoch": 1.9645637292972347, + "grad_norm": 7.393257430397572, + "learning_rate": 4.090610008987416e-09, + "loss": 0.5588, + "step": 27193 + }, + { + "epoch": 1.9646359744974444, + "grad_norm": 6.854365507293024, + "learning_rate": 4.073901145299208e-09, + "loss": 0.5258, + "step": 27194 + }, + { + "epoch": 1.9647082196976537, + "grad_norm": 8.362676407075258, + "learning_rate": 4.0572264489011085e-09, + "loss": 0.5894, + "step": 27195 + }, + { + "epoch": 1.9647804648978635, + "grad_norm": 7.877813411907649, + "learning_rate": 4.040585920021544e-09, + "loss": 0.6327, + "step": 27196 + }, + { + "epoch": 1.9648527100980728, + "grad_norm": 6.309577189696956, + "learning_rate": 4.023979558888669e-09, + "loss": 0.6068, + "step": 27197 + }, + { + "epoch": 1.9649249552982824, + "grad_norm": 6.67317419334287, + "learning_rate": 4.007407365729244e-09, + "loss": 0.5749, + "step": 27198 + }, + { + "epoch": 1.964997200498492, + "grad_norm": 7.991903738165734, + "learning_rate": 3.990869340770586e-09, + "loss": 0.5667, + "step": 27199 + }, + { + "epoch": 1.9650694456987012, + "grad_norm": 8.869998435930976, + "learning_rate": 3.974365484238907e-09, + "loss": 0.559, + "step": 27200 + }, + { + "epoch": 1.965141690898911, + "grad_norm": 6.87842072436522, + "learning_rate": 3.957895796360134e-09, + "loss": 0.5546, + "step": 27201 + }, + { + "epoch": 1.9652139360991203, + "grad_norm": 6.642262550850696, + "learning_rate": 3.941460277359643e-09, + "loss": 0.5821, + "step": 27202 + }, + { + "epoch": 1.96528618129933, + "grad_norm": 7.544793957626484, + "learning_rate": 3.925058927462533e-09, + "loss": 0.6157, + "step": 27203 + }, + { + "epoch": 1.9653584264995394, + "grad_norm": 8.007145075082502, + "learning_rate": 3.908691746893067e-09, + "loss": 0.5726, + "step": 27204 + }, + { + "epoch": 1.965430671699749, + "grad_norm": 7.140586802841628, + "learning_rate": 3.89235873587579e-09, + "loss": 0.5475, + "step": 27205 + }, + { + "epoch": 1.9655029168999585, + "grad_norm": 6.3279158192664555, + "learning_rate": 3.876059894633855e-09, + "loss": 0.6379, + "step": 27206 + }, + { + "epoch": 1.9655751621001678, + "grad_norm": 7.338742111064067, + "learning_rate": 3.8597952233906945e-09, + "loss": 0.6285, + "step": 27207 + }, + { + "epoch": 1.9656474073003776, + "grad_norm": 7.564245857927274, + "learning_rate": 3.843564722368909e-09, + "loss": 0.5797, + "step": 27208 + }, + { + "epoch": 1.965719652500587, + "grad_norm": 6.704835473416625, + "learning_rate": 3.827368391790265e-09, + "loss": 0.5579, + "step": 27209 + }, + { + "epoch": 1.9657918977007967, + "grad_norm": 7.9558259166239305, + "learning_rate": 3.811206231877085e-09, + "loss": 0.626, + "step": 27210 + }, + { + "epoch": 1.965864142901006, + "grad_norm": 6.986166773280039, + "learning_rate": 3.795078242850026e-09, + "loss": 0.6578, + "step": 27211 + }, + { + "epoch": 1.9659363881012155, + "grad_norm": 8.196321118075199, + "learning_rate": 3.778984424930298e-09, + "loss": 0.6901, + "step": 27212 + }, + { + "epoch": 1.966008633301425, + "grad_norm": 7.812843010048148, + "learning_rate": 3.762924778338284e-09, + "loss": 0.5526, + "step": 27213 + }, + { + "epoch": 1.9660808785016346, + "grad_norm": 7.088240714952157, + "learning_rate": 3.74689930329325e-09, + "loss": 0.6213, + "step": 27214 + }, + { + "epoch": 1.9661531237018441, + "grad_norm": 6.294780070346636, + "learning_rate": 3.730908000015299e-09, + "loss": 0.5633, + "step": 27215 + }, + { + "epoch": 1.9662253689020535, + "grad_norm": 7.071849204980159, + "learning_rate": 3.7149508687228665e-09, + "loss": 0.6209, + "step": 27216 + }, + { + "epoch": 1.9662976141022632, + "grad_norm": 8.289092413500454, + "learning_rate": 3.6990279096343897e-09, + "loss": 0.6192, + "step": 27217 + }, + { + "epoch": 1.9663698593024725, + "grad_norm": 8.02963442453635, + "learning_rate": 3.6831391229683045e-09, + "loss": 0.5903, + "step": 27218 + }, + { + "epoch": 1.966442104502682, + "grad_norm": 7.553772990311541, + "learning_rate": 3.6672845089413823e-09, + "loss": 0.5668, + "step": 27219 + }, + { + "epoch": 1.9665143497028916, + "grad_norm": 6.9420107362005306, + "learning_rate": 3.6514640677712266e-09, + "loss": 0.5898, + "step": 27220 + }, + { + "epoch": 1.9665865949031012, + "grad_norm": 6.497114274544074, + "learning_rate": 3.635677799674331e-09, + "loss": 0.6303, + "step": 27221 + }, + { + "epoch": 1.9666588401033107, + "grad_norm": 9.286355820177542, + "learning_rate": 3.619925704866634e-09, + "loss": 0.6517, + "step": 27222 + }, + { + "epoch": 1.96673108530352, + "grad_norm": 7.4167257013721715, + "learning_rate": 3.6042077835637955e-09, + "loss": 0.5568, + "step": 27223 + }, + { + "epoch": 1.9668033305037298, + "grad_norm": 6.640163127320132, + "learning_rate": 3.5885240359809227e-09, + "loss": 0.5677, + "step": 27224 + }, + { + "epoch": 1.9668755757039391, + "grad_norm": 7.03409962871368, + "learning_rate": 3.5728744623331203e-09, + "loss": 0.6065, + "step": 27225 + }, + { + "epoch": 1.9669478209041487, + "grad_norm": 7.503953273294231, + "learning_rate": 3.557259062834106e-09, + "loss": 0.6683, + "step": 27226 + }, + { + "epoch": 1.9670200661043582, + "grad_norm": 6.494427566499137, + "learning_rate": 3.5416778376978765e-09, + "loss": 0.5189, + "step": 27227 + }, + { + "epoch": 1.9670923113045677, + "grad_norm": 7.919247042766455, + "learning_rate": 3.5261307871375937e-09, + "loss": 0.6103, + "step": 27228 + }, + { + "epoch": 1.9671645565047773, + "grad_norm": 8.072394871409657, + "learning_rate": 3.510617911366421e-09, + "loss": 0.5801, + "step": 27229 + }, + { + "epoch": 1.9672368017049866, + "grad_norm": 6.821958642535074, + "learning_rate": 3.49513921059641e-09, + "loss": 0.6012, + "step": 27230 + }, + { + "epoch": 1.9673090469051964, + "grad_norm": 7.549360225425226, + "learning_rate": 3.479694685039614e-09, + "loss": 0.5845, + "step": 27231 + }, + { + "epoch": 1.9673812921054057, + "grad_norm": 7.401813707523898, + "learning_rate": 3.4642843349069756e-09, + "loss": 0.6716, + "step": 27232 + }, + { + "epoch": 1.9674535373056152, + "grad_norm": 7.98065445245576, + "learning_rate": 3.448908160410269e-09, + "loss": 0.6353, + "step": 27233 + }, + { + "epoch": 1.9675257825058248, + "grad_norm": 8.054671397595447, + "learning_rate": 3.4335661617593276e-09, + "loss": 0.6403, + "step": 27234 + }, + { + "epoch": 1.9675980277060343, + "grad_norm": 7.236516845548568, + "learning_rate": 3.4182583391645374e-09, + "loss": 0.6036, + "step": 27235 + }, + { + "epoch": 1.9676702729062439, + "grad_norm": 7.568214968380404, + "learning_rate": 3.4029846928354537e-09, + "loss": 0.6359, + "step": 27236 + }, + { + "epoch": 1.9677425181064532, + "grad_norm": 7.0025867042498735, + "learning_rate": 3.3877452229807985e-09, + "loss": 0.6267, + "step": 27237 + }, + { + "epoch": 1.967814763306663, + "grad_norm": 8.165486568382653, + "learning_rate": 3.3725399298095705e-09, + "loss": 0.5908, + "step": 27238 + }, + { + "epoch": 1.9678870085068723, + "grad_norm": 7.051614660319078, + "learning_rate": 3.35736881352966e-09, + "loss": 0.5144, + "step": 27239 + }, + { + "epoch": 1.9679592537070818, + "grad_norm": 7.3635094564271855, + "learning_rate": 3.3422318743489556e-09, + "loss": 0.5751, + "step": 27240 + }, + { + "epoch": 1.9680314989072913, + "grad_norm": 6.916371932893092, + "learning_rate": 3.327129112474237e-09, + "loss": 0.5773, + "step": 27241 + }, + { + "epoch": 1.9681037441075009, + "grad_norm": 7.117467278049233, + "learning_rate": 3.312060528112837e-09, + "loss": 0.5577, + "step": 27242 + }, + { + "epoch": 1.9681759893077104, + "grad_norm": 6.554590890602102, + "learning_rate": 3.297026121470981e-09, + "loss": 0.5741, + "step": 27243 + }, + { + "epoch": 1.9682482345079197, + "grad_norm": 6.834602269151456, + "learning_rate": 3.282025892754059e-09, + "loss": 0.5332, + "step": 27244 + }, + { + "epoch": 1.9683204797081295, + "grad_norm": 6.722708454184291, + "learning_rate": 3.2670598421674636e-09, + "loss": 0.5389, + "step": 27245 + }, + { + "epoch": 1.9683927249083388, + "grad_norm": 7.706713724531667, + "learning_rate": 3.2521279699165853e-09, + "loss": 0.6339, + "step": 27246 + }, + { + "epoch": 1.9684649701085484, + "grad_norm": 7.901296639566558, + "learning_rate": 3.23723027620515e-09, + "loss": 0.5574, + "step": 27247 + }, + { + "epoch": 1.968537215308758, + "grad_norm": 7.928038958487459, + "learning_rate": 3.2223667612374386e-09, + "loss": 0.6074, + "step": 27248 + }, + { + "epoch": 1.9686094605089675, + "grad_norm": 7.55168526137479, + "learning_rate": 3.207537425217178e-09, + "loss": 0.5283, + "step": 27249 + }, + { + "epoch": 1.968681705709177, + "grad_norm": 8.111809455153564, + "learning_rate": 3.1927422683469824e-09, + "loss": 0.6237, + "step": 27250 + }, + { + "epoch": 1.9687539509093863, + "grad_norm": 7.302861569917302, + "learning_rate": 3.177981290829468e-09, + "loss": 0.5651, + "step": 27251 + }, + { + "epoch": 1.968826196109596, + "grad_norm": 7.811643996073216, + "learning_rate": 3.1632544928666963e-09, + "loss": 0.6203, + "step": 27252 + }, + { + "epoch": 1.9688984413098054, + "grad_norm": 6.901091293060558, + "learning_rate": 3.148561874660172e-09, + "loss": 0.5696, + "step": 27253 + }, + { + "epoch": 1.968970686510015, + "grad_norm": 8.125019249526476, + "learning_rate": 3.1339034364114007e-09, + "loss": 0.5885, + "step": 27254 + }, + { + "epoch": 1.9690429317102245, + "grad_norm": 7.453365042157814, + "learning_rate": 3.1192791783207775e-09, + "loss": 0.6378, + "step": 27255 + }, + { + "epoch": 1.969115176910434, + "grad_norm": 7.853954956547221, + "learning_rate": 3.1046891005884204e-09, + "loss": 0.6544, + "step": 27256 + }, + { + "epoch": 1.9691874221106436, + "grad_norm": 7.678825384357119, + "learning_rate": 3.0901332034141695e-09, + "loss": 0.6024, + "step": 27257 + }, + { + "epoch": 1.969259667310853, + "grad_norm": 8.196282023055627, + "learning_rate": 3.0756114869973097e-09, + "loss": 0.6346, + "step": 27258 + }, + { + "epoch": 1.9693319125110627, + "grad_norm": 7.964499383341435, + "learning_rate": 3.061123951536571e-09, + "loss": 0.6066, + "step": 27259 + }, + { + "epoch": 1.969404157711272, + "grad_norm": 7.07358268856751, + "learning_rate": 3.046670597230128e-09, + "loss": 0.5849, + "step": 27260 + }, + { + "epoch": 1.9694764029114815, + "grad_norm": 6.775176391557547, + "learning_rate": 3.0322514242764335e-09, + "loss": 0.6576, + "step": 27261 + }, + { + "epoch": 1.969548648111691, + "grad_norm": 8.083588219258521, + "learning_rate": 3.0178664328719964e-09, + "loss": 0.569, + "step": 27262 + }, + { + "epoch": 1.9696208933119006, + "grad_norm": 7.9918624498598865, + "learning_rate": 3.003515623214437e-09, + "loss": 0.5876, + "step": 27263 + }, + { + "epoch": 1.9696931385121101, + "grad_norm": 6.732376828885418, + "learning_rate": 2.9891989954999866e-09, + "loss": 0.5725, + "step": 27264 + }, + { + "epoch": 1.9697653837123195, + "grad_norm": 6.565955714485141, + "learning_rate": 2.9749165499243225e-09, + "loss": 0.5355, + "step": 27265 + }, + { + "epoch": 1.9698376289125292, + "grad_norm": 5.8362358864691, + "learning_rate": 2.960668286683399e-09, + "loss": 0.5626, + "step": 27266 + }, + { + "epoch": 1.9699098741127385, + "grad_norm": 6.9115378732902695, + "learning_rate": 2.946454205972338e-09, + "loss": 0.6166, + "step": 27267 + }, + { + "epoch": 1.9699821193129483, + "grad_norm": 6.753988147457478, + "learning_rate": 2.93227430798515e-09, + "loss": 0.5306, + "step": 27268 + }, + { + "epoch": 1.9700543645131576, + "grad_norm": 6.535005064935334, + "learning_rate": 2.9181285929164025e-09, + "loss": 0.5517, + "step": 27269 + }, + { + "epoch": 1.9701266097133672, + "grad_norm": 8.19021305240285, + "learning_rate": 2.904017060959552e-09, + "loss": 0.5854, + "step": 27270 + }, + { + "epoch": 1.9701988549135767, + "grad_norm": 7.681475007352053, + "learning_rate": 2.889939712308054e-09, + "loss": 0.6185, + "step": 27271 + }, + { + "epoch": 1.970271100113786, + "grad_norm": 7.649620455632709, + "learning_rate": 2.8758965471542556e-09, + "loss": 0.5875, + "step": 27272 + }, + { + "epoch": 1.9703433453139958, + "grad_norm": 7.48725469375265, + "learning_rate": 2.861887565690502e-09, + "loss": 0.6234, + "step": 27273 + }, + { + "epoch": 1.9704155905142051, + "grad_norm": 7.350315746350283, + "learning_rate": 2.847912768108585e-09, + "loss": 0.6499, + "step": 27274 + }, + { + "epoch": 1.9704878357144149, + "grad_norm": 7.395707106968241, + "learning_rate": 2.8339721546000177e-09, + "loss": 0.6888, + "step": 27275 + }, + { + "epoch": 1.9705600809146242, + "grad_norm": 7.522705296015731, + "learning_rate": 2.8200657253552034e-09, + "loss": 0.5795, + "step": 27276 + }, + { + "epoch": 1.9706323261148337, + "grad_norm": 7.129461397477038, + "learning_rate": 2.806193480564823e-09, + "loss": 0.6008, + "step": 27277 + }, + { + "epoch": 1.9707045713150433, + "grad_norm": 8.680486468909105, + "learning_rate": 2.792355420418724e-09, + "loss": 0.6109, + "step": 27278 + }, + { + "epoch": 1.9707768165152526, + "grad_norm": 7.999987840643225, + "learning_rate": 2.7785515451064782e-09, + "loss": 0.6183, + "step": 27279 + }, + { + "epoch": 1.9708490617154624, + "grad_norm": 7.965852816628701, + "learning_rate": 2.764781854816545e-09, + "loss": 0.6122, + "step": 27280 + }, + { + "epoch": 1.9709213069156717, + "grad_norm": 7.616899784229673, + "learning_rate": 2.751046349738218e-09, + "loss": 0.5719, + "step": 27281 + }, + { + "epoch": 1.9709935521158815, + "grad_norm": 6.388533046966846, + "learning_rate": 2.7373450300588467e-09, + "loss": 0.5449, + "step": 27282 + }, + { + "epoch": 1.9710657973160908, + "grad_norm": 6.05481665688856, + "learning_rate": 2.7236778959660594e-09, + "loss": 0.6015, + "step": 27283 + }, + { + "epoch": 1.9711380425163003, + "grad_norm": 7.723482850852465, + "learning_rate": 2.7100449476472057e-09, + "loss": 0.6285, + "step": 27284 + }, + { + "epoch": 1.9712102877165099, + "grad_norm": 6.76827704273379, + "learning_rate": 2.696446185289081e-09, + "loss": 0.5806, + "step": 27285 + }, + { + "epoch": 1.9712825329167194, + "grad_norm": 9.178166845458106, + "learning_rate": 2.6828816090773703e-09, + "loss": 0.6803, + "step": 27286 + }, + { + "epoch": 1.971354778116929, + "grad_norm": 7.396042626513393, + "learning_rate": 2.669351219197758e-09, + "loss": 0.5859, + "step": 27287 + }, + { + "epoch": 1.9714270233171383, + "grad_norm": 7.485235558971476, + "learning_rate": 2.6558550158359287e-09, + "loss": 0.6242, + "step": 27288 + }, + { + "epoch": 1.971499268517348, + "grad_norm": 5.934932434674416, + "learning_rate": 2.6423929991764575e-09, + "loss": 0.5373, + "step": 27289 + }, + { + "epoch": 1.9715715137175573, + "grad_norm": 6.9341626087065755, + "learning_rate": 2.6289651694033634e-09, + "loss": 0.5719, + "step": 27290 + }, + { + "epoch": 1.9716437589177669, + "grad_norm": 7.799653255873958, + "learning_rate": 2.6155715267006663e-09, + "loss": 0.5814, + "step": 27291 + }, + { + "epoch": 1.9717160041179764, + "grad_norm": 7.153167727148221, + "learning_rate": 2.6022120712518305e-09, + "loss": 0.5675, + "step": 27292 + }, + { + "epoch": 1.971788249318186, + "grad_norm": 8.130247329009086, + "learning_rate": 2.588886803239488e-09, + "loss": 0.6394, + "step": 27293 + }, + { + "epoch": 1.9718604945183955, + "grad_norm": 8.639757440482406, + "learning_rate": 2.575595722846269e-09, + "loss": 0.583, + "step": 27294 + }, + { + "epoch": 1.9719327397186048, + "grad_norm": 6.927181508986701, + "learning_rate": 2.5623388302539743e-09, + "loss": 0.616, + "step": 27295 + }, + { + "epoch": 1.9720049849188146, + "grad_norm": 6.880382026445478, + "learning_rate": 2.5491161256441243e-09, + "loss": 0.5284, + "step": 27296 + }, + { + "epoch": 1.972077230119024, + "grad_norm": 8.461920768264077, + "learning_rate": 2.5359276091979635e-09, + "loss": 0.6241, + "step": 27297 + }, + { + "epoch": 1.9721494753192335, + "grad_norm": 7.5100648103601975, + "learning_rate": 2.5227732810953477e-09, + "loss": 0.622, + "step": 27298 + }, + { + "epoch": 1.972221720519443, + "grad_norm": 7.9431984938834574, + "learning_rate": 2.5096531415169655e-09, + "loss": 0.5345, + "step": 27299 + }, + { + "epoch": 1.9722939657196525, + "grad_norm": 6.292298577850726, + "learning_rate": 2.496567190642396e-09, + "loss": 0.5, + "step": 27300 + }, + { + "epoch": 1.972366210919862, + "grad_norm": 7.247603711020764, + "learning_rate": 2.4835154286506623e-09, + "loss": 0.5847, + "step": 27301 + }, + { + "epoch": 1.9724384561200714, + "grad_norm": 6.978075023699369, + "learning_rate": 2.470497855720233e-09, + "loss": 0.5606, + "step": 27302 + }, + { + "epoch": 1.9725107013202812, + "grad_norm": 7.4767533201223735, + "learning_rate": 2.457514472029576e-09, + "loss": 0.5648, + "step": 27303 + }, + { + "epoch": 1.9725829465204905, + "grad_norm": 8.015723036376595, + "learning_rate": 2.4445652777563276e-09, + "loss": 0.6415, + "step": 27304 + }, + { + "epoch": 1.9726551917207, + "grad_norm": 7.17690630852716, + "learning_rate": 2.4316502730775682e-09, + "loss": 0.6125, + "step": 27305 + }, + { + "epoch": 1.9727274369209096, + "grad_norm": 8.251579826985454, + "learning_rate": 2.4187694581706557e-09, + "loss": 0.6205, + "step": 27306 + }, + { + "epoch": 1.9727996821211191, + "grad_norm": 7.612742092248097, + "learning_rate": 2.4059228332112825e-09, + "loss": 0.6138, + "step": 27307 + }, + { + "epoch": 1.9728719273213287, + "grad_norm": 8.897324972293507, + "learning_rate": 2.39311039837542e-09, + "loss": 0.651, + "step": 27308 + }, + { + "epoch": 1.972944172521538, + "grad_norm": 7.549994856472663, + "learning_rate": 2.3803321538387604e-09, + "loss": 0.6325, + "step": 27309 + }, + { + "epoch": 1.9730164177217477, + "grad_norm": 6.952774690525848, + "learning_rate": 2.3675880997761636e-09, + "loss": 0.6091, + "step": 27310 + }, + { + "epoch": 1.973088662921957, + "grad_norm": 7.954707200028396, + "learning_rate": 2.354878236361935e-09, + "loss": 0.5611, + "step": 27311 + }, + { + "epoch": 1.9731609081221666, + "grad_norm": 6.900909983020355, + "learning_rate": 2.3422025637701017e-09, + "loss": 0.578, + "step": 27312 + }, + { + "epoch": 1.9732331533223761, + "grad_norm": 6.387848416435492, + "learning_rate": 2.329561082174414e-09, + "loss": 0.5629, + "step": 27313 + }, + { + "epoch": 1.9733053985225857, + "grad_norm": 6.559449276951117, + "learning_rate": 2.3169537917475118e-09, + "loss": 0.5361, + "step": 27314 + }, + { + "epoch": 1.9733776437227952, + "grad_norm": 7.10879762421954, + "learning_rate": 2.3043806926623112e-09, + "loss": 0.578, + "step": 27315 + }, + { + "epoch": 1.9734498889230045, + "grad_norm": 8.48610404084184, + "learning_rate": 2.2918417850906203e-09, + "loss": 0.6207, + "step": 27316 + }, + { + "epoch": 1.9735221341232143, + "grad_norm": 6.510964462630907, + "learning_rate": 2.2793370692045237e-09, + "loss": 0.5357, + "step": 27317 + }, + { + "epoch": 1.9735943793234236, + "grad_norm": 6.916623157532895, + "learning_rate": 2.2668665451747174e-09, + "loss": 0.53, + "step": 27318 + }, + { + "epoch": 1.9736666245236332, + "grad_norm": 7.192605237247276, + "learning_rate": 2.2544302131721764e-09, + "loss": 0.6305, + "step": 27319 + }, + { + "epoch": 1.9737388697238427, + "grad_norm": 8.281518089705365, + "learning_rate": 2.2420280733673194e-09, + "loss": 0.606, + "step": 27320 + }, + { + "epoch": 1.9738111149240523, + "grad_norm": 7.245923178585386, + "learning_rate": 2.229660125929456e-09, + "loss": 0.657, + "step": 27321 + }, + { + "epoch": 1.9738833601242618, + "grad_norm": 9.747094919769685, + "learning_rate": 2.2173263710281723e-09, + "loss": 0.5959, + "step": 27322 + }, + { + "epoch": 1.9739556053244711, + "grad_norm": 7.063623659660972, + "learning_rate": 2.2050268088325e-09, + "loss": 0.6972, + "step": 27323 + }, + { + "epoch": 1.9740278505246809, + "grad_norm": 7.176227254613895, + "learning_rate": 2.1927614395103604e-09, + "loss": 0.6355, + "step": 27324 + }, + { + "epoch": 1.9741000957248902, + "grad_norm": 7.769526585672934, + "learning_rate": 2.1805302632299517e-09, + "loss": 0.6136, + "step": 27325 + }, + { + "epoch": 1.9741723409250997, + "grad_norm": 7.160018528322936, + "learning_rate": 2.168333280158641e-09, + "loss": 0.6319, + "step": 27326 + }, + { + "epoch": 1.9742445861253093, + "grad_norm": 7.987932879399464, + "learning_rate": 2.156170490463516e-09, + "loss": 0.6319, + "step": 27327 + }, + { + "epoch": 1.9743168313255188, + "grad_norm": 6.809668276153399, + "learning_rate": 2.144041894310833e-09, + "loss": 0.6228, + "step": 27328 + }, + { + "epoch": 1.9743890765257284, + "grad_norm": 8.818468169663879, + "learning_rate": 2.1319474918668483e-09, + "loss": 0.604, + "step": 27329 + }, + { + "epoch": 1.9744613217259377, + "grad_norm": 6.285800481180498, + "learning_rate": 2.1198872832972615e-09, + "loss": 0.5816, + "step": 27330 + }, + { + "epoch": 1.9745335669261475, + "grad_norm": 7.805073618764652, + "learning_rate": 2.1078612687666645e-09, + "loss": 0.6074, + "step": 27331 + }, + { + "epoch": 1.9746058121263568, + "grad_norm": 7.886909320072526, + "learning_rate": 2.0958694484399245e-09, + "loss": 0.6093, + "step": 27332 + }, + { + "epoch": 1.9746780573265663, + "grad_norm": 7.762893779797955, + "learning_rate": 2.083911822481355e-09, + "loss": 0.6528, + "step": 27333 + }, + { + "epoch": 1.9747503025267759, + "grad_norm": 5.939651902399642, + "learning_rate": 2.0719883910544357e-09, + "loss": 0.598, + "step": 27334 + }, + { + "epoch": 1.9748225477269854, + "grad_norm": 7.0464987643812, + "learning_rate": 2.060099154322648e-09, + "loss": 0.5717, + "step": 27335 + }, + { + "epoch": 1.974894792927195, + "grad_norm": 7.230305723952232, + "learning_rate": 2.048244112448361e-09, + "loss": 0.6612, + "step": 27336 + }, + { + "epoch": 1.9749670381274043, + "grad_norm": 8.056045669320145, + "learning_rate": 2.0364232655939453e-09, + "loss": 0.5734, + "step": 27337 + }, + { + "epoch": 1.975039283327614, + "grad_norm": 7.1266121629867625, + "learning_rate": 2.024636613921771e-09, + "loss": 0.5521, + "step": 27338 + }, + { + "epoch": 1.9751115285278233, + "grad_norm": 7.441002285623116, + "learning_rate": 2.012884157592543e-09, + "loss": 0.6294, + "step": 27339 + }, + { + "epoch": 1.975183773728033, + "grad_norm": 7.833475503916444, + "learning_rate": 2.0011658967672433e-09, + "loss": 0.6053, + "step": 27340 + }, + { + "epoch": 1.9752560189282424, + "grad_norm": 7.748951010314487, + "learning_rate": 1.9894818316065768e-09, + "loss": 0.5726, + "step": 27341 + }, + { + "epoch": 1.975328264128452, + "grad_norm": 6.50959949298588, + "learning_rate": 1.9778319622704156e-09, + "loss": 0.5516, + "step": 27342 + }, + { + "epoch": 1.9754005093286615, + "grad_norm": 8.856604625679385, + "learning_rate": 1.9662162889180768e-09, + "loss": 0.6313, + "step": 27343 + }, + { + "epoch": 1.9754727545288708, + "grad_norm": 7.840049326605255, + "learning_rate": 1.9546348117086e-09, + "loss": 0.5873, + "step": 27344 + }, + { + "epoch": 1.9755449997290806, + "grad_norm": 8.541463362592602, + "learning_rate": 1.9430875308004694e-09, + "loss": 0.6087, + "step": 27345 + }, + { + "epoch": 1.97561724492929, + "grad_norm": 8.079051925633962, + "learning_rate": 1.931574446352169e-09, + "loss": 0.5398, + "step": 27346 + }, + { + "epoch": 1.9756894901294997, + "grad_norm": 6.813939213747257, + "learning_rate": 1.9200955585205185e-09, + "loss": 0.6397, + "step": 27347 + }, + { + "epoch": 1.975761735329709, + "grad_norm": 7.071592436373977, + "learning_rate": 1.9086508674634465e-09, + "loss": 0.6136, + "step": 27348 + }, + { + "epoch": 1.9758339805299185, + "grad_norm": 7.771154511251474, + "learning_rate": 1.897240373337217e-09, + "loss": 0.5931, + "step": 27349 + }, + { + "epoch": 1.975906225730128, + "grad_norm": 6.8951127366330205, + "learning_rate": 1.8858640762983716e-09, + "loss": 0.5744, + "step": 27350 + }, + { + "epoch": 1.9759784709303374, + "grad_norm": 8.41880886904065, + "learning_rate": 1.874521976502064e-09, + "loss": 0.6156, + "step": 27351 + }, + { + "epoch": 1.9760507161305472, + "grad_norm": 7.009494609381983, + "learning_rate": 1.863214074104003e-09, + "loss": 0.6137, + "step": 27352 + }, + { + "epoch": 1.9761229613307565, + "grad_norm": 7.137842750670596, + "learning_rate": 1.8519403692587867e-09, + "loss": 0.5947, + "step": 27353 + }, + { + "epoch": 1.9761952065309663, + "grad_norm": 6.168322504105021, + "learning_rate": 1.8407008621207368e-09, + "loss": 0.6474, + "step": 27354 + }, + { + "epoch": 1.9762674517311756, + "grad_norm": 8.276280938433338, + "learning_rate": 1.8294955528438963e-09, + "loss": 0.6374, + "step": 27355 + }, + { + "epoch": 1.9763396969313851, + "grad_norm": 6.941861542908228, + "learning_rate": 1.8183244415817535e-09, + "loss": 0.5372, + "step": 27356 + }, + { + "epoch": 1.9764119421315947, + "grad_norm": 6.287442774730265, + "learning_rate": 1.807187528486687e-09, + "loss": 0.6302, + "step": 27357 + }, + { + "epoch": 1.9764841873318042, + "grad_norm": 7.118347542415946, + "learning_rate": 1.7960848137119068e-09, + "loss": 0.5402, + "step": 27358 + }, + { + "epoch": 1.9765564325320137, + "grad_norm": 6.318104286291968, + "learning_rate": 1.7850162974086815e-09, + "loss": 0.594, + "step": 27359 + }, + { + "epoch": 1.976628677732223, + "grad_norm": 6.179101040835274, + "learning_rate": 1.7739819797288337e-09, + "loss": 0.5406, + "step": 27360 + }, + { + "epoch": 1.9767009229324328, + "grad_norm": 7.2187824991658855, + "learning_rate": 1.7629818608236314e-09, + "loss": 0.5227, + "step": 27361 + }, + { + "epoch": 1.9767731681326421, + "grad_norm": 7.210718484282292, + "learning_rate": 1.7520159408432325e-09, + "loss": 0.6475, + "step": 27362 + }, + { + "epoch": 1.9768454133328517, + "grad_norm": 6.531799585317503, + "learning_rate": 1.7410842199383493e-09, + "loss": 0.537, + "step": 27363 + }, + { + "epoch": 1.9769176585330612, + "grad_norm": 7.767044029836948, + "learning_rate": 1.730186698257752e-09, + "loss": 0.5437, + "step": 27364 + }, + { + "epoch": 1.9769899037332708, + "grad_norm": 8.44259286401862, + "learning_rate": 1.7193233759513206e-09, + "loss": 0.6409, + "step": 27365 + }, + { + "epoch": 1.9770621489334803, + "grad_norm": 6.431083370085346, + "learning_rate": 1.7084942531675474e-09, + "loss": 0.5342, + "step": 27366 + }, + { + "epoch": 1.9771343941336896, + "grad_norm": 8.308922019108671, + "learning_rate": 1.697699330054925e-09, + "loss": 0.5987, + "step": 27367 + }, + { + "epoch": 1.9772066393338994, + "grad_norm": 7.4847445147887415, + "learning_rate": 1.6869386067608351e-09, + "loss": 0.5865, + "step": 27368 + }, + { + "epoch": 1.9772788845341087, + "grad_norm": 7.724957769087135, + "learning_rate": 1.67621208343266e-09, + "loss": 0.5895, + "step": 27369 + }, + { + "epoch": 1.9773511297343183, + "grad_norm": 6.561412030815834, + "learning_rate": 1.6655197602172267e-09, + "loss": 0.5169, + "step": 27370 + }, + { + "epoch": 1.9774233749345278, + "grad_norm": 7.912938361055226, + "learning_rate": 1.6548616372613624e-09, + "loss": 0.521, + "step": 27371 + }, + { + "epoch": 1.9774956201347373, + "grad_norm": 8.838470932260295, + "learning_rate": 1.6442377147102285e-09, + "loss": 0.6217, + "step": 27372 + }, + { + "epoch": 1.9775678653349469, + "grad_norm": 6.664466335855261, + "learning_rate": 1.6336479927098193e-09, + "loss": 0.6279, + "step": 27373 + }, + { + "epoch": 1.9776401105351562, + "grad_norm": 7.33145658001988, + "learning_rate": 1.6230924714047413e-09, + "loss": 0.5382, + "step": 27374 + }, + { + "epoch": 1.977712355735366, + "grad_norm": 6.705819679809844, + "learning_rate": 1.612571150939879e-09, + "loss": 0.5844, + "step": 27375 + }, + { + "epoch": 1.9777846009355753, + "grad_norm": 6.254881516515283, + "learning_rate": 1.6020840314590059e-09, + "loss": 0.5363, + "step": 27376 + }, + { + "epoch": 1.9778568461357848, + "grad_norm": 8.874272706225177, + "learning_rate": 1.5916311131056184e-09, + "loss": 0.7349, + "step": 27377 + }, + { + "epoch": 1.9779290913359944, + "grad_norm": 6.882488819006538, + "learning_rate": 1.5812123960229354e-09, + "loss": 0.614, + "step": 27378 + }, + { + "epoch": 1.978001336536204, + "grad_norm": 9.46569654123468, + "learning_rate": 1.5708278803536202e-09, + "loss": 0.599, + "step": 27379 + }, + { + "epoch": 1.9780735817364135, + "grad_norm": 7.781699294022347, + "learning_rate": 1.560477566239782e-09, + "loss": 0.5348, + "step": 27380 + }, + { + "epoch": 1.9781458269366228, + "grad_norm": 7.469079173986268, + "learning_rate": 1.5501614538229736e-09, + "loss": 0.5526, + "step": 27381 + }, + { + "epoch": 1.9782180721368325, + "grad_norm": 8.297052931089965, + "learning_rate": 1.5398795432447488e-09, + "loss": 0.6158, + "step": 27382 + }, + { + "epoch": 1.9782903173370419, + "grad_norm": 9.229858326594396, + "learning_rate": 1.5296318346455509e-09, + "loss": 0.6485, + "step": 27383 + }, + { + "epoch": 1.9783625625372514, + "grad_norm": 8.796198058295511, + "learning_rate": 1.519418328165545e-09, + "loss": 0.6142, + "step": 27384 + }, + { + "epoch": 1.978434807737461, + "grad_norm": 7.878067630133733, + "learning_rate": 1.5092390239448974e-09, + "loss": 0.6818, + "step": 27385 + }, + { + "epoch": 1.9785070529376705, + "grad_norm": 7.709644707831619, + "learning_rate": 1.4990939221229406e-09, + "loss": 0.6068, + "step": 27386 + }, + { + "epoch": 1.97857929813788, + "grad_norm": 7.7571867568478226, + "learning_rate": 1.4889830228384527e-09, + "loss": 0.5825, + "step": 27387 + }, + { + "epoch": 1.9786515433380893, + "grad_norm": 7.1024506436344685, + "learning_rate": 1.4789063262296566e-09, + "loss": 0.6054, + "step": 27388 + }, + { + "epoch": 1.978723788538299, + "grad_norm": 5.475827939213672, + "learning_rate": 1.4688638324344973e-09, + "loss": 0.5773, + "step": 27389 + }, + { + "epoch": 1.9787960337385084, + "grad_norm": 8.141765961160306, + "learning_rate": 1.45885554159092e-09, + "loss": 0.6637, + "step": 27390 + }, + { + "epoch": 1.978868278938718, + "grad_norm": 7.297687101406122, + "learning_rate": 1.4488814538354823e-09, + "loss": 0.5926, + "step": 27391 + }, + { + "epoch": 1.9789405241389275, + "grad_norm": 7.3270223800673495, + "learning_rate": 1.4389415693050191e-09, + "loss": 0.5953, + "step": 27392 + }, + { + "epoch": 1.979012769339137, + "grad_norm": 8.636507180951169, + "learning_rate": 1.429035888135255e-09, + "loss": 0.535, + "step": 27393 + }, + { + "epoch": 1.9790850145393466, + "grad_norm": 7.360928575983632, + "learning_rate": 1.4191644104619151e-09, + "loss": 0.6019, + "step": 27394 + }, + { + "epoch": 1.979157259739556, + "grad_norm": 7.600326390535855, + "learning_rate": 1.4093271364204464e-09, + "loss": 0.6439, + "step": 27395 + }, + { + "epoch": 1.9792295049397657, + "grad_norm": 9.402907680689124, + "learning_rate": 1.3995240661449083e-09, + "loss": 0.6733, + "step": 27396 + }, + { + "epoch": 1.979301750139975, + "grad_norm": 8.472787267745641, + "learning_rate": 1.389755199770193e-09, + "loss": 0.5421, + "step": 27397 + }, + { + "epoch": 1.9793739953401845, + "grad_norm": 6.891339115844635, + "learning_rate": 1.3800205374295272e-09, + "loss": 0.5797, + "step": 27398 + }, + { + "epoch": 1.979446240540394, + "grad_norm": 7.515170965933962, + "learning_rate": 1.3703200792564153e-09, + "loss": 0.5855, + "step": 27399 + }, + { + "epoch": 1.9795184857406036, + "grad_norm": 7.413244107804916, + "learning_rate": 1.3606538253832512e-09, + "loss": 0.5851, + "step": 27400 + }, + { + "epoch": 1.9795907309408132, + "grad_norm": 7.193570964848985, + "learning_rate": 1.3510217759429843e-09, + "loss": 0.5846, + "step": 27401 + }, + { + "epoch": 1.9796629761410225, + "grad_norm": 8.106497958000508, + "learning_rate": 1.3414239310671761e-09, + "loss": 0.6141, + "step": 27402 + }, + { + "epoch": 1.9797352213412323, + "grad_norm": 6.829736584879951, + "learning_rate": 1.3318602908871103e-09, + "loss": 0.5161, + "step": 27403 + }, + { + "epoch": 1.9798074665414416, + "grad_norm": 7.659188590238514, + "learning_rate": 1.3223308555335157e-09, + "loss": 0.6036, + "step": 27404 + }, + { + "epoch": 1.9798797117416511, + "grad_norm": 6.5871302926193405, + "learning_rate": 1.3128356251373987e-09, + "loss": 0.6459, + "step": 27405 + }, + { + "epoch": 1.9799519569418607, + "grad_norm": 8.15386904753418, + "learning_rate": 1.3033745998286552e-09, + "loss": 0.6506, + "step": 27406 + }, + { + "epoch": 1.9800242021420702, + "grad_norm": 7.854530009020123, + "learning_rate": 1.2939477797363487e-09, + "loss": 0.5343, + "step": 27407 + }, + { + "epoch": 1.9800964473422797, + "grad_norm": 6.85444175222309, + "learning_rate": 1.2845551649895426e-09, + "loss": 0.5955, + "step": 27408 + }, + { + "epoch": 1.980168692542489, + "grad_norm": 7.047397907081889, + "learning_rate": 1.2751967557173006e-09, + "loss": 0.6467, + "step": 27409 + }, + { + "epoch": 1.9802409377426988, + "grad_norm": 7.283959928164585, + "learning_rate": 1.2658725520475757e-09, + "loss": 0.6872, + "step": 27410 + }, + { + "epoch": 1.9803131829429081, + "grad_norm": 6.595566630398437, + "learning_rate": 1.256582554107766e-09, + "loss": 0.5752, + "step": 27411 + }, + { + "epoch": 1.9803854281431177, + "grad_norm": 7.844468771536871, + "learning_rate": 1.2473267620252693e-09, + "loss": 0.5919, + "step": 27412 + }, + { + "epoch": 1.9804576733433272, + "grad_norm": 7.134444552469773, + "learning_rate": 1.2381051759266517e-09, + "loss": 0.5672, + "step": 27413 + }, + { + "epoch": 1.9805299185435368, + "grad_norm": 7.141376347318015, + "learning_rate": 1.2289177959382004e-09, + "loss": 0.6258, + "step": 27414 + }, + { + "epoch": 1.9806021637437463, + "grad_norm": 6.597604501813785, + "learning_rate": 1.2197646221859261e-09, + "loss": 0.6245, + "step": 27415 + }, + { + "epoch": 1.9806744089439556, + "grad_norm": 6.424274079990204, + "learning_rate": 1.2106456547947287e-09, + "loss": 0.6127, + "step": 27416 + }, + { + "epoch": 1.9807466541441654, + "grad_norm": 7.137219842065579, + "learning_rate": 1.2015608938895085e-09, + "loss": 0.5905, + "step": 27417 + }, + { + "epoch": 1.9808188993443747, + "grad_norm": 6.669784611508532, + "learning_rate": 1.1925103395948878e-09, + "loss": 0.5766, + "step": 27418 + }, + { + "epoch": 1.9808911445445845, + "grad_norm": 6.1614775948909974, + "learning_rate": 1.1834939920343792e-09, + "loss": 0.5596, + "step": 27419 + }, + { + "epoch": 1.9809633897447938, + "grad_norm": 9.335103502889936, + "learning_rate": 1.1745118513317722e-09, + "loss": 0.5942, + "step": 27420 + }, + { + "epoch": 1.9810356349450033, + "grad_norm": 7.347687659837836, + "learning_rate": 1.165563917610024e-09, + "loss": 0.5516, + "step": 27421 + }, + { + "epoch": 1.9811078801452129, + "grad_norm": 7.345240145523505, + "learning_rate": 1.1566501909912597e-09, + "loss": 0.5075, + "step": 27422 + }, + { + "epoch": 1.9811801253454222, + "grad_norm": 7.55266947258447, + "learning_rate": 1.1477706715978809e-09, + "loss": 0.6205, + "step": 27423 + }, + { + "epoch": 1.981252370545632, + "grad_norm": 7.604915244127401, + "learning_rate": 1.1389253595511796e-09, + "loss": 0.5857, + "step": 27424 + }, + { + "epoch": 1.9813246157458413, + "grad_norm": 6.546615704702591, + "learning_rate": 1.1301142549724476e-09, + "loss": 0.5648, + "step": 27425 + }, + { + "epoch": 1.981396860946051, + "grad_norm": 7.357091334740574, + "learning_rate": 1.121337357982144e-09, + "loss": 0.5586, + "step": 27426 + }, + { + "epoch": 1.9814691061462604, + "grad_norm": 7.721730757091299, + "learning_rate": 1.1125946687004508e-09, + "loss": 0.6091, + "step": 27427 + }, + { + "epoch": 1.98154135134647, + "grad_norm": 6.186102815342806, + "learning_rate": 1.1038861872469942e-09, + "loss": 0.6514, + "step": 27428 + }, + { + "epoch": 1.9816135965466795, + "grad_norm": 7.272014212331113, + "learning_rate": 1.0952119137414008e-09, + "loss": 0.6216, + "step": 27429 + }, + { + "epoch": 1.9816858417468888, + "grad_norm": 7.484455020612149, + "learning_rate": 1.0865718483016318e-09, + "loss": 0.6095, + "step": 27430 + }, + { + "epoch": 1.9817580869470985, + "grad_norm": 7.974652905220404, + "learning_rate": 1.0779659910467589e-09, + "loss": 0.6158, + "step": 27431 + }, + { + "epoch": 1.9818303321473079, + "grad_norm": 7.577111371079108, + "learning_rate": 1.0693943420941877e-09, + "loss": 0.6557, + "step": 27432 + }, + { + "epoch": 1.9819025773475176, + "grad_norm": 7.307121711724866, + "learning_rate": 1.060856901561047e-09, + "loss": 0.6415, + "step": 27433 + }, + { + "epoch": 1.981974822547727, + "grad_norm": 8.509363290566448, + "learning_rate": 1.052353669564743e-09, + "loss": 0.6846, + "step": 27434 + }, + { + "epoch": 1.9820470677479365, + "grad_norm": 7.855732432017847, + "learning_rate": 1.0438846462215713e-09, + "loss": 0.5411, + "step": 27435 + }, + { + "epoch": 1.982119312948146, + "grad_norm": 8.288572363424866, + "learning_rate": 1.0354498316469952e-09, + "loss": 0.5851, + "step": 27436 + }, + { + "epoch": 1.9821915581483556, + "grad_norm": 6.801197440237546, + "learning_rate": 1.0270492259567554e-09, + "loss": 0.5866, + "step": 27437 + }, + { + "epoch": 1.9822638033485651, + "grad_norm": 7.405637047750032, + "learning_rate": 1.0186828292660377e-09, + "loss": 0.6146, + "step": 27438 + }, + { + "epoch": 1.9823360485487744, + "grad_norm": 8.543754607773538, + "learning_rate": 1.0103506416891951e-09, + "loss": 0.6301, + "step": 27439 + }, + { + "epoch": 1.9824082937489842, + "grad_norm": 7.865121609959576, + "learning_rate": 1.0020526633403027e-09, + "loss": 0.6151, + "step": 27440 + }, + { + "epoch": 1.9824805389491935, + "grad_norm": 7.019232894848902, + "learning_rate": 9.937888943331587e-10, + "loss": 0.4737, + "step": 27441 + }, + { + "epoch": 1.982552784149403, + "grad_norm": 7.319145899488412, + "learning_rate": 9.855593347804504e-10, + "loss": 0.6487, + "step": 27442 + }, + { + "epoch": 1.9826250293496126, + "grad_norm": 8.44568288098552, + "learning_rate": 9.773639847951432e-10, + "loss": 0.6486, + "step": 27443 + }, + { + "epoch": 1.9826972745498221, + "grad_norm": 7.837072683850337, + "learning_rate": 9.692028444893697e-10, + "loss": 0.5626, + "step": 27444 + }, + { + "epoch": 1.9827695197500317, + "grad_norm": 7.118166943017149, + "learning_rate": 9.610759139747072e-10, + "loss": 0.5055, + "step": 27445 + }, + { + "epoch": 1.982841764950241, + "grad_norm": 6.821211258615875, + "learning_rate": 9.52983193362733e-10, + "loss": 0.4659, + "step": 27446 + }, + { + "epoch": 1.9829140101504508, + "grad_norm": 7.641423532770431, + "learning_rate": 9.449246827639147e-10, + "loss": 0.642, + "step": 27447 + }, + { + "epoch": 1.98298625535066, + "grad_norm": 6.226676451670591, + "learning_rate": 9.36900382288719e-10, + "loss": 0.567, + "step": 27448 + }, + { + "epoch": 1.9830585005508696, + "grad_norm": 7.300236779122304, + "learning_rate": 9.289102920467808e-10, + "loss": 0.5155, + "step": 27449 + }, + { + "epoch": 1.9831307457510792, + "grad_norm": 6.834087640331548, + "learning_rate": 9.209544121480119e-10, + "loss": 0.6148, + "step": 27450 + }, + { + "epoch": 1.9832029909512887, + "grad_norm": 6.924573247547067, + "learning_rate": 9.130327427006591e-10, + "loss": 0.6262, + "step": 27451 + }, + { + "epoch": 1.9832752361514983, + "grad_norm": 7.265153241992812, + "learning_rate": 9.051452838135244e-10, + "loss": 0.6065, + "step": 27452 + }, + { + "epoch": 1.9833474813517076, + "grad_norm": 7.177087821645307, + "learning_rate": 8.972920355945769e-10, + "loss": 0.6403, + "step": 27453 + }, + { + "epoch": 1.9834197265519173, + "grad_norm": 7.236190007959909, + "learning_rate": 8.894729981515082e-10, + "loss": 0.5699, + "step": 27454 + }, + { + "epoch": 1.9834919717521267, + "grad_norm": 7.770794933998066, + "learning_rate": 8.816881715908998e-10, + "loss": 0.5982, + "step": 27455 + }, + { + "epoch": 1.9835642169523362, + "grad_norm": 7.8136315098567435, + "learning_rate": 8.739375560193331e-10, + "loss": 0.5964, + "step": 27456 + }, + { + "epoch": 1.9836364621525457, + "grad_norm": 7.937544454615303, + "learning_rate": 8.662211515433893e-10, + "loss": 0.5937, + "step": 27457 + }, + { + "epoch": 1.9837087073527553, + "grad_norm": 7.3378669711989, + "learning_rate": 8.5853895826854e-10, + "loss": 0.6264, + "step": 27458 + }, + { + "epoch": 1.9837809525529648, + "grad_norm": 8.78550188292968, + "learning_rate": 8.508909762994233e-10, + "loss": 0.6235, + "step": 27459 + }, + { + "epoch": 1.9838531977531741, + "grad_norm": 8.056860082238343, + "learning_rate": 8.432772057415106e-10, + "loss": 0.6575, + "step": 27460 + }, + { + "epoch": 1.983925442953384, + "grad_norm": 7.531169716814699, + "learning_rate": 8.356976466986077e-10, + "loss": 0.5969, + "step": 27461 + }, + { + "epoch": 1.9839976881535932, + "grad_norm": 8.011099744492697, + "learning_rate": 8.281522992745206e-10, + "loss": 0.6657, + "step": 27462 + }, + { + "epoch": 1.9840699333538028, + "grad_norm": 7.412412502036148, + "learning_rate": 8.206411635724998e-10, + "loss": 0.6331, + "step": 27463 + }, + { + "epoch": 1.9841421785540123, + "grad_norm": 6.263125865370163, + "learning_rate": 8.131642396952411e-10, + "loss": 0.5977, + "step": 27464 + }, + { + "epoch": 1.9842144237542219, + "grad_norm": 9.306492038753529, + "learning_rate": 8.0572152774544e-10, + "loss": 0.6045, + "step": 27465 + }, + { + "epoch": 1.9842866689544314, + "grad_norm": 7.247607921732397, + "learning_rate": 7.98313027824682e-10, + "loss": 0.6278, + "step": 27466 + }, + { + "epoch": 1.9843589141546407, + "grad_norm": 5.973162073904643, + "learning_rate": 7.9093874003483e-10, + "loss": 0.5305, + "step": 27467 + }, + { + "epoch": 1.9844311593548505, + "grad_norm": 9.10907641489104, + "learning_rate": 7.835986644763593e-10, + "loss": 0.5532, + "step": 27468 + }, + { + "epoch": 1.9845034045550598, + "grad_norm": 6.683567209820917, + "learning_rate": 7.762928012497451e-10, + "loss": 0.5846, + "step": 27469 + }, + { + "epoch": 1.9845756497552693, + "grad_norm": 7.031957158449665, + "learning_rate": 7.690211504551847e-10, + "loss": 0.5833, + "step": 27470 + }, + { + "epoch": 1.9846478949554789, + "grad_norm": 6.870692620379712, + "learning_rate": 7.61783712192321e-10, + "loss": 0.571, + "step": 27471 + }, + { + "epoch": 1.9847201401556884, + "grad_norm": 8.151660548756213, + "learning_rate": 7.545804865602413e-10, + "loss": 0.5748, + "step": 27472 + }, + { + "epoch": 1.984792385355898, + "grad_norm": 7.1542504861313425, + "learning_rate": 7.474114736572002e-10, + "loss": 0.6294, + "step": 27473 + }, + { + "epoch": 1.9848646305561073, + "grad_norm": 6.589404658515222, + "learning_rate": 7.402766735814526e-10, + "loss": 0.5924, + "step": 27474 + }, + { + "epoch": 1.984936875756317, + "grad_norm": 7.4429644858022845, + "learning_rate": 7.331760864309756e-10, + "loss": 0.5793, + "step": 27475 + }, + { + "epoch": 1.9850091209565264, + "grad_norm": 7.149863245796646, + "learning_rate": 7.261097123029137e-10, + "loss": 0.6495, + "step": 27476 + }, + { + "epoch": 1.985081366156736, + "grad_norm": 7.131739591062377, + "learning_rate": 7.190775512935789e-10, + "loss": 0.5933, + "step": 27477 + }, + { + "epoch": 1.9851536113569455, + "grad_norm": 6.876632219241548, + "learning_rate": 7.120796034995603e-10, + "loss": 0.5538, + "step": 27478 + }, + { + "epoch": 1.985225856557155, + "grad_norm": 7.77755710954829, + "learning_rate": 7.051158690166149e-10, + "loss": 0.5815, + "step": 27479 + }, + { + "epoch": 1.9852981017573645, + "grad_norm": 7.417885955139579, + "learning_rate": 6.981863479399442e-10, + "loss": 0.5844, + "step": 27480 + }, + { + "epoch": 1.9853703469575739, + "grad_norm": 7.793740661174988, + "learning_rate": 6.912910403647499e-10, + "loss": 0.6163, + "step": 27481 + }, + { + "epoch": 1.9854425921577836, + "grad_norm": 7.384702087468004, + "learning_rate": 6.844299463848459e-10, + "loss": 0.6038, + "step": 27482 + }, + { + "epoch": 1.985514837357993, + "grad_norm": 6.295253043084514, + "learning_rate": 6.776030660948785e-10, + "loss": 0.6877, + "step": 27483 + }, + { + "epoch": 1.9855870825582025, + "grad_norm": 7.2376767362582735, + "learning_rate": 6.70810399587829e-10, + "loss": 0.5525, + "step": 27484 + }, + { + "epoch": 1.985659327758412, + "grad_norm": 7.482847308693051, + "learning_rate": 6.640519469566787e-10, + "loss": 0.5987, + "step": 27485 + }, + { + "epoch": 1.9857315729586216, + "grad_norm": 8.064999688202295, + "learning_rate": 6.573277082941309e-10, + "loss": 0.5833, + "step": 27486 + }, + { + "epoch": 1.9858038181588311, + "grad_norm": 6.297894492337982, + "learning_rate": 6.506376836920569e-10, + "loss": 0.5694, + "step": 27487 + }, + { + "epoch": 1.9858760633590404, + "grad_norm": 7.066285865648133, + "learning_rate": 6.439818732423275e-10, + "loss": 0.5464, + "step": 27488 + }, + { + "epoch": 1.9859483085592502, + "grad_norm": 7.050766101091759, + "learning_rate": 6.373602770357034e-10, + "loss": 0.6241, + "step": 27489 + }, + { + "epoch": 1.9860205537594595, + "grad_norm": 8.389135473583712, + "learning_rate": 6.307728951629454e-10, + "loss": 0.6751, + "step": 27490 + }, + { + "epoch": 1.9860927989596693, + "grad_norm": 8.241061830714306, + "learning_rate": 6.242197277142592e-10, + "loss": 0.5998, + "step": 27491 + }, + { + "epoch": 1.9861650441598786, + "grad_norm": 8.967869093961273, + "learning_rate": 6.177007747795727e-10, + "loss": 0.5658, + "step": 27492 + }, + { + "epoch": 1.9862372893600881, + "grad_norm": 7.54314524732011, + "learning_rate": 6.112160364477038e-10, + "loss": 0.568, + "step": 27493 + }, + { + "epoch": 1.9863095345602977, + "grad_norm": 7.188505152202521, + "learning_rate": 6.047655128077479e-10, + "loss": 0.6056, + "step": 27494 + }, + { + "epoch": 1.986381779760507, + "grad_norm": 8.14385676163951, + "learning_rate": 5.983492039479677e-10, + "loss": 0.6572, + "step": 27495 + }, + { + "epoch": 1.9864540249607168, + "grad_norm": 7.646209236349418, + "learning_rate": 5.919671099560708e-10, + "loss": 0.6899, + "step": 27496 + }, + { + "epoch": 1.986526270160926, + "grad_norm": 7.423694010232936, + "learning_rate": 5.856192309192099e-10, + "loss": 0.5722, + "step": 27497 + }, + { + "epoch": 1.9865985153611359, + "grad_norm": 7.35533053632425, + "learning_rate": 5.793055669248149e-10, + "loss": 0.553, + "step": 27498 + }, + { + "epoch": 1.9866707605613452, + "grad_norm": 6.577059335536957, + "learning_rate": 5.730261180589281e-10, + "loss": 0.5295, + "step": 27499 + }, + { + "epoch": 1.9867430057615547, + "grad_norm": 7.297526099748714, + "learning_rate": 5.667808844078693e-10, + "loss": 0.5585, + "step": 27500 + }, + { + "epoch": 1.9868152509617643, + "grad_norm": 6.8849196569648745, + "learning_rate": 5.605698660568481e-10, + "loss": 0.5984, + "step": 27501 + }, + { + "epoch": 1.9868874961619736, + "grad_norm": 8.254007002165487, + "learning_rate": 5.543930630907967e-10, + "loss": 0.6448, + "step": 27502 + }, + { + "epoch": 1.9869597413621833, + "grad_norm": 7.488134280231578, + "learning_rate": 5.482504755943696e-10, + "loss": 0.5665, + "step": 27503 + }, + { + "epoch": 1.9870319865623927, + "grad_norm": 7.130217666192608, + "learning_rate": 5.42142103651666e-10, + "loss": 0.5403, + "step": 27504 + }, + { + "epoch": 1.9871042317626024, + "grad_norm": 7.437494614543087, + "learning_rate": 5.360679473465081e-10, + "loss": 0.5489, + "step": 27505 + }, + { + "epoch": 1.9871764769628117, + "grad_norm": 5.902991632315062, + "learning_rate": 5.300280067618846e-10, + "loss": 0.5698, + "step": 27506 + }, + { + "epoch": 1.9872487221630213, + "grad_norm": 8.91631019331309, + "learning_rate": 5.240222819805074e-10, + "loss": 0.5885, + "step": 27507 + }, + { + "epoch": 1.9873209673632308, + "grad_norm": 9.906645469112316, + "learning_rate": 5.180507730842554e-10, + "loss": 0.5777, + "step": 27508 + }, + { + "epoch": 1.9873932125634404, + "grad_norm": 6.06615062796516, + "learning_rate": 5.121134801552852e-10, + "loss": 0.5679, + "step": 27509 + }, + { + "epoch": 1.98746545776365, + "grad_norm": 8.188053956838441, + "learning_rate": 5.062104032749204e-10, + "loss": 0.5526, + "step": 27510 + }, + { + "epoch": 1.9875377029638592, + "grad_norm": 7.517355610583855, + "learning_rate": 5.003415425236525e-10, + "loss": 0.5892, + "step": 27511 + }, + { + "epoch": 1.987609948164069, + "grad_norm": 8.572792271715464, + "learning_rate": 4.945068979819723e-10, + "loss": 0.5929, + "step": 27512 + }, + { + "epoch": 1.9876821933642783, + "grad_norm": 7.901631208607242, + "learning_rate": 4.887064697298161e-10, + "loss": 0.6221, + "step": 27513 + }, + { + "epoch": 1.9877544385644879, + "grad_norm": 9.26277062642352, + "learning_rate": 4.829402578465647e-10, + "loss": 0.6073, + "step": 27514 + }, + { + "epoch": 1.9878266837646974, + "grad_norm": 7.227613415984419, + "learning_rate": 4.772082624110441e-10, + "loss": 0.6082, + "step": 27515 + }, + { + "epoch": 1.987898928964907, + "grad_norm": 6.990359616176879, + "learning_rate": 4.715104835018025e-10, + "loss": 0.6373, + "step": 27516 + }, + { + "epoch": 1.9879711741651165, + "grad_norm": 8.053441362898505, + "learning_rate": 4.658469211968331e-10, + "loss": 0.602, + "step": 27517 + }, + { + "epoch": 1.9880434193653258, + "grad_norm": 7.63267206162383, + "learning_rate": 4.6021757557357385e-10, + "loss": 0.5947, + "step": 27518 + }, + { + "epoch": 1.9881156645655356, + "grad_norm": 6.841760076085737, + "learning_rate": 4.546224467091853e-10, + "loss": 0.6323, + "step": 27519 + }, + { + "epoch": 1.9881879097657449, + "grad_norm": 6.906660939414302, + "learning_rate": 4.4906153468027294e-10, + "loss": 0.6708, + "step": 27520 + }, + { + "epoch": 1.9882601549659544, + "grad_norm": 7.458409257498796, + "learning_rate": 4.435348395628869e-10, + "loss": 0.5437, + "step": 27521 + }, + { + "epoch": 1.988332400166164, + "grad_norm": 8.838915039040867, + "learning_rate": 4.380423614328e-10, + "loss": 0.5761, + "step": 27522 + }, + { + "epoch": 1.9884046453663735, + "grad_norm": 7.249647986975546, + "learning_rate": 4.325841003652298e-10, + "loss": 0.5409, + "step": 27523 + }, + { + "epoch": 1.988476890566583, + "grad_norm": 7.620869753003021, + "learning_rate": 4.271600564348388e-10, + "loss": 0.6036, + "step": 27524 + }, + { + "epoch": 1.9885491357667924, + "grad_norm": 7.416681982588938, + "learning_rate": 4.2177022971545687e-10, + "loss": 0.6795, + "step": 27525 + }, + { + "epoch": 1.9886213809670021, + "grad_norm": 8.663472051426025, + "learning_rate": 4.1641462028146895e-10, + "loss": 0.6409, + "step": 27526 + }, + { + "epoch": 1.9886936261672115, + "grad_norm": 7.486668500121592, + "learning_rate": 4.110932282061497e-10, + "loss": 0.609, + "step": 27527 + }, + { + "epoch": 1.988765871367421, + "grad_norm": 9.012726157389114, + "learning_rate": 4.0580605356194125e-10, + "loss": 0.5821, + "step": 27528 + }, + { + "epoch": 1.9888381165676305, + "grad_norm": 7.2184329272897765, + "learning_rate": 4.0055309642128557e-10, + "loss": 0.5376, + "step": 27529 + }, + { + "epoch": 1.98891036176784, + "grad_norm": 8.842052498997024, + "learning_rate": 3.953343568566248e-10, + "loss": 0.5964, + "step": 27530 + }, + { + "epoch": 1.9889826069680496, + "grad_norm": 6.246048250205604, + "learning_rate": 3.9014983493873557e-10, + "loss": 0.5914, + "step": 27531 + }, + { + "epoch": 1.989054852168259, + "grad_norm": 7.397787289108977, + "learning_rate": 3.849995307389498e-10, + "loss": 0.5769, + "step": 27532 + }, + { + "epoch": 1.9891270973684687, + "grad_norm": 8.095318852083254, + "learning_rate": 3.798834443277666e-10, + "loss": 0.6011, + "step": 27533 + }, + { + "epoch": 1.989199342568678, + "grad_norm": 7.2061123228527535, + "learning_rate": 3.7480157577513e-10, + "loss": 0.6254, + "step": 27534 + }, + { + "epoch": 1.9892715877688876, + "grad_norm": 7.103932336725582, + "learning_rate": 3.6975392515042895e-10, + "loss": 0.5614, + "step": 27535 + }, + { + "epoch": 1.9893438329690971, + "grad_norm": 6.926933145448545, + "learning_rate": 3.6474049252305243e-10, + "loss": 0.5357, + "step": 27536 + }, + { + "epoch": 1.9894160781693067, + "grad_norm": 7.854639526487798, + "learning_rate": 3.597612779615567e-10, + "loss": 0.5419, + "step": 27537 + }, + { + "epoch": 1.9894883233695162, + "grad_norm": 6.642113516536189, + "learning_rate": 3.548162815342204e-10, + "loss": 0.5348, + "step": 27538 + }, + { + "epoch": 1.9895605685697255, + "grad_norm": 7.91293137084094, + "learning_rate": 3.499055033084897e-10, + "loss": 0.5929, + "step": 27539 + }, + { + "epoch": 1.9896328137699353, + "grad_norm": 7.244099550806984, + "learning_rate": 3.450289433518106e-10, + "loss": 0.6391, + "step": 27540 + }, + { + "epoch": 1.9897050589701446, + "grad_norm": 7.991009428692543, + "learning_rate": 3.4018660173051886e-10, + "loss": 0.587, + "step": 27541 + }, + { + "epoch": 1.9897773041703541, + "grad_norm": 5.920673500103195, + "learning_rate": 3.3537847851150553e-10, + "loss": 0.582, + "step": 27542 + }, + { + "epoch": 1.9898495493705637, + "grad_norm": 7.284271268298308, + "learning_rate": 3.3060457376027364e-10, + "loss": 0.5305, + "step": 27543 + }, + { + "epoch": 1.9899217945707732, + "grad_norm": 7.66408191176823, + "learning_rate": 3.258648875420489e-10, + "loss": 0.606, + "step": 27544 + }, + { + "epoch": 1.9899940397709828, + "grad_norm": 7.353312528320835, + "learning_rate": 3.211594199220569e-10, + "loss": 0.559, + "step": 27545 + }, + { + "epoch": 1.990066284971192, + "grad_norm": 8.242749814201067, + "learning_rate": 3.1648817096441295e-10, + "loss": 0.6063, + "step": 27546 + }, + { + "epoch": 1.9901385301714019, + "grad_norm": 6.626013624274573, + "learning_rate": 3.118511407332325e-10, + "loss": 0.657, + "step": 27547 + }, + { + "epoch": 1.9902107753716112, + "grad_norm": 8.824069173422416, + "learning_rate": 3.0724832929179826e-10, + "loss": 0.661, + "step": 27548 + }, + { + "epoch": 1.9902830205718207, + "grad_norm": 7.836573262157642, + "learning_rate": 3.026797367033929e-10, + "loss": 0.6522, + "step": 27549 + }, + { + "epoch": 1.9903552657720303, + "grad_norm": 6.383675450385881, + "learning_rate": 2.9814536303018893e-10, + "loss": 0.5785, + "step": 27550 + }, + { + "epoch": 1.9904275109722398, + "grad_norm": 8.637040289695326, + "learning_rate": 2.9364520833463637e-10, + "loss": 0.5863, + "step": 27551 + }, + { + "epoch": 1.9904997561724493, + "grad_norm": 7.001085469690128, + "learning_rate": 2.891792726780751e-10, + "loss": 0.6139, + "step": 27552 + }, + { + "epoch": 1.9905720013726587, + "grad_norm": 6.210724986537275, + "learning_rate": 2.8474755612184493e-10, + "loss": 0.5212, + "step": 27553 + }, + { + "epoch": 1.9906442465728684, + "grad_norm": 7.849967042890237, + "learning_rate": 2.803500587267305e-10, + "loss": 0.6104, + "step": 27554 + }, + { + "epoch": 1.9907164917730777, + "grad_norm": 8.824957085091743, + "learning_rate": 2.7598678055240633e-10, + "loss": 0.5723, + "step": 27555 + }, + { + "epoch": 1.9907887369732873, + "grad_norm": 7.406915948600261, + "learning_rate": 2.716577216591021e-10, + "loss": 0.6414, + "step": 27556 + }, + { + "epoch": 1.9908609821734968, + "grad_norm": 6.38768717571814, + "learning_rate": 2.6736288210565954e-10, + "loss": 0.5785, + "step": 27557 + }, + { + "epoch": 1.9909332273737064, + "grad_norm": 7.922299465868751, + "learning_rate": 2.63102261951198e-10, + "loss": 0.6822, + "step": 27558 + }, + { + "epoch": 1.991005472573916, + "grad_norm": 6.919772205462278, + "learning_rate": 2.588758612540043e-10, + "loss": 0.5632, + "step": 27559 + }, + { + "epoch": 1.9910777177741252, + "grad_norm": 6.92159034712035, + "learning_rate": 2.5468368007181e-10, + "loss": 0.6501, + "step": 27560 + }, + { + "epoch": 1.991149962974335, + "grad_norm": 8.334674371263702, + "learning_rate": 2.5052571846234664e-10, + "loss": 0.6626, + "step": 27561 + }, + { + "epoch": 1.9912222081745443, + "grad_norm": 7.43525803057306, + "learning_rate": 2.464019764819581e-10, + "loss": 0.6452, + "step": 27562 + }, + { + "epoch": 1.991294453374754, + "grad_norm": 7.391872695867015, + "learning_rate": 2.4231245418726567e-10, + "loss": 0.6076, + "step": 27563 + }, + { + "epoch": 1.9913666985749634, + "grad_norm": 9.423418263431701, + "learning_rate": 2.3825715163461325e-10, + "loss": 0.6823, + "step": 27564 + }, + { + "epoch": 1.991438943775173, + "grad_norm": 6.724301054415259, + "learning_rate": 2.3423606887923446e-10, + "loss": 0.5914, + "step": 27565 + }, + { + "epoch": 1.9915111889753825, + "grad_norm": 7.00857971822737, + "learning_rate": 2.302492059763628e-10, + "loss": 0.5491, + "step": 27566 + }, + { + "epoch": 1.9915834341755918, + "grad_norm": 6.935580426207216, + "learning_rate": 2.2629656298039925e-10, + "loss": 0.5988, + "step": 27567 + }, + { + "epoch": 1.9916556793758016, + "grad_norm": 6.132956494627223, + "learning_rate": 2.2237813994518964e-10, + "loss": 0.602, + "step": 27568 + }, + { + "epoch": 1.9917279245760109, + "grad_norm": 8.329985849063938, + "learning_rate": 2.1849393692485731e-10, + "loss": 0.6513, + "step": 27569 + }, + { + "epoch": 1.9918001697762207, + "grad_norm": 6.345838982656181, + "learning_rate": 2.1464395397241543e-10, + "loss": 0.6215, + "step": 27570 + }, + { + "epoch": 1.99187241497643, + "grad_norm": 7.105525657757269, + "learning_rate": 2.108281911405996e-10, + "loss": 0.6051, + "step": 27571 + }, + { + "epoch": 1.9919446601766395, + "grad_norm": 7.3682385500915135, + "learning_rate": 2.070466484815903e-10, + "loss": 0.6059, + "step": 27572 + }, + { + "epoch": 1.992016905376849, + "grad_norm": 6.376481426371394, + "learning_rate": 2.0329932604701286e-10, + "loss": 0.6273, + "step": 27573 + }, + { + "epoch": 1.9920891505770584, + "grad_norm": 7.312534430007086, + "learning_rate": 1.995862238884927e-10, + "loss": 0.6107, + "step": 27574 + }, + { + "epoch": 1.9921613957772681, + "grad_norm": 7.154523749559282, + "learning_rate": 1.9590734205626739e-10, + "loss": 0.6391, + "step": 27575 + }, + { + "epoch": 1.9922336409774775, + "grad_norm": 7.1347220498087625, + "learning_rate": 1.9226268060140718e-10, + "loss": 0.5803, + "step": 27576 + }, + { + "epoch": 1.9923058861776872, + "grad_norm": 7.1574708655134485, + "learning_rate": 1.886522395735946e-10, + "loss": 0.544, + "step": 27577 + }, + { + "epoch": 1.9923781313778965, + "grad_norm": 7.369126389381294, + "learning_rate": 1.8507601902195692e-10, + "loss": 0.5846, + "step": 27578 + }, + { + "epoch": 1.992450376578106, + "grad_norm": 6.841539278443582, + "learning_rate": 1.8153401899562162e-10, + "loss": 0.5821, + "step": 27579 + }, + { + "epoch": 1.9925226217783156, + "grad_norm": 8.526903994759166, + "learning_rate": 1.780262395431609e-10, + "loss": 0.5855, + "step": 27580 + }, + { + "epoch": 1.992594866978525, + "grad_norm": 8.774159239569801, + "learning_rate": 1.745526807125919e-10, + "loss": 0.5911, + "step": 27581 + }, + { + "epoch": 1.9926671121787347, + "grad_norm": 7.738850356486284, + "learning_rate": 1.7111334255137667e-10, + "loss": 0.6247, + "step": 27582 + }, + { + "epoch": 1.992739357378944, + "grad_norm": 6.657157155576468, + "learning_rate": 1.6770822510669972e-10, + "loss": 0.6041, + "step": 27583 + }, + { + "epoch": 1.9928116025791538, + "grad_norm": 7.738119802470919, + "learning_rate": 1.6433732842491279e-10, + "loss": 0.6388, + "step": 27584 + }, + { + "epoch": 1.9928838477793631, + "grad_norm": 7.071471600862627, + "learning_rate": 1.610006525526453e-10, + "loss": 0.6076, + "step": 27585 + }, + { + "epoch": 1.9929560929795727, + "grad_norm": 8.263238169235104, + "learning_rate": 1.5769819753513883e-10, + "loss": 0.6436, + "step": 27586 + }, + { + "epoch": 1.9930283381797822, + "grad_norm": 8.101252668557171, + "learning_rate": 1.5442996341763495e-10, + "loss": 0.6016, + "step": 27587 + }, + { + "epoch": 1.9931005833799917, + "grad_norm": 7.956477030281583, + "learning_rate": 1.511959502450977e-10, + "loss": 0.6292, + "step": 27588 + }, + { + "epoch": 1.9931728285802013, + "grad_norm": 7.810893633681709, + "learning_rate": 1.4799615806165845e-10, + "loss": 0.6751, + "step": 27589 + }, + { + "epoch": 1.9932450737804106, + "grad_norm": 6.753765892392781, + "learning_rate": 1.4483058691089346e-10, + "loss": 0.5752, + "step": 27590 + }, + { + "epoch": 1.9933173189806204, + "grad_norm": 6.305876023060784, + "learning_rate": 1.4169923683665655e-10, + "loss": 0.6118, + "step": 27591 + }, + { + "epoch": 1.9933895641808297, + "grad_norm": 7.098337446242777, + "learning_rate": 1.386021078814137e-10, + "loss": 0.631, + "step": 27592 + }, + { + "epoch": 1.9934618093810392, + "grad_norm": 7.126404740908981, + "learning_rate": 1.3553920008790855e-10, + "loss": 0.6015, + "step": 27593 + }, + { + "epoch": 1.9935340545812488, + "grad_norm": 6.483678720618201, + "learning_rate": 1.3251051349777444e-10, + "loss": 0.5273, + "step": 27594 + }, + { + "epoch": 1.9936062997814583, + "grad_norm": 8.20733755303586, + "learning_rate": 1.295160481523672e-10, + "loss": 0.5745, + "step": 27595 + }, + { + "epoch": 1.9936785449816679, + "grad_norm": 7.693639273199211, + "learning_rate": 1.2655580409304258e-10, + "loss": 0.6329, + "step": 27596 + }, + { + "epoch": 1.9937507901818772, + "grad_norm": 6.898322755716857, + "learning_rate": 1.2362978136004622e-10, + "loss": 0.569, + "step": 27597 + }, + { + "epoch": 1.993823035382087, + "grad_norm": 6.617454118097605, + "learning_rate": 1.207379799936237e-10, + "loss": 0.5381, + "step": 27598 + }, + { + "epoch": 1.9938952805822963, + "grad_norm": 8.35366380911062, + "learning_rate": 1.178804000331879e-10, + "loss": 0.6079, + "step": 27599 + }, + { + "epoch": 1.9939675257825058, + "grad_norm": 7.090311031863444, + "learning_rate": 1.1505704151787423e-10, + "loss": 0.5886, + "step": 27600 + }, + { + "epoch": 1.9940397709827153, + "grad_norm": 7.040591667628044, + "learning_rate": 1.1226790448654046e-10, + "loss": 0.6939, + "step": 27601 + }, + { + "epoch": 1.9941120161829249, + "grad_norm": 7.581999553160175, + "learning_rate": 1.0951298897721175e-10, + "loss": 0.5827, + "step": 27602 + }, + { + "epoch": 1.9941842613831344, + "grad_norm": 5.5134208965305564, + "learning_rate": 1.0679229502763567e-10, + "loss": 0.5531, + "step": 27603 + }, + { + "epoch": 1.9942565065833437, + "grad_norm": 6.351010231698164, + "learning_rate": 1.0410582267472714e-10, + "loss": 0.6309, + "step": 27604 + }, + { + "epoch": 1.9943287517835535, + "grad_norm": 7.480760693144495, + "learning_rate": 1.0145357195595618e-10, + "loss": 0.621, + "step": 27605 + }, + { + "epoch": 1.9944009969837628, + "grad_norm": 7.644933690943043, + "learning_rate": 9.883554290712749e-11, + "loss": 0.5748, + "step": 27606 + }, + { + "epoch": 1.9944732421839724, + "grad_norm": 6.53833733004836, + "learning_rate": 9.625173556404576e-11, + "loss": 0.6048, + "step": 27607 + }, + { + "epoch": 1.994545487384182, + "grad_norm": 6.474317725331753, + "learning_rate": 9.370214996223814e-11, + "loss": 0.5692, + "step": 27608 + }, + { + "epoch": 1.9946177325843915, + "grad_norm": 8.404342037773725, + "learning_rate": 9.118678613667665e-11, + "loss": 0.6115, + "step": 27609 + }, + { + "epoch": 1.994689977784601, + "grad_norm": 8.353586634971158, + "learning_rate": 8.870564412177818e-11, + "loss": 0.5993, + "step": 27610 + }, + { + "epoch": 1.9947622229848103, + "grad_norm": 6.314515651271778, + "learning_rate": 8.625872395140456e-11, + "loss": 0.5625, + "step": 27611 + }, + { + "epoch": 1.99483446818502, + "grad_norm": 6.184069674811234, + "learning_rate": 8.384602565914002e-11, + "loss": 0.561, + "step": 27612 + }, + { + "epoch": 1.9949067133852294, + "grad_norm": 7.825332234797506, + "learning_rate": 8.146754927773614e-11, + "loss": 0.6064, + "step": 27613 + }, + { + "epoch": 1.994978958585439, + "grad_norm": 6.497514396202399, + "learning_rate": 7.912329484022208e-11, + "loss": 0.603, + "step": 27614 + }, + { + "epoch": 1.9950512037856485, + "grad_norm": 7.32427135397776, + "learning_rate": 7.681326237851671e-11, + "loss": 0.593, + "step": 27615 + }, + { + "epoch": 1.995123448985858, + "grad_norm": 7.468400380398962, + "learning_rate": 7.453745192398387e-11, + "loss": 0.664, + "step": 27616 + }, + { + "epoch": 1.9951956941860676, + "grad_norm": 7.397344071374721, + "learning_rate": 7.229586350798734e-11, + "loss": 0.6127, + "step": 27617 + }, + { + "epoch": 1.995267939386277, + "grad_norm": 6.054099643841237, + "learning_rate": 7.008849716105826e-11, + "loss": 0.6353, + "step": 27618 + }, + { + "epoch": 1.9953401845864867, + "grad_norm": 6.908214997250895, + "learning_rate": 6.791535291372775e-11, + "loss": 0.5373, + "step": 27619 + }, + { + "epoch": 1.995412429786696, + "grad_norm": 6.829931513385699, + "learning_rate": 6.577643079569429e-11, + "loss": 0.5964, + "step": 27620 + }, + { + "epoch": 1.9954846749869055, + "grad_norm": 7.420887064181474, + "learning_rate": 6.367173083582367e-11, + "loss": 0.5641, + "step": 27621 + }, + { + "epoch": 1.995556920187115, + "grad_norm": 7.678118680362173, + "learning_rate": 6.160125306325926e-11, + "loss": 0.6485, + "step": 27622 + }, + { + "epoch": 1.9956291653873246, + "grad_norm": 8.210991391949689, + "learning_rate": 5.956499750658929e-11, + "loss": 0.5813, + "step": 27623 + }, + { + "epoch": 1.9957014105875341, + "grad_norm": 7.977917472988207, + "learning_rate": 5.756296419301421e-11, + "loss": 0.5963, + "step": 27624 + }, + { + "epoch": 1.9957736557877435, + "grad_norm": 7.821870357132561, + "learning_rate": 5.5595153150567183e-11, + "loss": 0.6551, + "step": 27625 + }, + { + "epoch": 1.9958459009879532, + "grad_norm": 7.90760061918217, + "learning_rate": 5.366156440589354e-11, + "loss": 0.6814, + "step": 27626 + }, + { + "epoch": 1.9959181461881625, + "grad_norm": 6.806440826066881, + "learning_rate": 5.176219798536109e-11, + "loss": 0.6476, + "step": 27627 + }, + { + "epoch": 1.995990391388372, + "grad_norm": 7.321288476328008, + "learning_rate": 4.9897053915337614e-11, + "loss": 0.6625, + "step": 27628 + }, + { + "epoch": 1.9960626365885816, + "grad_norm": 7.289584779989079, + "learning_rate": 4.8066132221080696e-11, + "loss": 0.5807, + "step": 27629 + }, + { + "epoch": 1.9961348817887912, + "grad_norm": 6.7420268122508515, + "learning_rate": 4.6269432927570354e-11, + "loss": 0.623, + "step": 27630 + }, + { + "epoch": 1.9962071269890007, + "grad_norm": 7.253441355592767, + "learning_rate": 4.450695605950906e-11, + "loss": 0.5737, + "step": 27631 + }, + { + "epoch": 1.99627937218921, + "grad_norm": 8.446851732996072, + "learning_rate": 4.27787016413217e-11, + "loss": 0.5672, + "step": 27632 + }, + { + "epoch": 1.9963516173894198, + "grad_norm": 7.352966239437697, + "learning_rate": 4.108466969632297e-11, + "loss": 0.6153, + "step": 27633 + }, + { + "epoch": 1.9964238625896291, + "grad_norm": 9.192511404952173, + "learning_rate": 3.942486024754999e-11, + "loss": 0.5589, + "step": 27634 + }, + { + "epoch": 1.9964961077898387, + "grad_norm": 6.512586584998284, + "learning_rate": 3.7799273318317454e-11, + "loss": 0.5573, + "step": 27635 + }, + { + "epoch": 1.9965683529900482, + "grad_norm": 6.705080116242786, + "learning_rate": 3.62079089302747e-11, + "loss": 0.5785, + "step": 27636 + }, + { + "epoch": 1.9966405981902577, + "grad_norm": 6.745902901994274, + "learning_rate": 3.46507671056262e-11, + "loss": 0.5367, + "step": 27637 + }, + { + "epoch": 1.9967128433904673, + "grad_norm": 6.825782356554649, + "learning_rate": 3.3127847865188635e-11, + "loss": 0.6088, + "step": 27638 + }, + { + "epoch": 1.9967850885906766, + "grad_norm": 6.718835732556216, + "learning_rate": 3.163915123033379e-11, + "loss": 0.5379, + "step": 27639 + }, + { + "epoch": 1.9968573337908864, + "grad_norm": 7.454072072377961, + "learning_rate": 3.0184677221323234e-11, + "loss": 0.5561, + "step": 27640 + }, + { + "epoch": 1.9969295789910957, + "grad_norm": 7.175444468910341, + "learning_rate": 2.876442585786343e-11, + "loss": 0.5587, + "step": 27641 + }, + { + "epoch": 1.9970018241913055, + "grad_norm": 7.109409340576403, + "learning_rate": 2.737839715966084e-11, + "loss": 0.5679, + "step": 27642 + }, + { + "epoch": 1.9970740693915148, + "grad_norm": 6.514116364144506, + "learning_rate": 2.6026591145311697e-11, + "loss": 0.5608, + "step": 27643 + }, + { + "epoch": 1.9971463145917243, + "grad_norm": 7.993768649344205, + "learning_rate": 2.470900783368979e-11, + "loss": 0.6221, + "step": 27644 + }, + { + "epoch": 1.9972185597919339, + "grad_norm": 6.8861149497784675, + "learning_rate": 2.3425647242836247e-11, + "loss": 0.5943, + "step": 27645 + }, + { + "epoch": 1.9972908049921432, + "grad_norm": 8.817653150361389, + "learning_rate": 2.2176509389959523e-11, + "loss": 0.5849, + "step": 27646 + }, + { + "epoch": 1.997363050192353, + "grad_norm": 9.696438904525683, + "learning_rate": 2.096159429254563e-11, + "loss": 0.6698, + "step": 27647 + }, + { + "epoch": 1.9974352953925623, + "grad_norm": 7.1526200195824226, + "learning_rate": 1.9780901966970357e-11, + "loss": 0.6028, + "step": 27648 + }, + { + "epoch": 1.997507540592772, + "grad_norm": 7.455832575379483, + "learning_rate": 1.863443242933194e-11, + "loss": 0.5337, + "step": 27649 + }, + { + "epoch": 1.9975797857929813, + "grad_norm": 7.163041136639892, + "learning_rate": 1.752218569572861e-11, + "loss": 0.5857, + "step": 27650 + }, + { + "epoch": 1.9976520309931909, + "grad_norm": 8.39113236070805, + "learning_rate": 1.6444161780870827e-11, + "loss": 0.5579, + "step": 27651 + }, + { + "epoch": 1.9977242761934004, + "grad_norm": 7.175254939531112, + "learning_rate": 1.5400360700024154e-11, + "loss": 0.6432, + "step": 27652 + }, + { + "epoch": 1.9977965213936097, + "grad_norm": 7.560678562208879, + "learning_rate": 1.4390782467066378e-11, + "loss": 0.6285, + "step": 27653 + }, + { + "epoch": 1.9978687665938195, + "grad_norm": 7.025264379338999, + "learning_rate": 1.3415427096152845e-11, + "loss": 0.5357, + "step": 27654 + }, + { + "epoch": 1.9979410117940288, + "grad_norm": 7.857743748068064, + "learning_rate": 1.2474294600328674e-11, + "loss": 0.5985, + "step": 27655 + }, + { + "epoch": 1.9980132569942386, + "grad_norm": 7.759472871947442, + "learning_rate": 1.1567384992638986e-11, + "loss": 0.6049, + "step": 27656 + }, + { + "epoch": 1.998085502194448, + "grad_norm": 7.7420241630254, + "learning_rate": 1.069469828557379e-11, + "loss": 0.5621, + "step": 27657 + }, + { + "epoch": 1.9981577473946575, + "grad_norm": 7.912111303535929, + "learning_rate": 9.856234491067984e-12, + "loss": 0.6485, + "step": 27658 + }, + { + "epoch": 1.998229992594867, + "grad_norm": 8.71628943284072, + "learning_rate": 9.051993620223798e-12, + "loss": 0.604, + "step": 27659 + }, + { + "epoch": 1.9983022377950765, + "grad_norm": 10.509515719303952, + "learning_rate": 8.281975684421017e-12, + "loss": 0.6039, + "step": 27660 + }, + { + "epoch": 1.998374482995286, + "grad_norm": 7.212876874309936, + "learning_rate": 7.546180694206761e-12, + "loss": 0.557, + "step": 27661 + }, + { + "epoch": 1.9984467281954954, + "grad_norm": 7.211281880827159, + "learning_rate": 6.8446086595730374e-12, + "loss": 0.5816, + "step": 27662 + }, + { + "epoch": 1.9985189733957052, + "grad_norm": 7.23308114990718, + "learning_rate": 6.177259590234297e-12, + "loss": 0.5956, + "step": 27663 + }, + { + "epoch": 1.9985912185959145, + "grad_norm": 7.592814069927452, + "learning_rate": 5.544133495072324e-12, + "loss": 0.6853, + "step": 27664 + }, + { + "epoch": 1.998663463796124, + "grad_norm": 6.948119687188758, + "learning_rate": 4.945230382691346e-12, + "loss": 0.5817, + "step": 27665 + }, + { + "epoch": 1.9987357089963336, + "grad_norm": 6.760613186355035, + "learning_rate": 4.380550261695593e-12, + "loss": 0.5987, + "step": 27666 + }, + { + "epoch": 1.9988079541965431, + "grad_norm": 6.6612054708735196, + "learning_rate": 3.85009313957907e-12, + "loss": 0.6224, + "step": 27667 + }, + { + "epoch": 1.9988801993967527, + "grad_norm": 8.053498203450987, + "learning_rate": 3.3538590238357815e-12, + "loss": 0.6129, + "step": 27668 + }, + { + "epoch": 1.998952444596962, + "grad_norm": 7.62718119457336, + "learning_rate": 2.891847920849511e-12, + "loss": 0.5844, + "step": 27669 + }, + { + "epoch": 1.9990246897971717, + "grad_norm": 7.796769563806764, + "learning_rate": 2.4640598370040404e-12, + "loss": 0.5461, + "step": 27670 + }, + { + "epoch": 1.999096934997381, + "grad_norm": 7.794256286573532, + "learning_rate": 2.0704947784055962e-12, + "loss": 0.6329, + "step": 27671 + }, + { + "epoch": 1.9991691801975906, + "grad_norm": 8.532809942804278, + "learning_rate": 1.7111527506052939e-12, + "loss": 0.5999, + "step": 27672 + }, + { + "epoch": 1.9992414253978001, + "grad_norm": 7.468246123742503, + "learning_rate": 1.3860337580440253e-12, + "loss": 0.5642, + "step": 27673 + }, + { + "epoch": 1.9993136705980097, + "grad_norm": 8.047742008055593, + "learning_rate": 1.0951378054402383e-12, + "loss": 0.6071, + "step": 27674 + }, + { + "epoch": 1.9993859157982192, + "grad_norm": 7.9230491475308575, + "learning_rate": 8.38464896402158e-13, + "loss": 0.572, + "step": 27675 + }, + { + "epoch": 1.9994581609984285, + "grad_norm": 7.349601589485645, + "learning_rate": 6.160150350931204e-13, + "loss": 0.6082, + "step": 27676 + }, + { + "epoch": 1.9995304061986383, + "grad_norm": 7.643453312478533, + "learning_rate": 4.2778822428868326e-13, + "loss": 0.6224, + "step": 27677 + }, + { + "epoch": 1.9996026513988476, + "grad_norm": 8.19567323368971, + "learning_rate": 2.7378446620929256e-13, + "loss": 0.6153, + "step": 27678 + }, + { + "epoch": 1.9996748965990572, + "grad_norm": 7.5635677876398155, + "learning_rate": 1.5400376363050584e-13, + "loss": 0.6119, + "step": 27679 + }, + { + "epoch": 1.9997471417992667, + "grad_norm": 7.835210399419649, + "learning_rate": 6.844611738499041e-14, + "loss": 0.5718, + "step": 27680 + }, + { + "epoch": 1.9998193869994763, + "grad_norm": 7.066121750940971, + "learning_rate": 1.711152941563654e-14, + "loss": 0.5958, + "step": 27681 + }, + { + "epoch": 1.9998916321996858, + "grad_norm": 7.088110475975167, + "learning_rate": 0.0, + "loss": 0.6426, + "step": 27682 + }, + { + "epoch": 1.9998916321996858, + "step": 27682, + "total_flos": 6025310322507776.0, + "train_loss": 0.7615339792853068, + "train_runtime": 1037563.6546, + "train_samples_per_second": 3.415, + "train_steps_per_second": 0.027 + } + ], + "logging_steps": 1.0, + "max_steps": 27682, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6025310322507776.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}