diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,39132 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.999328768962277, + "eval_steps": 500, + "global_step": 27930, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.27031534910202026, + "learning_rate": 4.9999996046262024e-05, + "loss": 2.2625, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.3651030957698822, + "learning_rate": 4.999998418504935e-05, + "loss": 2.1664, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.4536094069480896, + "learning_rate": 4.999996441636572e-05, + "loss": 2.1371, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.3164426386356354, + "learning_rate": 4.999993674021739e-05, + "loss": 2.2912, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.9441952109336853, + "learning_rate": 4.9999908905929944e-05, + "loss": 2.5717, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.45403921604156494, + "learning_rate": 4.99998669963689e-05, + "loss": 2.2767, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.38720938563346863, + "learning_rate": 4.9999817179373974e-05, + "loss": 2.2304, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.2565215826034546, + "learning_rate": 4.9999759454960915e-05, + "loss": 1.9965, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.41144421696662903, + "learning_rate": 4.9999693823147996e-05, + "loss": 2.2625, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 0.7766994833946228, + "learning_rate": 4.999962028395596e-05, + "loss": 2.253, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 0.4188031852245331, + "learning_rate": 4.9999538837408077e-05, + "loss": 2.132, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 0.6002323031425476, + "learning_rate": 4.999944948353011e-05, + "loss": 2.2579, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 0.3897973597049713, + "learning_rate": 4.999935222235031e-05, + "loss": 2.0546, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 3.2654597759246826, + "learning_rate": 4.9999247053899453e-05, + "loss": 2.1811, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.29960134625434875, + "learning_rate": 4.99991339782108e-05, + "loss": 1.9428, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.48218828439712524, + "learning_rate": 4.999901299532012e-05, + "loss": 2.2476, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 1.6963359117507935, + "learning_rate": 4.9998884105265666e-05, + "loss": 1.9934, + "step": 85 + }, + { + "epoch": 0.02, + "grad_norm": 2.0964760780334473, + "learning_rate": 4.9998747308088226e-05, + "loss": 2.1915, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 0.5373502969741821, + "learning_rate": 4.999860260383104e-05, + "loss": 1.988, + "step": 95 + }, + { + "epoch": 0.02, + "grad_norm": 0.4756152331829071, + "learning_rate": 4.999844999253991e-05, + "loss": 2.0278, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 0.35103705525398254, + "learning_rate": 4.9998289474263094e-05, + "loss": 2.1245, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 0.6553156971931458, + "learning_rate": 4.999812104905136e-05, + "loss": 2.0964, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 0.5029746890068054, + "learning_rate": 4.9997944716957985e-05, + "loss": 2.356, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 0.3125253915786743, + "learning_rate": 4.999776047803873e-05, + "loss": 2.1678, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 0.8542938232421875, + "learning_rate": 4.99975683323519e-05, + "loss": 2.1263, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 0.3991641700267792, + "learning_rate": 4.9997368279958236e-05, + "loss": 2.0609, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 0.4593062698841095, + "learning_rate": 4.999716032092103e-05, + "loss": 2.0401, + "step": 135 + }, + { + "epoch": 0.03, + "grad_norm": 0.3167307674884796, + "learning_rate": 4.999694445530607e-05, + "loss": 1.9401, + "step": 140 + }, + { + "epoch": 0.03, + "grad_norm": 0.7320476174354553, + "learning_rate": 4.9996720683181617e-05, + "loss": 2.0134, + "step": 145 + }, + { + "epoch": 0.03, + "grad_norm": 0.5831002593040466, + "learning_rate": 4.9996489004618455e-05, + "loss": 1.922, + "step": 150 + }, + { + "epoch": 0.03, + "grad_norm": 1.0691373348236084, + "learning_rate": 4.999624941968986e-05, + "loss": 2.0451, + "step": 155 + }, + { + "epoch": 0.03, + "grad_norm": 0.5325329899787903, + "learning_rate": 4.999600192847162e-05, + "loss": 2.0023, + "step": 160 + }, + { + "epoch": 0.03, + "grad_norm": 1.7082959413528442, + "learning_rate": 4.9995746531042006e-05, + "loss": 1.9057, + "step": 165 + }, + { + "epoch": 0.03, + "grad_norm": 1.2313815355300903, + "learning_rate": 4.9995483227481824e-05, + "loss": 1.8793, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 0.24114160239696503, + "learning_rate": 4.9995212017874325e-05, + "loss": 2.1221, + "step": 175 + }, + { + "epoch": 0.03, + "grad_norm": 0.4924792945384979, + "learning_rate": 4.9994932902305315e-05, + "loss": 2.0354, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 3.932770013809204, + "learning_rate": 4.9994645880863064e-05, + "loss": 2.0849, + "step": 185 + }, + { + "epoch": 0.03, + "grad_norm": 0.9561445713043213, + "learning_rate": 4.9994350953638366e-05, + "loss": 1.9205, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 0.7701564431190491, + "learning_rate": 4.999404812072451e-05, + "loss": 2.2077, + "step": 195 + }, + { + "epoch": 0.04, + "grad_norm": 0.6854715943336487, + "learning_rate": 4.9993737382217265e-05, + "loss": 2.2886, + "step": 200 + }, + { + "epoch": 0.04, + "grad_norm": 0.7976120710372925, + "learning_rate": 4.999341873821493e-05, + "loss": 2.3515, + "step": 205 + }, + { + "epoch": 0.04, + "grad_norm": 1.1986960172653198, + "learning_rate": 4.999309218881829e-05, + "loss": 1.7781, + "step": 210 + }, + { + "epoch": 0.04, + "grad_norm": 0.5879755020141602, + "learning_rate": 4.999275773413063e-05, + "loss": 2.2107, + "step": 215 + }, + { + "epoch": 0.04, + "grad_norm": 1.0343559980392456, + "learning_rate": 4.9992415374257734e-05, + "loss": 1.8219, + "step": 220 + }, + { + "epoch": 0.04, + "grad_norm": 0.6155093312263489, + "learning_rate": 4.9992065109307904e-05, + "loss": 2.3041, + "step": 225 + }, + { + "epoch": 0.04, + "grad_norm": 1.1242767572402954, + "learning_rate": 4.9991706939391916e-05, + "loss": 2.2295, + "step": 230 + }, + { + "epoch": 0.04, + "grad_norm": 0.5417547225952148, + "learning_rate": 4.9991340864623057e-05, + "loss": 1.7807, + "step": 235 + }, + { + "epoch": 0.04, + "grad_norm": 0.36535757780075073, + "learning_rate": 4.999096688511712e-05, + "loss": 1.8687, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 0.8204144835472107, + "learning_rate": 4.99905850009924e-05, + "loss": 2.1915, + "step": 245 + }, + { + "epoch": 0.04, + "grad_norm": 0.5473398566246033, + "learning_rate": 4.999019521236969e-05, + "loss": 1.9105, + "step": 250 + }, + { + "epoch": 0.05, + "grad_norm": 0.5916422605514526, + "learning_rate": 4.998979751937226e-05, + "loss": 1.8488, + "step": 255 + }, + { + "epoch": 0.05, + "grad_norm": 0.40376555919647217, + "learning_rate": 4.998939192212591e-05, + "loss": 1.9348, + "step": 260 + }, + { + "epoch": 0.05, + "grad_norm": 0.4245920479297638, + "learning_rate": 4.998897842075894e-05, + "loss": 1.8736, + "step": 265 + }, + { + "epoch": 0.05, + "grad_norm": 3.863102436065674, + "learning_rate": 4.998855701540213e-05, + "loss": 1.8208, + "step": 270 + }, + { + "epoch": 0.05, + "grad_norm": 0.9868331551551819, + "learning_rate": 4.998812770618877e-05, + "loss": 2.0812, + "step": 275 + }, + { + "epoch": 0.05, + "grad_norm": 0.3408450484275818, + "learning_rate": 4.998769049325465e-05, + "loss": 2.2077, + "step": 280 + }, + { + "epoch": 0.05, + "grad_norm": 0.3527214229106903, + "learning_rate": 4.998724537673807e-05, + "loss": 2.1004, + "step": 285 + }, + { + "epoch": 0.05, + "grad_norm": 1.1077073812484741, + "learning_rate": 4.9986792356779803e-05, + "loss": 2.3449, + "step": 290 + }, + { + "epoch": 0.05, + "grad_norm": 0.5574270486831665, + "learning_rate": 4.9986331433523156e-05, + "loss": 1.8885, + "step": 295 + }, + { + "epoch": 0.05, + "grad_norm": 0.4326357841491699, + "learning_rate": 4.99858626071139e-05, + "loss": 1.9964, + "step": 300 + }, + { + "epoch": 0.05, + "grad_norm": 0.9008758068084717, + "learning_rate": 4.998538587770034e-05, + "loss": 1.9349, + "step": 305 + }, + { + "epoch": 0.06, + "grad_norm": 0.8824496865272522, + "learning_rate": 4.998490124543327e-05, + "loss": 1.9985, + "step": 310 + }, + { + "epoch": 0.06, + "grad_norm": 0.6680270433425903, + "learning_rate": 4.998440871046596e-05, + "loss": 1.8583, + "step": 315 + }, + { + "epoch": 0.06, + "grad_norm": 0.392929345369339, + "learning_rate": 4.99839082729542e-05, + "loss": 1.926, + "step": 320 + }, + { + "epoch": 0.06, + "grad_norm": 0.6838160753250122, + "learning_rate": 4.998339993305629e-05, + "loss": 2.0671, + "step": 325 + }, + { + "epoch": 0.06, + "grad_norm": 0.6205262541770935, + "learning_rate": 4.9982883690933014e-05, + "loss": 1.8439, + "step": 330 + }, + { + "epoch": 0.06, + "grad_norm": 0.5349203944206238, + "learning_rate": 4.998235954674766e-05, + "loss": 2.1677, + "step": 335 + }, + { + "epoch": 0.06, + "grad_norm": 0.5297353267669678, + "learning_rate": 4.9981827500666e-05, + "loss": 2.1915, + "step": 340 + }, + { + "epoch": 0.06, + "grad_norm": 0.5714792013168335, + "learning_rate": 4.9981287552856344e-05, + "loss": 1.8878, + "step": 345 + }, + { + "epoch": 0.06, + "grad_norm": 0.546989917755127, + "learning_rate": 4.998073970348945e-05, + "loss": 2.1013, + "step": 350 + }, + { + "epoch": 0.06, + "grad_norm": 0.7602975964546204, + "learning_rate": 4.9980183952738634e-05, + "loss": 2.2048, + "step": 355 + }, + { + "epoch": 0.06, + "grad_norm": 1.0656648874282837, + "learning_rate": 4.997962030077965e-05, + "loss": 2.1493, + "step": 360 + }, + { + "epoch": 0.07, + "grad_norm": 0.6455523371696472, + "learning_rate": 4.997904874779079e-05, + "loss": 1.6774, + "step": 365 + }, + { + "epoch": 0.07, + "grad_norm": 0.34377214312553406, + "learning_rate": 4.997846929395285e-05, + "loss": 1.8235, + "step": 370 + }, + { + "epoch": 0.07, + "grad_norm": 0.6402696967124939, + "learning_rate": 4.997788193944909e-05, + "loss": 1.9533, + "step": 375 + }, + { + "epoch": 0.07, + "grad_norm": 0.5523343086242676, + "learning_rate": 4.99772866844653e-05, + "loss": 2.0183, + "step": 380 + }, + { + "epoch": 0.07, + "grad_norm": 0.6676031947135925, + "learning_rate": 4.997668352918975e-05, + "loss": 1.8157, + "step": 385 + }, + { + "epoch": 0.07, + "grad_norm": 0.9743964672088623, + "learning_rate": 4.997607247381323e-05, + "loss": 2.1711, + "step": 390 + }, + { + "epoch": 0.07, + "grad_norm": 0.7307085990905762, + "learning_rate": 4.9975453518529014e-05, + "loss": 1.8769, + "step": 395 + }, + { + "epoch": 0.07, + "grad_norm": 1.004335641860962, + "learning_rate": 4.997482666353287e-05, + "loss": 1.6377, + "step": 400 + }, + { + "epoch": 0.07, + "grad_norm": 0.7974880337715149, + "learning_rate": 4.997419190902307e-05, + "loss": 1.8962, + "step": 405 + }, + { + "epoch": 0.07, + "grad_norm": 0.4197213649749756, + "learning_rate": 4.99735492552004e-05, + "loss": 2.0266, + "step": 410 + }, + { + "epoch": 0.07, + "grad_norm": 0.5465844869613647, + "learning_rate": 4.997289870226812e-05, + "loss": 1.9882, + "step": 415 + }, + { + "epoch": 0.08, + "grad_norm": 0.7714999914169312, + "learning_rate": 4.9972240250432e-05, + "loss": 1.7723, + "step": 420 + }, + { + "epoch": 0.08, + "grad_norm": 0.4609231650829315, + "learning_rate": 4.9971573899900306e-05, + "loss": 2.1264, + "step": 425 + }, + { + "epoch": 0.08, + "grad_norm": 2.4901442527770996, + "learning_rate": 4.997089965088381e-05, + "loss": 1.8215, + "step": 430 + }, + { + "epoch": 0.08, + "grad_norm": 1.3669675588607788, + "learning_rate": 4.997021750359577e-05, + "loss": 1.9637, + "step": 435 + }, + { + "epoch": 0.08, + "grad_norm": 0.6169967651367188, + "learning_rate": 4.9969527458251964e-05, + "loss": 2.2501, + "step": 440 + }, + { + "epoch": 0.08, + "grad_norm": 0.862839937210083, + "learning_rate": 4.9968829515070626e-05, + "loss": 2.0921, + "step": 445 + }, + { + "epoch": 0.08, + "grad_norm": 0.5367420315742493, + "learning_rate": 4.996812367427254e-05, + "loss": 1.9298, + "step": 450 + }, + { + "epoch": 0.08, + "grad_norm": 0.4946227967739105, + "learning_rate": 4.996740993608094e-05, + "loss": 2.0265, + "step": 455 + }, + { + "epoch": 0.08, + "grad_norm": 0.68914395570755, + "learning_rate": 4.9966688300721603e-05, + "loss": 1.9182, + "step": 460 + }, + { + "epoch": 0.08, + "grad_norm": 0.44019490480422974, + "learning_rate": 4.9965958768422775e-05, + "loss": 2.1147, + "step": 465 + }, + { + "epoch": 0.08, + "grad_norm": 0.7905745506286621, + "learning_rate": 4.9965221339415194e-05, + "loss": 1.909, + "step": 470 + }, + { + "epoch": 0.09, + "grad_norm": 0.5773944854736328, + "learning_rate": 4.996447601393211e-05, + "loss": 1.8616, + "step": 475 + }, + { + "epoch": 0.09, + "grad_norm": 2.7853548526763916, + "learning_rate": 4.9963722792209286e-05, + "loss": 2.2229, + "step": 480 + }, + { + "epoch": 0.09, + "grad_norm": 0.6674456596374512, + "learning_rate": 4.996296167448495e-05, + "loss": 1.9064, + "step": 485 + }, + { + "epoch": 0.09, + "grad_norm": 2.360712766647339, + "learning_rate": 4.9962192660999866e-05, + "loss": 1.932, + "step": 490 + }, + { + "epoch": 0.09, + "grad_norm": 0.3946886360645294, + "learning_rate": 4.996141575199723e-05, + "loss": 2.112, + "step": 495 + }, + { + "epoch": 0.09, + "grad_norm": 1.100876808166504, + "learning_rate": 4.996063094772281e-05, + "loss": 2.0544, + "step": 500 + }, + { + "epoch": 0.09, + "grad_norm": 1.3931337594985962, + "learning_rate": 4.9959838248424836e-05, + "loss": 1.7863, + "step": 505 + }, + { + "epoch": 0.09, + "grad_norm": 0.4513574540615082, + "learning_rate": 4.995903765435402e-05, + "loss": 2.1613, + "step": 510 + }, + { + "epoch": 0.09, + "grad_norm": 0.6865633130073547, + "learning_rate": 4.9958229165763614e-05, + "loss": 2.0138, + "step": 515 + }, + { + "epoch": 0.09, + "grad_norm": 0.6642078757286072, + "learning_rate": 4.9957412782909324e-05, + "loss": 1.8269, + "step": 520 + }, + { + "epoch": 0.09, + "grad_norm": 1.0124841928482056, + "learning_rate": 4.9956588506049374e-05, + "loss": 2.0759, + "step": 525 + }, + { + "epoch": 0.09, + "grad_norm": 0.8257017731666565, + "learning_rate": 4.9955756335444495e-05, + "loss": 1.7972, + "step": 530 + }, + { + "epoch": 0.1, + "grad_norm": 0.7027864456176758, + "learning_rate": 4.9954916271357876e-05, + "loss": 2.1897, + "step": 535 + }, + { + "epoch": 0.1, + "grad_norm": 0.6689014434814453, + "learning_rate": 4.9954068314055255e-05, + "loss": 2.0164, + "step": 540 + }, + { + "epoch": 0.1, + "grad_norm": 0.6984819769859314, + "learning_rate": 4.9953212463804824e-05, + "loss": 1.8525, + "step": 545 + }, + { + "epoch": 0.1, + "grad_norm": 0.8252110481262207, + "learning_rate": 4.9952348720877294e-05, + "loss": 1.906, + "step": 550 + }, + { + "epoch": 0.1, + "grad_norm": 0.7641300559043884, + "learning_rate": 4.995147708554587e-05, + "loss": 2.179, + "step": 555 + }, + { + "epoch": 0.1, + "grad_norm": 0.5277909636497498, + "learning_rate": 4.9950597558086246e-05, + "loss": 2.078, + "step": 560 + }, + { + "epoch": 0.1, + "grad_norm": 1.3785237073898315, + "learning_rate": 4.99497101387766e-05, + "loss": 2.1335, + "step": 565 + }, + { + "epoch": 0.1, + "grad_norm": 0.9279726147651672, + "learning_rate": 4.994881482789764e-05, + "loss": 1.823, + "step": 570 + }, + { + "epoch": 0.1, + "grad_norm": 0.6728343963623047, + "learning_rate": 4.994791162573256e-05, + "loss": 2.1131, + "step": 575 + }, + { + "epoch": 0.1, + "grad_norm": 0.7369157075881958, + "learning_rate": 4.9947000532567014e-05, + "loss": 1.7872, + "step": 580 + }, + { + "epoch": 0.1, + "grad_norm": 1.544488549232483, + "learning_rate": 4.994608154868921e-05, + "loss": 2.2275, + "step": 585 + }, + { + "epoch": 0.11, + "grad_norm": 1.0794744491577148, + "learning_rate": 4.9945154674389796e-05, + "loss": 2.0575, + "step": 590 + }, + { + "epoch": 0.11, + "grad_norm": 0.5891600847244263, + "learning_rate": 4.994421990996195e-05, + "loss": 2.0061, + "step": 595 + }, + { + "epoch": 0.11, + "grad_norm": 0.5102590322494507, + "learning_rate": 4.994327725570135e-05, + "loss": 1.8905, + "step": 600 + }, + { + "epoch": 0.11, + "grad_norm": 0.9400714635848999, + "learning_rate": 4.994232671190614e-05, + "loss": 1.9824, + "step": 605 + }, + { + "epoch": 0.11, + "grad_norm": 0.7190917730331421, + "learning_rate": 4.994136827887699e-05, + "loss": 1.9437, + "step": 610 + }, + { + "epoch": 0.11, + "grad_norm": 0.5271459221839905, + "learning_rate": 4.994040195691703e-05, + "loss": 1.9421, + "step": 615 + }, + { + "epoch": 0.11, + "grad_norm": 1.935598611831665, + "learning_rate": 4.9939427746331937e-05, + "loss": 2.1908, + "step": 620 + }, + { + "epoch": 0.11, + "grad_norm": 14.462722778320312, + "learning_rate": 4.993844564742982e-05, + "loss": 2.0414, + "step": 625 + }, + { + "epoch": 0.11, + "grad_norm": 2.596210479736328, + "learning_rate": 4.9937455660521345e-05, + "loss": 1.966, + "step": 630 + }, + { + "epoch": 0.11, + "grad_norm": 0.547702431678772, + "learning_rate": 4.993645778591963e-05, + "loss": 1.8967, + "step": 635 + }, + { + "epoch": 0.11, + "grad_norm": 2.006232976913452, + "learning_rate": 4.99354520239403e-05, + "loss": 1.8511, + "step": 640 + }, + { + "epoch": 0.12, + "grad_norm": 1.9103983640670776, + "learning_rate": 4.993443837490148e-05, + "loss": 1.7829, + "step": 645 + }, + { + "epoch": 0.12, + "grad_norm": 0.8740873336791992, + "learning_rate": 4.993341683912378e-05, + "loss": 2.0906, + "step": 650 + }, + { + "epoch": 0.12, + "grad_norm": 0.9075211882591248, + "learning_rate": 4.9932387416930326e-05, + "loss": 2.1868, + "step": 655 + }, + { + "epoch": 0.12, + "grad_norm": 0.5851132273674011, + "learning_rate": 4.993135010864671e-05, + "loss": 1.8202, + "step": 660 + }, + { + "epoch": 0.12, + "grad_norm": 0.7303404808044434, + "learning_rate": 4.993030491460104e-05, + "loss": 1.8032, + "step": 665 + }, + { + "epoch": 0.12, + "grad_norm": 0.5029420256614685, + "learning_rate": 4.9929251835123904e-05, + "loss": 1.8013, + "step": 670 + }, + { + "epoch": 0.12, + "grad_norm": 0.8666778802871704, + "learning_rate": 4.9928190870548384e-05, + "loss": 2.1201, + "step": 675 + }, + { + "epoch": 0.12, + "grad_norm": 1.3739973306655884, + "learning_rate": 4.9927122021210076e-05, + "loss": 1.8149, + "step": 680 + }, + { + "epoch": 0.12, + "grad_norm": 0.6640814542770386, + "learning_rate": 4.992604528744705e-05, + "loss": 1.9143, + "step": 685 + }, + { + "epoch": 0.12, + "grad_norm": 0.9875686764717102, + "learning_rate": 4.992496066959987e-05, + "loss": 1.7527, + "step": 690 + }, + { + "epoch": 0.12, + "grad_norm": 0.5057778358459473, + "learning_rate": 4.992386816801161e-05, + "loss": 1.9771, + "step": 695 + }, + { + "epoch": 0.13, + "grad_norm": 0.7259483337402344, + "learning_rate": 4.992276778302782e-05, + "loss": 1.891, + "step": 700 + }, + { + "epoch": 0.13, + "grad_norm": 0.7736199498176575, + "learning_rate": 4.9921659514996555e-05, + "loss": 1.8235, + "step": 705 + }, + { + "epoch": 0.13, + "grad_norm": 0.6016276478767395, + "learning_rate": 4.992054336426836e-05, + "loss": 1.868, + "step": 710 + }, + { + "epoch": 0.13, + "grad_norm": 0.6753050088882446, + "learning_rate": 4.991941933119626e-05, + "loss": 1.8746, + "step": 715 + }, + { + "epoch": 0.13, + "grad_norm": 0.6114783883094788, + "learning_rate": 4.99182874161358e-05, + "loss": 1.9586, + "step": 720 + }, + { + "epoch": 0.13, + "grad_norm": 1.7114779949188232, + "learning_rate": 4.9917147619445005e-05, + "loss": 2.3431, + "step": 725 + }, + { + "epoch": 0.13, + "grad_norm": 0.6027891635894775, + "learning_rate": 4.991599994148438e-05, + "loss": 2.1642, + "step": 730 + }, + { + "epoch": 0.13, + "grad_norm": 0.7011446952819824, + "learning_rate": 4.991484438261693e-05, + "loss": 1.9181, + "step": 735 + }, + { + "epoch": 0.13, + "grad_norm": 0.5473892092704773, + "learning_rate": 4.991368094320818e-05, + "loss": 2.1816, + "step": 740 + }, + { + "epoch": 0.13, + "grad_norm": 0.4778834879398346, + "learning_rate": 4.991250962362611e-05, + "loss": 1.7557, + "step": 745 + }, + { + "epoch": 0.13, + "grad_norm": 0.6992874145507812, + "learning_rate": 4.99113304242412e-05, + "loss": 2.0037, + "step": 750 + }, + { + "epoch": 0.14, + "grad_norm": 1.2139089107513428, + "learning_rate": 4.9910143345426446e-05, + "loss": 1.9781, + "step": 755 + }, + { + "epoch": 0.14, + "grad_norm": 0.7376035451889038, + "learning_rate": 4.9908948387557306e-05, + "loss": 1.9145, + "step": 760 + }, + { + "epoch": 0.14, + "grad_norm": 0.9813640117645264, + "learning_rate": 4.9907745551011764e-05, + "loss": 1.9012, + "step": 765 + }, + { + "epoch": 0.14, + "grad_norm": 0.6064789295196533, + "learning_rate": 4.990653483617025e-05, + "loss": 2.0977, + "step": 770 + }, + { + "epoch": 0.14, + "grad_norm": 0.8322803378105164, + "learning_rate": 4.990531624341573e-05, + "loss": 2.0924, + "step": 775 + }, + { + "epoch": 0.14, + "grad_norm": 0.39520594477653503, + "learning_rate": 4.990408977313363e-05, + "loss": 2.0109, + "step": 780 + }, + { + "epoch": 0.14, + "grad_norm": 0.7123391628265381, + "learning_rate": 4.9902855425711905e-05, + "loss": 2.1073, + "step": 785 + }, + { + "epoch": 0.14, + "grad_norm": 0.617445170879364, + "learning_rate": 4.9901613201540945e-05, + "loss": 2.081, + "step": 790 + }, + { + "epoch": 0.14, + "grad_norm": 0.2326832264661789, + "learning_rate": 4.990036310101369e-05, + "loss": 2.0825, + "step": 795 + }, + { + "epoch": 0.14, + "grad_norm": 0.828574001789093, + "learning_rate": 4.989910512452554e-05, + "loss": 1.9948, + "step": 800 + }, + { + "epoch": 0.14, + "grad_norm": 0.5718833804130554, + "learning_rate": 4.989783927247439e-05, + "loss": 1.9836, + "step": 805 + }, + { + "epoch": 0.14, + "grad_norm": 0.5843701958656311, + "learning_rate": 4.989656554526062e-05, + "loss": 1.8482, + "step": 810 + }, + { + "epoch": 0.15, + "grad_norm": 0.36369892954826355, + "learning_rate": 4.989528394328712e-05, + "loss": 1.9934, + "step": 815 + }, + { + "epoch": 0.15, + "grad_norm": 0.5329487323760986, + "learning_rate": 4.9893994466959246e-05, + "loss": 1.7943, + "step": 820 + }, + { + "epoch": 0.15, + "grad_norm": 0.9195080399513245, + "learning_rate": 4.989269711668487e-05, + "loss": 1.8812, + "step": 825 + }, + { + "epoch": 0.15, + "grad_norm": 0.639167845249176, + "learning_rate": 4.9891391892874354e-05, + "loss": 2.0915, + "step": 830 + }, + { + "epoch": 0.15, + "grad_norm": 0.9392881393432617, + "learning_rate": 4.989007879594051e-05, + "loss": 1.9084, + "step": 835 + }, + { + "epoch": 0.15, + "grad_norm": 3.4047281742095947, + "learning_rate": 4.988875782629869e-05, + "loss": 2.1699, + "step": 840 + }, + { + "epoch": 0.15, + "grad_norm": 1.1853729486465454, + "learning_rate": 4.9887428984366705e-05, + "loss": 2.3892, + "step": 845 + }, + { + "epoch": 0.15, + "grad_norm": 4.790334701538086, + "learning_rate": 4.9886092270564876e-05, + "loss": 1.9039, + "step": 850 + }, + { + "epoch": 0.15, + "grad_norm": 0.5937879681587219, + "learning_rate": 4.9884747685315996e-05, + "loss": 2.0544, + "step": 855 + }, + { + "epoch": 0.15, + "grad_norm": 0.8132315278053284, + "learning_rate": 4.988339522904536e-05, + "loss": 1.9541, + "step": 860 + }, + { + "epoch": 0.15, + "grad_norm": 0.7449702620506287, + "learning_rate": 4.988203490218075e-05, + "loss": 1.9042, + "step": 865 + }, + { + "epoch": 0.16, + "grad_norm": 0.5091820359230042, + "learning_rate": 4.9880666705152434e-05, + "loss": 1.9421, + "step": 870 + }, + { + "epoch": 0.16, + "grad_norm": 0.45351913571357727, + "learning_rate": 4.987929063839317e-05, + "loss": 1.8925, + "step": 875 + }, + { + "epoch": 0.16, + "grad_norm": 0.5173969268798828, + "learning_rate": 4.987790670233821e-05, + "loss": 1.9209, + "step": 880 + }, + { + "epoch": 0.16, + "grad_norm": 0.5571516156196594, + "learning_rate": 4.987651489742528e-05, + "loss": 1.9871, + "step": 885 + }, + { + "epoch": 0.16, + "grad_norm": 0.9875330328941345, + "learning_rate": 4.987511522409463e-05, + "loss": 1.7102, + "step": 890 + }, + { + "epoch": 0.16, + "grad_norm": 0.513702392578125, + "learning_rate": 4.987370768278895e-05, + "loss": 2.0374, + "step": 895 + }, + { + "epoch": 0.16, + "grad_norm": 0.7539028525352478, + "learning_rate": 4.987229227395346e-05, + "loss": 1.9778, + "step": 900 + }, + { + "epoch": 0.16, + "grad_norm": 0.5421642661094666, + "learning_rate": 4.9870868998035845e-05, + "loss": 1.7694, + "step": 905 + }, + { + "epoch": 0.16, + "grad_norm": 0.8099106550216675, + "learning_rate": 4.986943785548629e-05, + "loss": 1.7683, + "step": 910 + }, + { + "epoch": 0.16, + "grad_norm": 0.797715425491333, + "learning_rate": 4.986799884675747e-05, + "loss": 1.8887, + "step": 915 + }, + { + "epoch": 0.16, + "grad_norm": 0.6067102551460266, + "learning_rate": 4.9866551972304526e-05, + "loss": 1.9329, + "step": 920 + }, + { + "epoch": 0.17, + "grad_norm": 0.49789664149284363, + "learning_rate": 4.986509723258511e-05, + "loss": 2.0683, + "step": 925 + }, + { + "epoch": 0.17, + "grad_norm": 0.9681997299194336, + "learning_rate": 4.9863634628059366e-05, + "loss": 1.9902, + "step": 930 + }, + { + "epoch": 0.17, + "grad_norm": 0.5609699487686157, + "learning_rate": 4.98621641591899e-05, + "loss": 1.8425, + "step": 935 + }, + { + "epoch": 0.17, + "grad_norm": 0.5346946120262146, + "learning_rate": 4.986068582644182e-05, + "loss": 2.0863, + "step": 940 + }, + { + "epoch": 0.17, + "grad_norm": 0.893808126449585, + "learning_rate": 4.985919963028274e-05, + "loss": 2.0174, + "step": 945 + }, + { + "epoch": 0.17, + "grad_norm": 0.4146747887134552, + "learning_rate": 4.985770557118272e-05, + "loss": 1.9704, + "step": 950 + }, + { + "epoch": 0.17, + "grad_norm": 1.030465006828308, + "learning_rate": 4.985620364961433e-05, + "loss": 2.1478, + "step": 955 + }, + { + "epoch": 0.17, + "grad_norm": 0.6539037823677063, + "learning_rate": 4.985469386605265e-05, + "loss": 2.1638, + "step": 960 + }, + { + "epoch": 0.17, + "grad_norm": 0.698945164680481, + "learning_rate": 4.9853176220975195e-05, + "loss": 1.93, + "step": 965 + }, + { + "epoch": 0.17, + "grad_norm": 0.6615301370620728, + "learning_rate": 4.9851650714862006e-05, + "loss": 2.0131, + "step": 970 + }, + { + "epoch": 0.17, + "grad_norm": 0.6794114112854004, + "learning_rate": 4.9850117348195606e-05, + "loss": 1.9938, + "step": 975 + }, + { + "epoch": 0.18, + "grad_norm": 0.4842730164527893, + "learning_rate": 4.984857612146099e-05, + "loss": 1.9502, + "step": 980 + }, + { + "epoch": 0.18, + "grad_norm": 1.0093274116516113, + "learning_rate": 4.984702703514565e-05, + "loss": 1.6196, + "step": 985 + }, + { + "epoch": 0.18, + "grad_norm": 0.6907001733779907, + "learning_rate": 4.984547008973956e-05, + "loss": 1.951, + "step": 990 + }, + { + "epoch": 0.18, + "grad_norm": 0.6231794357299805, + "learning_rate": 4.984390528573517e-05, + "loss": 2.0397, + "step": 995 + }, + { + "epoch": 0.18, + "grad_norm": 0.7842473387718201, + "learning_rate": 4.984233262362745e-05, + "loss": 2.0751, + "step": 1000 + }, + { + "epoch": 0.18, + "grad_norm": 1.2024447917938232, + "learning_rate": 4.98407521039138e-05, + "loss": 1.5978, + "step": 1005 + }, + { + "epoch": 0.18, + "grad_norm": 0.8643293976783752, + "learning_rate": 4.983916372709416e-05, + "loss": 2.2393, + "step": 1010 + }, + { + "epoch": 0.18, + "grad_norm": 0.6391337513923645, + "learning_rate": 4.9837567493670936e-05, + "loss": 1.9106, + "step": 1015 + }, + { + "epoch": 0.18, + "grad_norm": 0.7475038766860962, + "learning_rate": 4.983596340414899e-05, + "loss": 2.0462, + "step": 1020 + }, + { + "epoch": 0.18, + "grad_norm": 1.4926930665969849, + "learning_rate": 4.9834351459035714e-05, + "loss": 1.7617, + "step": 1025 + }, + { + "epoch": 0.18, + "grad_norm": 0.6800340414047241, + "learning_rate": 4.9832731658840956e-05, + "loss": 1.7933, + "step": 1030 + }, + { + "epoch": 0.19, + "grad_norm": 0.5612903237342834, + "learning_rate": 4.983110400407707e-05, + "loss": 1.9801, + "step": 1035 + }, + { + "epoch": 0.19, + "grad_norm": 0.5487276315689087, + "learning_rate": 4.982946849525886e-05, + "loss": 1.8137, + "step": 1040 + }, + { + "epoch": 0.19, + "grad_norm": 1.1689997911453247, + "learning_rate": 4.982782513290365e-05, + "loss": 2.0348, + "step": 1045 + }, + { + "epoch": 0.19, + "grad_norm": 0.5050978660583496, + "learning_rate": 4.9826173917531235e-05, + "loss": 2.1974, + "step": 1050 + }, + { + "epoch": 0.19, + "grad_norm": 0.6541104912757874, + "learning_rate": 4.982451484966389e-05, + "loss": 1.824, + "step": 1055 + }, + { + "epoch": 0.19, + "grad_norm": 0.4202798306941986, + "learning_rate": 4.9822847929826374e-05, + "loss": 1.9036, + "step": 1060 + }, + { + "epoch": 0.19, + "grad_norm": 0.6690078377723694, + "learning_rate": 4.9821173158545936e-05, + "loss": 1.8924, + "step": 1065 + }, + { + "epoch": 0.19, + "grad_norm": 0.41221898794174194, + "learning_rate": 4.98194905363523e-05, + "loss": 2.2559, + "step": 1070 + }, + { + "epoch": 0.19, + "grad_norm": 1.146337628364563, + "learning_rate": 4.981780006377769e-05, + "loss": 1.7768, + "step": 1075 + }, + { + "epoch": 0.19, + "grad_norm": 0.376248836517334, + "learning_rate": 4.981610174135678e-05, + "loss": 1.8807, + "step": 1080 + }, + { + "epoch": 0.19, + "grad_norm": 0.8797820806503296, + "learning_rate": 4.981439556962676e-05, + "loss": 1.5606, + "step": 1085 + }, + { + "epoch": 0.2, + "grad_norm": 0.9308553338050842, + "learning_rate": 4.981268154912729e-05, + "loss": 1.7338, + "step": 1090 + }, + { + "epoch": 0.2, + "grad_norm": 0.8552083969116211, + "learning_rate": 4.981095968040053e-05, + "loss": 1.7534, + "step": 1095 + }, + { + "epoch": 0.2, + "grad_norm": 0.9000282287597656, + "learning_rate": 4.9809229963991064e-05, + "loss": 1.7684, + "step": 1100 + }, + { + "epoch": 0.2, + "grad_norm": 1.6335359811782837, + "learning_rate": 4.980749240044604e-05, + "loss": 1.9627, + "step": 1105 + }, + { + "epoch": 0.2, + "grad_norm": 0.8010488152503967, + "learning_rate": 4.9805746990315024e-05, + "loss": 2.0931, + "step": 1110 + }, + { + "epoch": 0.2, + "grad_norm": 0.5501634478569031, + "learning_rate": 4.980399373415009e-05, + "loss": 2.1531, + "step": 1115 + }, + { + "epoch": 0.2, + "grad_norm": 0.876747190952301, + "learning_rate": 4.980223263250581e-05, + "loss": 1.755, + "step": 1120 + }, + { + "epoch": 0.2, + "grad_norm": 0.8235836625099182, + "learning_rate": 4.98004636859392e-05, + "loss": 2.2103, + "step": 1125 + }, + { + "epoch": 0.2, + "grad_norm": 0.3459777534008026, + "learning_rate": 4.979868689500978e-05, + "loss": 1.8597, + "step": 1130 + }, + { + "epoch": 0.2, + "grad_norm": 0.6202408075332642, + "learning_rate": 4.9796902260279546e-05, + "loss": 1.8657, + "step": 1135 + }, + { + "epoch": 0.2, + "grad_norm": 0.912277340888977, + "learning_rate": 4.9795109782312974e-05, + "loss": 1.7297, + "step": 1140 + }, + { + "epoch": 0.2, + "grad_norm": 0.5778829455375671, + "learning_rate": 4.979330946167704e-05, + "loss": 1.939, + "step": 1145 + }, + { + "epoch": 0.21, + "grad_norm": 0.8134537935256958, + "learning_rate": 4.9791501298941165e-05, + "loss": 1.9645, + "step": 1150 + }, + { + "epoch": 0.21, + "grad_norm": 0.8636032938957214, + "learning_rate": 4.978968529467728e-05, + "loss": 1.6756, + "step": 1155 + }, + { + "epoch": 0.21, + "grad_norm": 0.87062668800354, + "learning_rate": 4.978786144945977e-05, + "loss": 1.7352, + "step": 1160 + }, + { + "epoch": 0.21, + "grad_norm": 0.9337445497512817, + "learning_rate": 4.978602976386554e-05, + "loss": 1.6599, + "step": 1165 + }, + { + "epoch": 0.21, + "grad_norm": 0.8104198575019836, + "learning_rate": 4.978419023847393e-05, + "loss": 1.7557, + "step": 1170 + }, + { + "epoch": 0.21, + "grad_norm": 0.3826814591884613, + "learning_rate": 4.978234287386678e-05, + "loss": 2.0881, + "step": 1175 + }, + { + "epoch": 0.21, + "grad_norm": 0.47126176953315735, + "learning_rate": 4.978048767062843e-05, + "loss": 2.2003, + "step": 1180 + }, + { + "epoch": 0.21, + "grad_norm": 0.4971998333930969, + "learning_rate": 4.977862462934566e-05, + "loss": 1.8297, + "step": 1185 + }, + { + "epoch": 0.21, + "grad_norm": 0.5127992033958435, + "learning_rate": 4.977675375060775e-05, + "loss": 1.8892, + "step": 1190 + }, + { + "epoch": 0.21, + "grad_norm": 0.7723473310470581, + "learning_rate": 4.9774875035006464e-05, + "loss": 1.9711, + "step": 1195 + }, + { + "epoch": 0.21, + "grad_norm": 0.5480608344078064, + "learning_rate": 4.977298848313604e-05, + "loss": 1.7975, + "step": 1200 + }, + { + "epoch": 0.22, + "grad_norm": 0.4244174659252167, + "learning_rate": 4.9771094095593176e-05, + "loss": 2.1068, + "step": 1205 + }, + { + "epoch": 0.22, + "grad_norm": 0.5436844825744629, + "learning_rate": 4.9769191872977085e-05, + "loss": 2.09, + "step": 1210 + }, + { + "epoch": 0.22, + "grad_norm": 0.4076632261276245, + "learning_rate": 4.9767281815889425e-05, + "loss": 2.038, + "step": 1215 + }, + { + "epoch": 0.22, + "grad_norm": 0.5819802284240723, + "learning_rate": 4.976536392493435e-05, + "loss": 1.8444, + "step": 1220 + }, + { + "epoch": 0.22, + "grad_norm": 0.7633818984031677, + "learning_rate": 4.976343820071849e-05, + "loss": 2.0922, + "step": 1225 + }, + { + "epoch": 0.22, + "grad_norm": 0.6349738836288452, + "learning_rate": 4.9761504643850945e-05, + "loss": 1.9147, + "step": 1230 + }, + { + "epoch": 0.22, + "grad_norm": 0.5177556276321411, + "learning_rate": 4.9759563254943306e-05, + "loss": 1.7667, + "step": 1235 + }, + { + "epoch": 0.22, + "grad_norm": 1.4259908199310303, + "learning_rate": 4.9757614034609615e-05, + "loss": 1.9067, + "step": 1240 + }, + { + "epoch": 0.22, + "grad_norm": 0.8151448369026184, + "learning_rate": 4.975565698346642e-05, + "loss": 2.0555, + "step": 1245 + }, + { + "epoch": 0.22, + "grad_norm": 0.9140625, + "learning_rate": 4.9753692102132735e-05, + "loss": 1.9399, + "step": 1250 + }, + { + "epoch": 0.22, + "grad_norm": 0.86171954870224, + "learning_rate": 4.975171939123005e-05, + "loss": 2.203, + "step": 1255 + }, + { + "epoch": 0.23, + "grad_norm": 0.751837432384491, + "learning_rate": 4.9749738851382326e-05, + "loss": 1.7247, + "step": 1260 + }, + { + "epoch": 0.23, + "grad_norm": 3.416966438293457, + "learning_rate": 4.9747750483216015e-05, + "loss": 2.1371, + "step": 1265 + }, + { + "epoch": 0.23, + "grad_norm": 0.8290925621986389, + "learning_rate": 4.974575428736002e-05, + "loss": 1.9527, + "step": 1270 + }, + { + "epoch": 0.23, + "grad_norm": 0.5154220461845398, + "learning_rate": 4.974375026444575e-05, + "loss": 2.2402, + "step": 1275 + }, + { + "epoch": 0.23, + "grad_norm": 1.5878034830093384, + "learning_rate": 4.974173841510708e-05, + "loss": 2.008, + "step": 1280 + }, + { + "epoch": 0.23, + "grad_norm": 0.9623072743415833, + "learning_rate": 4.9739718739980346e-05, + "loss": 1.734, + "step": 1285 + }, + { + "epoch": 0.23, + "grad_norm": 0.6699478030204773, + "learning_rate": 4.9737691239704366e-05, + "loss": 1.9048, + "step": 1290 + }, + { + "epoch": 0.23, + "grad_norm": 0.6063868403434753, + "learning_rate": 4.9735655914920445e-05, + "loss": 2.3764, + "step": 1295 + }, + { + "epoch": 0.23, + "grad_norm": 0.598061740398407, + "learning_rate": 4.973361276627235e-05, + "loss": 1.901, + "step": 1300 + }, + { + "epoch": 0.23, + "grad_norm": 0.7591137886047363, + "learning_rate": 4.9731561794406326e-05, + "loss": 1.8706, + "step": 1305 + }, + { + "epoch": 0.23, + "grad_norm": 0.5705850124359131, + "learning_rate": 4.9729502999971086e-05, + "loss": 2.0533, + "step": 1310 + }, + { + "epoch": 0.24, + "grad_norm": 1.9160748720169067, + "learning_rate": 4.9727436383617853e-05, + "loss": 1.8877, + "step": 1315 + }, + { + "epoch": 0.24, + "grad_norm": 0.8184042572975159, + "learning_rate": 4.9725361946000267e-05, + "loss": 1.8283, + "step": 1320 + }, + { + "epoch": 0.24, + "grad_norm": 0.9235120415687561, + "learning_rate": 4.9723279687774485e-05, + "loss": 2.1376, + "step": 1325 + }, + { + "epoch": 0.24, + "grad_norm": 0.4321657121181488, + "learning_rate": 4.9721189609599106e-05, + "loss": 1.9237, + "step": 1330 + }, + { + "epoch": 0.24, + "grad_norm": 0.47391876578330994, + "learning_rate": 4.9719091712135244e-05, + "loss": 1.9186, + "step": 1335 + }, + { + "epoch": 0.24, + "grad_norm": 2.293057441711426, + "learning_rate": 4.9716985996046443e-05, + "loss": 1.8054, + "step": 1340 + }, + { + "epoch": 0.24, + "grad_norm": 0.6522160768508911, + "learning_rate": 4.971487246199875e-05, + "loss": 1.9026, + "step": 1345 + }, + { + "epoch": 0.24, + "grad_norm": 0.6272959113121033, + "learning_rate": 4.971275111066067e-05, + "loss": 2.1646, + "step": 1350 + }, + { + "epoch": 0.24, + "grad_norm": 0.32946717739105225, + "learning_rate": 4.9710621942703186e-05, + "loss": 1.8629, + "step": 1355 + }, + { + "epoch": 0.24, + "grad_norm": 0.5091928839683533, + "learning_rate": 4.970848495879975e-05, + "loss": 2.0348, + "step": 1360 + }, + { + "epoch": 0.24, + "grad_norm": 2.6853437423706055, + "learning_rate": 4.9706340159626284e-05, + "loss": 1.6108, + "step": 1365 + }, + { + "epoch": 0.25, + "grad_norm": 0.7039760947227478, + "learning_rate": 4.970418754586119e-05, + "loss": 2.1027, + "step": 1370 + }, + { + "epoch": 0.25, + "grad_norm": 0.4680666923522949, + "learning_rate": 4.970202711818535e-05, + "loss": 2.0027, + "step": 1375 + }, + { + "epoch": 0.25, + "grad_norm": 0.49407467246055603, + "learning_rate": 4.969985887728208e-05, + "loss": 2.144, + "step": 1380 + }, + { + "epoch": 0.25, + "grad_norm": 0.6088310480117798, + "learning_rate": 4.96976828238372e-05, + "loss": 2.2039, + "step": 1385 + }, + { + "epoch": 0.25, + "grad_norm": 0.39288827776908875, + "learning_rate": 4.9695498958539014e-05, + "loss": 1.9803, + "step": 1390 + }, + { + "epoch": 0.25, + "grad_norm": 0.5070251226425171, + "learning_rate": 4.969330728207825e-05, + "loss": 2.078, + "step": 1395 + }, + { + "epoch": 0.25, + "grad_norm": 0.4578852653503418, + "learning_rate": 4.969110779514815e-05, + "loss": 1.7425, + "step": 1400 + }, + { + "epoch": 0.25, + "grad_norm": 0.8231094479560852, + "learning_rate": 4.96889004984444e-05, + "loss": 2.1139, + "step": 1405 + }, + { + "epoch": 0.25, + "grad_norm": 0.5976428985595703, + "learning_rate": 4.9686685392665174e-05, + "loss": 2.0963, + "step": 1410 + }, + { + "epoch": 0.25, + "grad_norm": 0.8335079550743103, + "learning_rate": 4.968446247851111e-05, + "loss": 1.9815, + "step": 1415 + }, + { + "epoch": 0.25, + "grad_norm": 0.8027464747428894, + "learning_rate": 4.96822317566853e-05, + "loss": 1.878, + "step": 1420 + }, + { + "epoch": 0.26, + "grad_norm": 1.3042505979537964, + "learning_rate": 4.9679993227893326e-05, + "loss": 2.0152, + "step": 1425 + }, + { + "epoch": 0.26, + "grad_norm": 0.6054438352584839, + "learning_rate": 4.967774689284323e-05, + "loss": 2.0464, + "step": 1430 + }, + { + "epoch": 0.26, + "grad_norm": 0.845056414604187, + "learning_rate": 4.9675492752245536e-05, + "loss": 2.0495, + "step": 1435 + }, + { + "epoch": 0.26, + "grad_norm": 0.930960476398468, + "learning_rate": 4.967323080681322e-05, + "loss": 2.0878, + "step": 1440 + }, + { + "epoch": 0.26, + "grad_norm": 3.2143394947052, + "learning_rate": 4.967096105726173e-05, + "loss": 2.1571, + "step": 1445 + }, + { + "epoch": 0.26, + "grad_norm": 0.4864540696144104, + "learning_rate": 4.966868350430899e-05, + "loss": 1.9011, + "step": 1450 + }, + { + "epoch": 0.26, + "grad_norm": 0.6188135147094727, + "learning_rate": 4.9666398148675374e-05, + "loss": 2.0124, + "step": 1455 + }, + { + "epoch": 0.26, + "grad_norm": 1.174637794494629, + "learning_rate": 4.9664104991083757e-05, + "loss": 1.8437, + "step": 1460 + }, + { + "epoch": 0.26, + "grad_norm": 0.6095223426818848, + "learning_rate": 4.9661804032259464e-05, + "loss": 1.8675, + "step": 1465 + }, + { + "epoch": 0.26, + "grad_norm": 0.4776439964771271, + "learning_rate": 4.9659495272930266e-05, + "loss": 2.1693, + "step": 1470 + }, + { + "epoch": 0.26, + "grad_norm": 0.6089404225349426, + "learning_rate": 4.9657178713826434e-05, + "loss": 2.2899, + "step": 1475 + }, + { + "epoch": 0.26, + "grad_norm": 1.6431376934051514, + "learning_rate": 4.96548543556807e-05, + "loss": 1.9234, + "step": 1480 + }, + { + "epoch": 0.27, + "grad_norm": 0.5474340319633484, + "learning_rate": 4.965252219922825e-05, + "loss": 2.0113, + "step": 1485 + }, + { + "epoch": 0.27, + "grad_norm": 0.44193366169929504, + "learning_rate": 4.9650182245206726e-05, + "loss": 1.8976, + "step": 1490 + }, + { + "epoch": 0.27, + "grad_norm": 0.8061282634735107, + "learning_rate": 4.9647834494356274e-05, + "loss": 2.0584, + "step": 1495 + }, + { + "epoch": 0.27, + "grad_norm": 0.5918030738830566, + "learning_rate": 4.964547894741949e-05, + "loss": 2.2051, + "step": 1500 + }, + { + "epoch": 0.27, + "grad_norm": 0.7036905288696289, + "learning_rate": 4.964311560514141e-05, + "loss": 2.1186, + "step": 1505 + }, + { + "epoch": 0.27, + "grad_norm": 0.5520045757293701, + "learning_rate": 4.964074446826957e-05, + "loss": 2.0825, + "step": 1510 + }, + { + "epoch": 0.27, + "grad_norm": 0.34511449933052063, + "learning_rate": 4.9638365537553956e-05, + "loss": 1.9604, + "step": 1515 + }, + { + "epoch": 0.27, + "grad_norm": 1.0205286741256714, + "learning_rate": 4.963597881374702e-05, + "loss": 1.5741, + "step": 1520 + }, + { + "epoch": 0.27, + "grad_norm": 1.0340920686721802, + "learning_rate": 4.9633584297603686e-05, + "loss": 2.076, + "step": 1525 + }, + { + "epoch": 0.27, + "grad_norm": 0.757640540599823, + "learning_rate": 4.963118198988133e-05, + "loss": 2.2935, + "step": 1530 + }, + { + "epoch": 0.27, + "grad_norm": 1.603891134262085, + "learning_rate": 4.9628771891339806e-05, + "loss": 2.0039, + "step": 1535 + }, + { + "epoch": 0.28, + "grad_norm": 0.5811793804168701, + "learning_rate": 4.962635400274142e-05, + "loss": 1.8798, + "step": 1540 + }, + { + "epoch": 0.28, + "grad_norm": 0.6387256383895874, + "learning_rate": 4.962392832485095e-05, + "loss": 1.814, + "step": 1545 + }, + { + "epoch": 0.28, + "grad_norm": 0.7429436445236206, + "learning_rate": 4.962149485843564e-05, + "loss": 1.8334, + "step": 1550 + }, + { + "epoch": 0.28, + "grad_norm": 0.6524866819381714, + "learning_rate": 4.9619053604265185e-05, + "loss": 1.9859, + "step": 1555 + }, + { + "epoch": 0.28, + "grad_norm": 0.7228694558143616, + "learning_rate": 4.961660456311176e-05, + "loss": 1.8701, + "step": 1560 + }, + { + "epoch": 0.28, + "grad_norm": 0.49972599744796753, + "learning_rate": 4.961414773574998e-05, + "loss": 2.0535, + "step": 1565 + }, + { + "epoch": 0.28, + "grad_norm": 0.3244457542896271, + "learning_rate": 4.961168312295696e-05, + "loss": 1.7632, + "step": 1570 + }, + { + "epoch": 0.28, + "grad_norm": 0.6015215516090393, + "learning_rate": 4.9609210725512234e-05, + "loss": 1.8848, + "step": 1575 + }, + { + "epoch": 0.28, + "grad_norm": 0.5646616220474243, + "learning_rate": 4.9606730544197834e-05, + "loss": 1.8899, + "step": 1580 + }, + { + "epoch": 0.28, + "grad_norm": 0.66986483335495, + "learning_rate": 4.960424257979822e-05, + "loss": 2.0326, + "step": 1585 + }, + { + "epoch": 0.28, + "grad_norm": 1.0103802680969238, + "learning_rate": 4.960174683310035e-05, + "loss": 1.9453, + "step": 1590 + }, + { + "epoch": 0.29, + "grad_norm": 0.6305530667304993, + "learning_rate": 4.9599243304893625e-05, + "loss": 1.6763, + "step": 1595 + }, + { + "epoch": 0.29, + "grad_norm": 0.6252381801605225, + "learning_rate": 4.95967319959699e-05, + "loss": 1.7975, + "step": 1600 + }, + { + "epoch": 0.29, + "grad_norm": 1.2989176511764526, + "learning_rate": 4.95942129071235e-05, + "loss": 2.024, + "step": 1605 + }, + { + "epoch": 0.29, + "grad_norm": 0.5174963474273682, + "learning_rate": 4.959168603915122e-05, + "loss": 1.7532, + "step": 1610 + }, + { + "epoch": 0.29, + "grad_norm": 0.730141282081604, + "learning_rate": 4.95891513928523e-05, + "loss": 1.7375, + "step": 1615 + }, + { + "epoch": 0.29, + "grad_norm": 0.690668523311615, + "learning_rate": 4.958660896902844e-05, + "loss": 2.0428, + "step": 1620 + }, + { + "epoch": 0.29, + "grad_norm": 0.5279943346977234, + "learning_rate": 4.958405876848382e-05, + "loss": 1.8309, + "step": 1625 + }, + { + "epoch": 0.29, + "grad_norm": 0.4510974586009979, + "learning_rate": 4.958150079202505e-05, + "loss": 1.7792, + "step": 1630 + }, + { + "epoch": 0.29, + "grad_norm": 0.628743052482605, + "learning_rate": 4.957893504046123e-05, + "loss": 2.0351, + "step": 1635 + }, + { + "epoch": 0.29, + "grad_norm": 0.7251464128494263, + "learning_rate": 4.95763615146039e-05, + "loss": 1.9267, + "step": 1640 + }, + { + "epoch": 0.29, + "grad_norm": 0.6540181040763855, + "learning_rate": 4.957378021526705e-05, + "loss": 1.8565, + "step": 1645 + }, + { + "epoch": 0.3, + "grad_norm": 0.511382520198822, + "learning_rate": 4.957119114326717e-05, + "loss": 2.0029, + "step": 1650 + }, + { + "epoch": 0.3, + "grad_norm": 0.8582594990730286, + "learning_rate": 4.9568594299423154e-05, + "loss": 1.9714, + "step": 1655 + }, + { + "epoch": 0.3, + "grad_norm": 0.29603156447410583, + "learning_rate": 4.9565989684556405e-05, + "loss": 2.0099, + "step": 1660 + }, + { + "epoch": 0.3, + "grad_norm": 0.5270999073982239, + "learning_rate": 4.956337729949074e-05, + "loss": 1.9188, + "step": 1665 + }, + { + "epoch": 0.3, + "grad_norm": 0.8146567940711975, + "learning_rate": 4.9560757145052465e-05, + "loss": 2.0585, + "step": 1670 + }, + { + "epoch": 0.3, + "grad_norm": 0.967383861541748, + "learning_rate": 4.955812922207033e-05, + "loss": 1.86, + "step": 1675 + }, + { + "epoch": 0.3, + "grad_norm": 0.5086208581924438, + "learning_rate": 4.955549353137554e-05, + "loss": 1.8033, + "step": 1680 + }, + { + "epoch": 0.3, + "grad_norm": 0.7942809462547302, + "learning_rate": 4.955285007380177e-05, + "loss": 1.9515, + "step": 1685 + }, + { + "epoch": 0.3, + "grad_norm": 0.9034421443939209, + "learning_rate": 4.9550198850185136e-05, + "loss": 1.747, + "step": 1690 + }, + { + "epoch": 0.3, + "grad_norm": 0.7966691851615906, + "learning_rate": 4.954753986136422e-05, + "loss": 2.2449, + "step": 1695 + }, + { + "epoch": 0.3, + "grad_norm": 0.7726231813430786, + "learning_rate": 4.954487310818006e-05, + "loss": 2.1002, + "step": 1700 + }, + { + "epoch": 0.31, + "grad_norm": 0.5616759061813354, + "learning_rate": 4.9542198591476144e-05, + "loss": 2.0595, + "step": 1705 + }, + { + "epoch": 0.31, + "grad_norm": 0.9209871888160706, + "learning_rate": 4.953951631209842e-05, + "loss": 1.7772, + "step": 1710 + }, + { + "epoch": 0.31, + "grad_norm": 0.5064948201179504, + "learning_rate": 4.953682627089529e-05, + "loss": 1.9936, + "step": 1715 + }, + { + "epoch": 0.31, + "grad_norm": 0.5640893578529358, + "learning_rate": 4.953412846871761e-05, + "loss": 2.0519, + "step": 1720 + }, + { + "epoch": 0.31, + "grad_norm": 0.8482123017311096, + "learning_rate": 4.95314229064187e-05, + "loss": 1.573, + "step": 1725 + }, + { + "epoch": 0.31, + "grad_norm": 1.2031537294387817, + "learning_rate": 4.952870958485432e-05, + "loss": 1.8774, + "step": 1730 + }, + { + "epoch": 0.31, + "grad_norm": 0.6398987174034119, + "learning_rate": 4.952598850488269e-05, + "loss": 2.3118, + "step": 1735 + }, + { + "epoch": 0.31, + "grad_norm": 1.2966880798339844, + "learning_rate": 4.952325966736449e-05, + "loss": 1.9382, + "step": 1740 + }, + { + "epoch": 0.31, + "grad_norm": 1.0261720418930054, + "learning_rate": 4.952052307316284e-05, + "loss": 1.8073, + "step": 1745 + }, + { + "epoch": 0.31, + "grad_norm": 1.920386791229248, + "learning_rate": 4.951777872314333e-05, + "loss": 1.752, + "step": 1750 + }, + { + "epoch": 0.31, + "grad_norm": 0.49061957001686096, + "learning_rate": 4.951502661817399e-05, + "loss": 2.0995, + "step": 1755 + }, + { + "epoch": 0.32, + "grad_norm": 0.8655161261558533, + "learning_rate": 4.951226675912532e-05, + "loss": 1.8597, + "step": 1760 + }, + { + "epoch": 0.32, + "grad_norm": 0.9834558367729187, + "learning_rate": 4.9509499146870236e-05, + "loss": 1.9147, + "step": 1765 + }, + { + "epoch": 0.32, + "grad_norm": 1.557944893836975, + "learning_rate": 4.950672378228416e-05, + "loss": 1.8975, + "step": 1770 + }, + { + "epoch": 0.32, + "grad_norm": 0.3454476296901703, + "learning_rate": 4.950394066624492e-05, + "loss": 1.9411, + "step": 1775 + }, + { + "epoch": 0.32, + "grad_norm": 0.5218179225921631, + "learning_rate": 4.950114979963282e-05, + "loss": 1.8548, + "step": 1780 + }, + { + "epoch": 0.32, + "grad_norm": 0.9810051321983337, + "learning_rate": 4.94983511833306e-05, + "loss": 2.064, + "step": 1785 + }, + { + "epoch": 0.32, + "grad_norm": 0.9594024419784546, + "learning_rate": 4.949554481822347e-05, + "loss": 1.996, + "step": 1790 + }, + { + "epoch": 0.32, + "grad_norm": 0.664357602596283, + "learning_rate": 4.949273070519907e-05, + "loss": 1.8579, + "step": 1795 + }, + { + "epoch": 0.32, + "grad_norm": 0.5438924431800842, + "learning_rate": 4.948990884514752e-05, + "loss": 1.829, + "step": 1800 + }, + { + "epoch": 0.32, + "grad_norm": 0.6013981699943542, + "learning_rate": 4.948707923896134e-05, + "loss": 1.6491, + "step": 1805 + }, + { + "epoch": 0.32, + "grad_norm": 0.6347360014915466, + "learning_rate": 4.948424188753556e-05, + "loss": 2.038, + "step": 1810 + }, + { + "epoch": 0.32, + "grad_norm": 0.27879294753074646, + "learning_rate": 4.948139679176762e-05, + "loss": 1.9321, + "step": 1815 + }, + { + "epoch": 0.33, + "grad_norm": 0.48399272561073303, + "learning_rate": 4.9478543952557425e-05, + "loss": 1.7748, + "step": 1820 + }, + { + "epoch": 0.33, + "grad_norm": 0.5349772572517395, + "learning_rate": 4.9475683370807326e-05, + "loss": 2.0395, + "step": 1825 + }, + { + "epoch": 0.33, + "grad_norm": 1.5943443775177002, + "learning_rate": 4.9472815047422115e-05, + "loss": 1.759, + "step": 1830 + }, + { + "epoch": 0.33, + "grad_norm": 0.6108463406562805, + "learning_rate": 4.9469938983309045e-05, + "loss": 2.1658, + "step": 1835 + }, + { + "epoch": 0.33, + "grad_norm": 1.8649053573608398, + "learning_rate": 4.946705517937782e-05, + "loss": 1.9245, + "step": 1840 + }, + { + "epoch": 0.33, + "grad_norm": 1.3017230033874512, + "learning_rate": 4.946416363654056e-05, + "loss": 1.7443, + "step": 1845 + }, + { + "epoch": 0.33, + "grad_norm": 0.48545828461647034, + "learning_rate": 4.946126435571188e-05, + "loss": 1.9818, + "step": 1850 + }, + { + "epoch": 0.33, + "grad_norm": 0.7874136567115784, + "learning_rate": 4.945835733780881e-05, + "loss": 1.8089, + "step": 1855 + }, + { + "epoch": 0.33, + "grad_norm": 0.6153427958488464, + "learning_rate": 4.945544258375086e-05, + "loss": 1.896, + "step": 1860 + }, + { + "epoch": 0.33, + "grad_norm": 2.7674331665039062, + "learning_rate": 4.945252009445992e-05, + "loss": 2.0585, + "step": 1865 + }, + { + "epoch": 0.33, + "grad_norm": 1.3186115026474, + "learning_rate": 4.94495898708604e-05, + "loss": 2.1453, + "step": 1870 + }, + { + "epoch": 0.34, + "grad_norm": 0.5109472274780273, + "learning_rate": 4.944665191387913e-05, + "loss": 1.7282, + "step": 1875 + }, + { + "epoch": 0.34, + "grad_norm": 0.5344210267066956, + "learning_rate": 4.9443706224445366e-05, + "loss": 1.8201, + "step": 1880 + }, + { + "epoch": 0.34, + "grad_norm": 10.217705726623535, + "learning_rate": 4.9440752803490844e-05, + "loss": 2.2501, + "step": 1885 + }, + { + "epoch": 0.34, + "grad_norm": 0.9253365993499756, + "learning_rate": 4.9437791651949704e-05, + "loss": 1.9486, + "step": 1890 + }, + { + "epoch": 0.34, + "grad_norm": 0.6712260842323303, + "learning_rate": 4.943482277075858e-05, + "loss": 1.8439, + "step": 1895 + }, + { + "epoch": 0.34, + "grad_norm": 0.6539827585220337, + "learning_rate": 4.943184616085652e-05, + "loss": 1.9844, + "step": 1900 + }, + { + "epoch": 0.34, + "grad_norm": 0.4726671278476715, + "learning_rate": 4.9428861823185016e-05, + "loss": 1.7578, + "step": 1905 + }, + { + "epoch": 0.34, + "grad_norm": 0.7064640522003174, + "learning_rate": 4.942586975868801e-05, + "loss": 2.0314, + "step": 1910 + }, + { + "epoch": 0.34, + "grad_norm": 1.1073232889175415, + "learning_rate": 4.94228699683119e-05, + "loss": 1.9822, + "step": 1915 + }, + { + "epoch": 0.34, + "grad_norm": 0.4439001679420471, + "learning_rate": 4.941986245300552e-05, + "loss": 1.9749, + "step": 1920 + }, + { + "epoch": 0.34, + "grad_norm": 0.8952843546867371, + "learning_rate": 4.941684721372012e-05, + "loss": 1.9725, + "step": 1925 + }, + { + "epoch": 0.35, + "grad_norm": 0.6618747115135193, + "learning_rate": 4.941382425140944e-05, + "loss": 1.7512, + "step": 1930 + }, + { + "epoch": 0.35, + "grad_norm": 0.7568296790122986, + "learning_rate": 4.941079356702963e-05, + "loss": 1.9417, + "step": 1935 + }, + { + "epoch": 0.35, + "grad_norm": 0.7739149332046509, + "learning_rate": 4.9407755161539295e-05, + "loss": 2.0415, + "step": 1940 + }, + { + "epoch": 0.35, + "grad_norm": 0.6339311003684998, + "learning_rate": 4.9404709035899475e-05, + "loss": 2.0725, + "step": 1945 + }, + { + "epoch": 0.35, + "grad_norm": 0.8696390390396118, + "learning_rate": 4.940165519107367e-05, + "loss": 1.9008, + "step": 1950 + }, + { + "epoch": 0.35, + "grad_norm": 0.9669596552848816, + "learning_rate": 4.939859362802779e-05, + "loss": 2.1317, + "step": 1955 + }, + { + "epoch": 0.35, + "grad_norm": 1.293050765991211, + "learning_rate": 4.9395524347730224e-05, + "loss": 2.0905, + "step": 1960 + }, + { + "epoch": 0.35, + "grad_norm": 3.0222151279449463, + "learning_rate": 4.9392447351151766e-05, + "loss": 2.1604, + "step": 1965 + }, + { + "epoch": 0.35, + "grad_norm": 1.7745553255081177, + "learning_rate": 4.9389362639265673e-05, + "loss": 1.7385, + "step": 1970 + }, + { + "epoch": 0.35, + "grad_norm": 0.5706347227096558, + "learning_rate": 4.938627021304764e-05, + "loss": 1.6271, + "step": 1975 + }, + { + "epoch": 0.35, + "grad_norm": 0.8289479613304138, + "learning_rate": 4.9383170073475795e-05, + "loss": 2.0758, + "step": 1980 + }, + { + "epoch": 0.36, + "grad_norm": 0.584398090839386, + "learning_rate": 4.938006222153071e-05, + "loss": 2.0172, + "step": 1985 + }, + { + "epoch": 0.36, + "grad_norm": 1.0959614515304565, + "learning_rate": 4.9376946658195385e-05, + "loss": 1.963, + "step": 1990 + }, + { + "epoch": 0.36, + "grad_norm": 0.6466994881629944, + "learning_rate": 4.93738233844553e-05, + "loss": 2.0027, + "step": 1995 + }, + { + "epoch": 0.36, + "grad_norm": 1.194799780845642, + "learning_rate": 4.93706924012983e-05, + "loss": 2.0338, + "step": 2000 + }, + { + "epoch": 0.36, + "grad_norm": 4.551344871520996, + "learning_rate": 4.9367553709714754e-05, + "loss": 2.0308, + "step": 2005 + }, + { + "epoch": 0.36, + "grad_norm": 0.7082229852676392, + "learning_rate": 4.9364407310697394e-05, + "loss": 1.6489, + "step": 2010 + }, + { + "epoch": 0.36, + "grad_norm": 0.5904958248138428, + "learning_rate": 4.9361253205241446e-05, + "loss": 1.8373, + "step": 2015 + }, + { + "epoch": 0.36, + "grad_norm": 1.1805599927902222, + "learning_rate": 4.9358091394344543e-05, + "loss": 1.8094, + "step": 2020 + }, + { + "epoch": 0.36, + "grad_norm": 0.45778775215148926, + "learning_rate": 4.9354921879006755e-05, + "loss": 1.8818, + "step": 2025 + }, + { + "epoch": 0.36, + "grad_norm": 0.8848242163658142, + "learning_rate": 4.93517446602306e-05, + "loss": 1.923, + "step": 2030 + }, + { + "epoch": 0.36, + "grad_norm": 0.4579658508300781, + "learning_rate": 4.934855973902105e-05, + "loss": 2.0447, + "step": 2035 + }, + { + "epoch": 0.37, + "grad_norm": 0.5983306169509888, + "learning_rate": 4.934536711638546e-05, + "loss": 2.0134, + "step": 2040 + }, + { + "epoch": 0.37, + "grad_norm": 1.4230965375900269, + "learning_rate": 4.934216679333367e-05, + "loss": 2.1751, + "step": 2045 + }, + { + "epoch": 0.37, + "grad_norm": 0.8285879492759705, + "learning_rate": 4.933895877087794e-05, + "loss": 1.8737, + "step": 2050 + }, + { + "epoch": 0.37, + "grad_norm": 0.829188346862793, + "learning_rate": 4.933574305003296e-05, + "loss": 1.9408, + "step": 2055 + }, + { + "epoch": 0.37, + "grad_norm": 0.6285657286643982, + "learning_rate": 4.933251963181586e-05, + "loss": 1.8272, + "step": 2060 + }, + { + "epoch": 0.37, + "grad_norm": 0.5883930921554565, + "learning_rate": 4.932928851724621e-05, + "loss": 1.8055, + "step": 2065 + }, + { + "epoch": 0.37, + "grad_norm": 0.6375488042831421, + "learning_rate": 4.9326049707346e-05, + "loss": 1.8569, + "step": 2070 + }, + { + "epoch": 0.37, + "grad_norm": 0.6939455270767212, + "learning_rate": 4.9322803203139666e-05, + "loss": 1.5819, + "step": 2075 + }, + { + "epoch": 0.37, + "grad_norm": 0.8837314248085022, + "learning_rate": 4.931954900565408e-05, + "loss": 1.8026, + "step": 2080 + }, + { + "epoch": 0.37, + "grad_norm": 0.8050704598426819, + "learning_rate": 4.931628711591854e-05, + "loss": 1.9522, + "step": 2085 + }, + { + "epoch": 0.37, + "grad_norm": 0.8270391821861267, + "learning_rate": 4.931301753496476e-05, + "loss": 1.9587, + "step": 2090 + }, + { + "epoch": 0.37, + "grad_norm": 1.1010689735412598, + "learning_rate": 4.930974026382693e-05, + "loss": 2.0557, + "step": 2095 + }, + { + "epoch": 0.38, + "grad_norm": 4.389054298400879, + "learning_rate": 4.930645530354163e-05, + "loss": 1.7961, + "step": 2100 + }, + { + "epoch": 0.38, + "grad_norm": 0.7086299061775208, + "learning_rate": 4.93031626551479e-05, + "loss": 1.8631, + "step": 2105 + }, + { + "epoch": 0.38, + "grad_norm": 0.6452075242996216, + "learning_rate": 4.9299862319687204e-05, + "loss": 2.1335, + "step": 2110 + }, + { + "epoch": 0.38, + "grad_norm": 0.5691851377487183, + "learning_rate": 4.9296554298203423e-05, + "loss": 2.1724, + "step": 2115 + }, + { + "epoch": 0.38, + "grad_norm": 0.40636715292930603, + "learning_rate": 4.929323859174289e-05, + "loss": 1.908, + "step": 2120 + }, + { + "epoch": 0.38, + "grad_norm": 0.7919933795928955, + "learning_rate": 4.928991520135436e-05, + "loss": 1.8257, + "step": 2125 + }, + { + "epoch": 0.38, + "grad_norm": 1.567937970161438, + "learning_rate": 4.928658412808901e-05, + "loss": 1.8642, + "step": 2130 + }, + { + "epoch": 0.38, + "grad_norm": 0.49311545491218567, + "learning_rate": 4.9283245373000465e-05, + "loss": 1.8398, + "step": 2135 + }, + { + "epoch": 0.38, + "grad_norm": 0.7059115767478943, + "learning_rate": 4.927989893714477e-05, + "loss": 1.879, + "step": 2140 + }, + { + "epoch": 0.38, + "grad_norm": 0.5949462652206421, + "learning_rate": 4.927654482158039e-05, + "loss": 1.9516, + "step": 2145 + }, + { + "epoch": 0.38, + "grad_norm": 0.8240839838981628, + "learning_rate": 4.927318302736824e-05, + "loss": 2.192, + "step": 2150 + }, + { + "epoch": 0.39, + "grad_norm": 0.7169739603996277, + "learning_rate": 4.926981355557164e-05, + "loss": 1.7331, + "step": 2155 + }, + { + "epoch": 0.39, + "grad_norm": 3.85520076751709, + "learning_rate": 4.926643640725635e-05, + "loss": 1.9134, + "step": 2160 + }, + { + "epoch": 0.39, + "grad_norm": 0.6009440422058105, + "learning_rate": 4.9263051583490584e-05, + "loss": 1.8528, + "step": 2165 + }, + { + "epoch": 0.39, + "grad_norm": 0.7495572566986084, + "learning_rate": 4.9259659085344925e-05, + "loss": 1.7285, + "step": 2170 + }, + { + "epoch": 0.39, + "grad_norm": 0.4452281892299652, + "learning_rate": 4.925625891389244e-05, + "loss": 1.9832, + "step": 2175 + }, + { + "epoch": 0.39, + "grad_norm": 0.6736578345298767, + "learning_rate": 4.925285107020859e-05, + "loss": 2.4924, + "step": 2180 + }, + { + "epoch": 0.39, + "grad_norm": 0.6530075073242188, + "learning_rate": 4.924943555537128e-05, + "loss": 2.2028, + "step": 2185 + }, + { + "epoch": 0.39, + "grad_norm": 0.629514753818512, + "learning_rate": 4.924601237046082e-05, + "loss": 1.9472, + "step": 2190 + }, + { + "epoch": 0.39, + "grad_norm": 0.8466399908065796, + "learning_rate": 4.924258151655997e-05, + "loss": 1.8294, + "step": 2195 + }, + { + "epoch": 0.39, + "grad_norm": 0.8130293488502502, + "learning_rate": 4.92391429947539e-05, + "loss": 2.1553, + "step": 2200 + }, + { + "epoch": 0.39, + "grad_norm": 0.7702954411506653, + "learning_rate": 4.923569680613023e-05, + "loss": 2.0827, + "step": 2205 + }, + { + "epoch": 0.4, + "grad_norm": 0.4999507963657379, + "learning_rate": 4.923224295177897e-05, + "loss": 1.736, + "step": 2210 + }, + { + "epoch": 0.4, + "grad_norm": 1.1567188501358032, + "learning_rate": 4.922878143279256e-05, + "loss": 1.7352, + "step": 2215 + }, + { + "epoch": 0.4, + "grad_norm": 0.736792802810669, + "learning_rate": 4.92253122502659e-05, + "loss": 1.606, + "step": 2220 + }, + { + "epoch": 0.4, + "grad_norm": 0.5883451700210571, + "learning_rate": 4.922183540529627e-05, + "loss": 1.8151, + "step": 2225 + }, + { + "epoch": 0.4, + "grad_norm": 0.6268717646598816, + "learning_rate": 4.921835089898341e-05, + "loss": 2.3119, + "step": 2230 + }, + { + "epoch": 0.4, + "grad_norm": 2.0409209728240967, + "learning_rate": 4.9214858732429444e-05, + "loss": 1.9868, + "step": 2235 + }, + { + "epoch": 0.4, + "grad_norm": 0.6836051344871521, + "learning_rate": 4.921135890673896e-05, + "loss": 1.8223, + "step": 2240 + }, + { + "epoch": 0.4, + "grad_norm": 0.6672108173370361, + "learning_rate": 4.920785142301893e-05, + "loss": 2.0905, + "step": 2245 + }, + { + "epoch": 0.4, + "grad_norm": 0.7968783378601074, + "learning_rate": 4.920433628237879e-05, + "loss": 1.8637, + "step": 2250 + }, + { + "epoch": 0.4, + "grad_norm": 0.7804060578346252, + "learning_rate": 4.9200813485930375e-05, + "loss": 1.749, + "step": 2255 + }, + { + "epoch": 0.4, + "grad_norm": 1.4104400873184204, + "learning_rate": 4.9197283034787925e-05, + "loss": 1.8482, + "step": 2260 + }, + { + "epoch": 0.41, + "grad_norm": 0.480487585067749, + "learning_rate": 4.919374493006812e-05, + "loss": 1.966, + "step": 2265 + }, + { + "epoch": 0.41, + "grad_norm": 0.7626844644546509, + "learning_rate": 4.919019917289006e-05, + "loss": 1.796, + "step": 2270 + }, + { + "epoch": 0.41, + "grad_norm": 0.4077138304710388, + "learning_rate": 4.918664576437528e-05, + "loss": 2.1519, + "step": 2275 + }, + { + "epoch": 0.41, + "grad_norm": 0.5030271410942078, + "learning_rate": 4.918308470564771e-05, + "loss": 1.8834, + "step": 2280 + }, + { + "epoch": 0.41, + "grad_norm": 0.5887728929519653, + "learning_rate": 4.9179515997833706e-05, + "loss": 2.0374, + "step": 2285 + }, + { + "epoch": 0.41, + "grad_norm": 0.766281247138977, + "learning_rate": 4.917593964206205e-05, + "loss": 1.8359, + "step": 2290 + }, + { + "epoch": 0.41, + "grad_norm": 0.6358659267425537, + "learning_rate": 4.917235563946394e-05, + "loss": 1.7785, + "step": 2295 + }, + { + "epoch": 0.41, + "grad_norm": 0.4064447283744812, + "learning_rate": 4.916876399117299e-05, + "loss": 2.0286, + "step": 2300 + }, + { + "epoch": 0.41, + "grad_norm": 0.7041420936584473, + "learning_rate": 4.916516469832524e-05, + "loss": 1.7674, + "step": 2305 + }, + { + "epoch": 0.41, + "grad_norm": 0.7361471056938171, + "learning_rate": 4.916155776205913e-05, + "loss": 1.7543, + "step": 2310 + }, + { + "epoch": 0.41, + "grad_norm": 1.4390116930007935, + "learning_rate": 4.915794318351555e-05, + "loss": 1.7741, + "step": 2315 + }, + { + "epoch": 0.42, + "grad_norm": 0.5468139052391052, + "learning_rate": 4.915432096383779e-05, + "loss": 1.8634, + "step": 2320 + }, + { + "epoch": 0.42, + "grad_norm": 0.36798736453056335, + "learning_rate": 4.915069110417152e-05, + "loss": 1.9809, + "step": 2325 + }, + { + "epoch": 0.42, + "grad_norm": 0.6476161479949951, + "learning_rate": 4.914705360566489e-05, + "loss": 1.8003, + "step": 2330 + }, + { + "epoch": 0.42, + "grad_norm": 0.4177202880382538, + "learning_rate": 4.914340846946844e-05, + "loss": 1.8988, + "step": 2335 + }, + { + "epoch": 0.42, + "grad_norm": 0.6585730314254761, + "learning_rate": 4.91397556967351e-05, + "loss": 2.0532, + "step": 2340 + }, + { + "epoch": 0.42, + "grad_norm": 0.7709575295448303, + "learning_rate": 4.913609528862027e-05, + "loss": 1.8792, + "step": 2345 + }, + { + "epoch": 0.42, + "grad_norm": 0.782526969909668, + "learning_rate": 4.9132427246281705e-05, + "loss": 1.7209, + "step": 2350 + }, + { + "epoch": 0.42, + "grad_norm": 0.9264500737190247, + "learning_rate": 4.9128751570879616e-05, + "loss": 1.9869, + "step": 2355 + }, + { + "epoch": 0.42, + "grad_norm": 0.5954318642616272, + "learning_rate": 4.912506826357661e-05, + "loss": 1.9642, + "step": 2360 + }, + { + "epoch": 0.42, + "grad_norm": 0.49041104316711426, + "learning_rate": 4.9121377325537724e-05, + "loss": 1.6972, + "step": 2365 + }, + { + "epoch": 0.42, + "grad_norm": 0.8424311280250549, + "learning_rate": 4.911767875793039e-05, + "loss": 1.7111, + "step": 2370 + }, + { + "epoch": 0.43, + "grad_norm": 0.7687463760375977, + "learning_rate": 4.911397256192446e-05, + "loss": 1.8755, + "step": 2375 + }, + { + "epoch": 0.43, + "grad_norm": 0.6133802533149719, + "learning_rate": 4.9110258738692204e-05, + "loss": 2.5453, + "step": 2380 + }, + { + "epoch": 0.43, + "grad_norm": 0.6727443337440491, + "learning_rate": 4.9106537289408305e-05, + "loss": 1.6229, + "step": 2385 + }, + { + "epoch": 0.43, + "grad_norm": 1.0047177076339722, + "learning_rate": 4.910280821524985e-05, + "loss": 2.0174, + "step": 2390 + }, + { + "epoch": 0.43, + "grad_norm": 0.6900597810745239, + "learning_rate": 4.909907151739633e-05, + "loss": 1.8986, + "step": 2395 + }, + { + "epoch": 0.43, + "grad_norm": 0.730687141418457, + "learning_rate": 4.909532719702968e-05, + "loss": 1.8659, + "step": 2400 + }, + { + "epoch": 0.43, + "grad_norm": 0.8315011858940125, + "learning_rate": 4.909157525533421e-05, + "loss": 1.6193, + "step": 2405 + }, + { + "epoch": 0.43, + "grad_norm": 2.3859171867370605, + "learning_rate": 4.908781569349666e-05, + "loss": 2.1098, + "step": 2410 + }, + { + "epoch": 0.43, + "grad_norm": 0.7538830637931824, + "learning_rate": 4.9084048512706174e-05, + "loss": 1.7888, + "step": 2415 + }, + { + "epoch": 0.43, + "grad_norm": 0.840951681137085, + "learning_rate": 4.908027371415431e-05, + "loss": 1.6484, + "step": 2420 + }, + { + "epoch": 0.43, + "grad_norm": 1.1713069677352905, + "learning_rate": 4.907649129903504e-05, + "loss": 1.9293, + "step": 2425 + }, + { + "epoch": 0.43, + "grad_norm": 0.6097638010978699, + "learning_rate": 4.9072701268544726e-05, + "loss": 1.9388, + "step": 2430 + }, + { + "epoch": 0.44, + "grad_norm": 0.6106088161468506, + "learning_rate": 4.906890362388215e-05, + "loss": 1.6406, + "step": 2435 + }, + { + "epoch": 0.44, + "grad_norm": 1.9904814958572388, + "learning_rate": 4.906509836624852e-05, + "loss": 1.6748, + "step": 2440 + }, + { + "epoch": 0.44, + "grad_norm": 1.0274256467819214, + "learning_rate": 4.906128549684741e-05, + "loss": 1.9952, + "step": 2445 + }, + { + "epoch": 0.44, + "grad_norm": 0.5849592685699463, + "learning_rate": 4.905746501688485e-05, + "loss": 1.7149, + "step": 2450 + }, + { + "epoch": 0.44, + "grad_norm": 1.4269081354141235, + "learning_rate": 4.9053636927569246e-05, + "loss": 2.0261, + "step": 2455 + }, + { + "epoch": 0.44, + "grad_norm": 0.9268386363983154, + "learning_rate": 4.904980123011142e-05, + "loss": 1.6845, + "step": 2460 + }, + { + "epoch": 0.44, + "grad_norm": 0.6730044484138489, + "learning_rate": 4.904595792572459e-05, + "loss": 1.9066, + "step": 2465 + }, + { + "epoch": 0.44, + "grad_norm": 0.5785349011421204, + "learning_rate": 4.9042107015624405e-05, + "loss": 1.9116, + "step": 2470 + }, + { + "epoch": 0.44, + "grad_norm": 7.519561767578125, + "learning_rate": 4.903824850102889e-05, + "loss": 1.9645, + "step": 2475 + }, + { + "epoch": 0.44, + "grad_norm": 0.5183913111686707, + "learning_rate": 4.903438238315851e-05, + "loss": 2.0975, + "step": 2480 + }, + { + "epoch": 0.44, + "grad_norm": 0.6047394871711731, + "learning_rate": 4.903050866323609e-05, + "loss": 1.9343, + "step": 2485 + }, + { + "epoch": 0.45, + "grad_norm": 1.560421109199524, + "learning_rate": 4.9026627342486886e-05, + "loss": 1.9887, + "step": 2490 + }, + { + "epoch": 0.45, + "grad_norm": 0.5357934236526489, + "learning_rate": 4.902273842213857e-05, + "loss": 2.06, + "step": 2495 + }, + { + "epoch": 0.45, + "grad_norm": 0.7645955085754395, + "learning_rate": 4.901884190342121e-05, + "loss": 1.7665, + "step": 2500 + }, + { + "epoch": 0.45, + "grad_norm": 0.7442848086357117, + "learning_rate": 4.901493778756725e-05, + "loss": 1.7705, + "step": 2505 + }, + { + "epoch": 0.45, + "grad_norm": 0.5181974768638611, + "learning_rate": 4.901102607581156e-05, + "loss": 2.1552, + "step": 2510 + }, + { + "epoch": 0.45, + "grad_norm": 0.5318804979324341, + "learning_rate": 4.900710676939143e-05, + "loss": 1.925, + "step": 2515 + }, + { + "epoch": 0.45, + "grad_norm": 0.42624104022979736, + "learning_rate": 4.900317986954651e-05, + "loss": 1.8521, + "step": 2520 + }, + { + "epoch": 0.45, + "grad_norm": 0.8136389255523682, + "learning_rate": 4.89992453775189e-05, + "loss": 1.8125, + "step": 2525 + }, + { + "epoch": 0.45, + "grad_norm": 0.8504977226257324, + "learning_rate": 4.8995303294553054e-05, + "loss": 1.704, + "step": 2530 + }, + { + "epoch": 0.45, + "grad_norm": 0.5383654236793518, + "learning_rate": 4.899135362189585e-05, + "loss": 1.8018, + "step": 2535 + }, + { + "epoch": 0.45, + "grad_norm": 0.590053915977478, + "learning_rate": 4.898739636079658e-05, + "loss": 1.7633, + "step": 2540 + }, + { + "epoch": 0.46, + "grad_norm": 0.6471716165542603, + "learning_rate": 4.898343151250691e-05, + "loss": 1.8805, + "step": 2545 + }, + { + "epoch": 0.46, + "grad_norm": 0.8511287569999695, + "learning_rate": 4.8979459078280924e-05, + "loss": 1.5846, + "step": 2550 + }, + { + "epoch": 0.46, + "grad_norm": 0.3268200755119324, + "learning_rate": 4.8975479059375093e-05, + "loss": 1.6611, + "step": 2555 + }, + { + "epoch": 0.46, + "grad_norm": 0.7143041491508484, + "learning_rate": 4.8971491457048305e-05, + "loss": 1.9187, + "step": 2560 + }, + { + "epoch": 0.46, + "grad_norm": 0.5510767698287964, + "learning_rate": 4.896749627256182e-05, + "loss": 2.0012, + "step": 2565 + }, + { + "epoch": 0.46, + "grad_norm": 0.34194180369377136, + "learning_rate": 4.8963493507179314e-05, + "loss": 1.8626, + "step": 2570 + }, + { + "epoch": 0.46, + "grad_norm": 1.2060723304748535, + "learning_rate": 4.895948316216687e-05, + "loss": 1.7144, + "step": 2575 + }, + { + "epoch": 0.46, + "grad_norm": 0.33225440979003906, + "learning_rate": 4.895546523879294e-05, + "loss": 1.709, + "step": 2580 + }, + { + "epoch": 0.46, + "grad_norm": 0.596155047416687, + "learning_rate": 4.895143973832841e-05, + "loss": 2.3011, + "step": 2585 + }, + { + "epoch": 0.46, + "grad_norm": 0.8214730620384216, + "learning_rate": 4.8947406662046516e-05, + "loss": 1.7885, + "step": 2590 + }, + { + "epoch": 0.46, + "grad_norm": 0.9236437082290649, + "learning_rate": 4.894336601122293e-05, + "loss": 2.1127, + "step": 2595 + }, + { + "epoch": 0.47, + "grad_norm": 0.6400870680809021, + "learning_rate": 4.893931778713572e-05, + "loss": 2.0813, + "step": 2600 + }, + { + "epoch": 0.47, + "grad_norm": 0.41277340054512024, + "learning_rate": 4.893526199106531e-05, + "loss": 2.1293, + "step": 2605 + }, + { + "epoch": 0.47, + "grad_norm": 1.0775052309036255, + "learning_rate": 4.893119862429455e-05, + "loss": 1.453, + "step": 2610 + }, + { + "epoch": 0.47, + "grad_norm": 0.6560648083686829, + "learning_rate": 4.892712768810869e-05, + "loss": 1.6368, + "step": 2615 + }, + { + "epoch": 0.47, + "grad_norm": 0.7044461369514465, + "learning_rate": 4.8923049183795355e-05, + "loss": 1.9884, + "step": 2620 + }, + { + "epoch": 0.47, + "grad_norm": 0.6141453385353088, + "learning_rate": 4.8918963112644576e-05, + "loss": 1.7104, + "step": 2625 + }, + { + "epoch": 0.47, + "grad_norm": 0.2917144298553467, + "learning_rate": 4.891486947594878e-05, + "loss": 1.9332, + "step": 2630 + }, + { + "epoch": 0.47, + "grad_norm": 0.97393798828125, + "learning_rate": 4.8910768275002763e-05, + "loss": 1.9389, + "step": 2635 + }, + { + "epoch": 0.47, + "grad_norm": 0.5923995971679688, + "learning_rate": 4.890665951110374e-05, + "loss": 2.0709, + "step": 2640 + }, + { + "epoch": 0.47, + "grad_norm": 0.865432620048523, + "learning_rate": 4.890254318555131e-05, + "loss": 2.1132, + "step": 2645 + }, + { + "epoch": 0.47, + "grad_norm": 0.7784671187400818, + "learning_rate": 4.889841929964746e-05, + "loss": 1.7297, + "step": 2650 + }, + { + "epoch": 0.48, + "grad_norm": 0.5125192999839783, + "learning_rate": 4.889428785469657e-05, + "loss": 1.9602, + "step": 2655 + }, + { + "epoch": 0.48, + "grad_norm": 0.7674616575241089, + "learning_rate": 4.8890148852005416e-05, + "loss": 1.9787, + "step": 2660 + }, + { + "epoch": 0.48, + "grad_norm": 1.3875999450683594, + "learning_rate": 4.888600229288317e-05, + "loss": 1.7716, + "step": 2665 + }, + { + "epoch": 0.48, + "grad_norm": 0.8313968181610107, + "learning_rate": 4.8881848178641364e-05, + "loss": 1.4916, + "step": 2670 + }, + { + "epoch": 0.48, + "grad_norm": 0.8791585564613342, + "learning_rate": 4.887768651059395e-05, + "loss": 1.9918, + "step": 2675 + }, + { + "epoch": 0.48, + "grad_norm": 0.9071698188781738, + "learning_rate": 4.887351729005726e-05, + "loss": 2.0969, + "step": 2680 + }, + { + "epoch": 0.48, + "grad_norm": 0.9274541139602661, + "learning_rate": 4.8869340518350024e-05, + "loss": 2.0151, + "step": 2685 + }, + { + "epoch": 0.48, + "grad_norm": 0.5907272100448608, + "learning_rate": 4.8865156196793334e-05, + "loss": 2.1632, + "step": 2690 + }, + { + "epoch": 0.48, + "grad_norm": 0.831947922706604, + "learning_rate": 4.88609643267107e-05, + "loss": 2.2977, + "step": 2695 + }, + { + "epoch": 0.48, + "grad_norm": 0.7203566431999207, + "learning_rate": 4.885676490942799e-05, + "loss": 2.071, + "step": 2700 + }, + { + "epoch": 0.48, + "grad_norm": 1.22580087184906, + "learning_rate": 4.885255794627349e-05, + "loss": 1.9375, + "step": 2705 + }, + { + "epoch": 0.49, + "grad_norm": 0.7402857542037964, + "learning_rate": 4.884834343857786e-05, + "loss": 2.1846, + "step": 2710 + }, + { + "epoch": 0.49, + "grad_norm": 0.610673189163208, + "learning_rate": 4.884412138767414e-05, + "loss": 1.9636, + "step": 2715 + }, + { + "epoch": 0.49, + "grad_norm": 0.7960859537124634, + "learning_rate": 4.883989179489775e-05, + "loss": 2.0377, + "step": 2720 + }, + { + "epoch": 0.49, + "grad_norm": 0.5835393071174622, + "learning_rate": 4.883565466158653e-05, + "loss": 2.1319, + "step": 2725 + }, + { + "epoch": 0.49, + "grad_norm": 0.7329149842262268, + "learning_rate": 4.8831409989080656e-05, + "loss": 1.7587, + "step": 2730 + }, + { + "epoch": 0.49, + "grad_norm": 0.5767766833305359, + "learning_rate": 4.882715777872273e-05, + "loss": 1.9912, + "step": 2735 + }, + { + "epoch": 0.49, + "grad_norm": 0.826280951499939, + "learning_rate": 4.882289803185772e-05, + "loss": 1.6964, + "step": 2740 + }, + { + "epoch": 0.49, + "grad_norm": 0.6336571574211121, + "learning_rate": 4.881863074983298e-05, + "loss": 2.1097, + "step": 2745 + }, + { + "epoch": 0.49, + "grad_norm": 0.4215265214443207, + "learning_rate": 4.8814355933998235e-05, + "loss": 1.8692, + "step": 2750 + }, + { + "epoch": 0.49, + "grad_norm": 0.4438379108905792, + "learning_rate": 4.881007358570562e-05, + "loss": 1.797, + "step": 2755 + }, + { + "epoch": 0.49, + "grad_norm": 0.6177186369895935, + "learning_rate": 4.880578370630963e-05, + "loss": 1.7992, + "step": 2760 + }, + { + "epoch": 0.49, + "grad_norm": 0.6064658761024475, + "learning_rate": 4.8801486297167156e-05, + "loss": 1.9718, + "step": 2765 + }, + { + "epoch": 0.5, + "grad_norm": 0.5778130888938904, + "learning_rate": 4.879718135963746e-05, + "loss": 2.0216, + "step": 2770 + }, + { + "epoch": 0.5, + "grad_norm": 1.0478150844573975, + "learning_rate": 4.879286889508219e-05, + "loss": 1.6562, + "step": 2775 + }, + { + "epoch": 0.5, + "grad_norm": 0.9771472811698914, + "learning_rate": 4.8788548904865375e-05, + "loss": 1.8386, + "step": 2780 + }, + { + "epoch": 0.5, + "grad_norm": 0.5626159310340881, + "learning_rate": 4.878422139035341e-05, + "loss": 2.2397, + "step": 2785 + }, + { + "epoch": 0.5, + "grad_norm": 0.8984270691871643, + "learning_rate": 4.877988635291511e-05, + "loss": 1.8268, + "step": 2790 + }, + { + "epoch": 0.5, + "grad_norm": 0.7879625558853149, + "learning_rate": 4.877554379392163e-05, + "loss": 1.9463, + "step": 2795 + }, + { + "epoch": 0.5, + "grad_norm": 0.40900498628616333, + "learning_rate": 4.877119371474651e-05, + "loss": 1.8477, + "step": 2800 + }, + { + "epoch": 0.5, + "grad_norm": 0.685408353805542, + "learning_rate": 4.876683611676569e-05, + "loss": 1.9898, + "step": 2805 + }, + { + "epoch": 0.5, + "grad_norm": 0.5986139178276062, + "learning_rate": 4.876247100135746e-05, + "loss": 2.0433, + "step": 2810 + }, + { + "epoch": 0.5, + "grad_norm": 0.7480238676071167, + "learning_rate": 4.875809836990252e-05, + "loss": 1.9656, + "step": 2815 + }, + { + "epoch": 0.5, + "grad_norm": 7.595890522003174, + "learning_rate": 4.8753718223783905e-05, + "loss": 1.8644, + "step": 2820 + }, + { + "epoch": 0.51, + "grad_norm": 1.7732789516448975, + "learning_rate": 4.874933056438707e-05, + "loss": 1.8836, + "step": 2825 + }, + { + "epoch": 0.51, + "grad_norm": 0.520476222038269, + "learning_rate": 4.874493539309982e-05, + "loss": 2.0753, + "step": 2830 + }, + { + "epoch": 0.51, + "grad_norm": 0.2731196880340576, + "learning_rate": 4.874053271131235e-05, + "loss": 2.1824, + "step": 2835 + }, + { + "epoch": 0.51, + "grad_norm": 1.522787094116211, + "learning_rate": 4.8736122520417215e-05, + "loss": 1.776, + "step": 2840 + }, + { + "epoch": 0.51, + "grad_norm": 0.6246161460876465, + "learning_rate": 4.8731704821809354e-05, + "loss": 1.8492, + "step": 2845 + }, + { + "epoch": 0.51, + "grad_norm": 0.5303625464439392, + "learning_rate": 4.872727961688609e-05, + "loss": 1.7064, + "step": 2850 + }, + { + "epoch": 0.51, + "grad_norm": 0.939147412776947, + "learning_rate": 4.87228469070471e-05, + "loss": 1.8514, + "step": 2855 + }, + { + "epoch": 0.51, + "grad_norm": 0.5877857208251953, + "learning_rate": 4.8718406693694465e-05, + "loss": 1.7117, + "step": 2860 + }, + { + "epoch": 0.51, + "grad_norm": 0.4529130458831787, + "learning_rate": 4.87139589782326e-05, + "loss": 1.8436, + "step": 2865 + }, + { + "epoch": 0.51, + "grad_norm": 0.8624293208122253, + "learning_rate": 4.870950376206831e-05, + "loss": 1.8674, + "step": 2870 + }, + { + "epoch": 0.51, + "grad_norm": 0.6913080811500549, + "learning_rate": 4.87050410466108e-05, + "loss": 2.4175, + "step": 2875 + }, + { + "epoch": 0.52, + "grad_norm": 1.0130589008331299, + "learning_rate": 4.87005708332716e-05, + "loss": 1.8629, + "step": 2880 + }, + { + "epoch": 0.52, + "grad_norm": 0.40573757886886597, + "learning_rate": 4.869609312346465e-05, + "loss": 2.1532, + "step": 2885 + }, + { + "epoch": 0.52, + "grad_norm": 1.0451394319534302, + "learning_rate": 4.869160791860623e-05, + "loss": 1.9334, + "step": 2890 + }, + { + "epoch": 0.52, + "grad_norm": 3.0864038467407227, + "learning_rate": 4.868711522011502e-05, + "loss": 1.7183, + "step": 2895 + }, + { + "epoch": 0.52, + "grad_norm": 0.2843054234981537, + "learning_rate": 4.868261502941204e-05, + "loss": 2.0684, + "step": 2900 + }, + { + "epoch": 0.52, + "grad_norm": 0.6705717444419861, + "learning_rate": 4.867810734792072e-05, + "loss": 1.8852, + "step": 2905 + }, + { + "epoch": 0.52, + "grad_norm": 0.8164358735084534, + "learning_rate": 4.867359217706681e-05, + "loss": 2.0016, + "step": 2910 + }, + { + "epoch": 0.52, + "grad_norm": 0.6517994403839111, + "learning_rate": 4.866906951827847e-05, + "loss": 1.936, + "step": 2915 + }, + { + "epoch": 0.52, + "grad_norm": 0.3796793818473816, + "learning_rate": 4.866453937298621e-05, + "loss": 1.9839, + "step": 2920 + }, + { + "epoch": 0.52, + "grad_norm": 0.5019554495811462, + "learning_rate": 4.866000174262291e-05, + "loss": 1.6889, + "step": 2925 + }, + { + "epoch": 0.52, + "grad_norm": 1.4469722509384155, + "learning_rate": 4.865545662862381e-05, + "loss": 1.9799, + "step": 2930 + }, + { + "epoch": 0.53, + "grad_norm": 1.0865347385406494, + "learning_rate": 4.865090403242654e-05, + "loss": 1.7499, + "step": 2935 + }, + { + "epoch": 0.53, + "grad_norm": 0.9820341467857361, + "learning_rate": 4.864634395547106e-05, + "loss": 1.9783, + "step": 2940 + }, + { + "epoch": 0.53, + "grad_norm": 0.41876184940338135, + "learning_rate": 4.8641776399199743e-05, + "loss": 1.991, + "step": 2945 + }, + { + "epoch": 0.53, + "grad_norm": 1.171912431716919, + "learning_rate": 4.863720136505729e-05, + "loss": 1.7386, + "step": 2950 + }, + { + "epoch": 0.53, + "grad_norm": 0.8185043931007385, + "learning_rate": 4.863261885449078e-05, + "loss": 1.852, + "step": 2955 + }, + { + "epoch": 0.53, + "grad_norm": 0.5740297436714172, + "learning_rate": 4.862802886894966e-05, + "loss": 1.4815, + "step": 2960 + }, + { + "epoch": 0.53, + "grad_norm": 0.5629348754882812, + "learning_rate": 4.862343140988573e-05, + "loss": 1.9167, + "step": 2965 + }, + { + "epoch": 0.53, + "grad_norm": 0.5648974180221558, + "learning_rate": 4.861882647875318e-05, + "loss": 2.1185, + "step": 2970 + }, + { + "epoch": 0.53, + "grad_norm": 0.46517616510391235, + "learning_rate": 4.861421407700853e-05, + "loss": 1.8994, + "step": 2975 + }, + { + "epoch": 0.53, + "grad_norm": 0.4326724410057068, + "learning_rate": 4.860959420611067e-05, + "loss": 1.9796, + "step": 2980 + }, + { + "epoch": 0.53, + "grad_norm": 1.0473462343215942, + "learning_rate": 4.860496686752088e-05, + "loss": 1.5259, + "step": 2985 + }, + { + "epoch": 0.54, + "grad_norm": 0.6728710532188416, + "learning_rate": 4.860033206270277e-05, + "loss": 2.0169, + "step": 2990 + }, + { + "epoch": 0.54, + "grad_norm": 1.2514013051986694, + "learning_rate": 4.859568979312233e-05, + "loss": 1.8368, + "step": 2995 + }, + { + "epoch": 0.54, + "grad_norm": 0.6908515095710754, + "learning_rate": 4.8591040060247914e-05, + "loss": 2.2935, + "step": 3000 + }, + { + "epoch": 0.54, + "grad_norm": 0.934448778629303, + "learning_rate": 4.8586382865550216e-05, + "loss": 1.9923, + "step": 3005 + }, + { + "epoch": 0.54, + "grad_norm": 0.7105411887168884, + "learning_rate": 4.85817182105023e-05, + "loss": 1.8267, + "step": 3010 + }, + { + "epoch": 0.54, + "grad_norm": 0.6507242321968079, + "learning_rate": 4.8577046096579596e-05, + "loss": 1.7661, + "step": 3015 + }, + { + "epoch": 0.54, + "grad_norm": 0.576221227645874, + "learning_rate": 4.857236652525989e-05, + "loss": 1.7314, + "step": 3020 + }, + { + "epoch": 0.54, + "grad_norm": 1.374879002571106, + "learning_rate": 4.856767949802333e-05, + "loss": 2.1544, + "step": 3025 + }, + { + "epoch": 0.54, + "grad_norm": 0.6787958145141602, + "learning_rate": 4.856298501635241e-05, + "loss": 1.5669, + "step": 3030 + }, + { + "epoch": 0.54, + "grad_norm": 0.8718627095222473, + "learning_rate": 4.8558283081732004e-05, + "loss": 1.8672, + "step": 3035 + }, + { + "epoch": 0.54, + "grad_norm": 0.5774152278900146, + "learning_rate": 4.855357369564931e-05, + "loss": 1.7822, + "step": 3040 + }, + { + "epoch": 0.55, + "grad_norm": 0.5398827791213989, + "learning_rate": 4.854885685959391e-05, + "loss": 1.974, + "step": 3045 + }, + { + "epoch": 0.55, + "grad_norm": 1.1588878631591797, + "learning_rate": 4.8544132575057745e-05, + "loss": 1.5734, + "step": 3050 + }, + { + "epoch": 0.55, + "grad_norm": 0.9843418598175049, + "learning_rate": 4.8539400843535086e-05, + "loss": 2.267, + "step": 3055 + }, + { + "epoch": 0.55, + "grad_norm": 0.5535194277763367, + "learning_rate": 4.8534661666522584e-05, + "loss": 1.5894, + "step": 3060 + }, + { + "epoch": 0.55, + "grad_norm": 1.1449483633041382, + "learning_rate": 4.852991504551924e-05, + "loss": 1.7232, + "step": 3065 + }, + { + "epoch": 0.55, + "grad_norm": 0.4027251601219177, + "learning_rate": 4.852516098202639e-05, + "loss": 1.934, + "step": 3070 + }, + { + "epoch": 0.55, + "grad_norm": 3.9327101707458496, + "learning_rate": 4.852039947754775e-05, + "loss": 1.9557, + "step": 3075 + }, + { + "epoch": 0.55, + "grad_norm": 0.7538480162620544, + "learning_rate": 4.851563053358938e-05, + "loss": 1.759, + "step": 3080 + }, + { + "epoch": 0.55, + "grad_norm": 0.788642942905426, + "learning_rate": 4.85108541516597e-05, + "loss": 1.9517, + "step": 3085 + }, + { + "epoch": 0.55, + "grad_norm": 0.7686721086502075, + "learning_rate": 4.850607033326946e-05, + "loss": 1.676, + "step": 3090 + }, + { + "epoch": 0.55, + "grad_norm": 1.1780756711959839, + "learning_rate": 4.850127907993178e-05, + "loss": 2.0255, + "step": 3095 + }, + { + "epoch": 0.55, + "grad_norm": 0.5100767612457275, + "learning_rate": 4.8496480393162144e-05, + "loss": 1.7199, + "step": 3100 + }, + { + "epoch": 0.56, + "grad_norm": 1.0231378078460693, + "learning_rate": 4.849167427447835e-05, + "loss": 2.0379, + "step": 3105 + }, + { + "epoch": 0.56, + "grad_norm": 0.6608045697212219, + "learning_rate": 4.8486860725400584e-05, + "loss": 1.9066, + "step": 3110 + }, + { + "epoch": 0.56, + "grad_norm": 0.8580523133277893, + "learning_rate": 4.848203974745136e-05, + "loss": 1.842, + "step": 3115 + }, + { + "epoch": 0.56, + "grad_norm": 1.7334781885147095, + "learning_rate": 4.847721134215555e-05, + "loss": 2.1667, + "step": 3120 + }, + { + "epoch": 0.56, + "grad_norm": 0.904914379119873, + "learning_rate": 4.847237551104037e-05, + "loss": 2.0194, + "step": 3125 + }, + { + "epoch": 0.56, + "grad_norm": 0.9545987248420715, + "learning_rate": 4.846753225563541e-05, + "loss": 1.9362, + "step": 3130 + }, + { + "epoch": 0.56, + "grad_norm": 0.530463457107544, + "learning_rate": 4.846268157747256e-05, + "loss": 2.1253, + "step": 3135 + }, + { + "epoch": 0.56, + "grad_norm": 0.36319640278816223, + "learning_rate": 4.845782347808609e-05, + "loss": 1.9925, + "step": 3140 + }, + { + "epoch": 0.56, + "grad_norm": 1.2937252521514893, + "learning_rate": 4.845295795901262e-05, + "loss": 2.2135, + "step": 3145 + }, + { + "epoch": 0.56, + "grad_norm": 0.6018854379653931, + "learning_rate": 4.8448085021791115e-05, + "loss": 1.7548, + "step": 3150 + }, + { + "epoch": 0.56, + "grad_norm": 0.49405333399772644, + "learning_rate": 4.844320466796286e-05, + "loss": 1.9319, + "step": 3155 + }, + { + "epoch": 0.57, + "grad_norm": 1.0258450508117676, + "learning_rate": 4.8438316899071525e-05, + "loss": 1.969, + "step": 3160 + }, + { + "epoch": 0.57, + "grad_norm": 1.1361362934112549, + "learning_rate": 4.84334217166631e-05, + "loss": 1.9683, + "step": 3165 + }, + { + "epoch": 0.57, + "grad_norm": 0.8203340768814087, + "learning_rate": 4.842851912228592e-05, + "loss": 2.0704, + "step": 3170 + }, + { + "epoch": 0.57, + "grad_norm": 0.7416129112243652, + "learning_rate": 4.8423609117490686e-05, + "loss": 1.8224, + "step": 3175 + }, + { + "epoch": 0.57, + "grad_norm": 1.2014813423156738, + "learning_rate": 4.841869170383041e-05, + "loss": 1.8246, + "step": 3180 + }, + { + "epoch": 0.57, + "grad_norm": 0.5674131512641907, + "learning_rate": 4.841376688286048e-05, + "loss": 2.114, + "step": 3185 + }, + { + "epoch": 0.57, + "grad_norm": 0.6258872151374817, + "learning_rate": 4.8408834656138605e-05, + "loss": 1.9057, + "step": 3190 + }, + { + "epoch": 0.57, + "grad_norm": 0.6485921144485474, + "learning_rate": 4.840389502522484e-05, + "loss": 1.9694, + "step": 3195 + }, + { + "epoch": 0.57, + "grad_norm": 0.786283016204834, + "learning_rate": 4.83989479916816e-05, + "loss": 1.9653, + "step": 3200 + }, + { + "epoch": 0.57, + "grad_norm": 0.5875077843666077, + "learning_rate": 4.8393993557073605e-05, + "loss": 1.8774, + "step": 3205 + }, + { + "epoch": 0.57, + "grad_norm": 0.5160438418388367, + "learning_rate": 4.838903172296796e-05, + "loss": 1.9262, + "step": 3210 + }, + { + "epoch": 0.58, + "grad_norm": 0.4956705868244171, + "learning_rate": 4.838406249093408e-05, + "loss": 2.0985, + "step": 3215 + }, + { + "epoch": 0.58, + "grad_norm": 0.6459410190582275, + "learning_rate": 4.837908586254372e-05, + "loss": 1.9046, + "step": 3220 + }, + { + "epoch": 0.58, + "grad_norm": 0.8042646050453186, + "learning_rate": 4.837410183937099e-05, + "loss": 2.1288, + "step": 3225 + }, + { + "epoch": 0.58, + "grad_norm": 0.6269195079803467, + "learning_rate": 4.836911042299233e-05, + "loss": 2.0259, + "step": 3230 + }, + { + "epoch": 0.58, + "grad_norm": 0.8424185514450073, + "learning_rate": 4.8364111614986527e-05, + "loss": 1.6815, + "step": 3235 + }, + { + "epoch": 0.58, + "grad_norm": 0.9412960410118103, + "learning_rate": 4.835910541693469e-05, + "loss": 1.8376, + "step": 3240 + }, + { + "epoch": 0.58, + "grad_norm": 0.6458824276924133, + "learning_rate": 4.835409183042028e-05, + "loss": 1.803, + "step": 3245 + }, + { + "epoch": 0.58, + "grad_norm": 0.5122315287590027, + "learning_rate": 4.834907085702908e-05, + "loss": 2.0321, + "step": 3250 + }, + { + "epoch": 0.58, + "grad_norm": 0.6900401711463928, + "learning_rate": 4.834404249834924e-05, + "loss": 2.0708, + "step": 3255 + }, + { + "epoch": 0.58, + "grad_norm": 0.9199984669685364, + "learning_rate": 4.83390067559712e-05, + "loss": 1.7106, + "step": 3260 + }, + { + "epoch": 0.58, + "grad_norm": 0.7172281742095947, + "learning_rate": 4.8333963631487776e-05, + "loss": 2.0546, + "step": 3265 + }, + { + "epoch": 0.59, + "grad_norm": 0.753948986530304, + "learning_rate": 4.8328913126494104e-05, + "loss": 1.8142, + "step": 3270 + }, + { + "epoch": 0.59, + "grad_norm": 0.8077734708786011, + "learning_rate": 4.8323855242587644e-05, + "loss": 1.908, + "step": 3275 + }, + { + "epoch": 0.59, + "grad_norm": 0.6292251944541931, + "learning_rate": 4.831878998136821e-05, + "loss": 2.0195, + "step": 3280 + }, + { + "epoch": 0.59, + "grad_norm": 0.6925063133239746, + "learning_rate": 4.8313717344437936e-05, + "loss": 1.9374, + "step": 3285 + }, + { + "epoch": 0.59, + "grad_norm": 0.5431442856788635, + "learning_rate": 4.8308637333401284e-05, + "loss": 1.9119, + "step": 3290 + }, + { + "epoch": 0.59, + "grad_norm": 0.8268663883209229, + "learning_rate": 4.830354994986507e-05, + "loss": 1.8918, + "step": 3295 + }, + { + "epoch": 0.59, + "grad_norm": 0.5958759784698486, + "learning_rate": 4.829845519543842e-05, + "loss": 1.6231, + "step": 3300 + }, + { + "epoch": 0.59, + "grad_norm": 0.7670718431472778, + "learning_rate": 4.82933530717328e-05, + "loss": 1.8069, + "step": 3305 + }, + { + "epoch": 0.59, + "grad_norm": 0.5632594227790833, + "learning_rate": 4.828824358036201e-05, + "loss": 1.8654, + "step": 3310 + }, + { + "epoch": 0.59, + "grad_norm": 1.0903352499008179, + "learning_rate": 4.8283126722942176e-05, + "loss": 2.0945, + "step": 3315 + }, + { + "epoch": 0.59, + "grad_norm": 0.4936974346637726, + "learning_rate": 4.8278002501091755e-05, + "loss": 2.1163, + "step": 3320 + }, + { + "epoch": 0.6, + "grad_norm": 0.2784171402454376, + "learning_rate": 4.827287091643154e-05, + "loss": 1.9931, + "step": 3325 + }, + { + "epoch": 0.6, + "grad_norm": 0.8564805388450623, + "learning_rate": 4.8267731970584626e-05, + "loss": 2.1167, + "step": 3330 + }, + { + "epoch": 0.6, + "grad_norm": 2.2154970169067383, + "learning_rate": 4.8262585665176485e-05, + "loss": 1.7391, + "step": 3335 + }, + { + "epoch": 0.6, + "grad_norm": 0.9126550555229187, + "learning_rate": 4.825743200183486e-05, + "loss": 2.0901, + "step": 3340 + }, + { + "epoch": 0.6, + "grad_norm": 0.4065916836261749, + "learning_rate": 4.825227098218987e-05, + "loss": 1.991, + "step": 3345 + }, + { + "epoch": 0.6, + "grad_norm": 0.732071578502655, + "learning_rate": 4.824710260787394e-05, + "loss": 2.0214, + "step": 3350 + }, + { + "epoch": 0.6, + "grad_norm": 0.5489442348480225, + "learning_rate": 4.824192688052181e-05, + "loss": 1.8027, + "step": 3355 + }, + { + "epoch": 0.6, + "grad_norm": 0.5446940660476685, + "learning_rate": 4.8236743801770565e-05, + "loss": 1.7853, + "step": 3360 + }, + { + "epoch": 0.6, + "grad_norm": 0.5271531343460083, + "learning_rate": 4.823155337325961e-05, + "loss": 1.8151, + "step": 3365 + }, + { + "epoch": 0.6, + "grad_norm": 0.5761736631393433, + "learning_rate": 4.8226355596630655e-05, + "loss": 1.6954, + "step": 3370 + }, + { + "epoch": 0.6, + "grad_norm": 0.9606974124908447, + "learning_rate": 4.8221150473527774e-05, + "loss": 1.7548, + "step": 3375 + }, + { + "epoch": 0.61, + "grad_norm": 0.4853847026824951, + "learning_rate": 4.821593800559734e-05, + "loss": 1.9427, + "step": 3380 + }, + { + "epoch": 0.61, + "grad_norm": 0.9224936366081238, + "learning_rate": 4.8210718194488046e-05, + "loss": 1.707, + "step": 3385 + }, + { + "epoch": 0.61, + "grad_norm": 0.6053968071937561, + "learning_rate": 4.82054910418509e-05, + "loss": 2.1525, + "step": 3390 + }, + { + "epoch": 0.61, + "grad_norm": 0.5924646258354187, + "learning_rate": 4.8200256549339274e-05, + "loss": 1.6875, + "step": 3395 + }, + { + "epoch": 0.61, + "grad_norm": 8.748364448547363, + "learning_rate": 4.8195014718608814e-05, + "loss": 1.9114, + "step": 3400 + }, + { + "epoch": 0.61, + "grad_norm": 0.8054745197296143, + "learning_rate": 4.81897655513175e-05, + "loss": 1.935, + "step": 3405 + }, + { + "epoch": 0.61, + "grad_norm": 0.8999778032302856, + "learning_rate": 4.8184509049125645e-05, + "loss": 1.9618, + "step": 3410 + }, + { + "epoch": 0.61, + "grad_norm": 0.8038251996040344, + "learning_rate": 4.8179245213695887e-05, + "loss": 1.5852, + "step": 3415 + }, + { + "epoch": 0.61, + "grad_norm": 1.1532466411590576, + "learning_rate": 4.817397404669317e-05, + "loss": 1.7721, + "step": 3420 + }, + { + "epoch": 0.61, + "grad_norm": 1.0493075847625732, + "learning_rate": 4.816869554978475e-05, + "loss": 1.8461, + "step": 3425 + }, + { + "epoch": 0.61, + "grad_norm": 0.7406529784202576, + "learning_rate": 4.81634097246402e-05, + "loss": 1.9881, + "step": 3430 + }, + { + "epoch": 0.61, + "grad_norm": 0.6479585766792297, + "learning_rate": 4.8158116572931445e-05, + "loss": 1.7853, + "step": 3435 + }, + { + "epoch": 0.62, + "grad_norm": 0.9244452118873596, + "learning_rate": 4.815281609633269e-05, + "loss": 1.7463, + "step": 3440 + }, + { + "epoch": 0.62, + "grad_norm": 0.5907859802246094, + "learning_rate": 4.814750829652048e-05, + "loss": 1.9086, + "step": 3445 + }, + { + "epoch": 0.62, + "grad_norm": 0.5566309094429016, + "learning_rate": 4.814219317517365e-05, + "loss": 2.0991, + "step": 3450 + }, + { + "epoch": 0.62, + "grad_norm": 0.7224152088165283, + "learning_rate": 4.813687073397339e-05, + "loss": 1.9487, + "step": 3455 + }, + { + "epoch": 0.62, + "grad_norm": 0.6035553216934204, + "learning_rate": 4.8131540974603164e-05, + "loss": 2.0636, + "step": 3460 + }, + { + "epoch": 0.62, + "grad_norm": 0.7073687314987183, + "learning_rate": 4.8126203898748776e-05, + "loss": 1.9388, + "step": 3465 + }, + { + "epoch": 0.62, + "grad_norm": 1.752922534942627, + "learning_rate": 4.812085950809835e-05, + "loss": 1.5911, + "step": 3470 + }, + { + "epoch": 0.62, + "grad_norm": 0.6370113492012024, + "learning_rate": 4.811550780434229e-05, + "loss": 1.7502, + "step": 3475 + }, + { + "epoch": 0.62, + "grad_norm": 0.7335951924324036, + "learning_rate": 4.811014878917335e-05, + "loss": 1.8573, + "step": 3480 + }, + { + "epoch": 0.62, + "grad_norm": 0.6506394743919373, + "learning_rate": 4.810478246428658e-05, + "loss": 1.7608, + "step": 3485 + }, + { + "epoch": 0.62, + "grad_norm": 0.5935482382774353, + "learning_rate": 4.8099408831379335e-05, + "loss": 1.9053, + "step": 3490 + }, + { + "epoch": 0.63, + "grad_norm": 1.0178114175796509, + "learning_rate": 4.80940278921513e-05, + "loss": 1.9641, + "step": 3495 + }, + { + "epoch": 0.63, + "grad_norm": 0.5880244970321655, + "learning_rate": 4.8088639648304455e-05, + "loss": 1.6671, + "step": 3500 + }, + { + "epoch": 0.63, + "grad_norm": 0.6311240792274475, + "learning_rate": 4.80832441015431e-05, + "loss": 1.6607, + "step": 3505 + }, + { + "epoch": 0.63, + "grad_norm": 0.4380526542663574, + "learning_rate": 4.807784125357383e-05, + "loss": 2.2915, + "step": 3510 + }, + { + "epoch": 0.63, + "grad_norm": 0.7547699213027954, + "learning_rate": 4.8072431106105577e-05, + "loss": 1.7945, + "step": 3515 + }, + { + "epoch": 0.63, + "grad_norm": 1.0797021389007568, + "learning_rate": 4.806701366084956e-05, + "loss": 1.9746, + "step": 3520 + }, + { + "epoch": 0.63, + "grad_norm": 1.3622939586639404, + "learning_rate": 4.8061588919519304e-05, + "loss": 1.7607, + "step": 3525 + }, + { + "epoch": 0.63, + "grad_norm": 0.4650992751121521, + "learning_rate": 4.805615688383066e-05, + "loss": 1.9469, + "step": 3530 + }, + { + "epoch": 0.63, + "grad_norm": 0.421215683221817, + "learning_rate": 4.805071755550177e-05, + "loss": 1.7305, + "step": 3535 + }, + { + "epoch": 0.63, + "grad_norm": 3.6225435733795166, + "learning_rate": 4.8045270936253086e-05, + "loss": 1.6456, + "step": 3540 + }, + { + "epoch": 0.63, + "grad_norm": 0.6064786314964294, + "learning_rate": 4.8039817027807374e-05, + "loss": 1.8573, + "step": 3545 + }, + { + "epoch": 0.64, + "grad_norm": 0.45190367102622986, + "learning_rate": 4.803435583188969e-05, + "loss": 1.9488, + "step": 3550 + }, + { + "epoch": 0.64, + "grad_norm": 0.44525858759880066, + "learning_rate": 4.8028887350227424e-05, + "loss": 2.1564, + "step": 3555 + }, + { + "epoch": 0.64, + "grad_norm": 0.6965821385383606, + "learning_rate": 4.802341158455024e-05, + "loss": 1.7877, + "step": 3560 + }, + { + "epoch": 0.64, + "grad_norm": 0.4658873975276947, + "learning_rate": 4.80179285365901e-05, + "loss": 1.9438, + "step": 3565 + }, + { + "epoch": 0.64, + "grad_norm": 0.8804203867912292, + "learning_rate": 4.801243820808132e-05, + "loss": 1.8994, + "step": 3570 + }, + { + "epoch": 0.64, + "grad_norm": 0.5602685809135437, + "learning_rate": 4.8006940600760475e-05, + "loss": 1.8585, + "step": 3575 + }, + { + "epoch": 0.64, + "grad_norm": 0.7118222713470459, + "learning_rate": 4.800143571636644e-05, + "loss": 1.9065, + "step": 3580 + }, + { + "epoch": 0.64, + "grad_norm": 0.5043877959251404, + "learning_rate": 4.799592355664041e-05, + "loss": 1.8838, + "step": 3585 + }, + { + "epoch": 0.64, + "grad_norm": 0.5556797981262207, + "learning_rate": 4.799040412332588e-05, + "loss": 1.8743, + "step": 3590 + }, + { + "epoch": 0.64, + "grad_norm": 0.5163377523422241, + "learning_rate": 4.798487741816864e-05, + "loss": 1.888, + "step": 3595 + }, + { + "epoch": 0.64, + "grad_norm": 0.7609609961509705, + "learning_rate": 4.797934344291678e-05, + "loss": 2.0205, + "step": 3600 + }, + { + "epoch": 0.65, + "grad_norm": 0.6983132362365723, + "learning_rate": 4.797380219932069e-05, + "loss": 1.9689, + "step": 3605 + }, + { + "epoch": 0.65, + "grad_norm": 1.0645403861999512, + "learning_rate": 4.7968253689133074e-05, + "loss": 1.8991, + "step": 3610 + }, + { + "epoch": 0.65, + "grad_norm": 0.5911131501197815, + "learning_rate": 4.79626979141089e-05, + "loss": 2.0719, + "step": 3615 + }, + { + "epoch": 0.65, + "grad_norm": 0.25667804479599, + "learning_rate": 4.795713487600547e-05, + "loss": 2.1776, + "step": 3620 + }, + { + "epoch": 0.65, + "grad_norm": 0.4448702037334442, + "learning_rate": 4.795156457658235e-05, + "loss": 2.1112, + "step": 3625 + }, + { + "epoch": 0.65, + "grad_norm": 0.5844559669494629, + "learning_rate": 4.7945987017601435e-05, + "loss": 2.0799, + "step": 3630 + }, + { + "epoch": 0.65, + "grad_norm": 0.5194575786590576, + "learning_rate": 4.79404022008269e-05, + "loss": 1.9454, + "step": 3635 + }, + { + "epoch": 0.65, + "grad_norm": 0.4168377220630646, + "learning_rate": 4.7934810128025216e-05, + "loss": 1.5136, + "step": 3640 + }, + { + "epoch": 0.65, + "grad_norm": 0.6346775889396667, + "learning_rate": 4.792921080096514e-05, + "loss": 2.225, + "step": 3645 + }, + { + "epoch": 0.65, + "grad_norm": 1.2728739976882935, + "learning_rate": 4.7923604221417754e-05, + "loss": 1.9655, + "step": 3650 + }, + { + "epoch": 0.65, + "grad_norm": 0.6890284419059753, + "learning_rate": 4.79179903911564e-05, + "loss": 1.9297, + "step": 3655 + }, + { + "epoch": 0.66, + "grad_norm": 3.341615676879883, + "learning_rate": 4.7912369311956726e-05, + "loss": 2.0387, + "step": 3660 + }, + { + "epoch": 0.66, + "grad_norm": 0.8217077851295471, + "learning_rate": 4.790674098559668e-05, + "loss": 1.8114, + "step": 3665 + }, + { + "epoch": 0.66, + "grad_norm": 0.9442257881164551, + "learning_rate": 4.790110541385649e-05, + "loss": 2.011, + "step": 3670 + }, + { + "epoch": 0.66, + "grad_norm": 1.0342832803726196, + "learning_rate": 4.789546259851869e-05, + "loss": 1.9695, + "step": 3675 + }, + { + "epoch": 0.66, + "grad_norm": 0.5166803598403931, + "learning_rate": 4.788981254136809e-05, + "loss": 1.6962, + "step": 3680 + }, + { + "epoch": 0.66, + "grad_norm": 0.49156954884529114, + "learning_rate": 4.78841552441918e-05, + "loss": 1.9125, + "step": 3685 + }, + { + "epoch": 0.66, + "grad_norm": 1.0156267881393433, + "learning_rate": 4.7878490708779225e-05, + "loss": 2.0294, + "step": 3690 + }, + { + "epoch": 0.66, + "grad_norm": 0.5214945673942566, + "learning_rate": 4.787281893692204e-05, + "loss": 2.1405, + "step": 3695 + }, + { + "epoch": 0.66, + "grad_norm": 0.5293641090393066, + "learning_rate": 4.7867139930414214e-05, + "loss": 1.8728, + "step": 3700 + }, + { + "epoch": 0.66, + "grad_norm": 0.5196399688720703, + "learning_rate": 4.786145369105204e-05, + "loss": 1.8972, + "step": 3705 + }, + { + "epoch": 0.66, + "grad_norm": 0.6026878952980042, + "learning_rate": 4.785576022063405e-05, + "loss": 1.9923, + "step": 3710 + }, + { + "epoch": 0.66, + "grad_norm": 0.5810438394546509, + "learning_rate": 4.785005952096109e-05, + "loss": 1.9926, + "step": 3715 + }, + { + "epoch": 0.67, + "grad_norm": 0.6136882305145264, + "learning_rate": 4.784435159383627e-05, + "loss": 1.9561, + "step": 3720 + }, + { + "epoch": 0.67, + "grad_norm": 0.7523034811019897, + "learning_rate": 4.783863644106502e-05, + "loss": 1.9247, + "step": 3725 + }, + { + "epoch": 0.67, + "grad_norm": 0.477094441652298, + "learning_rate": 4.783291406445504e-05, + "loss": 2.2932, + "step": 3730 + }, + { + "epoch": 0.67, + "grad_norm": 0.7570971250534058, + "learning_rate": 4.78271844658163e-05, + "loss": 2.1548, + "step": 3735 + }, + { + "epoch": 0.67, + "grad_norm": 0.6923122406005859, + "learning_rate": 4.7821447646961066e-05, + "loss": 1.7722, + "step": 3740 + }, + { + "epoch": 0.67, + "grad_norm": 0.4171823561191559, + "learning_rate": 4.7815703609703896e-05, + "loss": 2.1501, + "step": 3745 + }, + { + "epoch": 0.67, + "grad_norm": 0.8493704199790955, + "learning_rate": 4.7809952355861614e-05, + "loss": 2.1935, + "step": 3750 + }, + { + "epoch": 0.67, + "grad_norm": 1.7510491609573364, + "learning_rate": 4.7804193887253344e-05, + "loss": 1.7398, + "step": 3755 + }, + { + "epoch": 0.67, + "grad_norm": 0.6981149315834045, + "learning_rate": 4.779842820570049e-05, + "loss": 1.857, + "step": 3760 + }, + { + "epoch": 0.67, + "grad_norm": 0.6277537941932678, + "learning_rate": 4.7792655313026715e-05, + "loss": 1.9971, + "step": 3765 + }, + { + "epoch": 0.67, + "grad_norm": 0.963358461856842, + "learning_rate": 4.7786875211058e-05, + "loss": 1.7726, + "step": 3770 + }, + { + "epoch": 0.68, + "grad_norm": 0.5438962578773499, + "learning_rate": 4.778108790162256e-05, + "loss": 1.6827, + "step": 3775 + }, + { + "epoch": 0.68, + "grad_norm": 0.7283356189727783, + "learning_rate": 4.777529338655094e-05, + "loss": 1.915, + "step": 3780 + }, + { + "epoch": 0.68, + "grad_norm": 1.1045600175857544, + "learning_rate": 4.776949166767592e-05, + "loss": 2.3648, + "step": 3785 + }, + { + "epoch": 0.68, + "grad_norm": 0.6527469754219055, + "learning_rate": 4.77636827468326e-05, + "loss": 1.8445, + "step": 3790 + }, + { + "epoch": 0.68, + "grad_norm": 0.3629080057144165, + "learning_rate": 4.775786662585831e-05, + "loss": 1.949, + "step": 3795 + }, + { + "epoch": 0.68, + "grad_norm": 0.49108001589775085, + "learning_rate": 4.77520433065927e-05, + "loss": 1.8602, + "step": 3800 + }, + { + "epoch": 0.68, + "grad_norm": 0.3771425783634186, + "learning_rate": 4.774621279087768e-05, + "loss": 1.8913, + "step": 3805 + }, + { + "epoch": 0.68, + "grad_norm": 0.9145960807800293, + "learning_rate": 4.7740375080557434e-05, + "loss": 1.9314, + "step": 3810 + }, + { + "epoch": 0.68, + "grad_norm": 1.6728583574295044, + "learning_rate": 4.773453017747842e-05, + "loss": 1.6274, + "step": 3815 + }, + { + "epoch": 0.68, + "grad_norm": 0.5668622255325317, + "learning_rate": 4.772867808348938e-05, + "loss": 2.0638, + "step": 3820 + }, + { + "epoch": 0.68, + "grad_norm": 0.6772801876068115, + "learning_rate": 4.7722818800441316e-05, + "loss": 1.9404, + "step": 3825 + }, + { + "epoch": 0.69, + "grad_norm": 10.99435043334961, + "learning_rate": 4.771695233018754e-05, + "loss": 1.8604, + "step": 3830 + }, + { + "epoch": 0.69, + "grad_norm": 0.8190693259239197, + "learning_rate": 4.7711078674583576e-05, + "loss": 1.8342, + "step": 3835 + }, + { + "epoch": 0.69, + "grad_norm": 3.4974558353424072, + "learning_rate": 4.770519783548728e-05, + "loss": 2.1466, + "step": 3840 + }, + { + "epoch": 0.69, + "grad_norm": 0.8280129432678223, + "learning_rate": 4.7699309814758746e-05, + "loss": 1.845, + "step": 3845 + }, + { + "epoch": 0.69, + "grad_norm": 0.6876835823059082, + "learning_rate": 4.7693414614260356e-05, + "loss": 2.0733, + "step": 3850 + }, + { + "epoch": 0.69, + "grad_norm": 1.2137017250061035, + "learning_rate": 4.7687512235856744e-05, + "loss": 2.1651, + "step": 3855 + }, + { + "epoch": 0.69, + "grad_norm": 0.7566338181495667, + "learning_rate": 4.768160268141484e-05, + "loss": 1.515, + "step": 3860 + }, + { + "epoch": 0.69, + "grad_norm": 0.49978703260421753, + "learning_rate": 4.767568595280383e-05, + "loss": 1.8929, + "step": 3865 + }, + { + "epoch": 0.69, + "grad_norm": 0.3817705512046814, + "learning_rate": 4.766976205189516e-05, + "loss": 1.8465, + "step": 3870 + }, + { + "epoch": 0.69, + "grad_norm": 0.4288356900215149, + "learning_rate": 4.766383098056255e-05, + "loss": 1.8791, + "step": 3875 + }, + { + "epoch": 0.69, + "grad_norm": 0.5867508053779602, + "learning_rate": 4.7657892740682006e-05, + "loss": 1.9564, + "step": 3880 + }, + { + "epoch": 0.7, + "grad_norm": 1.2050591707229614, + "learning_rate": 4.765194733413178e-05, + "loss": 1.9315, + "step": 3885 + }, + { + "epoch": 0.7, + "grad_norm": 0.9066573977470398, + "learning_rate": 4.764599476279241e-05, + "loss": 1.9297, + "step": 3890 + }, + { + "epoch": 0.7, + "grad_norm": 1.0444132089614868, + "learning_rate": 4.764003502854668e-05, + "loss": 1.7961, + "step": 3895 + }, + { + "epoch": 0.7, + "grad_norm": 0.708714485168457, + "learning_rate": 4.763406813327964e-05, + "loss": 1.9499, + "step": 3900 + }, + { + "epoch": 0.7, + "grad_norm": 0.7246438264846802, + "learning_rate": 4.762809407887862e-05, + "loss": 1.8408, + "step": 3905 + }, + { + "epoch": 0.7, + "grad_norm": 0.7492526769638062, + "learning_rate": 4.762211286723321e-05, + "loss": 2.181, + "step": 3910 + }, + { + "epoch": 0.7, + "grad_norm": 0.6265992522239685, + "learning_rate": 4.761612450023526e-05, + "loss": 2.1916, + "step": 3915 + }, + { + "epoch": 0.7, + "grad_norm": 1.4639782905578613, + "learning_rate": 4.7610128979778883e-05, + "loss": 2.1112, + "step": 3920 + }, + { + "epoch": 0.7, + "grad_norm": 0.7836752533912659, + "learning_rate": 4.760412630776046e-05, + "loss": 1.8764, + "step": 3925 + }, + { + "epoch": 0.7, + "grad_norm": 0.8531836867332458, + "learning_rate": 4.759811648607862e-05, + "loss": 2.0257, + "step": 3930 + }, + { + "epoch": 0.7, + "grad_norm": 0.540393054485321, + "learning_rate": 4.7592099516634294e-05, + "loss": 1.6083, + "step": 3935 + }, + { + "epoch": 0.71, + "grad_norm": 0.5300696492195129, + "learning_rate": 4.75860754013306e-05, + "loss": 1.8257, + "step": 3940 + }, + { + "epoch": 0.71, + "grad_norm": 0.6871602535247803, + "learning_rate": 4.758004414207299e-05, + "loss": 2.0006, + "step": 3945 + }, + { + "epoch": 0.71, + "grad_norm": 0.5659440159797668, + "learning_rate": 4.7574005740769136e-05, + "loss": 1.6773, + "step": 3950 + }, + { + "epoch": 0.71, + "grad_norm": 0.466205358505249, + "learning_rate": 4.7567960199328984e-05, + "loss": 2.129, + "step": 3955 + }, + { + "epoch": 0.71, + "grad_norm": 1.5387719869613647, + "learning_rate": 4.7561907519664724e-05, + "loss": 1.8874, + "step": 3960 + }, + { + "epoch": 0.71, + "grad_norm": 0.7125961184501648, + "learning_rate": 4.755584770369081e-05, + "loss": 1.919, + "step": 3965 + }, + { + "epoch": 0.71, + "grad_norm": 0.9576629400253296, + "learning_rate": 4.754978075332398e-05, + "loss": 1.9866, + "step": 3970 + }, + { + "epoch": 0.71, + "grad_norm": 0.6893014311790466, + "learning_rate": 4.754370667048317e-05, + "loss": 1.6943, + "step": 3975 + }, + { + "epoch": 0.71, + "grad_norm": 1.0120683908462524, + "learning_rate": 4.753762545708964e-05, + "loss": 1.7324, + "step": 3980 + }, + { + "epoch": 0.71, + "grad_norm": 1.0890415906906128, + "learning_rate": 4.753153711506685e-05, + "loss": 1.8863, + "step": 3985 + }, + { + "epoch": 0.71, + "grad_norm": 0.5287376642227173, + "learning_rate": 4.752544164634054e-05, + "loss": 2.111, + "step": 3990 + }, + { + "epoch": 0.72, + "grad_norm": 1.014571189880371, + "learning_rate": 4.7519339052838706e-05, + "loss": 1.9179, + "step": 3995 + }, + { + "epoch": 0.72, + "grad_norm": 0.6377423405647278, + "learning_rate": 4.7513229336491594e-05, + "loss": 1.736, + "step": 4000 + }, + { + "epoch": 0.72, + "grad_norm": 0.7745433449745178, + "learning_rate": 4.7507112499231696e-05, + "loss": 2.0101, + "step": 4005 + }, + { + "epoch": 0.72, + "grad_norm": 0.7528572678565979, + "learning_rate": 4.750098854299376e-05, + "loss": 1.9943, + "step": 4010 + }, + { + "epoch": 0.72, + "grad_norm": 1.8742436170578003, + "learning_rate": 4.7494857469714796e-05, + "loss": 1.9835, + "step": 4015 + }, + { + "epoch": 0.72, + "grad_norm": 1.895379662513733, + "learning_rate": 4.748871928133405e-05, + "loss": 1.7243, + "step": 4020 + }, + { + "epoch": 0.72, + "grad_norm": 1.6951816082000732, + "learning_rate": 4.748257397979302e-05, + "loss": 1.7516, + "step": 4025 + }, + { + "epoch": 0.72, + "grad_norm": 0.9127728939056396, + "learning_rate": 4.7476421567035475e-05, + "loss": 2.2468, + "step": 4030 + }, + { + "epoch": 0.72, + "grad_norm": 0.9135558605194092, + "learning_rate": 4.747026204500741e-05, + "loss": 1.8177, + "step": 4035 + }, + { + "epoch": 0.72, + "grad_norm": 0.3471672236919403, + "learning_rate": 4.746409541565706e-05, + "loss": 1.8712, + "step": 4040 + }, + { + "epoch": 0.72, + "grad_norm": 0.621139407157898, + "learning_rate": 4.7457921680934946e-05, + "loss": 1.9565, + "step": 4045 + }, + { + "epoch": 0.72, + "grad_norm": 1.2294261455535889, + "learning_rate": 4.745174084279381e-05, + "loss": 2.0262, + "step": 4050 + }, + { + "epoch": 0.73, + "grad_norm": 0.7085707783699036, + "learning_rate": 4.744555290318864e-05, + "loss": 1.8709, + "step": 4055 + }, + { + "epoch": 0.73, + "grad_norm": 0.5060424208641052, + "learning_rate": 4.7439357864076674e-05, + "loss": 1.8826, + "step": 4060 + }, + { + "epoch": 0.73, + "grad_norm": 0.5154075622558594, + "learning_rate": 4.74331557274174e-05, + "loss": 2.1846, + "step": 4065 + }, + { + "epoch": 0.73, + "grad_norm": 0.9464437961578369, + "learning_rate": 4.7426946495172545e-05, + "loss": 2.0822, + "step": 4070 + }, + { + "epoch": 0.73, + "grad_norm": 1.2713757753372192, + "learning_rate": 4.7420730169306084e-05, + "loss": 1.9661, + "step": 4075 + }, + { + "epoch": 0.73, + "grad_norm": 7.036674976348877, + "learning_rate": 4.7414506751784236e-05, + "loss": 1.8481, + "step": 4080 + }, + { + "epoch": 0.73, + "grad_norm": 0.6142359972000122, + "learning_rate": 4.740827624457547e-05, + "loss": 1.8596, + "step": 4085 + }, + { + "epoch": 0.73, + "grad_norm": 0.9795764088630676, + "learning_rate": 4.740203864965048e-05, + "loss": 2.054, + "step": 4090 + }, + { + "epoch": 0.73, + "grad_norm": 0.9507243633270264, + "learning_rate": 4.739579396898222e-05, + "loss": 1.9746, + "step": 4095 + }, + { + "epoch": 0.73, + "grad_norm": 0.5483179092407227, + "learning_rate": 4.738954220454586e-05, + "loss": 2.0328, + "step": 4100 + }, + { + "epoch": 0.73, + "grad_norm": 0.9164968729019165, + "learning_rate": 4.738328335831883e-05, + "loss": 2.0429, + "step": 4105 + }, + { + "epoch": 0.74, + "grad_norm": 0.672351062297821, + "learning_rate": 4.7377017432280815e-05, + "loss": 1.8414, + "step": 4110 + }, + { + "epoch": 0.74, + "grad_norm": 0.5230313539505005, + "learning_rate": 4.7370744428413704e-05, + "loss": 2.2537, + "step": 4115 + }, + { + "epoch": 0.74, + "grad_norm": 0.480780690908432, + "learning_rate": 4.736446434870166e-05, + "loss": 1.8738, + "step": 4120 + }, + { + "epoch": 0.74, + "grad_norm": 0.4368326663970947, + "learning_rate": 4.7358177195131035e-05, + "loss": 1.8399, + "step": 4125 + }, + { + "epoch": 0.74, + "grad_norm": 1.0797245502471924, + "learning_rate": 4.735188296969048e-05, + "loss": 1.8166, + "step": 4130 + }, + { + "epoch": 0.74, + "grad_norm": 0.6688716411590576, + "learning_rate": 4.734558167437084e-05, + "loss": 1.8741, + "step": 4135 + }, + { + "epoch": 0.74, + "grad_norm": 0.3694940507411957, + "learning_rate": 4.733927331116521e-05, + "loss": 2.2319, + "step": 4140 + }, + { + "epoch": 0.74, + "grad_norm": 0.4561655521392822, + "learning_rate": 4.7332957882068925e-05, + "loss": 1.6936, + "step": 4145 + }, + { + "epoch": 0.74, + "grad_norm": 0.8536581993103027, + "learning_rate": 4.732663538907954e-05, + "loss": 1.6583, + "step": 4150 + }, + { + "epoch": 0.74, + "grad_norm": 0.9004095196723938, + "learning_rate": 4.7320305834196856e-05, + "loss": 1.9843, + "step": 4155 + }, + { + "epoch": 0.74, + "grad_norm": 1.3630378246307373, + "learning_rate": 4.7313969219422906e-05, + "loss": 1.6481, + "step": 4160 + }, + { + "epoch": 0.75, + "grad_norm": 0.9355126023292542, + "learning_rate": 4.7307625546761945e-05, + "loss": 1.9516, + "step": 4165 + }, + { + "epoch": 0.75, + "grad_norm": 1.281752347946167, + "learning_rate": 4.73012748182205e-05, + "loss": 1.5777, + "step": 4170 + }, + { + "epoch": 0.75, + "grad_norm": 0.5159614086151123, + "learning_rate": 4.729491703580727e-05, + "loss": 2.0937, + "step": 4175 + }, + { + "epoch": 0.75, + "grad_norm": 0.5500282049179077, + "learning_rate": 4.728855220153322e-05, + "loss": 1.8505, + "step": 4180 + }, + { + "epoch": 0.75, + "grad_norm": 0.47913625836372375, + "learning_rate": 4.728218031741156e-05, + "loss": 2.1684, + "step": 4185 + }, + { + "epoch": 0.75, + "grad_norm": 1.0718451738357544, + "learning_rate": 4.72758013854577e-05, + "loss": 1.9026, + "step": 4190 + }, + { + "epoch": 0.75, + "grad_norm": 1.538801908493042, + "learning_rate": 4.7269415407689276e-05, + "loss": 1.8334, + "step": 4195 + }, + { + "epoch": 0.75, + "grad_norm": 1.0258709192276, + "learning_rate": 4.726302238612619e-05, + "loss": 1.8611, + "step": 4200 + }, + { + "epoch": 0.75, + "grad_norm": 0.7120585441589355, + "learning_rate": 4.725662232279053e-05, + "loss": 1.8925, + "step": 4205 + }, + { + "epoch": 0.75, + "grad_norm": 0.7516504526138306, + "learning_rate": 4.7250215219706636e-05, + "loss": 2.3299, + "step": 4210 + }, + { + "epoch": 0.75, + "grad_norm": 2.125640630722046, + "learning_rate": 4.7243801078901084e-05, + "loss": 2.3689, + "step": 4215 + }, + { + "epoch": 0.76, + "grad_norm": 0.7650467157363892, + "learning_rate": 4.7237379902402636e-05, + "loss": 1.7783, + "step": 4220 + }, + { + "epoch": 0.76, + "grad_norm": 0.7223068475723267, + "learning_rate": 4.723095169224231e-05, + "loss": 2.2361, + "step": 4225 + }, + { + "epoch": 0.76, + "grad_norm": 0.856893002986908, + "learning_rate": 4.722451645045336e-05, + "loss": 1.9778, + "step": 4230 + }, + { + "epoch": 0.76, + "grad_norm": 0.5695852041244507, + "learning_rate": 4.7218074179071225e-05, + "loss": 1.8546, + "step": 4235 + }, + { + "epoch": 0.76, + "grad_norm": 0.7213203310966492, + "learning_rate": 4.72116248801336e-05, + "loss": 1.9171, + "step": 4240 + }, + { + "epoch": 0.76, + "grad_norm": 1.4641231298446655, + "learning_rate": 4.720516855568039e-05, + "loss": 2.0134, + "step": 4245 + }, + { + "epoch": 0.76, + "grad_norm": 0.9336233139038086, + "learning_rate": 4.719870520775372e-05, + "loss": 1.7665, + "step": 4250 + }, + { + "epoch": 0.76, + "grad_norm": 0.6485480070114136, + "learning_rate": 4.719223483839795e-05, + "loss": 1.6734, + "step": 4255 + }, + { + "epoch": 0.76, + "grad_norm": 0.33693981170654297, + "learning_rate": 4.718575744965965e-05, + "loss": 1.7652, + "step": 4260 + }, + { + "epoch": 0.76, + "grad_norm": 0.7903716564178467, + "learning_rate": 4.717927304358761e-05, + "loss": 2.0551, + "step": 4265 + }, + { + "epoch": 0.76, + "grad_norm": 0.736282467842102, + "learning_rate": 4.7172781622232834e-05, + "loss": 1.9317, + "step": 4270 + }, + { + "epoch": 0.77, + "grad_norm": 0.5668120384216309, + "learning_rate": 4.716628318764856e-05, + "loss": 1.8764, + "step": 4275 + }, + { + "epoch": 0.77, + "grad_norm": 0.4904122054576874, + "learning_rate": 4.715977774189023e-05, + "loss": 1.8657, + "step": 4280 + }, + { + "epoch": 0.77, + "grad_norm": 0.4836113154888153, + "learning_rate": 4.715326528701553e-05, + "loss": 2.1163, + "step": 4285 + }, + { + "epoch": 0.77, + "grad_norm": 1.0537573099136353, + "learning_rate": 4.7146745825084315e-05, + "loss": 1.8157, + "step": 4290 + }, + { + "epoch": 0.77, + "grad_norm": 0.7334871888160706, + "learning_rate": 4.71402193581587e-05, + "loss": 1.9083, + "step": 4295 + }, + { + "epoch": 0.77, + "grad_norm": 0.6025909185409546, + "learning_rate": 4.7133685888303e-05, + "loss": 1.6309, + "step": 4300 + }, + { + "epoch": 0.77, + "grad_norm": 0.5878241658210754, + "learning_rate": 4.712714541758374e-05, + "loss": 1.7923, + "step": 4305 + }, + { + "epoch": 0.77, + "grad_norm": 0.9317494630813599, + "learning_rate": 4.7120597948069676e-05, + "loss": 1.6716, + "step": 4310 + }, + { + "epoch": 0.77, + "grad_norm": 1.2589433193206787, + "learning_rate": 4.711404348183175e-05, + "loss": 1.9239, + "step": 4315 + }, + { + "epoch": 0.77, + "grad_norm": 1.0758674144744873, + "learning_rate": 4.710748202094315e-05, + "loss": 1.8766, + "step": 4320 + }, + { + "epoch": 0.77, + "grad_norm": 0.7382737994194031, + "learning_rate": 4.710091356747924e-05, + "loss": 1.8116, + "step": 4325 + }, + { + "epoch": 0.78, + "grad_norm": 0.790633499622345, + "learning_rate": 4.709433812351764e-05, + "loss": 2.1118, + "step": 4330 + }, + { + "epoch": 0.78, + "grad_norm": Infinity, + "learning_rate": 4.708907273658756e-05, + "loss": 1.9079, + "step": 4335 + }, + { + "epoch": 0.78, + "grad_norm": 0.3426882028579712, + "learning_rate": 4.708248471497269e-05, + "loss": 1.958, + "step": 4340 + }, + { + "epoch": 0.78, + "grad_norm": 0.6428537368774414, + "learning_rate": 4.707588970868914e-05, + "loss": 1.9855, + "step": 4345 + }, + { + "epoch": 0.78, + "grad_norm": 0.4597220718860626, + "learning_rate": 4.7069287719822915e-05, + "loss": 1.875, + "step": 4350 + }, + { + "epoch": 0.78, + "grad_norm": 0.8507035970687866, + "learning_rate": 4.7062678750462205e-05, + "loss": 1.7406, + "step": 4355 + }, + { + "epoch": 0.78, + "grad_norm": 0.8498551845550537, + "learning_rate": 4.7056062802697435e-05, + "loss": 1.8201, + "step": 4360 + }, + { + "epoch": 0.78, + "grad_norm": 1.037539005279541, + "learning_rate": 4.704943987862121e-05, + "loss": 2.0407, + "step": 4365 + }, + { + "epoch": 0.78, + "grad_norm": 0.6955700516700745, + "learning_rate": 4.704280998032836e-05, + "loss": 1.8599, + "step": 4370 + }, + { + "epoch": 0.78, + "grad_norm": 0.9754367470741272, + "learning_rate": 4.703617310991592e-05, + "loss": 2.0508, + "step": 4375 + }, + { + "epoch": 0.78, + "grad_norm": 0.6729751825332642, + "learning_rate": 4.702952926948312e-05, + "loss": 1.8073, + "step": 4380 + }, + { + "epoch": 0.78, + "grad_norm": 0.47259873151779175, + "learning_rate": 4.7022878461131404e-05, + "loss": 1.7583, + "step": 4385 + }, + { + "epoch": 0.79, + "grad_norm": 2.3793861865997314, + "learning_rate": 4.701622068696441e-05, + "loss": 1.8839, + "step": 4390 + }, + { + "epoch": 0.79, + "grad_norm": 1.0509848594665527, + "learning_rate": 4.700955594908799e-05, + "loss": 1.8347, + "step": 4395 + }, + { + "epoch": 0.79, + "grad_norm": 0.6066625118255615, + "learning_rate": 4.700288424961019e-05, + "loss": 1.9021, + "step": 4400 + }, + { + "epoch": 0.79, + "grad_norm": 0.6687719821929932, + "learning_rate": 4.6996205590641274e-05, + "loss": 1.9797, + "step": 4405 + }, + { + "epoch": 0.79, + "grad_norm": 0.5127503871917725, + "learning_rate": 4.6989519974293684e-05, + "loss": 1.7509, + "step": 4410 + }, + { + "epoch": 0.79, + "grad_norm": 0.6416090130805969, + "learning_rate": 4.698282740268208e-05, + "loss": 1.8623, + "step": 4415 + }, + { + "epoch": 0.79, + "grad_norm": 0.7178260087966919, + "learning_rate": 4.697612787792331e-05, + "loss": 1.8798, + "step": 4420 + }, + { + "epoch": 0.79, + "grad_norm": 0.5021282434463501, + "learning_rate": 4.696942140213643e-05, + "loss": 2.0592, + "step": 4425 + }, + { + "epoch": 0.79, + "grad_norm": 0.5925451517105103, + "learning_rate": 4.696270797744269e-05, + "loss": 1.859, + "step": 4430 + }, + { + "epoch": 0.79, + "grad_norm": 0.7658485770225525, + "learning_rate": 4.6955987605965555e-05, + "loss": 1.9042, + "step": 4435 + }, + { + "epoch": 0.79, + "grad_norm": 0.7695988416671753, + "learning_rate": 4.6949260289830644e-05, + "loss": 1.482, + "step": 4440 + }, + { + "epoch": 0.8, + "grad_norm": 0.6327041387557983, + "learning_rate": 4.694252603116582e-05, + "loss": 2.0297, + "step": 4445 + }, + { + "epoch": 0.8, + "grad_norm": 0.7171325087547302, + "learning_rate": 4.693578483210113e-05, + "loss": 1.9515, + "step": 4450 + }, + { + "epoch": 0.8, + "grad_norm": 0.562306821346283, + "learning_rate": 4.692903669476878e-05, + "loss": 1.9341, + "step": 4455 + }, + { + "epoch": 0.8, + "grad_norm": 0.8432119488716125, + "learning_rate": 4.692228162130322e-05, + "loss": 2.1466, + "step": 4460 + }, + { + "epoch": 0.8, + "grad_norm": 0.496180921792984, + "learning_rate": 4.691551961384108e-05, + "loss": 2.0088, + "step": 4465 + }, + { + "epoch": 0.8, + "grad_norm": 1.1345199346542358, + "learning_rate": 4.690875067452116e-05, + "loss": 2.2793, + "step": 4470 + }, + { + "epoch": 0.8, + "grad_norm": 0.6553413271903992, + "learning_rate": 4.690197480548447e-05, + "loss": 2.2163, + "step": 4475 + }, + { + "epoch": 0.8, + "grad_norm": 0.8875722289085388, + "learning_rate": 4.6895192008874225e-05, + "loss": 1.7843, + "step": 4480 + }, + { + "epoch": 0.8, + "grad_norm": 0.8258360624313354, + "learning_rate": 4.6888402286835804e-05, + "loss": 1.8453, + "step": 4485 + }, + { + "epoch": 0.8, + "grad_norm": 0.9482008814811707, + "learning_rate": 4.688160564151679e-05, + "loss": 1.6878, + "step": 4490 + }, + { + "epoch": 0.8, + "grad_norm": 0.45167991518974304, + "learning_rate": 4.687480207506697e-05, + "loss": 2.0793, + "step": 4495 + }, + { + "epoch": 0.81, + "grad_norm": 0.2857486605644226, + "learning_rate": 4.6867991589638284e-05, + "loss": 2.0167, + "step": 4500 + }, + { + "epoch": 0.81, + "grad_norm": 0.6648562550544739, + "learning_rate": 4.686117418738489e-05, + "loss": 1.7389, + "step": 4505 + }, + { + "epoch": 0.81, + "grad_norm": 0.9790322780609131, + "learning_rate": 4.685434987046314e-05, + "loss": 2.0685, + "step": 4510 + }, + { + "epoch": 0.81, + "grad_norm": 0.4371451437473297, + "learning_rate": 4.6847518641031544e-05, + "loss": 1.9618, + "step": 4515 + }, + { + "epoch": 0.81, + "grad_norm": 0.7518529891967773, + "learning_rate": 4.6840680501250814e-05, + "loss": 1.7586, + "step": 4520 + }, + { + "epoch": 0.81, + "grad_norm": 0.6136543154716492, + "learning_rate": 4.683383545328385e-05, + "loss": 1.9332, + "step": 4525 + }, + { + "epoch": 0.81, + "grad_norm": 0.8958885073661804, + "learning_rate": 4.682698349929573e-05, + "loss": 1.6817, + "step": 4530 + }, + { + "epoch": 0.81, + "grad_norm": 0.6221014857292175, + "learning_rate": 4.682012464145373e-05, + "loss": 2.2389, + "step": 4535 + }, + { + "epoch": 0.81, + "grad_norm": 1.043770432472229, + "learning_rate": 4.681325888192729e-05, + "loss": 1.7041, + "step": 4540 + }, + { + "epoch": 0.81, + "grad_norm": 0.9577236175537109, + "learning_rate": 4.680638622288804e-05, + "loss": 1.9666, + "step": 4545 + }, + { + "epoch": 0.81, + "grad_norm": 0.5839371681213379, + "learning_rate": 4.679950666650982e-05, + "loss": 1.9035, + "step": 4550 + }, + { + "epoch": 0.82, + "grad_norm": 1.314014196395874, + "learning_rate": 4.67926202149686e-05, + "loss": 1.9786, + "step": 4555 + }, + { + "epoch": 0.82, + "grad_norm": 0.44664058089256287, + "learning_rate": 4.678572687044257e-05, + "loss": 1.9205, + "step": 4560 + }, + { + "epoch": 0.82, + "grad_norm": 0.5979194641113281, + "learning_rate": 4.6778826635112085e-05, + "loss": 1.8723, + "step": 4565 + }, + { + "epoch": 0.82, + "grad_norm": 0.8216999173164368, + "learning_rate": 4.677191951115968e-05, + "loss": 1.9851, + "step": 4570 + }, + { + "epoch": 0.82, + "grad_norm": 0.9313204288482666, + "learning_rate": 4.676638885365804e-05, + "loss": 1.9628, + "step": 4575 + }, + { + "epoch": 0.82, + "grad_norm": 0.9604901075363159, + "learning_rate": 4.675946933569314e-05, + "loss": 1.6638, + "step": 4580 + }, + { + "epoch": 0.82, + "grad_norm": 0.9496554732322693, + "learning_rate": 4.675254293522902e-05, + "loss": 1.7364, + "step": 4585 + }, + { + "epoch": 0.82, + "grad_norm": 0.6214568018913269, + "learning_rate": 4.674560965445649e-05, + "loss": 1.7255, + "step": 4590 + }, + { + "epoch": 0.82, + "grad_norm": 0.6437748670578003, + "learning_rate": 4.673866949556854e-05, + "loss": 2.1914, + "step": 4595 + }, + { + "epoch": 0.82, + "grad_norm": 1.4943290948867798, + "learning_rate": 4.6731722460760355e-05, + "loss": 1.9668, + "step": 4600 + }, + { + "epoch": 0.82, + "grad_norm": 0.41138797998428345, + "learning_rate": 4.672476855222924e-05, + "loss": 1.8255, + "step": 4605 + }, + { + "epoch": 0.83, + "grad_norm": 0.6955471634864807, + "learning_rate": 4.671780777217474e-05, + "loss": 1.9333, + "step": 4610 + }, + { + "epoch": 0.83, + "grad_norm": 1.1246094703674316, + "learning_rate": 4.671084012279853e-05, + "loss": 2.0143, + "step": 4615 + }, + { + "epoch": 0.83, + "grad_norm": 0.7304584980010986, + "learning_rate": 4.6703865606304465e-05, + "loss": 1.9142, + "step": 4620 + }, + { + "epoch": 0.83, + "grad_norm": 1.188251256942749, + "learning_rate": 4.6696884224898584e-05, + "loss": 1.8847, + "step": 4625 + }, + { + "epoch": 0.83, + "grad_norm": 0.9002482295036316, + "learning_rate": 4.66898959807891e-05, + "loss": 2.1051, + "step": 4630 + }, + { + "epoch": 0.83, + "grad_norm": 0.9018658995628357, + "learning_rate": 4.668290087618638e-05, + "loss": 1.8925, + "step": 4635 + }, + { + "epoch": 0.83, + "grad_norm": 0.6331837177276611, + "learning_rate": 4.667589891330297e-05, + "loss": 2.0296, + "step": 4640 + }, + { + "epoch": 0.83, + "grad_norm": 0.5853480696678162, + "learning_rate": 4.666889009435358e-05, + "loss": 1.9348, + "step": 4645 + }, + { + "epoch": 0.83, + "grad_norm": 1.261925458908081, + "learning_rate": 4.6661874421555094e-05, + "loss": 2.1714, + "step": 4650 + }, + { + "epoch": 0.83, + "grad_norm": 0.5459082126617432, + "learning_rate": 4.665485189712657e-05, + "loss": 1.7488, + "step": 4655 + }, + { + "epoch": 0.83, + "grad_norm": 0.5560827851295471, + "learning_rate": 4.664782252328922e-05, + "loss": 1.8829, + "step": 4660 + }, + { + "epoch": 0.84, + "grad_norm": 12.994915008544922, + "learning_rate": 4.664078630226643e-05, + "loss": 2.2486, + "step": 4665 + }, + { + "epoch": 0.84, + "grad_norm": 0.5184732675552368, + "learning_rate": 4.663374323628374e-05, + "loss": 1.5356, + "step": 4670 + }, + { + "epoch": 0.84, + "grad_norm": 0.5672849416732788, + "learning_rate": 4.662669332756887e-05, + "loss": 1.6331, + "step": 4675 + }, + { + "epoch": 0.84, + "grad_norm": 0.5396742224693298, + "learning_rate": 4.661963657835172e-05, + "loss": 2.1822, + "step": 4680 + }, + { + "epoch": 0.84, + "grad_norm": 0.557628333568573, + "learning_rate": 4.6612572990864303e-05, + "loss": 1.9965, + "step": 4685 + }, + { + "epoch": 0.84, + "grad_norm": 0.8263502717018127, + "learning_rate": 4.6605502567340844e-05, + "loss": 2.1538, + "step": 4690 + }, + { + "epoch": 0.84, + "grad_norm": 2.053760051727295, + "learning_rate": 4.6598425310017704e-05, + "loss": 2.0905, + "step": 4695 + }, + { + "epoch": 0.84, + "grad_norm": 1.3697140216827393, + "learning_rate": 4.659134122113341e-05, + "loss": 1.7339, + "step": 4700 + }, + { + "epoch": 0.84, + "grad_norm": 0.7235055565834045, + "learning_rate": 4.658425030292866e-05, + "loss": 1.8672, + "step": 4705 + }, + { + "epoch": 0.84, + "grad_norm": 0.7260095477104187, + "learning_rate": 4.657715255764629e-05, + "loss": 1.7353, + "step": 4710 + }, + { + "epoch": 0.84, + "grad_norm": 0.5732203125953674, + "learning_rate": 4.657004798753133e-05, + "loss": 2.0318, + "step": 4715 + }, + { + "epoch": 0.84, + "grad_norm": 13.018949508666992, + "learning_rate": 4.656293659483093e-05, + "loss": 2.1095, + "step": 4720 + }, + { + "epoch": 0.85, + "grad_norm": 0.6857671737670898, + "learning_rate": 4.655581838179444e-05, + "loss": 1.8601, + "step": 4725 + }, + { + "epoch": 0.85, + "grad_norm": 0.5136845707893372, + "learning_rate": 4.6548693350673325e-05, + "loss": 1.9605, + "step": 4730 + }, + { + "epoch": 0.85, + "grad_norm": 0.44918909668922424, + "learning_rate": 4.654156150372123e-05, + "loss": 1.9067, + "step": 4735 + }, + { + "epoch": 0.85, + "grad_norm": 0.4144088327884674, + "learning_rate": 4.653442284319395e-05, + "loss": 2.0102, + "step": 4740 + }, + { + "epoch": 0.85, + "grad_norm": 1.849292516708374, + "learning_rate": 4.652727737134944e-05, + "loss": 2.1593, + "step": 4745 + }, + { + "epoch": 0.85, + "grad_norm": 0.5621273517608643, + "learning_rate": 4.65201250904478e-05, + "loss": 2.0483, + "step": 4750 + }, + { + "epoch": 0.85, + "grad_norm": 1.3969671726226807, + "learning_rate": 4.65129660027513e-05, + "loss": 1.9204, + "step": 4755 + }, + { + "epoch": 0.85, + "grad_norm": 0.598132312297821, + "learning_rate": 4.6505800110524347e-05, + "loss": 2.0326, + "step": 4760 + }, + { + "epoch": 0.85, + "grad_norm": 0.7579675316810608, + "learning_rate": 4.649862741603349e-05, + "loss": 1.7728, + "step": 4765 + }, + { + "epoch": 0.85, + "grad_norm": 0.7204775810241699, + "learning_rate": 4.649144792154747e-05, + "loss": 1.9556, + "step": 4770 + }, + { + "epoch": 0.85, + "grad_norm": 1.0572607517242432, + "learning_rate": 4.648426162933716e-05, + "loss": 2.1159, + "step": 4775 + }, + { + "epoch": 0.86, + "grad_norm": 0.5266518592834473, + "learning_rate": 4.647706854167554e-05, + "loss": 1.9842, + "step": 4780 + }, + { + "epoch": 0.86, + "grad_norm": 0.8113346695899963, + "learning_rate": 4.6469868660837805e-05, + "loss": 1.7724, + "step": 4785 + }, + { + "epoch": 0.86, + "grad_norm": 0.8336856961250305, + "learning_rate": 4.646266198910126e-05, + "loss": 1.8038, + "step": 4790 + }, + { + "epoch": 0.86, + "grad_norm": 0.7910550832748413, + "learning_rate": 4.645544852874538e-05, + "loss": 1.8405, + "step": 4795 + }, + { + "epoch": 0.86, + "grad_norm": 0.7810003161430359, + "learning_rate": 4.644822828205176e-05, + "loss": 1.825, + "step": 4800 + }, + { + "epoch": 0.86, + "grad_norm": 0.8987298607826233, + "learning_rate": 4.644100125130418e-05, + "loss": 1.9643, + "step": 4805 + }, + { + "epoch": 0.86, + "grad_norm": 0.5562641620635986, + "learning_rate": 4.643376743878852e-05, + "loss": 1.7666, + "step": 4810 + }, + { + "epoch": 0.86, + "grad_norm": 0.8323341012001038, + "learning_rate": 4.642652684679283e-05, + "loss": 1.6686, + "step": 4815 + }, + { + "epoch": 0.86, + "grad_norm": 1.0062700510025024, + "learning_rate": 4.6419279477607314e-05, + "loss": 1.9944, + "step": 4820 + }, + { + "epoch": 0.86, + "grad_norm": 0.6746718883514404, + "learning_rate": 4.64120253335243e-05, + "loss": 1.91, + "step": 4825 + }, + { + "epoch": 0.86, + "grad_norm": 0.5174027681350708, + "learning_rate": 4.6404764416838264e-05, + "loss": 1.8239, + "step": 4830 + }, + { + "epoch": 0.87, + "grad_norm": 0.3364204466342926, + "learning_rate": 4.6397496729845844e-05, + "loss": 1.7488, + "step": 4835 + }, + { + "epoch": 0.87, + "grad_norm": 1.2048060894012451, + "learning_rate": 4.639022227484578e-05, + "loss": 1.7477, + "step": 4840 + }, + { + "epoch": 0.87, + "grad_norm": 0.4208517372608185, + "learning_rate": 4.638294105413898e-05, + "loss": 1.9285, + "step": 4845 + }, + { + "epoch": 0.87, + "grad_norm": 1.080003261566162, + "learning_rate": 4.6375653070028505e-05, + "loss": 1.8835, + "step": 4850 + }, + { + "epoch": 0.87, + "grad_norm": 1.1424204111099243, + "learning_rate": 4.6368358324819524e-05, + "loss": 1.6948, + "step": 4855 + }, + { + "epoch": 0.87, + "grad_norm": 0.866698682308197, + "learning_rate": 4.636105682081935e-05, + "loss": 1.8874, + "step": 4860 + }, + { + "epoch": 0.87, + "grad_norm": 0.8074659705162048, + "learning_rate": 4.6353748560337456e-05, + "loss": 1.7727, + "step": 4865 + }, + { + "epoch": 0.87, + "grad_norm": 2.238422393798828, + "learning_rate": 4.634643354568543e-05, + "loss": 1.9305, + "step": 4870 + }, + { + "epoch": 0.87, + "grad_norm": 0.5514354109764099, + "learning_rate": 4.633911177917701e-05, + "loss": 1.9675, + "step": 4875 + }, + { + "epoch": 0.87, + "grad_norm": 0.41248565912246704, + "learning_rate": 4.633178326312806e-05, + "loss": 1.7863, + "step": 4880 + }, + { + "epoch": 0.87, + "grad_norm": 0.5374939441680908, + "learning_rate": 4.632444799985657e-05, + "loss": 1.8905, + "step": 4885 + }, + { + "epoch": 0.88, + "grad_norm": 0.9222282767295837, + "learning_rate": 4.63171059916827e-05, + "loss": 1.9385, + "step": 4890 + }, + { + "epoch": 0.88, + "grad_norm": 3.0097501277923584, + "learning_rate": 4.630975724092871e-05, + "loss": 1.772, + "step": 4895 + }, + { + "epoch": 0.88, + "grad_norm": 1.815949559211731, + "learning_rate": 4.6302401749919e-05, + "loss": 1.9147, + "step": 4900 + }, + { + "epoch": 0.88, + "grad_norm": 0.4607148766517639, + "learning_rate": 4.629503952098011e-05, + "loss": 1.8251, + "step": 4905 + }, + { + "epoch": 0.88, + "grad_norm": 0.6612110137939453, + "learning_rate": 4.62876705564407e-05, + "loss": 2.0799, + "step": 4910 + }, + { + "epoch": 0.88, + "grad_norm": 0.9041725993156433, + "learning_rate": 4.628029485863157e-05, + "loss": 1.8727, + "step": 4915 + }, + { + "epoch": 0.88, + "grad_norm": 0.5865108370780945, + "learning_rate": 4.627291242988564e-05, + "loss": 2.0539, + "step": 4920 + }, + { + "epoch": 0.88, + "grad_norm": 0.646685779094696, + "learning_rate": 4.6265523272537976e-05, + "loss": 2.1545, + "step": 4925 + }, + { + "epoch": 0.88, + "grad_norm": 0.8874340057373047, + "learning_rate": 4.625812738892575e-05, + "loss": 2.0838, + "step": 4930 + }, + { + "epoch": 0.88, + "grad_norm": 0.48262983560562134, + "learning_rate": 4.625072478138828e-05, + "loss": 1.7378, + "step": 4935 + }, + { + "epoch": 0.88, + "grad_norm": 0.4273892641067505, + "learning_rate": 4.6243315452267014e-05, + "loss": 1.6208, + "step": 4940 + }, + { + "epoch": 0.89, + "grad_norm": 0.860099732875824, + "learning_rate": 4.623589940390549e-05, + "loss": 1.5971, + "step": 4945 + }, + { + "epoch": 0.89, + "grad_norm": 0.6735519170761108, + "learning_rate": 4.622847663864941e-05, + "loss": 1.8806, + "step": 4950 + }, + { + "epoch": 0.89, + "grad_norm": 0.47200706601142883, + "learning_rate": 4.6221047158846594e-05, + "loss": 1.8463, + "step": 4955 + }, + { + "epoch": 0.89, + "grad_norm": 1.0925625562667847, + "learning_rate": 4.621361096684698e-05, + "loss": 1.9257, + "step": 4960 + }, + { + "epoch": 0.89, + "grad_norm": 0.9255615472793579, + "learning_rate": 4.6206168065002613e-05, + "loss": 2.2559, + "step": 4965 + }, + { + "epoch": 0.89, + "grad_norm": 0.8095393776893616, + "learning_rate": 4.619871845566769e-05, + "loss": 1.8567, + "step": 4970 + }, + { + "epoch": 0.89, + "grad_norm": 0.8361244201660156, + "learning_rate": 4.619126214119851e-05, + "loss": 1.4803, + "step": 4975 + }, + { + "epoch": 0.89, + "grad_norm": 0.6798988580703735, + "learning_rate": 4.618379912395349e-05, + "loss": 2.0151, + "step": 4980 + }, + { + "epoch": 0.89, + "grad_norm": 0.3742920756340027, + "learning_rate": 4.617632940629319e-05, + "loss": 2.137, + "step": 4985 + }, + { + "epoch": 0.89, + "grad_norm": 0.6546251177787781, + "learning_rate": 4.616885299058027e-05, + "loss": 2.2428, + "step": 4990 + }, + { + "epoch": 0.89, + "grad_norm": 0.6916753649711609, + "learning_rate": 4.6161369879179504e-05, + "loss": 2.0984, + "step": 4995 + }, + { + "epoch": 0.89, + "grad_norm": 1.0272783041000366, + "learning_rate": 4.61538800744578e-05, + "loss": 1.6963, + "step": 5000 + }, + { + "epoch": 0.9, + "grad_norm": 0.4335922598838806, + "learning_rate": 4.614638357878418e-05, + "loss": 2.0127, + "step": 5005 + }, + { + "epoch": 0.9, + "grad_norm": 1.4880688190460205, + "learning_rate": 4.613888039452978e-05, + "loss": 1.786, + "step": 5010 + }, + { + "epoch": 0.9, + "grad_norm": 0.689954400062561, + "learning_rate": 4.613137052406784e-05, + "loss": 1.6233, + "step": 5015 + }, + { + "epoch": 0.9, + "grad_norm": 1.206842303276062, + "learning_rate": 4.6123853969773726e-05, + "loss": 1.8923, + "step": 5020 + }, + { + "epoch": 0.9, + "grad_norm": 0.6934424042701721, + "learning_rate": 4.611633073402492e-05, + "loss": 1.8961, + "step": 5025 + }, + { + "epoch": 0.9, + "grad_norm": 0.48485127091407776, + "learning_rate": 4.610880081920101e-05, + "loss": 1.6494, + "step": 5030 + }, + { + "epoch": 0.9, + "grad_norm": 0.7396721243858337, + "learning_rate": 4.610126422768372e-05, + "loss": 1.9127, + "step": 5035 + }, + { + "epoch": 0.9, + "grad_norm": 2.117673873901367, + "learning_rate": 4.609372096185683e-05, + "loss": 2.0358, + "step": 5040 + }, + { + "epoch": 0.9, + "grad_norm": 0.5403966307640076, + "learning_rate": 4.608617102410631e-05, + "loss": 1.7811, + "step": 5045 + }, + { + "epoch": 0.9, + "grad_norm": 0.5867857933044434, + "learning_rate": 4.6078614416820164e-05, + "loss": 1.9641, + "step": 5050 + }, + { + "epoch": 0.9, + "grad_norm": 0.7298117876052856, + "learning_rate": 4.6071051142388555e-05, + "loss": 1.7933, + "step": 5055 + }, + { + "epoch": 0.91, + "grad_norm": 1.0257443189620972, + "learning_rate": 4.606348120320374e-05, + "loss": 1.8427, + "step": 5060 + }, + { + "epoch": 0.91, + "grad_norm": 0.8619400262832642, + "learning_rate": 4.6055904601660084e-05, + "loss": 1.9523, + "step": 5065 + }, + { + "epoch": 0.91, + "grad_norm": 0.7043263912200928, + "learning_rate": 4.6048321340154054e-05, + "loss": 1.9647, + "step": 5070 + }, + { + "epoch": 0.91, + "grad_norm": 0.8900740742683411, + "learning_rate": 4.604073142108423e-05, + "loss": 1.8617, + "step": 5075 + }, + { + "epoch": 0.91, + "grad_norm": 0.5621470808982849, + "learning_rate": 4.6033134846851294e-05, + "loss": 2.0817, + "step": 5080 + }, + { + "epoch": 0.91, + "grad_norm": 0.5144213438034058, + "learning_rate": 4.602553161985804e-05, + "loss": 1.9585, + "step": 5085 + }, + { + "epoch": 0.91, + "grad_norm": 0.5859657526016235, + "learning_rate": 4.601792174250935e-05, + "loss": 1.9843, + "step": 5090 + }, + { + "epoch": 0.91, + "grad_norm": 0.48220011591911316, + "learning_rate": 4.6010305217212244e-05, + "loss": 2.3189, + "step": 5095 + }, + { + "epoch": 0.91, + "grad_norm": 0.6994110941886902, + "learning_rate": 4.60026820463758e-05, + "loss": 1.699, + "step": 5100 + }, + { + "epoch": 0.91, + "grad_norm": 4.361588478088379, + "learning_rate": 4.5995052232411216e-05, + "loss": 1.8276, + "step": 5105 + }, + { + "epoch": 0.91, + "grad_norm": 0.627723753452301, + "learning_rate": 4.598741577773182e-05, + "loss": 2.045, + "step": 5110 + }, + { + "epoch": 0.92, + "grad_norm": 1.4922670125961304, + "learning_rate": 4.5979772684752995e-05, + "loss": 1.7747, + "step": 5115 + }, + { + "epoch": 0.92, + "grad_norm": 0.6467026472091675, + "learning_rate": 4.597212295589225e-05, + "loss": 1.7939, + "step": 5120 + }, + { + "epoch": 0.92, + "grad_norm": 0.6912665367126465, + "learning_rate": 4.596446659356919e-05, + "loss": 1.8892, + "step": 5125 + }, + { + "epoch": 0.92, + "grad_norm": 1.146193265914917, + "learning_rate": 4.595680360020551e-05, + "loss": 1.7835, + "step": 5130 + }, + { + "epoch": 0.92, + "grad_norm": 0.549444317817688, + "learning_rate": 4.594913397822501e-05, + "loss": 1.8231, + "step": 5135 + }, + { + "epoch": 0.92, + "grad_norm": 0.6164882779121399, + "learning_rate": 4.594145773005358e-05, + "loss": 1.8279, + "step": 5140 + }, + { + "epoch": 0.92, + "grad_norm": 0.3416946232318878, + "learning_rate": 4.593377485811922e-05, + "loss": 2.243, + "step": 5145 + }, + { + "epoch": 0.92, + "grad_norm": 0.5608302354812622, + "learning_rate": 4.5926085364852e-05, + "loss": 1.7795, + "step": 5150 + }, + { + "epoch": 0.92, + "grad_norm": 0.5376662015914917, + "learning_rate": 4.591838925268411e-05, + "loss": 1.7712, + "step": 5155 + }, + { + "epoch": 0.92, + "grad_norm": 0.6536192893981934, + "learning_rate": 4.591068652404982e-05, + "loss": 2.2747, + "step": 5160 + }, + { + "epoch": 0.92, + "grad_norm": 0.40830197930336, + "learning_rate": 4.59029771813855e-05, + "loss": 1.9315, + "step": 5165 + }, + { + "epoch": 0.93, + "grad_norm": 0.8668363094329834, + "learning_rate": 4.58952612271296e-05, + "loss": 2.0715, + "step": 5170 + }, + { + "epoch": 0.93, + "grad_norm": 0.6481930613517761, + "learning_rate": 4.588753866372267e-05, + "loss": 1.8249, + "step": 5175 + }, + { + "epoch": 0.93, + "grad_norm": 0.3583790063858032, + "learning_rate": 4.5879809493607364e-05, + "loss": 1.8025, + "step": 5180 + }, + { + "epoch": 0.93, + "grad_norm": 0.5957528352737427, + "learning_rate": 4.587207371922838e-05, + "loss": 2.1159, + "step": 5185 + }, + { + "epoch": 0.93, + "grad_norm": 0.6677088141441345, + "learning_rate": 4.586433134303257e-05, + "loss": 1.9156, + "step": 5190 + }, + { + "epoch": 0.93, + "grad_norm": 0.8160932660102844, + "learning_rate": 4.585658236746881e-05, + "loss": 2.0241, + "step": 5195 + }, + { + "epoch": 0.93, + "grad_norm": 0.6950777769088745, + "learning_rate": 4.584882679498813e-05, + "loss": 1.9214, + "step": 5200 + }, + { + "epoch": 0.93, + "grad_norm": 0.726952314376831, + "learning_rate": 4.5841064628043575e-05, + "loss": 1.7244, + "step": 5205 + }, + { + "epoch": 0.93, + "grad_norm": 0.7783024907112122, + "learning_rate": 4.583329586909033e-05, + "loss": 1.8253, + "step": 5210 + }, + { + "epoch": 0.93, + "grad_norm": 0.8075493574142456, + "learning_rate": 4.5825520520585635e-05, + "loss": 1.7076, + "step": 5215 + }, + { + "epoch": 0.93, + "grad_norm": 1.0622972249984741, + "learning_rate": 4.5817738584988835e-05, + "loss": 1.9551, + "step": 5220 + }, + { + "epoch": 0.94, + "grad_norm": 0.6024557948112488, + "learning_rate": 4.580995006476134e-05, + "loss": 1.8075, + "step": 5225 + }, + { + "epoch": 0.94, + "grad_norm": 0.5445288419723511, + "learning_rate": 4.580215496236666e-05, + "loss": 1.7501, + "step": 5230 + }, + { + "epoch": 0.94, + "grad_norm": 1.8697842359542847, + "learning_rate": 4.5794353280270376e-05, + "loss": 1.7579, + "step": 5235 + }, + { + "epoch": 0.94, + "grad_norm": 0.6215696334838867, + "learning_rate": 4.5786545020940155e-05, + "loss": 1.6428, + "step": 5240 + }, + { + "epoch": 0.94, + "grad_norm": 0.6145104169845581, + "learning_rate": 4.577873018684573e-05, + "loss": 1.7593, + "step": 5245 + }, + { + "epoch": 0.94, + "grad_norm": 0.7944318056106567, + "learning_rate": 4.577090878045893e-05, + "loss": 2.1836, + "step": 5250 + }, + { + "epoch": 0.94, + "grad_norm": 0.7711953520774841, + "learning_rate": 4.576308080425367e-05, + "loss": 1.9731, + "step": 5255 + }, + { + "epoch": 0.94, + "grad_norm": 0.5569404363632202, + "learning_rate": 4.575524626070592e-05, + "loss": 2.1212, + "step": 5260 + }, + { + "epoch": 0.94, + "grad_norm": 0.6812270283699036, + "learning_rate": 4.574740515229374e-05, + "loss": 2.0945, + "step": 5265 + }, + { + "epoch": 0.94, + "grad_norm": 0.8699807524681091, + "learning_rate": 4.5739557481497275e-05, + "loss": 2.0372, + "step": 5270 + }, + { + "epoch": 0.94, + "grad_norm": 4.8972649574279785, + "learning_rate": 4.573170325079872e-05, + "loss": 2.3673, + "step": 5275 + }, + { + "epoch": 0.95, + "grad_norm": 0.6763632297515869, + "learning_rate": 4.5723842462682375e-05, + "loss": 1.907, + "step": 5280 + }, + { + "epoch": 0.95, + "grad_norm": 0.44302767515182495, + "learning_rate": 4.571597511963459e-05, + "loss": 1.939, + "step": 5285 + }, + { + "epoch": 0.95, + "grad_norm": 0.5313250422477722, + "learning_rate": 4.5708101224143796e-05, + "loss": 1.8028, + "step": 5290 + }, + { + "epoch": 0.95, + "grad_norm": 0.4340111315250397, + "learning_rate": 4.5700220778700504e-05, + "loss": 1.7074, + "step": 5295 + }, + { + "epoch": 0.95, + "grad_norm": 0.5250926613807678, + "learning_rate": 4.569233378579729e-05, + "loss": 1.956, + "step": 5300 + }, + { + "epoch": 0.95, + "grad_norm": 0.8783259391784668, + "learning_rate": 4.568444024792881e-05, + "loss": 2.1352, + "step": 5305 + }, + { + "epoch": 0.95, + "grad_norm": 0.6077532172203064, + "learning_rate": 4.5676540167591776e-05, + "loss": 1.6879, + "step": 5310 + }, + { + "epoch": 0.95, + "grad_norm": 0.854468584060669, + "learning_rate": 4.5668633547284964e-05, + "loss": 2.0933, + "step": 5315 + }, + { + "epoch": 0.95, + "grad_norm": 4.812654972076416, + "learning_rate": 4.566072038950925e-05, + "loss": 1.6729, + "step": 5320 + }, + { + "epoch": 0.95, + "grad_norm": 0.6669471859931946, + "learning_rate": 4.565280069676755e-05, + "loss": 2.0241, + "step": 5325 + }, + { + "epoch": 0.95, + "grad_norm": 0.4599258303642273, + "learning_rate": 4.5644874471564854e-05, + "loss": 1.802, + "step": 5330 + }, + { + "epoch": 0.95, + "grad_norm": 1.0719258785247803, + "learning_rate": 4.5636941716408224e-05, + "loss": 2.0428, + "step": 5335 + }, + { + "epoch": 0.96, + "grad_norm": 0.8084414601325989, + "learning_rate": 4.5629002433806764e-05, + "loss": 1.8502, + "step": 5340 + }, + { + "epoch": 0.96, + "grad_norm": 0.715983510017395, + "learning_rate": 4.5621056626271694e-05, + "loss": 1.9962, + "step": 5345 + }, + { + "epoch": 0.96, + "grad_norm": 0.682518720626831, + "learning_rate": 4.5613104296316245e-05, + "loss": 1.5369, + "step": 5350 + }, + { + "epoch": 0.96, + "grad_norm": 1.020588994026184, + "learning_rate": 4.560514544645573e-05, + "loss": 1.8893, + "step": 5355 + }, + { + "epoch": 0.96, + "grad_norm": 1.0426610708236694, + "learning_rate": 4.559718007920753e-05, + "loss": 1.7818, + "step": 5360 + }, + { + "epoch": 0.96, + "grad_norm": 0.8958699703216553, + "learning_rate": 4.5589208197091084e-05, + "loss": 1.9423, + "step": 5365 + }, + { + "epoch": 0.96, + "grad_norm": 1.6348686218261719, + "learning_rate": 4.558122980262789e-05, + "loss": 1.9759, + "step": 5370 + }, + { + "epoch": 0.96, + "grad_norm": 1.073533058166504, + "learning_rate": 4.557324489834151e-05, + "loss": 1.9607, + "step": 5375 + }, + { + "epoch": 0.96, + "grad_norm": 0.9768711924552917, + "learning_rate": 4.556525348675755e-05, + "loss": 1.8728, + "step": 5380 + }, + { + "epoch": 0.96, + "grad_norm": 0.6651968359947205, + "learning_rate": 4.55572555704037e-05, + "loss": 1.8073, + "step": 5385 + }, + { + "epoch": 0.96, + "grad_norm": 0.7141188979148865, + "learning_rate": 4.554925115180968e-05, + "loss": 1.7795, + "step": 5390 + }, + { + "epoch": 0.97, + "grad_norm": 1.1313745975494385, + "learning_rate": 4.5541240233507285e-05, + "loss": 1.8481, + "step": 5395 + }, + { + "epoch": 0.97, + "grad_norm": 0.4851437211036682, + "learning_rate": 4.5533222818030376e-05, + "loss": 1.9045, + "step": 5400 + }, + { + "epoch": 0.97, + "grad_norm": 0.3471333384513855, + "learning_rate": 4.5525198907914826e-05, + "loss": 2.006, + "step": 5405 + }, + { + "epoch": 0.97, + "grad_norm": 0.5303329229354858, + "learning_rate": 4.5517168505698616e-05, + "loss": 2.0183, + "step": 5410 + }, + { + "epoch": 0.97, + "grad_norm": 0.40987589955329895, + "learning_rate": 4.550913161392173e-05, + "loss": 2.2088, + "step": 5415 + }, + { + "epoch": 0.97, + "grad_norm": 0.5597338676452637, + "learning_rate": 4.5501088235126254e-05, + "loss": 1.9469, + "step": 5420 + }, + { + "epoch": 0.97, + "grad_norm": 0.5752057433128357, + "learning_rate": 4.549303837185628e-05, + "loss": 1.9666, + "step": 5425 + }, + { + "epoch": 0.97, + "grad_norm": 0.7234611511230469, + "learning_rate": 4.548498202665798e-05, + "loss": 1.6851, + "step": 5430 + }, + { + "epoch": 0.97, + "grad_norm": 0.3815643787384033, + "learning_rate": 4.5476919202079574e-05, + "loss": 2.0537, + "step": 5435 + }, + { + "epoch": 0.97, + "grad_norm": 0.5907308459281921, + "learning_rate": 4.5468849900671324e-05, + "loss": 1.8174, + "step": 5440 + }, + { + "epoch": 0.97, + "grad_norm": 0.5893926024436951, + "learning_rate": 4.546077412498553e-05, + "loss": 1.6955, + "step": 5445 + }, + { + "epoch": 0.98, + "grad_norm": 0.5866884589195251, + "learning_rate": 4.545269187757657e-05, + "loss": 1.7888, + "step": 5450 + }, + { + "epoch": 0.98, + "grad_norm": 0.43314459919929504, + "learning_rate": 4.5444603161000834e-05, + "loss": 1.848, + "step": 5455 + }, + { + "epoch": 0.98, + "grad_norm": 0.9520348906517029, + "learning_rate": 4.543650797781679e-05, + "loss": 2.004, + "step": 5460 + }, + { + "epoch": 0.98, + "grad_norm": 5.312831401824951, + "learning_rate": 4.542840633058493e-05, + "loss": 2.1996, + "step": 5465 + }, + { + "epoch": 0.98, + "grad_norm": 0.944259524345398, + "learning_rate": 4.542029822186779e-05, + "loss": 1.7677, + "step": 5470 + }, + { + "epoch": 0.98, + "grad_norm": 0.5071392059326172, + "learning_rate": 4.5412183654229965e-05, + "loss": 1.67, + "step": 5475 + }, + { + "epoch": 0.98, + "grad_norm": 0.7843353152275085, + "learning_rate": 4.5404062630238094e-05, + "loss": 2.2598, + "step": 5480 + }, + { + "epoch": 0.98, + "grad_norm": 0.4275512397289276, + "learning_rate": 4.5395935152460834e-05, + "loss": 1.7225, + "step": 5485 + }, + { + "epoch": 0.98, + "grad_norm": 0.9836548566818237, + "learning_rate": 4.53878012234689e-05, + "loss": 1.9897, + "step": 5490 + }, + { + "epoch": 0.98, + "grad_norm": 2.572413206100464, + "learning_rate": 4.5379660845835045e-05, + "loss": 1.7828, + "step": 5495 + }, + { + "epoch": 0.98, + "grad_norm": 0.5451618432998657, + "learning_rate": 4.5371514022134085e-05, + "loss": 2.0577, + "step": 5500 + }, + { + "epoch": 0.99, + "grad_norm": 0.5373215675354004, + "learning_rate": 4.536336075494282e-05, + "loss": 1.6858, + "step": 5505 + }, + { + "epoch": 0.99, + "grad_norm": 0.5195179581642151, + "learning_rate": 4.535520104684014e-05, + "loss": 1.8985, + "step": 5510 + }, + { + "epoch": 0.99, + "grad_norm": 0.5343700647354126, + "learning_rate": 4.534703490040695e-05, + "loss": 2.0989, + "step": 5515 + }, + { + "epoch": 0.99, + "grad_norm": 0.7476509213447571, + "learning_rate": 4.533886231822619e-05, + "loss": 1.7542, + "step": 5520 + }, + { + "epoch": 0.99, + "grad_norm": 0.7178905010223389, + "learning_rate": 4.533068330288284e-05, + "loss": 2.0403, + "step": 5525 + }, + { + "epoch": 0.99, + "grad_norm": 0.4951605796813965, + "learning_rate": 4.532249785696392e-05, + "loss": 1.9205, + "step": 5530 + }, + { + "epoch": 0.99, + "grad_norm": 0.46740642189979553, + "learning_rate": 4.531430598305848e-05, + "loss": 2.0617, + "step": 5535 + }, + { + "epoch": 0.99, + "grad_norm": 1.046222448348999, + "learning_rate": 4.5306107683757584e-05, + "loss": 1.6893, + "step": 5540 + }, + { + "epoch": 0.99, + "grad_norm": 0.4250253736972809, + "learning_rate": 4.529790296165437e-05, + "loss": 2.0393, + "step": 5545 + }, + { + "epoch": 0.99, + "grad_norm": 1.2679554224014282, + "learning_rate": 4.528969181934397e-05, + "loss": 1.7908, + "step": 5550 + }, + { + "epoch": 0.99, + "grad_norm": 0.8419818878173828, + "learning_rate": 4.5281474259423565e-05, + "loss": 1.8416, + "step": 5555 + }, + { + "epoch": 1.0, + "grad_norm": 0.6362389922142029, + "learning_rate": 4.527325028449236e-05, + "loss": 2.0143, + "step": 5560 + }, + { + "epoch": 1.0, + "grad_norm": 0.8798420429229736, + "learning_rate": 4.5265019897151595e-05, + "loss": 1.7087, + "step": 5565 + }, + { + "epoch": 1.0, + "grad_norm": 0.9879406690597534, + "learning_rate": 4.525678310000452e-05, + "loss": 2.0423, + "step": 5570 + }, + { + "epoch": 1.0, + "grad_norm": 1.0143115520477295, + "learning_rate": 4.524853989565644e-05, + "loss": 1.9527, + "step": 5575 + }, + { + "epoch": 1.0, + "grad_norm": 0.6878682971000671, + "learning_rate": 4.524029028671467e-05, + "loss": 1.9395, + "step": 5580 + }, + { + "epoch": 1.0, + "grad_norm": 0.6117766499519348, + "learning_rate": 4.523203427578855e-05, + "loss": 1.7447, + "step": 5585 + }, + { + "epoch": 1.0, + "grad_norm": 0.42814651131629944, + "learning_rate": 4.522377186548944e-05, + "loss": 1.8254, + "step": 5590 + }, + { + "epoch": 1.0, + "grad_norm": 0.6543037295341492, + "learning_rate": 4.5215503058430754e-05, + "loss": 2.0367, + "step": 5595 + }, + { + "epoch": 1.0, + "grad_norm": 0.5894500017166138, + "learning_rate": 4.520722785722789e-05, + "loss": 1.8911, + "step": 5600 + }, + { + "epoch": 1.0, + "grad_norm": 0.7925021052360535, + "learning_rate": 4.519894626449829e-05, + "loss": 1.7694, + "step": 5605 + }, + { + "epoch": 1.0, + "grad_norm": 0.6827127933502197, + "learning_rate": 4.519065828286142e-05, + "loss": 1.7906, + "step": 5610 + }, + { + "epoch": 1.01, + "grad_norm": 0.5993368029594421, + "learning_rate": 4.5182363914938754e-05, + "loss": 2.142, + "step": 5615 + }, + { + "epoch": 1.01, + "grad_norm": 0.5092318654060364, + "learning_rate": 4.517406316335379e-05, + "loss": 2.0453, + "step": 5620 + }, + { + "epoch": 1.01, + "grad_norm": 0.5682955980300903, + "learning_rate": 4.5165756030732056e-05, + "loss": 1.866, + "step": 5625 + }, + { + "epoch": 1.01, + "grad_norm": 0.5514644980430603, + "learning_rate": 4.515744251970109e-05, + "loss": 1.9906, + "step": 5630 + }, + { + "epoch": 1.01, + "grad_norm": 0.839066207408905, + "learning_rate": 4.514912263289044e-05, + "loss": 1.9757, + "step": 5635 + }, + { + "epoch": 1.01, + "grad_norm": 0.632759153842926, + "learning_rate": 4.514079637293168e-05, + "loss": 1.8389, + "step": 5640 + }, + { + "epoch": 1.01, + "grad_norm": 0.5392670631408691, + "learning_rate": 4.513246374245841e-05, + "loss": 2.0059, + "step": 5645 + }, + { + "epoch": 1.01, + "grad_norm": 0.7347068190574646, + "learning_rate": 4.5124124744106216e-05, + "loss": 1.8033, + "step": 5650 + }, + { + "epoch": 1.01, + "grad_norm": 1.1633368730545044, + "learning_rate": 4.511577938051272e-05, + "loss": 2.0502, + "step": 5655 + }, + { + "epoch": 1.01, + "grad_norm": 0.4993457794189453, + "learning_rate": 4.510742765431756e-05, + "loss": 2.0386, + "step": 5660 + }, + { + "epoch": 1.01, + "grad_norm": 0.9043493270874023, + "learning_rate": 4.509906956816238e-05, + "loss": 2.1084, + "step": 5665 + }, + { + "epoch": 1.01, + "grad_norm": 0.4216392934322357, + "learning_rate": 4.509070512469081e-05, + "loss": 1.8757, + "step": 5670 + }, + { + "epoch": 1.02, + "grad_norm": 0.5536708235740662, + "learning_rate": 4.508233432654855e-05, + "loss": 1.8147, + "step": 5675 + }, + { + "epoch": 1.02, + "grad_norm": 1.015234112739563, + "learning_rate": 4.507395717638325e-05, + "loss": 1.9113, + "step": 5680 + }, + { + "epoch": 1.02, + "grad_norm": 0.9013848900794983, + "learning_rate": 4.506557367684461e-05, + "loss": 1.6582, + "step": 5685 + }, + { + "epoch": 1.02, + "grad_norm": 0.41156238317489624, + "learning_rate": 4.505718383058431e-05, + "loss": 1.8162, + "step": 5690 + }, + { + "epoch": 1.02, + "grad_norm": 0.8023267984390259, + "learning_rate": 4.504878764025606e-05, + "loss": 1.9033, + "step": 5695 + }, + { + "epoch": 1.02, + "grad_norm": 0.4735568165779114, + "learning_rate": 4.504038510851557e-05, + "loss": 1.604, + "step": 5700 + }, + { + "epoch": 1.02, + "grad_norm": 1.373860478401184, + "learning_rate": 4.503197623802054e-05, + "loss": 1.9613, + "step": 5705 + }, + { + "epoch": 1.02, + "grad_norm": 0.47717657685279846, + "learning_rate": 4.5023561031430696e-05, + "loss": 1.8763, + "step": 5710 + }, + { + "epoch": 1.02, + "grad_norm": 0.9874609708786011, + "learning_rate": 4.501513949140775e-05, + "loss": 1.7237, + "step": 5715 + }, + { + "epoch": 1.02, + "grad_norm": 0.4677652418613434, + "learning_rate": 4.500671162061545e-05, + "loss": 2.2116, + "step": 5720 + }, + { + "epoch": 1.02, + "grad_norm": 0.7714501023292542, + "learning_rate": 4.4998277421719506e-05, + "loss": 1.9144, + "step": 5725 + }, + { + "epoch": 1.03, + "grad_norm": 0.6441267132759094, + "learning_rate": 4.498983689738764e-05, + "loss": 1.7153, + "step": 5730 + }, + { + "epoch": 1.03, + "grad_norm": 1.421938419342041, + "learning_rate": 4.4981390050289595e-05, + "loss": 1.882, + "step": 5735 + }, + { + "epoch": 1.03, + "grad_norm": 0.5032657980918884, + "learning_rate": 4.497293688309709e-05, + "loss": 1.9654, + "step": 5740 + }, + { + "epoch": 1.03, + "grad_norm": 0.5165999531745911, + "learning_rate": 4.4964477398483875e-05, + "loss": 1.7969, + "step": 5745 + }, + { + "epoch": 1.03, + "grad_norm": 0.9800401329994202, + "learning_rate": 4.495601159912566e-05, + "loss": 1.7086, + "step": 5750 + }, + { + "epoch": 1.03, + "grad_norm": 0.6153430342674255, + "learning_rate": 4.494753948770016e-05, + "loss": 1.641, + "step": 5755 + }, + { + "epoch": 1.03, + "grad_norm": 0.5942739844322205, + "learning_rate": 4.493906106688712e-05, + "loss": 1.9742, + "step": 5760 + }, + { + "epoch": 1.03, + "grad_norm": 0.4884500205516815, + "learning_rate": 4.493057633936824e-05, + "loss": 2.0754, + "step": 5765 + }, + { + "epoch": 1.03, + "grad_norm": 0.8295217156410217, + "learning_rate": 4.492208530782722e-05, + "loss": 1.7546, + "step": 5770 + }, + { + "epoch": 1.03, + "grad_norm": 0.7193713784217834, + "learning_rate": 4.49135879749498e-05, + "loss": 1.9467, + "step": 5775 + }, + { + "epoch": 1.03, + "grad_norm": 0.7422948479652405, + "learning_rate": 4.490508434342364e-05, + "loss": 1.8656, + "step": 5780 + }, + { + "epoch": 1.04, + "grad_norm": 3.331986665725708, + "learning_rate": 4.4896574415938465e-05, + "loss": 1.7584, + "step": 5785 + }, + { + "epoch": 1.04, + "grad_norm": 0.6991448402404785, + "learning_rate": 4.4888058195185935e-05, + "loss": 1.6829, + "step": 5790 + }, + { + "epoch": 1.04, + "grad_norm": 0.7543762922286987, + "learning_rate": 4.487953568385972e-05, + "loss": 1.7909, + "step": 5795 + }, + { + "epoch": 1.04, + "grad_norm": 0.5476166605949402, + "learning_rate": 4.4871006884655495e-05, + "loss": 2.1132, + "step": 5800 + }, + { + "epoch": 1.04, + "grad_norm": 0.785237729549408, + "learning_rate": 4.486247180027091e-05, + "loss": 1.6932, + "step": 5805 + }, + { + "epoch": 1.04, + "grad_norm": 0.8334198594093323, + "learning_rate": 4.485393043340559e-05, + "loss": 1.9682, + "step": 5810 + }, + { + "epoch": 1.04, + "grad_norm": 1.4550981521606445, + "learning_rate": 4.484538278676118e-05, + "loss": 1.4896, + "step": 5815 + }, + { + "epoch": 1.04, + "grad_norm": 1.184747576713562, + "learning_rate": 4.483682886304129e-05, + "loss": 1.6799, + "step": 5820 + }, + { + "epoch": 1.04, + "grad_norm": 0.6847954392433167, + "learning_rate": 4.48282686649515e-05, + "loss": 1.8183, + "step": 5825 + }, + { + "epoch": 1.04, + "grad_norm": 0.5142964124679565, + "learning_rate": 4.4819702195199406e-05, + "loss": 2.0018, + "step": 5830 + }, + { + "epoch": 1.04, + "grad_norm": 1.4494707584381104, + "learning_rate": 4.481112945649457e-05, + "loss": 1.7658, + "step": 5835 + }, + { + "epoch": 1.05, + "grad_norm": 0.7955223321914673, + "learning_rate": 4.480255045154854e-05, + "loss": 1.7644, + "step": 5840 + }, + { + "epoch": 1.05, + "grad_norm": 0.46225401759147644, + "learning_rate": 4.479396518307486e-05, + "loss": 1.846, + "step": 5845 + }, + { + "epoch": 1.05, + "grad_norm": 0.8068023920059204, + "learning_rate": 4.478537365378902e-05, + "loss": 1.9631, + "step": 5850 + }, + { + "epoch": 1.05, + "grad_norm": 0.9534473419189453, + "learning_rate": 4.477677586640854e-05, + "loss": 1.9243, + "step": 5855 + }, + { + "epoch": 1.05, + "grad_norm": 1.4949363470077515, + "learning_rate": 4.476817182365286e-05, + "loss": 1.7548, + "step": 5860 + }, + { + "epoch": 1.05, + "grad_norm": 0.6866559386253357, + "learning_rate": 4.4759561528243455e-05, + "loss": 1.7041, + "step": 5865 + }, + { + "epoch": 1.05, + "grad_norm": 0.4342908561229706, + "learning_rate": 4.475094498290374e-05, + "loss": 1.9776, + "step": 5870 + }, + { + "epoch": 1.05, + "grad_norm": 0.4420872628688812, + "learning_rate": 4.474232219035913e-05, + "loss": 2.1547, + "step": 5875 + }, + { + "epoch": 1.05, + "grad_norm": 0.4919748902320862, + "learning_rate": 4.4733693153336995e-05, + "loss": 1.8431, + "step": 5880 + }, + { + "epoch": 1.05, + "grad_norm": 0.8170517683029175, + "learning_rate": 4.472505787456671e-05, + "loss": 1.7578, + "step": 5885 + }, + { + "epoch": 1.05, + "grad_norm": 0.8107596039772034, + "learning_rate": 4.4716416356779576e-05, + "loss": 1.8205, + "step": 5890 + }, + { + "epoch": 1.06, + "grad_norm": 0.5859491229057312, + "learning_rate": 4.470776860270892e-05, + "loss": 1.5296, + "step": 5895 + }, + { + "epoch": 1.06, + "grad_norm": 0.41586703062057495, + "learning_rate": 4.469911461509002e-05, + "loss": 2.2296, + "step": 5900 + }, + { + "epoch": 1.06, + "grad_norm": 0.7300328016281128, + "learning_rate": 4.4690454396660104e-05, + "loss": 1.9785, + "step": 5905 + }, + { + "epoch": 1.06, + "grad_norm": 1.0052844285964966, + "learning_rate": 4.46817879501584e-05, + "loss": 1.8619, + "step": 5910 + }, + { + "epoch": 1.06, + "grad_norm": 0.6375717520713806, + "learning_rate": 4.467311527832611e-05, + "loss": 1.8155, + "step": 5915 + }, + { + "epoch": 1.06, + "grad_norm": 0.5199123620986938, + "learning_rate": 4.466443638390637e-05, + "loss": 1.8485, + "step": 5920 + }, + { + "epoch": 1.06, + "grad_norm": 0.6076669692993164, + "learning_rate": 4.465575126964433e-05, + "loss": 1.644, + "step": 5925 + }, + { + "epoch": 1.06, + "grad_norm": 0.4569863975048065, + "learning_rate": 4.464705993828706e-05, + "loss": 1.9771, + "step": 5930 + }, + { + "epoch": 1.06, + "grad_norm": 0.5174528360366821, + "learning_rate": 4.4638362392583636e-05, + "loss": 1.8152, + "step": 5935 + }, + { + "epoch": 1.06, + "grad_norm": 1.4930440187454224, + "learning_rate": 4.462965863528507e-05, + "loss": 1.7984, + "step": 5940 + }, + { + "epoch": 1.06, + "grad_norm": 0.42374441027641296, + "learning_rate": 4.4620948669144356e-05, + "loss": 1.7553, + "step": 5945 + }, + { + "epoch": 1.07, + "grad_norm": 0.7552632093429565, + "learning_rate": 4.461223249691645e-05, + "loss": 2.0744, + "step": 5950 + }, + { + "epoch": 1.07, + "grad_norm": 0.7354189157485962, + "learning_rate": 4.460351012135828e-05, + "loss": 1.9068, + "step": 5955 + }, + { + "epoch": 1.07, + "grad_norm": 0.44200262427330017, + "learning_rate": 4.4594781545228715e-05, + "loss": 1.8694, + "step": 5960 + }, + { + "epoch": 1.07, + "grad_norm": 0.45945167541503906, + "learning_rate": 4.4586046771288593e-05, + "loss": 2.0557, + "step": 5965 + }, + { + "epoch": 1.07, + "grad_norm": 0.506199061870575, + "learning_rate": 4.4577305802300715e-05, + "loss": 1.8714, + "step": 5970 + }, + { + "epoch": 1.07, + "grad_norm": 0.6749559640884399, + "learning_rate": 4.456855864102984e-05, + "loss": 2.0554, + "step": 5975 + }, + { + "epoch": 1.07, + "grad_norm": 0.7233107686042786, + "learning_rate": 4.455980529024269e-05, + "loss": 1.9416, + "step": 5980 + }, + { + "epoch": 1.07, + "grad_norm": 0.6712331771850586, + "learning_rate": 4.4551045752707945e-05, + "loss": 1.9445, + "step": 5985 + }, + { + "epoch": 1.07, + "grad_norm": 1.0653775930404663, + "learning_rate": 4.4542280031196224e-05, + "loss": 1.9905, + "step": 5990 + }, + { + "epoch": 1.07, + "grad_norm": 1.2245427370071411, + "learning_rate": 4.453350812848014e-05, + "loss": 1.5629, + "step": 5995 + }, + { + "epoch": 1.07, + "grad_norm": 0.6561611890792847, + "learning_rate": 4.452473004733422e-05, + "loss": 2.0267, + "step": 6000 + }, + { + "epoch": 1.07, + "grad_norm": 1.8881734609603882, + "learning_rate": 4.4515945790534964e-05, + "loss": 1.7464, + "step": 6005 + }, + { + "epoch": 1.08, + "grad_norm": 0.6296302080154419, + "learning_rate": 4.450715536086082e-05, + "loss": 1.8395, + "step": 6010 + }, + { + "epoch": 1.08, + "grad_norm": 0.5000322461128235, + "learning_rate": 4.4498358761092204e-05, + "loss": 1.8501, + "step": 6015 + }, + { + "epoch": 1.08, + "grad_norm": 0.5313418507575989, + "learning_rate": 4.448955599401147e-05, + "loss": 1.879, + "step": 6020 + }, + { + "epoch": 1.08, + "grad_norm": 0.8610731363296509, + "learning_rate": 4.448074706240292e-05, + "loss": 1.8825, + "step": 6025 + }, + { + "epoch": 1.08, + "grad_norm": 0.30168166756629944, + "learning_rate": 4.4471931969052816e-05, + "loss": 1.7678, + "step": 6030 + }, + { + "epoch": 1.08, + "grad_norm": 0.8175820708274841, + "learning_rate": 4.446311071674936e-05, + "loss": 1.8989, + "step": 6035 + }, + { + "epoch": 1.08, + "grad_norm": 0.7665614485740662, + "learning_rate": 4.4454283308282695e-05, + "loss": 1.8553, + "step": 6040 + }, + { + "epoch": 1.08, + "grad_norm": 0.42481526732444763, + "learning_rate": 4.444544974644493e-05, + "loss": 2.1974, + "step": 6045 + }, + { + "epoch": 1.08, + "grad_norm": 0.5651946067810059, + "learning_rate": 4.443661003403013e-05, + "loss": 2.1612, + "step": 6050 + }, + { + "epoch": 1.08, + "grad_norm": 0.8740639686584473, + "learning_rate": 4.4427764173834264e-05, + "loss": 1.6421, + "step": 6055 + }, + { + "epoch": 1.08, + "grad_norm": 0.6985417604446411, + "learning_rate": 4.441891216865528e-05, + "loss": 2.0725, + "step": 6060 + }, + { + "epoch": 1.09, + "grad_norm": 0.8365675210952759, + "learning_rate": 4.441005402129306e-05, + "loss": 1.9568, + "step": 6065 + }, + { + "epoch": 1.09, + "grad_norm": 0.5997393727302551, + "learning_rate": 4.4401189734549405e-05, + "loss": 2.0727, + "step": 6070 + }, + { + "epoch": 1.09, + "grad_norm": 1.2542804479599, + "learning_rate": 4.439231931122811e-05, + "loss": 2.039, + "step": 6075 + }, + { + "epoch": 1.09, + "grad_norm": 0.8525872230529785, + "learning_rate": 4.4383442754134874e-05, + "loss": 1.6291, + "step": 6080 + }, + { + "epoch": 1.09, + "grad_norm": 0.5538986325263977, + "learning_rate": 4.4374560066077336e-05, + "loss": 2.0365, + "step": 6085 + }, + { + "epoch": 1.09, + "grad_norm": 0.48354509472846985, + "learning_rate": 4.436567124986508e-05, + "loss": 2.0723, + "step": 6090 + }, + { + "epoch": 1.09, + "grad_norm": 1.0979423522949219, + "learning_rate": 4.435677630830964e-05, + "loss": 1.984, + "step": 6095 + }, + { + "epoch": 1.09, + "grad_norm": 0.6961491107940674, + "learning_rate": 4.434787524422448e-05, + "loss": 1.7996, + "step": 6100 + }, + { + "epoch": 1.09, + "grad_norm": 1.0703699588775635, + "learning_rate": 4.433896806042498e-05, + "loss": 1.904, + "step": 6105 + }, + { + "epoch": 1.09, + "grad_norm": 0.8863664269447327, + "learning_rate": 4.433005475972849e-05, + "loss": 1.7778, + "step": 6110 + }, + { + "epoch": 1.09, + "grad_norm": 0.995624303817749, + "learning_rate": 4.432113534495427e-05, + "loss": 2.0512, + "step": 6115 + }, + { + "epoch": 1.1, + "grad_norm": 0.5747253894805908, + "learning_rate": 4.4312209818923526e-05, + "loss": 1.7252, + "step": 6120 + }, + { + "epoch": 1.1, + "grad_norm": 0.899549663066864, + "learning_rate": 4.430327818445939e-05, + "loss": 1.8408, + "step": 6125 + }, + { + "epoch": 1.1, + "grad_norm": 0.6088781356811523, + "learning_rate": 4.429434044438693e-05, + "loss": 1.7636, + "step": 6130 + }, + { + "epoch": 1.1, + "grad_norm": 0.7908390164375305, + "learning_rate": 4.428539660153315e-05, + "loss": 1.7863, + "step": 6135 + }, + { + "epoch": 1.1, + "grad_norm": 0.4704549312591553, + "learning_rate": 4.427644665872697e-05, + "loss": 1.8403, + "step": 6140 + }, + { + "epoch": 1.1, + "grad_norm": 0.4836312532424927, + "learning_rate": 4.426749061879926e-05, + "loss": 1.8234, + "step": 6145 + }, + { + "epoch": 1.1, + "grad_norm": 0.6054650545120239, + "learning_rate": 4.425852848458279e-05, + "loss": 1.9921, + "step": 6150 + }, + { + "epoch": 1.1, + "grad_norm": 0.598057210445404, + "learning_rate": 4.42495602589123e-05, + "loss": 1.9285, + "step": 6155 + }, + { + "epoch": 1.1, + "grad_norm": 0.6998239755630493, + "learning_rate": 4.42405859446244e-05, + "loss": 1.7761, + "step": 6160 + }, + { + "epoch": 1.1, + "grad_norm": 0.4462050199508667, + "learning_rate": 4.4231605544557684e-05, + "loss": 1.7563, + "step": 6165 + }, + { + "epoch": 1.1, + "grad_norm": 1.1818965673446655, + "learning_rate": 4.422261906155263e-05, + "loss": 2.1757, + "step": 6170 + }, + { + "epoch": 1.11, + "grad_norm": 1.3994632959365845, + "learning_rate": 4.421362649845165e-05, + "loss": 1.8819, + "step": 6175 + }, + { + "epoch": 1.11, + "grad_norm": 0.699133038520813, + "learning_rate": 4.42046278580991e-05, + "loss": 1.9893, + "step": 6180 + }, + { + "epoch": 1.11, + "grad_norm": 0.4134710431098938, + "learning_rate": 4.419562314334123e-05, + "loss": 2.0212, + "step": 6185 + }, + { + "epoch": 1.11, + "grad_norm": 0.8407548666000366, + "learning_rate": 4.4186612357026225e-05, + "loss": 1.8084, + "step": 6190 + }, + { + "epoch": 1.11, + "grad_norm": 0.4963303506374359, + "learning_rate": 4.4177595502004175e-05, + "loss": 1.9406, + "step": 6195 + }, + { + "epoch": 1.11, + "grad_norm": 0.6621739864349365, + "learning_rate": 4.416857258112712e-05, + "loss": 1.9083, + "step": 6200 + }, + { + "epoch": 1.11, + "grad_norm": 0.6170079708099365, + "learning_rate": 4.4159543597248996e-05, + "loss": 1.9234, + "step": 6205 + }, + { + "epoch": 1.11, + "grad_norm": 0.6613223552703857, + "learning_rate": 4.415050855322567e-05, + "loss": 2.0523, + "step": 6210 + }, + { + "epoch": 1.11, + "grad_norm": 1.43656325340271, + "learning_rate": 4.41414674519149e-05, + "loss": 1.9491, + "step": 6215 + }, + { + "epoch": 1.11, + "grad_norm": 0.7505949139595032, + "learning_rate": 4.4132420296176386e-05, + "loss": 1.9439, + "step": 6220 + }, + { + "epoch": 1.11, + "grad_norm": 0.665459394454956, + "learning_rate": 4.4123367088871735e-05, + "loss": 1.7187, + "step": 6225 + }, + { + "epoch": 1.12, + "grad_norm": 0.3662698566913605, + "learning_rate": 4.411430783286447e-05, + "loss": 1.6875, + "step": 6230 + }, + { + "epoch": 1.12, + "grad_norm": 0.3431253731250763, + "learning_rate": 4.410524253102003e-05, + "loss": 1.9651, + "step": 6235 + }, + { + "epoch": 1.12, + "grad_norm": 0.5281440615653992, + "learning_rate": 4.409617118620574e-05, + "loss": 1.9109, + "step": 6240 + }, + { + "epoch": 1.12, + "grad_norm": 0.659487247467041, + "learning_rate": 4.408709380129088e-05, + "loss": 1.9324, + "step": 6245 + }, + { + "epoch": 1.12, + "grad_norm": 0.7130793333053589, + "learning_rate": 4.407801037914662e-05, + "loss": 1.813, + "step": 6250 + }, + { + "epoch": 1.12, + "grad_norm": 0.6839697360992432, + "learning_rate": 4.4068920922646015e-05, + "loss": 1.8927, + "step": 6255 + }, + { + "epoch": 1.12, + "grad_norm": 1.225645661354065, + "learning_rate": 4.405982543466406e-05, + "loss": 1.9195, + "step": 6260 + }, + { + "epoch": 1.12, + "grad_norm": 0.6370910406112671, + "learning_rate": 4.4050723918077664e-05, + "loss": 2.0735, + "step": 6265 + }, + { + "epoch": 1.12, + "grad_norm": 0.8523099422454834, + "learning_rate": 4.4041616375765614e-05, + "loss": 1.8858, + "step": 6270 + }, + { + "epoch": 1.12, + "grad_norm": 1.1719610691070557, + "learning_rate": 4.4032502810608614e-05, + "loss": 1.7128, + "step": 6275 + }, + { + "epoch": 1.12, + "grad_norm": 0.9267966747283936, + "learning_rate": 4.402338322548929e-05, + "loss": 1.6958, + "step": 6280 + }, + { + "epoch": 1.12, + "grad_norm": 0.4360824525356293, + "learning_rate": 4.401425762329214e-05, + "loss": 1.9678, + "step": 6285 + }, + { + "epoch": 1.13, + "grad_norm": 0.7322136163711548, + "learning_rate": 4.400512600690361e-05, + "loss": 1.8844, + "step": 6290 + }, + { + "epoch": 1.13, + "grad_norm": 1.1388806104660034, + "learning_rate": 4.399598837921199e-05, + "loss": 1.645, + "step": 6295 + }, + { + "epoch": 1.13, + "grad_norm": 0.42380547523498535, + "learning_rate": 4.3986844743107525e-05, + "loss": 1.9983, + "step": 6300 + }, + { + "epoch": 1.13, + "grad_norm": 0.9260616302490234, + "learning_rate": 4.397769510148233e-05, + "loss": 1.6449, + "step": 6305 + }, + { + "epoch": 1.13, + "grad_norm": 0.5340147614479065, + "learning_rate": 4.396853945723043e-05, + "loss": 1.7605, + "step": 6310 + }, + { + "epoch": 1.13, + "grad_norm": 0.5331189632415771, + "learning_rate": 4.395937781324774e-05, + "loss": 1.8059, + "step": 6315 + }, + { + "epoch": 1.13, + "grad_norm": 0.6908158659934998, + "learning_rate": 4.395021017243208e-05, + "loss": 1.9612, + "step": 6320 + }, + { + "epoch": 1.13, + "grad_norm": 0.8586407899856567, + "learning_rate": 4.3941036537683176e-05, + "loss": 1.9027, + "step": 6325 + }, + { + "epoch": 1.13, + "grad_norm": 0.7058882713317871, + "learning_rate": 4.393185691190264e-05, + "loss": 1.9719, + "step": 6330 + }, + { + "epoch": 1.13, + "grad_norm": 0.7748107314109802, + "learning_rate": 4.392267129799397e-05, + "loss": 2.0075, + "step": 6335 + }, + { + "epoch": 1.13, + "grad_norm": 0.6403058767318726, + "learning_rate": 4.391347969886256e-05, + "loss": 2.2573, + "step": 6340 + }, + { + "epoch": 1.14, + "grad_norm": 0.7095941305160522, + "learning_rate": 4.3904282117415724e-05, + "loss": 1.8924, + "step": 6345 + }, + { + "epoch": 1.14, + "grad_norm": 0.880134642124176, + "learning_rate": 4.389507855656263e-05, + "loss": 1.6801, + "step": 6350 + }, + { + "epoch": 1.14, + "grad_norm": 0.8959532976150513, + "learning_rate": 4.3885869019214374e-05, + "loss": 1.6508, + "step": 6355 + }, + { + "epoch": 1.14, + "grad_norm": 1.0066273212432861, + "learning_rate": 4.38766535082839e-05, + "loss": 1.9506, + "step": 6360 + }, + { + "epoch": 1.14, + "grad_norm": 0.32858455181121826, + "learning_rate": 4.386743202668609e-05, + "loss": 2.0022, + "step": 6365 + }, + { + "epoch": 1.14, + "grad_norm": 1.6246061325073242, + "learning_rate": 4.3858204577337674e-05, + "loss": 1.8174, + "step": 6370 + }, + { + "epoch": 1.14, + "grad_norm": 0.4996178448200226, + "learning_rate": 4.3848971163157296e-05, + "loss": 1.7198, + "step": 6375 + }, + { + "epoch": 1.14, + "grad_norm": 0.6967160105705261, + "learning_rate": 4.383973178706545e-05, + "loss": 1.9285, + "step": 6380 + }, + { + "epoch": 1.14, + "grad_norm": 1.0754936933517456, + "learning_rate": 4.383048645198458e-05, + "loss": 2.0293, + "step": 6385 + }, + { + "epoch": 1.14, + "grad_norm": 0.49468663334846497, + "learning_rate": 4.3821235160838955e-05, + "loss": 1.8142, + "step": 6390 + }, + { + "epoch": 1.14, + "grad_norm": 0.9518940448760986, + "learning_rate": 4.3811977916554746e-05, + "loss": 2.1539, + "step": 6395 + }, + { + "epoch": 1.15, + "grad_norm": 0.5778300166130066, + "learning_rate": 4.3802714722060024e-05, + "loss": 1.8289, + "step": 6400 + }, + { + "epoch": 1.15, + "grad_norm": 0.8983391523361206, + "learning_rate": 4.379344558028471e-05, + "loss": 1.6746, + "step": 6405 + }, + { + "epoch": 1.15, + "grad_norm": 0.6165648102760315, + "learning_rate": 4.3784170494160646e-05, + "loss": 2.0491, + "step": 6410 + }, + { + "epoch": 1.15, + "grad_norm": 0.9584977626800537, + "learning_rate": 4.377488946662152e-05, + "loss": 1.8224, + "step": 6415 + }, + { + "epoch": 1.15, + "grad_norm": 1.769154667854309, + "learning_rate": 4.376560250060292e-05, + "loss": 1.8792, + "step": 6420 + }, + { + "epoch": 1.15, + "grad_norm": 0.5984408855438232, + "learning_rate": 4.375630959904229e-05, + "loss": 1.9362, + "step": 6425 + }, + { + "epoch": 1.15, + "grad_norm": 0.7467585206031799, + "learning_rate": 4.3747010764878985e-05, + "loss": 1.4512, + "step": 6430 + }, + { + "epoch": 1.15, + "grad_norm": 1.2350887060165405, + "learning_rate": 4.373770600105419e-05, + "loss": 1.8879, + "step": 6435 + }, + { + "epoch": 1.15, + "grad_norm": 3.794597625732422, + "learning_rate": 4.372839531051103e-05, + "loss": 1.7501, + "step": 6440 + }, + { + "epoch": 1.15, + "grad_norm": 0.3860040605068207, + "learning_rate": 4.371907869619444e-05, + "loss": 1.7878, + "step": 6445 + }, + { + "epoch": 1.15, + "grad_norm": 0.4788179099559784, + "learning_rate": 4.3709756161051266e-05, + "loss": 2.0143, + "step": 6450 + }, + { + "epoch": 1.16, + "grad_norm": 0.5330110192298889, + "learning_rate": 4.3700427708030205e-05, + "loss": 1.6505, + "step": 6455 + }, + { + "epoch": 1.16, + "grad_norm": 0.7788562178611755, + "learning_rate": 4.369109334008186e-05, + "loss": 1.9456, + "step": 6460 + }, + { + "epoch": 1.16, + "grad_norm": 0.4161795675754547, + "learning_rate": 4.3681753060158656e-05, + "loss": 2.1514, + "step": 6465 + }, + { + "epoch": 1.16, + "grad_norm": 0.8420090079307556, + "learning_rate": 4.367240687121495e-05, + "loss": 1.8194, + "step": 6470 + }, + { + "epoch": 1.16, + "grad_norm": 0.37459495663642883, + "learning_rate": 4.366305477620689e-05, + "loss": 1.8492, + "step": 6475 + }, + { + "epoch": 1.16, + "grad_norm": 0.8855760097503662, + "learning_rate": 4.365369677809257e-05, + "loss": 1.6327, + "step": 6480 + }, + { + "epoch": 1.16, + "grad_norm": 1.2534199953079224, + "learning_rate": 4.36443328798319e-05, + "loss": 1.9092, + "step": 6485 + }, + { + "epoch": 1.16, + "grad_norm": 0.44040653109550476, + "learning_rate": 4.363496308438666e-05, + "loss": 1.9596, + "step": 6490 + }, + { + "epoch": 1.16, + "grad_norm": 0.6483808755874634, + "learning_rate": 4.3625587394720534e-05, + "loss": 1.8081, + "step": 6495 + }, + { + "epoch": 1.16, + "grad_norm": 1.1044362783432007, + "learning_rate": 4.3616205813799025e-05, + "loss": 1.8881, + "step": 6500 + }, + { + "epoch": 1.16, + "grad_norm": 1.2582200765609741, + "learning_rate": 4.360681834458953e-05, + "loss": 2.0978, + "step": 6505 + }, + { + "epoch": 1.17, + "grad_norm": 0.6778598427772522, + "learning_rate": 4.359742499006129e-05, + "loss": 2.1348, + "step": 6510 + }, + { + "epoch": 1.17, + "grad_norm": 0.45896151661872864, + "learning_rate": 4.35880257531854e-05, + "loss": 1.8157, + "step": 6515 + }, + { + "epoch": 1.17, + "grad_norm": 0.8110174536705017, + "learning_rate": 4.357862063693486e-05, + "loss": 1.9552, + "step": 6520 + }, + { + "epoch": 1.17, + "grad_norm": 0.5105950236320496, + "learning_rate": 4.356920964428448e-05, + "loss": 2.0259, + "step": 6525 + }, + { + "epoch": 1.17, + "grad_norm": 0.7677923440933228, + "learning_rate": 4.3559792778210945e-05, + "loss": 1.9829, + "step": 6530 + }, + { + "epoch": 1.17, + "grad_norm": 2.0216026306152344, + "learning_rate": 4.3550370041692805e-05, + "loss": 1.8117, + "step": 6535 + }, + { + "epoch": 1.17, + "grad_norm": 0.8491010069847107, + "learning_rate": 4.354094143771047e-05, + "loss": 1.9182, + "step": 6540 + }, + { + "epoch": 1.17, + "grad_norm": 0.7707613706588745, + "learning_rate": 4.353150696924619e-05, + "loss": 1.9682, + "step": 6545 + }, + { + "epoch": 1.17, + "grad_norm": 0.6204885840415955, + "learning_rate": 4.3522066639284085e-05, + "loss": 1.641, + "step": 6550 + }, + { + "epoch": 1.17, + "grad_norm": 0.7384446263313293, + "learning_rate": 4.3512620450810114e-05, + "loss": 1.9758, + "step": 6555 + }, + { + "epoch": 1.17, + "grad_norm": 0.7315991520881653, + "learning_rate": 4.35031684068121e-05, + "loss": 1.6545, + "step": 6560 + }, + { + "epoch": 1.18, + "grad_norm": 0.5005430579185486, + "learning_rate": 4.349371051027972e-05, + "loss": 1.9201, + "step": 6565 + }, + { + "epoch": 1.18, + "grad_norm": 1.0951027870178223, + "learning_rate": 4.348424676420449e-05, + "loss": 1.9609, + "step": 6570 + }, + { + "epoch": 1.18, + "grad_norm": 0.5726012587547302, + "learning_rate": 4.3474777171579794e-05, + "loss": 1.835, + "step": 6575 + }, + { + "epoch": 1.18, + "grad_norm": 0.7009186744689941, + "learning_rate": 4.3465301735400844e-05, + "loss": 2.0274, + "step": 6580 + }, + { + "epoch": 1.18, + "grad_norm": 0.8929822444915771, + "learning_rate": 4.345582045866472e-05, + "loss": 1.8169, + "step": 6585 + }, + { + "epoch": 1.18, + "grad_norm": 0.615864098072052, + "learning_rate": 4.3446333344370326e-05, + "loss": 1.7477, + "step": 6590 + }, + { + "epoch": 1.18, + "grad_norm": 0.7010105848312378, + "learning_rate": 4.343684039551845e-05, + "loss": 1.6822, + "step": 6595 + }, + { + "epoch": 1.18, + "grad_norm": 0.7396606206893921, + "learning_rate": 4.3427341615111696e-05, + "loss": 1.7189, + "step": 6600 + }, + { + "epoch": 1.18, + "grad_norm": 0.8250526189804077, + "learning_rate": 4.3417837006154505e-05, + "loss": 1.8397, + "step": 6605 + }, + { + "epoch": 1.18, + "grad_norm": 0.417876660823822, + "learning_rate": 4.340832657165318e-05, + "loss": 1.7366, + "step": 6610 + }, + { + "epoch": 1.18, + "grad_norm": 0.5691598653793335, + "learning_rate": 4.3398810314615876e-05, + "loss": 1.678, + "step": 6615 + }, + { + "epoch": 1.18, + "grad_norm": 0.3957613706588745, + "learning_rate": 4.338928823805256e-05, + "loss": 2.2207, + "step": 6620 + }, + { + "epoch": 1.19, + "grad_norm": 0.5127105712890625, + "learning_rate": 4.337976034497507e-05, + "loss": 2.0833, + "step": 6625 + }, + { + "epoch": 1.19, + "grad_norm": 0.7685470581054688, + "learning_rate": 4.337022663839706e-05, + "loss": 1.6234, + "step": 6630 + }, + { + "epoch": 1.19, + "grad_norm": 0.8764786124229431, + "learning_rate": 4.336068712133403e-05, + "loss": 1.8451, + "step": 6635 + }, + { + "epoch": 1.19, + "grad_norm": 0.5960032939910889, + "learning_rate": 4.335114179680333e-05, + "loss": 1.9596, + "step": 6640 + }, + { + "epoch": 1.19, + "grad_norm": 0.9183564782142639, + "learning_rate": 4.334159066782413e-05, + "loss": 1.9834, + "step": 6645 + }, + { + "epoch": 1.19, + "grad_norm": 1.964171051979065, + "learning_rate": 4.333203373741744e-05, + "loss": 1.8609, + "step": 6650 + }, + { + "epoch": 1.19, + "grad_norm": 0.5139592885971069, + "learning_rate": 4.332247100860612e-05, + "loss": 1.8969, + "step": 6655 + }, + { + "epoch": 1.19, + "grad_norm": 0.40804627537727356, + "learning_rate": 4.331290248441483e-05, + "loss": 1.6632, + "step": 6660 + }, + { + "epoch": 1.19, + "grad_norm": 0.5056566596031189, + "learning_rate": 4.3303328167870105e-05, + "loss": 1.7616, + "step": 6665 + }, + { + "epoch": 1.19, + "grad_norm": 2.207601547241211, + "learning_rate": 4.32937480620003e-05, + "loss": 1.9739, + "step": 6670 + }, + { + "epoch": 1.19, + "grad_norm": 0.5137903094291687, + "learning_rate": 4.3284162169835565e-05, + "loss": 1.8736, + "step": 6675 + }, + { + "epoch": 1.2, + "grad_norm": 0.5918135046958923, + "learning_rate": 4.327457049440792e-05, + "loss": 2.1611, + "step": 6680 + }, + { + "epoch": 1.2, + "grad_norm": 0.4136945903301239, + "learning_rate": 4.326497303875121e-05, + "loss": 2.0066, + "step": 6685 + }, + { + "epoch": 1.2, + "grad_norm": 0.7371619343757629, + "learning_rate": 4.3255369805901096e-05, + "loss": 1.5216, + "step": 6690 + }, + { + "epoch": 1.2, + "grad_norm": 7.495757579803467, + "learning_rate": 4.324576079889507e-05, + "loss": 1.752, + "step": 6695 + }, + { + "epoch": 1.2, + "grad_norm": 0.8859977722167969, + "learning_rate": 4.323614602077246e-05, + "loss": 1.7231, + "step": 6700 + }, + { + "epoch": 1.2, + "grad_norm": 0.48695892095565796, + "learning_rate": 4.3226525474574405e-05, + "loss": 1.612, + "step": 6705 + }, + { + "epoch": 1.2, + "grad_norm": 0.5769939422607422, + "learning_rate": 4.321689916334387e-05, + "loss": 1.9836, + "step": 6710 + }, + { + "epoch": 1.2, + "grad_norm": 0.7489534616470337, + "learning_rate": 4.3207267090125646e-05, + "loss": 1.6957, + "step": 6715 + }, + { + "epoch": 1.2, + "grad_norm": 1.0079684257507324, + "learning_rate": 4.3197629257966365e-05, + "loss": 1.89, + "step": 6720 + }, + { + "epoch": 1.2, + "grad_norm": 0.8229489326477051, + "learning_rate": 4.3187985669914454e-05, + "loss": 1.929, + "step": 6725 + }, + { + "epoch": 1.2, + "grad_norm": 0.6666339039802551, + "learning_rate": 4.317833632902016e-05, + "loss": 1.823, + "step": 6730 + }, + { + "epoch": 1.21, + "grad_norm": 0.8089262247085571, + "learning_rate": 4.316868123833558e-05, + "loss": 1.9112, + "step": 6735 + }, + { + "epoch": 1.21, + "grad_norm": 0.6247695684432983, + "learning_rate": 4.31590204009146e-05, + "loss": 1.637, + "step": 6740 + }, + { + "epoch": 1.21, + "grad_norm": 0.4710233211517334, + "learning_rate": 4.314935381981293e-05, + "loss": 2.0317, + "step": 6745 + }, + { + "epoch": 1.21, + "grad_norm": 0.6251475214958191, + "learning_rate": 4.31396814980881e-05, + "loss": 2.1173, + "step": 6750 + }, + { + "epoch": 1.21, + "grad_norm": 0.8826267719268799, + "learning_rate": 4.3130003438799466e-05, + "loss": 2.0483, + "step": 6755 + }, + { + "epoch": 1.21, + "grad_norm": 0.6971041560173035, + "learning_rate": 4.312031964500818e-05, + "loss": 1.921, + "step": 6760 + }, + { + "epoch": 1.21, + "grad_norm": 0.6338077187538147, + "learning_rate": 4.311063011977723e-05, + "loss": 1.7711, + "step": 6765 + }, + { + "epoch": 1.21, + "grad_norm": 1.1039173603057861, + "learning_rate": 4.3100934866171385e-05, + "loss": 1.5759, + "step": 6770 + }, + { + "epoch": 1.21, + "grad_norm": 1.1805471181869507, + "learning_rate": 4.3091233887257256e-05, + "loss": 1.7043, + "step": 6775 + }, + { + "epoch": 1.21, + "grad_norm": 0.7616626024246216, + "learning_rate": 4.308152718610324e-05, + "loss": 1.739, + "step": 6780 + }, + { + "epoch": 1.21, + "grad_norm": 0.3182941675186157, + "learning_rate": 4.307181476577957e-05, + "loss": 1.8309, + "step": 6785 + }, + { + "epoch": 1.22, + "grad_norm": 1.0036624670028687, + "learning_rate": 4.3062096629358285e-05, + "loss": 1.7662, + "step": 6790 + }, + { + "epoch": 1.22, + "grad_norm": 4.941795349121094, + "learning_rate": 4.3052372779913194e-05, + "loss": 1.7195, + "step": 6795 + }, + { + "epoch": 1.22, + "grad_norm": 0.5514867901802063, + "learning_rate": 4.304264322051997e-05, + "loss": 1.899, + "step": 6800 + }, + { + "epoch": 1.22, + "grad_norm": 0.600469708442688, + "learning_rate": 4.303290795425605e-05, + "loss": 2.0107, + "step": 6805 + }, + { + "epoch": 1.22, + "grad_norm": 0.567573070526123, + "learning_rate": 4.302316698420068e-05, + "loss": 1.89, + "step": 6810 + }, + { + "epoch": 1.22, + "grad_norm": 0.5658138394355774, + "learning_rate": 4.301342031343493e-05, + "loss": 2.2113, + "step": 6815 + }, + { + "epoch": 1.22, + "grad_norm": 1.3791340589523315, + "learning_rate": 4.300366794504167e-05, + "loss": 2.0512, + "step": 6820 + }, + { + "epoch": 1.22, + "grad_norm": 0.7716684937477112, + "learning_rate": 4.2993909882105546e-05, + "loss": 1.9361, + "step": 6825 + }, + { + "epoch": 1.22, + "grad_norm": 0.4616861939430237, + "learning_rate": 4.2984146127713046e-05, + "loss": 2.2654, + "step": 6830 + }, + { + "epoch": 1.22, + "grad_norm": 1.1081273555755615, + "learning_rate": 4.2974376684952414e-05, + "loss": 2.059, + "step": 6835 + }, + { + "epoch": 1.22, + "grad_norm": 0.547261118888855, + "learning_rate": 4.296460155691373e-05, + "loss": 1.8463, + "step": 6840 + }, + { + "epoch": 1.23, + "grad_norm": 0.597197949886322, + "learning_rate": 4.295482074668885e-05, + "loss": 1.8631, + "step": 6845 + }, + { + "epoch": 1.23, + "grad_norm": 0.4494568705558777, + "learning_rate": 4.2945034257371445e-05, + "loss": 1.693, + "step": 6850 + }, + { + "epoch": 1.23, + "grad_norm": 0.7037421464920044, + "learning_rate": 4.293524209205696e-05, + "loss": 1.7386, + "step": 6855 + }, + { + "epoch": 1.23, + "grad_norm": 0.509117066860199, + "learning_rate": 4.292544425384265e-05, + "loss": 1.7397, + "step": 6860 + }, + { + "epoch": 1.23, + "grad_norm": 0.8861694931983948, + "learning_rate": 4.2915640745827576e-05, + "loss": 1.8453, + "step": 6865 + }, + { + "epoch": 1.23, + "grad_norm": 0.8716956973075867, + "learning_rate": 4.290583157111255e-05, + "loss": 1.8874, + "step": 6870 + }, + { + "epoch": 1.23, + "grad_norm": 0.608605682849884, + "learning_rate": 4.2896016732800235e-05, + "loss": 1.6868, + "step": 6875 + }, + { + "epoch": 1.23, + "grad_norm": 0.43817174434661865, + "learning_rate": 4.288619623399503e-05, + "loss": 1.7757, + "step": 6880 + }, + { + "epoch": 1.23, + "grad_norm": 0.7979812622070312, + "learning_rate": 4.2876370077803174e-05, + "loss": 1.9506, + "step": 6885 + }, + { + "epoch": 1.23, + "grad_norm": 0.8355962634086609, + "learning_rate": 4.286653826733265e-05, + "loss": 1.8282, + "step": 6890 + }, + { + "epoch": 1.23, + "grad_norm": 0.4820092022418976, + "learning_rate": 4.285670080569326e-05, + "loss": 1.7797, + "step": 6895 + }, + { + "epoch": 1.24, + "grad_norm": 0.9699495434761047, + "learning_rate": 4.284685769599658e-05, + "loss": 1.8737, + "step": 6900 + }, + { + "epoch": 1.24, + "grad_norm": 0.6602120995521545, + "learning_rate": 4.283700894135597e-05, + "loss": 1.8891, + "step": 6905 + }, + { + "epoch": 1.24, + "grad_norm": 0.5803684592247009, + "learning_rate": 4.282715454488661e-05, + "loss": 2.1173, + "step": 6910 + }, + { + "epoch": 1.24, + "grad_norm": 0.8737105131149292, + "learning_rate": 4.2817294509705394e-05, + "loss": 1.9188, + "step": 6915 + }, + { + "epoch": 1.24, + "grad_norm": 1.0067740678787231, + "learning_rate": 4.280742883893106e-05, + "loss": 1.9328, + "step": 6920 + }, + { + "epoch": 1.24, + "grad_norm": 0.7453013062477112, + "learning_rate": 4.279755753568413e-05, + "loss": 2.0628, + "step": 6925 + }, + { + "epoch": 1.24, + "grad_norm": 0.7748093008995056, + "learning_rate": 4.278768060308686e-05, + "loss": 1.8465, + "step": 6930 + }, + { + "epoch": 1.24, + "grad_norm": 0.8429641723632812, + "learning_rate": 4.277779804426333e-05, + "loss": 1.6647, + "step": 6935 + }, + { + "epoch": 1.24, + "grad_norm": 0.9702791571617126, + "learning_rate": 4.2767909862339374e-05, + "loss": 2.1712, + "step": 6940 + }, + { + "epoch": 1.24, + "grad_norm": 3.3242080211639404, + "learning_rate": 4.275801606044262e-05, + "loss": 1.812, + "step": 6945 + }, + { + "epoch": 1.24, + "grad_norm": 0.5527956485748291, + "learning_rate": 4.2748116641702466e-05, + "loss": 1.7873, + "step": 6950 + }, + { + "epoch": 1.24, + "grad_norm": 0.6284675002098083, + "learning_rate": 4.27382116092501e-05, + "loss": 1.9656, + "step": 6955 + }, + { + "epoch": 1.25, + "grad_norm": 10.499287605285645, + "learning_rate": 4.272830096621845e-05, + "loss": 1.724, + "step": 6960 + }, + { + "epoch": 1.25, + "grad_norm": 0.9335476756095886, + "learning_rate": 4.271838471574227e-05, + "loss": 1.9242, + "step": 6965 + }, + { + "epoch": 1.25, + "grad_norm": 0.3686840832233429, + "learning_rate": 4.270846286095803e-05, + "loss": 2.0095, + "step": 6970 + }, + { + "epoch": 1.25, + "grad_norm": 0.7806186676025391, + "learning_rate": 4.2698535405004034e-05, + "loss": 1.7676, + "step": 6975 + }, + { + "epoch": 1.25, + "grad_norm": 1.4177755117416382, + "learning_rate": 4.2688602351020315e-05, + "loss": 1.824, + "step": 6980 + }, + { + "epoch": 1.25, + "grad_norm": 0.5843412280082703, + "learning_rate": 4.267866370214868e-05, + "loss": 1.9222, + "step": 6985 + }, + { + "epoch": 1.25, + "grad_norm": 0.8569367527961731, + "learning_rate": 4.266871946153272e-05, + "loss": 1.8849, + "step": 6990 + }, + { + "epoch": 1.25, + "grad_norm": 0.5695585012435913, + "learning_rate": 4.265876963231779e-05, + "loss": 2.0957, + "step": 6995 + }, + { + "epoch": 1.25, + "grad_norm": 6.371403694152832, + "learning_rate": 4.2648814217651013e-05, + "loss": 1.9637, + "step": 7000 + }, + { + "epoch": 1.25, + "grad_norm": 0.9307239651679993, + "learning_rate": 4.2638853220681276e-05, + "loss": 2.0494, + "step": 7005 + }, + { + "epoch": 1.25, + "grad_norm": 1.0416122674942017, + "learning_rate": 4.262888664455923e-05, + "loss": 1.9864, + "step": 7010 + }, + { + "epoch": 1.26, + "grad_norm": 0.673014223575592, + "learning_rate": 4.261891449243729e-05, + "loss": 1.7286, + "step": 7015 + }, + { + "epoch": 1.26, + "grad_norm": 0.7358102202415466, + "learning_rate": 4.2608936767469655e-05, + "loss": 1.7191, + "step": 7020 + }, + { + "epoch": 1.26, + "grad_norm": 0.7470425367355347, + "learning_rate": 4.259895347281225e-05, + "loss": 1.6247, + "step": 7025 + }, + { + "epoch": 1.26, + "grad_norm": 0.47723567485809326, + "learning_rate": 4.258896461162279e-05, + "loss": 1.7392, + "step": 7030 + }, + { + "epoch": 1.26, + "grad_norm": 0.9144250154495239, + "learning_rate": 4.257897018706074e-05, + "loss": 1.8239, + "step": 7035 + }, + { + "epoch": 1.26, + "grad_norm": 0.484836608171463, + "learning_rate": 4.2568970202287334e-05, + "loss": 1.6783, + "step": 7040 + }, + { + "epoch": 1.26, + "grad_norm": 1.3341679573059082, + "learning_rate": 4.255896466046555e-05, + "loss": 1.8347, + "step": 7045 + }, + { + "epoch": 1.26, + "grad_norm": 0.928821861743927, + "learning_rate": 4.254895356476013e-05, + "loss": 1.8622, + "step": 7050 + }, + { + "epoch": 1.26, + "grad_norm": 0.6659140586853027, + "learning_rate": 4.253893691833758e-05, + "loss": 2.1149, + "step": 7055 + }, + { + "epoch": 1.26, + "grad_norm": 0.630410373210907, + "learning_rate": 4.252891472436616e-05, + "loss": 1.8716, + "step": 7060 + }, + { + "epoch": 1.26, + "grad_norm": 0.746567964553833, + "learning_rate": 4.251888698601587e-05, + "loss": 1.8274, + "step": 7065 + }, + { + "epoch": 1.27, + "grad_norm": 0.6550970673561096, + "learning_rate": 4.250885370645847e-05, + "loss": 1.9049, + "step": 7070 + }, + { + "epoch": 1.27, + "grad_norm": 0.8050583004951477, + "learning_rate": 4.249881488886749e-05, + "loss": 1.9531, + "step": 7075 + }, + { + "epoch": 1.27, + "grad_norm": 0.8060429692268372, + "learning_rate": 4.248877053641819e-05, + "loss": 1.84, + "step": 7080 + }, + { + "epoch": 1.27, + "grad_norm": 0.6672247648239136, + "learning_rate": 4.2478720652287596e-05, + "loss": 1.9365, + "step": 7085 + }, + { + "epoch": 1.27, + "grad_norm": 0.9606338739395142, + "learning_rate": 4.246866523965446e-05, + "loss": 1.8334, + "step": 7090 + }, + { + "epoch": 1.27, + "grad_norm": 0.8220806121826172, + "learning_rate": 4.2458604301699324e-05, + "loss": 2.2147, + "step": 7095 + }, + { + "epoch": 1.27, + "grad_norm": 0.5984706282615662, + "learning_rate": 4.2448537841604435e-05, + "loss": 1.9588, + "step": 7100 + }, + { + "epoch": 1.27, + "grad_norm": 0.6690345406532288, + "learning_rate": 4.243846586255381e-05, + "loss": 1.8757, + "step": 7105 + }, + { + "epoch": 1.27, + "grad_norm": 2.075214385986328, + "learning_rate": 4.242838836773321e-05, + "loss": 1.7474, + "step": 7110 + }, + { + "epoch": 1.27, + "grad_norm": 0.8031553626060486, + "learning_rate": 4.2418305360330134e-05, + "loss": 1.6749, + "step": 7115 + }, + { + "epoch": 1.27, + "grad_norm": 0.6923268437385559, + "learning_rate": 4.240821684353382e-05, + "loss": 1.7302, + "step": 7120 + }, + { + "epoch": 1.28, + "grad_norm": 0.5841795206069946, + "learning_rate": 4.2398122820535266e-05, + "loss": 1.7478, + "step": 7125 + }, + { + "epoch": 1.28, + "grad_norm": 0.6711515188217163, + "learning_rate": 4.2388023294527204e-05, + "loss": 2.0452, + "step": 7130 + }, + { + "epoch": 1.28, + "grad_norm": 0.6497505903244019, + "learning_rate": 4.237791826870409e-05, + "loss": 1.7706, + "step": 7135 + }, + { + "epoch": 1.28, + "grad_norm": 0.7864512801170349, + "learning_rate": 4.236780774626215e-05, + "loss": 2.1351, + "step": 7140 + }, + { + "epoch": 1.28, + "grad_norm": 0.8343969583511353, + "learning_rate": 4.235769173039932e-05, + "loss": 1.7934, + "step": 7145 + }, + { + "epoch": 1.28, + "grad_norm": 0.734727144241333, + "learning_rate": 4.234959496459615e-05, + "loss": 1.9032, + "step": 7150 + }, + { + "epoch": 1.28, + "grad_norm": 1.37156343460083, + "learning_rate": 4.233946906864011e-05, + "loss": 2.0043, + "step": 7155 + }, + { + "epoch": 1.28, + "grad_norm": 0.48946473002433777, + "learning_rate": 4.232933768822668e-05, + "loss": 2.1292, + "step": 7160 + }, + { + "epoch": 1.28, + "grad_norm": 0.64715576171875, + "learning_rate": 4.231920082656041e-05, + "loss": 1.7692, + "step": 7165 + }, + { + "epoch": 1.28, + "grad_norm": 0.7342175245285034, + "learning_rate": 4.230905848684757e-05, + "loss": 1.8173, + "step": 7170 + }, + { + "epoch": 1.28, + "grad_norm": 1.2754048109054565, + "learning_rate": 4.2298910672296186e-05, + "loss": 1.6126, + "step": 7175 + }, + { + "epoch": 1.29, + "grad_norm": 0.9194117784500122, + "learning_rate": 4.2288757386115994e-05, + "loss": 1.9669, + "step": 7180 + }, + { + "epoch": 1.29, + "grad_norm": 3.632533550262451, + "learning_rate": 4.227859863151847e-05, + "loss": 1.7307, + "step": 7185 + }, + { + "epoch": 1.29, + "grad_norm": 0.4630647599697113, + "learning_rate": 4.226843441171682e-05, + "loss": 1.6671, + "step": 7190 + }, + { + "epoch": 1.29, + "grad_norm": 0.5320610404014587, + "learning_rate": 4.225826472992598e-05, + "loss": 1.8363, + "step": 7195 + }, + { + "epoch": 1.29, + "grad_norm": 0.6387033462524414, + "learning_rate": 4.2248089589362593e-05, + "loss": 2.0306, + "step": 7200 + }, + { + "epoch": 1.29, + "grad_norm": 0.5456752181053162, + "learning_rate": 4.223790899324507e-05, + "loss": 1.7391, + "step": 7205 + }, + { + "epoch": 1.29, + "grad_norm": 6.061885833740234, + "learning_rate": 4.2227722944793515e-05, + "loss": 1.9154, + "step": 7210 + }, + { + "epoch": 1.29, + "grad_norm": 0.6948996186256409, + "learning_rate": 4.221753144722976e-05, + "loss": 1.8669, + "step": 7215 + }, + { + "epoch": 1.29, + "grad_norm": 0.5726954936981201, + "learning_rate": 4.220733450377737e-05, + "loss": 2.0606, + "step": 7220 + }, + { + "epoch": 1.29, + "grad_norm": 0.9554140567779541, + "learning_rate": 4.219713211766162e-05, + "loss": 1.8438, + "step": 7225 + }, + { + "epoch": 1.29, + "grad_norm": 0.6293787360191345, + "learning_rate": 4.218692429210953e-05, + "loss": 1.825, + "step": 7230 + }, + { + "epoch": 1.3, + "grad_norm": 1.0556727647781372, + "learning_rate": 4.217671103034982e-05, + "loss": 1.7117, + "step": 7235 + }, + { + "epoch": 1.3, + "grad_norm": 0.6158782243728638, + "learning_rate": 4.216649233561293e-05, + "loss": 1.9096, + "step": 7240 + }, + { + "epoch": 1.3, + "grad_norm": 0.9434940218925476, + "learning_rate": 4.2156268211131027e-05, + "loss": 1.8082, + "step": 7245 + }, + { + "epoch": 1.3, + "grad_norm": 0.7455072402954102, + "learning_rate": 4.214603866013799e-05, + "loss": 1.9597, + "step": 7250 + }, + { + "epoch": 1.3, + "grad_norm": 0.912811279296875, + "learning_rate": 4.213580368586942e-05, + "loss": 1.8557, + "step": 7255 + }, + { + "epoch": 1.3, + "grad_norm": 0.6361580491065979, + "learning_rate": 4.212556329156263e-05, + "loss": 2.0177, + "step": 7260 + }, + { + "epoch": 1.3, + "grad_norm": 6.6606597900390625, + "learning_rate": 4.211531748045664e-05, + "loss": 1.7674, + "step": 7265 + }, + { + "epoch": 1.3, + "grad_norm": 0.5496176481246948, + "learning_rate": 4.2105066255792185e-05, + "loss": 2.1907, + "step": 7270 + }, + { + "epoch": 1.3, + "grad_norm": 0.6689378023147583, + "learning_rate": 4.209480962081174e-05, + "loss": 1.7675, + "step": 7275 + }, + { + "epoch": 1.3, + "grad_norm": 0.8176302909851074, + "learning_rate": 4.208454757875945e-05, + "loss": 1.6746, + "step": 7280 + }, + { + "epoch": 1.3, + "grad_norm": 0.5315504670143127, + "learning_rate": 4.2074280132881184e-05, + "loss": 1.6911, + "step": 7285 + }, + { + "epoch": 1.3, + "grad_norm": 0.6745172739028931, + "learning_rate": 4.2064007286424545e-05, + "loss": 1.786, + "step": 7290 + }, + { + "epoch": 1.31, + "grad_norm": 0.8373345136642456, + "learning_rate": 4.205372904263881e-05, + "loss": 2.0911, + "step": 7295 + }, + { + "epoch": 1.31, + "grad_norm": 0.54158616065979, + "learning_rate": 4.204344540477499e-05, + "loss": 1.7611, + "step": 7300 + }, + { + "epoch": 1.31, + "grad_norm": 0.6230586767196655, + "learning_rate": 4.203315637608578e-05, + "loss": 1.7406, + "step": 7305 + }, + { + "epoch": 1.31, + "grad_norm": 2.31952166557312, + "learning_rate": 4.202286195982559e-05, + "loss": 1.8494, + "step": 7310 + }, + { + "epoch": 1.31, + "grad_norm": 0.6334202885627747, + "learning_rate": 4.201256215925054e-05, + "loss": 1.6764, + "step": 7315 + }, + { + "epoch": 1.31, + "grad_norm": 0.43206092715263367, + "learning_rate": 4.200225697761844e-05, + "loss": 1.9862, + "step": 7320 + }, + { + "epoch": 1.31, + "grad_norm": 0.6146365404129028, + "learning_rate": 4.1991946418188805e-05, + "loss": 2.0393, + "step": 7325 + }, + { + "epoch": 1.31, + "grad_norm": 0.48394322395324707, + "learning_rate": 4.198163048422287e-05, + "loss": 1.8087, + "step": 7330 + }, + { + "epoch": 1.31, + "grad_norm": 0.8574031591415405, + "learning_rate": 4.197130917898355e-05, + "loss": 1.9837, + "step": 7335 + }, + { + "epoch": 1.31, + "grad_norm": 0.4108412563800812, + "learning_rate": 4.196098250573545e-05, + "loss": 2.2631, + "step": 7340 + }, + { + "epoch": 1.31, + "grad_norm": 0.6564179062843323, + "learning_rate": 4.195065046774491e-05, + "loss": 2.0216, + "step": 7345 + }, + { + "epoch": 1.32, + "grad_norm": 0.3432505130767822, + "learning_rate": 4.194031306827993e-05, + "loss": 2.1613, + "step": 7350 + }, + { + "epoch": 1.32, + "grad_norm": 0.463270366191864, + "learning_rate": 4.192997031061021e-05, + "loss": 1.6699, + "step": 7355 + }, + { + "epoch": 1.32, + "grad_norm": 0.4397541284561157, + "learning_rate": 4.1919622198007176e-05, + "loss": 1.7908, + "step": 7360 + }, + { + "epoch": 1.32, + "grad_norm": 0.724583089351654, + "learning_rate": 4.190926873374391e-05, + "loss": 1.9014, + "step": 7365 + }, + { + "epoch": 1.32, + "grad_norm": 1.0059683322906494, + "learning_rate": 4.189890992109521e-05, + "loss": 1.6797, + "step": 7370 + }, + { + "epoch": 1.32, + "grad_norm": 1.4795385599136353, + "learning_rate": 4.1888545763337554e-05, + "loss": 1.8065, + "step": 7375 + }, + { + "epoch": 1.32, + "grad_norm": 0.9191558361053467, + "learning_rate": 4.187817626374912e-05, + "loss": 1.9115, + "step": 7380 + }, + { + "epoch": 1.32, + "grad_norm": 0.3600243628025055, + "learning_rate": 4.186780142560977e-05, + "loss": 2.096, + "step": 7385 + }, + { + "epoch": 1.32, + "grad_norm": 0.8530924320220947, + "learning_rate": 4.185742125220106e-05, + "loss": 1.5549, + "step": 7390 + }, + { + "epoch": 1.32, + "grad_norm": 0.6997917890548706, + "learning_rate": 4.1847035746806205e-05, + "loss": 1.9537, + "step": 7395 + }, + { + "epoch": 1.32, + "grad_norm": 0.5921128392219543, + "learning_rate": 4.1836644912710166e-05, + "loss": 2.0546, + "step": 7400 + }, + { + "epoch": 1.33, + "grad_norm": 0.5461764335632324, + "learning_rate": 4.1826248753199526e-05, + "loss": 1.5721, + "step": 7405 + }, + { + "epoch": 1.33, + "grad_norm": 0.8535491228103638, + "learning_rate": 4.1815847271562594e-05, + "loss": 1.8103, + "step": 7410 + }, + { + "epoch": 1.33, + "grad_norm": 0.9533465504646301, + "learning_rate": 4.180544047108934e-05, + "loss": 1.87, + "step": 7415 + }, + { + "epoch": 1.33, + "grad_norm": 0.49469488859176636, + "learning_rate": 4.179502835507143e-05, + "loss": 1.8931, + "step": 7420 + }, + { + "epoch": 1.33, + "grad_norm": 0.4135587215423584, + "learning_rate": 4.1784610926802214e-05, + "loss": 2.1653, + "step": 7425 + }, + { + "epoch": 1.33, + "grad_norm": 0.6948500871658325, + "learning_rate": 4.1774188189576705e-05, + "loss": 1.8297, + "step": 7430 + }, + { + "epoch": 1.33, + "grad_norm": 0.7468460202217102, + "learning_rate": 4.17637601466916e-05, + "loss": 1.9556, + "step": 7435 + }, + { + "epoch": 1.33, + "grad_norm": 0.8151171207427979, + "learning_rate": 4.1753326801445285e-05, + "loss": 1.6892, + "step": 7440 + }, + { + "epoch": 1.33, + "grad_norm": 0.8748107552528381, + "learning_rate": 4.174288815713783e-05, + "loss": 1.7057, + "step": 7445 + }, + { + "epoch": 1.33, + "grad_norm": 0.40320098400115967, + "learning_rate": 4.1732444217070943e-05, + "loss": 1.8913, + "step": 7450 + }, + { + "epoch": 1.33, + "grad_norm": 0.8582679629325867, + "learning_rate": 4.172199498454804e-05, + "loss": 1.9204, + "step": 7455 + }, + { + "epoch": 1.34, + "grad_norm": 0.35141122341156006, + "learning_rate": 4.1711540462874224e-05, + "loss": 1.9048, + "step": 7460 + }, + { + "epoch": 1.34, + "grad_norm": 0.42094314098358154, + "learning_rate": 4.170108065535622e-05, + "loss": 1.9885, + "step": 7465 + }, + { + "epoch": 1.34, + "grad_norm": 0.6282821297645569, + "learning_rate": 4.169061556530248e-05, + "loss": 1.7992, + "step": 7470 + }, + { + "epoch": 1.34, + "grad_norm": 0.7469441890716553, + "learning_rate": 4.168014519602309e-05, + "loss": 1.9007, + "step": 7475 + }, + { + "epoch": 1.34, + "grad_norm": 0.5168988704681396, + "learning_rate": 4.1669669550829815e-05, + "loss": 1.8918, + "step": 7480 + }, + { + "epoch": 1.34, + "grad_norm": 0.7953056693077087, + "learning_rate": 4.165918863303609e-05, + "loss": 2.1881, + "step": 7485 + }, + { + "epoch": 1.34, + "grad_norm": 0.6245642900466919, + "learning_rate": 4.164870244595703e-05, + "loss": 1.9878, + "step": 7490 + }, + { + "epoch": 1.34, + "grad_norm": 1.3559062480926514, + "learning_rate": 4.16382109929094e-05, + "loss": 2.1096, + "step": 7495 + }, + { + "epoch": 1.34, + "grad_norm": 1.1444271802902222, + "learning_rate": 4.162771427721164e-05, + "loss": 2.078, + "step": 7500 + }, + { + "epoch": 1.34, + "grad_norm": 0.5032902359962463, + "learning_rate": 4.161721230218384e-05, + "loss": 1.814, + "step": 7505 + }, + { + "epoch": 1.34, + "grad_norm": 1.2069705724716187, + "learning_rate": 4.1606705071147775e-05, + "loss": 1.8121, + "step": 7510 + }, + { + "epoch": 1.35, + "grad_norm": 0.5158296823501587, + "learning_rate": 4.1596192587426874e-05, + "loss": 1.7489, + "step": 7515 + }, + { + "epoch": 1.35, + "grad_norm": 0.6261261701583862, + "learning_rate": 4.158567485434622e-05, + "loss": 1.8973, + "step": 7520 + }, + { + "epoch": 1.35, + "grad_norm": 1.6703084707260132, + "learning_rate": 4.1575151875232565e-05, + "loss": 2.0307, + "step": 7525 + }, + { + "epoch": 1.35, + "grad_norm": 0.8123947381973267, + "learning_rate": 4.156462365341431e-05, + "loss": 1.6317, + "step": 7530 + }, + { + "epoch": 1.35, + "grad_norm": 1.7036327123641968, + "learning_rate": 4.1554090192221525e-05, + "loss": 1.4194, + "step": 7535 + }, + { + "epoch": 1.35, + "grad_norm": 0.5714519023895264, + "learning_rate": 4.154355149498594e-05, + "loss": 2.0492, + "step": 7540 + }, + { + "epoch": 1.35, + "grad_norm": 0.5328961610794067, + "learning_rate": 4.153300756504093e-05, + "loss": 1.8173, + "step": 7545 + }, + { + "epoch": 1.35, + "grad_norm": 0.6683387756347656, + "learning_rate": 4.152245840572153e-05, + "loss": 2.0118, + "step": 7550 + }, + { + "epoch": 1.35, + "grad_norm": 0.5071770548820496, + "learning_rate": 4.151190402036443e-05, + "loss": 1.9307, + "step": 7555 + }, + { + "epoch": 1.35, + "grad_norm": 0.5669893622398376, + "learning_rate": 4.150134441230797e-05, + "loss": 2.0048, + "step": 7560 + }, + { + "epoch": 1.35, + "grad_norm": 0.7352867722511292, + "learning_rate": 4.149077958489214e-05, + "loss": 2.0041, + "step": 7565 + }, + { + "epoch": 1.35, + "grad_norm": 0.7905187010765076, + "learning_rate": 4.148020954145859e-05, + "loss": 2.0104, + "step": 7570 + }, + { + "epoch": 1.36, + "grad_norm": 0.7344765067100525, + "learning_rate": 4.146963428535062e-05, + "loss": 1.8857, + "step": 7575 + }, + { + "epoch": 1.36, + "grad_norm": 0.9353702664375305, + "learning_rate": 4.1459053819913164e-05, + "loss": 1.9718, + "step": 7580 + }, + { + "epoch": 1.36, + "grad_norm": 0.8625378012657166, + "learning_rate": 4.144846814849282e-05, + "loss": 1.9804, + "step": 7585 + }, + { + "epoch": 1.36, + "grad_norm": 0.47366541624069214, + "learning_rate": 4.1437877274437817e-05, + "loss": 1.7501, + "step": 7590 + }, + { + "epoch": 1.36, + "grad_norm": 0.9116063117980957, + "learning_rate": 4.1427281201098046e-05, + "loss": 1.4518, + "step": 7595 + }, + { + "epoch": 1.36, + "grad_norm": 1.8927298784255981, + "learning_rate": 4.141667993182503e-05, + "loss": 1.7712, + "step": 7600 + }, + { + "epoch": 1.36, + "grad_norm": 0.8056049942970276, + "learning_rate": 4.140607346997194e-05, + "loss": 1.6893, + "step": 7605 + }, + { + "epoch": 1.36, + "grad_norm": 2.4493560791015625, + "learning_rate": 4.13954618188936e-05, + "loss": 1.5022, + "step": 7610 + }, + { + "epoch": 1.36, + "grad_norm": 0.4495934844017029, + "learning_rate": 4.138484498194645e-05, + "loss": 2.0982, + "step": 7615 + }, + { + "epoch": 1.36, + "grad_norm": 0.6876751780509949, + "learning_rate": 4.1374222962488594e-05, + "loss": 1.8517, + "step": 7620 + }, + { + "epoch": 1.36, + "grad_norm": 0.509681224822998, + "learning_rate": 4.136359576387976e-05, + "loss": 1.9499, + "step": 7625 + }, + { + "epoch": 1.37, + "grad_norm": 0.3566737771034241, + "learning_rate": 4.135296338948134e-05, + "loss": 2.1347, + "step": 7630 + }, + { + "epoch": 1.37, + "grad_norm": 0.6193088889122009, + "learning_rate": 4.1342325842656315e-05, + "loss": 1.9541, + "step": 7635 + }, + { + "epoch": 1.37, + "grad_norm": 0.8318669199943542, + "learning_rate": 4.133168312676935e-05, + "loss": 1.6854, + "step": 7640 + }, + { + "epoch": 1.37, + "grad_norm": 0.5070250034332275, + "learning_rate": 4.1321035245186726e-05, + "loss": 1.8134, + "step": 7645 + }, + { + "epoch": 1.37, + "grad_norm": 6.577232360839844, + "learning_rate": 4.1310382201276354e-05, + "loss": 2.0211, + "step": 7650 + }, + { + "epoch": 1.37, + "grad_norm": 1.0688321590423584, + "learning_rate": 4.1299723998407774e-05, + "loss": 1.5149, + "step": 7655 + }, + { + "epoch": 1.37, + "grad_norm": 3.326978921890259, + "learning_rate": 4.128906063995217e-05, + "loss": 2.042, + "step": 7660 + }, + { + "epoch": 1.37, + "grad_norm": 0.7662460803985596, + "learning_rate": 4.127839212928236e-05, + "loss": 1.7589, + "step": 7665 + }, + { + "epoch": 1.37, + "grad_norm": 0.7111654877662659, + "learning_rate": 4.126771846977277e-05, + "loss": 1.8719, + "step": 7670 + }, + { + "epoch": 1.37, + "grad_norm": 0.8834118843078613, + "learning_rate": 4.125703966479948e-05, + "loss": 2.0589, + "step": 7675 + }, + { + "epoch": 1.37, + "grad_norm": 0.599921464920044, + "learning_rate": 4.124635571774018e-05, + "loss": 1.9423, + "step": 7680 + }, + { + "epoch": 1.38, + "grad_norm": 0.6738287806510925, + "learning_rate": 4.12356666319742e-05, + "loss": 1.9728, + "step": 7685 + }, + { + "epoch": 1.38, + "grad_norm": 0.6264171004295349, + "learning_rate": 4.122497241088247e-05, + "loss": 1.961, + "step": 7690 + }, + { + "epoch": 1.38, + "grad_norm": 0.831802487373352, + "learning_rate": 4.121427305784758e-05, + "loss": 2.0417, + "step": 7695 + }, + { + "epoch": 1.38, + "grad_norm": 0.9367872476577759, + "learning_rate": 4.120356857625372e-05, + "loss": 1.8298, + "step": 7700 + }, + { + "epoch": 1.38, + "grad_norm": 0.6761164665222168, + "learning_rate": 4.11928589694867e-05, + "loss": 1.6786, + "step": 7705 + }, + { + "epoch": 1.38, + "grad_norm": 0.7731988430023193, + "learning_rate": 4.118214424093396e-05, + "loss": 1.8711, + "step": 7710 + }, + { + "epoch": 1.38, + "grad_norm": 0.530742347240448, + "learning_rate": 4.117142439398457e-05, + "loss": 2.009, + "step": 7715 + }, + { + "epoch": 1.38, + "grad_norm": 0.6078718900680542, + "learning_rate": 4.116069943202919e-05, + "loss": 1.9871, + "step": 7720 + }, + { + "epoch": 1.38, + "grad_norm": 0.6330639719963074, + "learning_rate": 4.114996935846014e-05, + "loss": 1.7457, + "step": 7725 + }, + { + "epoch": 1.38, + "grad_norm": 0.661787211894989, + "learning_rate": 4.113923417667131e-05, + "loss": 1.7891, + "step": 7730 + }, + { + "epoch": 1.38, + "grad_norm": 0.9162965416908264, + "learning_rate": 4.112849389005823e-05, + "loss": 1.9484, + "step": 7735 + }, + { + "epoch": 1.39, + "grad_norm": 0.8108046054840088, + "learning_rate": 4.1117748502018036e-05, + "loss": 1.5814, + "step": 7740 + }, + { + "epoch": 1.39, + "grad_norm": 0.5153409838676453, + "learning_rate": 4.110699801594951e-05, + "loss": 1.4343, + "step": 7745 + }, + { + "epoch": 1.39, + "grad_norm": 0.30527204275131226, + "learning_rate": 4.1096242435253e-05, + "loss": 2.0052, + "step": 7750 + }, + { + "epoch": 1.39, + "grad_norm": 0.5223674178123474, + "learning_rate": 4.108548176333049e-05, + "loss": 2.1677, + "step": 7755 + }, + { + "epoch": 1.39, + "grad_norm": 0.5189133286476135, + "learning_rate": 4.107471600358557e-05, + "loss": 1.9642, + "step": 7760 + }, + { + "epoch": 1.39, + "grad_norm": 0.6149393320083618, + "learning_rate": 4.1063945159423433e-05, + "loss": 1.8314, + "step": 7765 + }, + { + "epoch": 1.39, + "grad_norm": 0.6912689805030823, + "learning_rate": 4.105316923425091e-05, + "loss": 1.9248, + "step": 7770 + }, + { + "epoch": 1.39, + "grad_norm": 0.5106381177902222, + "learning_rate": 4.1042388231476384e-05, + "loss": 2.0442, + "step": 7775 + }, + { + "epoch": 1.39, + "grad_norm": 0.4784005880355835, + "learning_rate": 4.10316021545099e-05, + "loss": 1.7323, + "step": 7780 + }, + { + "epoch": 1.39, + "grad_norm": 0.8782150149345398, + "learning_rate": 4.102081100676307e-05, + "loss": 1.8736, + "step": 7785 + }, + { + "epoch": 1.39, + "grad_norm": 1.3147215843200684, + "learning_rate": 4.101001479164913e-05, + "loss": 1.7307, + "step": 7790 + }, + { + "epoch": 1.4, + "grad_norm": 0.5481510758399963, + "learning_rate": 4.099921351258292e-05, + "loss": 1.6104, + "step": 7795 + }, + { + "epoch": 1.4, + "grad_norm": 0.617058515548706, + "learning_rate": 4.098840717298085e-05, + "loss": 1.9166, + "step": 7800 + }, + { + "epoch": 1.4, + "grad_norm": 2.2154157161712646, + "learning_rate": 4.097759577626098e-05, + "loss": 1.8153, + "step": 7805 + }, + { + "epoch": 1.4, + "grad_norm": 0.7002790570259094, + "learning_rate": 4.096677932584293e-05, + "loss": 1.7273, + "step": 7810 + }, + { + "epoch": 1.4, + "grad_norm": 1.020469069480896, + "learning_rate": 4.095595782514794e-05, + "loss": 2.0839, + "step": 7815 + }, + { + "epoch": 1.4, + "grad_norm": 0.7469928860664368, + "learning_rate": 4.094513127759883e-05, + "loss": 1.7802, + "step": 7820 + }, + { + "epoch": 1.4, + "grad_norm": 0.54106605052948, + "learning_rate": 4.093429968662005e-05, + "loss": 1.7512, + "step": 7825 + }, + { + "epoch": 1.4, + "grad_norm": 0.8594502210617065, + "learning_rate": 4.09234630556376e-05, + "loss": 1.9866, + "step": 7830 + }, + { + "epoch": 1.4, + "grad_norm": 0.5994930267333984, + "learning_rate": 4.09126213880791e-05, + "loss": 1.8757, + "step": 7835 + }, + { + "epoch": 1.4, + "grad_norm": 1.0593057870864868, + "learning_rate": 4.090177468737375e-05, + "loss": 1.6099, + "step": 7840 + }, + { + "epoch": 1.4, + "grad_norm": 0.458686888217926, + "learning_rate": 4.089092295695238e-05, + "loss": 2.0074, + "step": 7845 + }, + { + "epoch": 1.41, + "grad_norm": 0.5471193790435791, + "learning_rate": 4.0880066200247346e-05, + "loss": 2.0962, + "step": 7850 + }, + { + "epoch": 1.41, + "grad_norm": 0.5086942315101624, + "learning_rate": 4.0869204420692665e-05, + "loss": 1.5895, + "step": 7855 + }, + { + "epoch": 1.41, + "grad_norm": 0.7652096748352051, + "learning_rate": 4.085833762172387e-05, + "loss": 2.0743, + "step": 7860 + }, + { + "epoch": 1.41, + "grad_norm": 2.0196499824523926, + "learning_rate": 4.084746580677816e-05, + "loss": 2.0508, + "step": 7865 + }, + { + "epoch": 1.41, + "grad_norm": 0.541129469871521, + "learning_rate": 4.083658897929426e-05, + "loss": 1.857, + "step": 7870 + }, + { + "epoch": 1.41, + "grad_norm": 2.4016308784484863, + "learning_rate": 4.082570714271248e-05, + "loss": 1.669, + "step": 7875 + }, + { + "epoch": 1.41, + "grad_norm": 0.5275091528892517, + "learning_rate": 4.081482030047476e-05, + "loss": 1.9097, + "step": 7880 + }, + { + "epoch": 1.41, + "grad_norm": 0.7216492295265198, + "learning_rate": 4.080392845602459e-05, + "loss": 1.9592, + "step": 7885 + }, + { + "epoch": 1.41, + "grad_norm": 0.5998548865318298, + "learning_rate": 4.079303161280706e-05, + "loss": 1.7357, + "step": 7890 + }, + { + "epoch": 1.41, + "grad_norm": 0.7786138653755188, + "learning_rate": 4.0782129774268815e-05, + "loss": 2.0231, + "step": 7895 + }, + { + "epoch": 1.41, + "grad_norm": 0.4815676510334015, + "learning_rate": 4.0771222943858104e-05, + "loss": 1.8351, + "step": 7900 + }, + { + "epoch": 1.41, + "grad_norm": 0.5540018677711487, + "learning_rate": 4.076031112502474e-05, + "loss": 1.8817, + "step": 7905 + }, + { + "epoch": 1.42, + "grad_norm": 1.1450433731079102, + "learning_rate": 4.074939432122014e-05, + "loss": 2.1311, + "step": 7910 + }, + { + "epoch": 1.42, + "grad_norm": 0.612622857093811, + "learning_rate": 4.073847253589725e-05, + "loss": 1.9964, + "step": 7915 + }, + { + "epoch": 1.42, + "grad_norm": 0.6572409868240356, + "learning_rate": 4.0727545772510645e-05, + "loss": 1.8965, + "step": 7920 + }, + { + "epoch": 1.42, + "grad_norm": 0.8887383937835693, + "learning_rate": 4.071661403451643e-05, + "loss": 1.9419, + "step": 7925 + }, + { + "epoch": 1.42, + "grad_norm": 0.8722497820854187, + "learning_rate": 4.070567732537232e-05, + "loss": 2.0361, + "step": 7930 + }, + { + "epoch": 1.42, + "grad_norm": 0.48383402824401855, + "learning_rate": 4.0694735648537584e-05, + "loss": 1.7524, + "step": 7935 + }, + { + "epoch": 1.42, + "grad_norm": 0.5321690440177917, + "learning_rate": 4.0683789007473055e-05, + "loss": 1.8623, + "step": 7940 + }, + { + "epoch": 1.42, + "grad_norm": 0.6886895298957825, + "learning_rate": 4.067283740564115e-05, + "loss": 1.5339, + "step": 7945 + }, + { + "epoch": 1.42, + "grad_norm": 1.0236868858337402, + "learning_rate": 4.066188084650584e-05, + "loss": 1.8861, + "step": 7950 + }, + { + "epoch": 1.42, + "grad_norm": 0.5006799697875977, + "learning_rate": 4.065091933353269e-05, + "loss": 2.1672, + "step": 7955 + }, + { + "epoch": 1.42, + "grad_norm": 1.0537916421890259, + "learning_rate": 4.0639952870188815e-05, + "loss": 1.8532, + "step": 7960 + }, + { + "epoch": 1.43, + "grad_norm": 0.6310824155807495, + "learning_rate": 4.062898145994288e-05, + "loss": 1.9918, + "step": 7965 + }, + { + "epoch": 1.43, + "grad_norm": 0.6223093271255493, + "learning_rate": 4.061800510626515e-05, + "loss": 1.8719, + "step": 7970 + }, + { + "epoch": 1.43, + "grad_norm": 2.3217649459838867, + "learning_rate": 4.060702381262742e-05, + "loss": 1.9299, + "step": 7975 + }, + { + "epoch": 1.43, + "grad_norm": 0.998820960521698, + "learning_rate": 4.0596037582503084e-05, + "loss": 1.8601, + "step": 7980 + }, + { + "epoch": 1.43, + "grad_norm": 0.5109826922416687, + "learning_rate": 4.0585046419367053e-05, + "loss": 1.8392, + "step": 7985 + }, + { + "epoch": 1.43, + "grad_norm": 0.45510098338127136, + "learning_rate": 4.057405032669582e-05, + "loss": 1.8877, + "step": 7990 + }, + { + "epoch": 1.43, + "grad_norm": 0.46888962388038635, + "learning_rate": 4.056304930796746e-05, + "loss": 1.783, + "step": 7995 + }, + { + "epoch": 1.43, + "grad_norm": 6.063878059387207, + "learning_rate": 4.055204336666158e-05, + "loss": 2.0542, + "step": 8000 + }, + { + "epoch": 1.43, + "grad_norm": 0.7420514822006226, + "learning_rate": 4.054103250625933e-05, + "loss": 2.0758, + "step": 8005 + }, + { + "epoch": 1.43, + "grad_norm": 0.3912978768348694, + "learning_rate": 4.053001673024346e-05, + "loss": 1.8226, + "step": 8010 + }, + { + "epoch": 1.43, + "grad_norm": 0.851407527923584, + "learning_rate": 4.051899604209823e-05, + "loss": 1.8753, + "step": 8015 + }, + { + "epoch": 1.44, + "grad_norm": 1.0191857814788818, + "learning_rate": 4.0507970445309476e-05, + "loss": 2.0584, + "step": 8020 + }, + { + "epoch": 1.44, + "grad_norm": 0.5699238181114197, + "learning_rate": 4.049693994336459e-05, + "loss": 1.9835, + "step": 8025 + }, + { + "epoch": 1.44, + "grad_norm": 0.5822308659553528, + "learning_rate": 4.0485904539752503e-05, + "loss": 1.8653, + "step": 8030 + }, + { + "epoch": 1.44, + "grad_norm": 0.7040959000587463, + "learning_rate": 4.0474864237963704e-05, + "loss": 1.7044, + "step": 8035 + }, + { + "epoch": 1.44, + "grad_norm": 0.6993011832237244, + "learning_rate": 4.046381904149024e-05, + "loss": 1.733, + "step": 8040 + }, + { + "epoch": 1.44, + "grad_norm": 0.9968368411064148, + "learning_rate": 4.045276895382568e-05, + "loss": 1.8138, + "step": 8045 + }, + { + "epoch": 1.44, + "grad_norm": 0.7696248888969421, + "learning_rate": 4.044171397846517e-05, + "loss": 1.8775, + "step": 8050 + }, + { + "epoch": 1.44, + "grad_norm": 0.6908630728721619, + "learning_rate": 4.0430654118905374e-05, + "loss": 1.8886, + "step": 8055 + }, + { + "epoch": 1.44, + "grad_norm": 0.7053937315940857, + "learning_rate": 4.041958937864453e-05, + "loss": 1.6934, + "step": 8060 + }, + { + "epoch": 1.44, + "grad_norm": 0.7439703941345215, + "learning_rate": 4.040851976118239e-05, + "loss": 1.8191, + "step": 8065 + }, + { + "epoch": 1.44, + "grad_norm": 0.9062896370887756, + "learning_rate": 4.039744527002027e-05, + "loss": 1.6822, + "step": 8070 + }, + { + "epoch": 1.45, + "grad_norm": 0.5621775388717651, + "learning_rate": 4.038636590866103e-05, + "loss": 1.7791, + "step": 8075 + }, + { + "epoch": 1.45, + "grad_norm": 0.6721243858337402, + "learning_rate": 4.0375281680609046e-05, + "loss": 1.9383, + "step": 8080 + }, + { + "epoch": 1.45, + "grad_norm": 0.5987261533737183, + "learning_rate": 4.036419258937026e-05, + "loss": 2.1479, + "step": 8085 + }, + { + "epoch": 1.45, + "grad_norm": 0.3930743634700775, + "learning_rate": 4.0353098638452134e-05, + "loss": 2.0115, + "step": 8090 + }, + { + "epoch": 1.45, + "grad_norm": 0.7518051862716675, + "learning_rate": 4.0341999831363676e-05, + "loss": 2.1983, + "step": 8095 + }, + { + "epoch": 1.45, + "grad_norm": 1.2611430883407593, + "learning_rate": 4.0330896171615434e-05, + "loss": 1.9812, + "step": 8100 + }, + { + "epoch": 1.45, + "grad_norm": 0.8846983313560486, + "learning_rate": 4.031978766271948e-05, + "loss": 1.7216, + "step": 8105 + }, + { + "epoch": 1.45, + "grad_norm": 1.0537869930267334, + "learning_rate": 4.030867430818941e-05, + "loss": 1.9612, + "step": 8110 + }, + { + "epoch": 1.45, + "grad_norm": 0.5428566336631775, + "learning_rate": 4.0297556111540393e-05, + "loss": 1.9787, + "step": 8115 + }, + { + "epoch": 1.45, + "grad_norm": 0.7369449138641357, + "learning_rate": 4.028643307628909e-05, + "loss": 1.8887, + "step": 8120 + }, + { + "epoch": 1.45, + "grad_norm": 1.3817769289016724, + "learning_rate": 4.027530520595371e-05, + "loss": 1.8445, + "step": 8125 + }, + { + "epoch": 1.46, + "grad_norm": 0.8463249802589417, + "learning_rate": 4.026417250405399e-05, + "loss": 2.0274, + "step": 8130 + }, + { + "epoch": 1.46, + "grad_norm": 1.6427886486053467, + "learning_rate": 4.025303497411118e-05, + "loss": 2.0033, + "step": 8135 + }, + { + "epoch": 1.46, + "grad_norm": 0.6956122517585754, + "learning_rate": 4.024189261964808e-05, + "loss": 1.5285, + "step": 8140 + }, + { + "epoch": 1.46, + "grad_norm": 0.7735291123390198, + "learning_rate": 4.023074544418901e-05, + "loss": 1.6345, + "step": 8145 + }, + { + "epoch": 1.46, + "grad_norm": 0.6430752873420715, + "learning_rate": 4.021959345125981e-05, + "loss": 1.7802, + "step": 8150 + }, + { + "epoch": 1.46, + "grad_norm": 0.8027344942092896, + "learning_rate": 4.0208436644387834e-05, + "loss": 1.6531, + "step": 8155 + }, + { + "epoch": 1.46, + "grad_norm": 2.263455867767334, + "learning_rate": 4.019727502710197e-05, + "loss": 1.8208, + "step": 8160 + }, + { + "epoch": 1.46, + "grad_norm": 0.4575340747833252, + "learning_rate": 4.018610860293264e-05, + "loss": 1.7946, + "step": 8165 + }, + { + "epoch": 1.46, + "grad_norm": 0.8558323383331299, + "learning_rate": 4.017493737541177e-05, + "loss": 1.9303, + "step": 8170 + }, + { + "epoch": 1.46, + "grad_norm": 0.5351092219352722, + "learning_rate": 4.0163761348072804e-05, + "loss": 1.8344, + "step": 8175 + }, + { + "epoch": 1.46, + "grad_norm": 0.9965373873710632, + "learning_rate": 4.01525805244507e-05, + "loss": 1.7591, + "step": 8180 + }, + { + "epoch": 1.47, + "grad_norm": 0.5350607633590698, + "learning_rate": 4.014139490808196e-05, + "loss": 1.8909, + "step": 8185 + }, + { + "epoch": 1.47, + "grad_norm": 0.6248508095741272, + "learning_rate": 4.0130204502504575e-05, + "loss": 1.7733, + "step": 8190 + }, + { + "epoch": 1.47, + "grad_norm": 0.5423145890235901, + "learning_rate": 4.0119009311258057e-05, + "loss": 1.9219, + "step": 8195 + }, + { + "epoch": 1.47, + "grad_norm": 0.892905056476593, + "learning_rate": 4.0107809337883435e-05, + "loss": 1.7403, + "step": 8200 + }, + { + "epoch": 1.47, + "grad_norm": 0.8622453808784485, + "learning_rate": 4.0096604585923246e-05, + "loss": 1.7071, + "step": 8205 + }, + { + "epoch": 1.47, + "grad_norm": 0.459806352853775, + "learning_rate": 4.008539505892156e-05, + "loss": 1.8587, + "step": 8210 + }, + { + "epoch": 1.47, + "grad_norm": 2.5942025184631348, + "learning_rate": 4.007418076042392e-05, + "loss": 1.7092, + "step": 8215 + }, + { + "epoch": 1.47, + "grad_norm": 0.9066646695137024, + "learning_rate": 4.0062961693977406e-05, + "loss": 1.9558, + "step": 8220 + }, + { + "epoch": 1.47, + "grad_norm": 1.168616533279419, + "learning_rate": 4.0051737863130594e-05, + "loss": 1.7656, + "step": 8225 + }, + { + "epoch": 1.47, + "grad_norm": 0.33232593536376953, + "learning_rate": 4.004050927143358e-05, + "loss": 1.9195, + "step": 8230 + }, + { + "epoch": 1.47, + "grad_norm": 0.546242892742157, + "learning_rate": 4.002927592243794e-05, + "loss": 1.9144, + "step": 8235 + }, + { + "epoch": 1.47, + "grad_norm": 0.4449472427368164, + "learning_rate": 4.00180378196968e-05, + "loss": 1.5383, + "step": 8240 + }, + { + "epoch": 1.48, + "grad_norm": 0.6070382595062256, + "learning_rate": 4.000679496676473e-05, + "loss": 1.5511, + "step": 8245 + }, + { + "epoch": 1.48, + "grad_norm": 0.3525921404361725, + "learning_rate": 3.9995547367197845e-05, + "loss": 1.9672, + "step": 8250 + }, + { + "epoch": 1.48, + "grad_norm": 2.3565738201141357, + "learning_rate": 3.998429502455375e-05, + "loss": 1.61, + "step": 8255 + }, + { + "epoch": 1.48, + "grad_norm": 0.6370993256568909, + "learning_rate": 3.9973037942391564e-05, + "loss": 1.9055, + "step": 8260 + }, + { + "epoch": 1.48, + "grad_norm": 0.565390408039093, + "learning_rate": 3.996177612427187e-05, + "loss": 1.8377, + "step": 8265 + }, + { + "epoch": 1.48, + "grad_norm": 0.6728577017784119, + "learning_rate": 3.995050957375678e-05, + "loss": 1.9646, + "step": 8270 + }, + { + "epoch": 1.48, + "grad_norm": 1.3911441564559937, + "learning_rate": 3.99392382944099e-05, + "loss": 1.8547, + "step": 8275 + }, + { + "epoch": 1.48, + "grad_norm": 1.7410084009170532, + "learning_rate": 3.992796228979631e-05, + "loss": 1.803, + "step": 8280 + }, + { + "epoch": 1.48, + "grad_norm": 0.7153067588806152, + "learning_rate": 3.991668156348261e-05, + "loss": 1.7957, + "step": 8285 + }, + { + "epoch": 1.48, + "grad_norm": 0.6121552586555481, + "learning_rate": 3.9905396119036876e-05, + "loss": 1.8041, + "step": 8290 + }, + { + "epoch": 1.48, + "grad_norm": 0.8138649463653564, + "learning_rate": 3.989410596002869e-05, + "loss": 1.8954, + "step": 8295 + }, + { + "epoch": 1.49, + "grad_norm": 1.1422274112701416, + "learning_rate": 3.9882811090029106e-05, + "loss": 1.8859, + "step": 8300 + }, + { + "epoch": 1.49, + "grad_norm": 0.29510730504989624, + "learning_rate": 3.98715115126107e-05, + "loss": 1.9913, + "step": 8305 + }, + { + "epoch": 1.49, + "grad_norm": 0.8496959209442139, + "learning_rate": 3.986020723134751e-05, + "loss": 1.8877, + "step": 8310 + }, + { + "epoch": 1.49, + "grad_norm": 0.5750755071640015, + "learning_rate": 3.984889824981506e-05, + "loss": 1.6277, + "step": 8315 + }, + { + "epoch": 1.49, + "grad_norm": 0.6985676288604736, + "learning_rate": 3.983758457159037e-05, + "loss": 1.823, + "step": 8320 + }, + { + "epoch": 1.49, + "grad_norm": 1.0394619703292847, + "learning_rate": 3.982626620025196e-05, + "loss": 2.0183, + "step": 8325 + }, + { + "epoch": 1.49, + "grad_norm": 0.700357973575592, + "learning_rate": 3.9814943139379815e-05, + "loss": 1.825, + "step": 8330 + }, + { + "epoch": 1.49, + "grad_norm": 0.3674771785736084, + "learning_rate": 3.980361539255541e-05, + "loss": 2.0308, + "step": 8335 + }, + { + "epoch": 1.49, + "grad_norm": 0.899864912033081, + "learning_rate": 3.979228296336168e-05, + "loss": 1.9477, + "step": 8340 + }, + { + "epoch": 1.49, + "grad_norm": 0.9185581803321838, + "learning_rate": 3.978094585538308e-05, + "loss": 1.8853, + "step": 8345 + }, + { + "epoch": 1.49, + "grad_norm": 0.47985824942588806, + "learning_rate": 3.9769604072205524e-05, + "loss": 1.742, + "step": 8350 + }, + { + "epoch": 1.5, + "grad_norm": 0.42667725682258606, + "learning_rate": 3.9758257617416414e-05, + "loss": 1.935, + "step": 8355 + }, + { + "epoch": 1.5, + "grad_norm": 1.3752597570419312, + "learning_rate": 3.974690649460461e-05, + "loss": 1.9357, + "step": 8360 + }, + { + "epoch": 1.5, + "grad_norm": 0.3676702380180359, + "learning_rate": 3.973555070736047e-05, + "loss": 1.7689, + "step": 8365 + }, + { + "epoch": 1.5, + "grad_norm": 0.7576048374176025, + "learning_rate": 3.9724190259275804e-05, + "loss": 1.9353, + "step": 8370 + }, + { + "epoch": 1.5, + "grad_norm": 0.8016186356544495, + "learning_rate": 3.9712825153943934e-05, + "loss": 1.8762, + "step": 8375 + }, + { + "epoch": 1.5, + "grad_norm": 0.8371986150741577, + "learning_rate": 3.970145539495962e-05, + "loss": 1.7327, + "step": 8380 + }, + { + "epoch": 1.5, + "grad_norm": 0.9057819843292236, + "learning_rate": 3.9690080985919096e-05, + "loss": 1.5471, + "step": 8385 + }, + { + "epoch": 1.5, + "grad_norm": 1.966324806213379, + "learning_rate": 3.9678701930420095e-05, + "loss": 1.915, + "step": 8390 + }, + { + "epoch": 1.5, + "grad_norm": 1.0594321489334106, + "learning_rate": 3.966731823206179e-05, + "loss": 2.0922, + "step": 8395 + }, + { + "epoch": 1.5, + "grad_norm": 0.3403851389884949, + "learning_rate": 3.965592989444484e-05, + "loss": 1.8441, + "step": 8400 + }, + { + "epoch": 1.5, + "grad_norm": 0.611869215965271, + "learning_rate": 3.964453692117136e-05, + "loss": 2.1253, + "step": 8405 + }, + { + "epoch": 1.51, + "grad_norm": 0.7241839170455933, + "learning_rate": 3.963313931584493e-05, + "loss": 1.7705, + "step": 8410 + }, + { + "epoch": 1.51, + "grad_norm": 1.3061637878417969, + "learning_rate": 3.9621737082070616e-05, + "loss": 1.9305, + "step": 8415 + }, + { + "epoch": 1.51, + "grad_norm": 1.108342170715332, + "learning_rate": 3.9610330223454926e-05, + "loss": 1.8427, + "step": 8420 + }, + { + "epoch": 1.51, + "grad_norm": 0.5697190761566162, + "learning_rate": 3.959891874360584e-05, + "loss": 1.7063, + "step": 8425 + }, + { + "epoch": 1.51, + "grad_norm": 0.7562960982322693, + "learning_rate": 3.9587502646132804e-05, + "loss": 2.0017, + "step": 8430 + }, + { + "epoch": 1.51, + "grad_norm": 1.1858628988265991, + "learning_rate": 3.95760819346467e-05, + "loss": 1.8896, + "step": 8435 + }, + { + "epoch": 1.51, + "grad_norm": 0.5219876766204834, + "learning_rate": 3.95646566127599e-05, + "loss": 1.8532, + "step": 8440 + }, + { + "epoch": 1.51, + "grad_norm": 0.36081287264823914, + "learning_rate": 3.955322668408623e-05, + "loss": 1.7424, + "step": 8445 + }, + { + "epoch": 1.51, + "grad_norm": 1.1630758047103882, + "learning_rate": 3.9541792152240965e-05, + "loss": 1.6692, + "step": 8450 + }, + { + "epoch": 1.51, + "grad_norm": 0.41798633337020874, + "learning_rate": 3.953035302084082e-05, + "loss": 1.8561, + "step": 8455 + }, + { + "epoch": 1.51, + "grad_norm": 0.932482898235321, + "learning_rate": 3.9518909293503994e-05, + "loss": 1.9695, + "step": 8460 + }, + { + "epoch": 1.52, + "grad_norm": 0.6918326616287231, + "learning_rate": 3.950746097385012e-05, + "loss": 1.8665, + "step": 8465 + }, + { + "epoch": 1.52, + "grad_norm": 0.8644784688949585, + "learning_rate": 3.9496008065500303e-05, + "loss": 1.9897, + "step": 8470 + }, + { + "epoch": 1.52, + "grad_norm": 0.774658203125, + "learning_rate": 3.9484550572077075e-05, + "loss": 1.7163, + "step": 8475 + }, + { + "epoch": 1.52, + "grad_norm": 0.9765104651451111, + "learning_rate": 3.9473088497204424e-05, + "loss": 1.848, + "step": 8480 + }, + { + "epoch": 1.52, + "grad_norm": 0.5454521775245667, + "learning_rate": 3.946162184450781e-05, + "loss": 1.8886, + "step": 8485 + }, + { + "epoch": 1.52, + "grad_norm": 0.514015793800354, + "learning_rate": 3.945015061761412e-05, + "loss": 2.0871, + "step": 8490 + }, + { + "epoch": 1.52, + "grad_norm": 0.8409627079963684, + "learning_rate": 3.943867482015169e-05, + "loss": 1.8131, + "step": 8495 + }, + { + "epoch": 1.52, + "grad_norm": 0.5961334705352783, + "learning_rate": 3.94271944557503e-05, + "loss": 1.9685, + "step": 8500 + }, + { + "epoch": 1.52, + "grad_norm": 1.0173949003219604, + "learning_rate": 3.941570952804118e-05, + "loss": 1.8924, + "step": 8505 + }, + { + "epoch": 1.52, + "grad_norm": 0.6633391380310059, + "learning_rate": 3.9404220040657e-05, + "loss": 1.8846, + "step": 8510 + }, + { + "epoch": 1.52, + "grad_norm": 12.116934776306152, + "learning_rate": 3.9392725997231874e-05, + "loss": 1.9804, + "step": 8515 + }, + { + "epoch": 1.53, + "grad_norm": 0.48671266436576843, + "learning_rate": 3.938122740140137e-05, + "loss": 1.921, + "step": 8520 + }, + { + "epoch": 1.53, + "grad_norm": 0.747097373008728, + "learning_rate": 3.936972425680246e-05, + "loss": 1.962, + "step": 8525 + }, + { + "epoch": 1.53, + "grad_norm": 0.5572634935379028, + "learning_rate": 3.935821656707359e-05, + "loss": 1.6872, + "step": 8530 + }, + { + "epoch": 1.53, + "grad_norm": 0.41499078273773193, + "learning_rate": 3.934670433585464e-05, + "loss": 1.7176, + "step": 8535 + }, + { + "epoch": 1.53, + "grad_norm": 0.8987358212471008, + "learning_rate": 3.93351875667869e-05, + "loss": 2.0023, + "step": 8540 + }, + { + "epoch": 1.53, + "grad_norm": 0.6154505610466003, + "learning_rate": 3.932366626351313e-05, + "loss": 2.1713, + "step": 8545 + }, + { + "epoch": 1.53, + "grad_norm": 0.38522958755493164, + "learning_rate": 3.931214042967749e-05, + "loss": 1.9133, + "step": 8550 + }, + { + "epoch": 1.53, + "grad_norm": 0.7890143990516663, + "learning_rate": 3.93006100689256e-05, + "loss": 2.0062, + "step": 8555 + }, + { + "epoch": 1.53, + "grad_norm": 0.5682582855224609, + "learning_rate": 3.92890751849045e-05, + "loss": 1.9966, + "step": 8560 + }, + { + "epoch": 1.53, + "grad_norm": 0.6989255547523499, + "learning_rate": 3.9277535781262676e-05, + "loss": 1.6446, + "step": 8565 + }, + { + "epoch": 1.53, + "grad_norm": 0.769821286201477, + "learning_rate": 3.926599186165001e-05, + "loss": 1.9349, + "step": 8570 + }, + { + "epoch": 1.53, + "grad_norm": 0.46688029170036316, + "learning_rate": 3.925444342971783e-05, + "loss": 2.1585, + "step": 8575 + }, + { + "epoch": 1.54, + "grad_norm": 0.609228789806366, + "learning_rate": 3.9242890489118914e-05, + "loss": 1.8908, + "step": 8580 + }, + { + "epoch": 1.54, + "grad_norm": 0.7954393625259399, + "learning_rate": 3.9231333043507436e-05, + "loss": 1.9862, + "step": 8585 + }, + { + "epoch": 1.54, + "grad_norm": 0.45553261041641235, + "learning_rate": 3.9219771096539e-05, + "loss": 2.0167, + "step": 8590 + }, + { + "epoch": 1.54, + "grad_norm": 0.3282541334629059, + "learning_rate": 3.9208204651870653e-05, + "loss": 2.0051, + "step": 8595 + }, + { + "epoch": 1.54, + "grad_norm": 0.7702216506004333, + "learning_rate": 3.9196633713160834e-05, + "loss": 1.8989, + "step": 8600 + }, + { + "epoch": 1.54, + "grad_norm": 1.1692415475845337, + "learning_rate": 3.918505828406942e-05, + "loss": 1.9096, + "step": 8605 + }, + { + "epoch": 1.54, + "grad_norm": 0.8676744699478149, + "learning_rate": 3.917347836825773e-05, + "loss": 1.9068, + "step": 8610 + }, + { + "epoch": 1.54, + "grad_norm": 0.6879375576972961, + "learning_rate": 3.9161893969388465e-05, + "loss": 1.8098, + "step": 8615 + }, + { + "epoch": 1.54, + "grad_norm": 0.6627622842788696, + "learning_rate": 3.9150305091125746e-05, + "loss": 1.7859, + "step": 8620 + }, + { + "epoch": 1.54, + "grad_norm": 3.1366701126098633, + "learning_rate": 3.913871173713515e-05, + "loss": 2.2024, + "step": 8625 + }, + { + "epoch": 1.54, + "grad_norm": 0.37004053592681885, + "learning_rate": 3.912711391108362e-05, + "loss": 1.688, + "step": 8630 + }, + { + "epoch": 1.55, + "grad_norm": 1.0141103267669678, + "learning_rate": 3.911551161663956e-05, + "loss": 2.1577, + "step": 8635 + }, + { + "epoch": 1.55, + "grad_norm": 0.5730118155479431, + "learning_rate": 3.9103904857472745e-05, + "loss": 1.8459, + "step": 8640 + }, + { + "epoch": 1.55, + "grad_norm": 0.5071932077407837, + "learning_rate": 3.9092293637254395e-05, + "loss": 2.0975, + "step": 8645 + }, + { + "epoch": 1.55, + "grad_norm": 1.376279354095459, + "learning_rate": 3.908067795965712e-05, + "loss": 2.0358, + "step": 8650 + }, + { + "epoch": 1.55, + "grad_norm": 1.1163078546524048, + "learning_rate": 3.9069057828354947e-05, + "loss": 2.0103, + "step": 8655 + }, + { + "epoch": 1.55, + "grad_norm": 0.7586190700531006, + "learning_rate": 3.905743324702332e-05, + "loss": 2.0341, + "step": 8660 + }, + { + "epoch": 1.55, + "grad_norm": 0.7790800333023071, + "learning_rate": 3.9045804219339076e-05, + "loss": 1.836, + "step": 8665 + }, + { + "epoch": 1.55, + "grad_norm": 0.8625195622444153, + "learning_rate": 3.903417074898047e-05, + "loss": 2.0557, + "step": 8670 + }, + { + "epoch": 1.55, + "grad_norm": 0.7535802721977234, + "learning_rate": 3.902253283962716e-05, + "loss": 1.9474, + "step": 8675 + }, + { + "epoch": 1.55, + "grad_norm": 0.9609790444374084, + "learning_rate": 3.9010890494960195e-05, + "loss": 1.8005, + "step": 8680 + }, + { + "epoch": 1.55, + "grad_norm": 0.8396172523498535, + "learning_rate": 3.8999243718662036e-05, + "loss": 1.8434, + "step": 8685 + }, + { + "epoch": 1.56, + "grad_norm": 0.6551474332809448, + "learning_rate": 3.8987592514416564e-05, + "loss": 1.8392, + "step": 8690 + }, + { + "epoch": 1.56, + "grad_norm": 0.9734852313995361, + "learning_rate": 3.8975936885909035e-05, + "loss": 1.8953, + "step": 8695 + }, + { + "epoch": 1.56, + "grad_norm": 0.37571898102760315, + "learning_rate": 3.896427683682611e-05, + "loss": 1.7285, + "step": 8700 + }, + { + "epoch": 1.56, + "grad_norm": 0.8209627866744995, + "learning_rate": 3.895261237085585e-05, + "loss": 1.833, + "step": 8705 + }, + { + "epoch": 1.56, + "grad_norm": 0.3994194269180298, + "learning_rate": 3.894094349168772e-05, + "loss": 2.1348, + "step": 8710 + }, + { + "epoch": 1.56, + "grad_norm": 0.766764223575592, + "learning_rate": 3.892927020301257e-05, + "loss": 1.9434, + "step": 8715 + }, + { + "epoch": 1.56, + "grad_norm": 0.7344279885292053, + "learning_rate": 3.891759250852266e-05, + "loss": 1.9841, + "step": 8720 + }, + { + "epoch": 1.56, + "grad_norm": 0.8418555855751038, + "learning_rate": 3.8905910411911625e-05, + "loss": 1.9083, + "step": 8725 + }, + { + "epoch": 1.56, + "grad_norm": 0.784453272819519, + "learning_rate": 3.8894223916874494e-05, + "loss": 2.1818, + "step": 8730 + }, + { + "epoch": 1.56, + "grad_norm": 0.8457757830619812, + "learning_rate": 3.8882533027107713e-05, + "loss": 2.121, + "step": 8735 + }, + { + "epoch": 1.56, + "grad_norm": 0.5709465742111206, + "learning_rate": 3.887083774630908e-05, + "loss": 1.8507, + "step": 8740 + }, + { + "epoch": 1.57, + "grad_norm": 0.5889132022857666, + "learning_rate": 3.885913807817781e-05, + "loss": 1.7544, + "step": 8745 + }, + { + "epoch": 1.57, + "grad_norm": 2.483522653579712, + "learning_rate": 3.8847434026414516e-05, + "loss": 2.0034, + "step": 8750 + }, + { + "epoch": 1.57, + "grad_norm": 0.6778923869132996, + "learning_rate": 3.883572559472114e-05, + "loss": 1.7116, + "step": 8755 + }, + { + "epoch": 1.57, + "grad_norm": 0.8211702704429626, + "learning_rate": 3.882401278680107e-05, + "loss": 1.7848, + "step": 8760 + }, + { + "epoch": 1.57, + "grad_norm": 1.0367469787597656, + "learning_rate": 3.881229560635905e-05, + "loss": 1.9696, + "step": 8765 + }, + { + "epoch": 1.57, + "grad_norm": 0.47565755248069763, + "learning_rate": 3.8800574057101227e-05, + "loss": 1.8395, + "step": 8770 + }, + { + "epoch": 1.57, + "grad_norm": 5.1431565284729, + "learning_rate": 3.878884814273509e-05, + "loss": 1.5831, + "step": 8775 + }, + { + "epoch": 1.57, + "grad_norm": 0.6216502785682678, + "learning_rate": 3.877711786696956e-05, + "loss": 2.0567, + "step": 8780 + }, + { + "epoch": 1.57, + "grad_norm": 1.0751134157180786, + "learning_rate": 3.8765383233514895e-05, + "loss": 1.9522, + "step": 8785 + }, + { + "epoch": 1.57, + "grad_norm": 0.7744889259338379, + "learning_rate": 3.875364424608275e-05, + "loss": 1.8294, + "step": 8790 + }, + { + "epoch": 1.57, + "grad_norm": 0.7963035106658936, + "learning_rate": 3.874190090838616e-05, + "loss": 2.0646, + "step": 8795 + }, + { + "epoch": 1.58, + "grad_norm": 0.6751787662506104, + "learning_rate": 3.873015322413954e-05, + "loss": 1.864, + "step": 8800 + }, + { + "epoch": 1.58, + "grad_norm": 1.275324821472168, + "learning_rate": 3.871840119705866e-05, + "loss": 1.6613, + "step": 8805 + }, + { + "epoch": 1.58, + "grad_norm": 11.015603065490723, + "learning_rate": 3.870664483086067e-05, + "loss": 1.9226, + "step": 8810 + }, + { + "epoch": 1.58, + "grad_norm": 0.6475034952163696, + "learning_rate": 3.869488412926411e-05, + "loss": 1.6963, + "step": 8815 + }, + { + "epoch": 1.58, + "grad_norm": 0.9630057215690613, + "learning_rate": 3.8683119095988865e-05, + "loss": 2.0588, + "step": 8820 + }, + { + "epoch": 1.58, + "grad_norm": 0.8182287812232971, + "learning_rate": 3.867134973475622e-05, + "loss": 2.4044, + "step": 8825 + }, + { + "epoch": 1.58, + "grad_norm": 1.1581512689590454, + "learning_rate": 3.8659576049288806e-05, + "loss": 1.6287, + "step": 8830 + }, + { + "epoch": 1.58, + "grad_norm": 0.7855355143547058, + "learning_rate": 3.864779804331061e-05, + "loss": 1.6889, + "step": 8835 + }, + { + "epoch": 1.58, + "grad_norm": 0.5677868723869324, + "learning_rate": 3.863601572054704e-05, + "loss": 1.8155, + "step": 8840 + }, + { + "epoch": 1.58, + "grad_norm": 0.6653741002082825, + "learning_rate": 3.8624229084724804e-05, + "loss": 2.2333, + "step": 8845 + }, + { + "epoch": 1.58, + "grad_norm": 0.5179418325424194, + "learning_rate": 3.861243813957201e-05, + "loss": 1.884, + "step": 8850 + }, + { + "epoch": 1.59, + "grad_norm": 0.8245850801467896, + "learning_rate": 3.8600642888818125e-05, + "loss": 1.9703, + "step": 8855 + }, + { + "epoch": 1.59, + "grad_norm": 0.5349978804588318, + "learning_rate": 3.8588843336193994e-05, + "loss": 2.0805, + "step": 8860 + }, + { + "epoch": 1.59, + "grad_norm": 0.5746064782142639, + "learning_rate": 3.857703948543176e-05, + "loss": 1.5872, + "step": 8865 + }, + { + "epoch": 1.59, + "grad_norm": 0.7160893082618713, + "learning_rate": 3.856523134026502e-05, + "loss": 1.8665, + "step": 8870 + }, + { + "epoch": 1.59, + "grad_norm": 1.8419469594955444, + "learning_rate": 3.8553418904428644e-05, + "loss": 1.4556, + "step": 8875 + }, + { + "epoch": 1.59, + "grad_norm": 0.6989808678627014, + "learning_rate": 3.854160218165891e-05, + "loss": 1.6613, + "step": 8880 + }, + { + "epoch": 1.59, + "grad_norm": 1.752234697341919, + "learning_rate": 3.8529781175693423e-05, + "loss": 1.8097, + "step": 8885 + }, + { + "epoch": 1.59, + "grad_norm": 0.7804884910583496, + "learning_rate": 3.851795589027117e-05, + "loss": 1.7908, + "step": 8890 + }, + { + "epoch": 1.59, + "grad_norm": 0.6645786762237549, + "learning_rate": 3.850612632913247e-05, + "loss": 2.0249, + "step": 8895 + }, + { + "epoch": 1.59, + "grad_norm": 0.454006165266037, + "learning_rate": 3.849429249601901e-05, + "loss": 1.7901, + "step": 8900 + }, + { + "epoch": 1.59, + "grad_norm": 0.7006505131721497, + "learning_rate": 3.8482454394673806e-05, + "loss": 1.9092, + "step": 8905 + }, + { + "epoch": 1.59, + "grad_norm": 0.7154279947280884, + "learning_rate": 3.8470612028841245e-05, + "loss": 1.8582, + "step": 8910 + }, + { + "epoch": 1.6, + "grad_norm": 0.37950852513313293, + "learning_rate": 3.845876540226706e-05, + "loss": 1.9758, + "step": 8915 + }, + { + "epoch": 1.6, + "grad_norm": 0.5836817622184753, + "learning_rate": 3.844691451869832e-05, + "loss": 1.7204, + "step": 8920 + }, + { + "epoch": 1.6, + "grad_norm": 0.4787684679031372, + "learning_rate": 3.843505938188346e-05, + "loss": 1.781, + "step": 8925 + }, + { + "epoch": 1.6, + "grad_norm": 0.5430890917778015, + "learning_rate": 3.8423199995572235e-05, + "loss": 2.0103, + "step": 8930 + }, + { + "epoch": 1.6, + "grad_norm": 0.5893062353134155, + "learning_rate": 3.841133636351576e-05, + "loss": 1.9518, + "step": 8935 + }, + { + "epoch": 1.6, + "grad_norm": 0.7547013163566589, + "learning_rate": 3.83994684894665e-05, + "loss": 1.7754, + "step": 8940 + }, + { + "epoch": 1.6, + "grad_norm": 2.5084164142608643, + "learning_rate": 3.8387596377178236e-05, + "loss": 1.6288, + "step": 8945 + }, + { + "epoch": 1.6, + "grad_norm": 0.533129870891571, + "learning_rate": 3.837572003040612e-05, + "loss": 1.7496, + "step": 8950 + }, + { + "epoch": 1.6, + "grad_norm": 0.6257514357566833, + "learning_rate": 3.8363839452906627e-05, + "loss": 1.8528, + "step": 8955 + }, + { + "epoch": 1.6, + "grad_norm": 0.8380822539329529, + "learning_rate": 3.835195464843757e-05, + "loss": 1.8115, + "step": 8960 + }, + { + "epoch": 1.6, + "grad_norm": 0.9693661332130432, + "learning_rate": 3.834006562075809e-05, + "loss": 1.6759, + "step": 8965 + }, + { + "epoch": 1.61, + "grad_norm": 0.6388950943946838, + "learning_rate": 3.83281723736287e-05, + "loss": 1.7906, + "step": 8970 + }, + { + "epoch": 1.61, + "grad_norm": 0.9138167500495911, + "learning_rate": 3.83162749108112e-05, + "loss": 1.7656, + "step": 8975 + }, + { + "epoch": 1.61, + "grad_norm": 0.5847556591033936, + "learning_rate": 3.830437323606876e-05, + "loss": 1.7698, + "step": 8980 + }, + { + "epoch": 1.61, + "grad_norm": 0.6280438899993896, + "learning_rate": 3.8292467353165864e-05, + "loss": 2.0533, + "step": 8985 + }, + { + "epoch": 1.61, + "grad_norm": 9.531767845153809, + "learning_rate": 3.828055726586832e-05, + "loss": 2.0169, + "step": 8990 + }, + { + "epoch": 1.61, + "grad_norm": 0.7199810743331909, + "learning_rate": 3.826864297794329e-05, + "loss": 1.8667, + "step": 8995 + }, + { + "epoch": 1.61, + "grad_norm": 0.5033280849456787, + "learning_rate": 3.8256724493159246e-05, + "loss": 1.6972, + "step": 9000 + }, + { + "epoch": 1.61, + "grad_norm": 1.0077893733978271, + "learning_rate": 3.8244801815286e-05, + "loss": 1.8997, + "step": 9005 + }, + { + "epoch": 1.61, + "grad_norm": 0.8881186246871948, + "learning_rate": 3.823287494809469e-05, + "loss": 2.0588, + "step": 9010 + }, + { + "epoch": 1.61, + "grad_norm": 0.8047612309455872, + "learning_rate": 3.822094389535775e-05, + "loss": 2.0178, + "step": 9015 + }, + { + "epoch": 1.61, + "grad_norm": 0.4620480239391327, + "learning_rate": 3.8209008660848974e-05, + "loss": 2.0099, + "step": 9020 + }, + { + "epoch": 1.62, + "grad_norm": 0.5921469926834106, + "learning_rate": 3.819706924834346e-05, + "loss": 1.8967, + "step": 9025 + }, + { + "epoch": 1.62, + "grad_norm": 0.6395788192749023, + "learning_rate": 3.818512566161765e-05, + "loss": 1.7851, + "step": 9030 + }, + { + "epoch": 1.62, + "grad_norm": 0.7558723092079163, + "learning_rate": 3.8173177904449265e-05, + "loss": 1.6899, + "step": 9035 + }, + { + "epoch": 1.62, + "grad_norm": 1.2595165967941284, + "learning_rate": 3.816122598061739e-05, + "loss": 1.9932, + "step": 9040 + }, + { + "epoch": 1.62, + "grad_norm": 0.8348121643066406, + "learning_rate": 3.814926989390238e-05, + "loss": 1.7467, + "step": 9045 + }, + { + "epoch": 1.62, + "grad_norm": 0.8437023162841797, + "learning_rate": 3.8137309648085965e-05, + "loss": 1.5459, + "step": 9050 + }, + { + "epoch": 1.62, + "grad_norm": 0.6804198622703552, + "learning_rate": 3.8125345246951137e-05, + "loss": 1.9587, + "step": 9055 + }, + { + "epoch": 1.62, + "grad_norm": 1.2239028215408325, + "learning_rate": 3.8113376694282245e-05, + "loss": 1.8473, + "step": 9060 + }, + { + "epoch": 1.62, + "grad_norm": 0.9916425943374634, + "learning_rate": 3.8101403993864905e-05, + "loss": 1.7582, + "step": 9065 + }, + { + "epoch": 1.62, + "grad_norm": 0.7374111413955688, + "learning_rate": 3.808942714948609e-05, + "loss": 1.8658, + "step": 9070 + }, + { + "epoch": 1.62, + "grad_norm": 0.6606670022010803, + "learning_rate": 3.807744616493405e-05, + "loss": 1.7126, + "step": 9075 + }, + { + "epoch": 1.63, + "grad_norm": 0.7266793847084045, + "learning_rate": 3.806546104399837e-05, + "loss": 2.1215, + "step": 9080 + }, + { + "epoch": 1.63, + "grad_norm": 0.6009851098060608, + "learning_rate": 3.8053471790469935e-05, + "loss": 1.691, + "step": 9085 + }, + { + "epoch": 1.63, + "grad_norm": 0.6415896415710449, + "learning_rate": 3.8041478408140926e-05, + "loss": 1.4786, + "step": 9090 + }, + { + "epoch": 1.63, + "grad_norm": 0.6692395806312561, + "learning_rate": 3.802948090080484e-05, + "loss": 1.8868, + "step": 9095 + }, + { + "epoch": 1.63, + "grad_norm": 2.1220877170562744, + "learning_rate": 3.8017479272256476e-05, + "loss": 1.8618, + "step": 9100 + }, + { + "epoch": 1.63, + "grad_norm": 0.643251895904541, + "learning_rate": 3.800547352629195e-05, + "loss": 1.9866, + "step": 9105 + }, + { + "epoch": 1.63, + "grad_norm": 0.6525068283081055, + "learning_rate": 3.799346366670864e-05, + "loss": 1.7077, + "step": 9110 + }, + { + "epoch": 1.63, + "grad_norm": 0.711988091468811, + "learning_rate": 3.798144969730528e-05, + "loss": 1.7389, + "step": 9115 + }, + { + "epoch": 1.63, + "grad_norm": 0.5814893841743469, + "learning_rate": 3.796943162188186e-05, + "loss": 2.0051, + "step": 9120 + }, + { + "epoch": 1.63, + "grad_norm": 1.351395845413208, + "learning_rate": 3.79574094442397e-05, + "loss": 2.038, + "step": 9125 + }, + { + "epoch": 1.63, + "grad_norm": 0.42251327633857727, + "learning_rate": 3.7945383168181405e-05, + "loss": 1.7681, + "step": 9130 + }, + { + "epoch": 1.64, + "grad_norm": 1.0346791744232178, + "learning_rate": 3.793335279751085e-05, + "loss": 2.0539, + "step": 9135 + }, + { + "epoch": 1.64, + "grad_norm": 0.7705110311508179, + "learning_rate": 3.792131833603325e-05, + "loss": 2.0802, + "step": 9140 + }, + { + "epoch": 1.64, + "grad_norm": 0.7287485003471375, + "learning_rate": 3.790927978755509e-05, + "loss": 1.7397, + "step": 9145 + }, + { + "epoch": 1.64, + "grad_norm": 0.5948812365531921, + "learning_rate": 3.789723715588416e-05, + "loss": 1.8581, + "step": 9150 + }, + { + "epoch": 1.64, + "grad_norm": 0.4935462772846222, + "learning_rate": 3.7885190444829503e-05, + "loss": 1.7997, + "step": 9155 + }, + { + "epoch": 1.64, + "grad_norm": 0.9329087138175964, + "learning_rate": 3.7873139658201516e-05, + "loss": 2.0113, + "step": 9160 + }, + { + "epoch": 1.64, + "grad_norm": 0.4054046869277954, + "learning_rate": 3.786108479981183e-05, + "loss": 2.0666, + "step": 9165 + }, + { + "epoch": 1.64, + "grad_norm": 0.6794940829277039, + "learning_rate": 3.7849025873473404e-05, + "loss": 2.0231, + "step": 9170 + }, + { + "epoch": 1.64, + "grad_norm": 0.5740146636962891, + "learning_rate": 3.7836962883000446e-05, + "loss": 1.82, + "step": 9175 + }, + { + "epoch": 1.64, + "grad_norm": 0.6178922057151794, + "learning_rate": 3.782489583220847e-05, + "loss": 2.0086, + "step": 9180 + }, + { + "epoch": 1.64, + "grad_norm": 0.6487544775009155, + "learning_rate": 3.781282472491429e-05, + "loss": 1.7158, + "step": 9185 + }, + { + "epoch": 1.64, + "grad_norm": 0.6807827353477478, + "learning_rate": 3.780074956493597e-05, + "loss": 2.0595, + "step": 9190 + }, + { + "epoch": 1.65, + "grad_norm": 0.687058687210083, + "learning_rate": 3.778867035609288e-05, + "loss": 1.5461, + "step": 9195 + }, + { + "epoch": 1.65, + "grad_norm": 0.5961658954620361, + "learning_rate": 3.777658710220564e-05, + "loss": 2.0018, + "step": 9200 + }, + { + "epoch": 1.65, + "grad_norm": 0.5087503790855408, + "learning_rate": 3.776449980709621e-05, + "loss": 2.0864, + "step": 9205 + }, + { + "epoch": 1.65, + "grad_norm": 0.8616182804107666, + "learning_rate": 3.775240847458775e-05, + "loss": 1.7007, + "step": 9210 + }, + { + "epoch": 1.65, + "grad_norm": 0.5542956590652466, + "learning_rate": 3.774031310850477e-05, + "loss": 2.0111, + "step": 9215 + }, + { + "epoch": 1.65, + "grad_norm": 0.8051539063453674, + "learning_rate": 3.772821371267301e-05, + "loss": 1.7596, + "step": 9220 + }, + { + "epoch": 1.65, + "grad_norm": 1.0442497730255127, + "learning_rate": 3.771611029091948e-05, + "loss": 1.6548, + "step": 9225 + }, + { + "epoch": 1.65, + "grad_norm": 0.7157022953033447, + "learning_rate": 3.77040028470725e-05, + "loss": 1.5846, + "step": 9230 + }, + { + "epoch": 1.65, + "grad_norm": 0.660575807094574, + "learning_rate": 3.7691891384961654e-05, + "loss": 2.0322, + "step": 9235 + }, + { + "epoch": 1.65, + "grad_norm": 1.2163981199264526, + "learning_rate": 3.767977590841776e-05, + "loss": 2.1145, + "step": 9240 + }, + { + "epoch": 1.65, + "grad_norm": 0.7265014052391052, + "learning_rate": 3.7667656421272946e-05, + "loss": 1.7375, + "step": 9245 + }, + { + "epoch": 1.66, + "grad_norm": 1.1795376539230347, + "learning_rate": 3.765553292736059e-05, + "loss": 1.9286, + "step": 9250 + }, + { + "epoch": 1.66, + "grad_norm": 0.5306074023246765, + "learning_rate": 3.7643405430515344e-05, + "loss": 1.9642, + "step": 9255 + }, + { + "epoch": 1.66, + "grad_norm": 0.6961469650268555, + "learning_rate": 3.763127393457311e-05, + "loss": 1.9714, + "step": 9260 + }, + { + "epoch": 1.66, + "grad_norm": 0.5733311772346497, + "learning_rate": 3.76191384433711e-05, + "loss": 1.6244, + "step": 9265 + }, + { + "epoch": 1.66, + "grad_norm": 0.6173118352890015, + "learning_rate": 3.7606998960747726e-05, + "loss": 1.9729, + "step": 9270 + }, + { + "epoch": 1.66, + "grad_norm": 0.7085820436477661, + "learning_rate": 3.7594855490542704e-05, + "loss": 1.8824, + "step": 9275 + }, + { + "epoch": 1.66, + "grad_norm": 1.8288882970809937, + "learning_rate": 3.758270803659701e-05, + "loss": 1.7626, + "step": 9280 + }, + { + "epoch": 1.66, + "grad_norm": 0.7774258255958557, + "learning_rate": 3.757055660275286e-05, + "loss": 1.7774, + "step": 9285 + }, + { + "epoch": 1.66, + "grad_norm": 1.5239425897598267, + "learning_rate": 3.755840119285377e-05, + "loss": 2.165, + "step": 9290 + }, + { + "epoch": 1.66, + "grad_norm": 0.4820414185523987, + "learning_rate": 3.7546241810744445e-05, + "loss": 2.2087, + "step": 9295 + }, + { + "epoch": 1.66, + "grad_norm": 0.6915215849876404, + "learning_rate": 3.75340784602709e-05, + "loss": 1.6879, + "step": 9300 + }, + { + "epoch": 1.67, + "grad_norm": 0.5616482496261597, + "learning_rate": 3.7521911145280395e-05, + "loss": 2.1273, + "step": 9305 + }, + { + "epoch": 1.67, + "grad_norm": 0.9838537573814392, + "learning_rate": 3.750973986962145e-05, + "loss": 1.7339, + "step": 9310 + }, + { + "epoch": 1.67, + "grad_norm": 0.5095938444137573, + "learning_rate": 3.7497564637143814e-05, + "loss": 1.8669, + "step": 9315 + }, + { + "epoch": 1.67, + "grad_norm": 0.5213245153427124, + "learning_rate": 3.74853854516985e-05, + "loss": 1.8368, + "step": 9320 + }, + { + "epoch": 1.67, + "grad_norm": 0.6392638087272644, + "learning_rate": 3.747320231713778e-05, + "loss": 1.8477, + "step": 9325 + }, + { + "epoch": 1.67, + "grad_norm": 8.093245506286621, + "learning_rate": 3.746101523731515e-05, + "loss": 1.8944, + "step": 9330 + }, + { + "epoch": 1.67, + "grad_norm": 0.5117234587669373, + "learning_rate": 3.744882421608541e-05, + "loss": 1.73, + "step": 9335 + }, + { + "epoch": 1.67, + "grad_norm": 0.5771470665931702, + "learning_rate": 3.743662925730453e-05, + "loss": 2.0571, + "step": 9340 + }, + { + "epoch": 1.67, + "grad_norm": 0.6059357523918152, + "learning_rate": 3.7424430364829785e-05, + "loss": 1.735, + "step": 9345 + }, + { + "epoch": 1.67, + "grad_norm": 0.48577621579170227, + "learning_rate": 3.7412227542519664e-05, + "loss": 1.8274, + "step": 9350 + }, + { + "epoch": 1.67, + "grad_norm": 0.7163307070732117, + "learning_rate": 3.740002079423392e-05, + "loss": 2.0151, + "step": 9355 + }, + { + "epoch": 1.68, + "grad_norm": 0.6120556592941284, + "learning_rate": 3.738781012383351e-05, + "loss": 1.8655, + "step": 9360 + }, + { + "epoch": 1.68, + "grad_norm": 0.6820749640464783, + "learning_rate": 3.737559553518067e-05, + "loss": 1.8406, + "step": 9365 + }, + { + "epoch": 1.68, + "grad_norm": 0.6751840114593506, + "learning_rate": 3.736337703213888e-05, + "loss": 1.8045, + "step": 9370 + }, + { + "epoch": 1.68, + "grad_norm": 0.5991414785385132, + "learning_rate": 3.735115461857282e-05, + "loss": 1.7511, + "step": 9375 + }, + { + "epoch": 1.68, + "grad_norm": 0.6772866249084473, + "learning_rate": 3.733892829834845e-05, + "loss": 2.0622, + "step": 9380 + }, + { + "epoch": 1.68, + "grad_norm": 0.37456727027893066, + "learning_rate": 3.732669807533291e-05, + "loss": 1.7938, + "step": 9385 + }, + { + "epoch": 1.68, + "grad_norm": 0.5582310557365417, + "learning_rate": 3.731446395339463e-05, + "loss": 1.7566, + "step": 9390 + }, + { + "epoch": 1.68, + "grad_norm": 0.5218795537948608, + "learning_rate": 3.730222593640324e-05, + "loss": 1.6361, + "step": 9395 + }, + { + "epoch": 1.68, + "grad_norm": 0.6812488436698914, + "learning_rate": 3.7289984028229636e-05, + "loss": 1.9274, + "step": 9400 + }, + { + "epoch": 1.68, + "grad_norm": 0.6946431398391724, + "learning_rate": 3.72777382327459e-05, + "loss": 1.8467, + "step": 9405 + }, + { + "epoch": 1.68, + "grad_norm": 2.7319812774658203, + "learning_rate": 3.726548855382536e-05, + "loss": 1.696, + "step": 9410 + }, + { + "epoch": 1.69, + "grad_norm": 0.6396403312683105, + "learning_rate": 3.725323499534259e-05, + "loss": 1.8228, + "step": 9415 + }, + { + "epoch": 1.69, + "grad_norm": 0.4877493977546692, + "learning_rate": 3.724097756117338e-05, + "loss": 2.1406, + "step": 9420 + }, + { + "epoch": 1.69, + "grad_norm": 0.5904247164726257, + "learning_rate": 3.722871625519475e-05, + "loss": 1.9487, + "step": 9425 + }, + { + "epoch": 1.69, + "grad_norm": 0.6823410391807556, + "learning_rate": 3.721645108128493e-05, + "loss": 1.7107, + "step": 9430 + }, + { + "epoch": 1.69, + "grad_norm": 0.688463568687439, + "learning_rate": 3.720418204332338e-05, + "loss": 1.7925, + "step": 9435 + }, + { + "epoch": 1.69, + "grad_norm": 1.0077917575836182, + "learning_rate": 3.719190914519078e-05, + "loss": 1.8868, + "step": 9440 + }, + { + "epoch": 1.69, + "grad_norm": 1.1595532894134521, + "learning_rate": 3.7179632390769055e-05, + "loss": 1.6869, + "step": 9445 + }, + { + "epoch": 1.69, + "grad_norm": 0.8011316657066345, + "learning_rate": 3.716735178394131e-05, + "loss": 1.7987, + "step": 9450 + }, + { + "epoch": 1.69, + "grad_norm": 0.5646600723266602, + "learning_rate": 3.7155067328591906e-05, + "loss": 1.8537, + "step": 9455 + }, + { + "epoch": 1.69, + "grad_norm": 0.5149386525154114, + "learning_rate": 3.7142779028606405e-05, + "loss": 1.8974, + "step": 9460 + }, + { + "epoch": 1.69, + "grad_norm": 1.7945959568023682, + "learning_rate": 3.713048688787157e-05, + "loss": 1.6476, + "step": 9465 + }, + { + "epoch": 1.7, + "grad_norm": 0.4259641766548157, + "learning_rate": 3.7118190910275394e-05, + "loss": 1.8111, + "step": 9470 + }, + { + "epoch": 1.7, + "grad_norm": 0.5765038728713989, + "learning_rate": 3.71058910997071e-05, + "loss": 1.571, + "step": 9475 + }, + { + "epoch": 1.7, + "grad_norm": 0.6007339954376221, + "learning_rate": 3.7093587460057087e-05, + "loss": 1.8102, + "step": 9480 + }, + { + "epoch": 1.7, + "grad_norm": 0.3550678491592407, + "learning_rate": 3.708127999521701e-05, + "loss": 1.8255, + "step": 9485 + }, + { + "epoch": 1.7, + "grad_norm": 1.5561153888702393, + "learning_rate": 3.706896870907967e-05, + "loss": 1.8103, + "step": 9490 + }, + { + "epoch": 1.7, + "grad_norm": 0.6592022180557251, + "learning_rate": 3.705665360553914e-05, + "loss": 1.7564, + "step": 9495 + }, + { + "epoch": 1.7, + "grad_norm": 0.9379505515098572, + "learning_rate": 3.704433468849068e-05, + "loss": 1.9543, + "step": 9500 + }, + { + "epoch": 1.7, + "grad_norm": 0.7542968392372131, + "learning_rate": 3.7032011961830744e-05, + "loss": 1.9984, + "step": 9505 + }, + { + "epoch": 1.7, + "grad_norm": 1.1859865188598633, + "learning_rate": 3.7019685429456986e-05, + "loss": 1.8548, + "step": 9510 + }, + { + "epoch": 1.7, + "grad_norm": 1.0546746253967285, + "learning_rate": 3.70073550952683e-05, + "loss": 1.6335, + "step": 9515 + }, + { + "epoch": 1.7, + "grad_norm": 0.41656044125556946, + "learning_rate": 3.699502096316474e-05, + "loss": 1.817, + "step": 9520 + }, + { + "epoch": 1.7, + "grad_norm": 0.3669623136520386, + "learning_rate": 3.698268303704759e-05, + "loss": 1.8767, + "step": 9525 + }, + { + "epoch": 1.71, + "grad_norm": 0.5422895550727844, + "learning_rate": 3.69728099670865e-05, + "loss": 2.1527, + "step": 9530 + }, + { + "epoch": 1.71, + "grad_norm": 0.6797967553138733, + "learning_rate": 3.6960465221579927e-05, + "loss": 1.7675, + "step": 9535 + }, + { + "epoch": 1.71, + "grad_norm": 0.9210271239280701, + "learning_rate": 3.69481166929897e-05, + "loss": 1.6502, + "step": 9540 + }, + { + "epoch": 1.71, + "grad_norm": 0.8029117584228516, + "learning_rate": 3.693576438522167e-05, + "loss": 1.8089, + "step": 9545 + }, + { + "epoch": 1.71, + "grad_norm": 0.47192418575286865, + "learning_rate": 3.6923408302182834e-05, + "loss": 2.0336, + "step": 9550 + }, + { + "epoch": 1.71, + "grad_norm": 0.66648268699646, + "learning_rate": 3.691104844778142e-05, + "loss": 1.7003, + "step": 9555 + }, + { + "epoch": 1.71, + "grad_norm": 0.6902146935462952, + "learning_rate": 3.689868482592684e-05, + "loss": 1.8137, + "step": 9560 + }, + { + "epoch": 1.71, + "grad_norm": 0.8713681697845459, + "learning_rate": 3.688631744052969e-05, + "loss": 2.1054, + "step": 9565 + }, + { + "epoch": 1.71, + "grad_norm": 0.8756383657455444, + "learning_rate": 3.687394629550178e-05, + "loss": 2.0403, + "step": 9570 + }, + { + "epoch": 1.71, + "grad_norm": 0.8325067758560181, + "learning_rate": 3.6861571394756066e-05, + "loss": 1.8843, + "step": 9575 + }, + { + "epoch": 1.71, + "grad_norm": 0.5879186987876892, + "learning_rate": 3.684919274220674e-05, + "loss": 1.9288, + "step": 9580 + }, + { + "epoch": 1.72, + "grad_norm": 0.5836779475212097, + "learning_rate": 3.683681034176914e-05, + "loss": 2.024, + "step": 9585 + }, + { + "epoch": 1.72, + "grad_norm": 0.7767724990844727, + "learning_rate": 3.682442419735982e-05, + "loss": 2.3006, + "step": 9590 + }, + { + "epoch": 1.72, + "grad_norm": 0.923039972782135, + "learning_rate": 3.681203431289649e-05, + "loss": 1.9507, + "step": 9595 + }, + { + "epoch": 1.72, + "grad_norm": 0.46944764256477356, + "learning_rate": 3.6799640692298075e-05, + "loss": 1.9669, + "step": 9600 + }, + { + "epoch": 1.72, + "grad_norm": 0.29924699664115906, + "learning_rate": 3.678724333948466e-05, + "loss": 1.8226, + "step": 9605 + }, + { + "epoch": 1.72, + "grad_norm": 0.4777134954929352, + "learning_rate": 3.6774842258377515e-05, + "loss": 2.3745, + "step": 9610 + }, + { + "epoch": 1.72, + "grad_norm": 0.8692833781242371, + "learning_rate": 3.676243745289909e-05, + "loss": 1.9515, + "step": 9615 + }, + { + "epoch": 1.72, + "grad_norm": 0.9050225019454956, + "learning_rate": 3.675002892697301e-05, + "loss": 1.9985, + "step": 9620 + }, + { + "epoch": 1.72, + "grad_norm": 0.6240570545196533, + "learning_rate": 3.673761668452408e-05, + "loss": 1.8733, + "step": 9625 + }, + { + "epoch": 1.72, + "grad_norm": 0.6922764778137207, + "learning_rate": 3.6725200729478285e-05, + "loss": 1.6951, + "step": 9630 + }, + { + "epoch": 1.72, + "grad_norm": 0.4850800931453705, + "learning_rate": 3.671278106576278e-05, + "loss": 1.9839, + "step": 9635 + }, + { + "epoch": 1.73, + "grad_norm": 0.7302365303039551, + "learning_rate": 3.670035769730589e-05, + "loss": 2.0729, + "step": 9640 + }, + { + "epoch": 1.73, + "grad_norm": 0.43896591663360596, + "learning_rate": 3.668793062803711e-05, + "loss": 1.8726, + "step": 9645 + }, + { + "epoch": 1.73, + "grad_norm": 0.8171257972717285, + "learning_rate": 3.667549986188712e-05, + "loss": 1.9601, + "step": 9650 + }, + { + "epoch": 1.73, + "grad_norm": 0.631952166557312, + "learning_rate": 3.666306540278775e-05, + "loss": 1.745, + "step": 9655 + }, + { + "epoch": 1.73, + "grad_norm": 0.7533524036407471, + "learning_rate": 3.665062725467201e-05, + "loss": 1.9608, + "step": 9660 + }, + { + "epoch": 1.73, + "grad_norm": 0.5362209677696228, + "learning_rate": 3.6638185421474084e-05, + "loss": 1.5627, + "step": 9665 + }, + { + "epoch": 1.73, + "grad_norm": 0.6645787358283997, + "learning_rate": 3.6625739907129295e-05, + "loss": 1.5983, + "step": 9670 + }, + { + "epoch": 1.73, + "grad_norm": 0.5133046507835388, + "learning_rate": 3.6613290715574155e-05, + "loss": 1.9776, + "step": 9675 + }, + { + "epoch": 1.73, + "grad_norm": 1.0483719110488892, + "learning_rate": 3.660083785074634e-05, + "loss": 2.0728, + "step": 9680 + }, + { + "epoch": 1.73, + "grad_norm": 0.8712103366851807, + "learning_rate": 3.6588381316584674e-05, + "loss": 1.7619, + "step": 9685 + }, + { + "epoch": 1.73, + "grad_norm": 1.295785665512085, + "learning_rate": 3.6575921117029136e-05, + "loss": 1.8996, + "step": 9690 + }, + { + "epoch": 1.74, + "grad_norm": 0.5898334383964539, + "learning_rate": 3.6563457256020884e-05, + "loss": 1.8068, + "step": 9695 + }, + { + "epoch": 1.74, + "grad_norm": 0.5830318331718445, + "learning_rate": 3.655098973750223e-05, + "loss": 1.8471, + "step": 9700 + }, + { + "epoch": 1.74, + "grad_norm": 0.6544398069381714, + "learning_rate": 3.653851856541662e-05, + "loss": 1.9984, + "step": 9705 + }, + { + "epoch": 1.74, + "grad_norm": 0.37488725781440735, + "learning_rate": 3.6526043743708704e-05, + "loss": 1.8106, + "step": 9710 + }, + { + "epoch": 1.74, + "grad_norm": 1.239549160003662, + "learning_rate": 3.651356527632423e-05, + "loss": 2.1417, + "step": 9715 + }, + { + "epoch": 1.74, + "grad_norm": 0.47542452812194824, + "learning_rate": 3.650108316721013e-05, + "loss": 1.6287, + "step": 9720 + }, + { + "epoch": 1.74, + "grad_norm": 0.43621698021888733, + "learning_rate": 3.648859742031449e-05, + "loss": 1.96, + "step": 9725 + }, + { + "epoch": 1.74, + "grad_norm": 0.5922963619232178, + "learning_rate": 3.6476108039586535e-05, + "loss": 1.8913, + "step": 9730 + }, + { + "epoch": 1.74, + "grad_norm": 0.6242755055427551, + "learning_rate": 3.646361502897665e-05, + "loss": 1.8241, + "step": 9735 + }, + { + "epoch": 1.74, + "grad_norm": 1.1280851364135742, + "learning_rate": 3.645111839243636e-05, + "loss": 2.1662, + "step": 9740 + }, + { + "epoch": 1.74, + "grad_norm": 0.6557629704475403, + "learning_rate": 3.643861813391833e-05, + "loss": 2.0015, + "step": 9745 + }, + { + "epoch": 1.75, + "grad_norm": 0.6168567538261414, + "learning_rate": 3.642611425737639e-05, + "loss": 1.5228, + "step": 9750 + }, + { + "epoch": 1.75, + "grad_norm": 0.4185939133167267, + "learning_rate": 3.6413606766765506e-05, + "loss": 2.1472, + "step": 9755 + }, + { + "epoch": 1.75, + "grad_norm": 0.49710220098495483, + "learning_rate": 3.6401095666041765e-05, + "loss": 1.9794, + "step": 9760 + }, + { + "epoch": 1.75, + "grad_norm": 0.6820828914642334, + "learning_rate": 3.638858095916244e-05, + "loss": 1.7363, + "step": 9765 + }, + { + "epoch": 1.75, + "grad_norm": 0.7763912081718445, + "learning_rate": 3.637606265008592e-05, + "loss": 1.9053, + "step": 9770 + }, + { + "epoch": 1.75, + "grad_norm": 0.9690907001495361, + "learning_rate": 3.636354074277172e-05, + "loss": 1.6437, + "step": 9775 + }, + { + "epoch": 1.75, + "grad_norm": 0.8117865324020386, + "learning_rate": 3.635101524118051e-05, + "loss": 1.7684, + "step": 9780 + }, + { + "epoch": 1.75, + "grad_norm": 0.878116250038147, + "learning_rate": 3.63384861492741e-05, + "loss": 1.9089, + "step": 9785 + }, + { + "epoch": 1.75, + "grad_norm": 0.3111972510814667, + "learning_rate": 3.632595347101543e-05, + "loss": 2.0609, + "step": 9790 + }, + { + "epoch": 1.75, + "grad_norm": 0.38425540924072266, + "learning_rate": 3.6313417210368564e-05, + "loss": 1.7993, + "step": 9795 + }, + { + "epoch": 1.75, + "grad_norm": 0.6652374267578125, + "learning_rate": 3.630087737129872e-05, + "loss": 2.0774, + "step": 9800 + }, + { + "epoch": 1.76, + "grad_norm": 0.9802791476249695, + "learning_rate": 3.628833395777224e-05, + "loss": 1.8374, + "step": 9805 + }, + { + "epoch": 1.76, + "grad_norm": 1.093260407447815, + "learning_rate": 3.6275786973756575e-05, + "loss": 1.945, + "step": 9810 + }, + { + "epoch": 1.76, + "grad_norm": 0.5013108253479004, + "learning_rate": 3.626323642322035e-05, + "loss": 1.8702, + "step": 9815 + }, + { + "epoch": 1.76, + "grad_norm": 0.7069824934005737, + "learning_rate": 3.6250682310133266e-05, + "loss": 2.0702, + "step": 9820 + }, + { + "epoch": 1.76, + "grad_norm": 0.539284348487854, + "learning_rate": 3.6238124638466195e-05, + "loss": 1.7797, + "step": 9825 + }, + { + "epoch": 1.76, + "grad_norm": 0.642426609992981, + "learning_rate": 3.622556341219111e-05, + "loss": 1.8979, + "step": 9830 + }, + { + "epoch": 1.76, + "grad_norm": 0.7472493052482605, + "learning_rate": 3.6212998635281116e-05, + "loss": 1.8212, + "step": 9835 + }, + { + "epoch": 1.76, + "grad_norm": 1.1682623624801636, + "learning_rate": 3.620043031171043e-05, + "loss": 1.7595, + "step": 9840 + }, + { + "epoch": 1.76, + "grad_norm": 0.6802049279212952, + "learning_rate": 3.618785844545443e-05, + "loss": 1.9368, + "step": 9845 + }, + { + "epoch": 1.76, + "grad_norm": 1.0584665536880493, + "learning_rate": 3.617528304048955e-05, + "loss": 1.91, + "step": 9850 + }, + { + "epoch": 1.76, + "grad_norm": 0.6849753260612488, + "learning_rate": 3.616270410079339e-05, + "loss": 1.8772, + "step": 9855 + }, + { + "epoch": 1.76, + "grad_norm": 0.9579459428787231, + "learning_rate": 3.6150121630344676e-05, + "loss": 1.9455, + "step": 9860 + }, + { + "epoch": 1.77, + "grad_norm": 0.8555591702461243, + "learning_rate": 3.61375356331232e-05, + "loss": 1.4362, + "step": 9865 + }, + { + "epoch": 1.77, + "grad_norm": 0.5340035557746887, + "learning_rate": 3.612494611310992e-05, + "loss": 1.9012, + "step": 9870 + }, + { + "epoch": 1.77, + "grad_norm": 0.8525397777557373, + "learning_rate": 3.61123530742869e-05, + "loss": 1.9174, + "step": 9875 + }, + { + "epoch": 1.77, + "grad_norm": 0.6355140209197998, + "learning_rate": 3.6099756520637274e-05, + "loss": 1.7693, + "step": 9880 + }, + { + "epoch": 1.77, + "grad_norm": 0.794104814529419, + "learning_rate": 3.608715645614534e-05, + "loss": 1.9271, + "step": 9885 + }, + { + "epoch": 1.77, + "grad_norm": 0.5898195505142212, + "learning_rate": 3.6074552884796485e-05, + "loss": 1.969, + "step": 9890 + }, + { + "epoch": 1.77, + "grad_norm": 0.8139387965202332, + "learning_rate": 3.606194581057721e-05, + "loss": 1.8805, + "step": 9895 + }, + { + "epoch": 1.77, + "grad_norm": 0.7168921828269958, + "learning_rate": 3.60493352374751e-05, + "loss": 1.7252, + "step": 9900 + }, + { + "epoch": 1.77, + "grad_norm": 0.9840201735496521, + "learning_rate": 3.6036721169478896e-05, + "loss": 1.8256, + "step": 9905 + }, + { + "epoch": 1.77, + "grad_norm": 0.7537431120872498, + "learning_rate": 3.60241036105784e-05, + "loss": 2.0456, + "step": 9910 + }, + { + "epoch": 1.77, + "grad_norm": 1.142112374305725, + "learning_rate": 3.601148256476454e-05, + "loss": 1.987, + "step": 9915 + }, + { + "epoch": 1.78, + "grad_norm": 0.9110394716262817, + "learning_rate": 3.599885803602933e-05, + "loss": 1.7752, + "step": 9920 + }, + { + "epoch": 1.78, + "grad_norm": 0.6123466491699219, + "learning_rate": 3.59862300283659e-05, + "loss": 2.0613, + "step": 9925 + }, + { + "epoch": 1.78, + "grad_norm": 0.7708951234817505, + "learning_rate": 3.597359854576848e-05, + "loss": 1.8793, + "step": 9930 + }, + { + "epoch": 1.78, + "grad_norm": 0.6696035861968994, + "learning_rate": 3.596096359223241e-05, + "loss": 1.8185, + "step": 9935 + }, + { + "epoch": 1.78, + "grad_norm": 1.329271674156189, + "learning_rate": 3.594832517175408e-05, + "loss": 2.0871, + "step": 9940 + }, + { + "epoch": 1.78, + "grad_norm": 1.1870595216751099, + "learning_rate": 3.593568328833104e-05, + "loss": 1.6247, + "step": 9945 + }, + { + "epoch": 1.78, + "grad_norm": 0.8780333399772644, + "learning_rate": 3.59230379459619e-05, + "loss": 1.6585, + "step": 9950 + }, + { + "epoch": 1.78, + "grad_norm": 16.52887725830078, + "learning_rate": 3.591038914864635e-05, + "loss": 2.0053, + "step": 9955 + }, + { + "epoch": 1.78, + "grad_norm": 0.8829307556152344, + "learning_rate": 3.5897736900385213e-05, + "loss": 1.7779, + "step": 9960 + }, + { + "epoch": 1.78, + "grad_norm": 0.539667546749115, + "learning_rate": 3.5885081205180377e-05, + "loss": 1.761, + "step": 9965 + }, + { + "epoch": 1.78, + "grad_norm": 0.6431927680969238, + "learning_rate": 3.5872422067034824e-05, + "loss": 1.908, + "step": 9970 + }, + { + "epoch": 1.79, + "grad_norm": 1.2214795351028442, + "learning_rate": 3.5859759489952635e-05, + "loss": 1.8322, + "step": 9975 + }, + { + "epoch": 1.79, + "grad_norm": 1.3363548517227173, + "learning_rate": 3.5847093477938956e-05, + "loss": 1.9769, + "step": 9980 + }, + { + "epoch": 1.79, + "grad_norm": 0.6570048928260803, + "learning_rate": 3.583442403500005e-05, + "loss": 1.7981, + "step": 9985 + }, + { + "epoch": 1.79, + "grad_norm": 1.1348395347595215, + "learning_rate": 3.582175116514324e-05, + "loss": 1.7122, + "step": 9990 + }, + { + "epoch": 1.79, + "grad_norm": 0.6550275683403015, + "learning_rate": 3.5809074872376943e-05, + "loss": 1.8589, + "step": 9995 + }, + { + "epoch": 1.79, + "grad_norm": 1.4769278764724731, + "learning_rate": 3.5796395160710666e-05, + "loss": 1.8013, + "step": 10000 + }, + { + "epoch": 1.79, + "grad_norm": 0.5346553921699524, + "learning_rate": 3.5783712034154985e-05, + "loss": 1.988, + "step": 10005 + }, + { + "epoch": 1.79, + "grad_norm": 0.48149240016937256, + "learning_rate": 3.5771025496721556e-05, + "loss": 1.7784, + "step": 10010 + }, + { + "epoch": 1.79, + "grad_norm": 0.6305123567581177, + "learning_rate": 3.5758335552423127e-05, + "loss": 2.0311, + "step": 10015 + }, + { + "epoch": 1.79, + "grad_norm": 0.48739558458328247, + "learning_rate": 3.574564220527351e-05, + "loss": 2.1409, + "step": 10020 + }, + { + "epoch": 1.79, + "grad_norm": 1.4256173372268677, + "learning_rate": 3.57329454592876e-05, + "loss": 1.6745, + "step": 10025 + }, + { + "epoch": 1.8, + "grad_norm": 0.9247502088546753, + "learning_rate": 3.572024531848136e-05, + "loss": 1.7922, + "step": 10030 + }, + { + "epoch": 1.8, + "grad_norm": 5.438934326171875, + "learning_rate": 3.5707541786871845e-05, + "loss": 1.6375, + "step": 10035 + }, + { + "epoch": 1.8, + "grad_norm": 0.6272771954536438, + "learning_rate": 3.5694834868477154e-05, + "loss": 1.8961, + "step": 10040 + }, + { + "epoch": 1.8, + "grad_norm": 0.6681824326515198, + "learning_rate": 3.5682124567316485e-05, + "loss": 1.716, + "step": 10045 + }, + { + "epoch": 1.8, + "grad_norm": 0.6500905156135559, + "learning_rate": 3.5669410887410095e-05, + "loss": 1.8117, + "step": 10050 + }, + { + "epoch": 1.8, + "grad_norm": 0.7739772796630859, + "learning_rate": 3.5656693832779295e-05, + "loss": 1.7639, + "step": 10055 + }, + { + "epoch": 1.8, + "grad_norm": 1.4250242710113525, + "learning_rate": 3.5643973407446496e-05, + "loss": 1.9421, + "step": 10060 + }, + { + "epoch": 1.8, + "grad_norm": 0.9519471526145935, + "learning_rate": 3.5631249615435145e-05, + "loss": 1.9734, + "step": 10065 + }, + { + "epoch": 1.8, + "grad_norm": 0.9607334733009338, + "learning_rate": 3.561852246076976e-05, + "loss": 1.7387, + "step": 10070 + }, + { + "epoch": 1.8, + "grad_norm": 0.9569016695022583, + "learning_rate": 3.560579194747593e-05, + "loss": 1.6301, + "step": 10075 + }, + { + "epoch": 1.8, + "grad_norm": 0.48008155822753906, + "learning_rate": 3.559305807958032e-05, + "loss": 1.9027, + "step": 10080 + }, + { + "epoch": 1.81, + "grad_norm": 0.8056952953338623, + "learning_rate": 3.5580320861110625e-05, + "loss": 1.9209, + "step": 10085 + }, + { + "epoch": 1.81, + "grad_norm": 0.4301571547985077, + "learning_rate": 3.556758029609561e-05, + "loss": 1.9884, + "step": 10090 + }, + { + "epoch": 1.81, + "grad_norm": 1.0041254758834839, + "learning_rate": 3.555483638856513e-05, + "loss": 1.6986, + "step": 10095 + }, + { + "epoch": 1.81, + "grad_norm": 0.7964014410972595, + "learning_rate": 3.554208914255004e-05, + "loss": 1.7893, + "step": 10100 + }, + { + "epoch": 1.81, + "grad_norm": 0.7074928283691406, + "learning_rate": 3.552933856208229e-05, + "loss": 1.8304, + "step": 10105 + }, + { + "epoch": 1.81, + "grad_norm": 1.151492953300476, + "learning_rate": 3.55165846511949e-05, + "loss": 1.6251, + "step": 10110 + }, + { + "epoch": 1.81, + "grad_norm": 0.4866889417171478, + "learning_rate": 3.5503827413921885e-05, + "loss": 1.7865, + "step": 10115 + }, + { + "epoch": 1.81, + "grad_norm": 0.7697849273681641, + "learning_rate": 3.549106685429836e-05, + "loss": 1.669, + "step": 10120 + }, + { + "epoch": 1.81, + "grad_norm": 0.7082263827323914, + "learning_rate": 3.5478302976360486e-05, + "loss": 1.9564, + "step": 10125 + }, + { + "epoch": 1.81, + "grad_norm": 1.869172215461731, + "learning_rate": 3.5465535784145464e-05, + "loss": 2.0057, + "step": 10130 + }, + { + "epoch": 1.81, + "grad_norm": 0.6560725569725037, + "learning_rate": 3.545276528169154e-05, + "loss": 1.9155, + "step": 10135 + }, + { + "epoch": 1.82, + "grad_norm": 1.027157187461853, + "learning_rate": 3.5439991473038006e-05, + "loss": 1.6909, + "step": 10140 + }, + { + "epoch": 1.82, + "grad_norm": 0.7855679392814636, + "learning_rate": 3.542721436222521e-05, + "loss": 1.8473, + "step": 10145 + }, + { + "epoch": 1.82, + "grad_norm": 0.6946498155593872, + "learning_rate": 3.541443395329454e-05, + "loss": 2.1364, + "step": 10150 + }, + { + "epoch": 1.82, + "grad_norm": 1.001931071281433, + "learning_rate": 3.540165025028844e-05, + "loss": 1.8402, + "step": 10155 + }, + { + "epoch": 1.82, + "grad_norm": 0.9406866431236267, + "learning_rate": 3.538886325725036e-05, + "loss": 1.9564, + "step": 10160 + }, + { + "epoch": 1.82, + "grad_norm": 0.3592713177204132, + "learning_rate": 3.5376072978224824e-05, + "loss": 1.5525, + "step": 10165 + }, + { + "epoch": 1.82, + "grad_norm": 1.2049317359924316, + "learning_rate": 3.536327941725739e-05, + "loss": 1.9862, + "step": 10170 + }, + { + "epoch": 1.82, + "grad_norm": 0.8280571699142456, + "learning_rate": 3.5350482578394636e-05, + "loss": 1.745, + "step": 10175 + }, + { + "epoch": 1.82, + "grad_norm": 0.7504371404647827, + "learning_rate": 3.533768246568421e-05, + "loss": 1.9558, + "step": 10180 + }, + { + "epoch": 1.82, + "grad_norm": 0.9918588995933533, + "learning_rate": 3.532487908317476e-05, + "loss": 1.979, + "step": 10185 + }, + { + "epoch": 1.82, + "grad_norm": 0.46797382831573486, + "learning_rate": 3.5312072434915986e-05, + "loss": 1.99, + "step": 10190 + }, + { + "epoch": 1.82, + "grad_norm": 0.5668039917945862, + "learning_rate": 3.5299262524958615e-05, + "loss": 1.7308, + "step": 10195 + }, + { + "epoch": 1.83, + "grad_norm": 1.981918454170227, + "learning_rate": 3.528644935735442e-05, + "loss": 1.7743, + "step": 10200 + }, + { + "epoch": 1.83, + "grad_norm": 0.9254132509231567, + "learning_rate": 3.527363293615619e-05, + "loss": 2.0246, + "step": 10205 + }, + { + "epoch": 1.83, + "grad_norm": 0.9930950403213501, + "learning_rate": 3.5260813265417735e-05, + "loss": 1.9832, + "step": 10210 + }, + { + "epoch": 1.83, + "grad_norm": 0.7647066116333008, + "learning_rate": 3.524799034919392e-05, + "loss": 2.1351, + "step": 10215 + }, + { + "epoch": 1.83, + "grad_norm": 1.070019245147705, + "learning_rate": 3.523516419154062e-05, + "loss": 1.942, + "step": 10220 + }, + { + "epoch": 1.83, + "grad_norm": 0.79027259349823, + "learning_rate": 3.522233479651472e-05, + "loss": 1.9146, + "step": 10225 + }, + { + "epoch": 1.83, + "grad_norm": 0.4503646194934845, + "learning_rate": 3.520950216817417e-05, + "loss": 2.0948, + "step": 10230 + }, + { + "epoch": 1.83, + "grad_norm": 0.7014816999435425, + "learning_rate": 3.519666631057789e-05, + "loss": 1.7254, + "step": 10235 + }, + { + "epoch": 1.83, + "grad_norm": 1.0421819686889648, + "learning_rate": 3.518382722778587e-05, + "loss": 1.8633, + "step": 10240 + }, + { + "epoch": 1.83, + "grad_norm": 0.7871792316436768, + "learning_rate": 3.5170984923859095e-05, + "loss": 1.7731, + "step": 10245 + }, + { + "epoch": 1.83, + "grad_norm": 0.5907774567604065, + "learning_rate": 3.515813940285957e-05, + "loss": 1.9179, + "step": 10250 + }, + { + "epoch": 1.84, + "grad_norm": 0.4600268304347992, + "learning_rate": 3.514529066885032e-05, + "loss": 1.7114, + "step": 10255 + }, + { + "epoch": 1.84, + "grad_norm": 0.6806623339653015, + "learning_rate": 3.51324387258954e-05, + "loss": 1.6823, + "step": 10260 + }, + { + "epoch": 1.84, + "grad_norm": 0.7156978249549866, + "learning_rate": 3.5119583578059846e-05, + "loss": 1.709, + "step": 10265 + }, + { + "epoch": 1.84, + "grad_norm": 0.7115353345870972, + "learning_rate": 3.510672522940975e-05, + "loss": 1.7488, + "step": 10270 + }, + { + "epoch": 1.84, + "grad_norm": 0.8930706977844238, + "learning_rate": 3.509386368401218e-05, + "loss": 1.6939, + "step": 10275 + }, + { + "epoch": 1.84, + "grad_norm": 0.555728018283844, + "learning_rate": 3.508099894593523e-05, + "loss": 1.7732, + "step": 10280 + }, + { + "epoch": 1.84, + "grad_norm": 0.6877692341804504, + "learning_rate": 3.506813101924802e-05, + "loss": 1.799, + "step": 10285 + }, + { + "epoch": 1.84, + "grad_norm": 0.9091320633888245, + "learning_rate": 3.5055259908020645e-05, + "loss": 1.6652, + "step": 10290 + }, + { + "epoch": 1.84, + "grad_norm": 0.6716402173042297, + "learning_rate": 3.504238561632424e-05, + "loss": 1.6759, + "step": 10295 + }, + { + "epoch": 1.84, + "grad_norm": 0.615459144115448, + "learning_rate": 3.502950814823092e-05, + "loss": 1.7817, + "step": 10300 + }, + { + "epoch": 1.84, + "grad_norm": 0.8127800822257996, + "learning_rate": 3.501662750781382e-05, + "loss": 2.2239, + "step": 10305 + }, + { + "epoch": 1.85, + "grad_norm": 0.9452939033508301, + "learning_rate": 3.5003743699147076e-05, + "loss": 2.1167, + "step": 10310 + }, + { + "epoch": 1.85, + "grad_norm": 0.7668461799621582, + "learning_rate": 3.499085672630582e-05, + "loss": 1.6316, + "step": 10315 + }, + { + "epoch": 1.85, + "grad_norm": 0.7267149090766907, + "learning_rate": 3.497796659336619e-05, + "loss": 2.0752, + "step": 10320 + }, + { + "epoch": 1.85, + "grad_norm": 0.7957209944725037, + "learning_rate": 3.4965073304405326e-05, + "loss": 1.8228, + "step": 10325 + }, + { + "epoch": 1.85, + "grad_norm": 0.5732484459877014, + "learning_rate": 3.495217686350136e-05, + "loss": 1.8593, + "step": 10330 + }, + { + "epoch": 1.85, + "grad_norm": 1.3122698068618774, + "learning_rate": 3.493927727473343e-05, + "loss": 1.6822, + "step": 10335 + }, + { + "epoch": 1.85, + "grad_norm": 0.5948165655136108, + "learning_rate": 3.492637454218166e-05, + "loss": 1.9183, + "step": 10340 + }, + { + "epoch": 1.85, + "grad_norm": 0.938048243522644, + "learning_rate": 3.491346866992716e-05, + "loss": 2.0546, + "step": 10345 + }, + { + "epoch": 1.85, + "grad_norm": 0.6968057751655579, + "learning_rate": 3.490055966205207e-05, + "loss": 1.8256, + "step": 10350 + }, + { + "epoch": 1.85, + "grad_norm": 0.5820090174674988, + "learning_rate": 3.4887647522639464e-05, + "loss": 1.9695, + "step": 10355 + }, + { + "epoch": 1.85, + "grad_norm": 0.7669956088066101, + "learning_rate": 3.4874732255773465e-05, + "loss": 1.7784, + "step": 10360 + }, + { + "epoch": 1.86, + "grad_norm": 0.5515414476394653, + "learning_rate": 3.486181386553916e-05, + "loss": 1.9826, + "step": 10365 + }, + { + "epoch": 1.86, + "grad_norm": 7.995153427124023, + "learning_rate": 3.484889235602261e-05, + "loss": 1.8934, + "step": 10370 + }, + { + "epoch": 1.86, + "grad_norm": 0.8782067894935608, + "learning_rate": 3.483596773131088e-05, + "loss": 1.7218, + "step": 10375 + }, + { + "epoch": 1.86, + "grad_norm": 0.6286051273345947, + "learning_rate": 3.482303999549201e-05, + "loss": 1.9686, + "step": 10380 + }, + { + "epoch": 1.86, + "grad_norm": 0.4949547350406647, + "learning_rate": 3.481010915265504e-05, + "loss": 2.0401, + "step": 10385 + }, + { + "epoch": 1.86, + "grad_norm": 0.9229368567466736, + "learning_rate": 3.4797175206889984e-05, + "loss": 1.6785, + "step": 10390 + }, + { + "epoch": 1.86, + "grad_norm": 0.5106030702590942, + "learning_rate": 3.478423816228784e-05, + "loss": 1.9079, + "step": 10395 + }, + { + "epoch": 1.86, + "grad_norm": 0.9068930745124817, + "learning_rate": 3.477129802294057e-05, + "loss": 1.9197, + "step": 10400 + }, + { + "epoch": 1.86, + "grad_norm": 0.7757366299629211, + "learning_rate": 3.475835479294114e-05, + "loss": 1.9806, + "step": 10405 + }, + { + "epoch": 1.86, + "grad_norm": 0.814594566822052, + "learning_rate": 3.4745408476383476e-05, + "loss": 2.0118, + "step": 10410 + }, + { + "epoch": 1.86, + "grad_norm": 0.643321692943573, + "learning_rate": 3.473245907736248e-05, + "loss": 1.9629, + "step": 10415 + }, + { + "epoch": 1.87, + "grad_norm": 0.7608657479286194, + "learning_rate": 3.471950659997404e-05, + "loss": 1.8221, + "step": 10420 + }, + { + "epoch": 1.87, + "grad_norm": 1.1012566089630127, + "learning_rate": 3.470655104831501e-05, + "loss": 1.87, + "step": 10425 + }, + { + "epoch": 1.87, + "grad_norm": 0.7623602151870728, + "learning_rate": 3.469359242648321e-05, + "loss": 1.9752, + "step": 10430 + }, + { + "epoch": 1.87, + "grad_norm": 0.9718965888023376, + "learning_rate": 3.468063073857747e-05, + "loss": 1.784, + "step": 10435 + }, + { + "epoch": 1.87, + "grad_norm": 3.79907488822937, + "learning_rate": 3.466766598869753e-05, + "loss": 2.1134, + "step": 10440 + }, + { + "epoch": 1.87, + "grad_norm": 15.143911361694336, + "learning_rate": 3.4654698180944134e-05, + "loss": 2.1167, + "step": 10445 + }, + { + "epoch": 1.87, + "grad_norm": 0.5659872889518738, + "learning_rate": 3.4641727319418995e-05, + "loss": 2.2435, + "step": 10450 + }, + { + "epoch": 1.87, + "grad_norm": 0.6032043695449829, + "learning_rate": 3.4628753408224765e-05, + "loss": 2.0959, + "step": 10455 + }, + { + "epoch": 1.87, + "grad_norm": 0.6183554530143738, + "learning_rate": 3.461577645146511e-05, + "loss": 1.8351, + "step": 10460 + }, + { + "epoch": 1.87, + "grad_norm": 0.806084156036377, + "learning_rate": 3.4602796453244615e-05, + "loss": 1.6052, + "step": 10465 + }, + { + "epoch": 1.87, + "grad_norm": 0.7526884078979492, + "learning_rate": 3.4589813417668823e-05, + "loss": 1.9665, + "step": 10470 + }, + { + "epoch": 1.87, + "grad_norm": 0.9556746482849121, + "learning_rate": 3.457682734884428e-05, + "loss": 1.8647, + "step": 10475 + }, + { + "epoch": 1.88, + "grad_norm": 0.5155915021896362, + "learning_rate": 3.456383825087846e-05, + "loss": 1.9263, + "step": 10480 + }, + { + "epoch": 1.88, + "grad_norm": 0.661363422870636, + "learning_rate": 3.45508461278798e-05, + "loss": 1.7769, + "step": 10485 + }, + { + "epoch": 1.88, + "grad_norm": 0.5568251609802246, + "learning_rate": 3.453785098395769e-05, + "loss": 1.9676, + "step": 10490 + }, + { + "epoch": 1.88, + "grad_norm": 0.6141484975814819, + "learning_rate": 3.45248528232225e-05, + "loss": 2.0722, + "step": 10495 + }, + { + "epoch": 1.88, + "grad_norm": 0.6515774726867676, + "learning_rate": 3.4511851649785506e-05, + "loss": 1.9458, + "step": 10500 + }, + { + "epoch": 1.88, + "grad_norm": 0.6424517631530762, + "learning_rate": 3.4498847467759e-05, + "loss": 1.9665, + "step": 10505 + }, + { + "epoch": 1.88, + "grad_norm": 0.39145955443382263, + "learning_rate": 3.4485840281256176e-05, + "loss": 1.9062, + "step": 10510 + }, + { + "epoch": 1.88, + "grad_norm": 0.9824689030647278, + "learning_rate": 3.447283009439119e-05, + "loss": 1.6084, + "step": 10515 + }, + { + "epoch": 1.88, + "grad_norm": 0.44701799750328064, + "learning_rate": 3.445981691127915e-05, + "loss": 1.9367, + "step": 10520 + }, + { + "epoch": 1.88, + "grad_norm": 0.8132763504981995, + "learning_rate": 3.4446800736036134e-05, + "loss": 1.8618, + "step": 10525 + }, + { + "epoch": 1.88, + "grad_norm": 0.5754613876342773, + "learning_rate": 3.4433781572779126e-05, + "loss": 1.72, + "step": 10530 + }, + { + "epoch": 1.89, + "grad_norm": 0.46498289704322815, + "learning_rate": 3.442075942562608e-05, + "loss": 1.9345, + "step": 10535 + }, + { + "epoch": 1.89, + "grad_norm": 0.9286674857139587, + "learning_rate": 3.4407734298695895e-05, + "loss": 1.7018, + "step": 10540 + }, + { + "epoch": 1.89, + "grad_norm": 0.5865563750267029, + "learning_rate": 3.439470619610839e-05, + "loss": 1.9787, + "step": 10545 + }, + { + "epoch": 1.89, + "grad_norm": 1.0381340980529785, + "learning_rate": 3.438167512198436e-05, + "loss": 1.7573, + "step": 10550 + }, + { + "epoch": 1.89, + "grad_norm": 1.0353657007217407, + "learning_rate": 3.4368641080445504e-05, + "loss": 1.7011, + "step": 10555 + }, + { + "epoch": 1.89, + "grad_norm": 0.5357837677001953, + "learning_rate": 3.4355604075614493e-05, + "loss": 2.1718, + "step": 10560 + }, + { + "epoch": 1.89, + "grad_norm": 0.7515667080879211, + "learning_rate": 3.434256411161491e-05, + "loss": 1.9217, + "step": 10565 + }, + { + "epoch": 1.89, + "grad_norm": 0.7363743185997009, + "learning_rate": 3.432952119257128e-05, + "loss": 1.9062, + "step": 10570 + }, + { + "epoch": 1.89, + "grad_norm": 0.6471055746078491, + "learning_rate": 3.431647532260908e-05, + "loss": 2.0589, + "step": 10575 + }, + { + "epoch": 1.89, + "grad_norm": 0.6963765621185303, + "learning_rate": 3.4303426505854695e-05, + "loss": 1.9334, + "step": 10580 + }, + { + "epoch": 1.89, + "grad_norm": 1.006638765335083, + "learning_rate": 3.4290374746435454e-05, + "loss": 1.8345, + "step": 10585 + }, + { + "epoch": 1.9, + "grad_norm": 0.5938763618469238, + "learning_rate": 3.427732004847961e-05, + "loss": 1.7264, + "step": 10590 + }, + { + "epoch": 1.9, + "grad_norm": 0.7721897959709167, + "learning_rate": 3.4264262416116366e-05, + "loss": 1.7195, + "step": 10595 + }, + { + "epoch": 1.9, + "grad_norm": 0.6883012652397156, + "learning_rate": 3.425120185347584e-05, + "loss": 1.5873, + "step": 10600 + }, + { + "epoch": 1.9, + "grad_norm": 0.9483616352081299, + "learning_rate": 3.423813836468904e-05, + "loss": 1.8211, + "step": 10605 + }, + { + "epoch": 1.9, + "grad_norm": 0.6309101581573486, + "learning_rate": 3.4225071953887976e-05, + "loss": 1.9336, + "step": 10610 + }, + { + "epoch": 1.9, + "grad_norm": 0.8639585971832275, + "learning_rate": 3.421200262520553e-05, + "loss": 2.0485, + "step": 10615 + }, + { + "epoch": 1.9, + "grad_norm": 0.8056395649909973, + "learning_rate": 3.419893038277552e-05, + "loss": 1.8187, + "step": 10620 + }, + { + "epoch": 1.9, + "grad_norm": 1.7254058122634888, + "learning_rate": 3.418585523073266e-05, + "loss": 1.8431, + "step": 10625 + }, + { + "epoch": 1.9, + "grad_norm": 0.5998656153678894, + "learning_rate": 3.417277717321264e-05, + "loss": 1.8515, + "step": 10630 + }, + { + "epoch": 1.9, + "grad_norm": 0.8533723950386047, + "learning_rate": 3.4159696214352014e-05, + "loss": 1.9317, + "step": 10635 + }, + { + "epoch": 1.9, + "grad_norm": 0.44307661056518555, + "learning_rate": 3.414661235828829e-05, + "loss": 1.7573, + "step": 10640 + }, + { + "epoch": 1.91, + "grad_norm": 0.5623877644538879, + "learning_rate": 3.413352560915988e-05, + "loss": 1.9952, + "step": 10645 + }, + { + "epoch": 1.91, + "grad_norm": 0.5950538516044617, + "learning_rate": 3.4120435971106105e-05, + "loss": 2.0118, + "step": 10650 + }, + { + "epoch": 1.91, + "grad_norm": 0.7914562225341797, + "learning_rate": 3.410734344826719e-05, + "loss": 1.6905, + "step": 10655 + }, + { + "epoch": 1.91, + "grad_norm": 2.2145016193389893, + "learning_rate": 3.409424804478431e-05, + "loss": 1.9919, + "step": 10660 + }, + { + "epoch": 1.91, + "grad_norm": 1.1050101518630981, + "learning_rate": 3.408114976479953e-05, + "loss": 1.8972, + "step": 10665 + }, + { + "epoch": 1.91, + "grad_norm": 0.7993802428245544, + "learning_rate": 3.406804861245581e-05, + "loss": 1.7432, + "step": 10670 + }, + { + "epoch": 1.91, + "grad_norm": 0.7174950838088989, + "learning_rate": 3.4054944591897034e-05, + "loss": 1.7941, + "step": 10675 + }, + { + "epoch": 1.91, + "grad_norm": 2.201014518737793, + "learning_rate": 3.4041837707267996e-05, + "loss": 1.8024, + "step": 10680 + }, + { + "epoch": 1.91, + "grad_norm": 0.560925304889679, + "learning_rate": 3.402872796271438e-05, + "loss": 1.6103, + "step": 10685 + }, + { + "epoch": 1.91, + "grad_norm": 0.6595713496208191, + "learning_rate": 3.4015615362382814e-05, + "loss": 1.983, + "step": 10690 + }, + { + "epoch": 1.91, + "grad_norm": 0.6843532919883728, + "learning_rate": 3.400249991042076e-05, + "loss": 2.0138, + "step": 10695 + }, + { + "epoch": 1.92, + "grad_norm": 0.5372428297996521, + "learning_rate": 3.398938161097665e-05, + "loss": 2.0141, + "step": 10700 + }, + { + "epoch": 1.92, + "grad_norm": 0.6535273790359497, + "learning_rate": 3.3976260468199785e-05, + "loss": 2.0042, + "step": 10705 + }, + { + "epoch": 1.92, + "grad_norm": 0.6673722863197327, + "learning_rate": 3.3963136486240366e-05, + "loss": 1.8941, + "step": 10710 + }, + { + "epoch": 1.92, + "grad_norm": 0.6364368200302124, + "learning_rate": 3.3950009669249497e-05, + "loss": 1.7138, + "step": 10715 + }, + { + "epoch": 1.92, + "grad_norm": 0.6928640007972717, + "learning_rate": 3.393688002137919e-05, + "loss": 1.6929, + "step": 10720 + }, + { + "epoch": 1.92, + "grad_norm": 0.8325533270835876, + "learning_rate": 3.392374754678231e-05, + "loss": 1.9077, + "step": 10725 + }, + { + "epoch": 1.92, + "grad_norm": 1.1905181407928467, + "learning_rate": 3.3910612249612674e-05, + "loss": 1.9971, + "step": 10730 + }, + { + "epoch": 1.92, + "grad_norm": 0.6571048498153687, + "learning_rate": 3.3897474134024953e-05, + "loss": 2.2925, + "step": 10735 + }, + { + "epoch": 1.92, + "grad_norm": 0.5732169151306152, + "learning_rate": 3.3884333204174724e-05, + "loss": 1.9575, + "step": 10740 + }, + { + "epoch": 1.92, + "grad_norm": 0.4101000130176544, + "learning_rate": 3.3871189464218445e-05, + "loss": 2.1045, + "step": 10745 + }, + { + "epoch": 1.92, + "grad_norm": 0.73232102394104, + "learning_rate": 3.385804291831347e-05, + "loss": 1.9257, + "step": 10750 + }, + { + "epoch": 1.93, + "grad_norm": 0.6931615471839905, + "learning_rate": 3.384489357061804e-05, + "loss": 1.878, + "step": 10755 + }, + { + "epoch": 1.93, + "grad_norm": 0.5579245686531067, + "learning_rate": 3.3831741425291294e-05, + "loss": 1.9672, + "step": 10760 + }, + { + "epoch": 1.93, + "grad_norm": 0.762345016002655, + "learning_rate": 3.381858648649322e-05, + "loss": 1.8285, + "step": 10765 + }, + { + "epoch": 1.93, + "grad_norm": 1.3375375270843506, + "learning_rate": 3.380542875838472e-05, + "loss": 2.3338, + "step": 10770 + }, + { + "epoch": 1.93, + "grad_norm": 1.3226951360702515, + "learning_rate": 3.379226824512758e-05, + "loss": 1.9114, + "step": 10775 + }, + { + "epoch": 1.93, + "grad_norm": 0.7157413363456726, + "learning_rate": 3.3779104950884446e-05, + "loss": 1.8751, + "step": 10780 + }, + { + "epoch": 1.93, + "grad_norm": 0.9126617312431335, + "learning_rate": 3.376593887981887e-05, + "loss": 2.0082, + "step": 10785 + }, + { + "epoch": 1.93, + "grad_norm": 0.915809690952301, + "learning_rate": 3.3752770036095236e-05, + "loss": 2.2042, + "step": 10790 + }, + { + "epoch": 1.93, + "grad_norm": 1.350906252861023, + "learning_rate": 3.373959842387888e-05, + "loss": 2.0439, + "step": 10795 + }, + { + "epoch": 1.93, + "grad_norm": 0.6320156455039978, + "learning_rate": 3.3726424047335943e-05, + "loss": 2.0582, + "step": 10800 + }, + { + "epoch": 1.93, + "grad_norm": 0.42162302136421204, + "learning_rate": 3.3713246910633473e-05, + "loss": 2.2003, + "step": 10805 + }, + { + "epoch": 1.93, + "grad_norm": 0.7747112512588501, + "learning_rate": 3.370006701793939e-05, + "loss": 1.7979, + "step": 10810 + }, + { + "epoch": 1.94, + "grad_norm": 0.5144717693328857, + "learning_rate": 3.3686884373422465e-05, + "loss": 1.8804, + "step": 10815 + }, + { + "epoch": 1.94, + "grad_norm": 0.6446430683135986, + "learning_rate": 3.367369898125238e-05, + "loss": 2.1296, + "step": 10820 + }, + { + "epoch": 1.94, + "grad_norm": 0.71882164478302, + "learning_rate": 3.366051084559965e-05, + "loss": 2.0616, + "step": 10825 + }, + { + "epoch": 1.94, + "grad_norm": 1.0291924476623535, + "learning_rate": 3.3647319970635665e-05, + "loss": 1.9716, + "step": 10830 + }, + { + "epoch": 1.94, + "grad_norm": 1.1064375638961792, + "learning_rate": 3.363412636053269e-05, + "loss": 1.7938, + "step": 10835 + }, + { + "epoch": 1.94, + "grad_norm": 1.4092391729354858, + "learning_rate": 3.362093001946386e-05, + "loss": 2.3134, + "step": 10840 + }, + { + "epoch": 1.94, + "grad_norm": 0.5512030124664307, + "learning_rate": 3.360773095160315e-05, + "loss": 1.8288, + "step": 10845 + }, + { + "epoch": 1.94, + "grad_norm": 2.6880881786346436, + "learning_rate": 3.3594529161125424e-05, + "loss": 1.606, + "step": 10850 + }, + { + "epoch": 1.94, + "grad_norm": 0.32284656167030334, + "learning_rate": 3.358132465220639e-05, + "loss": 1.8226, + "step": 10855 + }, + { + "epoch": 1.94, + "grad_norm": 1.3082622289657593, + "learning_rate": 3.356811742902262e-05, + "loss": 1.9702, + "step": 10860 + }, + { + "epoch": 1.94, + "grad_norm": 0.579648494720459, + "learning_rate": 3.355490749575155e-05, + "loss": 1.8468, + "step": 10865 + }, + { + "epoch": 1.95, + "grad_norm": 0.6377948522567749, + "learning_rate": 3.354169485657147e-05, + "loss": 2.1329, + "step": 10870 + }, + { + "epoch": 1.95, + "grad_norm": 0.47102659940719604, + "learning_rate": 3.3528479515661514e-05, + "loss": 1.7664, + "step": 10875 + }, + { + "epoch": 1.95, + "grad_norm": 0.959218442440033, + "learning_rate": 3.35152614772017e-05, + "loss": 1.4881, + "step": 10880 + }, + { + "epoch": 1.95, + "grad_norm": 0.5902130007743835, + "learning_rate": 3.3502040745372874e-05, + "loss": 1.9162, + "step": 10885 + }, + { + "epoch": 1.95, + "grad_norm": 1.0470603704452515, + "learning_rate": 3.348881732435673e-05, + "loss": 1.4373, + "step": 10890 + }, + { + "epoch": 1.95, + "grad_norm": 0.8185494542121887, + "learning_rate": 3.347559121833584e-05, + "loss": 1.8553, + "step": 10895 + }, + { + "epoch": 1.95, + "grad_norm": 0.7984978556632996, + "learning_rate": 3.34623624314936e-05, + "loss": 1.7071, + "step": 10900 + }, + { + "epoch": 1.95, + "grad_norm": 0.5953229665756226, + "learning_rate": 3.344913096801425e-05, + "loss": 1.9628, + "step": 10905 + }, + { + "epoch": 1.95, + "grad_norm": 0.4407386779785156, + "learning_rate": 3.3435896832082916e-05, + "loss": 1.8533, + "step": 10910 + }, + { + "epoch": 1.95, + "grad_norm": 9.959710121154785, + "learning_rate": 3.3422660027885515e-05, + "loss": 1.8898, + "step": 10915 + }, + { + "epoch": 1.95, + "grad_norm": 1.026064157485962, + "learning_rate": 3.340942055960886e-05, + "loss": 1.9131, + "step": 10920 + }, + { + "epoch": 1.96, + "grad_norm": 0.8007922172546387, + "learning_rate": 3.339617843144057e-05, + "loss": 1.7598, + "step": 10925 + }, + { + "epoch": 1.96, + "grad_norm": 0.5460824966430664, + "learning_rate": 3.3382933647569115e-05, + "loss": 2.0906, + "step": 10930 + }, + { + "epoch": 1.96, + "grad_norm": 0.6225911378860474, + "learning_rate": 3.33696862121838e-05, + "loss": 1.8498, + "step": 10935 + }, + { + "epoch": 1.96, + "grad_norm": 0.6068547368049622, + "learning_rate": 3.33564361294748e-05, + "loss": 1.8413, + "step": 10940 + }, + { + "epoch": 1.96, + "grad_norm": 2.0755717754364014, + "learning_rate": 3.334318340363309e-05, + "loss": 2.0584, + "step": 10945 + }, + { + "epoch": 1.96, + "grad_norm": 0.5796096920967102, + "learning_rate": 3.332992803885049e-05, + "loss": 2.1057, + "step": 10950 + }, + { + "epoch": 1.96, + "grad_norm": 2.20377254486084, + "learning_rate": 3.331667003931967e-05, + "loss": 1.8257, + "step": 10955 + }, + { + "epoch": 1.96, + "grad_norm": 0.8372711539268494, + "learning_rate": 3.330340940923411e-05, + "loss": 1.9423, + "step": 10960 + }, + { + "epoch": 1.96, + "grad_norm": 0.5656445622444153, + "learning_rate": 3.3290146152788136e-05, + "loss": 1.7365, + "step": 10965 + }, + { + "epoch": 1.96, + "grad_norm": 0.3391028344631195, + "learning_rate": 3.3276880274176927e-05, + "loss": 1.7596, + "step": 10970 + }, + { + "epoch": 1.96, + "grad_norm": 1.2373957633972168, + "learning_rate": 3.326361177759643e-05, + "loss": 1.7008, + "step": 10975 + }, + { + "epoch": 1.97, + "grad_norm": 0.6096294522285461, + "learning_rate": 3.325034066724348e-05, + "loss": 1.8067, + "step": 10980 + }, + { + "epoch": 1.97, + "grad_norm": 0.7483140230178833, + "learning_rate": 3.323706694731572e-05, + "loss": 1.6209, + "step": 10985 + }, + { + "epoch": 1.97, + "grad_norm": 0.4939500391483307, + "learning_rate": 3.3223790622011595e-05, + "loss": 1.7462, + "step": 10990 + }, + { + "epoch": 1.97, + "grad_norm": 1.016813039779663, + "learning_rate": 3.321051169553042e-05, + "loss": 1.8239, + "step": 10995 + }, + { + "epoch": 1.97, + "grad_norm": 0.5926358103752136, + "learning_rate": 3.3197230172072294e-05, + "loss": 2.0223, + "step": 11000 + }, + { + "epoch": 1.97, + "grad_norm": 0.7104119062423706, + "learning_rate": 3.318394605583813e-05, + "loss": 1.7247, + "step": 11005 + }, + { + "epoch": 1.97, + "grad_norm": 1.1022257804870605, + "learning_rate": 3.317065935102972e-05, + "loss": 1.6968, + "step": 11010 + }, + { + "epoch": 1.97, + "grad_norm": 0.7626497745513916, + "learning_rate": 3.315737006184961e-05, + "loss": 1.9668, + "step": 11015 + }, + { + "epoch": 1.97, + "grad_norm": 0.606796383857727, + "learning_rate": 3.314407819250119e-05, + "loss": 1.8814, + "step": 11020 + }, + { + "epoch": 1.97, + "grad_norm": 0.34029826521873474, + "learning_rate": 3.313078374718868e-05, + "loss": 1.6823, + "step": 11025 + }, + { + "epoch": 1.97, + "grad_norm": 0.5755228400230408, + "learning_rate": 3.311748673011709e-05, + "loss": 1.6421, + "step": 11030 + }, + { + "epoch": 1.98, + "grad_norm": 0.7753984332084656, + "learning_rate": 3.310418714549225e-05, + "loss": 2.0978, + "step": 11035 + }, + { + "epoch": 1.98, + "grad_norm": 0.5059102177619934, + "learning_rate": 3.309088499752082e-05, + "loss": 2.0625, + "step": 11040 + }, + { + "epoch": 1.98, + "grad_norm": 0.6016696691513062, + "learning_rate": 3.307758029041024e-05, + "loss": 1.8624, + "step": 11045 + }, + { + "epoch": 1.98, + "grad_norm": 0.7924507856369019, + "learning_rate": 3.306427302836879e-05, + "loss": 1.8362, + "step": 11050 + }, + { + "epoch": 1.98, + "grad_norm": 0.7843905687332153, + "learning_rate": 3.3050963215605526e-05, + "loss": 2.0905, + "step": 11055 + }, + { + "epoch": 1.98, + "grad_norm": 1.0598030090332031, + "learning_rate": 3.3037650856330354e-05, + "loss": 1.7577, + "step": 11060 + }, + { + "epoch": 1.98, + "grad_norm": 0.7119802832603455, + "learning_rate": 3.3024335954753946e-05, + "loss": 1.896, + "step": 11065 + }, + { + "epoch": 1.98, + "grad_norm": 0.5891833901405334, + "learning_rate": 3.301101851508779e-05, + "loss": 2.03, + "step": 11070 + }, + { + "epoch": 1.98, + "grad_norm": 0.3929714262485504, + "learning_rate": 3.2997698541544184e-05, + "loss": 1.6645, + "step": 11075 + }, + { + "epoch": 1.98, + "grad_norm": 0.6735256910324097, + "learning_rate": 3.298437603833622e-05, + "loss": 2.2231, + "step": 11080 + }, + { + "epoch": 1.98, + "grad_norm": 0.9890113472938538, + "learning_rate": 3.29710510096778e-05, + "loss": 1.8691, + "step": 11085 + }, + { + "epoch": 1.99, + "grad_norm": 0.49276089668273926, + "learning_rate": 3.295772345978361e-05, + "loss": 1.8855, + "step": 11090 + }, + { + "epoch": 1.99, + "grad_norm": 0.7664940357208252, + "learning_rate": 3.2944393392869145e-05, + "loss": 2.1009, + "step": 11095 + }, + { + "epoch": 1.99, + "grad_norm": 0.7249503135681152, + "learning_rate": 3.2931060813150685e-05, + "loss": 1.8206, + "step": 11100 + }, + { + "epoch": 1.99, + "grad_norm": 1.5629411935806274, + "learning_rate": 3.291772572484533e-05, + "loss": 1.7675, + "step": 11105 + }, + { + "epoch": 1.99, + "grad_norm": 0.38072723150253296, + "learning_rate": 3.2904388132170936e-05, + "loss": 1.8236, + "step": 11110 + }, + { + "epoch": 1.99, + "grad_norm": 0.7046849727630615, + "learning_rate": 3.2891048039346177e-05, + "loss": 1.7859, + "step": 11115 + }, + { + "epoch": 1.99, + "grad_norm": 0.5897841453552246, + "learning_rate": 3.2877705450590526e-05, + "loss": 1.9604, + "step": 11120 + }, + { + "epoch": 1.99, + "grad_norm": 0.6484891772270203, + "learning_rate": 3.2864360370124206e-05, + "loss": 1.8623, + "step": 11125 + }, + { + "epoch": 1.99, + "grad_norm": 0.7585556507110596, + "learning_rate": 3.285101280216827e-05, + "loss": 1.746, + "step": 11130 + }, + { + "epoch": 1.99, + "grad_norm": 0.6930608153343201, + "learning_rate": 3.2837662750944535e-05, + "loss": 1.5794, + "step": 11135 + }, + { + "epoch": 1.99, + "grad_norm": 0.684201717376709, + "learning_rate": 3.282431022067561e-05, + "loss": 1.8985, + "step": 11140 + }, + { + "epoch": 1.99, + "grad_norm": 0.8467980027198792, + "learning_rate": 3.281095521558488e-05, + "loss": 1.8468, + "step": 11145 + }, + { + "epoch": 2.0, + "grad_norm": 2.533331871032715, + "learning_rate": 3.2797597739896545e-05, + "loss": 1.6749, + "step": 11150 + }, + { + "epoch": 2.0, + "grad_norm": 0.5449894666671753, + "learning_rate": 3.278423779783554e-05, + "loss": 1.7722, + "step": 11155 + }, + { + "epoch": 2.0, + "grad_norm": 0.5651440024375916, + "learning_rate": 3.27708753936276e-05, + "loss": 1.8541, + "step": 11160 + }, + { + "epoch": 2.0, + "grad_norm": 0.9599629640579224, + "learning_rate": 3.2757510531499256e-05, + "loss": 1.9124, + "step": 11165 + }, + { + "epoch": 2.0, + "grad_norm": 0.530594527721405, + "learning_rate": 3.274414321567779e-05, + "loss": 2.0135, + "step": 11170 + }, + { + "epoch": 2.0, + "grad_norm": 0.8075482249259949, + "learning_rate": 3.273077345039127e-05, + "loss": 1.8081, + "step": 11175 + }, + { + "epoch": 2.0, + "grad_norm": 0.5179402232170105, + "learning_rate": 3.271740123986856e-05, + "loss": 1.8615, + "step": 11180 + }, + { + "epoch": 2.0, + "grad_norm": 0.6782111525535583, + "learning_rate": 3.270402658833924e-05, + "loss": 1.9938, + "step": 11185 + }, + { + "epoch": 2.0, + "grad_norm": 0.5111522674560547, + "learning_rate": 3.2690649500033726e-05, + "loss": 1.5514, + "step": 11190 + }, + { + "epoch": 2.0, + "grad_norm": 1.3096226453781128, + "learning_rate": 3.267726997918318e-05, + "loss": 1.7645, + "step": 11195 + }, + { + "epoch": 2.0, + "grad_norm": 0.4817447364330292, + "learning_rate": 3.266388803001951e-05, + "loss": 1.7603, + "step": 11200 + }, + { + "epoch": 2.01, + "grad_norm": 0.8163765668869019, + "learning_rate": 3.2650503656775446e-05, + "loss": 1.8734, + "step": 11205 + }, + { + "epoch": 2.01, + "grad_norm": 0.504024863243103, + "learning_rate": 3.2637116863684426e-05, + "loss": 1.5327, + "step": 11210 + }, + { + "epoch": 2.01, + "grad_norm": 0.4556671380996704, + "learning_rate": 3.2623727654980686e-05, + "loss": 1.9113, + "step": 11215 + }, + { + "epoch": 2.01, + "grad_norm": 0.9641478657722473, + "learning_rate": 3.261033603489923e-05, + "loss": 1.8351, + "step": 11220 + }, + { + "epoch": 2.01, + "grad_norm": 0.5921196937561035, + "learning_rate": 3.259694200767579e-05, + "loss": 1.9184, + "step": 11225 + }, + { + "epoch": 2.01, + "grad_norm": 0.8275460004806519, + "learning_rate": 3.258354557754691e-05, + "loss": 1.7349, + "step": 11230 + }, + { + "epoch": 2.01, + "grad_norm": 0.7848659753799438, + "learning_rate": 3.257014674874986e-05, + "loss": 1.7097, + "step": 11235 + }, + { + "epoch": 2.01, + "grad_norm": 0.5022772550582886, + "learning_rate": 3.255674552552267e-05, + "loss": 1.7641, + "step": 11240 + }, + { + "epoch": 2.01, + "grad_norm": 1.0205901861190796, + "learning_rate": 3.254334191210414e-05, + "loss": 1.9206, + "step": 11245 + }, + { + "epoch": 2.01, + "grad_norm": 0.4597795009613037, + "learning_rate": 3.252993591273382e-05, + "loss": 1.9408, + "step": 11250 + }, + { + "epoch": 2.01, + "grad_norm": 0.4975505769252777, + "learning_rate": 3.251652753165202e-05, + "loss": 1.9817, + "step": 11255 + }, + { + "epoch": 2.02, + "grad_norm": 0.8713247776031494, + "learning_rate": 3.2503116773099786e-05, + "loss": 1.69, + "step": 11260 + }, + { + "epoch": 2.02, + "grad_norm": 0.6077904105186462, + "learning_rate": 3.248970364131894e-05, + "loss": 1.8712, + "step": 11265 + }, + { + "epoch": 2.02, + "grad_norm": 0.489361971616745, + "learning_rate": 3.2476288140552026e-05, + "loss": 1.7067, + "step": 11270 + }, + { + "epoch": 2.02, + "grad_norm": 0.7086228728294373, + "learning_rate": 3.246287027504237e-05, + "loss": 1.7504, + "step": 11275 + }, + { + "epoch": 2.02, + "grad_norm": 0.7792586088180542, + "learning_rate": 3.2449450049034024e-05, + "loss": 1.874, + "step": 11280 + }, + { + "epoch": 2.02, + "grad_norm": 0.6796438097953796, + "learning_rate": 3.243602746677179e-05, + "loss": 1.8736, + "step": 11285 + }, + { + "epoch": 2.02, + "grad_norm": 1.0525963306427002, + "learning_rate": 3.242260253250122e-05, + "loss": 1.7056, + "step": 11290 + }, + { + "epoch": 2.02, + "grad_norm": 0.3810064196586609, + "learning_rate": 3.240917525046862e-05, + "loss": 1.7772, + "step": 11295 + }, + { + "epoch": 2.02, + "grad_norm": 0.3621659278869629, + "learning_rate": 3.2395745624921e-05, + "loss": 1.8997, + "step": 11300 + }, + { + "epoch": 2.02, + "grad_norm": 0.7644492387771606, + "learning_rate": 3.238231366010616e-05, + "loss": 1.7473, + "step": 11305 + }, + { + "epoch": 2.02, + "grad_norm": 0.8346506953239441, + "learning_rate": 3.2368879360272606e-05, + "loss": 1.7338, + "step": 11310 + }, + { + "epoch": 2.03, + "grad_norm": 0.7360906004905701, + "learning_rate": 3.235544272966961e-05, + "loss": 1.6659, + "step": 11315 + }, + { + "epoch": 2.03, + "grad_norm": 0.8705644607543945, + "learning_rate": 3.2342003772547145e-05, + "loss": 1.9399, + "step": 11320 + }, + { + "epoch": 2.03, + "grad_norm": 1.1617865562438965, + "learning_rate": 3.232856249315595e-05, + "loss": 1.7525, + "step": 11325 + }, + { + "epoch": 2.03, + "grad_norm": 0.6664028167724609, + "learning_rate": 3.231511889574748e-05, + "loss": 1.7085, + "step": 11330 + }, + { + "epoch": 2.03, + "grad_norm": 0.378801167011261, + "learning_rate": 3.230167298457395e-05, + "loss": 2.1498, + "step": 11335 + }, + { + "epoch": 2.03, + "grad_norm": 2.0433850288391113, + "learning_rate": 3.2288224763888274e-05, + "loss": 1.9798, + "step": 11340 + }, + { + "epoch": 2.03, + "grad_norm": 0.8449577689170837, + "learning_rate": 3.227477423794412e-05, + "loss": 1.9539, + "step": 11345 + }, + { + "epoch": 2.03, + "grad_norm": 1.6414170265197754, + "learning_rate": 3.226132141099586e-05, + "loss": 1.8589, + "step": 11350 + }, + { + "epoch": 2.03, + "grad_norm": 0.9575155377388, + "learning_rate": 3.2247866287298634e-05, + "loss": 1.749, + "step": 11355 + }, + { + "epoch": 2.03, + "grad_norm": 1.6495996713638306, + "learning_rate": 3.2234408871108266e-05, + "loss": 1.7329, + "step": 11360 + }, + { + "epoch": 2.03, + "grad_norm": 0.5812102556228638, + "learning_rate": 3.2220949166681335e-05, + "loss": 1.9318, + "step": 11365 + }, + { + "epoch": 2.04, + "grad_norm": 0.8048359155654907, + "learning_rate": 3.220748717827513e-05, + "loss": 1.853, + "step": 11370 + }, + { + "epoch": 2.04, + "grad_norm": 0.7342902421951294, + "learning_rate": 3.219402291014766e-05, + "loss": 1.9342, + "step": 11375 + }, + { + "epoch": 2.04, + "grad_norm": 0.7486286163330078, + "learning_rate": 3.218055636655766e-05, + "loss": 2.1832, + "step": 11380 + }, + { + "epoch": 2.04, + "grad_norm": 1.1646840572357178, + "learning_rate": 3.21670875517646e-05, + "loss": 1.7443, + "step": 11385 + }, + { + "epoch": 2.04, + "grad_norm": 1.269112467765808, + "learning_rate": 3.215361647002863e-05, + "loss": 1.9097, + "step": 11390 + }, + { + "epoch": 2.04, + "grad_norm": 0.7798238396644592, + "learning_rate": 3.2140143125610654e-05, + "loss": 1.9486, + "step": 11395 + }, + { + "epoch": 2.04, + "grad_norm": 0.9379304647445679, + "learning_rate": 3.212666752277228e-05, + "loss": 1.7346, + "step": 11400 + }, + { + "epoch": 2.04, + "grad_norm": 1.0831714868545532, + "learning_rate": 3.211318966577581e-05, + "loss": 2.0097, + "step": 11405 + }, + { + "epoch": 2.04, + "grad_norm": 0.3674708902835846, + "learning_rate": 3.20997095588843e-05, + "loss": 2.05, + "step": 11410 + }, + { + "epoch": 2.04, + "grad_norm": 0.9015005826950073, + "learning_rate": 3.20862272063615e-05, + "loss": 1.7768, + "step": 11415 + }, + { + "epoch": 2.04, + "grad_norm": 0.8480490446090698, + "learning_rate": 3.207274261247183e-05, + "loss": 1.768, + "step": 11420 + }, + { + "epoch": 2.05, + "grad_norm": 0.48236319422721863, + "learning_rate": 3.2059255781480475e-05, + "loss": 1.9133, + "step": 11425 + }, + { + "epoch": 2.05, + "grad_norm": 1.2355780601501465, + "learning_rate": 3.204576671765331e-05, + "loss": 1.9076, + "step": 11430 + }, + { + "epoch": 2.05, + "grad_norm": 0.6403537392616272, + "learning_rate": 3.2032275425256916e-05, + "loss": 1.7401, + "step": 11435 + }, + { + "epoch": 2.05, + "grad_norm": 0.7580591440200806, + "learning_rate": 3.201878190855857e-05, + "loss": 1.8118, + "step": 11440 + }, + { + "epoch": 2.05, + "grad_norm": 0.9600961208343506, + "learning_rate": 3.2005286171826256e-05, + "loss": 2.1253, + "step": 11445 + }, + { + "epoch": 2.05, + "grad_norm": 0.496980756521225, + "learning_rate": 3.199178821932865e-05, + "loss": 1.9104, + "step": 11450 + }, + { + "epoch": 2.05, + "grad_norm": 0.6483522653579712, + "learning_rate": 3.1978288055335164e-05, + "loss": 1.7537, + "step": 11455 + }, + { + "epoch": 2.05, + "grad_norm": 0.5102001428604126, + "learning_rate": 3.196478568411589e-05, + "loss": 1.8392, + "step": 11460 + }, + { + "epoch": 2.05, + "grad_norm": 0.5300964713096619, + "learning_rate": 3.195128110994159e-05, + "loss": 2.1472, + "step": 11465 + }, + { + "epoch": 2.05, + "grad_norm": 0.7955127358436584, + "learning_rate": 3.193777433708376e-05, + "loss": 1.9474, + "step": 11470 + }, + { + "epoch": 2.05, + "grad_norm": 0.7563099265098572, + "learning_rate": 3.192426536981459e-05, + "loss": 1.9155, + "step": 11475 + }, + { + "epoch": 2.05, + "grad_norm": 0.6697816252708435, + "learning_rate": 3.191075421240694e-05, + "loss": 1.9573, + "step": 11480 + }, + { + "epoch": 2.06, + "grad_norm": 0.7206788063049316, + "learning_rate": 3.189724086913438e-05, + "loss": 1.8042, + "step": 11485 + }, + { + "epoch": 2.06, + "grad_norm": 0.4096161127090454, + "learning_rate": 3.188372534427117e-05, + "loss": 2.0095, + "step": 11490 + }, + { + "epoch": 2.06, + "grad_norm": 0.6973406672477722, + "learning_rate": 3.1870207642092246e-05, + "loss": 1.4596, + "step": 11495 + }, + { + "epoch": 2.06, + "grad_norm": 0.4881574213504791, + "learning_rate": 3.185668776687326e-05, + "loss": 1.9685, + "step": 11500 + }, + { + "epoch": 2.06, + "grad_norm": 0.6438494920730591, + "learning_rate": 3.184316572289053e-05, + "loss": 1.9556, + "step": 11505 + }, + { + "epoch": 2.06, + "grad_norm": 0.7263823747634888, + "learning_rate": 3.182964151442107e-05, + "loss": 2.1189, + "step": 11510 + }, + { + "epoch": 2.06, + "grad_norm": 0.909968912601471, + "learning_rate": 3.181611514574255e-05, + "loss": 1.7608, + "step": 11515 + }, + { + "epoch": 2.06, + "grad_norm": 0.6028372645378113, + "learning_rate": 3.180258662113338e-05, + "loss": 1.7227, + "step": 11520 + }, + { + "epoch": 2.06, + "grad_norm": 0.7164033651351929, + "learning_rate": 3.17890559448726e-05, + "loss": 1.978, + "step": 11525 + }, + { + "epoch": 2.06, + "grad_norm": 0.8094786405563354, + "learning_rate": 3.177552312123995e-05, + "loss": 1.957, + "step": 11530 + }, + { + "epoch": 2.06, + "grad_norm": 0.6405372619628906, + "learning_rate": 3.1761988154515864e-05, + "loss": 1.9312, + "step": 11535 + }, + { + "epoch": 2.07, + "grad_norm": 0.37368497252464294, + "learning_rate": 3.1748451048981424e-05, + "loss": 2.1636, + "step": 11540 + }, + { + "epoch": 2.07, + "grad_norm": 0.5130252242088318, + "learning_rate": 3.1734911808918406e-05, + "loss": 1.5754, + "step": 11545 + }, + { + "epoch": 2.07, + "grad_norm": 0.6798210740089417, + "learning_rate": 3.172137043860927e-05, + "loss": 1.8592, + "step": 11550 + }, + { + "epoch": 2.07, + "grad_norm": 0.8292738199234009, + "learning_rate": 3.170782694233712e-05, + "loss": 1.6516, + "step": 11555 + }, + { + "epoch": 2.07, + "grad_norm": 0.5600850582122803, + "learning_rate": 3.169428132438576e-05, + "loss": 1.7959, + "step": 11560 + }, + { + "epoch": 2.07, + "grad_norm": 0.9339403510093689, + "learning_rate": 3.168073358903966e-05, + "loss": 1.8377, + "step": 11565 + }, + { + "epoch": 2.07, + "grad_norm": 0.5029419660568237, + "learning_rate": 3.166718374058395e-05, + "loss": 2.0719, + "step": 11570 + }, + { + "epoch": 2.07, + "grad_norm": 1.2558355331420898, + "learning_rate": 3.165363178330444e-05, + "loss": 1.8537, + "step": 11575 + }, + { + "epoch": 2.07, + "grad_norm": 0.9240514039993286, + "learning_rate": 3.16400777214876e-05, + "loss": 1.5651, + "step": 11580 + }, + { + "epoch": 2.07, + "grad_norm": 0.5042521357536316, + "learning_rate": 3.1626521559420556e-05, + "loss": 1.6923, + "step": 11585 + }, + { + "epoch": 2.07, + "grad_norm": 0.8794254064559937, + "learning_rate": 3.161296330139111e-05, + "loss": 1.6979, + "step": 11590 + }, + { + "epoch": 2.08, + "grad_norm": 0.6789154410362244, + "learning_rate": 3.1599402951687744e-05, + "loss": 1.9248, + "step": 11595 + }, + { + "epoch": 2.08, + "grad_norm": 0.4555959403514862, + "learning_rate": 3.1585840514599574e-05, + "loss": 1.7823, + "step": 11600 + }, + { + "epoch": 2.08, + "grad_norm": 0.4320109784603119, + "learning_rate": 3.1572275994416376e-05, + "loss": 1.9124, + "step": 11605 + }, + { + "epoch": 2.08, + "grad_norm": 0.5282646417617798, + "learning_rate": 3.155870939542861e-05, + "loss": 1.8452, + "step": 11610 + }, + { + "epoch": 2.08, + "grad_norm": 1.2354097366333008, + "learning_rate": 3.154514072192736e-05, + "loss": 2.0078, + "step": 11615 + }, + { + "epoch": 2.08, + "grad_norm": 0.7362090945243835, + "learning_rate": 3.153156997820441e-05, + "loss": 1.7717, + "step": 11620 + }, + { + "epoch": 2.08, + "grad_norm": 0.5220533609390259, + "learning_rate": 3.151799716855215e-05, + "loss": 1.6834, + "step": 11625 + }, + { + "epoch": 2.08, + "grad_norm": 0.6556394696235657, + "learning_rate": 3.150442229726366e-05, + "loss": 1.7327, + "step": 11630 + }, + { + "epoch": 2.08, + "grad_norm": 0.4095345139503479, + "learning_rate": 3.1490845368632645e-05, + "loss": 1.852, + "step": 11635 + }, + { + "epoch": 2.08, + "grad_norm": 0.6544451117515564, + "learning_rate": 3.147726638695349e-05, + "loss": 1.8067, + "step": 11640 + }, + { + "epoch": 2.08, + "grad_norm": 0.5166847705841064, + "learning_rate": 3.14636853565212e-05, + "loss": 1.7826, + "step": 11645 + }, + { + "epoch": 2.09, + "grad_norm": 0.8674813508987427, + "learning_rate": 3.145010228163145e-05, + "loss": 1.9693, + "step": 11650 + }, + { + "epoch": 2.09, + "grad_norm": 0.7544355988502502, + "learning_rate": 3.1436517166580565e-05, + "loss": 1.6409, + "step": 11655 + }, + { + "epoch": 2.09, + "grad_norm": 0.8396444916725159, + "learning_rate": 3.1422930015665484e-05, + "loss": 1.9227, + "step": 11660 + }, + { + "epoch": 2.09, + "grad_norm": 0.7315942049026489, + "learning_rate": 3.140934083318382e-05, + "loss": 1.6208, + "step": 11665 + }, + { + "epoch": 2.09, + "grad_norm": 3.2649600505828857, + "learning_rate": 3.139574962343381e-05, + "loss": 1.7472, + "step": 11670 + }, + { + "epoch": 2.09, + "grad_norm": 0.7523483037948608, + "learning_rate": 3.138215639071435e-05, + "loss": 1.7216, + "step": 11675 + }, + { + "epoch": 2.09, + "grad_norm": 0.9226972460746765, + "learning_rate": 3.1368561139324956e-05, + "loss": 1.6117, + "step": 11680 + }, + { + "epoch": 2.09, + "grad_norm": 1.0279871225357056, + "learning_rate": 3.13549638735658e-05, + "loss": 2.0684, + "step": 11685 + }, + { + "epoch": 2.09, + "grad_norm": 0.8119601607322693, + "learning_rate": 3.1341364597737686e-05, + "loss": 1.7426, + "step": 11690 + }, + { + "epoch": 2.09, + "grad_norm": 0.5219517350196838, + "learning_rate": 3.132776331614205e-05, + "loss": 2.1323, + "step": 11695 + }, + { + "epoch": 2.09, + "grad_norm": 0.8303288221359253, + "learning_rate": 3.131416003308097e-05, + "loss": 1.7259, + "step": 11700 + }, + { + "epoch": 2.1, + "grad_norm": 0.494863897562027, + "learning_rate": 3.130055475285714e-05, + "loss": 1.9558, + "step": 11705 + }, + { + "epoch": 2.1, + "grad_norm": 1.1602771282196045, + "learning_rate": 3.12869474797739e-05, + "loss": 1.8207, + "step": 11710 + }, + { + "epoch": 2.1, + "grad_norm": 0.6153246164321899, + "learning_rate": 3.127333821813522e-05, + "loss": 1.8699, + "step": 11715 + }, + { + "epoch": 2.1, + "grad_norm": 0.7908428907394409, + "learning_rate": 3.1259726972245694e-05, + "loss": 1.8281, + "step": 11720 + }, + { + "epoch": 2.1, + "grad_norm": 0.8119539618492126, + "learning_rate": 3.124611374641056e-05, + "loss": 1.8657, + "step": 11725 + }, + { + "epoch": 2.1, + "grad_norm": 0.5964948534965515, + "learning_rate": 3.1232498544935635e-05, + "loss": 1.8478, + "step": 11730 + }, + { + "epoch": 2.1, + "grad_norm": 1.046675682067871, + "learning_rate": 3.121888137212742e-05, + "loss": 1.7418, + "step": 11735 + }, + { + "epoch": 2.1, + "grad_norm": 0.6229978799819946, + "learning_rate": 3.120526223229302e-05, + "loss": 1.8828, + "step": 11740 + }, + { + "epoch": 2.1, + "grad_norm": 1.7325795888900757, + "learning_rate": 3.119164112974014e-05, + "loss": 1.6943, + "step": 11745 + }, + { + "epoch": 2.1, + "grad_norm": 0.5110823512077332, + "learning_rate": 3.1178018068777125e-05, + "loss": 2.0465, + "step": 11750 + }, + { + "epoch": 2.1, + "grad_norm": 0.5617272257804871, + "learning_rate": 3.1164393053712944e-05, + "loss": 2.0785, + "step": 11755 + }, + { + "epoch": 2.1, + "grad_norm": 0.6913292407989502, + "learning_rate": 3.115076608885716e-05, + "loss": 1.8045, + "step": 11760 + }, + { + "epoch": 2.11, + "grad_norm": 1.4254094362258911, + "learning_rate": 3.1137137178519985e-05, + "loss": 1.7095, + "step": 11765 + }, + { + "epoch": 2.11, + "grad_norm": 0.7044830322265625, + "learning_rate": 3.112350632701222e-05, + "loss": 1.9113, + "step": 11770 + }, + { + "epoch": 2.11, + "grad_norm": 0.994310200214386, + "learning_rate": 3.110987353864529e-05, + "loss": 1.6473, + "step": 11775 + }, + { + "epoch": 2.11, + "grad_norm": 3.0846924781799316, + "learning_rate": 3.109623881773124e-05, + "loss": 1.7661, + "step": 11780 + }, + { + "epoch": 2.11, + "grad_norm": 1.5573928356170654, + "learning_rate": 3.108260216858272e-05, + "loss": 1.8612, + "step": 11785 + }, + { + "epoch": 2.11, + "grad_norm": 1.210252046585083, + "learning_rate": 3.106896359551299e-05, + "loss": 1.9547, + "step": 11790 + }, + { + "epoch": 2.11, + "grad_norm": 0.9023343920707703, + "learning_rate": 3.1055323102835895e-05, + "loss": 1.7782, + "step": 11795 + }, + { + "epoch": 2.11, + "grad_norm": 21.55355453491211, + "learning_rate": 3.1041680694865935e-05, + "loss": 1.7996, + "step": 11800 + }, + { + "epoch": 2.11, + "grad_norm": 0.7996496558189392, + "learning_rate": 3.102803637591818e-05, + "loss": 2.1707, + "step": 11805 + }, + { + "epoch": 2.11, + "grad_norm": 0.8086062073707581, + "learning_rate": 3.1014390150308326e-05, + "loss": 2.1546, + "step": 11810 + }, + { + "epoch": 2.11, + "grad_norm": 3.054373025894165, + "learning_rate": 3.1003471799924244e-05, + "loss": 2.0085, + "step": 11815 + }, + { + "epoch": 2.12, + "grad_norm": 0.8012299537658691, + "learning_rate": 3.098982215320005e-05, + "loss": 1.6379, + "step": 11820 + }, + { + "epoch": 2.12, + "grad_norm": 1.008513331413269, + "learning_rate": 3.097617061190086e-05, + "loss": 1.9324, + "step": 11825 + }, + { + "epoch": 2.12, + "grad_norm": 0.827659547328949, + "learning_rate": 3.096251718034466e-05, + "loss": 2.1212, + "step": 11830 + }, + { + "epoch": 2.12, + "grad_norm": 0.8780943155288696, + "learning_rate": 3.0948861862850005e-05, + "loss": 1.4271, + "step": 11835 + }, + { + "epoch": 2.12, + "grad_norm": 0.5806859731674194, + "learning_rate": 3.093520466373607e-05, + "loss": 1.9781, + "step": 11840 + }, + { + "epoch": 2.12, + "grad_norm": 0.8726419806480408, + "learning_rate": 3.0921545587322605e-05, + "loss": 1.7361, + "step": 11845 + }, + { + "epoch": 2.12, + "grad_norm": 0.8282438516616821, + "learning_rate": 3.090788463792996e-05, + "loss": 1.4147, + "step": 11850 + }, + { + "epoch": 2.12, + "grad_norm": 1.0596107244491577, + "learning_rate": 3.0894221819879094e-05, + "loss": 1.8455, + "step": 11855 + }, + { + "epoch": 2.12, + "grad_norm": 0.9032155275344849, + "learning_rate": 3.0880557137491546e-05, + "loss": 1.897, + "step": 11860 + }, + { + "epoch": 2.12, + "grad_norm": 2.334949016571045, + "learning_rate": 3.0866890595089414e-05, + "loss": 2.0026, + "step": 11865 + }, + { + "epoch": 2.12, + "grad_norm": 0.5920724868774414, + "learning_rate": 3.085322219699544e-05, + "loss": 1.7887, + "step": 11870 + }, + { + "epoch": 2.13, + "grad_norm": 0.8088483214378357, + "learning_rate": 3.083955194753291e-05, + "loss": 1.6292, + "step": 11875 + }, + { + "epoch": 2.13, + "grad_norm": 1.0029832124710083, + "learning_rate": 3.082587985102572e-05, + "loss": 1.8685, + "step": 11880 + }, + { + "epoch": 2.13, + "grad_norm": 0.9885581135749817, + "learning_rate": 3.0812205911798336e-05, + "loss": 1.812, + "step": 11885 + }, + { + "epoch": 2.13, + "grad_norm": 1.0413260459899902, + "learning_rate": 3.079853013417582e-05, + "loss": 2.1284, + "step": 11890 + }, + { + "epoch": 2.13, + "grad_norm": 1.0664329528808594, + "learning_rate": 3.078485252248379e-05, + "loss": 2.0554, + "step": 11895 + }, + { + "epoch": 2.13, + "grad_norm": 0.3859197795391083, + "learning_rate": 3.077117308104846e-05, + "loss": 2.0608, + "step": 11900 + }, + { + "epoch": 2.13, + "grad_norm": 0.6689034104347229, + "learning_rate": 3.0757491814196665e-05, + "loss": 1.7033, + "step": 11905 + }, + { + "epoch": 2.13, + "grad_norm": 0.5232422351837158, + "learning_rate": 3.074380872625573e-05, + "loss": 1.7849, + "step": 11910 + }, + { + "epoch": 2.13, + "grad_norm": 0.7212871313095093, + "learning_rate": 3.073012382155362e-05, + "loss": 1.9207, + "step": 11915 + }, + { + "epoch": 2.13, + "grad_norm": 0.5604449510574341, + "learning_rate": 3.071643710441886e-05, + "loss": 1.8573, + "step": 11920 + }, + { + "epoch": 2.13, + "grad_norm": 1.3720588684082031, + "learning_rate": 3.070274857918054e-05, + "loss": 1.696, + "step": 11925 + }, + { + "epoch": 2.14, + "grad_norm": 0.821916937828064, + "learning_rate": 3.068905825016834e-05, + "loss": 1.5985, + "step": 11930 + }, + { + "epoch": 2.14, + "grad_norm": 0.8182123899459839, + "learning_rate": 3.0675366121712476e-05, + "loss": 2.0291, + "step": 11935 + }, + { + "epoch": 2.14, + "grad_norm": 1.4312397241592407, + "learning_rate": 3.066167219814376e-05, + "loss": 1.5368, + "step": 11940 + }, + { + "epoch": 2.14, + "grad_norm": 0.5408018231391907, + "learning_rate": 3.0647976483793584e-05, + "loss": 1.828, + "step": 11945 + }, + { + "epoch": 2.14, + "grad_norm": 0.6024120450019836, + "learning_rate": 3.063427898299388e-05, + "loss": 1.8548, + "step": 11950 + }, + { + "epoch": 2.14, + "grad_norm": 0.9336116313934326, + "learning_rate": 3.062057970007715e-05, + "loss": 1.6518, + "step": 11955 + }, + { + "epoch": 2.14, + "grad_norm": 2.2404088973999023, + "learning_rate": 3.060687863937647e-05, + "loss": 2.039, + "step": 11960 + }, + { + "epoch": 2.14, + "grad_norm": 0.6012473106384277, + "learning_rate": 3.059317580522546e-05, + "loss": 2.2862, + "step": 11965 + }, + { + "epoch": 2.14, + "grad_norm": 0.5676471590995789, + "learning_rate": 3.057947120195833e-05, + "loss": 2.1326, + "step": 11970 + }, + { + "epoch": 2.14, + "grad_norm": 0.6940319538116455, + "learning_rate": 3.056576483390983e-05, + "loss": 2.0709, + "step": 11975 + }, + { + "epoch": 2.14, + "grad_norm": 0.5876992344856262, + "learning_rate": 3.055205670541527e-05, + "loss": 1.8725, + "step": 11980 + }, + { + "epoch": 2.15, + "grad_norm": 1.3545783758163452, + "learning_rate": 3.05383468208105e-05, + "loss": 1.7014, + "step": 11985 + }, + { + "epoch": 2.15, + "grad_norm": 0.659228503704071, + "learning_rate": 3.052463518443196e-05, + "loss": 1.6168, + "step": 11990 + }, + { + "epoch": 2.15, + "grad_norm": 0.3598320782184601, + "learning_rate": 3.0510921800616633e-05, + "loss": 1.812, + "step": 11995 + }, + { + "epoch": 2.15, + "grad_norm": 1.2681336402893066, + "learning_rate": 3.049720667370204e-05, + "loss": 1.6559, + "step": 12000 + }, + { + "epoch": 2.15, + "grad_norm": 0.691615879535675, + "learning_rate": 3.048348980802626e-05, + "loss": 1.7639, + "step": 12005 + }, + { + "epoch": 2.15, + "grad_norm": 0.6840038299560547, + "learning_rate": 3.0469771207927932e-05, + "loss": 1.7349, + "step": 12010 + }, + { + "epoch": 2.15, + "grad_norm": 0.38991519808769226, + "learning_rate": 3.0456050877746228e-05, + "loss": 2.1607, + "step": 12015 + }, + { + "epoch": 2.15, + "grad_norm": 0.6570853590965271, + "learning_rate": 3.044232882182088e-05, + "loss": 2.0544, + "step": 12020 + }, + { + "epoch": 2.15, + "grad_norm": 0.6140663027763367, + "learning_rate": 3.042860504449217e-05, + "loss": 1.7201, + "step": 12025 + }, + { + "epoch": 2.15, + "grad_norm": 0.6192611455917358, + "learning_rate": 3.0414879550100894e-05, + "loss": 1.895, + "step": 12030 + }, + { + "epoch": 2.15, + "grad_norm": 0.8460562825202942, + "learning_rate": 3.0401152342988426e-05, + "loss": 1.7679, + "step": 12035 + }, + { + "epoch": 2.16, + "grad_norm": 0.6160430312156677, + "learning_rate": 3.0387423427496674e-05, + "loss": 1.7706, + "step": 12040 + }, + { + "epoch": 2.16, + "grad_norm": 0.532014787197113, + "learning_rate": 3.0373692807968074e-05, + "loss": 2.0207, + "step": 12045 + }, + { + "epoch": 2.16, + "grad_norm": 0.6476467847824097, + "learning_rate": 3.0359960488745598e-05, + "loss": 1.6105, + "step": 12050 + }, + { + "epoch": 2.16, + "grad_norm": 0.520656406879425, + "learning_rate": 3.0346226474172783e-05, + "loss": 1.8744, + "step": 12055 + }, + { + "epoch": 2.16, + "grad_norm": 0.8512612581253052, + "learning_rate": 3.0332490768593675e-05, + "loss": 1.8777, + "step": 12060 + }, + { + "epoch": 2.16, + "grad_norm": 0.6043909788131714, + "learning_rate": 3.0318753376352866e-05, + "loss": 1.6042, + "step": 12065 + }, + { + "epoch": 2.16, + "grad_norm": 1.555930256843567, + "learning_rate": 3.030501430179548e-05, + "loss": 1.7851, + "step": 12070 + }, + { + "epoch": 2.16, + "grad_norm": 0.8706346750259399, + "learning_rate": 3.0291273549267173e-05, + "loss": 1.9402, + "step": 12075 + }, + { + "epoch": 2.16, + "grad_norm": 0.7135600447654724, + "learning_rate": 3.027753112311413e-05, + "loss": 2.1418, + "step": 12080 + }, + { + "epoch": 2.16, + "grad_norm": 1.1376060247421265, + "learning_rate": 3.026378702768307e-05, + "loss": 1.8637, + "step": 12085 + }, + { + "epoch": 2.16, + "grad_norm": 0.6475717425346375, + "learning_rate": 3.0250041267321232e-05, + "loss": 2.0024, + "step": 12090 + }, + { + "epoch": 2.16, + "grad_norm": 0.3832349181175232, + "learning_rate": 3.02362938463764e-05, + "loss": 1.837, + "step": 12095 + }, + { + "epoch": 2.17, + "grad_norm": 1.567743182182312, + "learning_rate": 3.0222544769196858e-05, + "loss": 1.7733, + "step": 12100 + }, + { + "epoch": 2.17, + "grad_norm": 0.7041072249412537, + "learning_rate": 3.0208794040131426e-05, + "loss": 1.5469, + "step": 12105 + }, + { + "epoch": 2.17, + "grad_norm": 0.7668848633766174, + "learning_rate": 3.0195041663529456e-05, + "loss": 1.9095, + "step": 12110 + }, + { + "epoch": 2.17, + "grad_norm": 0.9507167935371399, + "learning_rate": 3.01812876437408e-05, + "loss": 1.8337, + "step": 12115 + }, + { + "epoch": 2.17, + "grad_norm": 0.7399880886077881, + "learning_rate": 3.0167531985115842e-05, + "loss": 1.6083, + "step": 12120 + }, + { + "epoch": 2.17, + "grad_norm": 0.739559531211853, + "learning_rate": 3.0153774692005492e-05, + "loss": 1.6946, + "step": 12125 + }, + { + "epoch": 2.17, + "grad_norm": 0.6560977697372437, + "learning_rate": 3.0140015768761164e-05, + "loss": 1.874, + "step": 12130 + }, + { + "epoch": 2.17, + "grad_norm": 0.5614013075828552, + "learning_rate": 3.012625521973479e-05, + "loss": 1.8449, + "step": 12135 + }, + { + "epoch": 2.17, + "grad_norm": 0.746187686920166, + "learning_rate": 3.011249304927883e-05, + "loss": 1.7156, + "step": 12140 + }, + { + "epoch": 2.17, + "grad_norm": 0.5409942865371704, + "learning_rate": 3.0098729261746228e-05, + "loss": 2.0243, + "step": 12145 + }, + { + "epoch": 2.17, + "grad_norm": 1.222755789756775, + "learning_rate": 3.0084963861490468e-05, + "loss": 1.7754, + "step": 12150 + }, + { + "epoch": 2.18, + "grad_norm": 1.2106776237487793, + "learning_rate": 3.0071196852865528e-05, + "loss": 1.9023, + "step": 12155 + }, + { + "epoch": 2.18, + "grad_norm": 0.8225529193878174, + "learning_rate": 3.0057428240225897e-05, + "loss": 2.0112, + "step": 12160 + }, + { + "epoch": 2.18, + "grad_norm": 0.5646623969078064, + "learning_rate": 3.0043658027926585e-05, + "loss": 1.5788, + "step": 12165 + }, + { + "epoch": 2.18, + "grad_norm": 0.7176980972290039, + "learning_rate": 3.0029886220323082e-05, + "loss": 1.9195, + "step": 12170 + }, + { + "epoch": 2.18, + "grad_norm": 0.6058060526847839, + "learning_rate": 3.0016112821771418e-05, + "loss": 1.9483, + "step": 12175 + }, + { + "epoch": 2.18, + "grad_norm": 0.7406783699989319, + "learning_rate": 3.000233783662808e-05, + "loss": 1.8359, + "step": 12180 + }, + { + "epoch": 2.18, + "grad_norm": 1.0844707489013672, + "learning_rate": 2.998856126925011e-05, + "loss": 1.972, + "step": 12185 + }, + { + "epoch": 2.18, + "grad_norm": 0.5166330933570862, + "learning_rate": 2.9974783123995005e-05, + "loss": 1.7504, + "step": 12190 + }, + { + "epoch": 2.18, + "grad_norm": 1.1674580574035645, + "learning_rate": 2.9961003405220774e-05, + "loss": 1.6967, + "step": 12195 + }, + { + "epoch": 2.18, + "grad_norm": 1.0376147031784058, + "learning_rate": 2.9947222117285945e-05, + "loss": 1.8488, + "step": 12200 + }, + { + "epoch": 2.18, + "grad_norm": 1.062048077583313, + "learning_rate": 2.9933439264549518e-05, + "loss": 1.6505, + "step": 12205 + }, + { + "epoch": 2.19, + "grad_norm": 0.7804186344146729, + "learning_rate": 2.9919654851370998e-05, + "loss": 1.6969, + "step": 12210 + }, + { + "epoch": 2.19, + "grad_norm": 0.533622145652771, + "learning_rate": 2.9905868882110378e-05, + "loss": 1.9222, + "step": 12215 + }, + { + "epoch": 2.19, + "grad_norm": 0.6791996955871582, + "learning_rate": 2.989208136112815e-05, + "loss": 1.8727, + "step": 12220 + }, + { + "epoch": 2.19, + "grad_norm": 1.5420125722885132, + "learning_rate": 2.9878292292785293e-05, + "loss": 1.4943, + "step": 12225 + }, + { + "epoch": 2.19, + "grad_norm": 0.7872235774993896, + "learning_rate": 2.9864501681443276e-05, + "loss": 1.5611, + "step": 12230 + }, + { + "epoch": 2.19, + "grad_norm": 0.7012057900428772, + "learning_rate": 2.985070953146406e-05, + "loss": 1.6986, + "step": 12235 + }, + { + "epoch": 2.19, + "grad_norm": 0.6529300212860107, + "learning_rate": 2.983691584721008e-05, + "loss": 1.6813, + "step": 12240 + }, + { + "epoch": 2.19, + "grad_norm": 0.9842402935028076, + "learning_rate": 2.982312063304427e-05, + "loss": 1.9067, + "step": 12245 + }, + { + "epoch": 2.19, + "grad_norm": 0.594903826713562, + "learning_rate": 2.9809323893330043e-05, + "loss": 1.6995, + "step": 12250 + }, + { + "epoch": 2.19, + "grad_norm": 1.0693496465682983, + "learning_rate": 2.9795525632431297e-05, + "loss": 1.4401, + "step": 12255 + }, + { + "epoch": 2.19, + "grad_norm": 0.7465589046478271, + "learning_rate": 2.978172585471241e-05, + "loss": 1.9998, + "step": 12260 + }, + { + "epoch": 2.2, + "grad_norm": 0.5380420684814453, + "learning_rate": 2.976792456453823e-05, + "loss": 1.8497, + "step": 12265 + }, + { + "epoch": 2.2, + "grad_norm": 0.618114709854126, + "learning_rate": 2.975412176627409e-05, + "loss": 1.7707, + "step": 12270 + }, + { + "epoch": 2.2, + "grad_norm": 1.5173262357711792, + "learning_rate": 2.9740317464285816e-05, + "loss": 1.8605, + "step": 12275 + }, + { + "epoch": 2.2, + "grad_norm": 0.9060313701629639, + "learning_rate": 2.9726511662939695e-05, + "loss": 1.8223, + "step": 12280 + }, + { + "epoch": 2.2, + "grad_norm": 1.1166000366210938, + "learning_rate": 2.971270436660247e-05, + "loss": 1.7321, + "step": 12285 + }, + { + "epoch": 2.2, + "grad_norm": 0.9780591726303101, + "learning_rate": 2.969889557964139e-05, + "loss": 1.6634, + "step": 12290 + }, + { + "epoch": 2.2, + "grad_norm": 1.0793691873550415, + "learning_rate": 2.9685085306424154e-05, + "loss": 2.0318, + "step": 12295 + }, + { + "epoch": 2.2, + "grad_norm": 0.39177799224853516, + "learning_rate": 2.967127355131894e-05, + "loss": 1.8835, + "step": 12300 + }, + { + "epoch": 2.2, + "grad_norm": 0.9409979581832886, + "learning_rate": 2.9657460318694407e-05, + "loss": 1.7953, + "step": 12305 + }, + { + "epoch": 2.2, + "grad_norm": 1.2627776861190796, + "learning_rate": 2.9643645612919646e-05, + "loss": 1.9271, + "step": 12310 + }, + { + "epoch": 2.2, + "grad_norm": 0.5884200930595398, + "learning_rate": 2.9629829438364248e-05, + "loss": 1.6694, + "step": 12315 + }, + { + "epoch": 2.21, + "grad_norm": 0.5460672378540039, + "learning_rate": 2.9616011799398253e-05, + "loss": 1.9784, + "step": 12320 + }, + { + "epoch": 2.21, + "grad_norm": 1.2458994388580322, + "learning_rate": 2.9602192700392168e-05, + "loss": 2.0202, + "step": 12325 + }, + { + "epoch": 2.21, + "grad_norm": 1.5217969417572021, + "learning_rate": 2.9588372145716948e-05, + "loss": 1.7552, + "step": 12330 + }, + { + "epoch": 2.21, + "grad_norm": 0.5435410737991333, + "learning_rate": 2.957455013974404e-05, + "loss": 1.8492, + "step": 12335 + }, + { + "epoch": 2.21, + "grad_norm": 0.8544949293136597, + "learning_rate": 2.956072668684532e-05, + "loss": 1.9221, + "step": 12340 + }, + { + "epoch": 2.21, + "grad_norm": 0.841983437538147, + "learning_rate": 2.9546901791393134e-05, + "loss": 1.8283, + "step": 12345 + }, + { + "epoch": 2.21, + "grad_norm": 0.6002446413040161, + "learning_rate": 2.9533075457760296e-05, + "loss": 1.7687, + "step": 12350 + }, + { + "epoch": 2.21, + "grad_norm": 0.6456916332244873, + "learning_rate": 2.951924769032004e-05, + "loss": 2.0222, + "step": 12355 + }, + { + "epoch": 2.21, + "grad_norm": 0.909529447555542, + "learning_rate": 2.9505418493446087e-05, + "loss": 1.9912, + "step": 12360 + }, + { + "epoch": 2.21, + "grad_norm": 0.6185993552207947, + "learning_rate": 2.9491587871512598e-05, + "loss": 1.5441, + "step": 12365 + }, + { + "epoch": 2.21, + "grad_norm": 0.7631831169128418, + "learning_rate": 2.947775582889419e-05, + "loss": 2.0657, + "step": 12370 + }, + { + "epoch": 2.22, + "grad_norm": 0.7581762075424194, + "learning_rate": 2.9463922369965917e-05, + "loss": 1.6854, + "step": 12375 + }, + { + "epoch": 2.22, + "grad_norm": 0.6361576318740845, + "learning_rate": 2.9450087499103297e-05, + "loss": 2.0618, + "step": 12380 + }, + { + "epoch": 2.22, + "grad_norm": 0.9206913113594055, + "learning_rate": 2.9436251220682275e-05, + "loss": 1.8886, + "step": 12385 + }, + { + "epoch": 2.22, + "grad_norm": 0.7134048342704773, + "learning_rate": 2.9422413539079263e-05, + "loss": 1.6918, + "step": 12390 + }, + { + "epoch": 2.22, + "grad_norm": 0.76563960313797, + "learning_rate": 2.9408574458671106e-05, + "loss": 1.9828, + "step": 12395 + }, + { + "epoch": 2.22, + "grad_norm": 0.9646211862564087, + "learning_rate": 2.9394733983835082e-05, + "loss": 1.8371, + "step": 12400 + }, + { + "epoch": 2.22, + "grad_norm": 0.6462224125862122, + "learning_rate": 2.938089211894892e-05, + "loss": 1.7227, + "step": 12405 + }, + { + "epoch": 2.22, + "grad_norm": 0.7270526885986328, + "learning_rate": 2.9367048868390812e-05, + "loss": 1.8023, + "step": 12410 + }, + { + "epoch": 2.22, + "grad_norm": 0.6550453305244446, + "learning_rate": 2.9353204236539334e-05, + "loss": 1.9456, + "step": 12415 + }, + { + "epoch": 2.22, + "grad_norm": 0.5939522385597229, + "learning_rate": 2.933935822777355e-05, + "loss": 2.0319, + "step": 12420 + }, + { + "epoch": 2.22, + "grad_norm": 0.8000824451446533, + "learning_rate": 2.932551084647293e-05, + "loss": 1.7618, + "step": 12425 + }, + { + "epoch": 2.22, + "grad_norm": 1.1937888860702515, + "learning_rate": 2.931166209701739e-05, + "loss": 1.9905, + "step": 12430 + }, + { + "epoch": 2.23, + "grad_norm": 0.8587405681610107, + "learning_rate": 2.929781198378727e-05, + "loss": 1.8422, + "step": 12435 + }, + { + "epoch": 2.23, + "grad_norm": 0.4814565181732178, + "learning_rate": 2.928396051116336e-05, + "loss": 1.8622, + "step": 12440 + }, + { + "epoch": 2.23, + "grad_norm": 0.6059118509292603, + "learning_rate": 2.9270107683526866e-05, + "loss": 1.7572, + "step": 12445 + }, + { + "epoch": 2.23, + "grad_norm": 0.7698599696159363, + "learning_rate": 2.9256253505259406e-05, + "loss": 1.8953, + "step": 12450 + }, + { + "epoch": 2.23, + "grad_norm": 1.1599043607711792, + "learning_rate": 2.924239798074307e-05, + "loss": 1.6808, + "step": 12455 + }, + { + "epoch": 2.23, + "grad_norm": 0.7120723128318787, + "learning_rate": 2.922854111436032e-05, + "loss": 1.7554, + "step": 12460 + }, + { + "epoch": 2.23, + "grad_norm": 0.5135558247566223, + "learning_rate": 2.9214682910494096e-05, + "loss": 2.0082, + "step": 12465 + }, + { + "epoch": 2.23, + "grad_norm": 0.6316121816635132, + "learning_rate": 2.9200823373527713e-05, + "loss": 2.05, + "step": 12470 + }, + { + "epoch": 2.23, + "grad_norm": 0.9234532713890076, + "learning_rate": 2.918696250784494e-05, + "loss": 1.6301, + "step": 12475 + }, + { + "epoch": 2.23, + "grad_norm": 0.7441158294677734, + "learning_rate": 2.917310031782995e-05, + "loss": 2.0701, + "step": 12480 + }, + { + "epoch": 2.23, + "grad_norm": 1.686782717704773, + "learning_rate": 2.9159236807867345e-05, + "loss": 1.6456, + "step": 12485 + }, + { + "epoch": 2.24, + "grad_norm": 0.481157511472702, + "learning_rate": 2.9145371982342145e-05, + "loss": 2.0247, + "step": 12490 + }, + { + "epoch": 2.24, + "grad_norm": 0.3480195999145508, + "learning_rate": 2.913150584563977e-05, + "loss": 1.7576, + "step": 12495 + }, + { + "epoch": 2.24, + "grad_norm": 0.9952110052108765, + "learning_rate": 2.911763840214607e-05, + "loss": 1.8107, + "step": 12500 + }, + { + "epoch": 2.24, + "grad_norm": 1.2428638935089111, + "learning_rate": 2.91037696562473e-05, + "loss": 1.8556, + "step": 12505 + }, + { + "epoch": 2.24, + "grad_norm": 0.8979426026344299, + "learning_rate": 2.908989961233014e-05, + "loss": 1.8072, + "step": 12510 + }, + { + "epoch": 2.24, + "grad_norm": 2.5153095722198486, + "learning_rate": 2.9076028274781663e-05, + "loss": 2.073, + "step": 12515 + }, + { + "epoch": 2.24, + "grad_norm": 0.7544877529144287, + "learning_rate": 2.9062155647989364e-05, + "loss": 1.8057, + "step": 12520 + }, + { + "epoch": 2.24, + "grad_norm": 1.3223302364349365, + "learning_rate": 2.904828173634114e-05, + "loss": 1.9224, + "step": 12525 + }, + { + "epoch": 2.24, + "grad_norm": 0.5328083634376526, + "learning_rate": 2.90344065442253e-05, + "loss": 1.9428, + "step": 12530 + }, + { + "epoch": 2.24, + "grad_norm": 0.7167631387710571, + "learning_rate": 2.902053007603055e-05, + "loss": 2.0441, + "step": 12535 + }, + { + "epoch": 2.24, + "grad_norm": 1.436819314956665, + "learning_rate": 2.9006652336146e-05, + "loss": 2.0272, + "step": 12540 + }, + { + "epoch": 2.25, + "grad_norm": 0.5790032148361206, + "learning_rate": 2.8992773328961177e-05, + "loss": 1.9511, + "step": 12545 + }, + { + "epoch": 2.25, + "grad_norm": 0.8261239528656006, + "learning_rate": 2.8978893058865987e-05, + "loss": 1.8447, + "step": 12550 + }, + { + "epoch": 2.25, + "grad_norm": 0.655191957950592, + "learning_rate": 2.896501153025075e-05, + "loss": 1.9432, + "step": 12555 + }, + { + "epoch": 2.25, + "grad_norm": 0.519561231136322, + "learning_rate": 2.8951128747506184e-05, + "loss": 2.0886, + "step": 12560 + }, + { + "epoch": 2.25, + "grad_norm": 0.7673883438110352, + "learning_rate": 2.8937244715023386e-05, + "loss": 1.8421, + "step": 12565 + }, + { + "epoch": 2.25, + "grad_norm": 0.6510341167449951, + "learning_rate": 2.8923359437193877e-05, + "loss": 1.6667, + "step": 12570 + }, + { + "epoch": 2.25, + "grad_norm": 0.6979449987411499, + "learning_rate": 2.8909472918409552e-05, + "loss": 1.8703, + "step": 12575 + }, + { + "epoch": 2.25, + "grad_norm": 0.6348164677619934, + "learning_rate": 2.88955851630627e-05, + "loss": 1.782, + "step": 12580 + }, + { + "epoch": 2.25, + "grad_norm": 0.9662460088729858, + "learning_rate": 2.8881696175546014e-05, + "loss": 1.9121, + "step": 12585 + }, + { + "epoch": 2.25, + "grad_norm": 0.7864895462989807, + "learning_rate": 2.886780596025256e-05, + "loss": 1.7273, + "step": 12590 + }, + { + "epoch": 2.25, + "grad_norm": 1.4080077409744263, + "learning_rate": 2.8853914521575797e-05, + "loss": 1.6424, + "step": 12595 + }, + { + "epoch": 2.26, + "grad_norm": 0.8357173800468445, + "learning_rate": 2.8840021863909583e-05, + "loss": 1.9837, + "step": 12600 + }, + { + "epoch": 2.26, + "grad_norm": 0.4318985641002655, + "learning_rate": 2.882612799164815e-05, + "loss": 1.7542, + "step": 12605 + }, + { + "epoch": 2.26, + "grad_norm": 0.6467979550361633, + "learning_rate": 2.8812232909186103e-05, + "loss": 1.8621, + "step": 12610 + }, + { + "epoch": 2.26, + "grad_norm": 0.7634948492050171, + "learning_rate": 2.8798336620918464e-05, + "loss": 1.719, + "step": 12615 + }, + { + "epoch": 2.26, + "grad_norm": 0.7211525440216064, + "learning_rate": 2.878443913124059e-05, + "loss": 1.7789, + "step": 12620 + }, + { + "epoch": 2.26, + "grad_norm": 0.5822710394859314, + "learning_rate": 2.8770540444548272e-05, + "loss": 1.9444, + "step": 12625 + }, + { + "epoch": 2.26, + "grad_norm": 0.549856424331665, + "learning_rate": 2.8756640565237637e-05, + "loss": 2.1138, + "step": 12630 + }, + { + "epoch": 2.26, + "grad_norm": 1.3974264860153198, + "learning_rate": 2.8742739497705208e-05, + "loss": 1.9182, + "step": 12635 + }, + { + "epoch": 2.26, + "grad_norm": 0.5173325538635254, + "learning_rate": 2.8728837246347867e-05, + "loss": 1.8516, + "step": 12640 + }, + { + "epoch": 2.26, + "grad_norm": 0.45196789503097534, + "learning_rate": 2.8714933815562894e-05, + "loss": 1.7245, + "step": 12645 + }, + { + "epoch": 2.26, + "grad_norm": 0.7211712598800659, + "learning_rate": 2.870102920974793e-05, + "loss": 1.924, + "step": 12650 + }, + { + "epoch": 2.27, + "grad_norm": 0.7739723920822144, + "learning_rate": 2.868712343330099e-05, + "loss": 1.7423, + "step": 12655 + }, + { + "epoch": 2.27, + "grad_norm": 0.738148033618927, + "learning_rate": 2.8673216490620452e-05, + "loss": 1.9875, + "step": 12660 + }, + { + "epoch": 2.27, + "grad_norm": 0.5198020935058594, + "learning_rate": 2.8659308386105065e-05, + "loss": 1.8309, + "step": 12665 + }, + { + "epoch": 2.27, + "grad_norm": 0.786413848400116, + "learning_rate": 2.864539912415396e-05, + "loss": 2.0195, + "step": 12670 + }, + { + "epoch": 2.27, + "grad_norm": 0.3828994035720825, + "learning_rate": 2.8631488709166625e-05, + "loss": 1.9273, + "step": 12675 + }, + { + "epoch": 2.27, + "grad_norm": 0.7168930172920227, + "learning_rate": 2.8617577145542902e-05, + "loss": 1.8926, + "step": 12680 + }, + { + "epoch": 2.27, + "grad_norm": 0.955230176448822, + "learning_rate": 2.8603664437682998e-05, + "loss": 1.6857, + "step": 12685 + }, + { + "epoch": 2.27, + "grad_norm": 0.8382362127304077, + "learning_rate": 2.8589750589987506e-05, + "loss": 2.0365, + "step": 12690 + }, + { + "epoch": 2.27, + "grad_norm": 0.8835332989692688, + "learning_rate": 2.8575835606857356e-05, + "loss": 2.0321, + "step": 12695 + }, + { + "epoch": 2.27, + "grad_norm": 0.4743799865245819, + "learning_rate": 2.8561919492693845e-05, + "loss": 2.0926, + "step": 12700 + }, + { + "epoch": 2.27, + "grad_norm": 0.7596274614334106, + "learning_rate": 2.854800225189862e-05, + "loss": 1.8994, + "step": 12705 + }, + { + "epoch": 2.28, + "grad_norm": 0.7106197476387024, + "learning_rate": 2.8534083888873697e-05, + "loss": 1.7113, + "step": 12710 + }, + { + "epoch": 2.28, + "grad_norm": 0.7384430766105652, + "learning_rate": 2.8520164408021432e-05, + "loss": 1.742, + "step": 12715 + }, + { + "epoch": 2.28, + "grad_norm": 0.9046937227249146, + "learning_rate": 2.850624381374456e-05, + "loss": 1.9561, + "step": 12720 + }, + { + "epoch": 2.28, + "grad_norm": 0.8590306639671326, + "learning_rate": 2.8492322110446146e-05, + "loss": 1.8945, + "step": 12725 + }, + { + "epoch": 2.28, + "grad_norm": 0.8791319727897644, + "learning_rate": 2.8478399302529597e-05, + "loss": 1.8657, + "step": 12730 + }, + { + "epoch": 2.28, + "grad_norm": 0.6100798845291138, + "learning_rate": 2.84644753943987e-05, + "loss": 1.8644, + "step": 12735 + }, + { + "epoch": 2.28, + "grad_norm": 0.6638292670249939, + "learning_rate": 2.8450550390457565e-05, + "loss": 1.8004, + "step": 12740 + }, + { + "epoch": 2.28, + "grad_norm": 0.814220666885376, + "learning_rate": 2.8436624295110663e-05, + "loss": 1.885, + "step": 12745 + }, + { + "epoch": 2.28, + "grad_norm": 1.095098614692688, + "learning_rate": 2.8422697112762803e-05, + "loss": 2.1555, + "step": 12750 + }, + { + "epoch": 2.28, + "grad_norm": 0.6496490836143494, + "learning_rate": 2.8408768847819133e-05, + "loss": 1.7966, + "step": 12755 + }, + { + "epoch": 2.28, + "grad_norm": 0.9682343006134033, + "learning_rate": 2.8394839504685162e-05, + "loss": 1.8505, + "step": 12760 + }, + { + "epoch": 2.28, + "grad_norm": 0.9972975850105286, + "learning_rate": 2.8380909087766717e-05, + "loss": 1.7333, + "step": 12765 + }, + { + "epoch": 2.29, + "grad_norm": 0.9131410121917725, + "learning_rate": 2.8366977601469986e-05, + "loss": 1.9954, + "step": 12770 + }, + { + "epoch": 2.29, + "grad_norm": 0.8180422782897949, + "learning_rate": 2.8353045050201465e-05, + "loss": 1.7182, + "step": 12775 + }, + { + "epoch": 2.29, + "grad_norm": 0.5822163820266724, + "learning_rate": 2.8339111438368032e-05, + "loss": 1.6172, + "step": 12780 + }, + { + "epoch": 2.29, + "grad_norm": 0.9533834457397461, + "learning_rate": 2.832517677037686e-05, + "loss": 1.971, + "step": 12785 + }, + { + "epoch": 2.29, + "grad_norm": 0.9467795491218567, + "learning_rate": 2.8311241050635467e-05, + "loss": 1.8465, + "step": 12790 + }, + { + "epoch": 2.29, + "grad_norm": 0.5219370722770691, + "learning_rate": 2.8297304283551728e-05, + "loss": 1.932, + "step": 12795 + }, + { + "epoch": 2.29, + "grad_norm": 0.45871034264564514, + "learning_rate": 2.8283366473533808e-05, + "loss": 2.0782, + "step": 12800 + }, + { + "epoch": 2.29, + "grad_norm": 0.48181986808776855, + "learning_rate": 2.8269427624990224e-05, + "loss": 2.0225, + "step": 12805 + }, + { + "epoch": 2.29, + "grad_norm": 0.6003161668777466, + "learning_rate": 2.8255487742329838e-05, + "loss": 1.7363, + "step": 12810 + }, + { + "epoch": 2.29, + "grad_norm": 0.6212576031684875, + "learning_rate": 2.8241546829961808e-05, + "loss": 1.887, + "step": 12815 + }, + { + "epoch": 2.29, + "grad_norm": 0.5601562261581421, + "learning_rate": 2.8227604892295628e-05, + "loss": 1.8317, + "step": 12820 + }, + { + "epoch": 2.3, + "grad_norm": 0.9833465814590454, + "learning_rate": 2.8213661933741132e-05, + "loss": 1.8634, + "step": 12825 + }, + { + "epoch": 2.3, + "grad_norm": 4.311710357666016, + "learning_rate": 2.8199717958708455e-05, + "loss": 1.6882, + "step": 12830 + }, + { + "epoch": 2.3, + "grad_norm": 0.8071384429931641, + "learning_rate": 2.818577297160806e-05, + "loss": 2.0808, + "step": 12835 + }, + { + "epoch": 2.3, + "grad_norm": 0.8173998594284058, + "learning_rate": 2.8171826976850753e-05, + "loss": 1.7736, + "step": 12840 + }, + { + "epoch": 2.3, + "grad_norm": 0.7156624794006348, + "learning_rate": 2.815787997884761e-05, + "loss": 1.89, + "step": 12845 + }, + { + "epoch": 2.3, + "grad_norm": 1.1727192401885986, + "learning_rate": 2.8143931982010068e-05, + "loss": 1.8794, + "step": 12850 + }, + { + "epoch": 2.3, + "grad_norm": 0.5154191255569458, + "learning_rate": 2.8129982990749872e-05, + "loss": 1.9237, + "step": 12855 + }, + { + "epoch": 2.3, + "grad_norm": 1.5282955169677734, + "learning_rate": 2.8116033009479058e-05, + "loss": 1.9519, + "step": 12860 + }, + { + "epoch": 2.3, + "grad_norm": 0.8198376297950745, + "learning_rate": 2.8102082042610007e-05, + "loss": 1.8245, + "step": 12865 + }, + { + "epoch": 2.3, + "grad_norm": 1.2161682844161987, + "learning_rate": 2.8088130094555394e-05, + "loss": 1.7885, + "step": 12870 + }, + { + "epoch": 2.3, + "grad_norm": 4.205202579498291, + "learning_rate": 2.8074177169728188e-05, + "loss": 1.9236, + "step": 12875 + }, + { + "epoch": 2.31, + "grad_norm": 0.6405089497566223, + "learning_rate": 2.8060223272541706e-05, + "loss": 1.863, + "step": 12880 + }, + { + "epoch": 2.31, + "grad_norm": 1.1483161449432373, + "learning_rate": 2.8046268407409555e-05, + "loss": 1.8218, + "step": 12885 + }, + { + "epoch": 2.31, + "grad_norm": 1.4012528657913208, + "learning_rate": 2.8032312578745634e-05, + "loss": 1.7995, + "step": 12890 + }, + { + "epoch": 2.31, + "grad_norm": 0.5227799415588379, + "learning_rate": 2.8018355790964153e-05, + "loss": 2.0666, + "step": 12895 + }, + { + "epoch": 2.31, + "grad_norm": 0.7263579368591309, + "learning_rate": 2.8004398048479646e-05, + "loss": 1.5815, + "step": 12900 + }, + { + "epoch": 2.31, + "grad_norm": 0.8712220191955566, + "learning_rate": 2.799043935570692e-05, + "loss": 1.6493, + "step": 12905 + }, + { + "epoch": 2.31, + "grad_norm": 0.4584944248199463, + "learning_rate": 2.797647971706111e-05, + "loss": 2.0632, + "step": 12910 + }, + { + "epoch": 2.31, + "grad_norm": 0.9673914909362793, + "learning_rate": 2.7962519136957625e-05, + "loss": 1.7794, + "step": 12915 + }, + { + "epoch": 2.31, + "grad_norm": 0.5151163339614868, + "learning_rate": 2.7948557619812183e-05, + "loss": 1.8515, + "step": 12920 + }, + { + "epoch": 2.31, + "grad_norm": 0.7004354596138, + "learning_rate": 2.7934595170040802e-05, + "loss": 1.7434, + "step": 12925 + }, + { + "epoch": 2.31, + "grad_norm": 0.5903719663619995, + "learning_rate": 2.7920631792059797e-05, + "loss": 1.9251, + "step": 12930 + }, + { + "epoch": 2.32, + "grad_norm": 1.2344070672988892, + "learning_rate": 2.7906667490285765e-05, + "loss": 1.7713, + "step": 12935 + }, + { + "epoch": 2.32, + "grad_norm": 1.2571988105773926, + "learning_rate": 2.7892702269135595e-05, + "loss": 1.5384, + "step": 12940 + }, + { + "epoch": 2.32, + "grad_norm": 0.5835108757019043, + "learning_rate": 2.787873613302649e-05, + "loss": 1.7925, + "step": 12945 + }, + { + "epoch": 2.32, + "grad_norm": 0.5990691184997559, + "learning_rate": 2.7864769086375903e-05, + "loss": 1.7678, + "step": 12950 + }, + { + "epoch": 2.32, + "grad_norm": 1.5972462892532349, + "learning_rate": 2.7850801133601616e-05, + "loss": 1.8765, + "step": 12955 + }, + { + "epoch": 2.32, + "grad_norm": 2.4750313758850098, + "learning_rate": 2.7836832279121678e-05, + "loss": 2.0151, + "step": 12960 + }, + { + "epoch": 2.32, + "grad_norm": 1.888633131980896, + "learning_rate": 2.782286252735441e-05, + "loss": 1.9451, + "step": 12965 + }, + { + "epoch": 2.32, + "grad_norm": 0.58510422706604, + "learning_rate": 2.7808891882718445e-05, + "loss": 2.0672, + "step": 12970 + }, + { + "epoch": 2.32, + "grad_norm": 0.7262510657310486, + "learning_rate": 2.7794920349632675e-05, + "loss": 1.9223, + "step": 12975 + }, + { + "epoch": 2.32, + "grad_norm": 0.7543483972549438, + "learning_rate": 2.7780947932516294e-05, + "loss": 1.9088, + "step": 12980 + }, + { + "epoch": 2.32, + "grad_norm": 5.534985542297363, + "learning_rate": 2.7766974635788743e-05, + "loss": 2.0211, + "step": 12985 + }, + { + "epoch": 2.33, + "grad_norm": 0.6918667554855347, + "learning_rate": 2.7753000463869794e-05, + "loss": 1.7378, + "step": 12990 + }, + { + "epoch": 2.33, + "grad_norm": 0.7650964260101318, + "learning_rate": 2.7739025421179433e-05, + "loss": 1.5172, + "step": 12995 + }, + { + "epoch": 2.33, + "grad_norm": 0.662441611289978, + "learning_rate": 2.7725049512137967e-05, + "loss": 1.9162, + "step": 13000 + }, + { + "epoch": 2.33, + "grad_norm": 0.6917517781257629, + "learning_rate": 2.771107274116597e-05, + "loss": 1.7873, + "step": 13005 + }, + { + "epoch": 2.33, + "grad_norm": 0.6874393820762634, + "learning_rate": 2.769709511268427e-05, + "loss": 1.9959, + "step": 13010 + }, + { + "epoch": 2.33, + "grad_norm": 0.8905332684516907, + "learning_rate": 2.7683116631113976e-05, + "loss": 1.9593, + "step": 13015 + }, + { + "epoch": 2.33, + "grad_norm": 1.5240230560302734, + "learning_rate": 2.7669137300876467e-05, + "loss": 1.6583, + "step": 13020 + }, + { + "epoch": 2.33, + "grad_norm": 1.5556925535202026, + "learning_rate": 2.7655157126393404e-05, + "loss": 1.6085, + "step": 13025 + }, + { + "epoch": 2.33, + "grad_norm": 0.9076879620552063, + "learning_rate": 2.7641176112086698e-05, + "loss": 1.6375, + "step": 13030 + }, + { + "epoch": 2.33, + "grad_norm": 0.4439728260040283, + "learning_rate": 2.762719426237853e-05, + "loss": 1.9813, + "step": 13035 + }, + { + "epoch": 2.33, + "grad_norm": 0.7149770259857178, + "learning_rate": 2.761321158169134e-05, + "loss": 1.9144, + "step": 13040 + }, + { + "epoch": 2.33, + "grad_norm": 0.49863100051879883, + "learning_rate": 2.759922807444784e-05, + "loss": 2.1206, + "step": 13045 + }, + { + "epoch": 2.34, + "grad_norm": 1.2692310810089111, + "learning_rate": 2.7585243745071003e-05, + "loss": 1.9092, + "step": 13050 + }, + { + "epoch": 2.34, + "grad_norm": 0.9751031398773193, + "learning_rate": 2.757125859798405e-05, + "loss": 2.052, + "step": 13055 + }, + { + "epoch": 2.34, + "grad_norm": 0.8686983585357666, + "learning_rate": 2.7557272637610483e-05, + "loss": 1.8621, + "step": 13060 + }, + { + "epoch": 2.34, + "grad_norm": 0.48079320788383484, + "learning_rate": 2.7543285868374037e-05, + "loss": 1.6149, + "step": 13065 + }, + { + "epoch": 2.34, + "grad_norm": 0.8371298909187317, + "learning_rate": 2.7529298294698712e-05, + "loss": 1.9314, + "step": 13070 + }, + { + "epoch": 2.34, + "grad_norm": 0.3321702182292938, + "learning_rate": 2.7515309921008775e-05, + "loss": 1.8415, + "step": 13075 + }, + { + "epoch": 2.34, + "grad_norm": 0.8658364415168762, + "learning_rate": 2.7501320751728732e-05, + "loss": 1.7368, + "step": 13080 + }, + { + "epoch": 2.34, + "grad_norm": 0.8022560477256775, + "learning_rate": 2.7487330791283335e-05, + "loss": 1.6356, + "step": 13085 + }, + { + "epoch": 2.34, + "grad_norm": 1.0247143507003784, + "learning_rate": 2.7473340044097605e-05, + "loss": 1.8981, + "step": 13090 + }, + { + "epoch": 2.34, + "grad_norm": 0.5062198042869568, + "learning_rate": 2.7459348514596796e-05, + "loss": 2.1258, + "step": 13095 + }, + { + "epoch": 2.34, + "grad_norm": 6.957434177398682, + "learning_rate": 2.7445356207206424e-05, + "loss": 1.7353, + "step": 13100 + }, + { + "epoch": 2.35, + "grad_norm": 0.8973393440246582, + "learning_rate": 2.7431363126352232e-05, + "loss": 1.5304, + "step": 13105 + }, + { + "epoch": 2.35, + "grad_norm": 0.7698760628700256, + "learning_rate": 2.7417369276460215e-05, + "loss": 1.8007, + "step": 13110 + }, + { + "epoch": 2.35, + "grad_norm": 0.6482462882995605, + "learning_rate": 2.7403374661956626e-05, + "loss": 1.9363, + "step": 13115 + }, + { + "epoch": 2.35, + "grad_norm": 0.528232216835022, + "learning_rate": 2.7389379287267953e-05, + "loss": 1.5902, + "step": 13120 + }, + { + "epoch": 2.35, + "grad_norm": 0.5823779702186584, + "learning_rate": 2.7375383156820895e-05, + "loss": 1.8967, + "step": 13125 + }, + { + "epoch": 2.35, + "grad_norm": 1.153917908668518, + "learning_rate": 2.7361386275042434e-05, + "loss": 1.8673, + "step": 13130 + }, + { + "epoch": 2.35, + "grad_norm": 1.1044551134109497, + "learning_rate": 2.734738864635977e-05, + "loss": 1.9955, + "step": 13135 + }, + { + "epoch": 2.35, + "grad_norm": 0.7324090600013733, + "learning_rate": 2.733339027520032e-05, + "loss": 1.8696, + "step": 13140 + }, + { + "epoch": 2.35, + "grad_norm": 0.6724053621292114, + "learning_rate": 2.7319391165991786e-05, + "loss": 2.0311, + "step": 13145 + }, + { + "epoch": 2.35, + "grad_norm": 1.5398845672607422, + "learning_rate": 2.7305391323162056e-05, + "loss": 1.8971, + "step": 13150 + }, + { + "epoch": 2.35, + "grad_norm": 0.6307987570762634, + "learning_rate": 2.7291390751139263e-05, + "loss": 1.998, + "step": 13155 + }, + { + "epoch": 2.36, + "grad_norm": 0.5545775890350342, + "learning_rate": 2.7277389454351777e-05, + "loss": 1.9609, + "step": 13160 + }, + { + "epoch": 2.36, + "grad_norm": 0.9437121748924255, + "learning_rate": 2.7263387437228204e-05, + "loss": 1.9158, + "step": 13165 + }, + { + "epoch": 2.36, + "grad_norm": 0.5333415865898132, + "learning_rate": 2.724938470419736e-05, + "loss": 2.0586, + "step": 13170 + }, + { + "epoch": 2.36, + "grad_norm": 0.7932983636856079, + "learning_rate": 2.72353812596883e-05, + "loss": 1.9107, + "step": 13175 + }, + { + "epoch": 2.36, + "grad_norm": 0.802893877029419, + "learning_rate": 2.7221377108130292e-05, + "loss": 2.0469, + "step": 13180 + }, + { + "epoch": 2.36, + "grad_norm": 0.4990454614162445, + "learning_rate": 2.720737225395284e-05, + "loss": 1.9607, + "step": 13185 + }, + { + "epoch": 2.36, + "grad_norm": 1.1566194295883179, + "learning_rate": 2.7193366701585677e-05, + "loss": 1.781, + "step": 13190 + }, + { + "epoch": 2.36, + "grad_norm": 0.773506224155426, + "learning_rate": 2.717936045545873e-05, + "loss": 1.8001, + "step": 13195 + }, + { + "epoch": 2.36, + "grad_norm": 0.6509556174278259, + "learning_rate": 2.7165353520002167e-05, + "loss": 1.9832, + "step": 13200 + }, + { + "epoch": 2.36, + "grad_norm": 0.9481406807899475, + "learning_rate": 2.7151345899646363e-05, + "loss": 1.8591, + "step": 13205 + }, + { + "epoch": 2.36, + "grad_norm": 0.6072911620140076, + "learning_rate": 2.7137337598821932e-05, + "loss": 1.8942, + "step": 13210 + }, + { + "epoch": 2.37, + "grad_norm": 0.9392735362052917, + "learning_rate": 2.712332862195967e-05, + "loss": 1.8107, + "step": 13215 + }, + { + "epoch": 2.37, + "grad_norm": 0.8217921257019043, + "learning_rate": 2.7109318973490606e-05, + "loss": 1.8219, + "step": 13220 + }, + { + "epoch": 2.37, + "grad_norm": 1.2502552270889282, + "learning_rate": 2.709530865784598e-05, + "loss": 2.1015, + "step": 13225 + }, + { + "epoch": 2.37, + "grad_norm": 0.7115715742111206, + "learning_rate": 2.7081297679457236e-05, + "loss": 1.8984, + "step": 13230 + }, + { + "epoch": 2.37, + "grad_norm": 0.6438332796096802, + "learning_rate": 2.706728604275604e-05, + "loss": 1.9235, + "step": 13235 + }, + { + "epoch": 2.37, + "grad_norm": 0.5191622376441956, + "learning_rate": 2.705327375217427e-05, + "loss": 1.9651, + "step": 13240 + }, + { + "epoch": 2.37, + "grad_norm": 1.5185312032699585, + "learning_rate": 2.703926081214397e-05, + "loss": 1.9163, + "step": 13245 + }, + { + "epoch": 2.37, + "grad_norm": 0.4574000835418701, + "learning_rate": 2.702524722709744e-05, + "loss": 1.8515, + "step": 13250 + }, + { + "epoch": 2.37, + "grad_norm": 0.6382564902305603, + "learning_rate": 2.7011233001467166e-05, + "loss": 2.0203, + "step": 13255 + }, + { + "epoch": 2.37, + "grad_norm": 0.9020372629165649, + "learning_rate": 2.6997218139685825e-05, + "loss": 1.7434, + "step": 13260 + }, + { + "epoch": 2.37, + "grad_norm": 0.9500882625579834, + "learning_rate": 2.6983202646186306e-05, + "loss": 1.6759, + "step": 13265 + }, + { + "epoch": 2.38, + "grad_norm": 0.5408741235733032, + "learning_rate": 2.69691865254017e-05, + "loss": 1.9249, + "step": 13270 + }, + { + "epoch": 2.38, + "grad_norm": 0.5403386950492859, + "learning_rate": 2.695516978176529e-05, + "loss": 1.9793, + "step": 13275 + }, + { + "epoch": 2.38, + "grad_norm": 1.1900765895843506, + "learning_rate": 2.694115241971056e-05, + "loss": 1.5909, + "step": 13280 + }, + { + "epoch": 2.38, + "grad_norm": 0.8298088908195496, + "learning_rate": 2.6927134443671187e-05, + "loss": 1.6979, + "step": 13285 + }, + { + "epoch": 2.38, + "grad_norm": 0.30263030529022217, + "learning_rate": 2.6913115858081038e-05, + "loss": 2.0684, + "step": 13290 + }, + { + "epoch": 2.38, + "grad_norm": 0.6629304885864258, + "learning_rate": 2.6899096667374187e-05, + "loss": 1.7992, + "step": 13295 + }, + { + "epoch": 2.38, + "grad_norm": 0.8511559367179871, + "learning_rate": 2.6885076875984888e-05, + "loss": 1.8706, + "step": 13300 + }, + { + "epoch": 2.38, + "grad_norm": 0.7309024333953857, + "learning_rate": 2.6871056488347583e-05, + "loss": 1.6392, + "step": 13305 + }, + { + "epoch": 2.38, + "grad_norm": 0.796414315700531, + "learning_rate": 2.6857035508896917e-05, + "loss": 1.8747, + "step": 13310 + }, + { + "epoch": 2.38, + "grad_norm": 0.9297599792480469, + "learning_rate": 2.6843013942067702e-05, + "loss": 1.527, + "step": 13315 + }, + { + "epoch": 2.38, + "grad_norm": 1.2013111114501953, + "learning_rate": 2.6828991792294943e-05, + "loss": 1.8731, + "step": 13320 + }, + { + "epoch": 2.39, + "grad_norm": 0.5262635946273804, + "learning_rate": 2.6814969064013846e-05, + "loss": 1.898, + "step": 13325 + }, + { + "epoch": 2.39, + "grad_norm": 0.6759445071220398, + "learning_rate": 2.6800945761659784e-05, + "loss": 1.9081, + "step": 13330 + }, + { + "epoch": 2.39, + "grad_norm": 0.5417052507400513, + "learning_rate": 2.6786921889668303e-05, + "loss": 1.689, + "step": 13335 + }, + { + "epoch": 2.39, + "grad_norm": 0.7009138464927673, + "learning_rate": 2.677289745247515e-05, + "loss": 1.9229, + "step": 13340 + }, + { + "epoch": 2.39, + "grad_norm": 0.689923882484436, + "learning_rate": 2.675887245451624e-05, + "loss": 1.8571, + "step": 13345 + }, + { + "epoch": 2.39, + "grad_norm": 1.4503270387649536, + "learning_rate": 2.6744846900227654e-05, + "loss": 1.8241, + "step": 13350 + }, + { + "epoch": 2.39, + "grad_norm": 1.711902141571045, + "learning_rate": 2.673082079404568e-05, + "loss": 1.7829, + "step": 13355 + }, + { + "epoch": 2.39, + "grad_norm": 0.37441059947013855, + "learning_rate": 2.6716794140406758e-05, + "loss": 1.8407, + "step": 13360 + }, + { + "epoch": 2.39, + "grad_norm": 0.5013666749000549, + "learning_rate": 2.6702766943747493e-05, + "loss": 1.9828, + "step": 13365 + }, + { + "epoch": 2.39, + "grad_norm": 0.7650493383407593, + "learning_rate": 2.6688739208504686e-05, + "loss": 1.7357, + "step": 13370 + }, + { + "epoch": 2.39, + "grad_norm": 0.6591855883598328, + "learning_rate": 2.667471093911529e-05, + "loss": 2.0887, + "step": 13375 + }, + { + "epoch": 2.39, + "grad_norm": 0.5767456889152527, + "learning_rate": 2.6660682140016435e-05, + "loss": 1.9841, + "step": 13380 + }, + { + "epoch": 2.4, + "grad_norm": 0.6293467283248901, + "learning_rate": 2.6646652815645412e-05, + "loss": 2.0894, + "step": 13385 + }, + { + "epoch": 2.4, + "grad_norm": 0.4856579601764679, + "learning_rate": 2.6632622970439698e-05, + "loss": 1.8616, + "step": 13390 + }, + { + "epoch": 2.4, + "grad_norm": 0.9088009595870972, + "learning_rate": 2.6618592608836896e-05, + "loss": 1.8978, + "step": 13395 + }, + { + "epoch": 2.4, + "grad_norm": 1.0228464603424072, + "learning_rate": 2.660456173527482e-05, + "loss": 1.7805, + "step": 13400 + }, + { + "epoch": 2.4, + "grad_norm": 0.8853181004524231, + "learning_rate": 2.6590530354191405e-05, + "loss": 1.683, + "step": 13405 + }, + { + "epoch": 2.4, + "grad_norm": 0.7312606573104858, + "learning_rate": 2.657649847002477e-05, + "loss": 1.9513, + "step": 13410 + }, + { + "epoch": 2.4, + "grad_norm": 0.7548524737358093, + "learning_rate": 2.6562466087213188e-05, + "loss": 1.7518, + "step": 13415 + }, + { + "epoch": 2.4, + "grad_norm": 1.1628096103668213, + "learning_rate": 2.654843321019508e-05, + "loss": 1.9882, + "step": 13420 + }, + { + "epoch": 2.4, + "grad_norm": 0.8537325263023376, + "learning_rate": 2.653439984340904e-05, + "loss": 2.1221, + "step": 13425 + }, + { + "epoch": 2.4, + "grad_norm": 0.9286057949066162, + "learning_rate": 2.65203659912938e-05, + "loss": 1.7583, + "step": 13430 + }, + { + "epoch": 2.4, + "grad_norm": 0.7813141942024231, + "learning_rate": 2.650633165828827e-05, + "loss": 1.7862, + "step": 13435 + }, + { + "epoch": 2.41, + "grad_norm": 0.770072877407074, + "learning_rate": 2.6492296848831476e-05, + "loss": 1.7671, + "step": 13440 + }, + { + "epoch": 2.41, + "grad_norm": 0.7098547220230103, + "learning_rate": 2.6478261567362628e-05, + "loss": 2.02, + "step": 13445 + }, + { + "epoch": 2.41, + "grad_norm": 1.0164227485656738, + "learning_rate": 2.646422581832107e-05, + "loss": 1.6729, + "step": 13450 + }, + { + "epoch": 2.41, + "grad_norm": 1.548316478729248, + "learning_rate": 2.645018960614629e-05, + "loss": 1.694, + "step": 13455 + }, + { + "epoch": 2.41, + "grad_norm": 0.7778927683830261, + "learning_rate": 2.6436152935277936e-05, + "loss": 1.775, + "step": 13460 + }, + { + "epoch": 2.41, + "grad_norm": 0.7858615517616272, + "learning_rate": 2.642211581015579e-05, + "loss": 1.6574, + "step": 13465 + }, + { + "epoch": 2.41, + "grad_norm": 0.9521759748458862, + "learning_rate": 2.640807823521978e-05, + "loss": 1.7402, + "step": 13470 + }, + { + "epoch": 2.41, + "grad_norm": 1.4299689531326294, + "learning_rate": 2.6394040214909976e-05, + "loss": 1.7257, + "step": 13475 + }, + { + "epoch": 2.41, + "grad_norm": 0.9395617246627808, + "learning_rate": 2.6380001753666593e-05, + "loss": 1.8034, + "step": 13480 + }, + { + "epoch": 2.41, + "grad_norm": 0.4695361256599426, + "learning_rate": 2.636596285592999e-05, + "loss": 1.8073, + "step": 13485 + }, + { + "epoch": 2.41, + "grad_norm": 0.31577566266059875, + "learning_rate": 2.635192352614064e-05, + "loss": 1.9157, + "step": 13490 + }, + { + "epoch": 2.42, + "grad_norm": 0.4319940507411957, + "learning_rate": 2.6337883768739192e-05, + "loss": 1.969, + "step": 13495 + }, + { + "epoch": 2.42, + "grad_norm": 0.9783098101615906, + "learning_rate": 2.632384358816638e-05, + "loss": 2.0406, + "step": 13500 + }, + { + "epoch": 2.42, + "grad_norm": 1.060631513595581, + "learning_rate": 2.6309802988863124e-05, + "loss": 1.7076, + "step": 13505 + }, + { + "epoch": 2.42, + "grad_norm": 0.9120562672615051, + "learning_rate": 2.6295761975270432e-05, + "loss": 1.958, + "step": 13510 + }, + { + "epoch": 2.42, + "grad_norm": 0.6509984135627747, + "learning_rate": 2.628172055182948e-05, + "loss": 1.9328, + "step": 13515 + }, + { + "epoch": 2.42, + "grad_norm": 0.6889690160751343, + "learning_rate": 2.626767872298156e-05, + "loss": 2.1166, + "step": 13520 + }, + { + "epoch": 2.42, + "grad_norm": 1.167043924331665, + "learning_rate": 2.6253636493168065e-05, + "loss": 1.9546, + "step": 13525 + }, + { + "epoch": 2.42, + "grad_norm": 0.5428257584571838, + "learning_rate": 2.623959386683056e-05, + "loss": 1.8359, + "step": 13530 + }, + { + "epoch": 2.42, + "grad_norm": 0.911302387714386, + "learning_rate": 2.6225550848410706e-05, + "loss": 1.7099, + "step": 13535 + }, + { + "epoch": 2.42, + "grad_norm": 0.8027520775794983, + "learning_rate": 2.6211507442350303e-05, + "loss": 1.9308, + "step": 13540 + }, + { + "epoch": 2.42, + "grad_norm": 0.5085805058479309, + "learning_rate": 2.619746365309125e-05, + "loss": 1.7125, + "step": 13545 + }, + { + "epoch": 2.43, + "grad_norm": 0.828308641910553, + "learning_rate": 2.6183419485075605e-05, + "loss": 2.0749, + "step": 13550 + }, + { + "epoch": 2.43, + "grad_norm": 1.7583789825439453, + "learning_rate": 2.6169374942745505e-05, + "loss": 1.8622, + "step": 13555 + }, + { + "epoch": 2.43, + "grad_norm": 0.5300992131233215, + "learning_rate": 2.615533003054324e-05, + "loss": 1.4851, + "step": 13560 + }, + { + "epoch": 2.43, + "grad_norm": 0.7071987986564636, + "learning_rate": 2.6141284752911205e-05, + "loss": 1.7153, + "step": 13565 + }, + { + "epoch": 2.43, + "grad_norm": 1.1922173500061035, + "learning_rate": 2.612723911429189e-05, + "loss": 1.8535, + "step": 13570 + }, + { + "epoch": 2.43, + "grad_norm": 0.631430447101593, + "learning_rate": 2.6113193119127934e-05, + "loss": 2.0645, + "step": 13575 + }, + { + "epoch": 2.43, + "grad_norm": 0.7159907817840576, + "learning_rate": 2.6099146771862064e-05, + "loss": 1.748, + "step": 13580 + }, + { + "epoch": 2.43, + "grad_norm": 0.622367262840271, + "learning_rate": 2.6085100076937126e-05, + "loss": 1.5421, + "step": 13585 + }, + { + "epoch": 2.43, + "grad_norm": 0.9370682239532471, + "learning_rate": 2.6071053038796077e-05, + "loss": 1.8222, + "step": 13590 + }, + { + "epoch": 2.43, + "grad_norm": 0.9740180969238281, + "learning_rate": 2.6057005661881982e-05, + "loss": 1.6439, + "step": 13595 + }, + { + "epoch": 2.43, + "grad_norm": 1.1773573160171509, + "learning_rate": 2.6042957950638013e-05, + "loss": 1.9287, + "step": 13600 + }, + { + "epoch": 2.44, + "grad_norm": 0.8247451186180115, + "learning_rate": 2.602890990950745e-05, + "loss": 1.8199, + "step": 13605 + }, + { + "epoch": 2.44, + "grad_norm": 0.7261055111885071, + "learning_rate": 2.6014861542933676e-05, + "loss": 1.7212, + "step": 13610 + }, + { + "epoch": 2.44, + "grad_norm": 0.8083186149597168, + "learning_rate": 2.600081285536016e-05, + "loss": 1.9574, + "step": 13615 + }, + { + "epoch": 2.44, + "grad_norm": 0.9959443211555481, + "learning_rate": 2.5986763851230506e-05, + "loss": 1.982, + "step": 13620 + }, + { + "epoch": 2.44, + "grad_norm": 0.7056101560592651, + "learning_rate": 2.5972714534988403e-05, + "loss": 1.7576, + "step": 13625 + }, + { + "epoch": 2.44, + "grad_norm": 0.7754709720611572, + "learning_rate": 2.5958664911077614e-05, + "loss": 1.8267, + "step": 13630 + }, + { + "epoch": 2.44, + "grad_norm": 0.6835538744926453, + "learning_rate": 2.5944614983942044e-05, + "loss": 1.7875, + "step": 13635 + }, + { + "epoch": 2.44, + "grad_norm": 0.8215372562408447, + "learning_rate": 2.5930564758025665e-05, + "loss": 1.462, + "step": 13640 + }, + { + "epoch": 2.44, + "grad_norm": 0.608349621295929, + "learning_rate": 2.5916514237772543e-05, + "loss": 1.9472, + "step": 13645 + }, + { + "epoch": 2.44, + "grad_norm": 0.9655773639678955, + "learning_rate": 2.590246342762685e-05, + "loss": 1.7016, + "step": 13650 + }, + { + "epoch": 2.44, + "grad_norm": 0.6545687317848206, + "learning_rate": 2.5888412332032847e-05, + "loss": 1.7792, + "step": 13655 + }, + { + "epoch": 2.45, + "grad_norm": 0.8544228672981262, + "learning_rate": 2.587436095543488e-05, + "loss": 1.7444, + "step": 13660 + }, + { + "epoch": 2.45, + "grad_norm": 0.4463987946510315, + "learning_rate": 2.5860309302277373e-05, + "loss": 1.8348, + "step": 13665 + }, + { + "epoch": 2.45, + "grad_norm": 0.6795549392700195, + "learning_rate": 2.5846257377004874e-05, + "loss": 1.95, + "step": 13670 + }, + { + "epoch": 2.45, + "grad_norm": 0.8079219460487366, + "learning_rate": 2.583220518406197e-05, + "loss": 1.7795, + "step": 13675 + }, + { + "epoch": 2.45, + "grad_norm": 0.4398018419742584, + "learning_rate": 2.581815272789337e-05, + "loss": 2.0165, + "step": 13680 + }, + { + "epoch": 2.45, + "grad_norm": 1.9223191738128662, + "learning_rate": 2.5804100012943855e-05, + "loss": 1.7688, + "step": 13685 + }, + { + "epoch": 2.45, + "grad_norm": 0.9821711778640747, + "learning_rate": 2.579004704365827e-05, + "loss": 1.8864, + "step": 13690 + }, + { + "epoch": 2.45, + "grad_norm": 0.7216715812683105, + "learning_rate": 2.5775993824481565e-05, + "loss": 1.8994, + "step": 13695 + }, + { + "epoch": 2.45, + "grad_norm": 0.4654800295829773, + "learning_rate": 2.5761940359858767e-05, + "loss": 2.0082, + "step": 13700 + }, + { + "epoch": 2.45, + "grad_norm": 1.3378876447677612, + "learning_rate": 2.5747886654234967e-05, + "loss": 1.799, + "step": 13705 + }, + { + "epoch": 2.45, + "grad_norm": 1.4554535150527954, + "learning_rate": 2.5733832712055328e-05, + "loss": 1.8686, + "step": 13710 + }, + { + "epoch": 2.45, + "grad_norm": 0.4863360822200775, + "learning_rate": 2.5719778537765115e-05, + "loss": 2.0027, + "step": 13715 + }, + { + "epoch": 2.46, + "grad_norm": 6.204029560089111, + "learning_rate": 2.570572413580964e-05, + "loss": 2.0457, + "step": 13720 + }, + { + "epoch": 2.46, + "grad_norm": 1.010169267654419, + "learning_rate": 2.5691669510634302e-05, + "loss": 1.829, + "step": 13725 + }, + { + "epoch": 2.46, + "grad_norm": 0.463751882314682, + "learning_rate": 2.5677614666684567e-05, + "loss": 1.9938, + "step": 13730 + }, + { + "epoch": 2.46, + "grad_norm": 0.6773427724838257, + "learning_rate": 2.566355960840596e-05, + "loss": 2.0168, + "step": 13735 + }, + { + "epoch": 2.46, + "grad_norm": 0.6346594095230103, + "learning_rate": 2.5649504340244085e-05, + "loss": 1.7716, + "step": 13740 + }, + { + "epoch": 2.46, + "grad_norm": 0.40613624453544617, + "learning_rate": 2.5635448866644613e-05, + "loss": 1.7708, + "step": 13745 + }, + { + "epoch": 2.46, + "grad_norm": 0.7240647077560425, + "learning_rate": 2.5621393192053278e-05, + "loss": 2.1106, + "step": 13750 + }, + { + "epoch": 2.46, + "grad_norm": 0.6415520906448364, + "learning_rate": 2.560733732091587e-05, + "loss": 1.5509, + "step": 13755 + }, + { + "epoch": 2.46, + "grad_norm": 0.5846617221832275, + "learning_rate": 2.559328125767826e-05, + "loss": 1.7258, + "step": 13760 + }, + { + "epoch": 2.46, + "grad_norm": 0.5477493405342102, + "learning_rate": 2.557922500678635e-05, + "loss": 1.7324, + "step": 13765 + }, + { + "epoch": 2.46, + "grad_norm": 0.5833098292350769, + "learning_rate": 2.556516857268613e-05, + "loss": 1.9903, + "step": 13770 + }, + { + "epoch": 2.47, + "grad_norm": 0.5727924108505249, + "learning_rate": 2.555111195982364e-05, + "loss": 1.9487, + "step": 13775 + }, + { + "epoch": 2.47, + "grad_norm": 0.5207077264785767, + "learning_rate": 2.553705517264496e-05, + "loss": 1.9171, + "step": 13780 + }, + { + "epoch": 2.47, + "grad_norm": 0.6740149259567261, + "learning_rate": 2.5522998215596245e-05, + "loss": 1.8721, + "step": 13785 + }, + { + "epoch": 2.47, + "grad_norm": 2.3125998973846436, + "learning_rate": 2.55089410931237e-05, + "loss": 1.8894, + "step": 13790 + }, + { + "epoch": 2.47, + "grad_norm": 0.8759080767631531, + "learning_rate": 2.5494883809673576e-05, + "loss": 1.7691, + "step": 13795 + }, + { + "epoch": 2.47, + "grad_norm": 0.6559151411056519, + "learning_rate": 2.5480826369692178e-05, + "loss": 1.8659, + "step": 13800 + }, + { + "epoch": 2.47, + "grad_norm": 1.1714446544647217, + "learning_rate": 2.5466768777625872e-05, + "loss": 1.8289, + "step": 13805 + }, + { + "epoch": 2.47, + "grad_norm": 1.3722327947616577, + "learning_rate": 2.5452711037921033e-05, + "loss": 1.5297, + "step": 13810 + }, + { + "epoch": 2.47, + "grad_norm": 0.6491648554801941, + "learning_rate": 2.5438653155024145e-05, + "loss": 1.8004, + "step": 13815 + }, + { + "epoch": 2.47, + "grad_norm": 0.7832010984420776, + "learning_rate": 2.542459513338168e-05, + "loss": 1.7827, + "step": 13820 + }, + { + "epoch": 2.47, + "grad_norm": 0.854123055934906, + "learning_rate": 2.541053697744018e-05, + "loss": 2.0299, + "step": 13825 + }, + { + "epoch": 2.48, + "grad_norm": 0.6451117992401123, + "learning_rate": 2.5396478691646227e-05, + "loss": 1.848, + "step": 13830 + }, + { + "epoch": 2.48, + "grad_norm": 0.6874570846557617, + "learning_rate": 2.538242028044645e-05, + "loss": 1.7095, + "step": 13835 + }, + { + "epoch": 2.48, + "grad_norm": 2.2778282165527344, + "learning_rate": 2.53683617482875e-05, + "loss": 1.7892, + "step": 13840 + }, + { + "epoch": 2.48, + "grad_norm": 0.7223300337791443, + "learning_rate": 2.535430309961609e-05, + "loss": 1.8236, + "step": 13845 + }, + { + "epoch": 2.48, + "grad_norm": 0.7660056948661804, + "learning_rate": 2.5340244338878948e-05, + "loss": 2.1062, + "step": 13850 + }, + { + "epoch": 2.48, + "grad_norm": 0.7210371494293213, + "learning_rate": 2.5326185470522852e-05, + "loss": 1.7827, + "step": 13855 + }, + { + "epoch": 2.48, + "grad_norm": 0.532279372215271, + "learning_rate": 2.5312126498994603e-05, + "loss": 1.7299, + "step": 13860 + }, + { + "epoch": 2.48, + "grad_norm": 1.1334110498428345, + "learning_rate": 2.5298067428741034e-05, + "loss": 1.8157, + "step": 13865 + }, + { + "epoch": 2.48, + "grad_norm": 2.2065303325653076, + "learning_rate": 2.5284008264209037e-05, + "loss": 1.842, + "step": 13870 + }, + { + "epoch": 2.48, + "grad_norm": 1.4892088174819946, + "learning_rate": 2.5269949009845495e-05, + "loss": 2.1346, + "step": 13875 + }, + { + "epoch": 2.48, + "grad_norm": 0.8326558470726013, + "learning_rate": 2.525588967009734e-05, + "loss": 1.7902, + "step": 13880 + }, + { + "epoch": 2.49, + "grad_norm": 0.9677508473396301, + "learning_rate": 2.5241830249411523e-05, + "loss": 1.4476, + "step": 13885 + }, + { + "epoch": 2.49, + "grad_norm": 0.45370617508888245, + "learning_rate": 2.5227770752235043e-05, + "loss": 2.0553, + "step": 13890 + }, + { + "epoch": 2.49, + "grad_norm": 0.6809149980545044, + "learning_rate": 2.5213711183014888e-05, + "loss": 1.8837, + "step": 13895 + }, + { + "epoch": 2.49, + "grad_norm": 0.7268977165222168, + "learning_rate": 2.5199651546198084e-05, + "loss": 1.9655, + "step": 13900 + }, + { + "epoch": 2.49, + "grad_norm": 0.5356599688529968, + "learning_rate": 2.51855918462317e-05, + "loss": 1.7184, + "step": 13905 + }, + { + "epoch": 2.49, + "grad_norm": 0.551061749458313, + "learning_rate": 2.5171532087562782e-05, + "loss": 1.8176, + "step": 13910 + }, + { + "epoch": 2.49, + "grad_norm": 0.8018007278442383, + "learning_rate": 2.5157472274638437e-05, + "loss": 2.0117, + "step": 13915 + }, + { + "epoch": 2.49, + "grad_norm": 1.2345116138458252, + "learning_rate": 2.5143412411905766e-05, + "loss": 1.6238, + "step": 13920 + }, + { + "epoch": 2.49, + "grad_norm": 0.5574769377708435, + "learning_rate": 2.5129352503811876e-05, + "loss": 1.8545, + "step": 13925 + }, + { + "epoch": 2.49, + "grad_norm": 0.5945510268211365, + "learning_rate": 2.5115292554803915e-05, + "loss": 1.8348, + "step": 13930 + }, + { + "epoch": 2.49, + "grad_norm": 1.0902730226516724, + "learning_rate": 2.5101232569329042e-05, + "loss": 1.7905, + "step": 13935 + }, + { + "epoch": 2.5, + "grad_norm": 0.5143458843231201, + "learning_rate": 2.50871725518344e-05, + "loss": 1.7065, + "step": 13940 + }, + { + "epoch": 2.5, + "grad_norm": 1.1873408555984497, + "learning_rate": 2.5073112506767156e-05, + "loss": 1.9102, + "step": 13945 + }, + { + "epoch": 2.5, + "grad_norm": 0.836678147315979, + "learning_rate": 2.5059052438574504e-05, + "loss": 1.9174, + "step": 13950 + }, + { + "epoch": 2.5, + "grad_norm": 0.9396674036979675, + "learning_rate": 2.5044992351703617e-05, + "loss": 1.8286, + "step": 13955 + }, + { + "epoch": 2.5, + "grad_norm": 0.3221425712108612, + "learning_rate": 2.5030932250601696e-05, + "loss": 2.0022, + "step": 13960 + }, + { + "epoch": 2.5, + "grad_norm": 1.404722809791565, + "learning_rate": 2.5016872139715934e-05, + "loss": 1.8733, + "step": 13965 + }, + { + "epoch": 2.5, + "grad_norm": 1.7955238819122314, + "learning_rate": 2.5002812023493526e-05, + "loss": 1.7287, + "step": 13970 + }, + { + "epoch": 2.5, + "grad_norm": 0.7894271016120911, + "learning_rate": 2.498875190638168e-05, + "loss": 1.9474, + "step": 13975 + }, + { + "epoch": 2.5, + "grad_norm": 0.6397268176078796, + "learning_rate": 2.4974691792827598e-05, + "loss": 1.7299, + "step": 13980 + }, + { + "epoch": 2.5, + "grad_norm": 0.6210111975669861, + "learning_rate": 2.4960631687278475e-05, + "loss": 1.9172, + "step": 13985 + }, + { + "epoch": 2.5, + "grad_norm": 0.5007354617118835, + "learning_rate": 2.494657159418151e-05, + "loss": 2.2217, + "step": 13990 + }, + { + "epoch": 2.51, + "grad_norm": 0.6257606744766235, + "learning_rate": 2.4932511517983906e-05, + "loss": 1.8129, + "step": 13995 + }, + { + "epoch": 2.51, + "grad_norm": 0.7970540523529053, + "learning_rate": 2.4918451463132843e-05, + "loss": 1.7572, + "step": 14000 + }, + { + "epoch": 2.51, + "grad_norm": 0.6435466408729553, + "learning_rate": 2.4904391434075495e-05, + "loss": 1.827, + "step": 14005 + }, + { + "epoch": 2.51, + "grad_norm": 1.1177036762237549, + "learning_rate": 2.489033143525905e-05, + "loss": 1.8556, + "step": 14010 + }, + { + "epoch": 2.51, + "grad_norm": 1.462189793586731, + "learning_rate": 2.4876271471130664e-05, + "loss": 1.8425, + "step": 14015 + }, + { + "epoch": 2.51, + "grad_norm": 0.4156283438205719, + "learning_rate": 2.486221154613749e-05, + "loss": 1.9366, + "step": 14020 + }, + { + "epoch": 2.51, + "grad_norm": 1.2464938163757324, + "learning_rate": 2.484815166472668e-05, + "loss": 1.6903, + "step": 14025 + }, + { + "epoch": 2.51, + "grad_norm": 0.604915201663971, + "learning_rate": 2.4834091831345343e-05, + "loss": 1.6903, + "step": 14030 + }, + { + "epoch": 2.51, + "grad_norm": 0.5225396156311035, + "learning_rate": 2.4820032050440602e-05, + "loss": 1.7955, + "step": 14035 + }, + { + "epoch": 2.51, + "grad_norm": 0.6467166543006897, + "learning_rate": 2.4805972326459555e-05, + "loss": 1.9819, + "step": 14040 + }, + { + "epoch": 2.51, + "grad_norm": 1.0003973245620728, + "learning_rate": 2.479191266384928e-05, + "loss": 1.5891, + "step": 14045 + }, + { + "epoch": 2.51, + "grad_norm": 1.0571430921554565, + "learning_rate": 2.4777853067056824e-05, + "loss": 1.7885, + "step": 14050 + }, + { + "epoch": 2.52, + "grad_norm": 0.7798799872398376, + "learning_rate": 2.4763793540529226e-05, + "loss": 1.5714, + "step": 14055 + }, + { + "epoch": 2.52, + "grad_norm": 0.5746772289276123, + "learning_rate": 2.4749734088713502e-05, + "loss": 1.8001, + "step": 14060 + }, + { + "epoch": 2.52, + "grad_norm": 1.2530102729797363, + "learning_rate": 2.473567471605665e-05, + "loss": 1.8787, + "step": 14065 + }, + { + "epoch": 2.52, + "grad_norm": 0.9434322714805603, + "learning_rate": 2.4721615427005628e-05, + "loss": 2.0167, + "step": 14070 + }, + { + "epoch": 2.52, + "grad_norm": 1.5446343421936035, + "learning_rate": 2.4707556226007377e-05, + "loss": 1.5517, + "step": 14075 + }, + { + "epoch": 2.52, + "grad_norm": 0.794992983341217, + "learning_rate": 2.469349711750881e-05, + "loss": 2.0354, + "step": 14080 + }, + { + "epoch": 2.52, + "grad_norm": 0.5005180239677429, + "learning_rate": 2.467943810595682e-05, + "loss": 1.7495, + "step": 14085 + }, + { + "epoch": 2.52, + "grad_norm": 1.5188912153244019, + "learning_rate": 2.4665379195798247e-05, + "loss": 1.722, + "step": 14090 + }, + { + "epoch": 2.52, + "grad_norm": 3.6539127826690674, + "learning_rate": 2.4651320391479902e-05, + "loss": 1.621, + "step": 14095 + }, + { + "epoch": 2.52, + "grad_norm": 0.638843297958374, + "learning_rate": 2.463726169744859e-05, + "loss": 1.8223, + "step": 14100 + }, + { + "epoch": 2.52, + "grad_norm": 0.6514829397201538, + "learning_rate": 2.462320311815105e-05, + "loss": 1.6494, + "step": 14105 + }, + { + "epoch": 2.53, + "grad_norm": 1.295366883277893, + "learning_rate": 2.4609144658034e-05, + "loss": 1.8201, + "step": 14110 + }, + { + "epoch": 2.53, + "grad_norm": 0.8184346556663513, + "learning_rate": 2.4595086321544125e-05, + "loss": 2.0595, + "step": 14115 + }, + { + "epoch": 2.53, + "grad_norm": 1.1989121437072754, + "learning_rate": 2.458102811312805e-05, + "loss": 1.9181, + "step": 14120 + }, + { + "epoch": 2.53, + "grad_norm": 1.7824451923370361, + "learning_rate": 2.4566970037232385e-05, + "loss": 1.7497, + "step": 14125 + }, + { + "epoch": 2.53, + "grad_norm": 0.6604727506637573, + "learning_rate": 2.4552912098303675e-05, + "loss": 2.0074, + "step": 14130 + }, + { + "epoch": 2.53, + "grad_norm": 0.795128583908081, + "learning_rate": 2.453885430078845e-05, + "loss": 2.2564, + "step": 14135 + }, + { + "epoch": 2.53, + "grad_norm": Infinity, + "learning_rate": 2.4527608167581993e-05, + "loss": 2.0231, + "step": 14140 + }, + { + "epoch": 2.53, + "grad_norm": 0.7836667895317078, + "learning_rate": 2.451355063581608e-05, + "loss": 1.9791, + "step": 14145 + }, + { + "epoch": 2.53, + "grad_norm": 1.0901525020599365, + "learning_rate": 2.4499493257913634e-05, + "loss": 1.497, + "step": 14150 + }, + { + "epoch": 2.53, + "grad_norm": 2.3279173374176025, + "learning_rate": 2.4485436038320983e-05, + "loss": 2.0759, + "step": 14155 + }, + { + "epoch": 2.53, + "grad_norm": 1.8435465097427368, + "learning_rate": 2.447137898148443e-05, + "loss": 2.0998, + "step": 14160 + }, + { + "epoch": 2.54, + "grad_norm": 1.8504188060760498, + "learning_rate": 2.4457322091850183e-05, + "loss": 1.6399, + "step": 14165 + }, + { + "epoch": 2.54, + "grad_norm": 0.5952614545822144, + "learning_rate": 2.444326537386444e-05, + "loss": 2.058, + "step": 14170 + }, + { + "epoch": 2.54, + "grad_norm": 0.9884322285652161, + "learning_rate": 2.4429208831973333e-05, + "loss": 1.7262, + "step": 14175 + }, + { + "epoch": 2.54, + "grad_norm": 1.2859538793563843, + "learning_rate": 2.441515247062291e-05, + "loss": 1.7738, + "step": 14180 + }, + { + "epoch": 2.54, + "grad_norm": 0.47512462735176086, + "learning_rate": 2.4401096294259217e-05, + "loss": 1.8085, + "step": 14185 + }, + { + "epoch": 2.54, + "grad_norm": 0.804999828338623, + "learning_rate": 2.438704030732818e-05, + "loss": 1.7905, + "step": 14190 + }, + { + "epoch": 2.54, + "grad_norm": 1.0102812051773071, + "learning_rate": 2.43729845142757e-05, + "loss": 2.0173, + "step": 14195 + }, + { + "epoch": 2.54, + "grad_norm": 0.7093918919563293, + "learning_rate": 2.4358928919547616e-05, + "loss": 1.8977, + "step": 14200 + }, + { + "epoch": 2.54, + "grad_norm": 1.1343001127243042, + "learning_rate": 2.43448735275897e-05, + "loss": 1.5511, + "step": 14205 + }, + { + "epoch": 2.54, + "grad_norm": 0.6553163528442383, + "learning_rate": 2.4330818342847655e-05, + "loss": 1.5871, + "step": 14210 + }, + { + "epoch": 2.54, + "grad_norm": 1.924633502960205, + "learning_rate": 2.4316763369767127e-05, + "loss": 2.0279, + "step": 14215 + }, + { + "epoch": 2.55, + "grad_norm": 1.7396385669708252, + "learning_rate": 2.4302708612793686e-05, + "loss": 1.8374, + "step": 14220 + }, + { + "epoch": 2.55, + "grad_norm": 0.5724390149116516, + "learning_rate": 2.4288654076372842e-05, + "loss": 2.1751, + "step": 14225 + }, + { + "epoch": 2.55, + "grad_norm": 0.5967565774917603, + "learning_rate": 2.4274599764950034e-05, + "loss": 1.8294, + "step": 14230 + }, + { + "epoch": 2.55, + "grad_norm": 0.9949921369552612, + "learning_rate": 2.426054568297062e-05, + "loss": 1.8312, + "step": 14235 + }, + { + "epoch": 2.55, + "grad_norm": 0.9570918083190918, + "learning_rate": 2.4246491834879892e-05, + "loss": 1.8949, + "step": 14240 + }, + { + "epoch": 2.55, + "grad_norm": 0.5771768093109131, + "learning_rate": 2.423243822512307e-05, + "loss": 1.8138, + "step": 14245 + }, + { + "epoch": 2.55, + "grad_norm": 0.2878669500350952, + "learning_rate": 2.4218384858145306e-05, + "loss": 1.7788, + "step": 14250 + }, + { + "epoch": 2.55, + "grad_norm": 1.2565678358078003, + "learning_rate": 2.420433173839165e-05, + "loss": 1.6618, + "step": 14255 + }, + { + "epoch": 2.55, + "grad_norm": 2.0233705043792725, + "learning_rate": 2.41902788703071e-05, + "loss": 1.8718, + "step": 14260 + }, + { + "epoch": 2.55, + "grad_norm": 1.1918349266052246, + "learning_rate": 2.4176226258336574e-05, + "loss": 1.7568, + "step": 14265 + }, + { + "epoch": 2.55, + "grad_norm": 0.5747388601303101, + "learning_rate": 2.4162173906924882e-05, + "loss": 1.975, + "step": 14270 + }, + { + "epoch": 2.56, + "grad_norm": 0.6768869161605835, + "learning_rate": 2.4148121820516785e-05, + "loss": 2.0272, + "step": 14275 + }, + { + "epoch": 2.56, + "grad_norm": 0.4785836338996887, + "learning_rate": 2.413407000355694e-05, + "loss": 1.7301, + "step": 14280 + }, + { + "epoch": 2.56, + "grad_norm": 0.7693573236465454, + "learning_rate": 2.412001846048991e-05, + "loss": 1.7174, + "step": 14285 + }, + { + "epoch": 2.56, + "grad_norm": 1.5978009700775146, + "learning_rate": 2.4105967195760192e-05, + "loss": 1.8307, + "step": 14290 + }, + { + "epoch": 2.56, + "grad_norm": 0.6870273947715759, + "learning_rate": 2.40919162138122e-05, + "loss": 1.6285, + "step": 14295 + }, + { + "epoch": 2.56, + "grad_norm": 0.6152418255805969, + "learning_rate": 2.407786551909023e-05, + "loss": 2.058, + "step": 14300 + }, + { + "epoch": 2.56, + "grad_norm": 1.458072543144226, + "learning_rate": 2.4063815116038512e-05, + "loss": 1.4863, + "step": 14305 + }, + { + "epoch": 2.56, + "grad_norm": 0.782149612903595, + "learning_rate": 2.4049765009101165e-05, + "loss": 1.7991, + "step": 14310 + }, + { + "epoch": 2.56, + "grad_norm": 0.7315062880516052, + "learning_rate": 2.4035715202722237e-05, + "loss": 1.6951, + "step": 14315 + }, + { + "epoch": 2.56, + "grad_norm": 2.745988130569458, + "learning_rate": 2.4021665701345667e-05, + "loss": 1.8848, + "step": 14320 + }, + { + "epoch": 2.56, + "grad_norm": 0.6406875848770142, + "learning_rate": 2.40076165094153e-05, + "loss": 2.0148, + "step": 14325 + }, + { + "epoch": 2.56, + "grad_norm": 0.698235809803009, + "learning_rate": 2.3993567631374862e-05, + "loss": 2.1415, + "step": 14330 + }, + { + "epoch": 2.57, + "grad_norm": 0.499514639377594, + "learning_rate": 2.397951907166802e-05, + "loss": 2.03, + "step": 14335 + }, + { + "epoch": 2.57, + "grad_norm": 1.9097588062286377, + "learning_rate": 2.3965470834738312e-05, + "loss": 1.8086, + "step": 14340 + }, + { + "epoch": 2.57, + "grad_norm": 0.8855141401290894, + "learning_rate": 2.3951422925029184e-05, + "loss": 1.465, + "step": 14345 + }, + { + "epoch": 2.57, + "grad_norm": 0.5352545976638794, + "learning_rate": 2.393737534698398e-05, + "loss": 1.7741, + "step": 14350 + }, + { + "epoch": 2.57, + "grad_norm": 2.3942019939422607, + "learning_rate": 2.3923328105045928e-05, + "loss": 2.0296, + "step": 14355 + }, + { + "epoch": 2.57, + "grad_norm": 1.186553716659546, + "learning_rate": 2.390928120365816e-05, + "loss": 2.0242, + "step": 14360 + }, + { + "epoch": 2.57, + "grad_norm": 0.4111693203449249, + "learning_rate": 2.3895234647263704e-05, + "loss": 1.9497, + "step": 14365 + }, + { + "epoch": 2.57, + "grad_norm": 0.9013734459877014, + "learning_rate": 2.3881188440305468e-05, + "loss": 1.9767, + "step": 14370 + }, + { + "epoch": 2.57, + "grad_norm": 0.960102379322052, + "learning_rate": 2.3867142587226247e-05, + "loss": 2.0041, + "step": 14375 + }, + { + "epoch": 2.57, + "grad_norm": 0.4831627905368805, + "learning_rate": 2.3853097092468732e-05, + "loss": 1.5793, + "step": 14380 + }, + { + "epoch": 2.57, + "grad_norm": 0.7343013882637024, + "learning_rate": 2.383905196047551e-05, + "loss": 1.647, + "step": 14385 + }, + { + "epoch": 2.58, + "grad_norm": 1.5119633674621582, + "learning_rate": 2.382500719568903e-05, + "loss": 1.7457, + "step": 14390 + }, + { + "epoch": 2.58, + "grad_norm": 0.567718505859375, + "learning_rate": 2.381096280255165e-05, + "loss": 1.9885, + "step": 14395 + }, + { + "epoch": 2.58, + "grad_norm": 1.3946149349212646, + "learning_rate": 2.3796918785505587e-05, + "loss": 1.9862, + "step": 14400 + }, + { + "epoch": 2.58, + "grad_norm": 0.8330457210540771, + "learning_rate": 2.3782875148992954e-05, + "loss": 1.8574, + "step": 14405 + }, + { + "epoch": 2.58, + "grad_norm": 0.6044062376022339, + "learning_rate": 2.3768831897455745e-05, + "loss": 1.796, + "step": 14410 + }, + { + "epoch": 2.58, + "grad_norm": 1.53191077709198, + "learning_rate": 2.375478903533582e-05, + "loss": 1.9155, + "step": 14415 + }, + { + "epoch": 2.58, + "grad_norm": 0.6012637615203857, + "learning_rate": 2.3740746567074935e-05, + "loss": 2.1244, + "step": 14420 + }, + { + "epoch": 2.58, + "grad_norm": 0.6626573204994202, + "learning_rate": 2.3726704497114687e-05, + "loss": 1.6726, + "step": 14425 + }, + { + "epoch": 2.58, + "grad_norm": 1.1490142345428467, + "learning_rate": 2.371266282989659e-05, + "loss": 1.7387, + "step": 14430 + }, + { + "epoch": 2.58, + "grad_norm": 0.9652857780456543, + "learning_rate": 2.3698621569861996e-05, + "loss": 1.6093, + "step": 14435 + }, + { + "epoch": 2.58, + "grad_norm": 0.8638285398483276, + "learning_rate": 2.3684580721452153e-05, + "loss": 1.9487, + "step": 14440 + }, + { + "epoch": 2.59, + "grad_norm": 0.6422768235206604, + "learning_rate": 2.3670540289108153e-05, + "loss": 1.8297, + "step": 14445 + }, + { + "epoch": 2.59, + "grad_norm": 1.0574924945831299, + "learning_rate": 2.3656500277270983e-05, + "loss": 1.9849, + "step": 14450 + }, + { + "epoch": 2.59, + "grad_norm": 0.574671745300293, + "learning_rate": 2.3642460690381495e-05, + "loss": 2.0423, + "step": 14455 + }, + { + "epoch": 2.59, + "grad_norm": 0.7926428914070129, + "learning_rate": 2.362842153288037e-05, + "loss": 1.5625, + "step": 14460 + }, + { + "epoch": 2.59, + "grad_norm": 0.752262532711029, + "learning_rate": 2.361438280920821e-05, + "loss": 2.0275, + "step": 14465 + }, + { + "epoch": 2.59, + "grad_norm": 0.7062698602676392, + "learning_rate": 2.3600344523805424e-05, + "loss": 2.1019, + "step": 14470 + }, + { + "epoch": 2.59, + "grad_norm": 0.3775525689125061, + "learning_rate": 2.3586306681112323e-05, + "loss": 1.9513, + "step": 14475 + }, + { + "epoch": 2.59, + "grad_norm": 1.0212444067001343, + "learning_rate": 2.3572269285569054e-05, + "loss": 1.9915, + "step": 14480 + }, + { + "epoch": 2.59, + "grad_norm": 0.917746901512146, + "learning_rate": 2.3558232341615643e-05, + "loss": 1.8467, + "step": 14485 + }, + { + "epoch": 2.59, + "grad_norm": 0.5298284888267517, + "learning_rate": 2.354419585369195e-05, + "loss": 1.8151, + "step": 14490 + }, + { + "epoch": 2.59, + "grad_norm": 0.6283917427062988, + "learning_rate": 2.3530159826237707e-05, + "loss": 2.2388, + "step": 14495 + }, + { + "epoch": 2.6, + "grad_norm": 0.9578955769538879, + "learning_rate": 2.3516124263692502e-05, + "loss": 1.5422, + "step": 14500 + }, + { + "epoch": 2.6, + "grad_norm": 0.6109092235565186, + "learning_rate": 2.3502089170495762e-05, + "loss": 1.9209, + "step": 14505 + }, + { + "epoch": 2.6, + "grad_norm": 0.5404300093650818, + "learning_rate": 2.3488054551086785e-05, + "loss": 1.9089, + "step": 14510 + }, + { + "epoch": 2.6, + "grad_norm": 0.5609441995620728, + "learning_rate": 2.347402040990469e-05, + "loss": 1.8755, + "step": 14515 + }, + { + "epoch": 2.6, + "grad_norm": 4.9784345626831055, + "learning_rate": 2.3459986751388478e-05, + "loss": 1.9347, + "step": 14520 + }, + { + "epoch": 2.6, + "grad_norm": 2.010941505432129, + "learning_rate": 2.3445953579976968e-05, + "loss": 1.7676, + "step": 14525 + }, + { + "epoch": 2.6, + "grad_norm": 0.6856008172035217, + "learning_rate": 2.343192090010885e-05, + "loss": 1.9795, + "step": 14530 + }, + { + "epoch": 2.6, + "grad_norm": 0.712049126625061, + "learning_rate": 2.3417888716222643e-05, + "loss": 1.8283, + "step": 14535 + }, + { + "epoch": 2.6, + "grad_norm": 0.41500189900398254, + "learning_rate": 2.3403857032756706e-05, + "loss": 1.8836, + "step": 14540 + }, + { + "epoch": 2.6, + "grad_norm": 0.3549801707267761, + "learning_rate": 2.3389825854149263e-05, + "loss": 1.8883, + "step": 14545 + }, + { + "epoch": 2.6, + "grad_norm": 0.8253669142723083, + "learning_rate": 2.337579518483835e-05, + "loss": 1.522, + "step": 14550 + }, + { + "epoch": 2.61, + "grad_norm": 0.8156489729881287, + "learning_rate": 2.3361765029261863e-05, + "loss": 1.7102, + "step": 14555 + }, + { + "epoch": 2.61, + "grad_norm": 0.8357172608375549, + "learning_rate": 2.334773539185752e-05, + "loss": 1.7754, + "step": 14560 + }, + { + "epoch": 2.61, + "grad_norm": 0.7100852727890015, + "learning_rate": 2.333370627706288e-05, + "loss": 1.4992, + "step": 14565 + }, + { + "epoch": 2.61, + "grad_norm": 0.7589089870452881, + "learning_rate": 2.3319677689315338e-05, + "loss": 2.224, + "step": 14570 + }, + { + "epoch": 2.61, + "grad_norm": 0.49599790573120117, + "learning_rate": 2.3305649633052133e-05, + "loss": 1.5944, + "step": 14575 + }, + { + "epoch": 2.61, + "grad_norm": 0.9817653894424438, + "learning_rate": 2.3291622112710316e-05, + "loss": 1.6229, + "step": 14580 + }, + { + "epoch": 2.61, + "grad_norm": 0.6684935688972473, + "learning_rate": 2.327759513272678e-05, + "loss": 1.863, + "step": 14585 + }, + { + "epoch": 2.61, + "grad_norm": 0.7037563323974609, + "learning_rate": 2.326356869753825e-05, + "loss": 2.113, + "step": 14590 + }, + { + "epoch": 2.61, + "grad_norm": 0.6015907526016235, + "learning_rate": 2.324954281158127e-05, + "loss": 1.7673, + "step": 14595 + }, + { + "epoch": 2.61, + "grad_norm": 0.7683700323104858, + "learning_rate": 2.323551747929222e-05, + "loss": 1.9316, + "step": 14600 + }, + { + "epoch": 2.61, + "grad_norm": 3.4007463455200195, + "learning_rate": 2.3221492705107294e-05, + "loss": 1.6905, + "step": 14605 + }, + { + "epoch": 2.62, + "grad_norm": 1.5388293266296387, + "learning_rate": 2.320746849346251e-05, + "loss": 2.086, + "step": 14610 + }, + { + "epoch": 2.62, + "grad_norm": 0.5219169855117798, + "learning_rate": 2.3193444848793713e-05, + "loss": 1.9448, + "step": 14615 + }, + { + "epoch": 2.62, + "grad_norm": 1.7064671516418457, + "learning_rate": 2.317942177553658e-05, + "loss": 1.5938, + "step": 14620 + }, + { + "epoch": 2.62, + "grad_norm": 2.014631986618042, + "learning_rate": 2.3165399278126583e-05, + "loss": 1.9672, + "step": 14625 + }, + { + "epoch": 2.62, + "grad_norm": 0.6399771571159363, + "learning_rate": 2.3151377360999025e-05, + "loss": 1.986, + "step": 14630 + }, + { + "epoch": 2.62, + "grad_norm": 1.9371715784072876, + "learning_rate": 2.3137356028589042e-05, + "loss": 1.8568, + "step": 14635 + }, + { + "epoch": 2.62, + "grad_norm": 1.1944173574447632, + "learning_rate": 2.3123335285331547e-05, + "loss": 2.001, + "step": 14640 + }, + { + "epoch": 2.62, + "grad_norm": 0.6138972640037537, + "learning_rate": 2.3109315135661302e-05, + "loss": 1.8779, + "step": 14645 + }, + { + "epoch": 2.62, + "grad_norm": 2.2491681575775146, + "learning_rate": 2.3095295584012865e-05, + "loss": 1.6365, + "step": 14650 + }, + { + "epoch": 2.62, + "grad_norm": 0.8489646911621094, + "learning_rate": 2.3081276634820595e-05, + "loss": 1.7617, + "step": 14655 + }, + { + "epoch": 2.62, + "grad_norm": 0.6606999635696411, + "learning_rate": 2.3067258292518675e-05, + "loss": 1.9637, + "step": 14660 + }, + { + "epoch": 2.62, + "grad_norm": 0.7351112961769104, + "learning_rate": 2.3053240561541107e-05, + "loss": 1.6616, + "step": 14665 + }, + { + "epoch": 2.63, + "grad_norm": 0.5655643343925476, + "learning_rate": 2.303922344632167e-05, + "loss": 1.6532, + "step": 14670 + }, + { + "epoch": 2.63, + "grad_norm": 0.6417161226272583, + "learning_rate": 2.3025206951293973e-05, + "loss": 1.8476, + "step": 14675 + }, + { + "epoch": 2.63, + "grad_norm": 0.9483529925346375, + "learning_rate": 2.301119108089142e-05, + "loss": 1.5566, + "step": 14680 + }, + { + "epoch": 2.63, + "grad_norm": 0.7348762154579163, + "learning_rate": 2.2997175839547215e-05, + "loss": 1.8403, + "step": 14685 + }, + { + "epoch": 2.63, + "grad_norm": 1.085129976272583, + "learning_rate": 2.2983161231694364e-05, + "loss": 1.5614, + "step": 14690 + }, + { + "epoch": 2.63, + "grad_norm": 0.830359697341919, + "learning_rate": 2.2969147261765675e-05, + "loss": 1.5355, + "step": 14695 + }, + { + "epoch": 2.63, + "grad_norm": 1.0169621706008911, + "learning_rate": 2.2955133934193762e-05, + "loss": 2.0021, + "step": 14700 + }, + { + "epoch": 2.63, + "grad_norm": 0.5929272174835205, + "learning_rate": 2.2941121253411008e-05, + "loss": 1.65, + "step": 14705 + }, + { + "epoch": 2.63, + "grad_norm": 1.135832667350769, + "learning_rate": 2.2927109223849626e-05, + "loss": 1.8439, + "step": 14710 + }, + { + "epoch": 2.63, + "grad_norm": 0.7020158767700195, + "learning_rate": 2.2913097849941602e-05, + "loss": 1.857, + "step": 14715 + }, + { + "epoch": 2.63, + "grad_norm": 0.45904645323753357, + "learning_rate": 2.2899087136118717e-05, + "loss": 1.6038, + "step": 14720 + }, + { + "epoch": 2.64, + "grad_norm": 0.3881944417953491, + "learning_rate": 2.2885077086812555e-05, + "loss": 2.0703, + "step": 14725 + }, + { + "epoch": 2.64, + "grad_norm": 0.6580604910850525, + "learning_rate": 2.2871067706454472e-05, + "loss": 1.718, + "step": 14730 + }, + { + "epoch": 2.64, + "grad_norm": 1.6806758642196655, + "learning_rate": 2.285705899947563e-05, + "loss": 1.5215, + "step": 14735 + }, + { + "epoch": 2.64, + "grad_norm": 0.8426775932312012, + "learning_rate": 2.284305097030696e-05, + "loss": 1.8828, + "step": 14740 + }, + { + "epoch": 2.64, + "grad_norm": 0.8580935597419739, + "learning_rate": 2.2829043623379202e-05, + "loss": 1.9616, + "step": 14745 + }, + { + "epoch": 2.64, + "grad_norm": 0.5852141380310059, + "learning_rate": 2.281503696312285e-05, + "loss": 1.8733, + "step": 14750 + }, + { + "epoch": 2.64, + "grad_norm": 0.9512518644332886, + "learning_rate": 2.2801030993968213e-05, + "loss": 2.0427, + "step": 14755 + }, + { + "epoch": 2.64, + "grad_norm": 0.8054193258285522, + "learning_rate": 2.278702572034535e-05, + "loss": 1.6561, + "step": 14760 + }, + { + "epoch": 2.64, + "grad_norm": 0.729395866394043, + "learning_rate": 2.277302114668412e-05, + "loss": 1.7398, + "step": 14765 + }, + { + "epoch": 2.64, + "grad_norm": 0.750243067741394, + "learning_rate": 2.2759017277414166e-05, + "loss": 1.7189, + "step": 14770 + }, + { + "epoch": 2.64, + "grad_norm": 0.8767192959785461, + "learning_rate": 2.2745014116964884e-05, + "loss": 1.7503, + "step": 14775 + }, + { + "epoch": 2.65, + "grad_norm": 1.0850597620010376, + "learning_rate": 2.273101166976548e-05, + "loss": 1.8113, + "step": 14780 + }, + { + "epoch": 2.65, + "grad_norm": 1.1997833251953125, + "learning_rate": 2.2717009940244892e-05, + "loss": 1.9981, + "step": 14785 + }, + { + "epoch": 2.65, + "grad_norm": 0.7039926648139954, + "learning_rate": 2.2703008932831874e-05, + "loss": 1.8546, + "step": 14790 + }, + { + "epoch": 2.65, + "grad_norm": 0.5928205847740173, + "learning_rate": 2.268900865195491e-05, + "loss": 1.786, + "step": 14795 + }, + { + "epoch": 2.65, + "grad_norm": 2.150042772293091, + "learning_rate": 2.2675009102042297e-05, + "loss": 2.0728, + "step": 14800 + }, + { + "epoch": 2.65, + "grad_norm": 0.6576092839241028, + "learning_rate": 2.2661010287522057e-05, + "loss": 1.9185, + "step": 14805 + }, + { + "epoch": 2.65, + "grad_norm": 0.43387851119041443, + "learning_rate": 2.2647012212822023e-05, + "loss": 2.0831, + "step": 14810 + }, + { + "epoch": 2.65, + "grad_norm": 2.2747631072998047, + "learning_rate": 2.263301488236976e-05, + "loss": 1.9771, + "step": 14815 + }, + { + "epoch": 2.65, + "grad_norm": 0.780261218547821, + "learning_rate": 2.2619018300592603e-05, + "loss": 1.7917, + "step": 14820 + }, + { + "epoch": 2.65, + "grad_norm": 0.6798030734062195, + "learning_rate": 2.260502247191768e-05, + "loss": 1.5835, + "step": 14825 + }, + { + "epoch": 2.65, + "grad_norm": 1.7987754344940186, + "learning_rate": 2.2591027400771838e-05, + "loss": 1.5079, + "step": 14830 + }, + { + "epoch": 2.66, + "grad_norm": 0.6671625971794128, + "learning_rate": 2.2577033091581715e-05, + "loss": 1.9078, + "step": 14835 + }, + { + "epoch": 2.66, + "grad_norm": 0.9358892440795898, + "learning_rate": 2.25630395487737e-05, + "loss": 1.7289, + "step": 14840 + }, + { + "epoch": 2.66, + "grad_norm": 0.6218108534812927, + "learning_rate": 2.2549046776773932e-05, + "loss": 1.6662, + "step": 14845 + }, + { + "epoch": 2.66, + "grad_norm": 1.6321266889572144, + "learning_rate": 2.2535054780008302e-05, + "loss": 1.9075, + "step": 14850 + }, + { + "epoch": 2.66, + "grad_norm": 1.0128587484359741, + "learning_rate": 2.252106356290248e-05, + "loss": 1.9468, + "step": 14855 + }, + { + "epoch": 2.66, + "grad_norm": 0.7265017628669739, + "learning_rate": 2.2507073129881863e-05, + "loss": 1.8416, + "step": 14860 + }, + { + "epoch": 2.66, + "grad_norm": 0.9810836911201477, + "learning_rate": 2.249308348537162e-05, + "loss": 1.5815, + "step": 14865 + }, + { + "epoch": 2.66, + "grad_norm": 0.7224981784820557, + "learning_rate": 2.247909463379666e-05, + "loss": 1.9182, + "step": 14870 + }, + { + "epoch": 2.66, + "grad_norm": 0.7442790269851685, + "learning_rate": 2.246510657958164e-05, + "loss": 1.9225, + "step": 14875 + }, + { + "epoch": 2.66, + "grad_norm": 0.815650224685669, + "learning_rate": 2.245111932715097e-05, + "loss": 1.8601, + "step": 14880 + }, + { + "epoch": 2.66, + "grad_norm": 1.262229323387146, + "learning_rate": 2.2437132880928817e-05, + "loss": 2.0791, + "step": 14885 + }, + { + "epoch": 2.67, + "grad_norm": 0.8286881446838379, + "learning_rate": 2.2423147245339062e-05, + "loss": 1.9286, + "step": 14890 + }, + { + "epoch": 2.67, + "grad_norm": 2.7588956356048584, + "learning_rate": 2.240916242480535e-05, + "loss": 1.697, + "step": 14895 + }, + { + "epoch": 2.67, + "grad_norm": 0.852202296257019, + "learning_rate": 2.2395178423751076e-05, + "loss": 1.7105, + "step": 14900 + }, + { + "epoch": 2.67, + "grad_norm": 0.785529375076294, + "learning_rate": 2.2381195246599356e-05, + "loss": 1.7383, + "step": 14905 + }, + { + "epoch": 2.67, + "grad_norm": 0.9461120963096619, + "learning_rate": 2.236721289777306e-05, + "loss": 1.8059, + "step": 14910 + }, + { + "epoch": 2.67, + "grad_norm": 0.5357199311256409, + "learning_rate": 2.2353231381694797e-05, + "loss": 2.066, + "step": 14915 + }, + { + "epoch": 2.67, + "grad_norm": 2.159280300140381, + "learning_rate": 2.23392507027869e-05, + "loss": 1.8313, + "step": 14920 + }, + { + "epoch": 2.67, + "grad_norm": 0.9953837394714355, + "learning_rate": 2.2325270865471444e-05, + "loss": 1.8698, + "step": 14925 + }, + { + "epoch": 2.67, + "grad_norm": 0.6990963220596313, + "learning_rate": 2.231129187417025e-05, + "loss": 1.7352, + "step": 14930 + }, + { + "epoch": 2.67, + "grad_norm": 0.6935223937034607, + "learning_rate": 2.2297313733304845e-05, + "loss": 1.514, + "step": 14935 + }, + { + "epoch": 2.67, + "grad_norm": 0.5469807386398315, + "learning_rate": 2.22833364472965e-05, + "loss": 1.8815, + "step": 14940 + }, + { + "epoch": 2.68, + "grad_norm": 0.5818589925765991, + "learning_rate": 2.2269360020566232e-05, + "loss": 1.9459, + "step": 14945 + }, + { + "epoch": 2.68, + "grad_norm": 1.0151652097702026, + "learning_rate": 2.2255384457534757e-05, + "loss": 1.8557, + "step": 14950 + }, + { + "epoch": 2.68, + "grad_norm": 1.0191221237182617, + "learning_rate": 2.2241409762622534e-05, + "loss": 1.5992, + "step": 14955 + }, + { + "epoch": 2.68, + "grad_norm": 1.0580757856369019, + "learning_rate": 2.222743594024976e-05, + "loss": 1.8454, + "step": 14960 + }, + { + "epoch": 2.68, + "grad_norm": 0.6717436909675598, + "learning_rate": 2.2213462994836323e-05, + "loss": 2.0803, + "step": 14965 + }, + { + "epoch": 2.68, + "grad_norm": 0.7515537738800049, + "learning_rate": 2.2199490930801862e-05, + "loss": 1.6758, + "step": 14970 + }, + { + "epoch": 2.68, + "grad_norm": 1.05194890499115, + "learning_rate": 2.2185519752565733e-05, + "loss": 1.7331, + "step": 14975 + }, + { + "epoch": 2.68, + "grad_norm": 1.2085919380187988, + "learning_rate": 2.2171549464547e-05, + "loss": 1.8599, + "step": 14980 + }, + { + "epoch": 2.68, + "grad_norm": 0.7703227400779724, + "learning_rate": 2.215758007116444e-05, + "loss": 1.8927, + "step": 14985 + }, + { + "epoch": 2.68, + "grad_norm": 0.7714345455169678, + "learning_rate": 2.214361157683658e-05, + "loss": 1.4521, + "step": 14990 + }, + { + "epoch": 2.68, + "grad_norm": 0.7530484795570374, + "learning_rate": 2.212964398598162e-05, + "loss": 1.9209, + "step": 14995 + }, + { + "epoch": 2.68, + "grad_norm": 0.6522998213768005, + "learning_rate": 2.211567730301751e-05, + "loss": 1.9337, + "step": 15000 + }, + { + "epoch": 2.69, + "grad_norm": 0.6403537392616272, + "learning_rate": 2.21017115323619e-05, + "loss": 1.8753, + "step": 15005 + }, + { + "epoch": 2.69, + "grad_norm": 0.9187439680099487, + "learning_rate": 2.2087746678432135e-05, + "loss": 1.7973, + "step": 15010 + }, + { + "epoch": 2.69, + "grad_norm": 0.840508222579956, + "learning_rate": 2.20737827456453e-05, + "loss": 2.1179, + "step": 15015 + }, + { + "epoch": 2.69, + "grad_norm": 1.1166287660598755, + "learning_rate": 2.205981973841817e-05, + "loss": 1.5053, + "step": 15020 + }, + { + "epoch": 2.69, + "grad_norm": 1.1791390180587769, + "learning_rate": 2.2045857661167235e-05, + "loss": 1.7282, + "step": 15025 + }, + { + "epoch": 2.69, + "grad_norm": 1.0932645797729492, + "learning_rate": 2.203189651830867e-05, + "loss": 1.9909, + "step": 15030 + }, + { + "epoch": 2.69, + "grad_norm": 2.5372047424316406, + "learning_rate": 2.2017936314258385e-05, + "loss": 2.1166, + "step": 15035 + }, + { + "epoch": 2.69, + "grad_norm": 0.7102367877960205, + "learning_rate": 2.2003977053431972e-05, + "loss": 2.0205, + "step": 15040 + }, + { + "epoch": 2.69, + "grad_norm": 3.9497880935668945, + "learning_rate": 2.1990018740244733e-05, + "loss": 1.8872, + "step": 15045 + }, + { + "epoch": 2.69, + "grad_norm": 0.714215874671936, + "learning_rate": 2.1976061379111677e-05, + "loss": 1.9495, + "step": 15050 + }, + { + "epoch": 2.69, + "grad_norm": 0.8167940974235535, + "learning_rate": 2.196210497444749e-05, + "loss": 1.8913, + "step": 15055 + }, + { + "epoch": 2.7, + "grad_norm": 1.2327383756637573, + "learning_rate": 2.194814953066659e-05, + "loss": 1.9469, + "step": 15060 + }, + { + "epoch": 2.7, + "grad_norm": 0.683508574962616, + "learning_rate": 2.1934195052183047e-05, + "loss": 1.9397, + "step": 15065 + }, + { + "epoch": 2.7, + "grad_norm": 1.7274824380874634, + "learning_rate": 2.1920241543410673e-05, + "loss": 1.9172, + "step": 15070 + }, + { + "epoch": 2.7, + "grad_norm": 0.8672675490379333, + "learning_rate": 2.1906289008762926e-05, + "loss": 1.7328, + "step": 15075 + }, + { + "epoch": 2.7, + "grad_norm": 1.210166335105896, + "learning_rate": 2.1892337452652996e-05, + "loss": 1.6836, + "step": 15080 + }, + { + "epoch": 2.7, + "grad_norm": 0.42006659507751465, + "learning_rate": 2.1878386879493732e-05, + "loss": 1.7958, + "step": 15085 + }, + { + "epoch": 2.7, + "grad_norm": 0.6762058734893799, + "learning_rate": 2.18644372936977e-05, + "loss": 1.9698, + "step": 15090 + }, + { + "epoch": 2.7, + "grad_norm": 2.0761775970458984, + "learning_rate": 2.1850488699677137e-05, + "loss": 1.711, + "step": 15095 + }, + { + "epoch": 2.7, + "grad_norm": 0.4220300614833832, + "learning_rate": 2.1836541101843964e-05, + "loss": 1.9976, + "step": 15100 + }, + { + "epoch": 2.7, + "grad_norm": 1.0192917585372925, + "learning_rate": 2.1822594504609802e-05, + "loss": 1.6178, + "step": 15105 + }, + { + "epoch": 2.7, + "grad_norm": 0.5573521256446838, + "learning_rate": 2.180864891238594e-05, + "loss": 1.8285, + "step": 15110 + }, + { + "epoch": 2.71, + "grad_norm": 0.7691249847412109, + "learning_rate": 2.1794704329583353e-05, + "loss": 1.8416, + "step": 15115 + }, + { + "epoch": 2.71, + "grad_norm": 0.6303778290748596, + "learning_rate": 2.178076076061272e-05, + "loss": 1.9445, + "step": 15120 + }, + { + "epoch": 2.71, + "grad_norm": 1.794248342514038, + "learning_rate": 2.1766818209884355e-05, + "loss": 1.772, + "step": 15125 + }, + { + "epoch": 2.71, + "grad_norm": 0.5555631518363953, + "learning_rate": 2.1752876681808272e-05, + "loss": 1.9228, + "step": 15130 + }, + { + "epoch": 2.71, + "grad_norm": 0.8108105063438416, + "learning_rate": 2.1738936180794173e-05, + "loss": 1.7449, + "step": 15135 + }, + { + "epoch": 2.71, + "grad_norm": 0.45947834849357605, + "learning_rate": 2.1724996711251426e-05, + "loss": 2.1185, + "step": 15140 + }, + { + "epoch": 2.71, + "grad_norm": 0.4830736815929413, + "learning_rate": 2.1711058277589066e-05, + "loss": 1.8752, + "step": 15145 + }, + { + "epoch": 2.71, + "grad_norm": 0.6274514198303223, + "learning_rate": 2.1697120884215817e-05, + "loss": 2.2124, + "step": 15150 + }, + { + "epoch": 2.71, + "grad_norm": 0.7346020340919495, + "learning_rate": 2.1683184535540046e-05, + "loss": 1.7558, + "step": 15155 + }, + { + "epoch": 2.71, + "grad_norm": 1.0452593564987183, + "learning_rate": 2.166924923596982e-05, + "loss": 1.692, + "step": 15160 + }, + { + "epoch": 2.71, + "grad_norm": 0.8475756645202637, + "learning_rate": 2.165531498991286e-05, + "loss": 1.6763, + "step": 15165 + }, + { + "epoch": 2.72, + "grad_norm": 0.5865801572799683, + "learning_rate": 2.1641381801776557e-05, + "loss": 1.998, + "step": 15170 + }, + { + "epoch": 2.72, + "grad_norm": 0.8659747242927551, + "learning_rate": 2.1627449675967945e-05, + "loss": 1.9164, + "step": 15175 + }, + { + "epoch": 2.72, + "grad_norm": 0.834214985370636, + "learning_rate": 2.161351861689376e-05, + "loss": 1.8393, + "step": 15180 + }, + { + "epoch": 2.72, + "grad_norm": 0.6650885939598083, + "learning_rate": 2.1599588628960384e-05, + "loss": 1.8873, + "step": 15185 + }, + { + "epoch": 2.72, + "grad_norm": 0.6627525687217712, + "learning_rate": 2.158565971657385e-05, + "loss": 1.8317, + "step": 15190 + }, + { + "epoch": 2.72, + "grad_norm": 2.031599998474121, + "learning_rate": 2.157173188413987e-05, + "loss": 2.1292, + "step": 15195 + }, + { + "epoch": 2.72, + "grad_norm": 0.6217500567436218, + "learning_rate": 2.1557805136063787e-05, + "loss": 1.9539, + "step": 15200 + }, + { + "epoch": 2.72, + "grad_norm": 0.5643689036369324, + "learning_rate": 2.1543879476750632e-05, + "loss": 2.012, + "step": 15205 + }, + { + "epoch": 2.72, + "grad_norm": 0.619275689125061, + "learning_rate": 2.1529954910605084e-05, + "loss": 1.8118, + "step": 15210 + }, + { + "epoch": 2.72, + "grad_norm": 0.6966983675956726, + "learning_rate": 2.1516031442031452e-05, + "loss": 1.9702, + "step": 15215 + }, + { + "epoch": 2.72, + "grad_norm": 0.824952244758606, + "learning_rate": 2.1502109075433725e-05, + "loss": 1.8539, + "step": 15220 + }, + { + "epoch": 2.73, + "grad_norm": 0.7268326878547668, + "learning_rate": 2.1488187815215527e-05, + "loss": 1.8079, + "step": 15225 + }, + { + "epoch": 2.73, + "grad_norm": 0.47795695066452026, + "learning_rate": 2.1474267665780153e-05, + "loss": 1.8452, + "step": 15230 + }, + { + "epoch": 2.73, + "grad_norm": 1.0499993562698364, + "learning_rate": 2.146034863153052e-05, + "loss": 1.9181, + "step": 15235 + }, + { + "epoch": 2.73, + "grad_norm": 0.5682938694953918, + "learning_rate": 2.1446430716869215e-05, + "loss": 1.7971, + "step": 15240 + }, + { + "epoch": 2.73, + "grad_norm": 0.4888734519481659, + "learning_rate": 2.143251392619845e-05, + "loss": 1.979, + "step": 15245 + }, + { + "epoch": 2.73, + "grad_norm": 1.2355575561523438, + "learning_rate": 2.14185982639201e-05, + "loss": 1.6015, + "step": 15250 + }, + { + "epoch": 2.73, + "grad_norm": 2.380397319793701, + "learning_rate": 2.140468373443568e-05, + "loss": 1.8096, + "step": 15255 + }, + { + "epoch": 2.73, + "grad_norm": 5.283520698547363, + "learning_rate": 2.1390770342146338e-05, + "loss": 1.8967, + "step": 15260 + }, + { + "epoch": 2.73, + "grad_norm": 1.0056599378585815, + "learning_rate": 2.137685809145286e-05, + "loss": 1.6557, + "step": 15265 + }, + { + "epoch": 2.73, + "grad_norm": 0.4523605704307556, + "learning_rate": 2.1362946986755678e-05, + "loss": 1.7145, + "step": 15270 + }, + { + "epoch": 2.73, + "grad_norm": 0.9183207154273987, + "learning_rate": 2.134903703245487e-05, + "loss": 2.0983, + "step": 15275 + }, + { + "epoch": 2.74, + "grad_norm": 0.6543048620223999, + "learning_rate": 2.1335128232950135e-05, + "loss": 2.0348, + "step": 15280 + }, + { + "epoch": 2.74, + "grad_norm": 0.8366292119026184, + "learning_rate": 2.132122059264082e-05, + "loss": 1.6401, + "step": 15285 + }, + { + "epoch": 2.74, + "grad_norm": 0.6985805034637451, + "learning_rate": 2.1307314115925888e-05, + "loss": 1.749, + "step": 15290 + }, + { + "epoch": 2.74, + "grad_norm": 0.4857269823551178, + "learning_rate": 2.1293408807203947e-05, + "loss": 1.916, + "step": 15295 + }, + { + "epoch": 2.74, + "grad_norm": 0.9709117412567139, + "learning_rate": 2.1279504670873245e-05, + "loss": 2.0241, + "step": 15300 + }, + { + "epoch": 2.74, + "grad_norm": 1.052292823791504, + "learning_rate": 2.1265601711331636e-05, + "loss": 1.5705, + "step": 15305 + }, + { + "epoch": 2.74, + "grad_norm": 0.7669774889945984, + "learning_rate": 2.1251699932976606e-05, + "loss": 1.7281, + "step": 15310 + }, + { + "epoch": 2.74, + "grad_norm": 1.21905517578125, + "learning_rate": 2.1237799340205283e-05, + "loss": 2.013, + "step": 15315 + }, + { + "epoch": 2.74, + "grad_norm": 0.9224725365638733, + "learning_rate": 2.1223899937414415e-05, + "loss": 1.8188, + "step": 15320 + }, + { + "epoch": 2.74, + "grad_norm": 0.9945480823516846, + "learning_rate": 2.1210001729000357e-05, + "loss": 1.7489, + "step": 15325 + }, + { + "epoch": 2.74, + "grad_norm": 1.0888783931732178, + "learning_rate": 2.1196104719359115e-05, + "loss": 1.8444, + "step": 15330 + }, + { + "epoch": 2.74, + "grad_norm": 0.7765944004058838, + "learning_rate": 2.1182208912886283e-05, + "loss": 1.9981, + "step": 15335 + }, + { + "epoch": 2.75, + "grad_norm": 0.3819294571876526, + "learning_rate": 2.11683143139771e-05, + "loss": 1.8742, + "step": 15340 + }, + { + "epoch": 2.75, + "grad_norm": 1.1613242626190186, + "learning_rate": 2.1154420927026424e-05, + "loss": 1.526, + "step": 15345 + }, + { + "epoch": 2.75, + "grad_norm": 0.8339876532554626, + "learning_rate": 2.114052875642871e-05, + "loss": 2.2053, + "step": 15350 + }, + { + "epoch": 2.75, + "grad_norm": 0.7060007452964783, + "learning_rate": 2.1126637806578027e-05, + "loss": 1.9372, + "step": 15355 + }, + { + "epoch": 2.75, + "grad_norm": 1.6718441247940063, + "learning_rate": 2.111274808186809e-05, + "loss": 1.7533, + "step": 15360 + }, + { + "epoch": 2.75, + "grad_norm": 0.6338340044021606, + "learning_rate": 2.1098859586692184e-05, + "loss": 1.8297, + "step": 15365 + }, + { + "epoch": 2.75, + "grad_norm": 0.91750568151474, + "learning_rate": 2.108497232544324e-05, + "loss": 1.8179, + "step": 15370 + }, + { + "epoch": 2.75, + "grad_norm": 0.8784101009368896, + "learning_rate": 2.1071086302513793e-05, + "loss": 1.7564, + "step": 15375 + }, + { + "epoch": 2.75, + "grad_norm": 0.9052116870880127, + "learning_rate": 2.105720152229596e-05, + "loss": 1.8512, + "step": 15380 + }, + { + "epoch": 2.75, + "grad_norm": 1.1224867105484009, + "learning_rate": 2.1043317989181497e-05, + "loss": 1.7728, + "step": 15385 + }, + { + "epoch": 2.75, + "grad_norm": 0.5035684704780579, + "learning_rate": 2.102943570756175e-05, + "loss": 1.9403, + "step": 15390 + }, + { + "epoch": 2.76, + "grad_norm": 1.207096815109253, + "learning_rate": 2.1015554681827665e-05, + "loss": 1.7796, + "step": 15395 + }, + { + "epoch": 2.76, + "grad_norm": 0.99371737241745, + "learning_rate": 2.1001674916369812e-05, + "loss": 2.0566, + "step": 15400 + }, + { + "epoch": 2.76, + "grad_norm": 1.190355658531189, + "learning_rate": 2.0987796415578333e-05, + "loss": 1.8686, + "step": 15405 + }, + { + "epoch": 2.76, + "grad_norm": 0.4896886944770813, + "learning_rate": 2.0973919183842983e-05, + "loss": 1.9051, + "step": 15410 + }, + { + "epoch": 2.76, + "grad_norm": 1.8628058433532715, + "learning_rate": 2.096004322555312e-05, + "loss": 1.5376, + "step": 15415 + }, + { + "epoch": 2.76, + "grad_norm": 2.157165765762329, + "learning_rate": 2.0946168545097708e-05, + "loss": 1.5625, + "step": 15420 + }, + { + "epoch": 2.76, + "grad_norm": 0.9407357573509216, + "learning_rate": 2.0932295146865277e-05, + "loss": 1.674, + "step": 15425 + }, + { + "epoch": 2.76, + "grad_norm": 0.7408738136291504, + "learning_rate": 2.0918423035243983e-05, + "loss": 1.8977, + "step": 15430 + }, + { + "epoch": 2.76, + "grad_norm": 0.6515112519264221, + "learning_rate": 2.090455221462156e-05, + "loss": 1.8421, + "step": 15435 + }, + { + "epoch": 2.76, + "grad_norm": 0.649272620677948, + "learning_rate": 2.0890682689385324e-05, + "loss": 1.9893, + "step": 15440 + }, + { + "epoch": 2.76, + "grad_norm": 0.8793091177940369, + "learning_rate": 2.0876814463922213e-05, + "loss": 1.5602, + "step": 15445 + }, + { + "epoch": 2.77, + "grad_norm": 0.6581525206565857, + "learning_rate": 2.086294754261872e-05, + "loss": 1.7371, + "step": 15450 + }, + { + "epoch": 2.77, + "grad_norm": 0.5119839906692505, + "learning_rate": 2.084908192986093e-05, + "loss": 2.0622, + "step": 15455 + }, + { + "epoch": 2.77, + "grad_norm": 0.5535023808479309, + "learning_rate": 2.083521763003453e-05, + "loss": 2.2007, + "step": 15460 + }, + { + "epoch": 2.77, + "grad_norm": 1.1636162996292114, + "learning_rate": 2.08213546475248e-05, + "loss": 1.9082, + "step": 15465 + }, + { + "epoch": 2.77, + "grad_norm": 0.48526015877723694, + "learning_rate": 2.0807492986716566e-05, + "loss": 2.0381, + "step": 15470 + }, + { + "epoch": 2.77, + "grad_norm": 1.322176456451416, + "learning_rate": 2.0793632651994268e-05, + "loss": 1.7674, + "step": 15475 + }, + { + "epoch": 2.77, + "grad_norm": 0.6021888256072998, + "learning_rate": 2.0779773647741924e-05, + "loss": 1.6768, + "step": 15480 + }, + { + "epoch": 2.77, + "grad_norm": 0.8677191138267517, + "learning_rate": 2.076591597834311e-05, + "loss": 1.6359, + "step": 15485 + }, + { + "epoch": 2.77, + "grad_norm": 0.6849371790885925, + "learning_rate": 2.0752059648181008e-05, + "loss": 1.7706, + "step": 15490 + }, + { + "epoch": 2.77, + "grad_norm": 1.1195003986358643, + "learning_rate": 2.073820466163835e-05, + "loss": 1.7344, + "step": 15495 + }, + { + "epoch": 2.77, + "grad_norm": 0.6078161001205444, + "learning_rate": 2.072435102309745e-05, + "loss": 1.687, + "step": 15500 + }, + { + "epoch": 2.78, + "grad_norm": 0.8624263405799866, + "learning_rate": 2.0710498736940208e-05, + "loss": 1.8988, + "step": 15505 + }, + { + "epoch": 2.78, + "grad_norm": 1.518776535987854, + "learning_rate": 2.06966478075481e-05, + "loss": 1.9482, + "step": 15510 + }, + { + "epoch": 2.78, + "grad_norm": 1.0695035457611084, + "learning_rate": 2.068279823930214e-05, + "loss": 1.7764, + "step": 15515 + }, + { + "epoch": 2.78, + "grad_norm": 0.537037193775177, + "learning_rate": 2.066895003658295e-05, + "loss": 1.8001, + "step": 15520 + }, + { + "epoch": 2.78, + "grad_norm": 0.7043410539627075, + "learning_rate": 2.0655103203770702e-05, + "loss": 1.8046, + "step": 15525 + }, + { + "epoch": 2.78, + "grad_norm": 0.6016290783882141, + "learning_rate": 2.0641257745245124e-05, + "loss": 1.8151, + "step": 15530 + }, + { + "epoch": 2.78, + "grad_norm": 1.0047852993011475, + "learning_rate": 2.0627413665385533e-05, + "loss": 1.9005, + "step": 15535 + }, + { + "epoch": 2.78, + "grad_norm": 0.5631789565086365, + "learning_rate": 2.06135709685708e-05, + "loss": 1.7507, + "step": 15540 + }, + { + "epoch": 2.78, + "grad_norm": 0.7759091258049011, + "learning_rate": 2.059972965917934e-05, + "loss": 1.7911, + "step": 15545 + }, + { + "epoch": 2.78, + "grad_norm": 0.9690524339675903, + "learning_rate": 2.0585889741589155e-05, + "loss": 1.6226, + "step": 15550 + }, + { + "epoch": 2.78, + "grad_norm": 0.9261754751205444, + "learning_rate": 2.0572051220177807e-05, + "loss": 1.722, + "step": 15555 + }, + { + "epoch": 2.79, + "grad_norm": 0.7827115058898926, + "learning_rate": 2.0558214099322393e-05, + "loss": 1.8318, + "step": 15560 + }, + { + "epoch": 2.79, + "grad_norm": 0.7148807644844055, + "learning_rate": 2.0544378383399593e-05, + "loss": 1.6204, + "step": 15565 + }, + { + "epoch": 2.79, + "grad_norm": 0.6687959432601929, + "learning_rate": 2.0530544076785617e-05, + "loss": 1.9251, + "step": 15570 + }, + { + "epoch": 2.79, + "grad_norm": 1.5032340288162231, + "learning_rate": 2.051671118385625e-05, + "loss": 1.9225, + "step": 15575 + }, + { + "epoch": 2.79, + "grad_norm": 1.197785496711731, + "learning_rate": 2.050287970898683e-05, + "loss": 1.9162, + "step": 15580 + }, + { + "epoch": 2.79, + "grad_norm": 1.1399624347686768, + "learning_rate": 2.0489049656552236e-05, + "loss": 1.8481, + "step": 15585 + }, + { + "epoch": 2.79, + "grad_norm": 1.0693291425704956, + "learning_rate": 2.0475221030926888e-05, + "loss": 1.919, + "step": 15590 + }, + { + "epoch": 2.79, + "grad_norm": 0.7608375549316406, + "learning_rate": 2.0461393836484776e-05, + "loss": 1.8845, + "step": 15595 + }, + { + "epoch": 2.79, + "grad_norm": 0.8283123970031738, + "learning_rate": 2.0447568077599432e-05, + "loss": 1.6992, + "step": 15600 + }, + { + "epoch": 2.79, + "grad_norm": 0.8869904279708862, + "learning_rate": 2.043374375864392e-05, + "loss": 1.91, + "step": 15605 + }, + { + "epoch": 2.79, + "grad_norm": 0.9428117871284485, + "learning_rate": 2.0419920883990874e-05, + "loss": 2.0304, + "step": 15610 + }, + { + "epoch": 2.8, + "grad_norm": 1.6648744344711304, + "learning_rate": 2.0406099458012438e-05, + "loss": 1.5347, + "step": 15615 + }, + { + "epoch": 2.8, + "grad_norm": 1.004983901977539, + "learning_rate": 2.0392279485080323e-05, + "loss": 1.6797, + "step": 15620 + }, + { + "epoch": 2.8, + "grad_norm": 0.7050873637199402, + "learning_rate": 2.0378460969565782e-05, + "loss": 1.8073, + "step": 15625 + }, + { + "epoch": 2.8, + "grad_norm": 0.7096034288406372, + "learning_rate": 2.0364643915839587e-05, + "loss": 1.694, + "step": 15630 + }, + { + "epoch": 2.8, + "grad_norm": 0.6012222170829773, + "learning_rate": 2.0350828328272076e-05, + "loss": 1.8169, + "step": 15635 + }, + { + "epoch": 2.8, + "grad_norm": 0.9153398275375366, + "learning_rate": 2.0337014211233077e-05, + "loss": 1.96, + "step": 15640 + }, + { + "epoch": 2.8, + "grad_norm": 1.016750454902649, + "learning_rate": 2.0323201569092003e-05, + "loss": 1.9078, + "step": 15645 + }, + { + "epoch": 2.8, + "grad_norm": 1.7948215007781982, + "learning_rate": 2.0309390406217772e-05, + "loss": 1.9035, + "step": 15650 + }, + { + "epoch": 2.8, + "grad_norm": 0.7154805660247803, + "learning_rate": 2.029558072697885e-05, + "loss": 1.9442, + "step": 15655 + }, + { + "epoch": 2.8, + "grad_norm": 0.8079678416252136, + "learning_rate": 2.028177253574321e-05, + "loss": 1.6577, + "step": 15660 + }, + { + "epoch": 2.8, + "grad_norm": 0.8642460107803345, + "learning_rate": 2.026796583687838e-05, + "loss": 1.6969, + "step": 15665 + }, + { + "epoch": 2.8, + "grad_norm": 0.655441164970398, + "learning_rate": 2.025416063475141e-05, + "loss": 1.854, + "step": 15670 + }, + { + "epoch": 2.81, + "grad_norm": 0.6256705522537231, + "learning_rate": 2.024035693372886e-05, + "loss": 1.9581, + "step": 15675 + }, + { + "epoch": 2.81, + "grad_norm": 0.7426342964172363, + "learning_rate": 2.0226554738176845e-05, + "loss": 1.8966, + "step": 15680 + }, + { + "epoch": 2.81, + "grad_norm": 0.7382630109786987, + "learning_rate": 2.0212754052460963e-05, + "loss": 1.9278, + "step": 15685 + }, + { + "epoch": 2.81, + "grad_norm": 1.1039477586746216, + "learning_rate": 2.0198954880946376e-05, + "loss": 1.8093, + "step": 15690 + }, + { + "epoch": 2.81, + "grad_norm": 0.5927040576934814, + "learning_rate": 2.018515722799774e-05, + "loss": 1.8377, + "step": 15695 + }, + { + "epoch": 2.81, + "grad_norm": 0.4983259439468384, + "learning_rate": 2.017136109797924e-05, + "loss": 1.9169, + "step": 15700 + }, + { + "epoch": 2.81, + "grad_norm": 0.9285030364990234, + "learning_rate": 2.0157566495254583e-05, + "loss": 1.9493, + "step": 15705 + }, + { + "epoch": 2.81, + "grad_norm": 0.9399527311325073, + "learning_rate": 2.014377342418698e-05, + "loss": 1.8094, + "step": 15710 + }, + { + "epoch": 2.81, + "grad_norm": 0.7926543951034546, + "learning_rate": 2.0129981889139175e-05, + "loss": 1.8163, + "step": 15715 + }, + { + "epoch": 2.81, + "grad_norm": 1.1448842287063599, + "learning_rate": 2.0116191894473414e-05, + "loss": 1.8834, + "step": 15720 + }, + { + "epoch": 2.81, + "grad_norm": 1.2125980854034424, + "learning_rate": 2.0102403444551467e-05, + "loss": 1.9686, + "step": 15725 + }, + { + "epoch": 2.82, + "grad_norm": 0.6270211935043335, + "learning_rate": 2.008861654373459e-05, + "loss": 1.6633, + "step": 15730 + }, + { + "epoch": 2.82, + "grad_norm": 0.845684826374054, + "learning_rate": 2.007483119638358e-05, + "loss": 1.5904, + "step": 15735 + }, + { + "epoch": 2.82, + "grad_norm": 0.9219332337379456, + "learning_rate": 2.0061047406858725e-05, + "loss": 1.7978, + "step": 15740 + }, + { + "epoch": 2.82, + "grad_norm": 0.5408844351768494, + "learning_rate": 2.0047265179519825e-05, + "loss": 1.7812, + "step": 15745 + }, + { + "epoch": 2.82, + "grad_norm": 0.7204087972640991, + "learning_rate": 2.0033484518726182e-05, + "loss": 2.0534, + "step": 15750 + }, + { + "epoch": 2.82, + "grad_norm": 0.9531112313270569, + "learning_rate": 2.0019705428836604e-05, + "loss": 1.6982, + "step": 15755 + }, + { + "epoch": 2.82, + "grad_norm": 0.7499538064002991, + "learning_rate": 2.0005927914209417e-05, + "loss": 1.8288, + "step": 15760 + }, + { + "epoch": 2.82, + "grad_norm": 1.6334372758865356, + "learning_rate": 1.9992151979202424e-05, + "loss": 1.6551, + "step": 15765 + }, + { + "epoch": 2.82, + "grad_norm": 0.9548289775848389, + "learning_rate": 1.9978377628172938e-05, + "loss": 2.0638, + "step": 15770 + }, + { + "epoch": 2.82, + "grad_norm": 0.6916335225105286, + "learning_rate": 1.996460486547779e-05, + "loss": 2.0167, + "step": 15775 + }, + { + "epoch": 2.82, + "grad_norm": 0.789394199848175, + "learning_rate": 1.9950833695473277e-05, + "loss": 1.9085, + "step": 15780 + }, + { + "epoch": 2.83, + "grad_norm": 0.42912745475769043, + "learning_rate": 1.9937064122515202e-05, + "loss": 1.8567, + "step": 15785 + }, + { + "epoch": 2.83, + "grad_norm": 1.1623276472091675, + "learning_rate": 1.992329615095888e-05, + "loss": 1.8448, + "step": 15790 + }, + { + "epoch": 2.83, + "grad_norm": 0.7881998419761658, + "learning_rate": 1.9909529785159097e-05, + "loss": 1.9888, + "step": 15795 + }, + { + "epoch": 2.83, + "grad_norm": 0.5520839691162109, + "learning_rate": 1.9895765029470148e-05, + "loss": 2.0507, + "step": 15800 + }, + { + "epoch": 2.83, + "grad_norm": 1.4102293252944946, + "learning_rate": 1.9882001888245816e-05, + "loss": 1.9575, + "step": 15805 + }, + { + "epoch": 2.83, + "grad_norm": 0.5933654308319092, + "learning_rate": 1.9868240365839358e-05, + "loss": 2.1449, + "step": 15810 + }, + { + "epoch": 2.83, + "grad_norm": 0.37906643748283386, + "learning_rate": 1.9854480466603542e-05, + "loss": 1.9846, + "step": 15815 + }, + { + "epoch": 2.83, + "grad_norm": 0.675449550151825, + "learning_rate": 1.9840722194890604e-05, + "loss": 2.0511, + "step": 15820 + }, + { + "epoch": 2.83, + "grad_norm": 0.8554467558860779, + "learning_rate": 1.9826965555052274e-05, + "loss": 1.7434, + "step": 15825 + }, + { + "epoch": 2.83, + "grad_norm": 0.68226557970047, + "learning_rate": 1.9813210551439756e-05, + "loss": 1.9633, + "step": 15830 + }, + { + "epoch": 2.83, + "grad_norm": 0.6609245538711548, + "learning_rate": 1.9799457188403758e-05, + "loss": 2.0091, + "step": 15835 + }, + { + "epoch": 2.84, + "grad_norm": 0.8956989049911499, + "learning_rate": 1.9785705470294447e-05, + "loss": 1.775, + "step": 15840 + }, + { + "epoch": 2.84, + "grad_norm": 0.621740996837616, + "learning_rate": 1.9771955401461477e-05, + "loss": 1.8911, + "step": 15845 + }, + { + "epoch": 2.84, + "grad_norm": 1.393864393234253, + "learning_rate": 1.975820698625399e-05, + "loss": 1.5529, + "step": 15850 + }, + { + "epoch": 2.84, + "grad_norm": 0.920356273651123, + "learning_rate": 1.9744460229020583e-05, + "loss": 1.9129, + "step": 15855 + }, + { + "epoch": 2.84, + "grad_norm": 1.2865760326385498, + "learning_rate": 1.973071513410936e-05, + "loss": 2.1003, + "step": 15860 + }, + { + "epoch": 2.84, + "grad_norm": 0.5856181383132935, + "learning_rate": 1.971697170586787e-05, + "loss": 1.755, + "step": 15865 + }, + { + "epoch": 2.84, + "grad_norm": 0.4005410969257355, + "learning_rate": 1.970322994864315e-05, + "loss": 1.992, + "step": 15870 + }, + { + "epoch": 2.84, + "grad_norm": 0.9305061101913452, + "learning_rate": 1.9689489866781695e-05, + "loss": 1.9721, + "step": 15875 + }, + { + "epoch": 2.84, + "grad_norm": 0.9853824377059937, + "learning_rate": 1.9675751464629493e-05, + "loss": 2.1053, + "step": 15880 + }, + { + "epoch": 2.84, + "grad_norm": 0.6760913133621216, + "learning_rate": 1.9662014746531977e-05, + "loss": 1.8466, + "step": 15885 + }, + { + "epoch": 2.84, + "grad_norm": 0.451269268989563, + "learning_rate": 1.9648279716834058e-05, + "loss": 2.0028, + "step": 15890 + }, + { + "epoch": 2.85, + "grad_norm": 0.890579879283905, + "learning_rate": 1.9634546379880127e-05, + "loss": 1.7298, + "step": 15895 + }, + { + "epoch": 2.85, + "grad_norm": 1.1749356985092163, + "learning_rate": 1.962081474001401e-05, + "loss": 2.0342, + "step": 15900 + }, + { + "epoch": 2.85, + "grad_norm": 0.923694908618927, + "learning_rate": 1.9607084801579018e-05, + "loss": 2.0843, + "step": 15905 + }, + { + "epoch": 2.85, + "grad_norm": 0.7525226473808289, + "learning_rate": 1.9593356568917913e-05, + "loss": 2.129, + "step": 15910 + }, + { + "epoch": 2.85, + "grad_norm": 0.7968067526817322, + "learning_rate": 1.9579630046372936e-05, + "loss": 2.1804, + "step": 15915 + }, + { + "epoch": 2.85, + "grad_norm": 0.6558468341827393, + "learning_rate": 1.956590523828575e-05, + "loss": 1.6117, + "step": 15920 + }, + { + "epoch": 2.85, + "grad_norm": 0.8354988098144531, + "learning_rate": 1.9552182148997513e-05, + "loss": 1.7763, + "step": 15925 + }, + { + "epoch": 2.85, + "grad_norm": 0.8877277374267578, + "learning_rate": 1.9538460782848822e-05, + "loss": 1.9353, + "step": 15930 + }, + { + "epoch": 2.85, + "grad_norm": 0.990190327167511, + "learning_rate": 1.9524741144179727e-05, + "loss": 1.6819, + "step": 15935 + }, + { + "epoch": 2.85, + "grad_norm": 1.0683060884475708, + "learning_rate": 1.951102323732975e-05, + "loss": 1.7892, + "step": 15940 + }, + { + "epoch": 2.85, + "grad_norm": 0.5219915509223938, + "learning_rate": 1.9497307066637837e-05, + "loss": 2.1233, + "step": 15945 + }, + { + "epoch": 2.85, + "grad_norm": 0.791999101638794, + "learning_rate": 1.948359263644241e-05, + "loss": 1.8079, + "step": 15950 + }, + { + "epoch": 2.86, + "grad_norm": 1.5373420715332031, + "learning_rate": 1.9469879951081323e-05, + "loss": 1.8237, + "step": 15955 + }, + { + "epoch": 2.86, + "grad_norm": 0.6582045555114746, + "learning_rate": 1.94561690148919e-05, + "loss": 1.8346, + "step": 15960 + }, + { + "epoch": 2.86, + "grad_norm": 1.0421522855758667, + "learning_rate": 1.944245983221087e-05, + "loss": 2.09, + "step": 15965 + }, + { + "epoch": 2.86, + "grad_norm": 0.5510540008544922, + "learning_rate": 1.942875240737446e-05, + "loss": 2.0754, + "step": 15970 + }, + { + "epoch": 2.86, + "grad_norm": 0.7140408158302307, + "learning_rate": 1.9415046744718295e-05, + "loss": 2.1591, + "step": 15975 + }, + { + "epoch": 2.86, + "grad_norm": 0.533048689365387, + "learning_rate": 1.9401342848577476e-05, + "loss": 2.0399, + "step": 15980 + }, + { + "epoch": 2.86, + "grad_norm": 1.1084396839141846, + "learning_rate": 1.9387640723286533e-05, + "loss": 1.5784, + "step": 15985 + }, + { + "epoch": 2.86, + "grad_norm": 0.6885467767715454, + "learning_rate": 1.9373940373179424e-05, + "loss": 1.6761, + "step": 15990 + }, + { + "epoch": 2.86, + "grad_norm": 0.7324159145355225, + "learning_rate": 1.936024180258957e-05, + "loss": 1.9015, + "step": 15995 + }, + { + "epoch": 2.86, + "grad_norm": 1.0003372430801392, + "learning_rate": 1.93465450158498e-05, + "loss": 1.5935, + "step": 16000 + }, + { + "epoch": 2.86, + "grad_norm": 0.4618781805038452, + "learning_rate": 1.933285001729242e-05, + "loss": 2.0161, + "step": 16005 + }, + { + "epoch": 2.87, + "grad_norm": 1.0251598358154297, + "learning_rate": 1.931915681124911e-05, + "loss": 1.7638, + "step": 16010 + }, + { + "epoch": 2.87, + "grad_norm": 1.092031478881836, + "learning_rate": 1.9305465402051047e-05, + "loss": 1.7572, + "step": 16015 + }, + { + "epoch": 2.87, + "grad_norm": 0.4748564064502716, + "learning_rate": 1.92917757940288e-05, + "loss": 1.8665, + "step": 16020 + }, + { + "epoch": 2.87, + "grad_norm": 1.7850896120071411, + "learning_rate": 1.927808799151237e-05, + "loss": 1.5633, + "step": 16025 + }, + { + "epoch": 2.87, + "grad_norm": 0.7904292941093445, + "learning_rate": 1.9264401998831213e-05, + "loss": 1.7375, + "step": 16030 + }, + { + "epoch": 2.87, + "grad_norm": 0.9015944600105286, + "learning_rate": 1.9250717820314182e-05, + "loss": 1.5826, + "step": 16035 + }, + { + "epoch": 2.87, + "grad_norm": 0.7508746981620789, + "learning_rate": 1.923703546028958e-05, + "loss": 2.1586, + "step": 16040 + }, + { + "epoch": 2.87, + "grad_norm": 4.131949424743652, + "learning_rate": 1.922335492308511e-05, + "loss": 1.7214, + "step": 16045 + }, + { + "epoch": 2.87, + "grad_norm": 1.2245509624481201, + "learning_rate": 1.9209676213027923e-05, + "loss": 1.7981, + "step": 16050 + }, + { + "epoch": 2.87, + "grad_norm": 0.4659061133861542, + "learning_rate": 1.919599933444459e-05, + "loss": 1.8612, + "step": 16055 + }, + { + "epoch": 2.87, + "grad_norm": 0.6791583895683289, + "learning_rate": 1.9182324291661084e-05, + "loss": 1.8788, + "step": 16060 + }, + { + "epoch": 2.88, + "grad_norm": 1.2033220529556274, + "learning_rate": 1.91686510890028e-05, + "loss": 1.7272, + "step": 16065 + }, + { + "epoch": 2.88, + "grad_norm": 0.5756292343139648, + "learning_rate": 1.915497973079457e-05, + "loss": 1.9688, + "step": 16070 + }, + { + "epoch": 2.88, + "grad_norm": 0.4528789222240448, + "learning_rate": 1.9141310221360632e-05, + "loss": 1.9315, + "step": 16075 + }, + { + "epoch": 2.88, + "grad_norm": 0.4836488664150238, + "learning_rate": 1.9127642565024628e-05, + "loss": 1.7283, + "step": 16080 + }, + { + "epoch": 2.88, + "grad_norm": 1.785624623298645, + "learning_rate": 1.9113976766109643e-05, + "loss": 1.7548, + "step": 16085 + }, + { + "epoch": 2.88, + "grad_norm": 0.9146924018859863, + "learning_rate": 1.9100312828938135e-05, + "loss": 1.8917, + "step": 16090 + }, + { + "epoch": 2.88, + "grad_norm": 0.9764569401741028, + "learning_rate": 1.9086650757832007e-05, + "loss": 1.6859, + "step": 16095 + }, + { + "epoch": 2.88, + "grad_norm": 0.8843137621879578, + "learning_rate": 1.9072990557112564e-05, + "loss": 1.8221, + "step": 16100 + }, + { + "epoch": 2.88, + "grad_norm": 0.6954315900802612, + "learning_rate": 1.905933223110051e-05, + "loss": 1.8598, + "step": 16105 + }, + { + "epoch": 2.88, + "grad_norm": 0.5043737888336182, + "learning_rate": 1.904567578411594e-05, + "loss": 2.0398, + "step": 16110 + }, + { + "epoch": 2.88, + "grad_norm": 0.8501962423324585, + "learning_rate": 1.9032021220478405e-05, + "loss": 1.8814, + "step": 16115 + }, + { + "epoch": 2.89, + "grad_norm": 0.6526318192481995, + "learning_rate": 1.901836854450681e-05, + "loss": 1.9833, + "step": 16120 + }, + { + "epoch": 2.89, + "grad_norm": 0.7955963611602783, + "learning_rate": 1.900471776051949e-05, + "loss": 1.8694, + "step": 16125 + }, + { + "epoch": 2.89, + "grad_norm": 1.724168062210083, + "learning_rate": 1.8991068872834185e-05, + "loss": 2.09, + "step": 16130 + }, + { + "epoch": 2.89, + "grad_norm": 0.7435161471366882, + "learning_rate": 1.8977421885768004e-05, + "loss": 1.896, + "step": 16135 + }, + { + "epoch": 2.89, + "grad_norm": 0.8156474828720093, + "learning_rate": 1.896377680363749e-05, + "loss": 2.0581, + "step": 16140 + }, + { + "epoch": 2.89, + "grad_norm": 0.7331948280334473, + "learning_rate": 1.8950133630758582e-05, + "loss": 1.7547, + "step": 16145 + }, + { + "epoch": 2.89, + "grad_norm": 0.7191774845123291, + "learning_rate": 1.893649237144658e-05, + "loss": 1.6202, + "step": 16150 + }, + { + "epoch": 2.89, + "grad_norm": 1.1362369060516357, + "learning_rate": 1.89228530300162e-05, + "loss": 1.8106, + "step": 16155 + }, + { + "epoch": 2.89, + "grad_norm": 0.5437344908714294, + "learning_rate": 1.8909215610781566e-05, + "loss": 1.9975, + "step": 16160 + }, + { + "epoch": 2.89, + "grad_norm": 0.9459233283996582, + "learning_rate": 1.8895580118056174e-05, + "loss": 1.7396, + "step": 16165 + }, + { + "epoch": 2.89, + "grad_norm": 0.5802138447761536, + "learning_rate": 1.8881946556152918e-05, + "loss": 1.8485, + "step": 16170 + }, + { + "epoch": 2.9, + "grad_norm": 0.6844445466995239, + "learning_rate": 1.8868314929384084e-05, + "loss": 1.5007, + "step": 16175 + }, + { + "epoch": 2.9, + "grad_norm": 0.7569360136985779, + "learning_rate": 1.885468524206134e-05, + "loss": 1.5597, + "step": 16180 + }, + { + "epoch": 2.9, + "grad_norm": 0.7909181714057922, + "learning_rate": 1.8841057498495736e-05, + "loss": 1.8607, + "step": 16185 + }, + { + "epoch": 2.9, + "grad_norm": 0.9658223986625671, + "learning_rate": 1.8827431702997732e-05, + "loss": 1.7466, + "step": 16190 + }, + { + "epoch": 2.9, + "grad_norm": 0.6813267469406128, + "learning_rate": 1.8813807859877147e-05, + "loss": 1.925, + "step": 16195 + }, + { + "epoch": 2.9, + "grad_norm": 0.9319490790367126, + "learning_rate": 1.8800185973443174e-05, + "loss": 1.9117, + "step": 16200 + }, + { + "epoch": 2.9, + "grad_norm": 0.8836780190467834, + "learning_rate": 1.878656604800442e-05, + "loss": 2.0741, + "step": 16205 + }, + { + "epoch": 2.9, + "grad_norm": 0.99202960729599, + "learning_rate": 1.8772948087868846e-05, + "loss": 1.7016, + "step": 16210 + }, + { + "epoch": 2.9, + "grad_norm": 0.7995858788490295, + "learning_rate": 1.8759332097343797e-05, + "loss": 2.0791, + "step": 16215 + }, + { + "epoch": 2.9, + "grad_norm": 1.3566383123397827, + "learning_rate": 1.874571808073601e-05, + "loss": 1.5975, + "step": 16220 + }, + { + "epoch": 2.9, + "grad_norm": 1.005405306816101, + "learning_rate": 1.8732106042351572e-05, + "loss": 1.9958, + "step": 16225 + }, + { + "epoch": 2.91, + "grad_norm": 0.589972198009491, + "learning_rate": 1.8718495986495967e-05, + "loss": 2.1223, + "step": 16230 + }, + { + "epoch": 2.91, + "grad_norm": 1.41853928565979, + "learning_rate": 1.8704887917474042e-05, + "loss": 2.0237, + "step": 16235 + }, + { + "epoch": 2.91, + "grad_norm": 0.5799002051353455, + "learning_rate": 1.8691281839590018e-05, + "loss": 1.8955, + "step": 16240 + }, + { + "epoch": 2.91, + "grad_norm": 0.6206422448158264, + "learning_rate": 1.867767775714747e-05, + "loss": 1.9139, + "step": 16245 + }, + { + "epoch": 2.91, + "grad_norm": 1.6891002655029297, + "learning_rate": 1.8664075674449376e-05, + "loss": 2.0885, + "step": 16250 + }, + { + "epoch": 2.91, + "grad_norm": 0.5611288547515869, + "learning_rate": 1.865047559579804e-05, + "loss": 2.031, + "step": 16255 + }, + { + "epoch": 2.91, + "grad_norm": 1.0590778589248657, + "learning_rate": 1.8636877525495173e-05, + "loss": 1.6809, + "step": 16260 + }, + { + "epoch": 2.91, + "grad_norm": 0.43978947401046753, + "learning_rate": 1.8623281467841823e-05, + "loss": 1.9269, + "step": 16265 + }, + { + "epoch": 2.91, + "grad_norm": 0.8819534182548523, + "learning_rate": 1.8609687427138407e-05, + "loss": 1.8985, + "step": 16270 + }, + { + "epoch": 2.91, + "grad_norm": 1.4660248756408691, + "learning_rate": 1.859609540768471e-05, + "loss": 1.8683, + "step": 16275 + }, + { + "epoch": 2.91, + "grad_norm": 0.7070092558860779, + "learning_rate": 1.8582505413779885e-05, + "loss": 1.7901, + "step": 16280 + }, + { + "epoch": 2.91, + "grad_norm": 0.8060388565063477, + "learning_rate": 1.8568917449722427e-05, + "loss": 1.6429, + "step": 16285 + }, + { + "epoch": 2.92, + "grad_norm": 1.0174509286880493, + "learning_rate": 1.8555331519810188e-05, + "loss": 1.8506, + "step": 16290 + }, + { + "epoch": 2.92, + "grad_norm": 1.0679351091384888, + "learning_rate": 1.8541747628340388e-05, + "loss": 1.7145, + "step": 16295 + }, + { + "epoch": 2.92, + "grad_norm": 0.9246571660041809, + "learning_rate": 1.8528165779609603e-05, + "loss": 1.6584, + "step": 16300 + }, + { + "epoch": 2.92, + "grad_norm": 0.5674405097961426, + "learning_rate": 1.851458597791375e-05, + "loss": 1.5784, + "step": 16305 + }, + { + "epoch": 2.92, + "grad_norm": 0.738040566444397, + "learning_rate": 1.8501008227548123e-05, + "loss": 1.8847, + "step": 16310 + }, + { + "epoch": 2.92, + "grad_norm": 0.651881754398346, + "learning_rate": 1.8487432532807335e-05, + "loss": 1.8891, + "step": 16315 + }, + { + "epoch": 2.92, + "grad_norm": 0.8285610675811768, + "learning_rate": 1.847385889798538e-05, + "loss": 1.7669, + "step": 16320 + }, + { + "epoch": 2.92, + "grad_norm": 0.5357155203819275, + "learning_rate": 1.8460287327375576e-05, + "loss": 1.9032, + "step": 16325 + }, + { + "epoch": 2.92, + "grad_norm": 0.7268362045288086, + "learning_rate": 1.8446717825270595e-05, + "loss": 1.841, + "step": 16330 + }, + { + "epoch": 2.92, + "grad_norm": 0.7002309560775757, + "learning_rate": 1.8433150395962473e-05, + "loss": 2.0385, + "step": 16335 + }, + { + "epoch": 2.92, + "grad_norm": 1.3066308498382568, + "learning_rate": 1.841958504374256e-05, + "loss": 1.8508, + "step": 16340 + }, + { + "epoch": 2.93, + "grad_norm": 0.6260136961936951, + "learning_rate": 1.8406021772901566e-05, + "loss": 2.0906, + "step": 16345 + }, + { + "epoch": 2.93, + "grad_norm": 0.523491382598877, + "learning_rate": 1.839246058772954e-05, + "loss": 1.9063, + "step": 16350 + }, + { + "epoch": 2.93, + "grad_norm": 1.4432581663131714, + "learning_rate": 1.837890149251588e-05, + "loss": 2.143, + "step": 16355 + }, + { + "epoch": 2.93, + "grad_norm": 1.09247887134552, + "learning_rate": 1.83653444915493e-05, + "loss": 1.9382, + "step": 16360 + }, + { + "epoch": 2.93, + "grad_norm": 1.2057552337646484, + "learning_rate": 1.835178958911789e-05, + "loss": 1.8019, + "step": 16365 + }, + { + "epoch": 2.93, + "grad_norm": 0.8609539866447449, + "learning_rate": 1.8338236789509024e-05, + "loss": 1.9375, + "step": 16370 + }, + { + "epoch": 2.93, + "grad_norm": 1.2171326875686646, + "learning_rate": 1.832468609700946e-05, + "loss": 1.9367, + "step": 16375 + }, + { + "epoch": 2.93, + "grad_norm": 1.5746159553527832, + "learning_rate": 1.831113751590527e-05, + "loss": 1.6848, + "step": 16380 + }, + { + "epoch": 2.93, + "grad_norm": 0.9621313810348511, + "learning_rate": 1.8297591050481843e-05, + "loss": 1.7429, + "step": 16385 + }, + { + "epoch": 2.93, + "grad_norm": 0.3991534113883972, + "learning_rate": 1.8284046705023915e-05, + "loss": 1.6909, + "step": 16390 + }, + { + "epoch": 2.93, + "grad_norm": 0.6495162844657898, + "learning_rate": 1.8270504483815558e-05, + "loss": 1.974, + "step": 16395 + }, + { + "epoch": 2.94, + "grad_norm": 0.6558966040611267, + "learning_rate": 1.8256964391140164e-05, + "loss": 1.9515, + "step": 16400 + }, + { + "epoch": 2.94, + "grad_norm": 0.7836574912071228, + "learning_rate": 1.824342643128044e-05, + "loss": 1.9209, + "step": 16405 + }, + { + "epoch": 2.94, + "grad_norm": 0.685867190361023, + "learning_rate": 1.822989060851844e-05, + "loss": 1.6289, + "step": 16410 + }, + { + "epoch": 2.94, + "grad_norm": 2.84834361076355, + "learning_rate": 1.8216356927135525e-05, + "loss": 1.7064, + "step": 16415 + }, + { + "epoch": 2.94, + "grad_norm": 0.5934162139892578, + "learning_rate": 1.8202825391412393e-05, + "loss": 1.7927, + "step": 16420 + }, + { + "epoch": 2.94, + "grad_norm": 0.5504593253135681, + "learning_rate": 1.8192001710585122e-05, + "loss": 2.1149, + "step": 16425 + }, + { + "epoch": 2.94, + "grad_norm": 0.6468228697776794, + "learning_rate": 1.817847404783477e-05, + "loss": 2.1836, + "step": 16430 + }, + { + "epoch": 2.94, + "grad_norm": 0.6115560531616211, + "learning_rate": 1.8164948542726506e-05, + "loss": 1.7916, + "step": 16435 + }, + { + "epoch": 2.94, + "grad_norm": 0.9618609547615051, + "learning_rate": 1.8151425199538447e-05, + "loss": 1.7555, + "step": 16440 + }, + { + "epoch": 2.94, + "grad_norm": 0.8971806168556213, + "learning_rate": 1.813790402254801e-05, + "loss": 1.837, + "step": 16445 + }, + { + "epoch": 2.94, + "grad_norm": 0.7342554926872253, + "learning_rate": 1.812438501603193e-05, + "loss": 1.9009, + "step": 16450 + }, + { + "epoch": 2.95, + "grad_norm": 0.743880033493042, + "learning_rate": 1.811086818426625e-05, + "loss": 1.7526, + "step": 16455 + }, + { + "epoch": 2.95, + "grad_norm": 1.2517614364624023, + "learning_rate": 1.8097353531526346e-05, + "loss": 1.9244, + "step": 16460 + }, + { + "epoch": 2.95, + "grad_norm": 0.933478057384491, + "learning_rate": 1.8083841062086875e-05, + "loss": 1.7212, + "step": 16465 + }, + { + "epoch": 2.95, + "grad_norm": 0.49430811405181885, + "learning_rate": 1.807033078022183e-05, + "loss": 1.997, + "step": 16470 + }, + { + "epoch": 2.95, + "grad_norm": 1.0263806581497192, + "learning_rate": 1.8056822690204495e-05, + "loss": 1.9313, + "step": 16475 + }, + { + "epoch": 2.95, + "grad_norm": 0.8995507955551147, + "learning_rate": 1.804331679630746e-05, + "loss": 1.7927, + "step": 16480 + }, + { + "epoch": 2.95, + "grad_norm": 1.0173105001449585, + "learning_rate": 1.8029813102802624e-05, + "loss": 1.6534, + "step": 16485 + }, + { + "epoch": 2.95, + "grad_norm": 1.0232231616973877, + "learning_rate": 1.8016311613961206e-05, + "loss": 1.9429, + "step": 16490 + }, + { + "epoch": 2.95, + "grad_norm": 0.8685727715492249, + "learning_rate": 1.8002812334053703e-05, + "loss": 1.7248, + "step": 16495 + }, + { + "epoch": 2.95, + "grad_norm": 0.9551222324371338, + "learning_rate": 1.7989315267349936e-05, + "loss": 1.978, + "step": 16500 + }, + { + "epoch": 2.95, + "grad_norm": 0.7746654748916626, + "learning_rate": 1.7975820418119e-05, + "loss": 2.0558, + "step": 16505 + }, + { + "epoch": 2.96, + "grad_norm": 1.1619644165039062, + "learning_rate": 1.7962327790629308e-05, + "loss": 1.8762, + "step": 16510 + }, + { + "epoch": 2.96, + "grad_norm": 0.7726320028305054, + "learning_rate": 1.794883738914857e-05, + "loss": 1.74, + "step": 16515 + }, + { + "epoch": 2.96, + "grad_norm": 0.9551023244857788, + "learning_rate": 1.793534921794379e-05, + "loss": 1.9455, + "step": 16520 + }, + { + "epoch": 2.96, + "grad_norm": 1.1138999462127686, + "learning_rate": 1.792186328128125e-05, + "loss": 1.5777, + "step": 16525 + }, + { + "epoch": 2.96, + "grad_norm": 0.5583162307739258, + "learning_rate": 1.7908379583426542e-05, + "loss": 1.774, + "step": 16530 + }, + { + "epoch": 2.96, + "grad_norm": 0.6044538617134094, + "learning_rate": 1.7894898128644557e-05, + "loss": 1.7541, + "step": 16535 + }, + { + "epoch": 2.96, + "grad_norm": 0.9106922149658203, + "learning_rate": 1.7881418921199454e-05, + "loss": 1.956, + "step": 16540 + }, + { + "epoch": 2.96, + "grad_norm": 0.6646421551704407, + "learning_rate": 1.7867941965354706e-05, + "loss": 1.8526, + "step": 16545 + }, + { + "epoch": 2.96, + "grad_norm": 0.8308748006820679, + "learning_rate": 1.7854467265373047e-05, + "loss": 1.6069, + "step": 16550 + }, + { + "epoch": 2.96, + "grad_norm": 0.7582127451896667, + "learning_rate": 1.7840994825516523e-05, + "loss": 1.835, + "step": 16555 + }, + { + "epoch": 2.96, + "grad_norm": 0.9956608414649963, + "learning_rate": 1.7827524650046447e-05, + "loss": 1.7829, + "step": 16560 + }, + { + "epoch": 2.97, + "grad_norm": 1.3976410627365112, + "learning_rate": 1.7814056743223424e-05, + "loss": 1.8531, + "step": 16565 + }, + { + "epoch": 2.97, + "grad_norm": 0.6910321712493896, + "learning_rate": 1.780059110930735e-05, + "loss": 1.8633, + "step": 16570 + }, + { + "epoch": 2.97, + "grad_norm": 1.205909013748169, + "learning_rate": 1.7787127752557378e-05, + "loss": 1.9529, + "step": 16575 + }, + { + "epoch": 2.97, + "grad_norm": 1.323149561882019, + "learning_rate": 1.777366667723196e-05, + "loss": 1.7019, + "step": 16580 + }, + { + "epoch": 2.97, + "grad_norm": 0.8329334855079651, + "learning_rate": 1.776020788758882e-05, + "loss": 1.757, + "step": 16585 + }, + { + "epoch": 2.97, + "grad_norm": 2.090344190597534, + "learning_rate": 1.7746751387884962e-05, + "loss": 1.7325, + "step": 16590 + }, + { + "epoch": 2.97, + "grad_norm": 1.0283842086791992, + "learning_rate": 1.7733297182376663e-05, + "loss": 1.8203, + "step": 16595 + }, + { + "epoch": 2.97, + "grad_norm": 2.198268175125122, + "learning_rate": 1.7719845275319473e-05, + "loss": 1.5753, + "step": 16600 + }, + { + "epoch": 2.97, + "grad_norm": 0.6753280758857727, + "learning_rate": 1.7706395670968225e-05, + "loss": 1.7627, + "step": 16605 + }, + { + "epoch": 2.97, + "grad_norm": 0.7107426524162292, + "learning_rate": 1.7692948373577003e-05, + "loss": 1.8784, + "step": 16610 + }, + { + "epoch": 2.97, + "grad_norm": 0.8274734020233154, + "learning_rate": 1.7679503387399193e-05, + "loss": 2.058, + "step": 16615 + }, + { + "epoch": 2.97, + "grad_norm": 0.8373821973800659, + "learning_rate": 1.766606071668741e-05, + "loss": 2.1438, + "step": 16620 + }, + { + "epoch": 2.98, + "grad_norm": 0.6570589542388916, + "learning_rate": 1.765262036569358e-05, + "loss": 2.1135, + "step": 16625 + }, + { + "epoch": 2.98, + "grad_norm": 0.4979616105556488, + "learning_rate": 1.7639182338668847e-05, + "loss": 1.8345, + "step": 16630 + }, + { + "epoch": 2.98, + "grad_norm": 1.0736268758773804, + "learning_rate": 1.7625746639863673e-05, + "loss": 1.9596, + "step": 16635 + }, + { + "epoch": 2.98, + "grad_norm": 0.7851144075393677, + "learning_rate": 1.761231327352773e-05, + "loss": 1.8697, + "step": 16640 + }, + { + "epoch": 2.98, + "grad_norm": 0.7080187201499939, + "learning_rate": 1.7598882243909997e-05, + "loss": 1.5315, + "step": 16645 + }, + { + "epoch": 2.98, + "grad_norm": 0.8879645466804504, + "learning_rate": 1.7585453555258697e-05, + "loss": 2.0436, + "step": 16650 + }, + { + "epoch": 2.98, + "grad_norm": 0.9378961324691772, + "learning_rate": 1.75720272118213e-05, + "loss": 2.0527, + "step": 16655 + }, + { + "epoch": 2.98, + "grad_norm": 0.886584460735321, + "learning_rate": 1.755860321784456e-05, + "loss": 1.8691, + "step": 16660 + }, + { + "epoch": 2.98, + "grad_norm": 0.5075838565826416, + "learning_rate": 1.7545181577574454e-05, + "loss": 1.7712, + "step": 16665 + }, + { + "epoch": 2.98, + "grad_norm": 0.6004984974861145, + "learning_rate": 1.7531762295256254e-05, + "loss": 1.8251, + "step": 16670 + }, + { + "epoch": 2.98, + "grad_norm": 0.7136458158493042, + "learning_rate": 1.7518345375134447e-05, + "loss": 1.7217, + "step": 16675 + }, + { + "epoch": 2.99, + "grad_norm": 0.8066684603691101, + "learning_rate": 1.7504930821452808e-05, + "loss": 2.0242, + "step": 16680 + }, + { + "epoch": 2.99, + "grad_norm": 0.7061532735824585, + "learning_rate": 1.7491518638454336e-05, + "loss": 1.7245, + "step": 16685 + }, + { + "epoch": 2.99, + "grad_norm": 0.7429984211921692, + "learning_rate": 1.7478108830381296e-05, + "loss": 1.6728, + "step": 16690 + }, + { + "epoch": 2.99, + "grad_norm": 0.9607851505279541, + "learning_rate": 1.74647014014752e-05, + "loss": 1.3616, + "step": 16695 + }, + { + "epoch": 2.99, + "grad_norm": 0.46374931931495667, + "learning_rate": 1.74512963559768e-05, + "loss": 1.7103, + "step": 16700 + }, + { + "epoch": 2.99, + "grad_norm": 1.2484031915664673, + "learning_rate": 1.7437893698126102e-05, + "loss": 1.745, + "step": 16705 + }, + { + "epoch": 2.99, + "grad_norm": 1.0168793201446533, + "learning_rate": 1.7424493432162366e-05, + "loss": 1.8125, + "step": 16710 + }, + { + "epoch": 2.99, + "grad_norm": 0.9517542123794556, + "learning_rate": 1.7411095562324062e-05, + "loss": 1.7851, + "step": 16715 + }, + { + "epoch": 2.99, + "grad_norm": 1.2883968353271484, + "learning_rate": 1.7397700092848927e-05, + "loss": 1.9432, + "step": 16720 + }, + { + "epoch": 2.99, + "grad_norm": 0.5470624566078186, + "learning_rate": 1.7384307027973946e-05, + "loss": 1.8344, + "step": 16725 + }, + { + "epoch": 2.99, + "grad_norm": 1.2388825416564941, + "learning_rate": 1.7370916371935322e-05, + "loss": 1.8179, + "step": 16730 + }, + { + "epoch": 3.0, + "grad_norm": 0.9549524784088135, + "learning_rate": 1.7357528128968513e-05, + "loss": 1.6871, + "step": 16735 + }, + { + "epoch": 3.0, + "grad_norm": 0.6752102375030518, + "learning_rate": 1.7344142303308208e-05, + "loss": 1.9497, + "step": 16740 + }, + { + "epoch": 3.0, + "grad_norm": 0.8145148158073425, + "learning_rate": 1.7330758899188322e-05, + "loss": 1.8557, + "step": 16745 + }, + { + "epoch": 3.0, + "grad_norm": 0.9011813402175903, + "learning_rate": 1.7317377920842028e-05, + "loss": 2.0228, + "step": 16750 + }, + { + "epoch": 3.0, + "grad_norm": 0.5068954825401306, + "learning_rate": 1.7303999372501705e-05, + "loss": 1.677, + "step": 16755 + }, + { + "epoch": 3.0, + "grad_norm": 0.4357677102088928, + "learning_rate": 1.729062325839898e-05, + "loss": 2.1066, + "step": 16760 + }, + { + "epoch": 3.0, + "grad_norm": 0.6191133856773376, + "learning_rate": 1.7277249582764697e-05, + "loss": 1.931, + "step": 16765 + }, + { + "epoch": 3.0, + "grad_norm": 0.5191856026649475, + "learning_rate": 1.726387834982895e-05, + "loss": 1.5549, + "step": 16770 + }, + { + "epoch": 3.0, + "grad_norm": 0.7998222708702087, + "learning_rate": 1.7250509563821032e-05, + "loss": 1.5933, + "step": 16775 + }, + { + "epoch": 3.0, + "grad_norm": 1.8126819133758545, + "learning_rate": 1.7237143228969488e-05, + "loss": 2.0985, + "step": 16780 + }, + { + "epoch": 3.0, + "grad_norm": 0.6312077641487122, + "learning_rate": 1.7223779349502073e-05, + "loss": 1.9076, + "step": 16785 + }, + { + "epoch": 3.01, + "grad_norm": 0.9065338969230652, + "learning_rate": 1.721041792964577e-05, + "loss": 1.7754, + "step": 16790 + }, + { + "epoch": 3.01, + "grad_norm": 0.9230442047119141, + "learning_rate": 1.7197058973626785e-05, + "loss": 2.0562, + "step": 16795 + }, + { + "epoch": 3.01, + "grad_norm": 0.8840682506561279, + "learning_rate": 1.7183702485670543e-05, + "loss": 1.9154, + "step": 16800 + }, + { + "epoch": 3.01, + "grad_norm": 0.8318727612495422, + "learning_rate": 1.7170348470001688e-05, + "loss": 1.9867, + "step": 16805 + }, + { + "epoch": 3.01, + "grad_norm": 0.6302641034126282, + "learning_rate": 1.7156996930844073e-05, + "loss": 1.9369, + "step": 16810 + }, + { + "epoch": 3.01, + "grad_norm": 0.49491533637046814, + "learning_rate": 1.7143647872420793e-05, + "loss": 1.569, + "step": 16815 + }, + { + "epoch": 3.01, + "grad_norm": 1.2738268375396729, + "learning_rate": 1.7130301298954126e-05, + "loss": 1.817, + "step": 16820 + }, + { + "epoch": 3.01, + "grad_norm": 0.8552365899085999, + "learning_rate": 1.7116957214665587e-05, + "loss": 1.6973, + "step": 16825 + }, + { + "epoch": 3.01, + "grad_norm": 1.1242488622665405, + "learning_rate": 1.710361562377591e-05, + "loss": 1.721, + "step": 16830 + }, + { + "epoch": 3.01, + "grad_norm": 0.8571417927742004, + "learning_rate": 1.7090276530505004e-05, + "loss": 1.7216, + "step": 16835 + }, + { + "epoch": 3.01, + "grad_norm": 8.810229301452637, + "learning_rate": 1.707693993907203e-05, + "loss": 2.1472, + "step": 16840 + }, + { + "epoch": 3.02, + "grad_norm": 0.7794356346130371, + "learning_rate": 1.706360585369533e-05, + "loss": 1.9728, + "step": 16845 + }, + { + "epoch": 3.02, + "grad_norm": 0.5598608255386353, + "learning_rate": 1.705027427859247e-05, + "loss": 1.7381, + "step": 16850 + }, + { + "epoch": 3.02, + "grad_norm": 0.5534650087356567, + "learning_rate": 1.7036945217980205e-05, + "loss": 1.9653, + "step": 16855 + }, + { + "epoch": 3.02, + "grad_norm": 1.1248143911361694, + "learning_rate": 1.702361867607451e-05, + "loss": 1.8914, + "step": 16860 + }, + { + "epoch": 3.02, + "grad_norm": 5.001594543457031, + "learning_rate": 1.7010294657090548e-05, + "loss": 1.7459, + "step": 16865 + }, + { + "epoch": 3.02, + "grad_norm": 0.5881880521774292, + "learning_rate": 1.6996973165242704e-05, + "loss": 1.5695, + "step": 16870 + }, + { + "epoch": 3.02, + "grad_norm": 0.516708493232727, + "learning_rate": 1.6983654204744552e-05, + "loss": 1.8567, + "step": 16875 + }, + { + "epoch": 3.02, + "grad_norm": 1.0162904262542725, + "learning_rate": 1.6970337779808866e-05, + "loss": 1.7251, + "step": 16880 + }, + { + "epoch": 3.02, + "grad_norm": 0.7917419075965881, + "learning_rate": 1.6957023894647614e-05, + "loss": 1.7346, + "step": 16885 + }, + { + "epoch": 3.02, + "grad_norm": 0.6009075045585632, + "learning_rate": 1.6943712553471966e-05, + "loss": 2.0054, + "step": 16890 + }, + { + "epoch": 3.02, + "grad_norm": 1.0688451528549194, + "learning_rate": 1.6930403760492297e-05, + "loss": 1.7544, + "step": 16895 + }, + { + "epoch": 3.03, + "grad_norm": 0.9357558488845825, + "learning_rate": 1.691709751991815e-05, + "loss": 1.9594, + "step": 16900 + }, + { + "epoch": 3.03, + "grad_norm": 0.6222114562988281, + "learning_rate": 1.6903793835958283e-05, + "loss": 1.9862, + "step": 16905 + }, + { + "epoch": 3.03, + "grad_norm": 1.3084805011749268, + "learning_rate": 1.6890492712820636e-05, + "loss": 1.7035, + "step": 16910 + }, + { + "epoch": 3.03, + "grad_norm": 0.9012706279754639, + "learning_rate": 1.6877194154712342e-05, + "loss": 2.0331, + "step": 16915 + }, + { + "epoch": 3.03, + "grad_norm": 2.868881940841675, + "learning_rate": 1.686389816583973e-05, + "loss": 1.6892, + "step": 16920 + }, + { + "epoch": 3.03, + "grad_norm": 0.7443296909332275, + "learning_rate": 1.6850604750408296e-05, + "loss": 1.6757, + "step": 16925 + }, + { + "epoch": 3.03, + "grad_norm": 1.3445460796356201, + "learning_rate": 1.6837313912622748e-05, + "loss": 1.9912, + "step": 16930 + }, + { + "epoch": 3.03, + "grad_norm": 0.43768608570098877, + "learning_rate": 1.682402565668695e-05, + "loss": 1.843, + "step": 16935 + }, + { + "epoch": 3.03, + "grad_norm": 1.3455616235733032, + "learning_rate": 1.6810739986803987e-05, + "loss": 1.9418, + "step": 16940 + }, + { + "epoch": 3.03, + "grad_norm": 0.627327561378479, + "learning_rate": 1.6797456907176074e-05, + "loss": 1.3489, + "step": 16945 + }, + { + "epoch": 3.03, + "grad_norm": 1.0638902187347412, + "learning_rate": 1.678417642200466e-05, + "loss": 1.8686, + "step": 16950 + }, + { + "epoch": 3.03, + "grad_norm": 0.8032281398773193, + "learning_rate": 1.6770898535490333e-05, + "loss": 1.6365, + "step": 16955 + }, + { + "epoch": 3.04, + "grad_norm": 0.4566420912742615, + "learning_rate": 1.6757623251832887e-05, + "loss": 2.1101, + "step": 16960 + }, + { + "epoch": 3.04, + "grad_norm": 0.840287446975708, + "learning_rate": 1.674435057523128e-05, + "loss": 1.5239, + "step": 16965 + }, + { + "epoch": 3.04, + "grad_norm": 1.571700930595398, + "learning_rate": 1.673108050988364e-05, + "loss": 1.9991, + "step": 16970 + }, + { + "epoch": 3.04, + "grad_norm": 0.6242066621780396, + "learning_rate": 1.6717813059987293e-05, + "loss": 2.0588, + "step": 16975 + }, + { + "epoch": 3.04, + "grad_norm": 1.1033291816711426, + "learning_rate": 1.6704548229738697e-05, + "loss": 1.7984, + "step": 16980 + }, + { + "epoch": 3.04, + "grad_norm": 0.7740524411201477, + "learning_rate": 1.6691286023333517e-05, + "loss": 1.5941, + "step": 16985 + }, + { + "epoch": 3.04, + "grad_norm": 0.677450954914093, + "learning_rate": 1.6678026444966587e-05, + "loss": 1.9171, + "step": 16990 + }, + { + "epoch": 3.04, + "grad_norm": 0.6410496234893799, + "learning_rate": 1.6664769498831884e-05, + "loss": 1.9519, + "step": 16995 + }, + { + "epoch": 3.04, + "grad_norm": 1.0398961305618286, + "learning_rate": 1.6651515189122564e-05, + "loss": 1.7254, + "step": 17000 + }, + { + "epoch": 3.04, + "grad_norm": 0.7212021350860596, + "learning_rate": 1.663826352003096e-05, + "loss": 1.9161, + "step": 17005 + }, + { + "epoch": 3.04, + "grad_norm": 0.8684729337692261, + "learning_rate": 1.6625014495748566e-05, + "loss": 1.8468, + "step": 17010 + }, + { + "epoch": 3.05, + "grad_norm": 0.5007403492927551, + "learning_rate": 1.661176812046603e-05, + "loss": 2.1323, + "step": 17015 + }, + { + "epoch": 3.05, + "grad_norm": 1.1957733631134033, + "learning_rate": 1.659852439837317e-05, + "loss": 1.7557, + "step": 17020 + }, + { + "epoch": 3.05, + "grad_norm": 0.935827910900116, + "learning_rate": 1.658528333365896e-05, + "loss": 1.9512, + "step": 17025 + }, + { + "epoch": 3.05, + "grad_norm": 0.5936657786369324, + "learning_rate": 1.657204493051153e-05, + "loss": 1.8086, + "step": 17030 + }, + { + "epoch": 3.05, + "grad_norm": 0.7128114700317383, + "learning_rate": 1.6558809193118202e-05, + "loss": 1.7417, + "step": 17035 + }, + { + "epoch": 3.05, + "grad_norm": 1.5274105072021484, + "learning_rate": 1.6545576125665395e-05, + "loss": 2.0459, + "step": 17040 + }, + { + "epoch": 3.05, + "grad_norm": 1.2550373077392578, + "learning_rate": 1.6532345732338724e-05, + "loss": 1.689, + "step": 17045 + }, + { + "epoch": 3.05, + "grad_norm": 0.6413518786430359, + "learning_rate": 1.651911801732296e-05, + "loss": 1.9651, + "step": 17050 + }, + { + "epoch": 3.05, + "grad_norm": 0.7453106045722961, + "learning_rate": 1.6505892984802e-05, + "loss": 1.8327, + "step": 17055 + }, + { + "epoch": 3.05, + "grad_norm": 0.7208839058876038, + "learning_rate": 1.6492670638958924e-05, + "loss": 1.7667, + "step": 17060 + }, + { + "epoch": 3.05, + "grad_norm": 4.1532416343688965, + "learning_rate": 1.6479450983975946e-05, + "loss": 1.7473, + "step": 17065 + }, + { + "epoch": 3.06, + "grad_norm": 1.0695805549621582, + "learning_rate": 1.646623402403442e-05, + "loss": 1.7827, + "step": 17070 + }, + { + "epoch": 3.06, + "grad_norm": 0.7231548428535461, + "learning_rate": 1.6453019763314862e-05, + "loss": 1.8391, + "step": 17075 + }, + { + "epoch": 3.06, + "grad_norm": 0.7367892265319824, + "learning_rate": 1.6439808205996942e-05, + "loss": 1.6613, + "step": 17080 + }, + { + "epoch": 3.06, + "grad_norm": 0.9162821769714355, + "learning_rate": 1.642659935625945e-05, + "loss": 2.0619, + "step": 17085 + }, + { + "epoch": 3.06, + "grad_norm": 0.6432048678398132, + "learning_rate": 1.6413393218280328e-05, + "loss": 2.0551, + "step": 17090 + }, + { + "epoch": 3.06, + "grad_norm": 1.1106834411621094, + "learning_rate": 1.640018979623668e-05, + "loss": 1.7662, + "step": 17095 + }, + { + "epoch": 3.06, + "grad_norm": 0.7890145778656006, + "learning_rate": 1.6386989094304715e-05, + "loss": 1.8003, + "step": 17100 + }, + { + "epoch": 3.06, + "grad_norm": 0.9444851875305176, + "learning_rate": 1.6373791116659823e-05, + "loss": 1.7911, + "step": 17105 + }, + { + "epoch": 3.06, + "grad_norm": 1.1127926111221313, + "learning_rate": 1.6360595867476504e-05, + "loss": 1.7545, + "step": 17110 + }, + { + "epoch": 3.06, + "grad_norm": 0.997949481010437, + "learning_rate": 1.63474033509284e-05, + "loss": 2.0693, + "step": 17115 + }, + { + "epoch": 3.06, + "grad_norm": 0.6662190556526184, + "learning_rate": 1.6334213571188288e-05, + "loss": 1.7301, + "step": 17120 + }, + { + "epoch": 3.07, + "grad_norm": 1.0271848440170288, + "learning_rate": 1.63210265324281e-05, + "loss": 1.9007, + "step": 17125 + }, + { + "epoch": 3.07, + "grad_norm": 1.0467654466629028, + "learning_rate": 1.6307842238818874e-05, + "loss": 1.7783, + "step": 17130 + }, + { + "epoch": 3.07, + "grad_norm": 0.8379384279251099, + "learning_rate": 1.6294660694530776e-05, + "loss": 1.629, + "step": 17135 + }, + { + "epoch": 3.07, + "grad_norm": 0.7403329014778137, + "learning_rate": 1.628148190373313e-05, + "loss": 1.5475, + "step": 17140 + }, + { + "epoch": 3.07, + "grad_norm": 0.8592426776885986, + "learning_rate": 1.626830587059437e-05, + "loss": 1.7707, + "step": 17145 + }, + { + "epoch": 3.07, + "grad_norm": 0.8583236932754517, + "learning_rate": 1.6255132599282065e-05, + "loss": 1.785, + "step": 17150 + }, + { + "epoch": 3.07, + "grad_norm": 0.5623230934143066, + "learning_rate": 1.624196209396291e-05, + "loss": 1.9508, + "step": 17155 + }, + { + "epoch": 3.07, + "grad_norm": 0.7288662195205688, + "learning_rate": 1.622879435880272e-05, + "loss": 1.9152, + "step": 17160 + }, + { + "epoch": 3.07, + "grad_norm": 0.5778056383132935, + "learning_rate": 1.621562939796643e-05, + "loss": 1.8435, + "step": 17165 + }, + { + "epoch": 3.07, + "grad_norm": 0.7465753555297852, + "learning_rate": 1.620246721561812e-05, + "loss": 1.6645, + "step": 17170 + }, + { + "epoch": 3.07, + "grad_norm": 0.6527841687202454, + "learning_rate": 1.6189307815920973e-05, + "loss": 1.7769, + "step": 17175 + }, + { + "epoch": 3.08, + "grad_norm": 0.9524030089378357, + "learning_rate": 1.617615120303728e-05, + "loss": 1.9306, + "step": 17180 + }, + { + "epoch": 3.08, + "grad_norm": 0.8811773061752319, + "learning_rate": 1.6162997381128478e-05, + "loss": 1.9073, + "step": 17185 + }, + { + "epoch": 3.08, + "grad_norm": 0.5965471863746643, + "learning_rate": 1.61498463543551e-05, + "loss": 2.1639, + "step": 17190 + }, + { + "epoch": 3.08, + "grad_norm": 0.8437951803207397, + "learning_rate": 1.6136698126876806e-05, + "loss": 1.8132, + "step": 17195 + }, + { + "epoch": 3.08, + "grad_norm": 0.6007370948791504, + "learning_rate": 1.6123552702852372e-05, + "loss": 1.9358, + "step": 17200 + }, + { + "epoch": 3.08, + "grad_norm": 0.843265950679779, + "learning_rate": 1.611041008643968e-05, + "loss": 1.9867, + "step": 17205 + }, + { + "epoch": 3.08, + "grad_norm": 0.7209492325782776, + "learning_rate": 1.6097270281795722e-05, + "loss": 2.0699, + "step": 17210 + }, + { + "epoch": 3.08, + "grad_norm": 1.3360052108764648, + "learning_rate": 1.608413329307662e-05, + "loss": 1.8391, + "step": 17215 + }, + { + "epoch": 3.08, + "grad_norm": 0.6539368033409119, + "learning_rate": 1.6070999124437586e-05, + "loss": 2.1616, + "step": 17220 + }, + { + "epoch": 3.08, + "grad_norm": 0.6861870884895325, + "learning_rate": 1.605786778003293e-05, + "loss": 1.6254, + "step": 17225 + }, + { + "epoch": 3.08, + "grad_norm": 0.804837703704834, + "learning_rate": 1.60447392640161e-05, + "loss": 1.6783, + "step": 17230 + }, + { + "epoch": 3.08, + "grad_norm": 0.64955735206604, + "learning_rate": 1.6031613580539617e-05, + "loss": 1.9525, + "step": 17235 + }, + { + "epoch": 3.09, + "grad_norm": 1.0166507959365845, + "learning_rate": 1.6018490733755137e-05, + "loss": 1.7277, + "step": 17240 + }, + { + "epoch": 3.09, + "grad_norm": 0.7587583661079407, + "learning_rate": 1.60053707278134e-05, + "loss": 1.7941, + "step": 17245 + }, + { + "epoch": 3.09, + "grad_norm": 0.623679518699646, + "learning_rate": 1.5992253566864244e-05, + "loss": 1.8599, + "step": 17250 + }, + { + "epoch": 3.09, + "grad_norm": 0.8486801385879517, + "learning_rate": 1.597913925505663e-05, + "loss": 1.7489, + "step": 17255 + }, + { + "epoch": 3.09, + "grad_norm": 0.41435787081718445, + "learning_rate": 1.5966027796538586e-05, + "loss": 1.8594, + "step": 17260 + }, + { + "epoch": 3.09, + "grad_norm": 2.0519394874572754, + "learning_rate": 1.595291919545726e-05, + "loss": 2.012, + "step": 17265 + }, + { + "epoch": 3.09, + "grad_norm": 1.2812122106552124, + "learning_rate": 1.59398134559589e-05, + "loss": 2.0921, + "step": 17270 + }, + { + "epoch": 3.09, + "grad_norm": 2.542084217071533, + "learning_rate": 1.5926710582188826e-05, + "loss": 1.8695, + "step": 17275 + }, + { + "epoch": 3.09, + "grad_norm": 0.564811646938324, + "learning_rate": 1.591361057829146e-05, + "loss": 2.3541, + "step": 17280 + }, + { + "epoch": 3.09, + "grad_norm": 0.9718075394630432, + "learning_rate": 1.5900513448410332e-05, + "loss": 1.6868, + "step": 17285 + }, + { + "epoch": 3.09, + "grad_norm": 0.9781553149223328, + "learning_rate": 1.588741919668805e-05, + "loss": 1.9755, + "step": 17290 + }, + { + "epoch": 3.1, + "grad_norm": 0.5526376366615295, + "learning_rate": 1.5874327827266304e-05, + "loss": 2.0345, + "step": 17295 + }, + { + "epoch": 3.1, + "grad_norm": 3.0677220821380615, + "learning_rate": 1.5861239344285893e-05, + "loss": 1.9055, + "step": 17300 + }, + { + "epoch": 3.1, + "grad_norm": 1.1272741556167603, + "learning_rate": 1.5848153751886685e-05, + "loss": 2.0391, + "step": 17305 + }, + { + "epoch": 3.1, + "grad_norm": 0.8001586198806763, + "learning_rate": 1.5835071054207643e-05, + "loss": 1.9079, + "step": 17310 + }, + { + "epoch": 3.1, + "grad_norm": 0.9023197293281555, + "learning_rate": 1.582199125538682e-05, + "loss": 1.7115, + "step": 17315 + }, + { + "epoch": 3.1, + "grad_norm": 0.6295259594917297, + "learning_rate": 1.5808914359561328e-05, + "loss": 1.819, + "step": 17320 + }, + { + "epoch": 3.1, + "grad_norm": 0.5197964310646057, + "learning_rate": 1.579584037086738e-05, + "loss": 1.8448, + "step": 17325 + }, + { + "epoch": 3.1, + "grad_norm": 0.8185873031616211, + "learning_rate": 1.578276929344027e-05, + "loss": 1.9797, + "step": 17330 + }, + { + "epoch": 3.1, + "grad_norm": 1.1019208431243896, + "learning_rate": 1.5769701131414372e-05, + "loss": 1.5105, + "step": 17335 + }, + { + "epoch": 3.1, + "grad_norm": 0.8213723301887512, + "learning_rate": 1.575663588892312e-05, + "loss": 1.8677, + "step": 17340 + }, + { + "epoch": 3.1, + "grad_norm": 0.8821647763252258, + "learning_rate": 1.574357357009905e-05, + "loss": 1.7404, + "step": 17345 + }, + { + "epoch": 3.11, + "grad_norm": 1.0914586782455444, + "learning_rate": 1.5730514179073756e-05, + "loss": 1.6639, + "step": 17350 + }, + { + "epoch": 3.11, + "grad_norm": 0.6956515908241272, + "learning_rate": 1.5717457719977906e-05, + "loss": 1.9229, + "step": 17355 + }, + { + "epoch": 3.11, + "grad_norm": 1.160091757774353, + "learning_rate": 1.5704404196941265e-05, + "loss": 1.4537, + "step": 17360 + }, + { + "epoch": 3.11, + "grad_norm": 0.5331469774246216, + "learning_rate": 1.5691353614092627e-05, + "loss": 1.842, + "step": 17365 + }, + { + "epoch": 3.11, + "grad_norm": 0.9176687598228455, + "learning_rate": 1.5678305975559882e-05, + "loss": 1.5422, + "step": 17370 + }, + { + "epoch": 3.11, + "grad_norm": 0.5114845633506775, + "learning_rate": 1.5665261285469992e-05, + "loss": 2.0008, + "step": 17375 + }, + { + "epoch": 3.11, + "grad_norm": 0.7605366706848145, + "learning_rate": 1.5652219547948982e-05, + "loss": 1.871, + "step": 17380 + }, + { + "epoch": 3.11, + "grad_norm": 1.129357933998108, + "learning_rate": 1.5639180767121938e-05, + "loss": 1.7841, + "step": 17385 + }, + { + "epoch": 3.11, + "grad_norm": 0.8872073292732239, + "learning_rate": 1.562614494711301e-05, + "loss": 1.8101, + "step": 17390 + }, + { + "epoch": 3.11, + "grad_norm": 1.4235471487045288, + "learning_rate": 1.5613112092045418e-05, + "loss": 2.1259, + "step": 17395 + }, + { + "epoch": 3.11, + "grad_norm": 0.5211644172668457, + "learning_rate": 1.5600082206041443e-05, + "loss": 2.0976, + "step": 17400 + }, + { + "epoch": 3.12, + "grad_norm": 1.07545804977417, + "learning_rate": 1.558705529322243e-05, + "loss": 2.0866, + "step": 17405 + }, + { + "epoch": 3.12, + "grad_norm": 0.8453513383865356, + "learning_rate": 1.5574031357708778e-05, + "loss": 1.822, + "step": 17410 + }, + { + "epoch": 3.12, + "grad_norm": 0.5668812394142151, + "learning_rate": 1.556101040361993e-05, + "loss": 1.8889, + "step": 17415 + }, + { + "epoch": 3.12, + "grad_norm": 0.6529809236526489, + "learning_rate": 1.554799243507441e-05, + "loss": 2.2048, + "step": 17420 + }, + { + "epoch": 3.12, + "grad_norm": 1.1768380403518677, + "learning_rate": 1.5534977456189797e-05, + "loss": 2.0006, + "step": 17425 + }, + { + "epoch": 3.12, + "grad_norm": 0.7664825320243835, + "learning_rate": 1.552196547108271e-05, + "loss": 1.8376, + "step": 17430 + }, + { + "epoch": 3.12, + "grad_norm": 0.5944017767906189, + "learning_rate": 1.550895648386883e-05, + "loss": 1.7861, + "step": 17435 + }, + { + "epoch": 3.12, + "grad_norm": 1.4491263628005981, + "learning_rate": 1.5495950498662877e-05, + "loss": 1.8945, + "step": 17440 + }, + { + "epoch": 3.12, + "grad_norm": 0.961166501045227, + "learning_rate": 1.5482947519578645e-05, + "loss": 1.9871, + "step": 17445 + }, + { + "epoch": 3.12, + "grad_norm": 1.2184598445892334, + "learning_rate": 1.5469947550728958e-05, + "loss": 1.7694, + "step": 17450 + }, + { + "epoch": 3.12, + "grad_norm": 1.1210654973983765, + "learning_rate": 1.5456950596225703e-05, + "loss": 1.6446, + "step": 17455 + }, + { + "epoch": 3.13, + "grad_norm": 0.3188503682613373, + "learning_rate": 1.5443956660179783e-05, + "loss": 1.8352, + "step": 17460 + }, + { + "epoch": 3.13, + "grad_norm": 0.4729178249835968, + "learning_rate": 1.5430965746701183e-05, + "loss": 1.9481, + "step": 17465 + }, + { + "epoch": 3.13, + "grad_norm": 0.6622750163078308, + "learning_rate": 1.5417977859898914e-05, + "loss": 1.9198, + "step": 17470 + }, + { + "epoch": 3.13, + "grad_norm": 2.716395854949951, + "learning_rate": 1.5404993003881027e-05, + "loss": 1.7338, + "step": 17475 + }, + { + "epoch": 3.13, + "grad_norm": 0.9567592144012451, + "learning_rate": 1.5392011182754627e-05, + "loss": 1.5414, + "step": 17480 + }, + { + "epoch": 3.13, + "grad_norm": 0.6734287738800049, + "learning_rate": 1.537903240062585e-05, + "loss": 2.1804, + "step": 17485 + }, + { + "epoch": 3.13, + "grad_norm": 0.9989105463027954, + "learning_rate": 1.5366056661599866e-05, + "loss": 1.6213, + "step": 17490 + }, + { + "epoch": 3.13, + "grad_norm": 1.0770398378372192, + "learning_rate": 1.5353083969780897e-05, + "loss": 1.9834, + "step": 17495 + }, + { + "epoch": 3.13, + "grad_norm": 1.3205726146697998, + "learning_rate": 1.5340114329272194e-05, + "loss": 1.8774, + "step": 17500 + }, + { + "epoch": 3.13, + "grad_norm": 0.6853746771812439, + "learning_rate": 1.532714774417603e-05, + "loss": 1.8502, + "step": 17505 + }, + { + "epoch": 3.13, + "grad_norm": 0.6563792824745178, + "learning_rate": 1.5314184218593724e-05, + "loss": 1.6979, + "step": 17510 + }, + { + "epoch": 3.14, + "grad_norm": 0.8047040104866028, + "learning_rate": 1.5301223756625643e-05, + "loss": 1.5599, + "step": 17515 + }, + { + "epoch": 3.14, + "grad_norm": 1.3527451753616333, + "learning_rate": 1.5288266362371155e-05, + "loss": 1.8567, + "step": 17520 + }, + { + "epoch": 3.14, + "grad_norm": 0.8321057558059692, + "learning_rate": 1.527531203992868e-05, + "loss": 1.9165, + "step": 17525 + }, + { + "epoch": 3.14, + "grad_norm": 0.5902025699615479, + "learning_rate": 1.526236079339565e-05, + "loss": 1.8397, + "step": 17530 + }, + { + "epoch": 3.14, + "grad_norm": 0.8331001400947571, + "learning_rate": 1.5249412626868526e-05, + "loss": 1.9709, + "step": 17535 + }, + { + "epoch": 3.14, + "grad_norm": 1.3767504692077637, + "learning_rate": 1.5236467544442821e-05, + "loss": 1.9877, + "step": 17540 + }, + { + "epoch": 3.14, + "grad_norm": 0.8942415714263916, + "learning_rate": 1.5223525550213034e-05, + "loss": 1.8899, + "step": 17545 + }, + { + "epoch": 3.14, + "grad_norm": 1.9923580884933472, + "learning_rate": 1.521058664827272e-05, + "loss": 1.6486, + "step": 17550 + }, + { + "epoch": 3.14, + "grad_norm": 0.7297930121421814, + "learning_rate": 1.5197650842714426e-05, + "loss": 1.8752, + "step": 17555 + }, + { + "epoch": 3.14, + "grad_norm": 0.7140510082244873, + "learning_rate": 1.5184718137629745e-05, + "loss": 1.8954, + "step": 17560 + }, + { + "epoch": 3.14, + "grad_norm": 0.7358191013336182, + "learning_rate": 1.5171788537109272e-05, + "loss": 2.0138, + "step": 17565 + }, + { + "epoch": 3.14, + "grad_norm": 0.8002922534942627, + "learning_rate": 1.5158862045242633e-05, + "loss": 2.0375, + "step": 17570 + }, + { + "epoch": 3.15, + "grad_norm": 0.6648169755935669, + "learning_rate": 1.5145938666118459e-05, + "loss": 1.9708, + "step": 17575 + }, + { + "epoch": 3.15, + "grad_norm": 1.1531585454940796, + "learning_rate": 1.5133018403824411e-05, + "loss": 1.72, + "step": 17580 + }, + { + "epoch": 3.15, + "grad_norm": 0.5088279843330383, + "learning_rate": 1.512010126244715e-05, + "loss": 1.7109, + "step": 17585 + }, + { + "epoch": 3.15, + "grad_norm": 0.825671911239624, + "learning_rate": 1.5107187246072357e-05, + "loss": 1.7681, + "step": 17590 + }, + { + "epoch": 3.15, + "grad_norm": 1.1927964687347412, + "learning_rate": 1.5094276358784728e-05, + "loss": 1.897, + "step": 17595 + }, + { + "epoch": 3.15, + "grad_norm": 0.9928845763206482, + "learning_rate": 1.5081368604667956e-05, + "loss": 1.8343, + "step": 17600 + }, + { + "epoch": 3.15, + "grad_norm": 0.9306197166442871, + "learning_rate": 1.5068463987804748e-05, + "loss": 1.9001, + "step": 17605 + }, + { + "epoch": 3.15, + "grad_norm": 0.7192040681838989, + "learning_rate": 1.5055562512276827e-05, + "loss": 1.8987, + "step": 17610 + }, + { + "epoch": 3.15, + "grad_norm": 2.9316582679748535, + "learning_rate": 1.5042664182164926e-05, + "loss": 1.9867, + "step": 17615 + }, + { + "epoch": 3.15, + "grad_norm": 0.7092540860176086, + "learning_rate": 1.502976900154876e-05, + "loss": 1.9289, + "step": 17620 + }, + { + "epoch": 3.15, + "grad_norm": 3.189732313156128, + "learning_rate": 1.5016876974507065e-05, + "loss": 1.6191, + "step": 17625 + }, + { + "epoch": 3.16, + "grad_norm": 0.6250412464141846, + "learning_rate": 1.500398810511759e-05, + "loss": 1.7705, + "step": 17630 + }, + { + "epoch": 3.16, + "grad_norm": 0.9590062499046326, + "learning_rate": 1.4991102397457058e-05, + "loss": 1.6913, + "step": 17635 + }, + { + "epoch": 3.16, + "grad_norm": 0.5470601916313171, + "learning_rate": 1.4978219855601216e-05, + "loss": 1.912, + "step": 17640 + }, + { + "epoch": 3.16, + "grad_norm": 0.9383836984634399, + "learning_rate": 1.496534048362479e-05, + "loss": 1.8818, + "step": 17645 + }, + { + "epoch": 3.16, + "grad_norm": 0.911783754825592, + "learning_rate": 1.4952464285601515e-05, + "loss": 1.9046, + "step": 17650 + }, + { + "epoch": 3.16, + "grad_norm": 2.2841453552246094, + "learning_rate": 1.493959126560412e-05, + "loss": 1.5841, + "step": 17655 + }, + { + "epoch": 3.16, + "grad_norm": 0.586314857006073, + "learning_rate": 1.4926721427704335e-05, + "loss": 1.8118, + "step": 17660 + }, + { + "epoch": 3.16, + "grad_norm": 0.8905709981918335, + "learning_rate": 1.4913854775972869e-05, + "loss": 1.8675, + "step": 17665 + }, + { + "epoch": 3.16, + "grad_norm": 1.4882047176361084, + "learning_rate": 1.4900991314479434e-05, + "loss": 1.5654, + "step": 17670 + }, + { + "epoch": 3.16, + "grad_norm": 0.8829624652862549, + "learning_rate": 1.4888131047292742e-05, + "loss": 1.697, + "step": 17675 + }, + { + "epoch": 3.16, + "grad_norm": 1.1058586835861206, + "learning_rate": 1.4875273978480464e-05, + "loss": 1.7776, + "step": 17680 + }, + { + "epoch": 3.17, + "grad_norm": 1.0767732858657837, + "learning_rate": 1.4862420112109293e-05, + "loss": 1.9039, + "step": 17685 + }, + { + "epoch": 3.17, + "grad_norm": 0.7137855291366577, + "learning_rate": 1.48495694522449e-05, + "loss": 1.7499, + "step": 17690 + }, + { + "epoch": 3.17, + "grad_norm": 0.44608253240585327, + "learning_rate": 1.4836722002951908e-05, + "loss": 1.8009, + "step": 17695 + }, + { + "epoch": 3.17, + "grad_norm": 1.0002092123031616, + "learning_rate": 1.4823877768293975e-05, + "loss": 1.4692, + "step": 17700 + }, + { + "epoch": 3.17, + "grad_norm": 0.8534446358680725, + "learning_rate": 1.4811036752333718e-05, + "loss": 1.6299, + "step": 17705 + }, + { + "epoch": 3.17, + "grad_norm": 0.4848932921886444, + "learning_rate": 1.4798198959132733e-05, + "loss": 2.0872, + "step": 17710 + }, + { + "epoch": 3.17, + "grad_norm": 0.5030261874198914, + "learning_rate": 1.4785364392751606e-05, + "loss": 2.0136, + "step": 17715 + }, + { + "epoch": 3.17, + "grad_norm": 0.644991397857666, + "learning_rate": 1.4772533057249898e-05, + "loss": 1.8112, + "step": 17720 + }, + { + "epoch": 3.17, + "grad_norm": 0.925753653049469, + "learning_rate": 1.4759704956686144e-05, + "loss": 1.55, + "step": 17725 + }, + { + "epoch": 3.17, + "grad_norm": 0.7059409618377686, + "learning_rate": 1.4746880095117866e-05, + "loss": 1.8275, + "step": 17730 + }, + { + "epoch": 3.17, + "grad_norm": 1.2741386890411377, + "learning_rate": 1.4734058476601553e-05, + "loss": 1.9445, + "step": 17735 + }, + { + "epoch": 3.18, + "grad_norm": 0.7439374923706055, + "learning_rate": 1.4721240105192663e-05, + "loss": 1.7957, + "step": 17740 + }, + { + "epoch": 3.18, + "grad_norm": 0.9652694463729858, + "learning_rate": 1.470842498494564e-05, + "loss": 1.8712, + "step": 17745 + }, + { + "epoch": 3.18, + "grad_norm": 0.7117319107055664, + "learning_rate": 1.4695613119913903e-05, + "loss": 1.6265, + "step": 17750 + }, + { + "epoch": 3.18, + "grad_norm": 1.2906383275985718, + "learning_rate": 1.4682804514149818e-05, + "loss": 1.7699, + "step": 17755 + }, + { + "epoch": 3.18, + "grad_norm": 1.508944034576416, + "learning_rate": 1.466999917170474e-05, + "loss": 1.7239, + "step": 17760 + }, + { + "epoch": 3.18, + "grad_norm": 0.925870954990387, + "learning_rate": 1.4657197096628994e-05, + "loss": 2.3444, + "step": 17765 + }, + { + "epoch": 3.18, + "grad_norm": 1.065234899520874, + "learning_rate": 1.464439829297185e-05, + "loss": 1.5987, + "step": 17770 + }, + { + "epoch": 3.18, + "grad_norm": 0.7614918351173401, + "learning_rate": 1.4631602764781572e-05, + "loss": 2.0636, + "step": 17775 + }, + { + "epoch": 3.18, + "grad_norm": 0.5512299537658691, + "learning_rate": 1.4618810516105371e-05, + "loss": 1.9394, + "step": 17780 + }, + { + "epoch": 3.18, + "grad_norm": 0.7986428141593933, + "learning_rate": 1.460602155098941e-05, + "loss": 1.8252, + "step": 17785 + }, + { + "epoch": 3.18, + "grad_norm": 0.6706057190895081, + "learning_rate": 1.4593235873478839e-05, + "loss": 1.8949, + "step": 17790 + }, + { + "epoch": 3.19, + "grad_norm": 0.6929710507392883, + "learning_rate": 1.4580453487617745e-05, + "loss": 1.8602, + "step": 17795 + }, + { + "epoch": 3.19, + "grad_norm": 0.9884799122810364, + "learning_rate": 1.456767439744919e-05, + "loss": 1.6815, + "step": 17800 + }, + { + "epoch": 3.19, + "grad_norm": 1.1615020036697388, + "learning_rate": 1.455489860701519e-05, + "loss": 1.8135, + "step": 17805 + }, + { + "epoch": 3.19, + "grad_norm": 0.9147149920463562, + "learning_rate": 1.4542126120356714e-05, + "loss": 1.7469, + "step": 17810 + }, + { + "epoch": 3.19, + "grad_norm": 0.612957239151001, + "learning_rate": 1.4529356941513697e-05, + "loss": 1.8893, + "step": 17815 + }, + { + "epoch": 3.19, + "grad_norm": 0.8941155076026917, + "learning_rate": 1.4516591074524999e-05, + "loss": 2.1368, + "step": 17820 + }, + { + "epoch": 3.19, + "grad_norm": 0.798802375793457, + "learning_rate": 1.4503828523428458e-05, + "loss": 1.7414, + "step": 17825 + }, + { + "epoch": 3.19, + "grad_norm": 1.113351821899414, + "learning_rate": 1.4491069292260868e-05, + "loss": 1.9364, + "step": 17830 + }, + { + "epoch": 3.19, + "grad_norm": 0.9882722496986389, + "learning_rate": 1.4478313385057946e-05, + "loss": 1.75, + "step": 17835 + }, + { + "epoch": 3.19, + "grad_norm": 0.7189344167709351, + "learning_rate": 1.4465560805854378e-05, + "loss": 1.9663, + "step": 17840 + }, + { + "epoch": 3.19, + "grad_norm": 0.7250414490699768, + "learning_rate": 1.4452811558683804e-05, + "loss": 1.8795, + "step": 17845 + }, + { + "epoch": 3.2, + "grad_norm": 0.8930559754371643, + "learning_rate": 1.4440065647578777e-05, + "loss": 1.9817, + "step": 17850 + }, + { + "epoch": 3.2, + "grad_norm": 0.7246281504631042, + "learning_rate": 1.4427323076570831e-05, + "loss": 1.9332, + "step": 17855 + }, + { + "epoch": 3.2, + "grad_norm": 0.7635440230369568, + "learning_rate": 1.4414583849690428e-05, + "loss": 1.8095, + "step": 17860 + }, + { + "epoch": 3.2, + "grad_norm": 0.7749634981155396, + "learning_rate": 1.4401847970966966e-05, + "loss": 1.7567, + "step": 17865 + }, + { + "epoch": 3.2, + "grad_norm": 0.7701842784881592, + "learning_rate": 1.4389115444428811e-05, + "loss": 1.8668, + "step": 17870 + }, + { + "epoch": 3.2, + "grad_norm": 2.8975706100463867, + "learning_rate": 1.4376386274103231e-05, + "loss": 1.7546, + "step": 17875 + }, + { + "epoch": 3.2, + "grad_norm": 1.0762674808502197, + "learning_rate": 1.436366046401645e-05, + "loss": 1.8692, + "step": 17880 + }, + { + "epoch": 3.2, + "grad_norm": 0.6440033316612244, + "learning_rate": 1.4350938018193626e-05, + "loss": 1.6009, + "step": 17885 + }, + { + "epoch": 3.2, + "grad_norm": 0.7783496975898743, + "learning_rate": 1.433821894065887e-05, + "loss": 1.7383, + "step": 17890 + }, + { + "epoch": 3.2, + "grad_norm": 0.7856084704399109, + "learning_rate": 1.4325503235435207e-05, + "loss": 2.0661, + "step": 17895 + }, + { + "epoch": 3.2, + "grad_norm": 0.7685439586639404, + "learning_rate": 1.4312790906544598e-05, + "loss": 1.596, + "step": 17900 + }, + { + "epoch": 3.2, + "grad_norm": 0.8082401156425476, + "learning_rate": 1.430008195800796e-05, + "loss": 1.8206, + "step": 17905 + }, + { + "epoch": 3.21, + "grad_norm": 0.9814727306365967, + "learning_rate": 1.4287376393845095e-05, + "loss": 1.61, + "step": 17910 + }, + { + "epoch": 3.21, + "grad_norm": 0.779301106929779, + "learning_rate": 1.4274674218074775e-05, + "loss": 1.5867, + "step": 17915 + }, + { + "epoch": 3.21, + "grad_norm": 1.0549635887145996, + "learning_rate": 1.4261975434714686e-05, + "loss": 1.9271, + "step": 17920 + }, + { + "epoch": 3.21, + "grad_norm": 0.9940875768661499, + "learning_rate": 1.4249280047781435e-05, + "loss": 2.0389, + "step": 17925 + }, + { + "epoch": 3.21, + "grad_norm": 0.7739433646202087, + "learning_rate": 1.4236588061290563e-05, + "loss": 1.9533, + "step": 17930 + }, + { + "epoch": 3.21, + "grad_norm": 0.6601684093475342, + "learning_rate": 1.4223899479256542e-05, + "loss": 1.6776, + "step": 17935 + }, + { + "epoch": 3.21, + "grad_norm": 0.9212928414344788, + "learning_rate": 1.4211214305692744e-05, + "loss": 1.8223, + "step": 17940 + }, + { + "epoch": 3.21, + "grad_norm": 0.4190042018890381, + "learning_rate": 1.419853254461148e-05, + "loss": 1.9103, + "step": 17945 + }, + { + "epoch": 3.21, + "grad_norm": 1.1490883827209473, + "learning_rate": 1.4185854200023984e-05, + "loss": 1.5747, + "step": 17950 + }, + { + "epoch": 3.21, + "grad_norm": 0.8107929825782776, + "learning_rate": 1.4173179275940401e-05, + "loss": 1.9681, + "step": 17955 + }, + { + "epoch": 3.21, + "grad_norm": 1.9732346534729004, + "learning_rate": 1.4160507776369813e-05, + "loss": 1.5155, + "step": 17960 + }, + { + "epoch": 3.22, + "grad_norm": 0.7424911856651306, + "learning_rate": 1.4147839705320177e-05, + "loss": 1.8159, + "step": 17965 + }, + { + "epoch": 3.22, + "grad_norm": 1.0739436149597168, + "learning_rate": 1.4135175066798412e-05, + "loss": 1.7462, + "step": 17970 + }, + { + "epoch": 3.22, + "grad_norm": 0.9837630987167358, + "learning_rate": 1.4122513864810317e-05, + "loss": 1.5915, + "step": 17975 + }, + { + "epoch": 3.22, + "grad_norm": 0.7608397006988525, + "learning_rate": 1.410985610336062e-05, + "loss": 1.9469, + "step": 17980 + }, + { + "epoch": 3.22, + "grad_norm": 0.3973992168903351, + "learning_rate": 1.4097201786452963e-05, + "loss": 1.9932, + "step": 17985 + }, + { + "epoch": 3.22, + "grad_norm": 0.6849148273468018, + "learning_rate": 1.4084550918089895e-05, + "loss": 1.4885, + "step": 17990 + }, + { + "epoch": 3.22, + "grad_norm": 0.8641477227210999, + "learning_rate": 1.4071903502272882e-05, + "loss": 1.9552, + "step": 17995 + }, + { + "epoch": 3.22, + "grad_norm": 1.1732696294784546, + "learning_rate": 1.4059259543002267e-05, + "loss": 1.7839, + "step": 18000 + }, + { + "epoch": 3.22, + "grad_norm": 0.6170725226402283, + "learning_rate": 1.4046619044277337e-05, + "loss": 2.0093, + "step": 18005 + }, + { + "epoch": 3.22, + "grad_norm": 0.7883812785148621, + "learning_rate": 1.4033982010096269e-05, + "loss": 1.5738, + "step": 18010 + }, + { + "epoch": 3.22, + "grad_norm": 0.8948926329612732, + "learning_rate": 1.402134844445615e-05, + "loss": 1.7868, + "step": 18015 + }, + { + "epoch": 3.23, + "grad_norm": 0.8349456787109375, + "learning_rate": 1.400871835135295e-05, + "loss": 1.7468, + "step": 18020 + }, + { + "epoch": 3.23, + "grad_norm": 0.6893956661224365, + "learning_rate": 1.3996091734781575e-05, + "loss": 1.8127, + "step": 18025 + }, + { + "epoch": 3.23, + "grad_norm": 0.509495198726654, + "learning_rate": 1.3983468598735786e-05, + "loss": 1.6751, + "step": 18030 + }, + { + "epoch": 3.23, + "grad_norm": 0.7317885756492615, + "learning_rate": 1.3970848947208284e-05, + "loss": 2.0498, + "step": 18035 + }, + { + "epoch": 3.23, + "grad_norm": 0.7369408011436462, + "learning_rate": 1.395823278419065e-05, + "loss": 1.6929, + "step": 18040 + }, + { + "epoch": 3.23, + "grad_norm": 1.6330751180648804, + "learning_rate": 1.3945620113673369e-05, + "loss": 1.5587, + "step": 18045 + }, + { + "epoch": 3.23, + "grad_norm": 0.796711802482605, + "learning_rate": 1.3933010939645821e-05, + "loss": 1.7163, + "step": 18050 + }, + { + "epoch": 3.23, + "grad_norm": 1.373434066772461, + "learning_rate": 1.392040526609626e-05, + "loss": 1.7293, + "step": 18055 + }, + { + "epoch": 3.23, + "grad_norm": 0.7607307434082031, + "learning_rate": 1.3907803097011862e-05, + "loss": 1.6402, + "step": 18060 + }, + { + "epoch": 3.23, + "grad_norm": 1.729021668434143, + "learning_rate": 1.3895204436378667e-05, + "loss": 2.0188, + "step": 18065 + }, + { + "epoch": 3.23, + "grad_norm": 0.6117586493492126, + "learning_rate": 1.3882609288181628e-05, + "loss": 1.5768, + "step": 18070 + }, + { + "epoch": 3.24, + "grad_norm": 1.12637460231781, + "learning_rate": 1.3870017656404577e-05, + "loss": 1.6816, + "step": 18075 + }, + { + "epoch": 3.24, + "grad_norm": 1.0976755619049072, + "learning_rate": 1.3857429545030246e-05, + "loss": 1.8734, + "step": 18080 + }, + { + "epoch": 3.24, + "grad_norm": 0.9964720010757446, + "learning_rate": 1.384484495804022e-05, + "loss": 1.8126, + "step": 18085 + }, + { + "epoch": 3.24, + "grad_norm": 0.9928443431854248, + "learning_rate": 1.3832263899415006e-05, + "loss": 1.8391, + "step": 18090 + }, + { + "epoch": 3.24, + "grad_norm": 1.7136353254318237, + "learning_rate": 1.381968637313398e-05, + "loss": 1.5318, + "step": 18095 + }, + { + "epoch": 3.24, + "grad_norm": 0.8411396741867065, + "learning_rate": 1.3807112383175396e-05, + "loss": 1.89, + "step": 18100 + }, + { + "epoch": 3.24, + "grad_norm": 0.641628623008728, + "learning_rate": 1.3794541933516402e-05, + "loss": 1.9114, + "step": 18105 + }, + { + "epoch": 3.24, + "grad_norm": 0.8874943852424622, + "learning_rate": 1.3781975028133027e-05, + "loss": 2.0295, + "step": 18110 + }, + { + "epoch": 3.24, + "grad_norm": 0.7927294373512268, + "learning_rate": 1.376941167100016e-05, + "loss": 1.7748, + "step": 18115 + }, + { + "epoch": 3.24, + "grad_norm": 1.0050328969955444, + "learning_rate": 1.3756851866091566e-05, + "loss": 2.0182, + "step": 18120 + }, + { + "epoch": 3.24, + "grad_norm": 0.8478461503982544, + "learning_rate": 1.3744295617379916e-05, + "loss": 1.7941, + "step": 18125 + }, + { + "epoch": 3.25, + "grad_norm": 0.9578462243080139, + "learning_rate": 1.3731742928836732e-05, + "loss": 1.8209, + "step": 18130 + }, + { + "epoch": 3.25, + "grad_norm": 0.9452106356620789, + "learning_rate": 1.3719193804432423e-05, + "loss": 1.8019, + "step": 18135 + }, + { + "epoch": 3.25, + "grad_norm": 0.9227070808410645, + "learning_rate": 1.3706648248136272e-05, + "loss": 2.0451, + "step": 18140 + }, + { + "epoch": 3.25, + "grad_norm": 0.7945765256881714, + "learning_rate": 1.3694106263916406e-05, + "loss": 1.8797, + "step": 18145 + }, + { + "epoch": 3.25, + "grad_norm": 0.7952302098274231, + "learning_rate": 1.3681567855739857e-05, + "loss": 1.8476, + "step": 18150 + }, + { + "epoch": 3.25, + "grad_norm": 0.5315759778022766, + "learning_rate": 1.3669033027572518e-05, + "loss": 1.8467, + "step": 18155 + }, + { + "epoch": 3.25, + "grad_norm": 0.5972190499305725, + "learning_rate": 1.3656501783379122e-05, + "loss": 1.7655, + "step": 18160 + }, + { + "epoch": 3.25, + "grad_norm": 0.8763524293899536, + "learning_rate": 1.3643974127123298e-05, + "loss": 1.7197, + "step": 18165 + }, + { + "epoch": 3.25, + "grad_norm": 0.714412271976471, + "learning_rate": 1.363145006276755e-05, + "loss": 1.8869, + "step": 18170 + }, + { + "epoch": 3.25, + "grad_norm": 0.8367454409599304, + "learning_rate": 1.36189295942732e-05, + "loss": 1.7424, + "step": 18175 + }, + { + "epoch": 3.25, + "grad_norm": 0.7740775942802429, + "learning_rate": 1.3606412725600471e-05, + "loss": 1.7942, + "step": 18180 + }, + { + "epoch": 3.26, + "grad_norm": 0.6157845854759216, + "learning_rate": 1.359389946070844e-05, + "loss": 2.0431, + "step": 18185 + }, + { + "epoch": 3.26, + "grad_norm": 0.6158450245857239, + "learning_rate": 1.3581389803555036e-05, + "loss": 2.0421, + "step": 18190 + }, + { + "epoch": 3.26, + "grad_norm": 1.7718108892440796, + "learning_rate": 1.3568883758097056e-05, + "loss": 1.6756, + "step": 18195 + }, + { + "epoch": 3.26, + "grad_norm": 1.0088618993759155, + "learning_rate": 1.3556381328290154e-05, + "loss": 1.6387, + "step": 18200 + }, + { + "epoch": 3.26, + "grad_norm": 2.629340171813965, + "learning_rate": 1.3543882518088835e-05, + "loss": 1.8785, + "step": 18205 + }, + { + "epoch": 3.26, + "grad_norm": 2.2919623851776123, + "learning_rate": 1.3531387331446444e-05, + "loss": 1.9667, + "step": 18210 + }, + { + "epoch": 3.26, + "grad_norm": 0.5759145617485046, + "learning_rate": 1.3518895772315207e-05, + "loss": 1.8605, + "step": 18215 + }, + { + "epoch": 3.26, + "grad_norm": 0.9002760648727417, + "learning_rate": 1.3506407844646196e-05, + "loss": 1.8257, + "step": 18220 + }, + { + "epoch": 3.26, + "grad_norm": 0.7235890030860901, + "learning_rate": 1.3493923552389326e-05, + "loss": 1.8972, + "step": 18225 + }, + { + "epoch": 3.26, + "grad_norm": 0.6796399354934692, + "learning_rate": 1.3481442899493382e-05, + "loss": 1.7873, + "step": 18230 + }, + { + "epoch": 3.26, + "grad_norm": 2.119182586669922, + "learning_rate": 1.346896588990596e-05, + "loss": 1.9575, + "step": 18235 + }, + { + "epoch": 3.26, + "grad_norm": 0.8993787169456482, + "learning_rate": 1.3456492527573533e-05, + "loss": 1.5285, + "step": 18240 + }, + { + "epoch": 3.27, + "grad_norm": 1.0154492855072021, + "learning_rate": 1.3444022816441414e-05, + "loss": 1.89, + "step": 18245 + }, + { + "epoch": 3.27, + "grad_norm": 0.5175526738166809, + "learning_rate": 1.3431556760453778e-05, + "loss": 1.8722, + "step": 18250 + }, + { + "epoch": 3.27, + "grad_norm": 0.8413906097412109, + "learning_rate": 1.3419094363553597e-05, + "loss": 1.6884, + "step": 18255 + }, + { + "epoch": 3.27, + "grad_norm": 0.7729244232177734, + "learning_rate": 1.3406635629682743e-05, + "loss": 1.8852, + "step": 18260 + }, + { + "epoch": 3.27, + "grad_norm": 0.7230870127677917, + "learning_rate": 1.3394180562781877e-05, + "loss": 1.9346, + "step": 18265 + }, + { + "epoch": 3.27, + "grad_norm": 0.7597714066505432, + "learning_rate": 1.3381729166790535e-05, + "loss": 1.7774, + "step": 18270 + }, + { + "epoch": 3.27, + "grad_norm": 2.0124258995056152, + "learning_rate": 1.336928144564708e-05, + "loss": 1.8878, + "step": 18275 + }, + { + "epoch": 3.27, + "grad_norm": 0.7441228628158569, + "learning_rate": 1.3356837403288719e-05, + "loss": 1.9835, + "step": 18280 + }, + { + "epoch": 3.27, + "grad_norm": 0.8413273692131042, + "learning_rate": 1.3344397043651496e-05, + "loss": 1.7068, + "step": 18285 + }, + { + "epoch": 3.27, + "grad_norm": 0.9278790950775146, + "learning_rate": 1.333196037067027e-05, + "loss": 1.572, + "step": 18290 + }, + { + "epoch": 3.27, + "grad_norm": 1.6057875156402588, + "learning_rate": 1.3319527388278764e-05, + "loss": 1.6434, + "step": 18295 + }, + { + "epoch": 3.28, + "grad_norm": 0.7727483510971069, + "learning_rate": 1.33070981004095e-05, + "loss": 1.8625, + "step": 18300 + }, + { + "epoch": 3.28, + "grad_norm": 0.9005012512207031, + "learning_rate": 1.329467251099386e-05, + "loss": 1.8556, + "step": 18305 + }, + { + "epoch": 3.28, + "grad_norm": 0.4766554832458496, + "learning_rate": 1.3282250623962045e-05, + "loss": 1.7374, + "step": 18310 + }, + { + "epoch": 3.28, + "grad_norm": 1.3196171522140503, + "learning_rate": 1.3269832443243085e-05, + "loss": 1.6436, + "step": 18315 + }, + { + "epoch": 3.28, + "grad_norm": 0.6074075698852539, + "learning_rate": 1.3257417972764852e-05, + "loss": 1.7342, + "step": 18320 + }, + { + "epoch": 3.28, + "grad_norm": 1.836259126663208, + "learning_rate": 1.3245007216454008e-05, + "loss": 1.7356, + "step": 18325 + }, + { + "epoch": 3.28, + "grad_norm": 1.7985725402832031, + "learning_rate": 1.3232600178236077e-05, + "loss": 2.0223, + "step": 18330 + }, + { + "epoch": 3.28, + "grad_norm": 1.469251275062561, + "learning_rate": 1.3220196862035388e-05, + "loss": 1.8288, + "step": 18335 + }, + { + "epoch": 3.28, + "grad_norm": 0.7703099250793457, + "learning_rate": 1.3207797271775113e-05, + "loss": 1.6407, + "step": 18340 + }, + { + "epoch": 3.28, + "grad_norm": 4.6768717765808105, + "learning_rate": 1.319540141137721e-05, + "loss": 1.9388, + "step": 18345 + }, + { + "epoch": 3.28, + "grad_norm": 0.891442596912384, + "learning_rate": 1.3183009284762492e-05, + "loss": 1.8492, + "step": 18350 + }, + { + "epoch": 3.29, + "grad_norm": 0.5014522075653076, + "learning_rate": 1.3170620895850563e-05, + "loss": 1.8215, + "step": 18355 + }, + { + "epoch": 3.29, + "grad_norm": 2.594623327255249, + "learning_rate": 1.3158236248559866e-05, + "loss": 1.6727, + "step": 18360 + }, + { + "epoch": 3.29, + "grad_norm": 2.9839723110198975, + "learning_rate": 1.3145855346807651e-05, + "loss": 1.3427, + "step": 18365 + }, + { + "epoch": 3.29, + "grad_norm": 0.809145450592041, + "learning_rate": 1.313347819450999e-05, + "loss": 2.0794, + "step": 18370 + }, + { + "epoch": 3.29, + "grad_norm": 1.4444146156311035, + "learning_rate": 1.3121104795581774e-05, + "loss": 1.8111, + "step": 18375 + }, + { + "epoch": 3.29, + "grad_norm": 3.0015711784362793, + "learning_rate": 1.3108735153936669e-05, + "loss": 2.0326, + "step": 18380 + }, + { + "epoch": 3.29, + "grad_norm": 0.8941876292228699, + "learning_rate": 1.3096369273487202e-05, + "loss": 1.8635, + "step": 18385 + }, + { + "epoch": 3.29, + "grad_norm": 0.9546188116073608, + "learning_rate": 1.308400715814469e-05, + "loss": 1.6841, + "step": 18390 + }, + { + "epoch": 3.29, + "grad_norm": 2.428107500076294, + "learning_rate": 1.3071648811819248e-05, + "loss": 1.8043, + "step": 18395 + }, + { + "epoch": 3.29, + "grad_norm": 1.22321617603302, + "learning_rate": 1.3059294238419809e-05, + "loss": 1.7878, + "step": 18400 + }, + { + "epoch": 3.29, + "grad_norm": 0.8854243755340576, + "learning_rate": 1.3046943441854115e-05, + "loss": 1.8578, + "step": 18405 + }, + { + "epoch": 3.3, + "grad_norm": 0.663280189037323, + "learning_rate": 1.3034596426028727e-05, + "loss": 1.9011, + "step": 18410 + }, + { + "epoch": 3.3, + "grad_norm": 0.9913401007652283, + "learning_rate": 1.3022253194848968e-05, + "loss": 1.7046, + "step": 18415 + }, + { + "epoch": 3.3, + "grad_norm": 0.7245838046073914, + "learning_rate": 1.3009913752219006e-05, + "loss": 2.0733, + "step": 18420 + }, + { + "epoch": 3.3, + "grad_norm": 0.7399362921714783, + "learning_rate": 1.2997578102041785e-05, + "loss": 1.7638, + "step": 18425 + }, + { + "epoch": 3.3, + "grad_norm": 0.6854051351547241, + "learning_rate": 1.2985246248219068e-05, + "loss": 1.6586, + "step": 18430 + }, + { + "epoch": 3.3, + "grad_norm": 3.867100715637207, + "learning_rate": 1.2972918194651417e-05, + "loss": 1.7975, + "step": 18435 + }, + { + "epoch": 3.3, + "grad_norm": 0.889519214630127, + "learning_rate": 1.2960593945238173e-05, + "loss": 1.6503, + "step": 18440 + }, + { + "epoch": 3.3, + "grad_norm": 1.0203913450241089, + "learning_rate": 1.2948273503877473e-05, + "loss": 1.8388, + "step": 18445 + }, + { + "epoch": 3.3, + "grad_norm": 0.6218311786651611, + "learning_rate": 1.293595687446627e-05, + "loss": 1.8764, + "step": 18450 + }, + { + "epoch": 3.3, + "grad_norm": 0.6152927875518799, + "learning_rate": 1.2923644060900305e-05, + "loss": 1.722, + "step": 18455 + }, + { + "epoch": 3.3, + "grad_norm": 1.706099033355713, + "learning_rate": 1.2911335067074107e-05, + "loss": 1.5927, + "step": 18460 + }, + { + "epoch": 3.31, + "grad_norm": 0.9180427193641663, + "learning_rate": 1.289902989688101e-05, + "loss": 1.608, + "step": 18465 + }, + { + "epoch": 3.31, + "grad_norm": 0.6481319665908813, + "learning_rate": 1.2886728554213107e-05, + "loss": 1.6192, + "step": 18470 + }, + { + "epoch": 3.31, + "grad_norm": 0.6985259056091309, + "learning_rate": 1.2874431042961311e-05, + "loss": 2.0233, + "step": 18475 + }, + { + "epoch": 3.31, + "grad_norm": 0.6424618363380432, + "learning_rate": 1.2862137367015312e-05, + "loss": 1.8963, + "step": 18480 + }, + { + "epoch": 3.31, + "grad_norm": 0.7724943161010742, + "learning_rate": 1.2849847530263598e-05, + "loss": 1.8944, + "step": 18485 + }, + { + "epoch": 3.31, + "grad_norm": 0.7090172171592712, + "learning_rate": 1.2837561536593414e-05, + "loss": 1.8762, + "step": 18490 + }, + { + "epoch": 3.31, + "grad_norm": 0.8090688586235046, + "learning_rate": 1.2825279389890817e-05, + "loss": 1.7738, + "step": 18495 + }, + { + "epoch": 3.31, + "grad_norm": 0.6650318503379822, + "learning_rate": 1.2813001094040649e-05, + "loss": 1.6909, + "step": 18500 + }, + { + "epoch": 3.31, + "grad_norm": 0.4980815351009369, + "learning_rate": 1.2800726652926503e-05, + "loss": 1.7088, + "step": 18505 + }, + { + "epoch": 3.31, + "grad_norm": 0.8817710280418396, + "learning_rate": 1.2788456070430782e-05, + "loss": 1.6356, + "step": 18510 + }, + { + "epoch": 3.31, + "grad_norm": 0.5760951638221741, + "learning_rate": 1.2776189350434659e-05, + "loss": 1.7639, + "step": 18515 + }, + { + "epoch": 3.31, + "grad_norm": 3.9502806663513184, + "learning_rate": 1.2763926496818088e-05, + "loss": 1.4146, + "step": 18520 + }, + { + "epoch": 3.32, + "grad_norm": 1.6004568338394165, + "learning_rate": 1.2751667513459802e-05, + "loss": 2.014, + "step": 18525 + }, + { + "epoch": 3.32, + "grad_norm": 0.7599541544914246, + "learning_rate": 1.2739412404237306e-05, + "loss": 1.9472, + "step": 18530 + }, + { + "epoch": 3.32, + "grad_norm": 1.0068447589874268, + "learning_rate": 1.2727161173026861e-05, + "loss": 2.1016, + "step": 18535 + }, + { + "epoch": 3.32, + "grad_norm": 0.5630433559417725, + "learning_rate": 1.2714913823703534e-05, + "loss": 1.8729, + "step": 18540 + }, + { + "epoch": 3.32, + "grad_norm": 0.7253677845001221, + "learning_rate": 1.2702670360141144e-05, + "loss": 1.6124, + "step": 18545 + }, + { + "epoch": 3.32, + "grad_norm": 1.3994909524917603, + "learning_rate": 1.2690430786212292e-05, + "loss": 1.65, + "step": 18550 + }, + { + "epoch": 3.32, + "grad_norm": 1.1676279306411743, + "learning_rate": 1.2678195105788353e-05, + "loss": 1.5819, + "step": 18555 + }, + { + "epoch": 3.32, + "grad_norm": 1.0580713748931885, + "learning_rate": 1.2665963322739439e-05, + "loss": 1.9325, + "step": 18560 + }, + { + "epoch": 3.32, + "grad_norm": 1.0381492376327515, + "learning_rate": 1.2653735440934464e-05, + "loss": 2.1058, + "step": 18565 + }, + { + "epoch": 3.32, + "grad_norm": 1.4881700277328491, + "learning_rate": 1.264151146424109e-05, + "loss": 1.8533, + "step": 18570 + }, + { + "epoch": 3.32, + "grad_norm": 1.4090508222579956, + "learning_rate": 1.2629291396525767e-05, + "loss": 1.5407, + "step": 18575 + }, + { + "epoch": 3.33, + "grad_norm": 1.4334440231323242, + "learning_rate": 1.2617075241653665e-05, + "loss": 1.7129, + "step": 18580 + }, + { + "epoch": 3.33, + "grad_norm": 1.431878685951233, + "learning_rate": 1.2604863003488752e-05, + "loss": 1.6889, + "step": 18585 + }, + { + "epoch": 3.33, + "grad_norm": 1.138646125793457, + "learning_rate": 1.2592654685893757e-05, + "loss": 2.1111, + "step": 18590 + }, + { + "epoch": 3.33, + "grad_norm": 2.590043306350708, + "learning_rate": 1.2580450292730142e-05, + "loss": 1.9615, + "step": 18595 + }, + { + "epoch": 3.33, + "grad_norm": 1.139367699623108, + "learning_rate": 1.2568249827858153e-05, + "loss": 1.7285, + "step": 18600 + }, + { + "epoch": 3.33, + "grad_norm": 1.8558638095855713, + "learning_rate": 1.2556053295136785e-05, + "loss": 1.6827, + "step": 18605 + }, + { + "epoch": 3.33, + "grad_norm": 0.39854827523231506, + "learning_rate": 1.2543860698423792e-05, + "loss": 1.9213, + "step": 18610 + }, + { + "epoch": 3.33, + "grad_norm": 1.0664751529693604, + "learning_rate": 1.2531672041575688e-05, + "loss": 1.8553, + "step": 18615 + }, + { + "epoch": 3.33, + "grad_norm": 0.6911807656288147, + "learning_rate": 1.2519487328447715e-05, + "loss": 1.8099, + "step": 18620 + }, + { + "epoch": 3.33, + "grad_norm": 0.5039658546447754, + "learning_rate": 1.2507306562893903e-05, + "loss": 1.9478, + "step": 18625 + }, + { + "epoch": 3.33, + "grad_norm": 0.6167506575584412, + "learning_rate": 1.2495129748766998e-05, + "loss": 1.7913, + "step": 18630 + }, + { + "epoch": 3.34, + "grad_norm": 0.8103507161140442, + "learning_rate": 1.2482956889918529e-05, + "loss": 1.8247, + "step": 18635 + }, + { + "epoch": 3.34, + "grad_norm": 1.3701515197753906, + "learning_rate": 1.2470787990198755e-05, + "loss": 1.7334, + "step": 18640 + }, + { + "epoch": 3.34, + "grad_norm": 0.8373657464981079, + "learning_rate": 1.2458623053456697e-05, + "loss": 1.7793, + "step": 18645 + }, + { + "epoch": 3.34, + "grad_norm": 0.7973329424858093, + "learning_rate": 1.2446462083540094e-05, + "loss": 1.5442, + "step": 18650 + }, + { + "epoch": 3.34, + "grad_norm": 0.8437389731407166, + "learning_rate": 1.2434305084295458e-05, + "loss": 1.6464, + "step": 18655 + }, + { + "epoch": 3.34, + "grad_norm": 0.8967138528823853, + "learning_rate": 1.242215205956804e-05, + "loss": 1.9497, + "step": 18660 + }, + { + "epoch": 3.34, + "grad_norm": 0.665662407875061, + "learning_rate": 1.2410003013201826e-05, + "loss": 1.7312, + "step": 18665 + }, + { + "epoch": 3.34, + "grad_norm": 0.7674746513366699, + "learning_rate": 1.2397857949039557e-05, + "loss": 1.9874, + "step": 18670 + }, + { + "epoch": 3.34, + "grad_norm": 1.0936229228973389, + "learning_rate": 1.2385716870922698e-05, + "loss": 1.9003, + "step": 18675 + }, + { + "epoch": 3.34, + "grad_norm": 0.903713047504425, + "learning_rate": 1.237357978269145e-05, + "loss": 1.6076, + "step": 18680 + }, + { + "epoch": 3.34, + "grad_norm": 0.5833523273468018, + "learning_rate": 1.2361446688184772e-05, + "loss": 2.0229, + "step": 18685 + }, + { + "epoch": 3.35, + "grad_norm": 1.014012098312378, + "learning_rate": 1.234931759124035e-05, + "loss": 1.9358, + "step": 18690 + }, + { + "epoch": 3.35, + "grad_norm": 0.6757218241691589, + "learning_rate": 1.2337192495694605e-05, + "loss": 1.7718, + "step": 18695 + }, + { + "epoch": 3.35, + "grad_norm": 1.0655688047409058, + "learning_rate": 1.2325071405382693e-05, + "loss": 1.7798, + "step": 18700 + }, + { + "epoch": 3.35, + "grad_norm": 0.6979892253875732, + "learning_rate": 1.2312954324138514e-05, + "loss": 1.7062, + "step": 18705 + }, + { + "epoch": 3.35, + "grad_norm": 0.6946207880973816, + "learning_rate": 1.2300841255794668e-05, + "loss": 1.9337, + "step": 18710 + }, + { + "epoch": 3.35, + "grad_norm": 0.827663242816925, + "learning_rate": 1.2288732204182529e-05, + "loss": 1.7177, + "step": 18715 + }, + { + "epoch": 3.35, + "grad_norm": 0.8608758449554443, + "learning_rate": 1.2276627173132158e-05, + "loss": 1.7821, + "step": 18720 + }, + { + "epoch": 3.35, + "grad_norm": 1.8402502536773682, + "learning_rate": 1.2264526166472373e-05, + "loss": 1.7818, + "step": 18725 + }, + { + "epoch": 3.35, + "grad_norm": 0.789306104183197, + "learning_rate": 1.2252429188030709e-05, + "loss": 2.2084, + "step": 18730 + }, + { + "epoch": 3.35, + "grad_norm": 0.7342821359634399, + "learning_rate": 1.2240336241633443e-05, + "loss": 1.8832, + "step": 18735 + }, + { + "epoch": 3.35, + "grad_norm": 0.7892515659332275, + "learning_rate": 1.2228247331105541e-05, + "loss": 1.6436, + "step": 18740 + }, + { + "epoch": 3.36, + "grad_norm": 1.011290431022644, + "learning_rate": 1.2216162460270725e-05, + "loss": 1.8126, + "step": 18745 + }, + { + "epoch": 3.36, + "grad_norm": 0.9589811563491821, + "learning_rate": 1.2204081632951425e-05, + "loss": 1.8529, + "step": 18750 + }, + { + "epoch": 3.36, + "grad_norm": 0.4915600121021271, + "learning_rate": 1.2192004852968797e-05, + "loss": 2.1086, + "step": 18755 + }, + { + "epoch": 3.36, + "grad_norm": 0.8448190689086914, + "learning_rate": 1.217993212414272e-05, + "loss": 1.8251, + "step": 18760 + }, + { + "epoch": 3.36, + "grad_norm": 0.5597540140151978, + "learning_rate": 1.2167863450291786e-05, + "loss": 1.8408, + "step": 18765 + }, + { + "epoch": 3.36, + "grad_norm": 0.7109097242355347, + "learning_rate": 1.2155798835233286e-05, + "loss": 2.3664, + "step": 18770 + }, + { + "epoch": 3.36, + "grad_norm": 0.5528181195259094, + "learning_rate": 1.2143738282783263e-05, + "loss": 2.0561, + "step": 18775 + }, + { + "epoch": 3.36, + "grad_norm": 0.8648270964622498, + "learning_rate": 1.2131681796756452e-05, + "loss": 1.7651, + "step": 18780 + }, + { + "epoch": 3.36, + "grad_norm": 1.4095818996429443, + "learning_rate": 1.2119629380966313e-05, + "loss": 1.7121, + "step": 18785 + }, + { + "epoch": 3.36, + "grad_norm": 0.9153962135314941, + "learning_rate": 1.2107581039225014e-05, + "loss": 1.9327, + "step": 18790 + }, + { + "epoch": 3.36, + "grad_norm": 0.4868113398551941, + "learning_rate": 1.2095536775343436e-05, + "loss": 1.9, + "step": 18795 + }, + { + "epoch": 3.37, + "grad_norm": 0.7522435784339905, + "learning_rate": 1.2083496593131155e-05, + "loss": 1.6748, + "step": 18800 + }, + { + "epoch": 3.37, + "grad_norm": 0.6917173266410828, + "learning_rate": 1.2071460496396481e-05, + "loss": 1.7236, + "step": 18805 + }, + { + "epoch": 3.37, + "grad_norm": 0.5699201822280884, + "learning_rate": 1.2059428488946423e-05, + "loss": 1.7056, + "step": 18810 + }, + { + "epoch": 3.37, + "grad_norm": 1.4349066019058228, + "learning_rate": 1.2047400574586676e-05, + "loss": 1.6144, + "step": 18815 + }, + { + "epoch": 3.37, + "grad_norm": 0.8018911480903625, + "learning_rate": 1.203537675712167e-05, + "loss": 2.1065, + "step": 18820 + }, + { + "epoch": 3.37, + "grad_norm": 0.7882276773452759, + "learning_rate": 1.2023357040354535e-05, + "loss": 1.9657, + "step": 18825 + }, + { + "epoch": 3.37, + "grad_norm": 1.0160123109817505, + "learning_rate": 1.2011341428087077e-05, + "loss": 2.0429, + "step": 18830 + }, + { + "epoch": 3.37, + "grad_norm": 1.0329535007476807, + "learning_rate": 1.1999329924119832e-05, + "loss": 1.856, + "step": 18835 + }, + { + "epoch": 3.37, + "grad_norm": 1.044445514678955, + "learning_rate": 1.1987322532252024e-05, + "loss": 1.3516, + "step": 18840 + }, + { + "epoch": 3.37, + "grad_norm": 0.7211053371429443, + "learning_rate": 1.1975319256281586e-05, + "loss": 1.9731, + "step": 18845 + }, + { + "epoch": 3.37, + "grad_norm": 6.382941246032715, + "learning_rate": 1.1963320100005146e-05, + "loss": 1.7734, + "step": 18850 + }, + { + "epoch": 3.37, + "grad_norm": 0.6544294357299805, + "learning_rate": 1.195132506721802e-05, + "loss": 1.6961, + "step": 18855 + }, + { + "epoch": 3.38, + "grad_norm": 1.0940073728561401, + "learning_rate": 1.1939334161714216e-05, + "loss": 1.8632, + "step": 18860 + }, + { + "epoch": 3.38, + "grad_norm": 0.816892147064209, + "learning_rate": 1.1927347387286452e-05, + "loss": 1.9797, + "step": 18865 + }, + { + "epoch": 3.38, + "grad_norm": 0.7033507227897644, + "learning_rate": 1.1915364747726135e-05, + "loss": 2.0543, + "step": 18870 + }, + { + "epoch": 3.38, + "grad_norm": 1.3776378631591797, + "learning_rate": 1.1903386246823361e-05, + "loss": 1.7737, + "step": 18875 + }, + { + "epoch": 3.38, + "grad_norm": 1.1460137367248535, + "learning_rate": 1.1891411888366933e-05, + "loss": 1.607, + "step": 18880 + }, + { + "epoch": 3.38, + "grad_norm": 0.7334376573562622, + "learning_rate": 1.1879441676144303e-05, + "loss": 1.8029, + "step": 18885 + }, + { + "epoch": 3.38, + "grad_norm": 1.3489125967025757, + "learning_rate": 1.1867475613941653e-05, + "loss": 1.8603, + "step": 18890 + }, + { + "epoch": 3.38, + "grad_norm": 1.415173053741455, + "learning_rate": 1.1855513705543833e-05, + "loss": 1.7612, + "step": 18895 + }, + { + "epoch": 3.38, + "grad_norm": 2.525381088256836, + "learning_rate": 1.1843555954734387e-05, + "loss": 1.7866, + "step": 18900 + }, + { + "epoch": 3.38, + "grad_norm": 0.555586040019989, + "learning_rate": 1.1831602365295545e-05, + "loss": 1.9931, + "step": 18905 + }, + { + "epoch": 3.38, + "grad_norm": 1.590984582901001, + "learning_rate": 1.1819652941008197e-05, + "loss": 1.6122, + "step": 18910 + }, + { + "epoch": 3.39, + "grad_norm": 0.7002542614936829, + "learning_rate": 1.1807707685651959e-05, + "loss": 1.6119, + "step": 18915 + }, + { + "epoch": 3.39, + "grad_norm": 0.8759981989860535, + "learning_rate": 1.179576660300508e-05, + "loss": 1.8901, + "step": 18920 + }, + { + "epoch": 3.39, + "grad_norm": 0.4278295338153839, + "learning_rate": 1.1783829696844522e-05, + "loss": 1.7944, + "step": 18925 + }, + { + "epoch": 3.39, + "grad_norm": 0.7735504508018494, + "learning_rate": 1.177189697094592e-05, + "loss": 1.6411, + "step": 18930 + }, + { + "epoch": 3.39, + "grad_norm": 0.8619164824485779, + "learning_rate": 1.175996842908358e-05, + "loss": 2.0436, + "step": 18935 + }, + { + "epoch": 3.39, + "grad_norm": 0.7432487607002258, + "learning_rate": 1.1748044075030504e-05, + "loss": 1.924, + "step": 18940 + }, + { + "epoch": 3.39, + "grad_norm": 0.8288931846618652, + "learning_rate": 1.1736123912558327e-05, + "loss": 1.9304, + "step": 18945 + }, + { + "epoch": 3.39, + "grad_norm": 2.158874273300171, + "learning_rate": 1.172420794543741e-05, + "loss": 1.8055, + "step": 18950 + }, + { + "epoch": 3.39, + "grad_norm": 0.8662157654762268, + "learning_rate": 1.171229617743674e-05, + "loss": 1.7861, + "step": 18955 + }, + { + "epoch": 3.39, + "grad_norm": 0.9218710064888, + "learning_rate": 1.170038861232401e-05, + "loss": 2.0901, + "step": 18960 + }, + { + "epoch": 3.39, + "grad_norm": 0.7493339776992798, + "learning_rate": 1.1688485253865567e-05, + "loss": 1.9827, + "step": 18965 + }, + { + "epoch": 3.4, + "grad_norm": 0.8299612402915955, + "learning_rate": 1.1676586105826442e-05, + "loss": 1.7931, + "step": 18970 + }, + { + "epoch": 3.4, + "grad_norm": 1.1126405000686646, + "learning_rate": 1.166469117197031e-05, + "loss": 1.918, + "step": 18975 + }, + { + "epoch": 3.4, + "grad_norm": 0.9959370493888855, + "learning_rate": 1.1652800456059532e-05, + "loss": 1.8812, + "step": 18980 + }, + { + "epoch": 3.4, + "grad_norm": 0.6793087124824524, + "learning_rate": 1.1640913961855132e-05, + "loss": 2.0555, + "step": 18985 + }, + { + "epoch": 3.4, + "grad_norm": 0.6176539659500122, + "learning_rate": 1.1629031693116796e-05, + "loss": 1.706, + "step": 18990 + }, + { + "epoch": 3.4, + "grad_norm": 0.9862648248672485, + "learning_rate": 1.1617153653602883e-05, + "loss": 1.55, + "step": 18995 + }, + { + "epoch": 3.4, + "grad_norm": 0.554785966873169, + "learning_rate": 1.1605279847070388e-05, + "loss": 2.1433, + "step": 19000 + }, + { + "epoch": 3.4, + "grad_norm": 0.6524524688720703, + "learning_rate": 1.1593410277275003e-05, + "loss": 1.7382, + "step": 19005 + }, + { + "epoch": 3.4, + "grad_norm": 0.8985192179679871, + "learning_rate": 1.158154494797104e-05, + "loss": 2.1055, + "step": 19010 + }, + { + "epoch": 3.4, + "grad_norm": 0.636359453201294, + "learning_rate": 1.1569683862911507e-05, + "loss": 1.927, + "step": 19015 + }, + { + "epoch": 3.4, + "grad_norm": 0.5719195604324341, + "learning_rate": 1.1557827025848047e-05, + "loss": 1.6219, + "step": 19020 + }, + { + "epoch": 3.41, + "grad_norm": 0.8190729022026062, + "learning_rate": 1.1545974440530968e-05, + "loss": 1.5272, + "step": 19025 + }, + { + "epoch": 3.41, + "grad_norm": 0.7199324369430542, + "learning_rate": 1.1534126110709246e-05, + "loss": 1.9192, + "step": 19030 + }, + { + "epoch": 3.41, + "grad_norm": 0.5589447021484375, + "learning_rate": 1.152228204013047e-05, + "loss": 1.9119, + "step": 19035 + }, + { + "epoch": 3.41, + "grad_norm": 0.8806366324424744, + "learning_rate": 1.151044223254092e-05, + "loss": 1.6215, + "step": 19040 + }, + { + "epoch": 3.41, + "grad_norm": 0.65036940574646, + "learning_rate": 1.1498606691685526e-05, + "loss": 1.8842, + "step": 19045 + }, + { + "epoch": 3.41, + "grad_norm": 0.6128280162811279, + "learning_rate": 1.1486775421307842e-05, + "loss": 1.9723, + "step": 19050 + }, + { + "epoch": 3.41, + "grad_norm": 0.8853219747543335, + "learning_rate": 1.1474948425150092e-05, + "loss": 1.8919, + "step": 19055 + }, + { + "epoch": 3.41, + "grad_norm": 1.1091639995574951, + "learning_rate": 1.1463125706953156e-05, + "loss": 2.0356, + "step": 19060 + }, + { + "epoch": 3.41, + "grad_norm": 0.6628566980361938, + "learning_rate": 1.145130727045653e-05, + "loss": 1.9267, + "step": 19065 + }, + { + "epoch": 3.41, + "grad_norm": 1.4559956789016724, + "learning_rate": 1.1439493119398381e-05, + "loss": 1.7764, + "step": 19070 + }, + { + "epoch": 3.41, + "grad_norm": 0.5751055479049683, + "learning_rate": 1.1427683257515515e-05, + "loss": 1.5434, + "step": 19075 + }, + { + "epoch": 3.42, + "grad_norm": 0.7229131460189819, + "learning_rate": 1.1415877688543378e-05, + "loss": 1.4448, + "step": 19080 + }, + { + "epoch": 3.42, + "grad_norm": 0.6522566676139832, + "learning_rate": 1.1404076416216072e-05, + "loss": 1.9578, + "step": 19085 + }, + { + "epoch": 3.42, + "grad_norm": 1.1385724544525146, + "learning_rate": 1.139227944426631e-05, + "loss": 1.794, + "step": 19090 + }, + { + "epoch": 3.42, + "grad_norm": 1.248424768447876, + "learning_rate": 1.1380486776425477e-05, + "loss": 1.6419, + "step": 19095 + }, + { + "epoch": 3.42, + "grad_norm": 0.9762007594108582, + "learning_rate": 1.136869841642357e-05, + "loss": 1.6544, + "step": 19100 + }, + { + "epoch": 3.42, + "grad_norm": 0.4511853754520416, + "learning_rate": 1.135691436798924e-05, + "loss": 1.8262, + "step": 19105 + }, + { + "epoch": 3.42, + "grad_norm": 0.8737003803253174, + "learning_rate": 1.1345134634849775e-05, + "loss": 1.9587, + "step": 19110 + }, + { + "epoch": 3.42, + "grad_norm": 0.5674211382865906, + "learning_rate": 1.133335922073109e-05, + "loss": 1.9214, + "step": 19115 + }, + { + "epoch": 3.42, + "grad_norm": 0.7211077213287354, + "learning_rate": 1.1321588129357746e-05, + "loss": 1.7593, + "step": 19120 + }, + { + "epoch": 3.42, + "grad_norm": 1.222629427909851, + "learning_rate": 1.130982136445291e-05, + "loss": 1.971, + "step": 19125 + }, + { + "epoch": 3.42, + "grad_norm": 1.0511137247085571, + "learning_rate": 1.1298058929738411e-05, + "loss": 1.7639, + "step": 19130 + }, + { + "epoch": 3.43, + "grad_norm": 1.5264891386032104, + "learning_rate": 1.1286300828934702e-05, + "loss": 1.7966, + "step": 19135 + }, + { + "epoch": 3.43, + "grad_norm": 1.0392403602600098, + "learning_rate": 1.1274547065760838e-05, + "loss": 1.5846, + "step": 19140 + }, + { + "epoch": 3.43, + "grad_norm": 1.030380368232727, + "learning_rate": 1.1262797643934537e-05, + "loss": 1.7171, + "step": 19145 + }, + { + "epoch": 3.43, + "grad_norm": 1.143563985824585, + "learning_rate": 1.1251052567172135e-05, + "loss": 2.1491, + "step": 19150 + }, + { + "epoch": 3.43, + "grad_norm": 0.6801722049713135, + "learning_rate": 1.1239311839188571e-05, + "loss": 1.9382, + "step": 19155 + }, + { + "epoch": 3.43, + "grad_norm": 1.0771989822387695, + "learning_rate": 1.122757546369744e-05, + "loss": 1.6079, + "step": 19160 + }, + { + "epoch": 3.43, + "grad_norm": 1.1643553972244263, + "learning_rate": 1.121584344441094e-05, + "loss": 1.8147, + "step": 19165 + }, + { + "epoch": 3.43, + "grad_norm": 1.202305555343628, + "learning_rate": 1.1204115785039901e-05, + "loss": 1.631, + "step": 19170 + }, + { + "epoch": 3.43, + "grad_norm": 0.6472001075744629, + "learning_rate": 1.1192392489293777e-05, + "loss": 1.9842, + "step": 19175 + }, + { + "epoch": 3.43, + "grad_norm": 0.7353944182395935, + "learning_rate": 1.1180673560880619e-05, + "loss": 1.6733, + "step": 19180 + }, + { + "epoch": 3.43, + "grad_norm": 0.6678537726402283, + "learning_rate": 1.1168959003507129e-05, + "loss": 1.8173, + "step": 19185 + }, + { + "epoch": 3.43, + "grad_norm": 0.6647874116897583, + "learning_rate": 1.1157248820878596e-05, + "loss": 1.7445, + "step": 19190 + }, + { + "epoch": 3.44, + "grad_norm": 0.5945031046867371, + "learning_rate": 1.1145543016698946e-05, + "loss": 1.9277, + "step": 19195 + }, + { + "epoch": 3.44, + "grad_norm": 2.9035160541534424, + "learning_rate": 1.1133841594670711e-05, + "loss": 2.0099, + "step": 19200 + }, + { + "epoch": 3.44, + "grad_norm": 0.5868393182754517, + "learning_rate": 1.1122144558495043e-05, + "loss": 1.6714, + "step": 19205 + }, + { + "epoch": 3.44, + "grad_norm": 0.993786633014679, + "learning_rate": 1.111045191187171e-05, + "loss": 1.7378, + "step": 19210 + }, + { + "epoch": 3.44, + "grad_norm": 0.8114561438560486, + "learning_rate": 1.109876365849907e-05, + "loss": 2.2337, + "step": 19215 + }, + { + "epoch": 3.44, + "grad_norm": 0.6973251700401306, + "learning_rate": 1.1087079802074112e-05, + "loss": 2.1974, + "step": 19220 + }, + { + "epoch": 3.44, + "grad_norm": 1.055870771408081, + "learning_rate": 1.1075400346292433e-05, + "loss": 1.596, + "step": 19225 + }, + { + "epoch": 3.44, + "grad_norm": 1.0499026775360107, + "learning_rate": 1.1063725294848238e-05, + "loss": 1.7357, + "step": 19230 + }, + { + "epoch": 3.44, + "grad_norm": 1.200201392173767, + "learning_rate": 1.1052054651434318e-05, + "loss": 1.8395, + "step": 19235 + }, + { + "epoch": 3.44, + "grad_norm": 0.6308977007865906, + "learning_rate": 1.1040388419742106e-05, + "loss": 2.024, + "step": 19240 + }, + { + "epoch": 3.44, + "grad_norm": 0.704451322555542, + "learning_rate": 1.1028726603461597e-05, + "loss": 1.8261, + "step": 19245 + }, + { + "epoch": 3.45, + "grad_norm": 1.1138906478881836, + "learning_rate": 1.1017069206281427e-05, + "loss": 1.6242, + "step": 19250 + }, + { + "epoch": 3.45, + "grad_norm": 0.6243778467178345, + "learning_rate": 1.1005416231888819e-05, + "loss": 1.9646, + "step": 19255 + }, + { + "epoch": 3.45, + "grad_norm": 0.723716676235199, + "learning_rate": 1.0993767683969594e-05, + "loss": 1.9183, + "step": 19260 + }, + { + "epoch": 3.45, + "grad_norm": 0.4877179265022278, + "learning_rate": 1.0982123566208185e-05, + "loss": 1.7803, + "step": 19265 + }, + { + "epoch": 3.45, + "grad_norm": 1.7348027229309082, + "learning_rate": 1.0970483882287602e-05, + "loss": 1.7823, + "step": 19270 + }, + { + "epoch": 3.45, + "grad_norm": 1.0215563774108887, + "learning_rate": 1.0958848635889482e-05, + "loss": 1.6437, + "step": 19275 + }, + { + "epoch": 3.45, + "grad_norm": 0.9767959117889404, + "learning_rate": 1.0947217830694023e-05, + "loss": 1.6133, + "step": 19280 + }, + { + "epoch": 3.45, + "grad_norm": 1.0302644968032837, + "learning_rate": 1.0935591470380046e-05, + "loss": 1.8479, + "step": 19285 + }, + { + "epoch": 3.45, + "grad_norm": 0.5490931868553162, + "learning_rate": 1.0923969558624959e-05, + "loss": 1.6583, + "step": 19290 + }, + { + "epoch": 3.45, + "grad_norm": 1.0158607959747314, + "learning_rate": 1.0912352099104762e-05, + "loss": 1.7078, + "step": 19295 + }, + { + "epoch": 3.45, + "grad_norm": 0.7652471661567688, + "learning_rate": 1.0900739095494053e-05, + "loss": 1.9895, + "step": 19300 + }, + { + "epoch": 3.46, + "grad_norm": 0.7485408782958984, + "learning_rate": 1.0889130551465999e-05, + "loss": 1.7783, + "step": 19305 + }, + { + "epoch": 3.46, + "grad_norm": 0.9016578197479248, + "learning_rate": 1.0877526470692378e-05, + "loss": 2.0336, + "step": 19310 + }, + { + "epoch": 3.46, + "grad_norm": 0.8303315043449402, + "learning_rate": 1.086592685684355e-05, + "loss": 2.0914, + "step": 19315 + }, + { + "epoch": 3.46, + "grad_norm": 0.6593153476715088, + "learning_rate": 1.0854331713588461e-05, + "loss": 1.8333, + "step": 19320 + }, + { + "epoch": 3.46, + "grad_norm": 0.5653439164161682, + "learning_rate": 1.0842741044594656e-05, + "loss": 1.7719, + "step": 19325 + }, + { + "epoch": 3.46, + "grad_norm": 0.8699783682823181, + "learning_rate": 1.0831154853528238e-05, + "loss": 2.05, + "step": 19330 + }, + { + "epoch": 3.46, + "grad_norm": 2.1618435382843018, + "learning_rate": 1.0819573144053904e-05, + "loss": 1.7721, + "step": 19335 + }, + { + "epoch": 3.46, + "grad_norm": 0.6418623924255371, + "learning_rate": 1.0807995919834946e-05, + "loss": 2.0052, + "step": 19340 + }, + { + "epoch": 3.46, + "grad_norm": 0.4572185277938843, + "learning_rate": 1.0796423184533224e-05, + "loss": 2.1008, + "step": 19345 + }, + { + "epoch": 3.46, + "grad_norm": 2.1507296562194824, + "learning_rate": 1.0784854941809192e-05, + "loss": 1.8341, + "step": 19350 + }, + { + "epoch": 3.46, + "grad_norm": 1.2364892959594727, + "learning_rate": 1.0773291195321872e-05, + "loss": 1.9426, + "step": 19355 + }, + { + "epoch": 3.47, + "grad_norm": 0.9573560357093811, + "learning_rate": 1.0761731948728857e-05, + "loss": 1.9652, + "step": 19360 + }, + { + "epoch": 3.47, + "grad_norm": 1.1250308752059937, + "learning_rate": 1.075017720568633e-05, + "loss": 1.8002, + "step": 19365 + }, + { + "epoch": 3.47, + "grad_norm": 3.7866458892822266, + "learning_rate": 1.0738626969849056e-05, + "loss": 1.7255, + "step": 19370 + }, + { + "epoch": 3.47, + "grad_norm": 1.0425763130187988, + "learning_rate": 1.0727081244870343e-05, + "loss": 1.7018, + "step": 19375 + }, + { + "epoch": 3.47, + "grad_norm": 0.9461259245872498, + "learning_rate": 1.0715540034402105e-05, + "loss": 1.9527, + "step": 19380 + }, + { + "epoch": 3.47, + "grad_norm": 1.1015312671661377, + "learning_rate": 1.0704003342094823e-05, + "loss": 1.7186, + "step": 19385 + }, + { + "epoch": 3.47, + "grad_norm": 0.7580620646476746, + "learning_rate": 1.0692471171597524e-05, + "loss": 1.8424, + "step": 19390 + }, + { + "epoch": 3.47, + "grad_norm": 1.06261146068573, + "learning_rate": 1.0680943526557829e-05, + "loss": 1.8829, + "step": 19395 + }, + { + "epoch": 3.47, + "grad_norm": 0.9854909777641296, + "learning_rate": 1.0669420410621928e-05, + "loss": 1.7523, + "step": 19400 + }, + { + "epoch": 3.47, + "grad_norm": 0.6900574564933777, + "learning_rate": 1.0657901827434563e-05, + "loss": 1.9449, + "step": 19405 + }, + { + "epoch": 3.47, + "grad_norm": 0.6487289667129517, + "learning_rate": 1.0646387780639055e-05, + "loss": 1.8822, + "step": 19410 + }, + { + "epoch": 3.48, + "grad_norm": 0.5597279667854309, + "learning_rate": 1.0634878273877297e-05, + "loss": 2.0047, + "step": 19415 + }, + { + "epoch": 3.48, + "grad_norm": 0.49827390909194946, + "learning_rate": 1.0623373310789722e-05, + "loss": 1.9747, + "step": 19420 + }, + { + "epoch": 3.48, + "grad_norm": 0.751218855381012, + "learning_rate": 1.0611872895015328e-05, + "loss": 1.7458, + "step": 19425 + }, + { + "epoch": 3.48, + "grad_norm": 1.029354214668274, + "learning_rate": 1.0600377030191701e-05, + "loss": 1.8894, + "step": 19430 + }, + { + "epoch": 3.48, + "grad_norm": 1.2062971591949463, + "learning_rate": 1.0588885719954971e-05, + "loss": 1.876, + "step": 19435 + }, + { + "epoch": 3.48, + "grad_norm": 0.9020774960517883, + "learning_rate": 1.0577398967939822e-05, + "loss": 1.855, + "step": 19440 + }, + { + "epoch": 3.48, + "grad_norm": 0.6630207300186157, + "learning_rate": 1.0565916777779519e-05, + "loss": 2.1588, + "step": 19445 + }, + { + "epoch": 3.48, + "grad_norm": 0.8399782180786133, + "learning_rate": 1.0554439153105844e-05, + "loss": 1.6518, + "step": 19450 + }, + { + "epoch": 3.48, + "grad_norm": 0.8747025728225708, + "learning_rate": 1.054296609754917e-05, + "loss": 1.7769, + "step": 19455 + }, + { + "epoch": 3.48, + "grad_norm": 0.9904999732971191, + "learning_rate": 1.0531497614738414e-05, + "loss": 1.6096, + "step": 19460 + }, + { + "epoch": 3.48, + "grad_norm": 1.0800423622131348, + "learning_rate": 1.0520033708301056e-05, + "loss": 1.9036, + "step": 19465 + }, + { + "epoch": 3.49, + "grad_norm": 2.292233467102051, + "learning_rate": 1.0508574381863095e-05, + "loss": 1.7432, + "step": 19470 + }, + { + "epoch": 3.49, + "grad_norm": 2.433584451675415, + "learning_rate": 1.049711963904913e-05, + "loss": 1.6728, + "step": 19475 + }, + { + "epoch": 3.49, + "grad_norm": 0.8242534399032593, + "learning_rate": 1.0485669483482266e-05, + "loss": 1.6268, + "step": 19480 + }, + { + "epoch": 3.49, + "grad_norm": 0.3501177728176117, + "learning_rate": 1.0474223918784178e-05, + "loss": 1.8424, + "step": 19485 + }, + { + "epoch": 3.49, + "grad_norm": 0.7704662084579468, + "learning_rate": 1.0462782948575098e-05, + "loss": 1.5104, + "step": 19490 + }, + { + "epoch": 3.49, + "grad_norm": 1.05587637424469, + "learning_rate": 1.0451346576473787e-05, + "loss": 2.0312, + "step": 19495 + }, + { + "epoch": 3.49, + "grad_norm": 0.6842361688613892, + "learning_rate": 1.0439914806097559e-05, + "loss": 1.8268, + "step": 19500 + }, + { + "epoch": 3.49, + "grad_norm": 1.027362585067749, + "learning_rate": 1.042848764106228e-05, + "loss": 2.1638, + "step": 19505 + }, + { + "epoch": 3.49, + "grad_norm": 1.474015235900879, + "learning_rate": 1.0417065084982346e-05, + "loss": 1.6934, + "step": 19510 + }, + { + "epoch": 3.49, + "grad_norm": 0.5891749858856201, + "learning_rate": 1.040564714147069e-05, + "loss": 1.8321, + "step": 19515 + }, + { + "epoch": 3.49, + "grad_norm": 0.8095927238464355, + "learning_rate": 1.0394233814138804e-05, + "loss": 1.7532, + "step": 19520 + }, + { + "epoch": 3.49, + "grad_norm": 1.7392970323562622, + "learning_rate": 1.0382825106596713e-05, + "loss": 2.0015, + "step": 19525 + }, + { + "epoch": 3.5, + "grad_norm": 1.0401033163070679, + "learning_rate": 1.0371421022452981e-05, + "loss": 2.0158, + "step": 19530 + }, + { + "epoch": 3.5, + "grad_norm": 1.081748127937317, + "learning_rate": 1.036002156531472e-05, + "loss": 2.0205, + "step": 19535 + }, + { + "epoch": 3.5, + "grad_norm": 0.47808757424354553, + "learning_rate": 1.0348626738787548e-05, + "loss": 1.4998, + "step": 19540 + }, + { + "epoch": 3.5, + "grad_norm": 0.770613968372345, + "learning_rate": 1.0337236546475642e-05, + "loss": 1.8638, + "step": 19545 + }, + { + "epoch": 3.5, + "grad_norm": 0.6831596493721008, + "learning_rate": 1.0325850991981715e-05, + "loss": 1.833, + "step": 19550 + }, + { + "epoch": 3.5, + "grad_norm": 0.8213655352592468, + "learning_rate": 1.0314470078907013e-05, + "loss": 1.8172, + "step": 19555 + }, + { + "epoch": 3.5, + "grad_norm": 0.9885990619659424, + "learning_rate": 1.0303093810851288e-05, + "loss": 1.7939, + "step": 19560 + }, + { + "epoch": 3.5, + "grad_norm": 1.137285590171814, + "learning_rate": 1.0291722191412865e-05, + "loss": 1.7968, + "step": 19565 + }, + { + "epoch": 3.5, + "grad_norm": 1.3749232292175293, + "learning_rate": 1.0280355224188556e-05, + "loss": 1.8718, + "step": 19570 + }, + { + "epoch": 3.5, + "grad_norm": 0.9204155802726746, + "learning_rate": 1.0268992912773732e-05, + "loss": 1.6898, + "step": 19575 + }, + { + "epoch": 3.5, + "grad_norm": 0.7502628564834595, + "learning_rate": 1.025763526076228e-05, + "loss": 1.669, + "step": 19580 + }, + { + "epoch": 3.51, + "grad_norm": 1.0295674800872803, + "learning_rate": 1.0246282271746612e-05, + "loss": 1.7433, + "step": 19585 + }, + { + "epoch": 3.51, + "grad_norm": 0.7040165662765503, + "learning_rate": 1.0234933949317683e-05, + "loss": 1.6466, + "step": 19590 + }, + { + "epoch": 3.51, + "grad_norm": 0.6841382384300232, + "learning_rate": 1.0223590297064933e-05, + "loss": 2.0555, + "step": 19595 + }, + { + "epoch": 3.51, + "grad_norm": 0.6829349994659424, + "learning_rate": 1.0212251318576357e-05, + "loss": 1.7514, + "step": 19600 + }, + { + "epoch": 3.51, + "grad_norm": 1.0354279279708862, + "learning_rate": 1.0200917017438478e-05, + "loss": 1.7173, + "step": 19605 + }, + { + "epoch": 3.51, + "grad_norm": 0.7070810794830322, + "learning_rate": 1.01895873972363e-05, + "loss": 2.1088, + "step": 19610 + }, + { + "epoch": 3.51, + "grad_norm": 1.9190095663070679, + "learning_rate": 1.017826246155338e-05, + "loss": 2.2198, + "step": 19615 + }, + { + "epoch": 3.51, + "grad_norm": 0.7562488913536072, + "learning_rate": 1.0166942213971792e-05, + "loss": 1.7641, + "step": 19620 + }, + { + "epoch": 3.51, + "grad_norm": 0.9604626893997192, + "learning_rate": 1.015562665807212e-05, + "loss": 1.9173, + "step": 19625 + }, + { + "epoch": 3.51, + "grad_norm": 2.0906529426574707, + "learning_rate": 1.0144315797433449e-05, + "loss": 1.9204, + "step": 19630 + }, + { + "epoch": 3.51, + "grad_norm": 1.8429858684539795, + "learning_rate": 1.0133009635633402e-05, + "loss": 1.7915, + "step": 19635 + }, + { + "epoch": 3.52, + "grad_norm": 1.0076621770858765, + "learning_rate": 1.0121708176248107e-05, + "loss": 1.6617, + "step": 19640 + }, + { + "epoch": 3.52, + "grad_norm": 0.9586327075958252, + "learning_rate": 1.0110411422852206e-05, + "loss": 1.8037, + "step": 19645 + }, + { + "epoch": 3.52, + "grad_norm": 0.7485795617103577, + "learning_rate": 1.0099119379018856e-05, + "loss": 2.0374, + "step": 19650 + }, + { + "epoch": 3.52, + "grad_norm": 0.6466397643089294, + "learning_rate": 1.0087832048319716e-05, + "loss": 1.795, + "step": 19655 + }, + { + "epoch": 3.52, + "grad_norm": 1.1926106214523315, + "learning_rate": 1.0076549434324944e-05, + "loss": 1.8342, + "step": 19660 + }, + { + "epoch": 3.52, + "grad_norm": 0.621414840221405, + "learning_rate": 1.0065271540603236e-05, + "loss": 1.7218, + "step": 19665 + }, + { + "epoch": 3.52, + "grad_norm": 1.1308784484863281, + "learning_rate": 1.005399837072177e-05, + "loss": 1.7901, + "step": 19670 + }, + { + "epoch": 3.52, + "grad_norm": 0.8960298299789429, + "learning_rate": 1.0042729928246245e-05, + "loss": 1.8848, + "step": 19675 + }, + { + "epoch": 3.52, + "grad_norm": 0.4410213232040405, + "learning_rate": 1.0031466216740865e-05, + "loss": 1.7415, + "step": 19680 + }, + { + "epoch": 3.52, + "grad_norm": 0.7294899821281433, + "learning_rate": 1.0020207239768314e-05, + "loss": 2.1899, + "step": 19685 + }, + { + "epoch": 3.52, + "grad_norm": 0.6985868215560913, + "learning_rate": 1.0008953000889803e-05, + "loss": 1.848, + "step": 19690 + }, + { + "epoch": 3.53, + "grad_norm": 0.5390515327453613, + "learning_rate": 9.997703503665051e-06, + "loss": 2.0746, + "step": 19695 + }, + { + "epoch": 3.53, + "grad_norm": 0.9798139929771423, + "learning_rate": 9.986458751652238e-06, + "loss": 1.8011, + "step": 19700 + }, + { + "epoch": 3.53, + "grad_norm": 0.7954207062721252, + "learning_rate": 9.975218748408083e-06, + "loss": 1.9957, + "step": 19705 + }, + { + "epoch": 3.53, + "grad_norm": 1.3126145601272583, + "learning_rate": 9.963983497487783e-06, + "loss": 1.8887, + "step": 19710 + }, + { + "epoch": 3.53, + "grad_norm": 0.8574180006980896, + "learning_rate": 9.95275300244505e-06, + "loss": 1.6767, + "step": 19715 + }, + { + "epoch": 3.53, + "grad_norm": 0.6159719228744507, + "learning_rate": 9.941527266832062e-06, + "loss": 2.1028, + "step": 19720 + }, + { + "epoch": 3.53, + "grad_norm": 0.8486490249633789, + "learning_rate": 9.930306294199512e-06, + "loss": 1.8083, + "step": 19725 + }, + { + "epoch": 3.53, + "grad_norm": 2.6029884815216064, + "learning_rate": 9.919090088096589e-06, + "loss": 1.7197, + "step": 19730 + }, + { + "epoch": 3.53, + "grad_norm": 0.9481103420257568, + "learning_rate": 9.907878652070962e-06, + "loss": 1.7045, + "step": 19735 + }, + { + "epoch": 3.53, + "grad_norm": 0.3375113606452942, + "learning_rate": 9.896671989668813e-06, + "loss": 1.8396, + "step": 19740 + }, + { + "epoch": 3.53, + "grad_norm": 0.7272607684135437, + "learning_rate": 9.885470104434783e-06, + "loss": 1.7523, + "step": 19745 + }, + { + "epoch": 3.54, + "grad_norm": 1.5993422269821167, + "learning_rate": 9.874272999912013e-06, + "loss": 2.0852, + "step": 19750 + }, + { + "epoch": 3.54, + "grad_norm": 0.6138380169868469, + "learning_rate": 9.863080679642143e-06, + "loss": 1.8729, + "step": 19755 + }, + { + "epoch": 3.54, + "grad_norm": 0.8599143028259277, + "learning_rate": 9.851893147165295e-06, + "loss": 2.0843, + "step": 19760 + }, + { + "epoch": 3.54, + "grad_norm": 0.7874997854232788, + "learning_rate": 9.840710406020072e-06, + "loss": 1.6064, + "step": 19765 + }, + { + "epoch": 3.54, + "grad_norm": 0.9153751730918884, + "learning_rate": 9.829532459743579e-06, + "loss": 1.6868, + "step": 19770 + }, + { + "epoch": 3.54, + "grad_norm": 0.6881905794143677, + "learning_rate": 9.818359311871367e-06, + "loss": 1.6391, + "step": 19775 + }, + { + "epoch": 3.54, + "grad_norm": 1.0899308919906616, + "learning_rate": 9.807190965937499e-06, + "loss": 1.9639, + "step": 19780 + }, + { + "epoch": 3.54, + "grad_norm": 1.196337342262268, + "learning_rate": 9.796027425474515e-06, + "loss": 1.7011, + "step": 19785 + }, + { + "epoch": 3.54, + "grad_norm": 1.8400663137435913, + "learning_rate": 9.784868694013444e-06, + "loss": 1.9638, + "step": 19790 + }, + { + "epoch": 3.54, + "grad_norm": 0.783830463886261, + "learning_rate": 9.773714775083759e-06, + "loss": 1.7963, + "step": 19795 + }, + { + "epoch": 3.54, + "grad_norm": 0.5852648615837097, + "learning_rate": 9.762565672213444e-06, + "loss": 1.6803, + "step": 19800 + }, + { + "epoch": 3.54, + "grad_norm": 0.6797320246696472, + "learning_rate": 9.751421388928961e-06, + "loss": 1.9709, + "step": 19805 + }, + { + "epoch": 3.55, + "grad_norm": 0.8105748295783997, + "learning_rate": 9.740281928755218e-06, + "loss": 2.0418, + "step": 19810 + }, + { + "epoch": 3.55, + "grad_norm": 0.5812113285064697, + "learning_rate": 9.72914729521562e-06, + "loss": 1.8629, + "step": 19815 + }, + { + "epoch": 3.55, + "grad_norm": 0.9974383115768433, + "learning_rate": 9.718017491832046e-06, + "loss": 2.231, + "step": 19820 + }, + { + "epoch": 3.55, + "grad_norm": 1.628359079360962, + "learning_rate": 9.706892522124839e-06, + "loss": 1.7025, + "step": 19825 + }, + { + "epoch": 3.55, + "grad_norm": 2.6423473358154297, + "learning_rate": 9.695772389612826e-06, + "loss": 1.7726, + "step": 19830 + }, + { + "epoch": 3.55, + "grad_norm": 0.6964113712310791, + "learning_rate": 9.684657097813282e-06, + "loss": 1.5745, + "step": 19835 + }, + { + "epoch": 3.55, + "grad_norm": 1.0182359218597412, + "learning_rate": 9.673546650241963e-06, + "loss": 2.1207, + "step": 19840 + }, + { + "epoch": 3.55, + "grad_norm": 0.8013817071914673, + "learning_rate": 9.662441050413093e-06, + "loss": 1.7262, + "step": 19845 + }, + { + "epoch": 3.55, + "grad_norm": 0.9642962217330933, + "learning_rate": 9.651340301839367e-06, + "loss": 1.9043, + "step": 19850 + }, + { + "epoch": 3.55, + "grad_norm": 0.8014324307441711, + "learning_rate": 9.64024440803194e-06, + "loss": 1.8698, + "step": 19855 + }, + { + "epoch": 3.55, + "grad_norm": 0.7242090106010437, + "learning_rate": 9.62915337250044e-06, + "loss": 1.6975, + "step": 19860 + }, + { + "epoch": 3.56, + "grad_norm": 0.8667596578598022, + "learning_rate": 9.618067198752937e-06, + "loss": 1.7597, + "step": 19865 + }, + { + "epoch": 3.56, + "grad_norm": 0.5412402749061584, + "learning_rate": 9.606985890295984e-06, + "loss": 1.7658, + "step": 19870 + }, + { + "epoch": 3.56, + "grad_norm": 1.8349127769470215, + "learning_rate": 9.595909450634586e-06, + "loss": 1.7718, + "step": 19875 + }, + { + "epoch": 3.56, + "grad_norm": 1.097348928451538, + "learning_rate": 9.58483788327221e-06, + "loss": 2.0921, + "step": 19880 + }, + { + "epoch": 3.56, + "grad_norm": 0.8967131972312927, + "learning_rate": 9.573771191710795e-06, + "loss": 1.8385, + "step": 19885 + }, + { + "epoch": 3.56, + "grad_norm": 7.748429298400879, + "learning_rate": 9.562709379450705e-06, + "loss": 1.4055, + "step": 19890 + }, + { + "epoch": 3.56, + "grad_norm": 0.7774618268013, + "learning_rate": 9.551652449990797e-06, + "loss": 2.0261, + "step": 19895 + }, + { + "epoch": 3.56, + "grad_norm": 0.6186455488204956, + "learning_rate": 9.540600406828354e-06, + "loss": 1.8112, + "step": 19900 + }, + { + "epoch": 3.56, + "grad_norm": 0.5887669324874878, + "learning_rate": 9.529553253459132e-06, + "loss": 1.9148, + "step": 19905 + }, + { + "epoch": 3.56, + "grad_norm": 0.9519124627113342, + "learning_rate": 9.518510993377331e-06, + "loss": 1.7988, + "step": 19910 + }, + { + "epoch": 3.56, + "grad_norm": 0.5737485885620117, + "learning_rate": 9.507473630075613e-06, + "loss": 1.8965, + "step": 19915 + }, + { + "epoch": 3.57, + "grad_norm": 1.0009839534759521, + "learning_rate": 9.496441167045095e-06, + "loss": 1.9366, + "step": 19920 + }, + { + "epoch": 3.57, + "grad_norm": 0.9404077529907227, + "learning_rate": 9.485413607775314e-06, + "loss": 1.7763, + "step": 19925 + }, + { + "epoch": 3.57, + "grad_norm": 0.8102321624755859, + "learning_rate": 9.474390955754293e-06, + "loss": 1.6678, + "step": 19930 + }, + { + "epoch": 3.57, + "grad_norm": 1.2429808378219604, + "learning_rate": 9.46337321446847e-06, + "loss": 1.8401, + "step": 19935 + }, + { + "epoch": 3.57, + "grad_norm": 0.66902095079422, + "learning_rate": 9.452360387402756e-06, + "loss": 1.6209, + "step": 19940 + }, + { + "epoch": 3.57, + "grad_norm": 0.5885403752326965, + "learning_rate": 9.441352478040497e-06, + "loss": 2.0758, + "step": 19945 + }, + { + "epoch": 3.57, + "grad_norm": 0.7154830694198608, + "learning_rate": 9.430349489863493e-06, + "loss": 1.9128, + "step": 19950 + }, + { + "epoch": 3.57, + "grad_norm": 0.5632511973381042, + "learning_rate": 9.419351426351958e-06, + "loss": 2.2468, + "step": 19955 + }, + { + "epoch": 3.57, + "grad_norm": 2.063032388687134, + "learning_rate": 9.40835829098458e-06, + "loss": 1.6646, + "step": 19960 + }, + { + "epoch": 3.57, + "grad_norm": 0.9876983165740967, + "learning_rate": 9.397370087238483e-06, + "loss": 1.8661, + "step": 19965 + }, + { + "epoch": 3.57, + "grad_norm": 0.797542154788971, + "learning_rate": 9.386386818589212e-06, + "loss": 2.0163, + "step": 19970 + }, + { + "epoch": 3.58, + "grad_norm": 0.8260084390640259, + "learning_rate": 9.375408488510787e-06, + "loss": 1.7415, + "step": 19975 + }, + { + "epoch": 3.58, + "grad_norm": 0.849568247795105, + "learning_rate": 9.364435100475625e-06, + "loss": 1.648, + "step": 19980 + }, + { + "epoch": 3.58, + "grad_norm": 0.8121174573898315, + "learning_rate": 9.35346665795459e-06, + "loss": 2.2069, + "step": 19985 + }, + { + "epoch": 3.58, + "grad_norm": 0.9171432256698608, + "learning_rate": 9.342503164417005e-06, + "loss": 2.1348, + "step": 19990 + }, + { + "epoch": 3.58, + "grad_norm": 0.9130405783653259, + "learning_rate": 9.331544623330606e-06, + "loss": 2.0652, + "step": 19995 + }, + { + "epoch": 3.58, + "grad_norm": 0.814951479434967, + "learning_rate": 9.320591038161574e-06, + "loss": 1.6998, + "step": 20000 + }, + { + "epoch": 3.58, + "grad_norm": 1.7941076755523682, + "learning_rate": 9.309642412374514e-06, + "loss": 1.7472, + "step": 20005 + }, + { + "epoch": 3.58, + "grad_norm": 0.9372169971466064, + "learning_rate": 9.298698749432474e-06, + "loss": 1.759, + "step": 20010 + }, + { + "epoch": 3.58, + "grad_norm": 0.6178033351898193, + "learning_rate": 9.287760052796909e-06, + "loss": 1.8053, + "step": 20015 + }, + { + "epoch": 3.58, + "grad_norm": 0.8956480622291565, + "learning_rate": 9.27682632592773e-06, + "loss": 1.8945, + "step": 20020 + }, + { + "epoch": 3.58, + "grad_norm": 0.7049922347068787, + "learning_rate": 9.265897572283263e-06, + "loss": 1.9184, + "step": 20025 + }, + { + "epoch": 3.59, + "grad_norm": 0.6929253935813904, + "learning_rate": 9.254973795320254e-06, + "loss": 1.9086, + "step": 20030 + }, + { + "epoch": 3.59, + "grad_norm": 0.7709065675735474, + "learning_rate": 9.244054998493886e-06, + "loss": 1.7822, + "step": 20035 + }, + { + "epoch": 3.59, + "grad_norm": 0.817094624042511, + "learning_rate": 9.23314118525778e-06, + "loss": 1.7846, + "step": 20040 + }, + { + "epoch": 3.59, + "grad_norm": 0.9673941135406494, + "learning_rate": 9.22223235906394e-06, + "loss": 1.8459, + "step": 20045 + }, + { + "epoch": 3.59, + "grad_norm": 0.639333188533783, + "learning_rate": 9.211328523362828e-06, + "loss": 1.8805, + "step": 20050 + }, + { + "epoch": 3.59, + "grad_norm": 1.8863416910171509, + "learning_rate": 9.200429681603318e-06, + "loss": 1.7241, + "step": 20055 + }, + { + "epoch": 3.59, + "grad_norm": 1.824625849723816, + "learning_rate": 9.189535837232701e-06, + "loss": 1.8988, + "step": 20060 + }, + { + "epoch": 3.59, + "grad_norm": 0.7470659017562866, + "learning_rate": 9.178646993696702e-06, + "loss": 1.3453, + "step": 20065 + }, + { + "epoch": 3.59, + "grad_norm": 1.1714149713516235, + "learning_rate": 9.167763154439437e-06, + "loss": 2.1238, + "step": 20070 + }, + { + "epoch": 3.59, + "grad_norm": 0.8737668991088867, + "learning_rate": 9.156884322903452e-06, + "loss": 1.9031, + "step": 20075 + }, + { + "epoch": 3.59, + "grad_norm": 1.0085196495056152, + "learning_rate": 9.146010502529717e-06, + "loss": 1.58, + "step": 20080 + }, + { + "epoch": 3.6, + "grad_norm": 0.6037978529930115, + "learning_rate": 9.13514169675761e-06, + "loss": 1.8138, + "step": 20085 + }, + { + "epoch": 3.6, + "grad_norm": 1.0545234680175781, + "learning_rate": 9.124277909024923e-06, + "loss": 1.7113, + "step": 20090 + }, + { + "epoch": 3.6, + "grad_norm": 0.7123447060585022, + "learning_rate": 9.113419142767864e-06, + "loss": 1.7098, + "step": 20095 + }, + { + "epoch": 3.6, + "grad_norm": 0.9203841090202332, + "learning_rate": 9.102565401421056e-06, + "loss": 1.6357, + "step": 20100 + }, + { + "epoch": 3.6, + "grad_norm": 0.8946130871772766, + "learning_rate": 9.091716688417515e-06, + "loss": 2.1125, + "step": 20105 + }, + { + "epoch": 3.6, + "grad_norm": 0.6853888630867004, + "learning_rate": 9.080873007188681e-06, + "loss": 1.9502, + "step": 20110 + }, + { + "epoch": 3.6, + "grad_norm": 0.7109887003898621, + "learning_rate": 9.070034361164412e-06, + "loss": 1.7719, + "step": 20115 + }, + { + "epoch": 3.6, + "grad_norm": 1.7917437553405762, + "learning_rate": 9.059200753772942e-06, + "loss": 1.8331, + "step": 20120 + }, + { + "epoch": 3.6, + "grad_norm": 0.8030440211296082, + "learning_rate": 9.048372188440942e-06, + "loss": 1.8035, + "step": 20125 + }, + { + "epoch": 3.6, + "grad_norm": 0.9415981769561768, + "learning_rate": 9.037548668593482e-06, + "loss": 2.0076, + "step": 20130 + }, + { + "epoch": 3.6, + "grad_norm": 1.300551176071167, + "learning_rate": 9.026730197654018e-06, + "loss": 1.7933, + "step": 20135 + }, + { + "epoch": 3.6, + "grad_norm": 0.8916707038879395, + "learning_rate": 9.015916779044428e-06, + "loss": 1.9135, + "step": 20140 + }, + { + "epoch": 3.61, + "grad_norm": 0.4446997046470642, + "learning_rate": 9.005108416184985e-06, + "loss": 1.8592, + "step": 20145 + }, + { + "epoch": 3.61, + "grad_norm": 0.7679259777069092, + "learning_rate": 8.994305112494364e-06, + "loss": 1.8635, + "step": 20150 + }, + { + "epoch": 3.61, + "grad_norm": 1.084495186805725, + "learning_rate": 8.983506871389648e-06, + "loss": 1.6656, + "step": 20155 + }, + { + "epoch": 3.61, + "grad_norm": 0.5544742941856384, + "learning_rate": 8.972713696286297e-06, + "loss": 2.0044, + "step": 20160 + }, + { + "epoch": 3.61, + "grad_norm": 1.2430366277694702, + "learning_rate": 8.961925590598196e-06, + "loss": 1.9011, + "step": 20165 + }, + { + "epoch": 3.61, + "grad_norm": 0.7293367385864258, + "learning_rate": 8.951142557737597e-06, + "loss": 1.8652, + "step": 20170 + }, + { + "epoch": 3.61, + "grad_norm": 0.6293432116508484, + "learning_rate": 8.940364601115167e-06, + "loss": 2.0155, + "step": 20175 + }, + { + "epoch": 3.61, + "grad_norm": 0.6995351910591125, + "learning_rate": 8.929591724139966e-06, + "loss": 1.5788, + "step": 20180 + }, + { + "epoch": 3.61, + "grad_norm": 1.0130141973495483, + "learning_rate": 8.918823930219455e-06, + "loss": 2.1183, + "step": 20185 + }, + { + "epoch": 3.61, + "grad_norm": 0.8194257020950317, + "learning_rate": 8.908061222759457e-06, + "loss": 1.8322, + "step": 20190 + }, + { + "epoch": 3.61, + "grad_norm": 3.565441608428955, + "learning_rate": 8.899454721330733e-06, + "loss": 1.7912, + "step": 20195 + }, + { + "epoch": 3.62, + "grad_norm": 0.643989622592926, + "learning_rate": 8.888701178077265e-06, + "loss": 1.8781, + "step": 20200 + }, + { + "epoch": 3.62, + "grad_norm": 0.9238978624343872, + "learning_rate": 8.877952730812116e-06, + "loss": 1.8105, + "step": 20205 + }, + { + "epoch": 3.62, + "grad_norm": 0.7382200360298157, + "learning_rate": 8.867209382935022e-06, + "loss": 1.6843, + "step": 20210 + }, + { + "epoch": 3.62, + "grad_norm": 1.2155033349990845, + "learning_rate": 8.856471137844071e-06, + "loss": 1.6075, + "step": 20215 + }, + { + "epoch": 3.62, + "grad_norm": 12.773721694946289, + "learning_rate": 8.845737998935779e-06, + "loss": 1.8195, + "step": 20220 + }, + { + "epoch": 3.62, + "grad_norm": 1.3359897136688232, + "learning_rate": 8.835009969605012e-06, + "loss": 2.098, + "step": 20225 + }, + { + "epoch": 3.62, + "grad_norm": 0.8200415968894958, + "learning_rate": 8.824287053245046e-06, + "loss": 1.639, + "step": 20230 + }, + { + "epoch": 3.62, + "grad_norm": 1.8653521537780762, + "learning_rate": 8.813569253247522e-06, + "loss": 1.714, + "step": 20235 + }, + { + "epoch": 3.62, + "grad_norm": 0.7212638258934021, + "learning_rate": 8.802856573002474e-06, + "loss": 1.5678, + "step": 20240 + }, + { + "epoch": 3.62, + "grad_norm": 0.6007484793663025, + "learning_rate": 8.792149015898321e-06, + "loss": 2.021, + "step": 20245 + }, + { + "epoch": 3.62, + "grad_norm": 0.8171059489250183, + "learning_rate": 8.781446585321836e-06, + "loss": 1.644, + "step": 20250 + }, + { + "epoch": 3.63, + "grad_norm": 0.5657952427864075, + "learning_rate": 8.770749284658198e-06, + "loss": 1.7338, + "step": 20255 + }, + { + "epoch": 3.63, + "grad_norm": 0.8464135527610779, + "learning_rate": 8.760057117290956e-06, + "loss": 1.8144, + "step": 20260 + }, + { + "epoch": 3.63, + "grad_norm": 0.8611558079719543, + "learning_rate": 8.749370086602022e-06, + "loss": 1.9741, + "step": 20265 + }, + { + "epoch": 3.63, + "grad_norm": 0.8753948211669922, + "learning_rate": 8.738688195971698e-06, + "loss": 2.034, + "step": 20270 + }, + { + "epoch": 3.63, + "grad_norm": 0.4790874421596527, + "learning_rate": 8.728011448778652e-06, + "loss": 1.8537, + "step": 20275 + }, + { + "epoch": 3.63, + "grad_norm": 0.5300187468528748, + "learning_rate": 8.717339848399947e-06, + "loss": 1.6698, + "step": 20280 + }, + { + "epoch": 3.63, + "grad_norm": 1.3581863641738892, + "learning_rate": 8.706673398210973e-06, + "loss": 1.9417, + "step": 20285 + }, + { + "epoch": 3.63, + "grad_norm": 0.47221723198890686, + "learning_rate": 8.696012101585533e-06, + "loss": 2.0264, + "step": 20290 + }, + { + "epoch": 3.63, + "grad_norm": 1.0025262832641602, + "learning_rate": 8.685355961895784e-06, + "loss": 1.7893, + "step": 20295 + }, + { + "epoch": 3.63, + "grad_norm": 1.1813714504241943, + "learning_rate": 8.67470498251225e-06, + "loss": 1.8123, + "step": 20300 + }, + { + "epoch": 3.63, + "grad_norm": 2.1088335514068604, + "learning_rate": 8.664059166803834e-06, + "loss": 1.8788, + "step": 20305 + }, + { + "epoch": 3.64, + "grad_norm": 0.9035912752151489, + "learning_rate": 8.653418518137792e-06, + "loss": 2.0769, + "step": 20310 + }, + { + "epoch": 3.64, + "grad_norm": 0.8217169046401978, + "learning_rate": 8.642783039879742e-06, + "loss": 1.7871, + "step": 20315 + }, + { + "epoch": 3.64, + "grad_norm": 1.7109413146972656, + "learning_rate": 8.632152735393682e-06, + "loss": 1.7824, + "step": 20320 + }, + { + "epoch": 3.64, + "grad_norm": 1.142771601676941, + "learning_rate": 8.621527608041969e-06, + "loss": 1.7042, + "step": 20325 + }, + { + "epoch": 3.64, + "grad_norm": 0.5646505355834961, + "learning_rate": 8.610907661185316e-06, + "loss": 1.8401, + "step": 20330 + }, + { + "epoch": 3.64, + "grad_norm": 0.9124100208282471, + "learning_rate": 8.600292898182815e-06, + "loss": 2.1127, + "step": 20335 + }, + { + "epoch": 3.64, + "grad_norm": 0.7633247971534729, + "learning_rate": 8.589683322391889e-06, + "loss": 1.9485, + "step": 20340 + }, + { + "epoch": 3.64, + "grad_norm": 0.6282685399055481, + "learning_rate": 8.579078937168344e-06, + "loss": 1.8386, + "step": 20345 + }, + { + "epoch": 3.64, + "grad_norm": 0.593157172203064, + "learning_rate": 8.568479745866344e-06, + "loss": 1.806, + "step": 20350 + }, + { + "epoch": 3.64, + "grad_norm": 0.9064671397209167, + "learning_rate": 8.557885751838384e-06, + "loss": 1.8968, + "step": 20355 + }, + { + "epoch": 3.64, + "grad_norm": 1.2595146894454956, + "learning_rate": 8.54729695843535e-06, + "loss": 1.8831, + "step": 20360 + }, + { + "epoch": 3.65, + "grad_norm": 0.8819836378097534, + "learning_rate": 8.53671336900647e-06, + "loss": 1.7677, + "step": 20365 + }, + { + "epoch": 3.65, + "grad_norm": 0.46707066893577576, + "learning_rate": 8.526134986899304e-06, + "loss": 1.7894, + "step": 20370 + }, + { + "epoch": 3.65, + "grad_norm": 3.5591564178466797, + "learning_rate": 8.5155618154598e-06, + "loss": 1.6036, + "step": 20375 + }, + { + "epoch": 3.65, + "grad_norm": 0.959492564201355, + "learning_rate": 8.504993858032234e-06, + "loss": 1.9008, + "step": 20380 + }, + { + "epoch": 3.65, + "grad_norm": 0.6960819959640503, + "learning_rate": 8.494431117959247e-06, + "loss": 1.6686, + "step": 20385 + }, + { + "epoch": 3.65, + "grad_norm": 1.4117478132247925, + "learning_rate": 8.48387359858182e-06, + "loss": 1.6402, + "step": 20390 + }, + { + "epoch": 3.65, + "grad_norm": 1.1412084102630615, + "learning_rate": 8.473321303239296e-06, + "loss": 1.7246, + "step": 20395 + }, + { + "epoch": 3.65, + "grad_norm": 0.6889460682868958, + "learning_rate": 8.462774235269348e-06, + "loss": 1.9935, + "step": 20400 + }, + { + "epoch": 3.65, + "grad_norm": 3.105839490890503, + "learning_rate": 8.452232398007993e-06, + "loss": 1.9629, + "step": 20405 + }, + { + "epoch": 3.65, + "grad_norm": 1.2569962739944458, + "learning_rate": 8.44169579478961e-06, + "loss": 1.8072, + "step": 20410 + }, + { + "epoch": 3.65, + "grad_norm": 0.9013957977294922, + "learning_rate": 8.431164428946927e-06, + "loss": 2.0294, + "step": 20415 + }, + { + "epoch": 3.66, + "grad_norm": 1.8932322263717651, + "learning_rate": 8.420638303810993e-06, + "loss": 1.7084, + "step": 20420 + }, + { + "epoch": 3.66, + "grad_norm": 0.6387518644332886, + "learning_rate": 8.410117422711227e-06, + "loss": 1.9067, + "step": 20425 + }, + { + "epoch": 3.66, + "grad_norm": 0.5416659712791443, + "learning_rate": 8.399601788975354e-06, + "loss": 1.9721, + "step": 20430 + }, + { + "epoch": 3.66, + "grad_norm": 1.3782933950424194, + "learning_rate": 8.389091405929467e-06, + "loss": 1.9061, + "step": 20435 + }, + { + "epoch": 3.66, + "grad_norm": 2.072787046432495, + "learning_rate": 8.37858627689799e-06, + "loss": 1.9161, + "step": 20440 + }, + { + "epoch": 3.66, + "grad_norm": 0.5644092559814453, + "learning_rate": 8.368086405203696e-06, + "loss": 1.7495, + "step": 20445 + }, + { + "epoch": 3.66, + "grad_norm": 1.1922764778137207, + "learning_rate": 8.357591794167666e-06, + "loss": 1.8278, + "step": 20450 + }, + { + "epoch": 3.66, + "grad_norm": 0.7478966116905212, + "learning_rate": 8.34710244710935e-06, + "loss": 1.8286, + "step": 20455 + }, + { + "epoch": 3.66, + "grad_norm": 0.6147327423095703, + "learning_rate": 8.336618367346505e-06, + "loss": 1.615, + "step": 20460 + }, + { + "epoch": 3.66, + "grad_norm": 1.372970700263977, + "learning_rate": 8.326139558195242e-06, + "loss": 1.9454, + "step": 20465 + }, + { + "epoch": 3.66, + "grad_norm": 0.6343486905097961, + "learning_rate": 8.315666022969998e-06, + "loss": 1.8179, + "step": 20470 + }, + { + "epoch": 3.66, + "grad_norm": 0.6342081427574158, + "learning_rate": 8.30519776498354e-06, + "loss": 1.7307, + "step": 20475 + }, + { + "epoch": 3.67, + "grad_norm": 1.3767012357711792, + "learning_rate": 8.294734787546975e-06, + "loss": 1.8573, + "step": 20480 + }, + { + "epoch": 3.67, + "grad_norm": 0.8521339297294617, + "learning_rate": 8.284277093969737e-06, + "loss": 1.9258, + "step": 20485 + }, + { + "epoch": 3.67, + "grad_norm": 0.3690893054008484, + "learning_rate": 8.273824687559573e-06, + "loss": 1.9283, + "step": 20490 + }, + { + "epoch": 3.67, + "grad_norm": 0.61278235912323, + "learning_rate": 8.263377571622563e-06, + "loss": 1.6369, + "step": 20495 + }, + { + "epoch": 3.67, + "grad_norm": 0.7713987827301025, + "learning_rate": 8.252935749463132e-06, + "loss": 1.9637, + "step": 20500 + }, + { + "epoch": 3.67, + "grad_norm": 0.4912928640842438, + "learning_rate": 8.242499224384012e-06, + "loss": 1.9876, + "step": 20505 + }, + { + "epoch": 3.67, + "grad_norm": 1.3459941148757935, + "learning_rate": 8.232067999686269e-06, + "loss": 1.5705, + "step": 20510 + }, + { + "epoch": 3.67, + "grad_norm": 0.9344834089279175, + "learning_rate": 8.221642078669296e-06, + "loss": 1.7796, + "step": 20515 + }, + { + "epoch": 3.67, + "grad_norm": 1.331256628036499, + "learning_rate": 8.211221464630788e-06, + "loss": 1.85, + "step": 20520 + }, + { + "epoch": 3.67, + "grad_norm": 0.6549345850944519, + "learning_rate": 8.200806160866784e-06, + "loss": 1.7886, + "step": 20525 + }, + { + "epoch": 3.67, + "grad_norm": 0.852634847164154, + "learning_rate": 8.19039617067163e-06, + "loss": 1.6675, + "step": 20530 + }, + { + "epoch": 3.68, + "grad_norm": 0.6992741227149963, + "learning_rate": 8.179991497337997e-06, + "loss": 1.9973, + "step": 20535 + }, + { + "epoch": 3.68, + "grad_norm": 0.8704904317855835, + "learning_rate": 8.169592144156885e-06, + "loss": 1.7932, + "step": 20540 + }, + { + "epoch": 3.68, + "grad_norm": 4.262266635894775, + "learning_rate": 8.159198114417591e-06, + "loss": 1.679, + "step": 20545 + }, + { + "epoch": 3.68, + "grad_norm": 0.6595445275306702, + "learning_rate": 8.148809411407723e-06, + "loss": 1.8552, + "step": 20550 + }, + { + "epoch": 3.68, + "grad_norm": 0.7737083435058594, + "learning_rate": 8.13842603841323e-06, + "loss": 1.8383, + "step": 20555 + }, + { + "epoch": 3.68, + "grad_norm": 0.8114953637123108, + "learning_rate": 8.128047998718363e-06, + "loss": 1.8305, + "step": 20560 + }, + { + "epoch": 3.68, + "grad_norm": 0.9654433131217957, + "learning_rate": 8.117675295605685e-06, + "loss": 1.7202, + "step": 20565 + }, + { + "epoch": 3.68, + "grad_norm": 1.0531843900680542, + "learning_rate": 8.10730793235608e-06, + "loss": 1.437, + "step": 20570 + }, + { + "epoch": 3.68, + "grad_norm": 0.9250448942184448, + "learning_rate": 8.096945912248721e-06, + "loss": 1.4706, + "step": 20575 + }, + { + "epoch": 3.68, + "grad_norm": 0.9229045510292053, + "learning_rate": 8.08658923856111e-06, + "loss": 1.8187, + "step": 20580 + }, + { + "epoch": 3.68, + "grad_norm": 0.9853529930114746, + "learning_rate": 8.076237914569063e-06, + "loss": 1.7397, + "step": 20585 + }, + { + "epoch": 3.69, + "grad_norm": 0.8005128502845764, + "learning_rate": 8.065891943546675e-06, + "loss": 1.9982, + "step": 20590 + }, + { + "epoch": 3.69, + "grad_norm": 0.5977509021759033, + "learning_rate": 8.055551328766378e-06, + "loss": 1.8997, + "step": 20595 + }, + { + "epoch": 3.69, + "grad_norm": 1.766401767730713, + "learning_rate": 8.045216073498896e-06, + "loss": 1.6968, + "step": 20600 + }, + { + "epoch": 3.69, + "grad_norm": 1.244626760482788, + "learning_rate": 8.03488618101327e-06, + "loss": 1.7645, + "step": 20605 + }, + { + "epoch": 3.69, + "grad_norm": 0.5766518115997314, + "learning_rate": 8.02456165457682e-06, + "loss": 1.8566, + "step": 20610 + }, + { + "epoch": 3.69, + "grad_norm": 0.8886863589286804, + "learning_rate": 8.014242497455191e-06, + "loss": 1.7796, + "step": 20615 + }, + { + "epoch": 3.69, + "grad_norm": 0.8238760232925415, + "learning_rate": 8.00392871291232e-06, + "loss": 2.0147, + "step": 20620 + }, + { + "epoch": 3.69, + "grad_norm": 0.5203852653503418, + "learning_rate": 7.993620304210447e-06, + "loss": 2.0413, + "step": 20625 + }, + { + "epoch": 3.69, + "grad_norm": 0.9186719059944153, + "learning_rate": 7.983317274610125e-06, + "loss": 1.8603, + "step": 20630 + }, + { + "epoch": 3.69, + "grad_norm": 1.096434473991394, + "learning_rate": 7.973019627370182e-06, + "loss": 1.6407, + "step": 20635 + }, + { + "epoch": 3.69, + "grad_norm": 0.8249112963676453, + "learning_rate": 7.96272736574774e-06, + "loss": 1.7465, + "step": 20640 + }, + { + "epoch": 3.7, + "grad_norm": 1.604615330696106, + "learning_rate": 7.95244049299825e-06, + "loss": 1.7763, + "step": 20645 + }, + { + "epoch": 3.7, + "grad_norm": 0.730076789855957, + "learning_rate": 7.942159012375434e-06, + "loss": 1.9924, + "step": 20650 + }, + { + "epoch": 3.7, + "grad_norm": 1.068292260169983, + "learning_rate": 7.93188292713131e-06, + "loss": 2.1643, + "step": 20655 + }, + { + "epoch": 3.7, + "grad_norm": 1.0101828575134277, + "learning_rate": 7.92161224051621e-06, + "loss": 1.4485, + "step": 20660 + }, + { + "epoch": 3.7, + "grad_norm": 0.4581923186779022, + "learning_rate": 7.911346955778722e-06, + "loss": 1.9251, + "step": 20665 + }, + { + "epoch": 3.7, + "grad_norm": 0.5387589335441589, + "learning_rate": 7.901087076165756e-06, + "loss": 1.9034, + "step": 20670 + }, + { + "epoch": 3.7, + "grad_norm": 0.7266547679901123, + "learning_rate": 7.8908326049225e-06, + "loss": 1.8593, + "step": 20675 + }, + { + "epoch": 3.7, + "grad_norm": 2.4284040927886963, + "learning_rate": 7.880583545292441e-06, + "loss": 1.5051, + "step": 20680 + }, + { + "epoch": 3.7, + "grad_norm": 0.9465983510017395, + "learning_rate": 7.870339900517335e-06, + "loss": 1.8325, + "step": 20685 + }, + { + "epoch": 3.7, + "grad_norm": 0.9285613894462585, + "learning_rate": 7.86010167383724e-06, + "loss": 1.8667, + "step": 20690 + }, + { + "epoch": 3.7, + "grad_norm": 0.7407609820365906, + "learning_rate": 7.849868868490506e-06, + "loss": 1.8178, + "step": 20695 + }, + { + "epoch": 3.71, + "grad_norm": 5.604633331298828, + "learning_rate": 7.839641487713745e-06, + "loss": 1.994, + "step": 20700 + }, + { + "epoch": 3.71, + "grad_norm": 0.8995658159255981, + "learning_rate": 7.829419534741875e-06, + "loss": 1.9699, + "step": 20705 + }, + { + "epoch": 3.71, + "grad_norm": 1.138853907585144, + "learning_rate": 7.819203012808091e-06, + "loss": 1.9048, + "step": 20710 + }, + { + "epoch": 3.71, + "grad_norm": 0.8742569088935852, + "learning_rate": 7.808991925143869e-06, + "loss": 2.0381, + "step": 20715 + }, + { + "epoch": 3.71, + "grad_norm": 0.40514862537384033, + "learning_rate": 7.798786274978969e-06, + "loss": 2.1029, + "step": 20720 + }, + { + "epoch": 3.71, + "grad_norm": 1.0639851093292236, + "learning_rate": 7.788586065541431e-06, + "loss": 1.6771, + "step": 20725 + }, + { + "epoch": 3.71, + "grad_norm": 2.2875404357910156, + "learning_rate": 7.778391300057553e-06, + "loss": 1.8035, + "step": 20730 + }, + { + "epoch": 3.71, + "grad_norm": 0.6936883330345154, + "learning_rate": 7.768201981751944e-06, + "loss": 1.8963, + "step": 20735 + }, + { + "epoch": 3.71, + "grad_norm": 0.7434184551239014, + "learning_rate": 7.758018113847473e-06, + "loss": 1.6205, + "step": 20740 + }, + { + "epoch": 3.71, + "grad_norm": 1.422929048538208, + "learning_rate": 7.747839699565288e-06, + "loss": 2.0673, + "step": 20745 + }, + { + "epoch": 3.71, + "grad_norm": 0.9202067852020264, + "learning_rate": 7.737666742124816e-06, + "loss": 1.6292, + "step": 20750 + }, + { + "epoch": 3.72, + "grad_norm": 0.48007720708847046, + "learning_rate": 7.727499244743747e-06, + "loss": 1.8354, + "step": 20755 + }, + { + "epoch": 3.72, + "grad_norm": 1.398300051689148, + "learning_rate": 7.717337210638049e-06, + "loss": 1.7383, + "step": 20760 + }, + { + "epoch": 3.72, + "grad_norm": 1.650846004486084, + "learning_rate": 7.707180643021969e-06, + "loss": 1.8777, + "step": 20765 + }, + { + "epoch": 3.72, + "grad_norm": 3.9538509845733643, + "learning_rate": 7.697029545108025e-06, + "loss": 1.6718, + "step": 20770 + }, + { + "epoch": 3.72, + "grad_norm": 0.817905843257904, + "learning_rate": 7.686883920106986e-06, + "loss": 1.6816, + "step": 20775 + }, + { + "epoch": 3.72, + "grad_norm": 0.4926592707633972, + "learning_rate": 7.676743771227906e-06, + "loss": 1.7673, + "step": 20780 + }, + { + "epoch": 3.72, + "grad_norm": 0.5661248564720154, + "learning_rate": 7.666609101678121e-06, + "loss": 1.7792, + "step": 20785 + }, + { + "epoch": 3.72, + "grad_norm": 0.8012621402740479, + "learning_rate": 7.656479914663195e-06, + "loss": 1.7374, + "step": 20790 + }, + { + "epoch": 3.72, + "grad_norm": 1.1758484840393066, + "learning_rate": 7.646356213386988e-06, + "loss": 1.5582, + "step": 20795 + }, + { + "epoch": 3.72, + "grad_norm": 1.1315603256225586, + "learning_rate": 7.63623800105162e-06, + "loss": 2.1121, + "step": 20800 + }, + { + "epoch": 3.72, + "grad_norm": 0.9538705348968506, + "learning_rate": 7.626125280857471e-06, + "loss": 1.5779, + "step": 20805 + }, + { + "epoch": 3.72, + "grad_norm": 0.8569843769073486, + "learning_rate": 7.616018056003191e-06, + "loss": 1.8019, + "step": 20810 + }, + { + "epoch": 3.73, + "grad_norm": 1.2387229204177856, + "learning_rate": 7.605916329685672e-06, + "loss": 1.6668, + "step": 20815 + }, + { + "epoch": 3.73, + "grad_norm": 0.8599191308021545, + "learning_rate": 7.595820105100094e-06, + "loss": 1.7848, + "step": 20820 + }, + { + "epoch": 3.73, + "grad_norm": 0.7321486473083496, + "learning_rate": 7.585729385439869e-06, + "loss": 1.7338, + "step": 20825 + }, + { + "epoch": 3.73, + "grad_norm": 0.7330595254898071, + "learning_rate": 7.575644173896687e-06, + "loss": 1.9163, + "step": 20830 + }, + { + "epoch": 3.73, + "grad_norm": 0.7383427619934082, + "learning_rate": 7.5655644736604945e-06, + "loss": 2.0798, + "step": 20835 + }, + { + "epoch": 3.73, + "grad_norm": 0.7931711673736572, + "learning_rate": 7.555490287919497e-06, + "loss": 1.9642, + "step": 20840 + }, + { + "epoch": 3.73, + "grad_norm": 0.7924222350120544, + "learning_rate": 7.545421619860133e-06, + "loss": 1.6436, + "step": 20845 + }, + { + "epoch": 3.73, + "grad_norm": 1.1156904697418213, + "learning_rate": 7.53535847266712e-06, + "loss": 1.853, + "step": 20850 + }, + { + "epoch": 3.73, + "grad_norm": 0.7994991540908813, + "learning_rate": 7.5253008495234255e-06, + "loss": 1.7786, + "step": 20855 + }, + { + "epoch": 3.73, + "grad_norm": 1.3960700035095215, + "learning_rate": 7.515248753610266e-06, + "loss": 1.9, + "step": 20860 + }, + { + "epoch": 3.73, + "grad_norm": 0.9603045582771301, + "learning_rate": 7.505202188107113e-06, + "loss": 1.8195, + "step": 20865 + }, + { + "epoch": 3.74, + "grad_norm": 1.181504487991333, + "learning_rate": 7.495161156191679e-06, + "loss": 1.6414, + "step": 20870 + }, + { + "epoch": 3.74, + "grad_norm": 0.9459575414657593, + "learning_rate": 7.485125661039927e-06, + "loss": 1.9267, + "step": 20875 + }, + { + "epoch": 3.74, + "grad_norm": 1.0895607471466064, + "learning_rate": 7.47509570582608e-06, + "loss": 1.8476, + "step": 20880 + }, + { + "epoch": 3.74, + "grad_norm": 0.9939213395118713, + "learning_rate": 7.465071293722606e-06, + "loss": 1.8669, + "step": 20885 + }, + { + "epoch": 3.74, + "grad_norm": 0.4708957076072693, + "learning_rate": 7.455052427900217e-06, + "loss": 1.7555, + "step": 20890 + }, + { + "epoch": 3.74, + "grad_norm": 0.8547396659851074, + "learning_rate": 7.445039111527866e-06, + "loss": 1.8426, + "step": 20895 + }, + { + "epoch": 3.74, + "grad_norm": 0.9265462160110474, + "learning_rate": 7.435031347772767e-06, + "loss": 2.0446, + "step": 20900 + }, + { + "epoch": 3.74, + "grad_norm": 0.7409059405326843, + "learning_rate": 7.425029139800349e-06, + "loss": 2.0207, + "step": 20905 + }, + { + "epoch": 3.74, + "grad_norm": 0.8008315563201904, + "learning_rate": 7.415032490774318e-06, + "loss": 1.6952, + "step": 20910 + }, + { + "epoch": 3.74, + "grad_norm": 0.6471626162528992, + "learning_rate": 7.4050414038565856e-06, + "loss": 1.807, + "step": 20915 + }, + { + "epoch": 3.74, + "grad_norm": 0.8940064907073975, + "learning_rate": 7.3950558822073354e-06, + "loss": 1.6712, + "step": 20920 + }, + { + "epoch": 3.75, + "grad_norm": 0.8687788844108582, + "learning_rate": 7.3850759289849745e-06, + "loss": 1.9076, + "step": 20925 + }, + { + "epoch": 3.75, + "grad_norm": 0.44189202785491943, + "learning_rate": 7.3751015473461634e-06, + "loss": 1.7675, + "step": 20930 + }, + { + "epoch": 3.75, + "grad_norm": 0.8087650537490845, + "learning_rate": 7.3671260556953974e-06, + "loss": 1.9859, + "step": 20935 + }, + { + "epoch": 3.75, + "grad_norm": 1.3803483247756958, + "learning_rate": 7.357161710856081e-06, + "loss": 2.1581, + "step": 20940 + }, + { + "epoch": 3.75, + "grad_norm": 0.845194935798645, + "learning_rate": 7.347202946429546e-06, + "loss": 1.6597, + "step": 20945 + }, + { + "epoch": 3.75, + "grad_norm": 1.7201052904129028, + "learning_rate": 7.337249765565737e-06, + "loss": 1.9686, + "step": 20950 + }, + { + "epoch": 3.75, + "grad_norm": 0.8273900747299194, + "learning_rate": 7.327302171412848e-06, + "loss": 1.7416, + "step": 20955 + }, + { + "epoch": 3.75, + "grad_norm": 0.9852076768875122, + "learning_rate": 7.317360167117274e-06, + "loss": 1.5727, + "step": 20960 + }, + { + "epoch": 3.75, + "grad_norm": 0.5173888802528381, + "learning_rate": 7.307423755823678e-06, + "loss": 1.8316, + "step": 20965 + }, + { + "epoch": 3.75, + "grad_norm": 0.5297005772590637, + "learning_rate": 7.2974929406749246e-06, + "loss": 1.5954, + "step": 20970 + }, + { + "epoch": 3.75, + "grad_norm": 0.7687706351280212, + "learning_rate": 7.287567724812125e-06, + "loss": 2.2233, + "step": 20975 + }, + { + "epoch": 3.76, + "grad_norm": 0.9378607869148254, + "learning_rate": 7.277648111374616e-06, + "loss": 1.8363, + "step": 20980 + }, + { + "epoch": 3.76, + "grad_norm": 0.6281090378761292, + "learning_rate": 7.2677341034999625e-06, + "loss": 1.5648, + "step": 20985 + }, + { + "epoch": 3.76, + "grad_norm": 1.1043148040771484, + "learning_rate": 7.257825704323964e-06, + "loss": 1.8477, + "step": 20990 + }, + { + "epoch": 3.76, + "grad_norm": 1.0617643594741821, + "learning_rate": 7.247922916980624e-06, + "loss": 1.6808, + "step": 20995 + }, + { + "epoch": 3.76, + "grad_norm": 2.293015480041504, + "learning_rate": 7.238025744602186e-06, + "loss": 1.8726, + "step": 21000 + }, + { + "epoch": 3.76, + "grad_norm": 0.5096902847290039, + "learning_rate": 7.22813419031913e-06, + "loss": 1.7613, + "step": 21005 + }, + { + "epoch": 3.76, + "grad_norm": 0.8138590455055237, + "learning_rate": 7.218248257260127e-06, + "loss": 1.6788, + "step": 21010 + }, + { + "epoch": 3.76, + "grad_norm": 0.5406817197799683, + "learning_rate": 7.2083679485520974e-06, + "loss": 1.8645, + "step": 21015 + }, + { + "epoch": 3.76, + "grad_norm": 0.9503873586654663, + "learning_rate": 7.198493267320178e-06, + "loss": 1.6199, + "step": 21020 + }, + { + "epoch": 3.76, + "grad_norm": 1.1343449354171753, + "learning_rate": 7.1886242166877084e-06, + "loss": 1.4947, + "step": 21025 + }, + { + "epoch": 3.76, + "grad_norm": 0.753250002861023, + "learning_rate": 7.1787607997762654e-06, + "loss": 2.0704, + "step": 21030 + }, + { + "epoch": 3.77, + "grad_norm": 0.606606662273407, + "learning_rate": 7.168903019705642e-06, + "loss": 1.5729, + "step": 21035 + }, + { + "epoch": 3.77, + "grad_norm": 1.2045537233352661, + "learning_rate": 7.159050879593843e-06, + "loss": 1.6266, + "step": 21040 + }, + { + "epoch": 3.77, + "grad_norm": 1.3641705513000488, + "learning_rate": 7.149204382557095e-06, + "loss": 1.6646, + "step": 21045 + }, + { + "epoch": 3.77, + "grad_norm": 1.94281804561615, + "learning_rate": 7.139363531709825e-06, + "loss": 1.7295, + "step": 21050 + }, + { + "epoch": 3.77, + "grad_norm": 0.4692668318748474, + "learning_rate": 7.129528330164695e-06, + "loss": 1.7785, + "step": 21055 + }, + { + "epoch": 3.77, + "grad_norm": 1.879866123199463, + "learning_rate": 7.11969878103256e-06, + "loss": 1.866, + "step": 21060 + }, + { + "epoch": 3.77, + "grad_norm": 0.6057711243629456, + "learning_rate": 7.109874887422499e-06, + "loss": 1.9315, + "step": 21065 + }, + { + "epoch": 3.77, + "grad_norm": 0.5978086590766907, + "learning_rate": 7.1000566524418055e-06, + "loss": 1.9369, + "step": 21070 + }, + { + "epoch": 3.77, + "grad_norm": 1.631386637687683, + "learning_rate": 7.090244079195976e-06, + "loss": 1.6194, + "step": 21075 + }, + { + "epoch": 3.77, + "grad_norm": 0.8467076420783997, + "learning_rate": 7.080437170788723e-06, + "loss": 1.9249, + "step": 21080 + }, + { + "epoch": 3.77, + "grad_norm": 0.6821494698524475, + "learning_rate": 7.070635930321948e-06, + "loss": 1.7521, + "step": 21085 + }, + { + "epoch": 3.78, + "grad_norm": 0.9568500518798828, + "learning_rate": 7.060840360895785e-06, + "loss": 1.7341, + "step": 21090 + }, + { + "epoch": 3.78, + "grad_norm": 1.729252576828003, + "learning_rate": 7.051050465608558e-06, + "loss": 1.6962, + "step": 21095 + }, + { + "epoch": 3.78, + "grad_norm": 1.2900949716567993, + "learning_rate": 7.041266247556813e-06, + "loss": 2.2104, + "step": 21100 + }, + { + "epoch": 3.78, + "grad_norm": 1.797668695449829, + "learning_rate": 7.031487709835272e-06, + "loss": 1.7064, + "step": 21105 + }, + { + "epoch": 3.78, + "grad_norm": 0.8635187745094299, + "learning_rate": 7.021714855536893e-06, + "loss": 1.7889, + "step": 21110 + }, + { + "epoch": 3.78, + "grad_norm": 0.6783146858215332, + "learning_rate": 7.011947687752804e-06, + "loss": 1.8269, + "step": 21115 + }, + { + "epoch": 3.78, + "grad_norm": 1.27726411819458, + "learning_rate": 7.002186209572359e-06, + "loss": 1.9167, + "step": 21120 + }, + { + "epoch": 3.78, + "grad_norm": 0.873021125793457, + "learning_rate": 6.9924304240831045e-06, + "loss": 1.7524, + "step": 21125 + }, + { + "epoch": 3.78, + "grad_norm": 0.6796855926513672, + "learning_rate": 6.982680334370786e-06, + "loss": 1.899, + "step": 21130 + }, + { + "epoch": 3.78, + "grad_norm": 1.1195898056030273, + "learning_rate": 6.972935943519354e-06, + "loss": 2.2136, + "step": 21135 + }, + { + "epoch": 3.78, + "grad_norm": 1.5191144943237305, + "learning_rate": 6.963197254610937e-06, + "loss": 2.0849, + "step": 21140 + }, + { + "epoch": 3.78, + "grad_norm": 0.978111982345581, + "learning_rate": 6.9534642707258875e-06, + "loss": 1.9883, + "step": 21145 + }, + { + "epoch": 3.79, + "grad_norm": 1.8795119524002075, + "learning_rate": 6.943736994942721e-06, + "loss": 1.6622, + "step": 21150 + }, + { + "epoch": 3.79, + "grad_norm": 1.2708956003189087, + "learning_rate": 6.934015430338176e-06, + "loss": 1.9199, + "step": 21155 + }, + { + "epoch": 3.79, + "grad_norm": 0.7446703910827637, + "learning_rate": 6.924299579987173e-06, + "loss": 1.885, + "step": 21160 + }, + { + "epoch": 3.79, + "grad_norm": 1.003434658050537, + "learning_rate": 6.914589446962824e-06, + "loss": 1.8679, + "step": 21165 + }, + { + "epoch": 3.79, + "grad_norm": 0.9047329425811768, + "learning_rate": 6.904885034336448e-06, + "loss": 1.7717, + "step": 21170 + }, + { + "epoch": 3.79, + "grad_norm": 0.6826780438423157, + "learning_rate": 6.895186345177521e-06, + "loss": 1.9632, + "step": 21175 + }, + { + "epoch": 3.79, + "grad_norm": 0.5355373024940491, + "learning_rate": 6.885493382553737e-06, + "loss": 2.0283, + "step": 21180 + }, + { + "epoch": 3.79, + "grad_norm": 0.8014163374900818, + "learning_rate": 6.875806149530975e-06, + "loss": 1.7532, + "step": 21185 + }, + { + "epoch": 3.79, + "grad_norm": 0.9986274242401123, + "learning_rate": 6.866124649173295e-06, + "loss": 1.9851, + "step": 21190 + }, + { + "epoch": 3.79, + "grad_norm": 1.0653488636016846, + "learning_rate": 6.8564488845429515e-06, + "loss": 2.1604, + "step": 21195 + }, + { + "epoch": 3.79, + "grad_norm": 1.1097596883773804, + "learning_rate": 6.846778858700373e-06, + "loss": 1.6555, + "step": 21200 + }, + { + "epoch": 3.8, + "grad_norm": 15.828474044799805, + "learning_rate": 6.837114574704176e-06, + "loss": 1.6847, + "step": 21205 + }, + { + "epoch": 3.8, + "grad_norm": 3.438732385635376, + "learning_rate": 6.827456035611166e-06, + "loss": 1.7038, + "step": 21210 + }, + { + "epoch": 3.8, + "grad_norm": 0.7653161287307739, + "learning_rate": 6.817803244476331e-06, + "loss": 1.8852, + "step": 21215 + }, + { + "epoch": 3.8, + "grad_norm": 1.15346360206604, + "learning_rate": 6.8081562043528445e-06, + "loss": 1.6198, + "step": 21220 + }, + { + "epoch": 3.8, + "grad_norm": 0.6364823579788208, + "learning_rate": 6.7985149182920564e-06, + "loss": 1.8619, + "step": 21225 + }, + { + "epoch": 3.8, + "grad_norm": 1.149183988571167, + "learning_rate": 6.788879389343486e-06, + "loss": 1.9567, + "step": 21230 + }, + { + "epoch": 3.8, + "grad_norm": 0.8027864694595337, + "learning_rate": 6.7792496205548475e-06, + "loss": 1.9675, + "step": 21235 + }, + { + "epoch": 3.8, + "grad_norm": 0.7436734437942505, + "learning_rate": 6.769625614972036e-06, + "loss": 1.8813, + "step": 21240 + }, + { + "epoch": 3.8, + "grad_norm": 1.2169238328933716, + "learning_rate": 6.760007375639099e-06, + "loss": 1.8129, + "step": 21245 + }, + { + "epoch": 3.8, + "grad_norm": 0.6234614253044128, + "learning_rate": 6.750394905598284e-06, + "loss": 1.8025, + "step": 21250 + }, + { + "epoch": 3.8, + "grad_norm": 2.2738096714019775, + "learning_rate": 6.740788207890017e-06, + "loss": 2.0495, + "step": 21255 + }, + { + "epoch": 3.81, + "grad_norm": 4.580897808074951, + "learning_rate": 6.731187285552865e-06, + "loss": 1.4265, + "step": 21260 + }, + { + "epoch": 3.81, + "grad_norm": 1.344910740852356, + "learning_rate": 6.721592141623606e-06, + "loss": 1.9222, + "step": 21265 + }, + { + "epoch": 3.81, + "grad_norm": 1.4023634195327759, + "learning_rate": 6.712002779137169e-06, + "loss": 1.7676, + "step": 21270 + }, + { + "epoch": 3.81, + "grad_norm": 2.361953020095825, + "learning_rate": 6.702419201126661e-06, + "loss": 1.6256, + "step": 21275 + }, + { + "epoch": 3.81, + "grad_norm": 0.7495207190513611, + "learning_rate": 6.69284141062336e-06, + "loss": 1.9166, + "step": 21280 + }, + { + "epoch": 3.81, + "grad_norm": 0.5383853316307068, + "learning_rate": 6.68326941065672e-06, + "loss": 1.8247, + "step": 21285 + }, + { + "epoch": 3.81, + "grad_norm": 0.7718889713287354, + "learning_rate": 6.67370320425435e-06, + "loss": 1.959, + "step": 21290 + }, + { + "epoch": 3.81, + "grad_norm": 0.666534423828125, + "learning_rate": 6.6641427944420185e-06, + "loss": 1.9477, + "step": 21295 + }, + { + "epoch": 3.81, + "grad_norm": 1.1692932844161987, + "learning_rate": 6.654588184243682e-06, + "loss": 1.7154, + "step": 21300 + }, + { + "epoch": 3.81, + "grad_norm": 0.8114296793937683, + "learning_rate": 6.645039376681459e-06, + "loss": 1.6824, + "step": 21305 + }, + { + "epoch": 3.81, + "grad_norm": 1.2504725456237793, + "learning_rate": 6.635496374775626e-06, + "loss": 1.7564, + "step": 21310 + }, + { + "epoch": 3.82, + "grad_norm": 1.4884883165359497, + "learning_rate": 6.6259591815446335e-06, + "loss": 1.4289, + "step": 21315 + }, + { + "epoch": 3.82, + "grad_norm": 0.705243706703186, + "learning_rate": 6.616427800005068e-06, + "loss": 1.8026, + "step": 21320 + }, + { + "epoch": 3.82, + "grad_norm": 0.7473030090332031, + "learning_rate": 6.606902233171711e-06, + "loss": 1.9331, + "step": 21325 + }, + { + "epoch": 3.82, + "grad_norm": 0.8204745054244995, + "learning_rate": 6.597382484057482e-06, + "loss": 1.798, + "step": 21330 + }, + { + "epoch": 3.82, + "grad_norm": 0.48442649841308594, + "learning_rate": 6.587868555673479e-06, + "loss": 1.8777, + "step": 21335 + }, + { + "epoch": 3.82, + "grad_norm": 1.1391490697860718, + "learning_rate": 6.5783604510289365e-06, + "loss": 1.8687, + "step": 21340 + }, + { + "epoch": 3.82, + "grad_norm": 0.8502964973449707, + "learning_rate": 6.56885817313127e-06, + "loss": 1.8788, + "step": 21345 + }, + { + "epoch": 3.82, + "grad_norm": 2.4996724128723145, + "learning_rate": 6.559361724986029e-06, + "loss": 2.0761, + "step": 21350 + }, + { + "epoch": 3.82, + "grad_norm": 0.40672042965888977, + "learning_rate": 6.549871109596939e-06, + "loss": 1.8876, + "step": 21355 + }, + { + "epoch": 3.82, + "grad_norm": 1.889902949333191, + "learning_rate": 6.5403863299658665e-06, + "loss": 1.8512, + "step": 21360 + }, + { + "epoch": 3.82, + "grad_norm": 0.7777634263038635, + "learning_rate": 6.530907389092842e-06, + "loss": 2.0392, + "step": 21365 + }, + { + "epoch": 3.83, + "grad_norm": 0.6818479299545288, + "learning_rate": 6.521434289976047e-06, + "loss": 1.6088, + "step": 21370 + }, + { + "epoch": 3.83, + "grad_norm": 0.5815435647964478, + "learning_rate": 6.511967035611816e-06, + "loss": 1.9384, + "step": 21375 + }, + { + "epoch": 3.83, + "grad_norm": 1.1326960325241089, + "learning_rate": 6.5025056289946315e-06, + "loss": 2.0075, + "step": 21380 + }, + { + "epoch": 3.83, + "grad_norm": 0.7850456833839417, + "learning_rate": 6.493050073117116e-06, + "loss": 1.8053, + "step": 21385 + }, + { + "epoch": 3.83, + "grad_norm": 1.2165858745574951, + "learning_rate": 6.483600370970061e-06, + "loss": 1.6863, + "step": 21390 + }, + { + "epoch": 3.83, + "grad_norm": 1.1443709135055542, + "learning_rate": 6.474156525542399e-06, + "loss": 1.9593, + "step": 21395 + }, + { + "epoch": 3.83, + "grad_norm": 0.9057660698890686, + "learning_rate": 6.4647185398212075e-06, + "loss": 1.7776, + "step": 21400 + }, + { + "epoch": 3.83, + "grad_norm": 1.2972538471221924, + "learning_rate": 6.45528641679172e-06, + "loss": 1.7882, + "step": 21405 + }, + { + "epoch": 3.83, + "grad_norm": 1.1421040296554565, + "learning_rate": 6.445860159437295e-06, + "loss": 1.4755, + "step": 21410 + }, + { + "epoch": 3.83, + "grad_norm": 1.4928812980651855, + "learning_rate": 6.4364397707394555e-06, + "loss": 1.8019, + "step": 21415 + }, + { + "epoch": 3.83, + "grad_norm": 1.1022720336914062, + "learning_rate": 6.427025253677857e-06, + "loss": 2.0423, + "step": 21420 + }, + { + "epoch": 3.83, + "grad_norm": 0.6213718056678772, + "learning_rate": 6.417616611230318e-06, + "loss": 1.8774, + "step": 21425 + }, + { + "epoch": 3.84, + "grad_norm": 1.03056800365448, + "learning_rate": 6.40821384637276e-06, + "loss": 1.995, + "step": 21430 + }, + { + "epoch": 3.84, + "grad_norm": 0.6993535757064819, + "learning_rate": 6.398816962079287e-06, + "loss": 2.0732, + "step": 21435 + }, + { + "epoch": 3.84, + "grad_norm": 1.2051020860671997, + "learning_rate": 6.3894259613221095e-06, + "loss": 1.5418, + "step": 21440 + }, + { + "epoch": 3.84, + "grad_norm": 0.8141611218452454, + "learning_rate": 6.380040847071603e-06, + "loss": 1.536, + "step": 21445 + }, + { + "epoch": 3.84, + "grad_norm": 0.784510612487793, + "learning_rate": 6.370661622296261e-06, + "loss": 1.8054, + "step": 21450 + }, + { + "epoch": 3.84, + "grad_norm": 1.0735670328140259, + "learning_rate": 6.361288289962733e-06, + "loss": 1.9683, + "step": 21455 + }, + { + "epoch": 3.84, + "grad_norm": 0.6221914887428284, + "learning_rate": 6.351920853035795e-06, + "loss": 1.829, + "step": 21460 + }, + { + "epoch": 3.84, + "grad_norm": 0.5578571557998657, + "learning_rate": 6.34255931447835e-06, + "loss": 1.6748, + "step": 21465 + }, + { + "epoch": 3.84, + "grad_norm": 0.4949614703655243, + "learning_rate": 6.333203677251445e-06, + "loss": 1.9189, + "step": 21470 + }, + { + "epoch": 3.84, + "grad_norm": 1.253214716911316, + "learning_rate": 6.323853944314267e-06, + "loss": 1.5789, + "step": 21475 + }, + { + "epoch": 3.84, + "grad_norm": 0.5360049605369568, + "learning_rate": 6.3145101186241175e-06, + "loss": 1.7498, + "step": 21480 + }, + { + "epoch": 3.85, + "grad_norm": 0.6865981221199036, + "learning_rate": 6.305172203136447e-06, + "loss": 1.8506, + "step": 21485 + }, + { + "epoch": 3.85, + "grad_norm": 0.7552772760391235, + "learning_rate": 6.295840200804823e-06, + "loss": 1.9033, + "step": 21490 + }, + { + "epoch": 3.85, + "grad_norm": 0.9978206157684326, + "learning_rate": 6.2865141145809604e-06, + "loss": 2.001, + "step": 21495 + }, + { + "epoch": 3.85, + "grad_norm": 0.6220951080322266, + "learning_rate": 6.277193947414678e-06, + "loss": 1.809, + "step": 21500 + }, + { + "epoch": 3.85, + "grad_norm": 1.6220732927322388, + "learning_rate": 6.267879702253937e-06, + "loss": 1.789, + "step": 21505 + }, + { + "epoch": 3.85, + "grad_norm": 0.6907435059547424, + "learning_rate": 6.258571382044831e-06, + "loss": 2.0546, + "step": 21510 + }, + { + "epoch": 3.85, + "grad_norm": 0.738203227519989, + "learning_rate": 6.2492689897315685e-06, + "loss": 2.0193, + "step": 21515 + }, + { + "epoch": 3.85, + "grad_norm": 0.5000423192977905, + "learning_rate": 6.239972528256497e-06, + "loss": 1.666, + "step": 21520 + }, + { + "epoch": 3.85, + "grad_norm": 0.42941489815711975, + "learning_rate": 6.23068200056007e-06, + "loss": 1.9748, + "step": 21525 + }, + { + "epoch": 3.85, + "grad_norm": 0.4890025854110718, + "learning_rate": 6.221397409580864e-06, + "loss": 2.0923, + "step": 21530 + }, + { + "epoch": 3.85, + "grad_norm": 0.8558998107910156, + "learning_rate": 6.212118758255595e-06, + "loss": 1.7637, + "step": 21535 + }, + { + "epoch": 3.86, + "grad_norm": 0.7938655614852905, + "learning_rate": 6.202846049519093e-06, + "loss": 1.752, + "step": 21540 + }, + { + "epoch": 3.86, + "grad_norm": 1.0123392343521118, + "learning_rate": 6.193579286304302e-06, + "loss": 1.6815, + "step": 21545 + }, + { + "epoch": 3.86, + "grad_norm": 3.046292781829834, + "learning_rate": 6.184318471542303e-06, + "loss": 1.7829, + "step": 21550 + }, + { + "epoch": 3.86, + "grad_norm": 0.652489423751831, + "learning_rate": 6.175063608162266e-06, + "loss": 1.903, + "step": 21555 + }, + { + "epoch": 3.86, + "grad_norm": 0.838995099067688, + "learning_rate": 6.165814699091499e-06, + "loss": 1.789, + "step": 21560 + }, + { + "epoch": 3.86, + "grad_norm": 0.7666052579879761, + "learning_rate": 6.156571747255435e-06, + "loss": 1.6717, + "step": 21565 + }, + { + "epoch": 3.86, + "grad_norm": 0.5006678700447083, + "learning_rate": 6.147334755577596e-06, + "loss": 1.9561, + "step": 21570 + }, + { + "epoch": 3.86, + "grad_norm": 0.8589296340942383, + "learning_rate": 6.138103726979641e-06, + "loss": 1.9194, + "step": 21575 + }, + { + "epoch": 3.86, + "grad_norm": 0.4786919951438904, + "learning_rate": 6.128878664381332e-06, + "loss": 1.9861, + "step": 21580 + }, + { + "epoch": 3.86, + "grad_norm": 1.460368275642395, + "learning_rate": 6.119659570700559e-06, + "loss": 1.6571, + "step": 21585 + }, + { + "epoch": 3.86, + "grad_norm": 0.6441243290901184, + "learning_rate": 6.110446448853293e-06, + "loss": 2.1022, + "step": 21590 + }, + { + "epoch": 3.87, + "grad_norm": 0.36151692271232605, + "learning_rate": 6.10123930175365e-06, + "loss": 2.0311, + "step": 21595 + }, + { + "epoch": 3.87, + "grad_norm": 0.6082046031951904, + "learning_rate": 6.092038132313835e-06, + "loss": 1.7961, + "step": 21600 + }, + { + "epoch": 3.87, + "grad_norm": 0.5249208211898804, + "learning_rate": 6.082842943444172e-06, + "loss": 1.7438, + "step": 21605 + }, + { + "epoch": 3.87, + "grad_norm": 0.9609794020652771, + "learning_rate": 6.073653738053098e-06, + "loss": 2.2195, + "step": 21610 + }, + { + "epoch": 3.87, + "grad_norm": 0.8128685355186462, + "learning_rate": 6.064470519047141e-06, + "loss": 1.6784, + "step": 21615 + }, + { + "epoch": 3.87, + "grad_norm": 1.8232717514038086, + "learning_rate": 6.0552932893309374e-06, + "loss": 1.8214, + "step": 21620 + }, + { + "epoch": 3.87, + "grad_norm": 1.522959589958191, + "learning_rate": 6.046122051807246e-06, + "loss": 1.5712, + "step": 21625 + }, + { + "epoch": 3.87, + "grad_norm": 0.9963710904121399, + "learning_rate": 6.036956809376915e-06, + "loss": 1.7489, + "step": 21630 + }, + { + "epoch": 3.87, + "grad_norm": 1.037346601486206, + "learning_rate": 6.027797564938906e-06, + "loss": 1.5724, + "step": 21635 + }, + { + "epoch": 3.87, + "grad_norm": 1.6422029733657837, + "learning_rate": 6.018644321390288e-06, + "loss": 1.7118, + "step": 21640 + }, + { + "epoch": 3.87, + "grad_norm": 0.9936122298240662, + "learning_rate": 6.009497081626203e-06, + "loss": 1.8194, + "step": 21645 + }, + { + "epoch": 3.88, + "grad_norm": 1.9703397750854492, + "learning_rate": 6.000355848539926e-06, + "loss": 1.6014, + "step": 21650 + }, + { + "epoch": 3.88, + "grad_norm": 0.7200276851654053, + "learning_rate": 5.99122062502282e-06, + "loss": 1.9046, + "step": 21655 + }, + { + "epoch": 3.88, + "grad_norm": 0.6927645802497864, + "learning_rate": 5.982091413964352e-06, + "loss": 1.6747, + "step": 21660 + }, + { + "epoch": 3.88, + "grad_norm": 1.2343568801879883, + "learning_rate": 5.97296821825207e-06, + "loss": 1.6779, + "step": 21665 + }, + { + "epoch": 3.88, + "grad_norm": 0.5954350829124451, + "learning_rate": 5.9638510407716394e-06, + "loss": 1.791, + "step": 21670 + }, + { + "epoch": 3.88, + "grad_norm": 1.0342341661453247, + "learning_rate": 5.954739884406821e-06, + "loss": 1.7339, + "step": 21675 + }, + { + "epoch": 3.88, + "grad_norm": 1.0114802122116089, + "learning_rate": 5.94563475203945e-06, + "loss": 1.9951, + "step": 21680 + }, + { + "epoch": 3.88, + "grad_norm": 2.355144500732422, + "learning_rate": 5.936535646549479e-06, + "loss": 1.668, + "step": 21685 + }, + { + "epoch": 3.88, + "grad_norm": 0.5509099960327148, + "learning_rate": 5.927442570814948e-06, + "loss": 1.819, + "step": 21690 + }, + { + "epoch": 3.88, + "grad_norm": 0.8965473175048828, + "learning_rate": 5.918355527711983e-06, + "loss": 1.8374, + "step": 21695 + }, + { + "epoch": 3.88, + "grad_norm": 0.8329721093177795, + "learning_rate": 5.9092745201148215e-06, + "loss": 1.7613, + "step": 21700 + }, + { + "epoch": 3.89, + "grad_norm": 0.7440474033355713, + "learning_rate": 5.900199550895763e-06, + "loss": 1.7955, + "step": 21705 + }, + { + "epoch": 3.89, + "grad_norm": 0.6702372431755066, + "learning_rate": 5.891130622925209e-06, + "loss": 1.5468, + "step": 21710 + }, + { + "epoch": 3.89, + "grad_norm": 1.7360087633132935, + "learning_rate": 5.882067739071659e-06, + "loss": 1.8326, + "step": 21715 + }, + { + "epoch": 3.89, + "grad_norm": 0.6325078010559082, + "learning_rate": 5.873010902201695e-06, + "loss": 1.8646, + "step": 21720 + }, + { + "epoch": 3.89, + "grad_norm": 0.45870426297187805, + "learning_rate": 5.863960115179984e-06, + "loss": 1.8511, + "step": 21725 + }, + { + "epoch": 3.89, + "grad_norm": 1.7255089282989502, + "learning_rate": 5.854915380869286e-06, + "loss": 1.8966, + "step": 21730 + }, + { + "epoch": 3.89, + "grad_norm": 0.8191224932670593, + "learning_rate": 5.845876702130434e-06, + "loss": 1.8138, + "step": 21735 + }, + { + "epoch": 3.89, + "grad_norm": 1.0439651012420654, + "learning_rate": 5.836844081822357e-06, + "loss": 1.6274, + "step": 21740 + }, + { + "epoch": 3.89, + "grad_norm": 0.7920172810554504, + "learning_rate": 5.827817522802065e-06, + "loss": 1.9206, + "step": 21745 + }, + { + "epoch": 3.89, + "grad_norm": 1.2962287664413452, + "learning_rate": 5.818797027924647e-06, + "loss": 1.9135, + "step": 21750 + }, + { + "epoch": 3.89, + "grad_norm": 0.9365116357803345, + "learning_rate": 5.809782600043287e-06, + "loss": 1.7649, + "step": 21755 + }, + { + "epoch": 3.89, + "grad_norm": 3.8692009449005127, + "learning_rate": 5.800774242009227e-06, + "loss": 1.8553, + "step": 21760 + }, + { + "epoch": 3.9, + "grad_norm": 2.046795606613159, + "learning_rate": 5.7917719566718095e-06, + "loss": 1.7213, + "step": 21765 + }, + { + "epoch": 3.9, + "grad_norm": 0.8467017412185669, + "learning_rate": 5.782775746878444e-06, + "loss": 1.9483, + "step": 21770 + }, + { + "epoch": 3.9, + "grad_norm": 0.7216947078704834, + "learning_rate": 5.773785615474622e-06, + "loss": 2.0848, + "step": 21775 + }, + { + "epoch": 3.9, + "grad_norm": 0.9281513690948486, + "learning_rate": 5.764801565303918e-06, + "loss": 1.9586, + "step": 21780 + }, + { + "epoch": 3.9, + "grad_norm": 3.031953811645508, + "learning_rate": 5.755823599207977e-06, + "loss": 1.7133, + "step": 21785 + }, + { + "epoch": 3.9, + "grad_norm": 0.6615840196609497, + "learning_rate": 5.7468517200265276e-06, + "loss": 2.0092, + "step": 21790 + }, + { + "epoch": 3.9, + "grad_norm": 1.0913735628128052, + "learning_rate": 5.737885930597351e-06, + "loss": 1.6966, + "step": 21795 + }, + { + "epoch": 3.9, + "grad_norm": 1.035690188407898, + "learning_rate": 5.728926233756338e-06, + "loss": 1.7959, + "step": 21800 + }, + { + "epoch": 3.9, + "grad_norm": 0.7709459662437439, + "learning_rate": 5.719972632337409e-06, + "loss": 1.9455, + "step": 21805 + }, + { + "epoch": 3.9, + "grad_norm": 0.6541898250579834, + "learning_rate": 5.711025129172595e-06, + "loss": 1.6532, + "step": 21810 + }, + { + "epoch": 3.9, + "grad_norm": 0.6813231110572815, + "learning_rate": 5.702083727091978e-06, + "loss": 1.9995, + "step": 21815 + }, + { + "epoch": 3.91, + "grad_norm": 0.7361324429512024, + "learning_rate": 5.693148428923722e-06, + "loss": 1.9465, + "step": 21820 + }, + { + "epoch": 3.91, + "grad_norm": 1.0799798965454102, + "learning_rate": 5.68421923749404e-06, + "loss": 1.8866, + "step": 21825 + }, + { + "epoch": 3.91, + "grad_norm": 0.9929239749908447, + "learning_rate": 5.675296155627236e-06, + "loss": 1.7591, + "step": 21830 + }, + { + "epoch": 3.91, + "grad_norm": 0.9932503700256348, + "learning_rate": 5.666379186145668e-06, + "loss": 1.7421, + "step": 21835 + }, + { + "epoch": 3.91, + "grad_norm": 0.8309764862060547, + "learning_rate": 5.6574683318697665e-06, + "loss": 1.9331, + "step": 21840 + }, + { + "epoch": 3.91, + "grad_norm": 0.8258433938026428, + "learning_rate": 5.6485635956180365e-06, + "loss": 1.8122, + "step": 21845 + }, + { + "epoch": 3.91, + "grad_norm": 1.11664617061615, + "learning_rate": 5.639664980207024e-06, + "loss": 1.8511, + "step": 21850 + }, + { + "epoch": 3.91, + "grad_norm": 0.7466225624084473, + "learning_rate": 5.63077248845135e-06, + "loss": 1.9555, + "step": 21855 + }, + { + "epoch": 3.91, + "grad_norm": 0.9185487031936646, + "learning_rate": 5.621886123163708e-06, + "loss": 1.6394, + "step": 21860 + }, + { + "epoch": 3.91, + "grad_norm": 1.338221788406372, + "learning_rate": 5.613005887154845e-06, + "loss": 2.0797, + "step": 21865 + }, + { + "epoch": 3.91, + "grad_norm": 0.5035430192947388, + "learning_rate": 5.604131783233574e-06, + "loss": 1.6932, + "step": 21870 + }, + { + "epoch": 3.92, + "grad_norm": 1.3614168167114258, + "learning_rate": 5.595263814206761e-06, + "loss": 1.7473, + "step": 21875 + }, + { + "epoch": 3.92, + "grad_norm": 1.0385257005691528, + "learning_rate": 5.586401982879347e-06, + "loss": 1.7702, + "step": 21880 + }, + { + "epoch": 3.92, + "grad_norm": 0.7399263381958008, + "learning_rate": 5.5775462920543035e-06, + "loss": 1.8132, + "step": 21885 + }, + { + "epoch": 3.92, + "grad_norm": 0.5867356061935425, + "learning_rate": 5.568696744532687e-06, + "loss": 1.7692, + "step": 21890 + }, + { + "epoch": 3.92, + "grad_norm": 1.5751367807388306, + "learning_rate": 5.5598533431136055e-06, + "loss": 2.0297, + "step": 21895 + }, + { + "epoch": 3.92, + "grad_norm": 0.8599883317947388, + "learning_rate": 5.551016090594208e-06, + "loss": 1.8222, + "step": 21900 + }, + { + "epoch": 3.92, + "grad_norm": 0.6561763882637024, + "learning_rate": 5.542184989769711e-06, + "loss": 1.8176, + "step": 21905 + }, + { + "epoch": 3.92, + "grad_norm": 1.0254570245742798, + "learning_rate": 5.533360043433394e-06, + "loss": 1.8785, + "step": 21910 + }, + { + "epoch": 3.92, + "grad_norm": 1.0083361864089966, + "learning_rate": 5.524541254376564e-06, + "loss": 1.8599, + "step": 21915 + }, + { + "epoch": 3.92, + "grad_norm": 0.6952515244483948, + "learning_rate": 5.515728625388603e-06, + "loss": 1.6481, + "step": 21920 + }, + { + "epoch": 3.92, + "grad_norm": 0.6888887286186218, + "learning_rate": 5.506922159256933e-06, + "loss": 1.9352, + "step": 21925 + }, + { + "epoch": 3.93, + "grad_norm": 1.0823732614517212, + "learning_rate": 5.498121858767036e-06, + "loss": 1.7055, + "step": 21930 + }, + { + "epoch": 3.93, + "grad_norm": 1.0809731483459473, + "learning_rate": 5.489327726702445e-06, + "loss": 2.1196, + "step": 21935 + }, + { + "epoch": 3.93, + "grad_norm": 2.2783148288726807, + "learning_rate": 5.480539765844725e-06, + "loss": 1.6809, + "step": 21940 + }, + { + "epoch": 3.93, + "grad_norm": 1.1743180751800537, + "learning_rate": 5.471757978973496e-06, + "loss": 1.5346, + "step": 21945 + }, + { + "epoch": 3.93, + "grad_norm": 0.7503143548965454, + "learning_rate": 5.462982368866435e-06, + "loss": 1.7871, + "step": 21950 + }, + { + "epoch": 3.93, + "grad_norm": 0.5664401650428772, + "learning_rate": 5.454212938299258e-06, + "loss": 1.9305, + "step": 21955 + }, + { + "epoch": 3.93, + "grad_norm": 0.5282362103462219, + "learning_rate": 5.445449690045725e-06, + "loss": 2.0377, + "step": 21960 + }, + { + "epoch": 3.93, + "grad_norm": 0.9779241681098938, + "learning_rate": 5.436692626877648e-06, + "loss": 1.4596, + "step": 21965 + }, + { + "epoch": 3.93, + "grad_norm": 1.3879930973052979, + "learning_rate": 5.427941751564886e-06, + "loss": 1.8408, + "step": 21970 + }, + { + "epoch": 3.93, + "grad_norm": 0.5937609672546387, + "learning_rate": 5.419197066875312e-06, + "loss": 1.9729, + "step": 21975 + }, + { + "epoch": 3.93, + "grad_norm": 1.0135959386825562, + "learning_rate": 5.41045857557487e-06, + "loss": 1.7513, + "step": 21980 + }, + { + "epoch": 3.94, + "grad_norm": 0.7260029315948486, + "learning_rate": 5.401726280427541e-06, + "loss": 1.919, + "step": 21985 + }, + { + "epoch": 3.94, + "grad_norm": 0.8399494290351868, + "learning_rate": 5.393000184195343e-06, + "loss": 1.6943, + "step": 21990 + }, + { + "epoch": 3.94, + "grad_norm": 1.7650916576385498, + "learning_rate": 5.384280289638322e-06, + "loss": 1.5243, + "step": 21995 + }, + { + "epoch": 3.94, + "grad_norm": 0.5501816868782043, + "learning_rate": 5.375566599514581e-06, + "loss": 2.116, + "step": 22000 + }, + { + "epoch": 3.94, + "grad_norm": 0.9165323376655579, + "learning_rate": 5.366859116580244e-06, + "loss": 1.7613, + "step": 22005 + }, + { + "epoch": 3.94, + "grad_norm": 0.5392761826515198, + "learning_rate": 5.358157843589484e-06, + "loss": 1.7364, + "step": 22010 + }, + { + "epoch": 3.94, + "grad_norm": 0.6568076014518738, + "learning_rate": 5.349462783294504e-06, + "loss": 1.7346, + "step": 22015 + }, + { + "epoch": 3.94, + "grad_norm": 1.0636271238327026, + "learning_rate": 5.340773938445545e-06, + "loss": 1.8723, + "step": 22020 + }, + { + "epoch": 3.94, + "grad_norm": 0.8600271344184875, + "learning_rate": 5.332091311790885e-06, + "loss": 1.5156, + "step": 22025 + }, + { + "epoch": 3.94, + "grad_norm": 0.602660596370697, + "learning_rate": 5.323414906076821e-06, + "loss": 1.7696, + "step": 22030 + }, + { + "epoch": 3.94, + "grad_norm": 0.6825761198997498, + "learning_rate": 5.314744724047702e-06, + "loss": 1.6636, + "step": 22035 + }, + { + "epoch": 3.95, + "grad_norm": 0.6560852527618408, + "learning_rate": 5.306080768445881e-06, + "loss": 1.8196, + "step": 22040 + }, + { + "epoch": 3.95, + "grad_norm": 0.8872010111808777, + "learning_rate": 5.297423042011774e-06, + "loss": 1.9585, + "step": 22045 + }, + { + "epoch": 3.95, + "grad_norm": 0.6532019972801208, + "learning_rate": 5.288771547483806e-06, + "loss": 1.7353, + "step": 22050 + }, + { + "epoch": 3.95, + "grad_norm": 1.1290751695632935, + "learning_rate": 5.280126287598444e-06, + "loss": 1.617, + "step": 22055 + }, + { + "epoch": 3.95, + "grad_norm": 0.7250970602035522, + "learning_rate": 5.271487265090163e-06, + "loss": 1.6928, + "step": 22060 + }, + { + "epoch": 3.95, + "grad_norm": 0.8637431859970093, + "learning_rate": 5.2628544826914846e-06, + "loss": 1.5799, + "step": 22065 + }, + { + "epoch": 3.95, + "grad_norm": 3.6587893962860107, + "learning_rate": 5.254227943132948e-06, + "loss": 1.6546, + "step": 22070 + }, + { + "epoch": 3.95, + "grad_norm": 0.8665122985839844, + "learning_rate": 5.245607649143119e-06, + "loss": 1.8547, + "step": 22075 + }, + { + "epoch": 3.95, + "grad_norm": 0.883367657661438, + "learning_rate": 5.236993603448595e-06, + "loss": 1.7483, + "step": 22080 + }, + { + "epoch": 3.95, + "grad_norm": 2.016577959060669, + "learning_rate": 5.2283858087739806e-06, + "loss": 1.5007, + "step": 22085 + }, + { + "epoch": 3.95, + "grad_norm": 0.9476630687713623, + "learning_rate": 5.2197842678419204e-06, + "loss": 1.8659, + "step": 22090 + }, + { + "epoch": 3.95, + "grad_norm": 1.0685213804244995, + "learning_rate": 5.211188983373067e-06, + "loss": 1.8435, + "step": 22095 + }, + { + "epoch": 3.96, + "grad_norm": 1.0245375633239746, + "learning_rate": 5.202599958086102e-06, + "loss": 1.6491, + "step": 22100 + }, + { + "epoch": 3.96, + "grad_norm": 1.622461199760437, + "learning_rate": 5.194017194697728e-06, + "loss": 1.7717, + "step": 22105 + }, + { + "epoch": 3.96, + "grad_norm": 0.615947961807251, + "learning_rate": 5.185440695922664e-06, + "loss": 1.9252, + "step": 22110 + }, + { + "epoch": 3.96, + "grad_norm": 0.5909544229507446, + "learning_rate": 5.176870464473657e-06, + "loss": 2.1178, + "step": 22115 + }, + { + "epoch": 3.96, + "grad_norm": 0.8277536034584045, + "learning_rate": 5.168306503061446e-06, + "loss": 1.8633, + "step": 22120 + }, + { + "epoch": 3.96, + "grad_norm": 0.8978816270828247, + "learning_rate": 5.159748814394813e-06, + "loss": 1.622, + "step": 22125 + }, + { + "epoch": 3.96, + "grad_norm": 0.6580782532691956, + "learning_rate": 5.151197401180552e-06, + "loss": 1.6254, + "step": 22130 + }, + { + "epoch": 3.96, + "grad_norm": 0.612777590751648, + "learning_rate": 5.142652266123454e-06, + "loss": 1.7793, + "step": 22135 + }, + { + "epoch": 3.96, + "grad_norm": 0.7944098114967346, + "learning_rate": 5.1341134119263476e-06, + "loss": 1.5077, + "step": 22140 + }, + { + "epoch": 3.96, + "grad_norm": 0.791584849357605, + "learning_rate": 5.125580841290062e-06, + "loss": 1.7463, + "step": 22145 + }, + { + "epoch": 3.96, + "grad_norm": 1.069318175315857, + "learning_rate": 5.1170545569134365e-06, + "loss": 1.7364, + "step": 22150 + }, + { + "epoch": 3.97, + "grad_norm": 0.9720897078514099, + "learning_rate": 5.108534561493328e-06, + "loss": 1.8411, + "step": 22155 + }, + { + "epoch": 3.97, + "grad_norm": 0.4824497699737549, + "learning_rate": 5.100020857724605e-06, + "loss": 1.7391, + "step": 22160 + }, + { + "epoch": 3.97, + "grad_norm": 1.237189531326294, + "learning_rate": 5.091513448300144e-06, + "loss": 1.7938, + "step": 22165 + }, + { + "epoch": 3.97, + "grad_norm": 0.8964689373970032, + "learning_rate": 5.083012335910828e-06, + "loss": 1.8008, + "step": 22170 + }, + { + "epoch": 3.97, + "grad_norm": 0.6493983864784241, + "learning_rate": 5.074517523245561e-06, + "loss": 2.0167, + "step": 22175 + }, + { + "epoch": 3.97, + "grad_norm": 1.8859643936157227, + "learning_rate": 5.066029012991236e-06, + "loss": 1.8854, + "step": 22180 + }, + { + "epoch": 3.97, + "grad_norm": 0.8033244609832764, + "learning_rate": 5.05754680783275e-06, + "loss": 1.9866, + "step": 22185 + }, + { + "epoch": 3.97, + "grad_norm": 1.5448501110076904, + "learning_rate": 5.04907091045303e-06, + "loss": 1.7147, + "step": 22190 + }, + { + "epoch": 3.97, + "grad_norm": 2.1923139095306396, + "learning_rate": 5.04060132353299e-06, + "loss": 1.752, + "step": 22195 + }, + { + "epoch": 3.97, + "grad_norm": 0.8507521152496338, + "learning_rate": 5.03213804975155e-06, + "loss": 1.8768, + "step": 22200 + }, + { + "epoch": 3.97, + "grad_norm": 0.5235335826873779, + "learning_rate": 5.0236810917856485e-06, + "loss": 1.9388, + "step": 22205 + }, + { + "epoch": 3.98, + "grad_norm": 0.9762120842933655, + "learning_rate": 5.015230452310193e-06, + "loss": 1.7633, + "step": 22210 + }, + { + "epoch": 3.98, + "grad_norm": 0.9791060090065002, + "learning_rate": 5.006786133998126e-06, + "loss": 1.5675, + "step": 22215 + }, + { + "epoch": 3.98, + "grad_norm": 1.307426929473877, + "learning_rate": 4.998348139520378e-06, + "loss": 1.5626, + "step": 22220 + }, + { + "epoch": 3.98, + "grad_norm": 2.5995423793792725, + "learning_rate": 4.989916471545869e-06, + "loss": 1.8907, + "step": 22225 + }, + { + "epoch": 3.98, + "grad_norm": 1.0652718544006348, + "learning_rate": 4.981491132741531e-06, + "loss": 2.1655, + "step": 22230 + }, + { + "epoch": 3.98, + "grad_norm": 0.8676015734672546, + "learning_rate": 4.973072125772302e-06, + "loss": 1.9071, + "step": 22235 + }, + { + "epoch": 3.98, + "grad_norm": 0.7337488532066345, + "learning_rate": 4.9646594533010875e-06, + "loss": 1.6795, + "step": 22240 + }, + { + "epoch": 3.98, + "grad_norm": 0.6141374111175537, + "learning_rate": 4.956253117988816e-06, + "loss": 2.0898, + "step": 22245 + }, + { + "epoch": 3.98, + "grad_norm": 0.5068919062614441, + "learning_rate": 4.947853122494406e-06, + "loss": 1.964, + "step": 22250 + }, + { + "epoch": 3.98, + "grad_norm": 1.0111421346664429, + "learning_rate": 4.939459469474764e-06, + "loss": 1.8229, + "step": 22255 + }, + { + "epoch": 3.98, + "grad_norm": 0.9964427947998047, + "learning_rate": 4.931072161584802e-06, + "loss": 1.886, + "step": 22260 + }, + { + "epoch": 3.99, + "grad_norm": 0.7016617059707642, + "learning_rate": 4.922691201477406e-06, + "loss": 1.7212, + "step": 22265 + }, + { + "epoch": 3.99, + "grad_norm": 0.8478853106498718, + "learning_rate": 4.914316591803475e-06, + "loss": 1.7993, + "step": 22270 + }, + { + "epoch": 3.99, + "grad_norm": 0.7734459042549133, + "learning_rate": 4.905948335211882e-06, + "loss": 2.101, + "step": 22275 + }, + { + "epoch": 3.99, + "grad_norm": 0.6338139176368713, + "learning_rate": 4.897586434349502e-06, + "loss": 1.8311, + "step": 22280 + }, + { + "epoch": 3.99, + "grad_norm": 2.914543867111206, + "learning_rate": 4.889230891861196e-06, + "loss": 1.9766, + "step": 22285 + }, + { + "epoch": 3.99, + "grad_norm": 0.9734477400779724, + "learning_rate": 4.880881710389817e-06, + "loss": 1.8301, + "step": 22290 + }, + { + "epoch": 3.99, + "grad_norm": 1.7465955018997192, + "learning_rate": 4.8725388925762064e-06, + "loss": 1.8134, + "step": 22295 + }, + { + "epoch": 3.99, + "grad_norm": 0.780785322189331, + "learning_rate": 4.864202441059176e-06, + "loss": 1.8292, + "step": 22300 + }, + { + "epoch": 3.99, + "grad_norm": 0.9263495206832886, + "learning_rate": 4.855872358475547e-06, + "loss": 1.8219, + "step": 22305 + }, + { + "epoch": 3.99, + "grad_norm": 0.8709249496459961, + "learning_rate": 4.847548647460112e-06, + "loss": 1.8204, + "step": 22310 + }, + { + "epoch": 3.99, + "grad_norm": 0.9024534821510315, + "learning_rate": 4.839231310645664e-06, + "loss": 1.7256, + "step": 22315 + }, + { + "epoch": 4.0, + "grad_norm": 1.1584830284118652, + "learning_rate": 4.830920350662954e-06, + "loss": 1.6609, + "step": 22320 + }, + { + "epoch": 4.0, + "grad_norm": 0.5752809643745422, + "learning_rate": 4.822615770140745e-06, + "loss": 1.6129, + "step": 22325 + }, + { + "epoch": 4.0, + "grad_norm": 1.2050139904022217, + "learning_rate": 4.8143175717057505e-06, + "loss": 1.5614, + "step": 22330 + }, + { + "epoch": 4.0, + "grad_norm": 0.9474218487739563, + "learning_rate": 4.806025757982693e-06, + "loss": 1.9284, + "step": 22335 + }, + { + "epoch": 4.0, + "grad_norm": 0.6500478982925415, + "learning_rate": 4.797740331594264e-06, + "loss": 1.9486, + "step": 22340 + }, + { + "epoch": 4.0, + "grad_norm": 1.0756356716156006, + "learning_rate": 4.789461295161138e-06, + "loss": 1.9763, + "step": 22345 + }, + { + "epoch": 4.0, + "grad_norm": 0.6653231382369995, + "learning_rate": 4.781188651301969e-06, + "loss": 2.0301, + "step": 22350 + }, + { + "epoch": 4.0, + "grad_norm": 0.7933474779129028, + "learning_rate": 4.772922402633373e-06, + "loss": 1.9962, + "step": 22355 + }, + { + "epoch": 4.0, + "grad_norm": 2.7801687717437744, + "learning_rate": 4.764662551769975e-06, + "loss": 1.9394, + "step": 22360 + }, + { + "epoch": 4.0, + "grad_norm": 0.8057134747505188, + "learning_rate": 4.756409101324338e-06, + "loss": 1.7502, + "step": 22365 + }, + { + "epoch": 4.0, + "grad_norm": 2.9395413398742676, + "learning_rate": 4.748162053907035e-06, + "loss": 1.8419, + "step": 22370 + }, + { + "epoch": 4.01, + "grad_norm": 2.3610217571258545, + "learning_rate": 4.739921412126591e-06, + "loss": 1.9091, + "step": 22375 + }, + { + "epoch": 4.01, + "grad_norm": 0.6767581105232239, + "learning_rate": 4.731687178589517e-06, + "loss": 1.7989, + "step": 22380 + }, + { + "epoch": 4.01, + "grad_norm": 0.8745439052581787, + "learning_rate": 4.7234593559003e-06, + "loss": 1.8339, + "step": 22385 + }, + { + "epoch": 4.01, + "grad_norm": 0.7209843993186951, + "learning_rate": 4.715237946661377e-06, + "loss": 1.8103, + "step": 22390 + }, + { + "epoch": 4.01, + "grad_norm": 2.5547735691070557, + "learning_rate": 4.707022953473184e-06, + "loss": 2.2534, + "step": 22395 + }, + { + "epoch": 4.01, + "grad_norm": 0.7963310480117798, + "learning_rate": 4.698814378934108e-06, + "loss": 1.888, + "step": 22400 + }, + { + "epoch": 4.01, + "grad_norm": 1.050917148590088, + "learning_rate": 4.690612225640517e-06, + "loss": 1.9726, + "step": 22405 + }, + { + "epoch": 4.01, + "grad_norm": 0.7625211477279663, + "learning_rate": 4.68241649618675e-06, + "loss": 1.9173, + "step": 22410 + }, + { + "epoch": 4.01, + "grad_norm": 0.531043291091919, + "learning_rate": 4.674227193165107e-06, + "loss": 2.0842, + "step": 22415 + }, + { + "epoch": 4.01, + "grad_norm": 0.671322226524353, + "learning_rate": 4.666044319165841e-06, + "loss": 1.6739, + "step": 22420 + }, + { + "epoch": 4.01, + "grad_norm": 0.8251287341117859, + "learning_rate": 4.6578678767771975e-06, + "loss": 1.9518, + "step": 22425 + }, + { + "epoch": 4.01, + "grad_norm": 1.6571036577224731, + "learning_rate": 4.6496978685853786e-06, + "loss": 1.6041, + "step": 22430 + }, + { + "epoch": 4.02, + "grad_norm": 0.7980236411094666, + "learning_rate": 4.641534297174549e-06, + "loss": 1.4048, + "step": 22435 + }, + { + "epoch": 4.02, + "grad_norm": 1.075265884399414, + "learning_rate": 4.633377165126848e-06, + "loss": 1.9821, + "step": 22440 + }, + { + "epoch": 4.02, + "grad_norm": 1.2659225463867188, + "learning_rate": 4.625226475022351e-06, + "loss": 1.6767, + "step": 22445 + }, + { + "epoch": 4.02, + "grad_norm": 3.332648754119873, + "learning_rate": 4.6170822294391205e-06, + "loss": 1.6765, + "step": 22450 + }, + { + "epoch": 4.02, + "grad_norm": 1.1672558784484863, + "learning_rate": 4.608944430953185e-06, + "loss": 1.6675, + "step": 22455 + }, + { + "epoch": 4.02, + "grad_norm": 0.6216001510620117, + "learning_rate": 4.6008130821385025e-06, + "loss": 1.7697, + "step": 22460 + }, + { + "epoch": 4.02, + "grad_norm": 0.9190794825553894, + "learning_rate": 4.592688185567023e-06, + "loss": 1.9444, + "step": 22465 + }, + { + "epoch": 4.02, + "grad_norm": 0.6193316578865051, + "learning_rate": 4.58456974380864e-06, + "loss": 1.942, + "step": 22470 + }, + { + "epoch": 4.02, + "grad_norm": 0.9209474921226501, + "learning_rate": 4.576457759431216e-06, + "loss": 1.99, + "step": 22475 + }, + { + "epoch": 4.02, + "grad_norm": 0.9471963047981262, + "learning_rate": 4.568352235000553e-06, + "loss": 1.6965, + "step": 22480 + }, + { + "epoch": 4.02, + "grad_norm": 1.2862259149551392, + "learning_rate": 4.560253173080423e-06, + "loss": 1.7552, + "step": 22485 + }, + { + "epoch": 4.03, + "grad_norm": 0.8248801231384277, + "learning_rate": 4.552160576232553e-06, + "loss": 1.9429, + "step": 22490 + }, + { + "epoch": 4.03, + "grad_norm": 0.5932010412216187, + "learning_rate": 4.544074447016628e-06, + "loss": 1.6949, + "step": 22495 + }, + { + "epoch": 4.03, + "grad_norm": 1.4545879364013672, + "learning_rate": 4.535994787990283e-06, + "loss": 1.9368, + "step": 22500 + }, + { + "epoch": 4.03, + "grad_norm": 0.8596073985099792, + "learning_rate": 4.527921601709104e-06, + "loss": 1.8986, + "step": 22505 + }, + { + "epoch": 4.03, + "grad_norm": 1.0523921251296997, + "learning_rate": 4.519854890726624e-06, + "loss": 1.7516, + "step": 22510 + }, + { + "epoch": 4.03, + "grad_norm": 0.9680181741714478, + "learning_rate": 4.511794657594343e-06, + "loss": 1.6559, + "step": 22515 + }, + { + "epoch": 4.03, + "grad_norm": 1.0432225465774536, + "learning_rate": 4.503740904861703e-06, + "loss": 1.6328, + "step": 22520 + }, + { + "epoch": 4.03, + "grad_norm": 1.3763519525527954, + "learning_rate": 4.495693635076101e-06, + "loss": 1.6849, + "step": 22525 + }, + { + "epoch": 4.03, + "grad_norm": 0.7782979607582092, + "learning_rate": 4.487652850782886e-06, + "loss": 1.6029, + "step": 22530 + }, + { + "epoch": 4.03, + "grad_norm": 0.760043203830719, + "learning_rate": 4.479618554525339e-06, + "loss": 1.9747, + "step": 22535 + }, + { + "epoch": 4.03, + "grad_norm": 0.7995366454124451, + "learning_rate": 4.471590748844703e-06, + "loss": 1.8518, + "step": 22540 + }, + { + "epoch": 4.04, + "grad_norm": 1.1295702457427979, + "learning_rate": 4.4635694362801684e-06, + "loss": 2.1671, + "step": 22545 + }, + { + "epoch": 4.04, + "grad_norm": 0.6298494338989258, + "learning_rate": 4.4555546193688735e-06, + "loss": 1.894, + "step": 22550 + }, + { + "epoch": 4.04, + "grad_norm": 0.6293065547943115, + "learning_rate": 4.4475463006458855e-06, + "loss": 1.6044, + "step": 22555 + }, + { + "epoch": 4.04, + "grad_norm": 1.1865392923355103, + "learning_rate": 4.4395444826442395e-06, + "loss": 1.8025, + "step": 22560 + }, + { + "epoch": 4.04, + "grad_norm": 0.8114657998085022, + "learning_rate": 4.431549167894891e-06, + "loss": 1.973, + "step": 22565 + }, + { + "epoch": 4.04, + "grad_norm": 0.7298006415367126, + "learning_rate": 4.423560358926756e-06, + "loss": 1.809, + "step": 22570 + }, + { + "epoch": 4.04, + "grad_norm": 1.246186375617981, + "learning_rate": 4.415578058266687e-06, + "loss": 1.733, + "step": 22575 + }, + { + "epoch": 4.04, + "grad_norm": 0.7696637511253357, + "learning_rate": 4.407602268439479e-06, + "loss": 1.481, + "step": 22580 + }, + { + "epoch": 4.04, + "grad_norm": 0.4412114918231964, + "learning_rate": 4.3996329919678666e-06, + "loss": 2.2101, + "step": 22585 + }, + { + "epoch": 4.04, + "grad_norm": 0.5162980556488037, + "learning_rate": 4.3916702313725276e-06, + "loss": 2.1751, + "step": 22590 + }, + { + "epoch": 4.04, + "grad_norm": 1.8025277853012085, + "learning_rate": 4.3837139891720755e-06, + "loss": 1.7814, + "step": 22595 + }, + { + "epoch": 4.05, + "grad_norm": 0.36622488498687744, + "learning_rate": 4.375764267883048e-06, + "loss": 2.1449, + "step": 22600 + }, + { + "epoch": 4.05, + "grad_norm": 1.1410011053085327, + "learning_rate": 4.367821070019948e-06, + "loss": 1.7696, + "step": 22605 + }, + { + "epoch": 4.05, + "grad_norm": 0.6294861435890198, + "learning_rate": 4.359884398095196e-06, + "loss": 1.8894, + "step": 22610 + }, + { + "epoch": 4.05, + "grad_norm": 1.1744203567504883, + "learning_rate": 4.351954254619156e-06, + "loss": 1.687, + "step": 22615 + }, + { + "epoch": 4.05, + "grad_norm": 0.8127630949020386, + "learning_rate": 4.344030642100133e-06, + "loss": 1.8386, + "step": 22620 + }, + { + "epoch": 4.05, + "grad_norm": 0.8806772828102112, + "learning_rate": 4.336113563044339e-06, + "loss": 1.6666, + "step": 22625 + }, + { + "epoch": 4.05, + "grad_norm": 1.0629003047943115, + "learning_rate": 4.328203019955951e-06, + "loss": 1.8621, + "step": 22630 + }, + { + "epoch": 4.05, + "grad_norm": 0.7076244354248047, + "learning_rate": 4.320299015337065e-06, + "loss": 1.7092, + "step": 22635 + }, + { + "epoch": 4.05, + "grad_norm": 1.0784873962402344, + "learning_rate": 4.312401551687714e-06, + "loss": 1.6767, + "step": 22640 + }, + { + "epoch": 4.05, + "grad_norm": 1.1845802068710327, + "learning_rate": 4.304510631505848e-06, + "loss": 1.7707, + "step": 22645 + }, + { + "epoch": 4.05, + "grad_norm": 0.7437544465065002, + "learning_rate": 4.296626257287367e-06, + "loss": 1.8259, + "step": 22650 + }, + { + "epoch": 4.06, + "grad_norm": 1.0520071983337402, + "learning_rate": 4.288748431526082e-06, + "loss": 1.9545, + "step": 22655 + }, + { + "epoch": 4.06, + "grad_norm": 0.893604040145874, + "learning_rate": 4.280877156713748e-06, + "loss": 1.7078, + "step": 22660 + }, + { + "epoch": 4.06, + "grad_norm": 3.4858219623565674, + "learning_rate": 4.273012435340038e-06, + "loss": 2.1413, + "step": 22665 + }, + { + "epoch": 4.06, + "grad_norm": 0.628693163394928, + "learning_rate": 4.265154269892557e-06, + "loss": 1.5482, + "step": 22670 + }, + { + "epoch": 4.06, + "grad_norm": 1.399335265159607, + "learning_rate": 4.257302662856838e-06, + "loss": 1.751, + "step": 22675 + }, + { + "epoch": 4.06, + "grad_norm": 0.8125795722007751, + "learning_rate": 4.24945761671634e-06, + "loss": 1.9325, + "step": 22680 + }, + { + "epoch": 4.06, + "grad_norm": 1.0702699422836304, + "learning_rate": 4.241619133952432e-06, + "loss": 1.8058, + "step": 22685 + }, + { + "epoch": 4.06, + "grad_norm": 0.5640363097190857, + "learning_rate": 4.23378721704443e-06, + "loss": 1.8514, + "step": 22690 + }, + { + "epoch": 4.06, + "grad_norm": 0.6807628273963928, + "learning_rate": 4.225961868469553e-06, + "loss": 1.6916, + "step": 22695 + }, + { + "epoch": 4.06, + "grad_norm": 0.7918212413787842, + "learning_rate": 4.218143090702953e-06, + "loss": 1.8782, + "step": 22700 + }, + { + "epoch": 4.06, + "grad_norm": 0.6629428863525391, + "learning_rate": 4.210330886217706e-06, + "loss": 2.0677, + "step": 22705 + }, + { + "epoch": 4.06, + "grad_norm": 0.6261160373687744, + "learning_rate": 4.202525257484807e-06, + "loss": 1.8922, + "step": 22710 + }, + { + "epoch": 4.07, + "grad_norm": 0.9577205181121826, + "learning_rate": 4.194726206973157e-06, + "loss": 1.6897, + "step": 22715 + }, + { + "epoch": 4.07, + "grad_norm": 1.4098031520843506, + "learning_rate": 4.186933737149598e-06, + "loss": 1.9442, + "step": 22720 + }, + { + "epoch": 4.07, + "grad_norm": 1.430624008178711, + "learning_rate": 4.179147850478876e-06, + "loss": 1.9234, + "step": 22725 + }, + { + "epoch": 4.07, + "grad_norm": 1.2811557054519653, + "learning_rate": 4.171368549423665e-06, + "loss": 1.9003, + "step": 22730 + }, + { + "epoch": 4.07, + "grad_norm": 0.9562197923660278, + "learning_rate": 4.163595836444551e-06, + "loss": 1.8844, + "step": 22735 + }, + { + "epoch": 4.07, + "grad_norm": 0.8576022982597351, + "learning_rate": 4.1558297140000314e-06, + "loss": 1.6477, + "step": 22740 + }, + { + "epoch": 4.07, + "grad_norm": 2.603815793991089, + "learning_rate": 4.1480701845465205e-06, + "loss": 1.988, + "step": 22745 + }, + { + "epoch": 4.07, + "grad_norm": 0.5836585760116577, + "learning_rate": 4.14031725053835e-06, + "loss": 2.0429, + "step": 22750 + }, + { + "epoch": 4.07, + "grad_norm": 1.4115853309631348, + "learning_rate": 4.132570914427772e-06, + "loss": 2.0125, + "step": 22755 + }, + { + "epoch": 4.07, + "grad_norm": 0.5206339359283447, + "learning_rate": 4.12483117866494e-06, + "loss": 1.9237, + "step": 22760 + }, + { + "epoch": 4.07, + "grad_norm": 1.0422269105911255, + "learning_rate": 4.117098045697931e-06, + "loss": 1.6165, + "step": 22765 + }, + { + "epoch": 4.08, + "grad_norm": 1.0876985788345337, + "learning_rate": 4.109371517972721e-06, + "loss": 1.6538, + "step": 22770 + }, + { + "epoch": 4.08, + "grad_norm": 0.7427853345870972, + "learning_rate": 4.101651597933204e-06, + "loss": 2.3134, + "step": 22775 + }, + { + "epoch": 4.08, + "grad_norm": 1.1198269128799438, + "learning_rate": 4.09393828802119e-06, + "loss": 1.8009, + "step": 22780 + }, + { + "epoch": 4.08, + "grad_norm": 0.47877076268196106, + "learning_rate": 4.086231590676382e-06, + "loss": 1.9703, + "step": 22785 + }, + { + "epoch": 4.08, + "grad_norm": 0.6987916231155396, + "learning_rate": 4.078531508336403e-06, + "loss": 1.4546, + "step": 22790 + }, + { + "epoch": 4.08, + "grad_norm": 0.5703626275062561, + "learning_rate": 4.070838043436786e-06, + "loss": 1.911, + "step": 22795 + }, + { + "epoch": 4.08, + "grad_norm": 1.0218112468719482, + "learning_rate": 4.063151198410969e-06, + "loss": 1.8741, + "step": 22800 + }, + { + "epoch": 4.08, + "grad_norm": 0.6620162725448608, + "learning_rate": 4.055470975690284e-06, + "loss": 1.8318, + "step": 22805 + }, + { + "epoch": 4.08, + "grad_norm": 2.1791741847991943, + "learning_rate": 4.047797377703985e-06, + "loss": 1.6375, + "step": 22810 + }, + { + "epoch": 4.08, + "grad_norm": 0.8899803757667542, + "learning_rate": 4.040130406879219e-06, + "loss": 1.7384, + "step": 22815 + }, + { + "epoch": 4.08, + "grad_norm": 0.5828153491020203, + "learning_rate": 4.032470065641047e-06, + "loss": 1.7185, + "step": 22820 + }, + { + "epoch": 4.09, + "grad_norm": 0.5616529583930969, + "learning_rate": 4.024816356412431e-06, + "loss": 1.7911, + "step": 22825 + }, + { + "epoch": 4.09, + "grad_norm": 0.8453607559204102, + "learning_rate": 4.017169281614225e-06, + "loss": 1.6574, + "step": 22830 + }, + { + "epoch": 4.09, + "grad_norm": 0.6679532527923584, + "learning_rate": 4.009528843665186e-06, + "loss": 1.7073, + "step": 22835 + }, + { + "epoch": 4.09, + "grad_norm": 1.1486124992370605, + "learning_rate": 4.001895044981982e-06, + "loss": 1.7303, + "step": 22840 + }, + { + "epoch": 4.09, + "grad_norm": 0.7910205721855164, + "learning_rate": 3.99426788797918e-06, + "loss": 1.9838, + "step": 22845 + }, + { + "epoch": 4.09, + "grad_norm": 1.2929234504699707, + "learning_rate": 3.986647375069241e-06, + "loss": 1.8347, + "step": 22850 + }, + { + "epoch": 4.09, + "grad_norm": 0.7869713306427002, + "learning_rate": 3.9790335086625275e-06, + "loss": 1.8313, + "step": 22855 + }, + { + "epoch": 4.09, + "grad_norm": 1.0830061435699463, + "learning_rate": 3.971426291167294e-06, + "loss": 1.9421, + "step": 22860 + }, + { + "epoch": 4.09, + "grad_norm": 1.3566019535064697, + "learning_rate": 3.963825724989695e-06, + "loss": 1.8847, + "step": 22865 + }, + { + "epoch": 4.09, + "grad_norm": 0.9219987988471985, + "learning_rate": 3.956231812533784e-06, + "loss": 2.0614, + "step": 22870 + }, + { + "epoch": 4.09, + "grad_norm": 1.1585674285888672, + "learning_rate": 3.948644556201517e-06, + "loss": 2.011, + "step": 22875 + }, + { + "epoch": 4.1, + "grad_norm": 1.1043351888656616, + "learning_rate": 3.941063958392721e-06, + "loss": 2.111, + "step": 22880 + }, + { + "epoch": 4.1, + "grad_norm": 0.9520683884620667, + "learning_rate": 3.93349002150514e-06, + "loss": 1.7933, + "step": 22885 + }, + { + "epoch": 4.1, + "grad_norm": 1.2550685405731201, + "learning_rate": 3.925922747934407e-06, + "loss": 1.7792, + "step": 22890 + }, + { + "epoch": 4.1, + "grad_norm": 0.6732292175292969, + "learning_rate": 3.918362140074031e-06, + "loss": 1.9497, + "step": 22895 + }, + { + "epoch": 4.1, + "grad_norm": 1.17646324634552, + "learning_rate": 3.9108082003154325e-06, + "loss": 1.7155, + "step": 22900 + }, + { + "epoch": 4.1, + "grad_norm": 0.677506148815155, + "learning_rate": 3.903260931047917e-06, + "loss": 1.7703, + "step": 22905 + }, + { + "epoch": 4.1, + "grad_norm": 1.1620838642120361, + "learning_rate": 3.895720334658676e-06, + "loss": 1.6102, + "step": 22910 + }, + { + "epoch": 4.1, + "grad_norm": 0.9558453559875488, + "learning_rate": 3.888186413532799e-06, + "loss": 1.9861, + "step": 22915 + }, + { + "epoch": 4.1, + "grad_norm": 0.9614786505699158, + "learning_rate": 3.880659170053253e-06, + "loss": 1.8458, + "step": 22920 + }, + { + "epoch": 4.1, + "grad_norm": 0.40708935260772705, + "learning_rate": 3.87313860660089e-06, + "loss": 1.845, + "step": 22925 + }, + { + "epoch": 4.1, + "grad_norm": 0.8563295006752014, + "learning_rate": 3.865624725554465e-06, + "loss": 1.598, + "step": 22930 + }, + { + "epoch": 4.11, + "grad_norm": 2.2305939197540283, + "learning_rate": 3.8581175292906084e-06, + "loss": 1.9656, + "step": 22935 + }, + { + "epoch": 4.11, + "grad_norm": 0.7936108112335205, + "learning_rate": 3.850617020183845e-06, + "loss": 1.9327, + "step": 22940 + }, + { + "epoch": 4.11, + "grad_norm": 1.0209077596664429, + "learning_rate": 3.843123200606577e-06, + "loss": 2.0603, + "step": 22945 + }, + { + "epoch": 4.11, + "grad_norm": 1.030806541442871, + "learning_rate": 3.835636072929088e-06, + "loss": 1.7855, + "step": 22950 + }, + { + "epoch": 4.11, + "grad_norm": 0.9423354864120483, + "learning_rate": 3.828155639519551e-06, + "loss": 1.9576, + "step": 22955 + }, + { + "epoch": 4.11, + "grad_norm": 1.0258960723876953, + "learning_rate": 3.82068190274402e-06, + "loss": 1.9906, + "step": 22960 + }, + { + "epoch": 4.11, + "grad_norm": 0.5299312472343445, + "learning_rate": 3.81321486496643e-06, + "loss": 1.8934, + "step": 22965 + }, + { + "epoch": 4.11, + "grad_norm": 1.3776746988296509, + "learning_rate": 3.8057545285486053e-06, + "loss": 1.9732, + "step": 22970 + }, + { + "epoch": 4.11, + "grad_norm": 0.8569226264953613, + "learning_rate": 3.798300895850232e-06, + "loss": 1.8981, + "step": 22975 + }, + { + "epoch": 4.11, + "grad_norm": 0.8730267882347107, + "learning_rate": 3.790853969228897e-06, + "loss": 2.07, + "step": 22980 + }, + { + "epoch": 4.11, + "grad_norm": 0.7692422270774841, + "learning_rate": 3.7834137510400465e-06, + "loss": 1.5263, + "step": 22985 + }, + { + "epoch": 4.12, + "grad_norm": 0.8507816195487976, + "learning_rate": 3.7759802436370174e-06, + "loss": 1.905, + "step": 22990 + }, + { + "epoch": 4.12, + "grad_norm": 0.851302444934845, + "learning_rate": 3.7685534493710235e-06, + "loss": 1.7006, + "step": 22995 + }, + { + "epoch": 4.12, + "grad_norm": 1.0782297849655151, + "learning_rate": 3.7611333705911523e-06, + "loss": 1.9236, + "step": 23000 + }, + { + "epoch": 4.12, + "grad_norm": 0.5770434141159058, + "learning_rate": 3.753720009644371e-06, + "loss": 1.9372, + "step": 23005 + }, + { + "epoch": 4.12, + "grad_norm": 0.7802814841270447, + "learning_rate": 3.7463133688755066e-06, + "loss": 1.8211, + "step": 23010 + }, + { + "epoch": 4.12, + "grad_norm": 1.3625346422195435, + "learning_rate": 3.7389134506272903e-06, + "loss": 1.8262, + "step": 23015 + }, + { + "epoch": 4.12, + "grad_norm": 0.8293248414993286, + "learning_rate": 3.7315202572402896e-06, + "loss": 1.7531, + "step": 23020 + }, + { + "epoch": 4.12, + "grad_norm": 0.6201703548431396, + "learning_rate": 3.724133791052975e-06, + "loss": 1.9715, + "step": 23025 + }, + { + "epoch": 4.12, + "grad_norm": 1.0974987745285034, + "learning_rate": 3.7167540544016775e-06, + "loss": 2.0178, + "step": 23030 + }, + { + "epoch": 4.12, + "grad_norm": 0.7133528590202332, + "learning_rate": 3.709381049620608e-06, + "loss": 1.6524, + "step": 23035 + }, + { + "epoch": 4.12, + "grad_norm": 0.9783563017845154, + "learning_rate": 3.7020147790418263e-06, + "loss": 2.1077, + "step": 23040 + }, + { + "epoch": 4.12, + "grad_norm": 0.7139909863471985, + "learning_rate": 3.6946552449952847e-06, + "loss": 1.8681, + "step": 23045 + }, + { + "epoch": 4.13, + "grad_norm": 2.149613380432129, + "learning_rate": 3.6873024498087938e-06, + "loss": 1.9458, + "step": 23050 + }, + { + "epoch": 4.13, + "grad_norm": 1.5502854585647583, + "learning_rate": 3.67995639580804e-06, + "loss": 2.0566, + "step": 23055 + }, + { + "epoch": 4.13, + "grad_norm": 0.7524001598358154, + "learning_rate": 3.6726170853165763e-06, + "loss": 1.4629, + "step": 23060 + }, + { + "epoch": 4.13, + "grad_norm": 0.4955444633960724, + "learning_rate": 3.665284520655807e-06, + "loss": 1.5796, + "step": 23065 + }, + { + "epoch": 4.13, + "grad_norm": 0.7171782851219177, + "learning_rate": 3.6579587041450286e-06, + "loss": 2.1074, + "step": 23070 + }, + { + "epoch": 4.13, + "grad_norm": 0.5848670601844788, + "learning_rate": 3.6506396381013804e-06, + "loss": 1.8271, + "step": 23075 + }, + { + "epoch": 4.13, + "grad_norm": 1.1107635498046875, + "learning_rate": 3.6433273248398763e-06, + "loss": 1.6248, + "step": 23080 + }, + { + "epoch": 4.13, + "grad_norm": 1.556101679801941, + "learning_rate": 3.6360217666733975e-06, + "loss": 1.8316, + "step": 23085 + }, + { + "epoch": 4.13, + "grad_norm": 1.174280047416687, + "learning_rate": 3.628722965912687e-06, + "loss": 1.6224, + "step": 23090 + }, + { + "epoch": 4.13, + "grad_norm": 0.5544099807739258, + "learning_rate": 3.621430924866348e-06, + "loss": 2.1943, + "step": 23095 + }, + { + "epoch": 4.13, + "grad_norm": 0.6023959517478943, + "learning_rate": 3.6141456458408383e-06, + "loss": 1.9006, + "step": 23100 + }, + { + "epoch": 4.14, + "grad_norm": 0.9502812027931213, + "learning_rate": 3.6068671311404927e-06, + "loss": 1.6577, + "step": 23105 + }, + { + "epoch": 4.14, + "grad_norm": 0.8921631574630737, + "learning_rate": 3.5995953830675e-06, + "loss": 1.1959, + "step": 23110 + }, + { + "epoch": 4.14, + "grad_norm": 0.8794203996658325, + "learning_rate": 3.592330403921898e-06, + "loss": 1.6737, + "step": 23115 + }, + { + "epoch": 4.14, + "grad_norm": 0.5600360035896301, + "learning_rate": 3.585072196001599e-06, + "loss": 2.1309, + "step": 23120 + }, + { + "epoch": 4.14, + "grad_norm": 1.8456734418869019, + "learning_rate": 3.5778207616023684e-06, + "loss": 2.0503, + "step": 23125 + }, + { + "epoch": 4.14, + "grad_norm": 0.7940518856048584, + "learning_rate": 3.5705761030178224e-06, + "loss": 1.8708, + "step": 23130 + }, + { + "epoch": 4.14, + "grad_norm": 1.4641777276992798, + "learning_rate": 3.563338222539442e-06, + "loss": 1.7284, + "step": 23135 + }, + { + "epoch": 4.14, + "grad_norm": 0.6386775374412537, + "learning_rate": 3.5561071224565617e-06, + "loss": 1.8958, + "step": 23140 + }, + { + "epoch": 4.14, + "grad_norm": 0.9825034737586975, + "learning_rate": 3.548882805056372e-06, + "loss": 1.6577, + "step": 23145 + }, + { + "epoch": 4.14, + "grad_norm": 0.915550172328949, + "learning_rate": 3.541665272623923e-06, + "loss": 1.6564, + "step": 23150 + }, + { + "epoch": 4.14, + "grad_norm": 0.926216185092926, + "learning_rate": 3.5344545274421077e-06, + "loss": 1.9108, + "step": 23155 + }, + { + "epoch": 4.15, + "grad_norm": 1.014319896697998, + "learning_rate": 3.527250571791674e-06, + "loss": 1.7595, + "step": 23160 + }, + { + "epoch": 4.15, + "grad_norm": 0.9334921836853027, + "learning_rate": 3.5200534079512295e-06, + "loss": 1.7663, + "step": 23165 + }, + { + "epoch": 4.15, + "grad_norm": 0.9318943619728088, + "learning_rate": 3.512863038197231e-06, + "loss": 2.109, + "step": 23170 + }, + { + "epoch": 4.15, + "grad_norm": 0.7537411451339722, + "learning_rate": 3.505679464803985e-06, + "loss": 1.7133, + "step": 23175 + }, + { + "epoch": 4.15, + "grad_norm": 0.6408186554908752, + "learning_rate": 3.498502690043651e-06, + "loss": 1.7261, + "step": 23180 + }, + { + "epoch": 4.15, + "grad_norm": 0.8619161248207092, + "learning_rate": 3.491332716186238e-06, + "loss": 1.7955, + "step": 23185 + }, + { + "epoch": 4.15, + "grad_norm": 0.9069615006446838, + "learning_rate": 3.4841695454995936e-06, + "loss": 1.9615, + "step": 23190 + }, + { + "epoch": 4.15, + "grad_norm": 0.5599504113197327, + "learning_rate": 3.4770131802494278e-06, + "loss": 1.8248, + "step": 23195 + }, + { + "epoch": 4.15, + "grad_norm": 1.420801043510437, + "learning_rate": 3.469863622699293e-06, + "loss": 1.7244, + "step": 23200 + }, + { + "epoch": 4.15, + "grad_norm": 0.7983185052871704, + "learning_rate": 3.462720875110584e-06, + "loss": 2.053, + "step": 23205 + }, + { + "epoch": 4.15, + "grad_norm": 0.9097558856010437, + "learning_rate": 3.45558493974254e-06, + "loss": 1.8446, + "step": 23210 + }, + { + "epoch": 4.16, + "grad_norm": 0.5858725905418396, + "learning_rate": 3.448455818852267e-06, + "loss": 2.001, + "step": 23215 + }, + { + "epoch": 4.16, + "grad_norm": 1.1941709518432617, + "learning_rate": 3.4413335146946807e-06, + "loss": 1.8848, + "step": 23220 + }, + { + "epoch": 4.16, + "grad_norm": 0.5961235761642456, + "learning_rate": 3.434218029522568e-06, + "loss": 1.7481, + "step": 23225 + }, + { + "epoch": 4.16, + "grad_norm": 1.2655341625213623, + "learning_rate": 3.4271093655865476e-06, + "loss": 1.5217, + "step": 23230 + }, + { + "epoch": 4.16, + "grad_norm": 0.8132046461105347, + "learning_rate": 3.4200075251350826e-06, + "loss": 1.5498, + "step": 23235 + }, + { + "epoch": 4.16, + "grad_norm": 0.48109278082847595, + "learning_rate": 3.41291251041449e-06, + "loss": 1.7054, + "step": 23240 + }, + { + "epoch": 4.16, + "grad_norm": 0.6113245487213135, + "learning_rate": 3.4058243236688993e-06, + "loss": 1.7679, + "step": 23245 + }, + { + "epoch": 4.16, + "grad_norm": 1.200701117515564, + "learning_rate": 3.3987429671403088e-06, + "loss": 1.8985, + "step": 23250 + }, + { + "epoch": 4.16, + "grad_norm": 0.7015065550804138, + "learning_rate": 3.391668443068535e-06, + "loss": 1.9091, + "step": 23255 + }, + { + "epoch": 4.16, + "grad_norm": 1.0896352529525757, + "learning_rate": 3.3846007536912473e-06, + "loss": 1.5577, + "step": 23260 + }, + { + "epoch": 4.16, + "grad_norm": 1.282090425491333, + "learning_rate": 3.3775399012439506e-06, + "loss": 1.7736, + "step": 23265 + }, + { + "epoch": 4.17, + "grad_norm": 0.48688602447509766, + "learning_rate": 3.3704858879599843e-06, + "loss": 1.6715, + "step": 23270 + }, + { + "epoch": 4.17, + "grad_norm": 0.6023693084716797, + "learning_rate": 3.3634387160705325e-06, + "loss": 1.7387, + "step": 23275 + }, + { + "epoch": 4.17, + "grad_norm": 1.431891679763794, + "learning_rate": 3.356398387804596e-06, + "loss": 1.6708, + "step": 23280 + }, + { + "epoch": 4.17, + "grad_norm": 0.6251328587532043, + "learning_rate": 3.3493649053890326e-06, + "loss": 1.8878, + "step": 23285 + }, + { + "epoch": 4.17, + "grad_norm": 1.2962218523025513, + "learning_rate": 3.342338271048526e-06, + "loss": 1.6701, + "step": 23290 + }, + { + "epoch": 4.17, + "grad_norm": 0.9591766595840454, + "learning_rate": 3.3353184870055954e-06, + "loss": 1.78, + "step": 23295 + }, + { + "epoch": 4.17, + "grad_norm": 1.259443759918213, + "learning_rate": 3.3283055554805848e-06, + "loss": 1.6922, + "step": 23300 + }, + { + "epoch": 4.17, + "grad_norm": 1.0876199007034302, + "learning_rate": 3.3212994786916836e-06, + "loss": 1.9445, + "step": 23305 + }, + { + "epoch": 4.17, + "grad_norm": 2.2998528480529785, + "learning_rate": 3.314300258854902e-06, + "loss": 1.7383, + "step": 23310 + }, + { + "epoch": 4.17, + "grad_norm": 1.3563592433929443, + "learning_rate": 3.307307898184089e-06, + "loss": 1.8597, + "step": 23315 + }, + { + "epoch": 4.17, + "grad_norm": 0.7580876350402832, + "learning_rate": 3.3003223988909234e-06, + "loss": 1.7143, + "step": 23320 + }, + { + "epoch": 4.18, + "grad_norm": 0.7381443381309509, + "learning_rate": 3.2933437631849085e-06, + "loss": 1.8132, + "step": 23325 + }, + { + "epoch": 4.18, + "grad_norm": 0.7825798988342285, + "learning_rate": 3.2863719932733904e-06, + "loss": 1.8894, + "step": 23330 + }, + { + "epoch": 4.18, + "grad_norm": 0.96683269739151, + "learning_rate": 3.2794070913615165e-06, + "loss": 1.8698, + "step": 23335 + }, + { + "epoch": 4.18, + "grad_norm": 0.9194793701171875, + "learning_rate": 3.2724490596522937e-06, + "loss": 1.8079, + "step": 23340 + }, + { + "epoch": 4.18, + "grad_norm": 0.9263105988502502, + "learning_rate": 3.2654979003465313e-06, + "loss": 1.6324, + "step": 23345 + }, + { + "epoch": 4.18, + "grad_norm": 0.6331471800804138, + "learning_rate": 3.258553615642873e-06, + "loss": 1.9431, + "step": 23350 + }, + { + "epoch": 4.18, + "grad_norm": 0.6416884064674377, + "learning_rate": 3.2516162077377953e-06, + "loss": 1.865, + "step": 23355 + }, + { + "epoch": 4.18, + "grad_norm": 0.7806147933006287, + "learning_rate": 3.2446856788255958e-06, + "loss": 1.8503, + "step": 23360 + }, + { + "epoch": 4.18, + "grad_norm": 0.5219438076019287, + "learning_rate": 3.2377620310983873e-06, + "loss": 1.8821, + "step": 23365 + }, + { + "epoch": 4.18, + "grad_norm": 0.8223024606704712, + "learning_rate": 3.230845266746113e-06, + "loss": 1.6572, + "step": 23370 + }, + { + "epoch": 4.18, + "grad_norm": 0.9462054967880249, + "learning_rate": 3.223935387956545e-06, + "loss": 1.7214, + "step": 23375 + }, + { + "epoch": 4.18, + "grad_norm": 0.538049042224884, + "learning_rate": 3.217032396915265e-06, + "loss": 1.751, + "step": 23380 + }, + { + "epoch": 4.19, + "grad_norm": 0.767585813999176, + "learning_rate": 3.210136295805685e-06, + "loss": 1.6549, + "step": 23385 + }, + { + "epoch": 4.19, + "grad_norm": 0.9927555322647095, + "learning_rate": 3.2032470868090414e-06, + "loss": 1.7279, + "step": 23390 + }, + { + "epoch": 4.19, + "grad_norm": 0.8647646903991699, + "learning_rate": 3.196364772104379e-06, + "loss": 1.6213, + "step": 23395 + }, + { + "epoch": 4.19, + "grad_norm": 0.8548038005828857, + "learning_rate": 3.189489353868563e-06, + "loss": 2.1221, + "step": 23400 + }, + { + "epoch": 4.19, + "grad_norm": 0.7027987241744995, + "learning_rate": 3.182620834276284e-06, + "loss": 2.0151, + "step": 23405 + }, + { + "epoch": 4.19, + "grad_norm": 0.8726295828819275, + "learning_rate": 3.1757592155000486e-06, + "loss": 1.8991, + "step": 23410 + }, + { + "epoch": 4.19, + "grad_norm": 0.5377846360206604, + "learning_rate": 3.168904499710182e-06, + "loss": 2.0556, + "step": 23415 + }, + { + "epoch": 4.19, + "grad_norm": 0.492355078458786, + "learning_rate": 3.162056689074827e-06, + "loss": 1.737, + "step": 23420 + }, + { + "epoch": 4.19, + "grad_norm": 0.7007409930229187, + "learning_rate": 3.1552157857599324e-06, + "loss": 1.9326, + "step": 23425 + }, + { + "epoch": 4.19, + "grad_norm": 0.9527152180671692, + "learning_rate": 3.1483817919292726e-06, + "loss": 2.0368, + "step": 23430 + }, + { + "epoch": 4.19, + "grad_norm": 2.5235965251922607, + "learning_rate": 3.1415547097444374e-06, + "loss": 1.7703, + "step": 23435 + }, + { + "epoch": 4.2, + "grad_norm": 0.6163821816444397, + "learning_rate": 3.134734541364817e-06, + "loss": 1.7116, + "step": 23440 + }, + { + "epoch": 4.2, + "grad_norm": 0.6082310080528259, + "learning_rate": 3.12792128894763e-06, + "loss": 1.7943, + "step": 23445 + }, + { + "epoch": 4.2, + "grad_norm": 0.7631220817565918, + "learning_rate": 3.121114954647908e-06, + "loss": 2.0513, + "step": 23450 + }, + { + "epoch": 4.2, + "grad_norm": 0.9659873843193054, + "learning_rate": 3.114315540618473e-06, + "loss": 1.911, + "step": 23455 + }, + { + "epoch": 4.2, + "grad_norm": 0.7039878368377686, + "learning_rate": 3.107523049009983e-06, + "loss": 1.9678, + "step": 23460 + }, + { + "epoch": 4.2, + "grad_norm": 0.5036177039146423, + "learning_rate": 3.1007374819708947e-06, + "loss": 1.941, + "step": 23465 + }, + { + "epoch": 4.2, + "grad_norm": 0.522300124168396, + "learning_rate": 3.0939588416474762e-06, + "loss": 1.882, + "step": 23470 + }, + { + "epoch": 4.2, + "grad_norm": 1.4440438747406006, + "learning_rate": 3.0871871301838053e-06, + "loss": 1.4968, + "step": 23475 + }, + { + "epoch": 4.2, + "grad_norm": 0.7592721581459045, + "learning_rate": 3.0804223497217706e-06, + "loss": 1.8058, + "step": 23480 + }, + { + "epoch": 4.2, + "grad_norm": 1.0022876262664795, + "learning_rate": 3.0736645024010664e-06, + "loss": 1.7875, + "step": 23485 + }, + { + "epoch": 4.2, + "grad_norm": 0.6901012063026428, + "learning_rate": 3.066913590359183e-06, + "loss": 1.7997, + "step": 23490 + }, + { + "epoch": 4.21, + "grad_norm": 0.9948673844337463, + "learning_rate": 3.0601696157314315e-06, + "loss": 1.9634, + "step": 23495 + }, + { + "epoch": 4.21, + "grad_norm": 0.49334949254989624, + "learning_rate": 3.0534325806509307e-06, + "loss": 1.8824, + "step": 23500 + }, + { + "epoch": 4.21, + "grad_norm": 0.7687468528747559, + "learning_rate": 3.046702487248593e-06, + "loss": 1.7471, + "step": 23505 + }, + { + "epoch": 4.21, + "grad_norm": 0.5290055274963379, + "learning_rate": 3.0399793376531484e-06, + "loss": 1.7987, + "step": 23510 + }, + { + "epoch": 4.21, + "grad_norm": 0.5497667789459229, + "learning_rate": 3.03326313399111e-06, + "loss": 1.8059, + "step": 23515 + }, + { + "epoch": 4.21, + "grad_norm": 1.2975267171859741, + "learning_rate": 3.0265538783868117e-06, + "loss": 1.8914, + "step": 23520 + }, + { + "epoch": 4.21, + "grad_norm": 1.4567995071411133, + "learning_rate": 3.01985157296239e-06, + "loss": 1.7871, + "step": 23525 + }, + { + "epoch": 4.21, + "grad_norm": 1.1147876977920532, + "learning_rate": 3.013156219837776e-06, + "loss": 1.9195, + "step": 23530 + }, + { + "epoch": 4.21, + "grad_norm": 0.48365309834480286, + "learning_rate": 3.006467821130696e-06, + "loss": 1.8373, + "step": 23535 + }, + { + "epoch": 4.21, + "grad_norm": 0.6062785983085632, + "learning_rate": 2.9997863789566953e-06, + "loss": 1.726, + "step": 23540 + }, + { + "epoch": 4.21, + "grad_norm": 1.1633416414260864, + "learning_rate": 2.993111895429093e-06, + "loss": 1.7636, + "step": 23545 + }, + { + "epoch": 4.22, + "grad_norm": 0.9300481677055359, + "learning_rate": 2.9864443726590335e-06, + "loss": 1.6565, + "step": 23550 + }, + { + "epoch": 4.22, + "grad_norm": 1.1455543041229248, + "learning_rate": 2.9797838127554433e-06, + "loss": 2.0796, + "step": 23555 + }, + { + "epoch": 4.22, + "grad_norm": 0.33199450373649597, + "learning_rate": 2.9731302178250543e-06, + "loss": 2.0225, + "step": 23560 + }, + { + "epoch": 4.22, + "grad_norm": 0.6419559717178345, + "learning_rate": 2.966483589972394e-06, + "loss": 1.9895, + "step": 23565 + }, + { + "epoch": 4.22, + "grad_norm": 0.8773286938667297, + "learning_rate": 2.9598439312997745e-06, + "loss": 1.6463, + "step": 23570 + }, + { + "epoch": 4.22, + "grad_norm": 1.4193726778030396, + "learning_rate": 2.953211243907325e-06, + "loss": 1.9775, + "step": 23575 + }, + { + "epoch": 4.22, + "grad_norm": 0.9343559741973877, + "learning_rate": 2.946585529892951e-06, + "loss": 1.7366, + "step": 23580 + }, + { + "epoch": 4.22, + "grad_norm": 0.9293035268783569, + "learning_rate": 2.9399667913523582e-06, + "loss": 1.8817, + "step": 23585 + }, + { + "epoch": 4.22, + "grad_norm": 0.7766603827476501, + "learning_rate": 2.933355030379048e-06, + "loss": 1.904, + "step": 23590 + }, + { + "epoch": 4.22, + "grad_norm": 0.7661356329917908, + "learning_rate": 2.9267502490643194e-06, + "loss": 1.5144, + "step": 23595 + }, + { + "epoch": 4.22, + "grad_norm": 1.3593940734863281, + "learning_rate": 2.92015244949726e-06, + "loss": 1.9262, + "step": 23600 + }, + { + "epoch": 4.23, + "grad_norm": 1.014451265335083, + "learning_rate": 2.9135616337647386e-06, + "loss": 1.6614, + "step": 23605 + }, + { + "epoch": 4.23, + "grad_norm": 1.7844657897949219, + "learning_rate": 2.906977803951427e-06, + "loss": 1.8037, + "step": 23610 + }, + { + "epoch": 4.23, + "grad_norm": 1.5160473585128784, + "learning_rate": 2.9004009621397847e-06, + "loss": 1.6912, + "step": 23615 + }, + { + "epoch": 4.23, + "grad_norm": 1.6145546436309814, + "learning_rate": 2.893831110410067e-06, + "loss": 1.9369, + "step": 23620 + }, + { + "epoch": 4.23, + "grad_norm": 1.6145838499069214, + "learning_rate": 2.8872682508403e-06, + "loss": 1.9497, + "step": 23625 + }, + { + "epoch": 4.23, + "grad_norm": 2.104137659072876, + "learning_rate": 2.880712385506318e-06, + "loss": 1.916, + "step": 23630 + }, + { + "epoch": 4.23, + "grad_norm": 0.9931949973106384, + "learning_rate": 2.8741635164817315e-06, + "loss": 1.6568, + "step": 23635 + }, + { + "epoch": 4.23, + "grad_norm": 1.3967357873916626, + "learning_rate": 2.8676216458379396e-06, + "loss": 1.5115, + "step": 23640 + }, + { + "epoch": 4.23, + "grad_norm": 0.5755307078361511, + "learning_rate": 2.861086775644134e-06, + "loss": 1.8759, + "step": 23645 + }, + { + "epoch": 4.23, + "grad_norm": 0.9810478091239929, + "learning_rate": 2.8545589079672865e-06, + "loss": 1.6617, + "step": 23650 + }, + { + "epoch": 4.23, + "grad_norm": 0.7798149585723877, + "learning_rate": 2.848038044872159e-06, + "loss": 1.6842, + "step": 23655 + }, + { + "epoch": 4.24, + "grad_norm": 1.8288719654083252, + "learning_rate": 2.8415241884212853e-06, + "loss": 1.7013, + "step": 23660 + }, + { + "epoch": 4.24, + "grad_norm": 1.104956030845642, + "learning_rate": 2.8350173406749973e-06, + "loss": 1.8724, + "step": 23665 + }, + { + "epoch": 4.24, + "grad_norm": 1.367289423942566, + "learning_rate": 2.8285175036914107e-06, + "loss": 1.7069, + "step": 23670 + }, + { + "epoch": 4.24, + "grad_norm": 1.1730290651321411, + "learning_rate": 2.8220246795264057e-06, + "loss": 1.851, + "step": 23675 + }, + { + "epoch": 4.24, + "grad_norm": 1.0620871782302856, + "learning_rate": 2.815538870233661e-06, + "loss": 1.7259, + "step": 23680 + }, + { + "epoch": 4.24, + "grad_norm": 1.1758960485458374, + "learning_rate": 2.8090600778646362e-06, + "loss": 1.7401, + "step": 23685 + }, + { + "epoch": 4.24, + "grad_norm": 1.1776374578475952, + "learning_rate": 2.8025883044685646e-06, + "loss": 1.9266, + "step": 23690 + }, + { + "epoch": 4.24, + "grad_norm": 0.8739143013954163, + "learning_rate": 2.7961235520924584e-06, + "loss": 1.9129, + "step": 23695 + }, + { + "epoch": 4.24, + "grad_norm": 0.8561976552009583, + "learning_rate": 2.789665822781115e-06, + "loss": 2.0287, + "step": 23700 + }, + { + "epoch": 4.24, + "grad_norm": 0.6630309224128723, + "learning_rate": 2.7832151185771093e-06, + "loss": 1.8712, + "step": 23705 + }, + { + "epoch": 4.24, + "grad_norm": 0.8686838150024414, + "learning_rate": 2.7767714415207896e-06, + "loss": 1.9666, + "step": 23710 + }, + { + "epoch": 4.24, + "grad_norm": 1.0738399028778076, + "learning_rate": 2.770334793650292e-06, + "loss": 1.7293, + "step": 23715 + }, + { + "epoch": 4.25, + "grad_norm": 0.6418025493621826, + "learning_rate": 2.763905177001519e-06, + "loss": 1.5567, + "step": 23720 + }, + { + "epoch": 4.25, + "grad_norm": 0.8317776322364807, + "learning_rate": 2.757482593608143e-06, + "loss": 2.0659, + "step": 23725 + }, + { + "epoch": 4.25, + "grad_norm": 1.3214927911758423, + "learning_rate": 2.751067045501629e-06, + "loss": 1.9451, + "step": 23730 + }, + { + "epoch": 4.25, + "grad_norm": 1.1789897680282593, + "learning_rate": 2.7446585347112086e-06, + "loss": 1.7314, + "step": 23735 + }, + { + "epoch": 4.25, + "grad_norm": 0.8994229435920715, + "learning_rate": 2.7382570632638854e-06, + "loss": 1.8044, + "step": 23740 + }, + { + "epoch": 4.25, + "grad_norm": 1.0152734518051147, + "learning_rate": 2.7318626331844456e-06, + "loss": 1.7334, + "step": 23745 + }, + { + "epoch": 4.25, + "grad_norm": 1.6840885877609253, + "learning_rate": 2.725475246495432e-06, + "loss": 1.9423, + "step": 23750 + }, + { + "epoch": 4.25, + "grad_norm": 1.69658362865448, + "learning_rate": 2.7190949052171703e-06, + "loss": 1.6838, + "step": 23755 + }, + { + "epoch": 4.25, + "grad_norm": 0.9725009799003601, + "learning_rate": 2.7127216113677635e-06, + "loss": 1.9143, + "step": 23760 + }, + { + "epoch": 4.25, + "grad_norm": 0.49594831466674805, + "learning_rate": 2.7063553669630702e-06, + "loss": 1.8105, + "step": 23765 + }, + { + "epoch": 4.25, + "grad_norm": 1.0006866455078125, + "learning_rate": 2.6999961740167305e-06, + "loss": 1.8241, + "step": 23770 + }, + { + "epoch": 4.26, + "grad_norm": 0.8839683532714844, + "learning_rate": 2.6936440345401493e-06, + "loss": 1.7172, + "step": 23775 + }, + { + "epoch": 4.26, + "grad_norm": 0.9161496162414551, + "learning_rate": 2.6872989505425105e-06, + "loss": 1.8713, + "step": 23780 + }, + { + "epoch": 4.26, + "grad_norm": 0.7400037050247192, + "learning_rate": 2.6809609240307456e-06, + "loss": 1.6108, + "step": 23785 + }, + { + "epoch": 4.26, + "grad_norm": 1.0765010118484497, + "learning_rate": 2.6746299570095722e-06, + "loss": 2.0275, + "step": 23790 + }, + { + "epoch": 4.26, + "grad_norm": 1.344102144241333, + "learning_rate": 2.66830605148147e-06, + "loss": 1.6618, + "step": 23795 + }, + { + "epoch": 4.26, + "grad_norm": 2.3063645362854004, + "learning_rate": 2.661989209446683e-06, + "loss": 1.5832, + "step": 23800 + }, + { + "epoch": 4.26, + "grad_norm": 0.9692860245704651, + "learning_rate": 2.655679432903227e-06, + "loss": 1.9494, + "step": 23805 + }, + { + "epoch": 4.26, + "grad_norm": 1.3169384002685547, + "learning_rate": 2.649376723846875e-06, + "loss": 1.7188, + "step": 23810 + }, + { + "epoch": 4.26, + "grad_norm": 0.763640820980072, + "learning_rate": 2.6430810842711595e-06, + "loss": 1.8531, + "step": 23815 + }, + { + "epoch": 4.26, + "grad_norm": 0.6789270639419556, + "learning_rate": 2.636792516167394e-06, + "loss": 1.6805, + "step": 23820 + }, + { + "epoch": 4.26, + "grad_norm": 2.9606266021728516, + "learning_rate": 2.6305110215246455e-06, + "loss": 1.6875, + "step": 23825 + }, + { + "epoch": 4.27, + "grad_norm": 0.8681931495666504, + "learning_rate": 2.624236602329744e-06, + "loss": 1.8095, + "step": 23830 + }, + { + "epoch": 4.27, + "grad_norm": 0.6462576985359192, + "learning_rate": 2.6179692605672877e-06, + "loss": 1.8016, + "step": 23835 + }, + { + "epoch": 4.27, + "grad_norm": 0.38575172424316406, + "learning_rate": 2.6117089982196197e-06, + "loss": 1.9387, + "step": 23840 + }, + { + "epoch": 4.27, + "grad_norm": 0.7060482501983643, + "learning_rate": 2.6054558172668607e-06, + "loss": 1.691, + "step": 23845 + }, + { + "epoch": 4.27, + "grad_norm": 0.779434859752655, + "learning_rate": 2.5992097196868847e-06, + "loss": 1.6565, + "step": 23850 + }, + { + "epoch": 4.27, + "grad_norm": 4.394678115844727, + "learning_rate": 2.5929707074553363e-06, + "loss": 1.7052, + "step": 23855 + }, + { + "epoch": 4.27, + "grad_norm": 0.5779292583465576, + "learning_rate": 2.5867387825455917e-06, + "loss": 1.861, + "step": 23860 + }, + { + "epoch": 4.27, + "grad_norm": 0.7862660884857178, + "learning_rate": 2.580513946928817e-06, + "loss": 1.8878, + "step": 23865 + }, + { + "epoch": 4.27, + "grad_norm": 1.2498642206192017, + "learning_rate": 2.5742962025739115e-06, + "loss": 1.8267, + "step": 23870 + }, + { + "epoch": 4.27, + "grad_norm": 0.6954331398010254, + "learning_rate": 2.5680855514475466e-06, + "loss": 1.8246, + "step": 23875 + }, + { + "epoch": 4.27, + "grad_norm": 0.8826925754547119, + "learning_rate": 2.5618819955141456e-06, + "loss": 1.7513, + "step": 23880 + }, + { + "epoch": 4.28, + "grad_norm": 1.086248517036438, + "learning_rate": 2.5556855367358857e-06, + "loss": 1.6789, + "step": 23885 + }, + { + "epoch": 4.28, + "grad_norm": 0.739668071269989, + "learning_rate": 2.549496177072702e-06, + "loss": 1.6101, + "step": 23890 + }, + { + "epoch": 4.28, + "grad_norm": 1.1162097454071045, + "learning_rate": 2.5433139184822876e-06, + "loss": 1.8692, + "step": 23895 + }, + { + "epoch": 4.28, + "grad_norm": 0.627552330493927, + "learning_rate": 2.53713876292008e-06, + "loss": 1.9747, + "step": 23900 + }, + { + "epoch": 4.28, + "grad_norm": 1.044128179550171, + "learning_rate": 2.5309707123392727e-06, + "loss": 1.7328, + "step": 23905 + }, + { + "epoch": 4.28, + "grad_norm": 1.0667206048965454, + "learning_rate": 2.524809768690814e-06, + "loss": 1.8482, + "step": 23910 + }, + { + "epoch": 4.28, + "grad_norm": 0.584099292755127, + "learning_rate": 2.5186559339234085e-06, + "loss": 1.7584, + "step": 23915 + }, + { + "epoch": 4.28, + "grad_norm": 1.1192121505737305, + "learning_rate": 2.5125092099835047e-06, + "loss": 1.5993, + "step": 23920 + }, + { + "epoch": 4.28, + "grad_norm": 0.9891035556793213, + "learning_rate": 2.506369598815314e-06, + "loss": 1.6044, + "step": 23925 + }, + { + "epoch": 4.28, + "grad_norm": 0.9022369384765625, + "learning_rate": 2.500237102360778e-06, + "loss": 1.5102, + "step": 23930 + }, + { + "epoch": 4.28, + "grad_norm": 1.2190988063812256, + "learning_rate": 2.4941117225596054e-06, + "loss": 1.8696, + "step": 23935 + }, + { + "epoch": 4.29, + "grad_norm": 1.111484408378601, + "learning_rate": 2.4879934613492444e-06, + "loss": 1.9025, + "step": 23940 + }, + { + "epoch": 4.29, + "grad_norm": 1.1540158987045288, + "learning_rate": 2.4818823206649024e-06, + "loss": 1.9573, + "step": 23945 + }, + { + "epoch": 4.29, + "grad_norm": 0.7060664892196655, + "learning_rate": 2.475778302439524e-06, + "loss": 1.7662, + "step": 23950 + }, + { + "epoch": 4.29, + "grad_norm": 1.312042474746704, + "learning_rate": 2.469681408603802e-06, + "loss": 1.7862, + "step": 23955 + }, + { + "epoch": 4.29, + "grad_norm": 0.7066253423690796, + "learning_rate": 2.463591641086177e-06, + "loss": 1.8912, + "step": 23960 + }, + { + "epoch": 4.29, + "grad_norm": 0.7877931594848633, + "learning_rate": 2.4575090018128383e-06, + "loss": 1.8976, + "step": 23965 + }, + { + "epoch": 4.29, + "grad_norm": 1.185813307762146, + "learning_rate": 2.4514334927077167e-06, + "loss": 1.9366, + "step": 23970 + }, + { + "epoch": 4.29, + "grad_norm": 1.511576533317566, + "learning_rate": 2.445365115692494e-06, + "loss": 1.7834, + "step": 23975 + }, + { + "epoch": 4.29, + "grad_norm": 0.49797123670578003, + "learning_rate": 2.4393038726865907e-06, + "loss": 1.9477, + "step": 23980 + }, + { + "epoch": 4.29, + "grad_norm": 0.5764663815498352, + "learning_rate": 2.4332497656071764e-06, + "loss": 1.9526, + "step": 23985 + }, + { + "epoch": 4.29, + "grad_norm": 0.518380880355835, + "learning_rate": 2.427202796369152e-06, + "loss": 1.6421, + "step": 23990 + }, + { + "epoch": 4.29, + "grad_norm": 0.8762568831443787, + "learning_rate": 2.421162966885171e-06, + "loss": 1.8422, + "step": 23995 + }, + { + "epoch": 4.3, + "grad_norm": 0.7528052926063538, + "learning_rate": 2.415130279065625e-06, + "loss": 1.9284, + "step": 24000 + }, + { + "epoch": 4.3, + "grad_norm": 1.1197984218597412, + "learning_rate": 2.409104734818646e-06, + "loss": 1.8608, + "step": 24005 + }, + { + "epoch": 4.3, + "grad_norm": 0.6748790144920349, + "learning_rate": 2.4030863360501087e-06, + "loss": 1.6257, + "step": 24010 + }, + { + "epoch": 4.3, + "grad_norm": 0.5878778100013733, + "learning_rate": 2.397075084663636e-06, + "loss": 1.884, + "step": 24015 + }, + { + "epoch": 4.3, + "grad_norm": 0.6087291240692139, + "learning_rate": 2.391070982560564e-06, + "loss": 1.8711, + "step": 24020 + }, + { + "epoch": 4.3, + "grad_norm": 0.7970364689826965, + "learning_rate": 2.3850740316399946e-06, + "loss": 2.0835, + "step": 24025 + }, + { + "epoch": 4.3, + "grad_norm": 1.3959071636199951, + "learning_rate": 2.3790842337987564e-06, + "loss": 1.6191, + "step": 24030 + }, + { + "epoch": 4.3, + "grad_norm": 0.6767655611038208, + "learning_rate": 2.3731015909314176e-06, + "loss": 1.6902, + "step": 24035 + }, + { + "epoch": 4.3, + "grad_norm": 0.8441483378410339, + "learning_rate": 2.3671261049302865e-06, + "loss": 2.0472, + "step": 24040 + }, + { + "epoch": 4.3, + "grad_norm": 1.3324369192123413, + "learning_rate": 2.3611577776853966e-06, + "loss": 1.7657, + "step": 24045 + }, + { + "epoch": 4.3, + "grad_norm": 1.7804863452911377, + "learning_rate": 2.3551966110845203e-06, + "loss": 1.7996, + "step": 24050 + }, + { + "epoch": 4.31, + "grad_norm": 1.5548887252807617, + "learning_rate": 2.3492426070131747e-06, + "loss": 1.9328, + "step": 24055 + }, + { + "epoch": 4.31, + "grad_norm": 0.956676185131073, + "learning_rate": 2.3432957673546087e-06, + "loss": 1.9565, + "step": 24060 + }, + { + "epoch": 4.31, + "grad_norm": 0.9713985919952393, + "learning_rate": 2.3373560939897954e-06, + "loss": 1.65, + "step": 24065 + }, + { + "epoch": 4.31, + "grad_norm": 2.308432102203369, + "learning_rate": 2.3314235887974547e-06, + "loss": 1.9528, + "step": 24070 + }, + { + "epoch": 4.31, + "grad_norm": 1.1004524230957031, + "learning_rate": 2.3254982536540267e-06, + "loss": 1.617, + "step": 24075 + }, + { + "epoch": 4.31, + "grad_norm": 0.641258180141449, + "learning_rate": 2.3195800904336874e-06, + "loss": 1.6639, + "step": 24080 + }, + { + "epoch": 4.31, + "grad_norm": 1.2109898328781128, + "learning_rate": 2.3136691010083535e-06, + "loss": 1.8781, + "step": 24085 + }, + { + "epoch": 4.31, + "grad_norm": 0.9537338614463806, + "learning_rate": 2.3077652872476624e-06, + "loss": 1.925, + "step": 24090 + }, + { + "epoch": 4.31, + "grad_norm": 0.7373508214950562, + "learning_rate": 2.301868651018982e-06, + "loss": 1.8247, + "step": 24095 + }, + { + "epoch": 4.31, + "grad_norm": 1.0468422174453735, + "learning_rate": 2.2959791941874143e-06, + "loss": 1.6905, + "step": 24100 + }, + { + "epoch": 4.31, + "grad_norm": 0.8555766940116882, + "learning_rate": 2.2900969186157916e-06, + "loss": 2.035, + "step": 24105 + }, + { + "epoch": 4.32, + "grad_norm": 1.271920084953308, + "learning_rate": 2.2842218261646677e-06, + "loss": 1.887, + "step": 24110 + }, + { + "epoch": 4.32, + "grad_norm": 1.4928789138793945, + "learning_rate": 2.2783539186923274e-06, + "loss": 1.9741, + "step": 24115 + }, + { + "epoch": 4.32, + "grad_norm": 1.0468617677688599, + "learning_rate": 2.272493198054787e-06, + "loss": 1.6921, + "step": 24120 + }, + { + "epoch": 4.32, + "grad_norm": 0.6943525075912476, + "learning_rate": 2.2666396661057907e-06, + "loss": 2.0084, + "step": 24125 + }, + { + "epoch": 4.32, + "grad_norm": 0.5706409811973572, + "learning_rate": 2.2607933246968027e-06, + "loss": 2.0996, + "step": 24130 + }, + { + "epoch": 4.32, + "grad_norm": 1.1674983501434326, + "learning_rate": 2.2549541756770156e-06, + "loss": 1.8062, + "step": 24135 + }, + { + "epoch": 4.32, + "grad_norm": 1.248380422592163, + "learning_rate": 2.2491222208933376e-06, + "loss": 1.8164, + "step": 24140 + }, + { + "epoch": 4.32, + "grad_norm": 1.2721612453460693, + "learning_rate": 2.243297462190419e-06, + "loss": 1.6068, + "step": 24145 + }, + { + "epoch": 4.32, + "grad_norm": 1.1479939222335815, + "learning_rate": 2.2374799014106246e-06, + "loss": 1.8664, + "step": 24150 + }, + { + "epoch": 4.32, + "grad_norm": 0.7115007638931274, + "learning_rate": 2.2316695403940433e-06, + "loss": 2.029, + "step": 24155 + }, + { + "epoch": 4.32, + "grad_norm": 0.615996778011322, + "learning_rate": 2.2258663809784892e-06, + "loss": 1.8326, + "step": 24160 + }, + { + "epoch": 4.33, + "grad_norm": 0.7172175049781799, + "learning_rate": 2.2200704249994874e-06, + "loss": 1.791, + "step": 24165 + }, + { + "epoch": 4.33, + "grad_norm": 0.530174732208252, + "learning_rate": 2.2142816742903e-06, + "loss": 1.8528, + "step": 24170 + }, + { + "epoch": 4.33, + "grad_norm": 0.49278607964515686, + "learning_rate": 2.208500130681901e-06, + "loss": 1.9395, + "step": 24175 + }, + { + "epoch": 4.33, + "grad_norm": 0.946496307849884, + "learning_rate": 2.2027257960029914e-06, + "loss": 1.7902, + "step": 24180 + }, + { + "epoch": 4.33, + "grad_norm": 0.7843234539031982, + "learning_rate": 2.1969586720799852e-06, + "loss": 1.9402, + "step": 24185 + }, + { + "epoch": 4.33, + "grad_norm": 1.0197824239730835, + "learning_rate": 2.1911987607370123e-06, + "loss": 1.7697, + "step": 24190 + }, + { + "epoch": 4.33, + "grad_norm": 0.8750340342521667, + "learning_rate": 2.18544606379594e-06, + "loss": 1.9873, + "step": 24195 + }, + { + "epoch": 4.33, + "grad_norm": 1.0324968099594116, + "learning_rate": 2.1797005830763246e-06, + "loss": 1.8219, + "step": 24200 + }, + { + "epoch": 4.33, + "grad_norm": 1.028013825416565, + "learning_rate": 2.1739623203954674e-06, + "loss": 1.7537, + "step": 24205 + }, + { + "epoch": 4.33, + "grad_norm": 1.2253060340881348, + "learning_rate": 2.1682312775683723e-06, + "loss": 1.8392, + "step": 24210 + }, + { + "epoch": 4.33, + "grad_norm": 3.8781888484954834, + "learning_rate": 2.1625074564077612e-06, + "loss": 1.7971, + "step": 24215 + }, + { + "epoch": 4.34, + "grad_norm": 0.5331137776374817, + "learning_rate": 2.156790858724078e-06, + "loss": 1.9863, + "step": 24220 + }, + { + "epoch": 4.34, + "grad_norm": 0.9256309270858765, + "learning_rate": 2.1510814863254737e-06, + "loss": 2.0353, + "step": 24225 + }, + { + "epoch": 4.34, + "grad_norm": 0.932483971118927, + "learning_rate": 2.1453793410178196e-06, + "loss": 1.7526, + "step": 24230 + }, + { + "epoch": 4.34, + "grad_norm": 0.6007688641548157, + "learning_rate": 2.1396844246046903e-06, + "loss": 1.6332, + "step": 24235 + }, + { + "epoch": 4.34, + "grad_norm": 0.9822659492492676, + "learning_rate": 2.1339967388873895e-06, + "loss": 1.3974, + "step": 24240 + }, + { + "epoch": 4.34, + "grad_norm": 0.8610334992408752, + "learning_rate": 2.1283162856649273e-06, + "loss": 2.0177, + "step": 24245 + }, + { + "epoch": 4.34, + "grad_norm": 0.5514267086982727, + "learning_rate": 2.1226430667340285e-06, + "loss": 1.8909, + "step": 24250 + }, + { + "epoch": 4.34, + "grad_norm": 0.7719696164131165, + "learning_rate": 2.1169770838891155e-06, + "loss": 1.6996, + "step": 24255 + }, + { + "epoch": 4.34, + "grad_norm": 0.8033381700515747, + "learning_rate": 2.111318338922344e-06, + "loss": 1.7499, + "step": 24260 + }, + { + "epoch": 4.34, + "grad_norm": 1.272058367729187, + "learning_rate": 2.1056668336235622e-06, + "loss": 1.4928, + "step": 24265 + }, + { + "epoch": 4.34, + "grad_norm": 1.0982578992843628, + "learning_rate": 2.100022569780341e-06, + "loss": 1.8814, + "step": 24270 + }, + { + "epoch": 4.35, + "grad_norm": 0.8227397799491882, + "learning_rate": 2.09438554917796e-06, + "loss": 1.7183, + "step": 24275 + }, + { + "epoch": 4.35, + "grad_norm": 1.012026071548462, + "learning_rate": 2.0887557735993908e-06, + "loss": 1.7103, + "step": 24280 + }, + { + "epoch": 4.35, + "grad_norm": 1.1738848686218262, + "learning_rate": 2.0831332448253386e-06, + "loss": 1.6326, + "step": 24285 + }, + { + "epoch": 4.35, + "grad_norm": 0.9276083707809448, + "learning_rate": 2.077517964634193e-06, + "loss": 1.7646, + "step": 24290 + }, + { + "epoch": 4.35, + "grad_norm": 3.0199804306030273, + "learning_rate": 2.071909934802066e-06, + "loss": 1.9786, + "step": 24295 + }, + { + "epoch": 4.35, + "grad_norm": 0.6301084756851196, + "learning_rate": 2.066309157102772e-06, + "loss": 1.6944, + "step": 24300 + }, + { + "epoch": 4.35, + "grad_norm": 1.1437808275222778, + "learning_rate": 2.0607156333078332e-06, + "loss": 1.9109, + "step": 24305 + }, + { + "epoch": 4.35, + "grad_norm": 0.41809070110321045, + "learning_rate": 2.0551293651864783e-06, + "loss": 2.0125, + "step": 24310 + }, + { + "epoch": 4.35, + "grad_norm": 3.662910223007202, + "learning_rate": 2.0495503545056303e-06, + "loss": 1.8321, + "step": 24315 + }, + { + "epoch": 4.35, + "grad_norm": 1.1465100049972534, + "learning_rate": 2.0439786030299306e-06, + "loss": 1.7392, + "step": 24320 + }, + { + "epoch": 4.35, + "grad_norm": 1.2465860843658447, + "learning_rate": 2.038414112521722e-06, + "loss": 1.7251, + "step": 24325 + }, + { + "epoch": 4.35, + "grad_norm": 0.9465045928955078, + "learning_rate": 2.032856884741041e-06, + "loss": 1.8722, + "step": 24330 + }, + { + "epoch": 4.36, + "grad_norm": 0.7800765633583069, + "learning_rate": 2.027306921445632e-06, + "loss": 1.719, + "step": 24335 + }, + { + "epoch": 4.36, + "grad_norm": 0.5892726182937622, + "learning_rate": 2.021764224390954e-06, + "loss": 1.4895, + "step": 24340 + }, + { + "epoch": 4.36, + "grad_norm": 1.069069504737854, + "learning_rate": 2.016228795330144e-06, + "loss": 1.7287, + "step": 24345 + }, + { + "epoch": 4.36, + "grad_norm": 0.602766215801239, + "learning_rate": 2.0107006360140597e-06, + "loss": 1.8655, + "step": 24350 + }, + { + "epoch": 4.36, + "grad_norm": 1.3404914140701294, + "learning_rate": 2.005179748191249e-06, + "loss": 1.8222, + "step": 24355 + }, + { + "epoch": 4.36, + "grad_norm": 0.8859615325927734, + "learning_rate": 1.9996661336079693e-06, + "loss": 2.0234, + "step": 24360 + }, + { + "epoch": 4.36, + "grad_norm": 0.8665515780448914, + "learning_rate": 1.9941597940081687e-06, + "loss": 1.6752, + "step": 24365 + }, + { + "epoch": 4.36, + "grad_norm": 0.8328173160552979, + "learning_rate": 1.988660731133499e-06, + "loss": 1.7841, + "step": 24370 + }, + { + "epoch": 4.36, + "grad_norm": 0.7565486431121826, + "learning_rate": 1.9831689467233015e-06, + "loss": 2.0957, + "step": 24375 + }, + { + "epoch": 4.36, + "grad_norm": 1.5388751029968262, + "learning_rate": 1.977684442514624e-06, + "loss": 2.0002, + "step": 24380 + }, + { + "epoch": 4.36, + "grad_norm": 1.155758261680603, + "learning_rate": 1.9722072202422144e-06, + "loss": 2.0851, + "step": 24385 + }, + { + "epoch": 4.37, + "grad_norm": 0.5179358720779419, + "learning_rate": 1.9667372816385114e-06, + "loss": 2.1737, + "step": 24390 + }, + { + "epoch": 4.37, + "grad_norm": 0.9456248879432678, + "learning_rate": 1.9612746284336513e-06, + "loss": 1.7682, + "step": 24395 + }, + { + "epoch": 4.37, + "grad_norm": 1.9018527269363403, + "learning_rate": 1.955819262355471e-06, + "loss": 1.6936, + "step": 24400 + }, + { + "epoch": 4.37, + "grad_norm": 0.984753429889679, + "learning_rate": 1.9503711851294846e-06, + "loss": 1.6365, + "step": 24405 + }, + { + "epoch": 4.37, + "grad_norm": 0.8661744594573975, + "learning_rate": 1.9449303984789253e-06, + "loss": 1.7751, + "step": 24410 + }, + { + "epoch": 4.37, + "grad_norm": 2.2209432125091553, + "learning_rate": 1.9394969041247074e-06, + "loss": 1.8097, + "step": 24415 + }, + { + "epoch": 4.37, + "grad_norm": 0.5390074253082275, + "learning_rate": 1.9340707037854316e-06, + "loss": 1.8734, + "step": 24420 + }, + { + "epoch": 4.37, + "grad_norm": 0.6612688899040222, + "learning_rate": 1.9286517991774085e-06, + "loss": 1.8623, + "step": 24425 + }, + { + "epoch": 4.37, + "grad_norm": 1.4000182151794434, + "learning_rate": 1.9232401920146306e-06, + "loss": 1.5901, + "step": 24430 + }, + { + "epoch": 4.37, + "grad_norm": 0.7332963943481445, + "learning_rate": 1.9178358840087814e-06, + "loss": 1.6914, + "step": 24435 + }, + { + "epoch": 4.37, + "grad_norm": 1.1628941297531128, + "learning_rate": 1.912438876869238e-06, + "loss": 1.7781, + "step": 24440 + }, + { + "epoch": 4.38, + "grad_norm": 2.224209785461426, + "learning_rate": 1.9070491723030715e-06, + "loss": 1.4185, + "step": 24445 + }, + { + "epoch": 4.38, + "grad_norm": 0.7662902474403381, + "learning_rate": 1.90166677201504e-06, + "loss": 1.845, + "step": 24450 + }, + { + "epoch": 4.38, + "grad_norm": 1.3199970722198486, + "learning_rate": 1.8962916777075928e-06, + "loss": 1.8531, + "step": 24455 + }, + { + "epoch": 4.38, + "grad_norm": 0.7640072107315063, + "learning_rate": 1.890923891080862e-06, + "loss": 2.048, + "step": 24460 + }, + { + "epoch": 4.38, + "grad_norm": 0.6150866746902466, + "learning_rate": 1.8855634138326806e-06, + "loss": 1.8992, + "step": 24465 + }, + { + "epoch": 4.38, + "grad_norm": 1.9289946556091309, + "learning_rate": 1.8802102476585537e-06, + "loss": 1.9587, + "step": 24470 + }, + { + "epoch": 4.38, + "grad_norm": 0.9716728925704956, + "learning_rate": 1.874864394251688e-06, + "loss": 1.6301, + "step": 24475 + }, + { + "epoch": 4.38, + "grad_norm": 0.83192378282547, + "learning_rate": 1.8695258553029699e-06, + "loss": 1.9311, + "step": 24480 + }, + { + "epoch": 4.38, + "grad_norm": 1.113206148147583, + "learning_rate": 1.8641946325009735e-06, + "loss": 1.419, + "step": 24485 + }, + { + "epoch": 4.38, + "grad_norm": 0.6721107363700867, + "learning_rate": 1.8588707275319667e-06, + "loss": 1.9316, + "step": 24490 + }, + { + "epoch": 4.38, + "grad_norm": 0.9275158643722534, + "learning_rate": 1.8535541420798858e-06, + "loss": 2.1295, + "step": 24495 + }, + { + "epoch": 4.39, + "grad_norm": 0.5885508060455322, + "learning_rate": 1.8482448778263662e-06, + "loss": 1.882, + "step": 24500 + }, + { + "epoch": 4.39, + "grad_norm": 0.7526519894599915, + "learning_rate": 1.8429429364507228e-06, + "loss": 1.9953, + "step": 24505 + }, + { + "epoch": 4.39, + "grad_norm": 0.7567302584648132, + "learning_rate": 1.837648319629956e-06, + "loss": 1.769, + "step": 24510 + }, + { + "epoch": 4.39, + "grad_norm": 0.6800612211227417, + "learning_rate": 1.8323610290387454e-06, + "loss": 1.8711, + "step": 24515 + }, + { + "epoch": 4.39, + "grad_norm": 0.9701638221740723, + "learning_rate": 1.827081066349459e-06, + "loss": 1.8497, + "step": 24520 + }, + { + "epoch": 4.39, + "grad_norm": 1.5889774560928345, + "learning_rate": 1.821808433232139e-06, + "loss": 2.0364, + "step": 24525 + }, + { + "epoch": 4.39, + "grad_norm": 0.7154751420021057, + "learning_rate": 1.8165431313545144e-06, + "loss": 1.7296, + "step": 24530 + }, + { + "epoch": 4.39, + "grad_norm": 1.3648035526275635, + "learning_rate": 1.811285162382001e-06, + "loss": 1.804, + "step": 24535 + }, + { + "epoch": 4.39, + "grad_norm": 0.6310946941375732, + "learning_rate": 1.8060345279776848e-06, + "loss": 2.1313, + "step": 24540 + }, + { + "epoch": 4.39, + "grad_norm": 0.9399888515472412, + "learning_rate": 1.80079122980234e-06, + "loss": 1.5419, + "step": 24545 + }, + { + "epoch": 4.39, + "grad_norm": 0.7775284647941589, + "learning_rate": 1.7955552695144123e-06, + "loss": 2.019, + "step": 24550 + }, + { + "epoch": 4.4, + "grad_norm": 2.3950979709625244, + "learning_rate": 1.7903266487700382e-06, + "loss": 2.064, + "step": 24555 + }, + { + "epoch": 4.4, + "grad_norm": 0.8698521852493286, + "learning_rate": 1.785105369223014e-06, + "loss": 1.8614, + "step": 24560 + }, + { + "epoch": 4.4, + "grad_norm": 0.7981832027435303, + "learning_rate": 1.7798914325248328e-06, + "loss": 1.7595, + "step": 24565 + }, + { + "epoch": 4.4, + "grad_norm": 1.5428972244262695, + "learning_rate": 1.7746848403246559e-06, + "loss": 1.869, + "step": 24570 + }, + { + "epoch": 4.4, + "grad_norm": 0.7430126667022705, + "learning_rate": 1.7694855942693239e-06, + "loss": 1.8618, + "step": 24575 + }, + { + "epoch": 4.4, + "grad_norm": 0.7668565511703491, + "learning_rate": 1.7642936960033578e-06, + "loss": 1.753, + "step": 24580 + }, + { + "epoch": 4.4, + "grad_norm": 2.907975196838379, + "learning_rate": 1.7591091471689414e-06, + "loss": 1.9419, + "step": 24585 + }, + { + "epoch": 4.4, + "grad_norm": 1.0696461200714111, + "learning_rate": 1.7539319494059458e-06, + "loss": 1.8617, + "step": 24590 + }, + { + "epoch": 4.4, + "grad_norm": 0.8900358080863953, + "learning_rate": 1.748762104351917e-06, + "loss": 1.8093, + "step": 24595 + }, + { + "epoch": 4.4, + "grad_norm": 0.6597603559494019, + "learning_rate": 1.743599613642069e-06, + "loss": 1.7068, + "step": 24600 + }, + { + "epoch": 4.4, + "grad_norm": 0.9883655309677124, + "learning_rate": 1.7384444789092957e-06, + "loss": 1.6025, + "step": 24605 + }, + { + "epoch": 4.41, + "grad_norm": 4.697280406951904, + "learning_rate": 1.7332967017841623e-06, + "loss": 1.8796, + "step": 24610 + }, + { + "epoch": 4.41, + "grad_norm": 1.5211069583892822, + "learning_rate": 1.7281562838948966e-06, + "loss": 1.7911, + "step": 24615 + }, + { + "epoch": 4.41, + "grad_norm": 0.9767752885818481, + "learning_rate": 1.7230232268674124e-06, + "loss": 1.7474, + "step": 24620 + }, + { + "epoch": 4.41, + "grad_norm": 1.0923303365707397, + "learning_rate": 1.7178975323252937e-06, + "loss": 1.6909, + "step": 24625 + }, + { + "epoch": 4.41, + "grad_norm": 0.7364457845687866, + "learning_rate": 1.7127792018897914e-06, + "loss": 1.9563, + "step": 24630 + }, + { + "epoch": 4.41, + "grad_norm": 0.9624817371368408, + "learning_rate": 1.7076682371798298e-06, + "loss": 1.9367, + "step": 24635 + }, + { + "epoch": 4.41, + "grad_norm": 0.6353481411933899, + "learning_rate": 1.7025646398119988e-06, + "loss": 1.8782, + "step": 24640 + }, + { + "epoch": 4.41, + "grad_norm": 1.050934910774231, + "learning_rate": 1.6974684114005601e-06, + "loss": 1.6196, + "step": 24645 + }, + { + "epoch": 4.41, + "grad_norm": 1.2270129919052124, + "learning_rate": 1.692379553557455e-06, + "loss": 2.0497, + "step": 24650 + }, + { + "epoch": 4.41, + "grad_norm": 0.5485097765922546, + "learning_rate": 1.6872980678922734e-06, + "loss": 1.781, + "step": 24655 + }, + { + "epoch": 4.41, + "grad_norm": 0.6211299300193787, + "learning_rate": 1.6822239560122882e-06, + "loss": 1.8401, + "step": 24660 + }, + { + "epoch": 4.41, + "grad_norm": 4.1435017585754395, + "learning_rate": 1.6771572195224433e-06, + "loss": 1.8793, + "step": 24665 + }, + { + "epoch": 4.42, + "grad_norm": 0.7795052528381348, + "learning_rate": 1.672097860025329e-06, + "loss": 1.629, + "step": 24670 + }, + { + "epoch": 4.42, + "grad_norm": 0.4488808512687683, + "learning_rate": 1.6670458791212263e-06, + "loss": 2.0304, + "step": 24675 + }, + { + "epoch": 4.42, + "grad_norm": 0.94510817527771, + "learning_rate": 1.6620012784080652e-06, + "loss": 1.6173, + "step": 24680 + }, + { + "epoch": 4.42, + "grad_norm": 0.4074093997478485, + "learning_rate": 1.6569640594814528e-06, + "loss": 2.0128, + "step": 24685 + }, + { + "epoch": 4.42, + "grad_norm": 0.7733169794082642, + "learning_rate": 1.6519342239346564e-06, + "loss": 1.9417, + "step": 24690 + }, + { + "epoch": 4.42, + "grad_norm": 0.7011056542396545, + "learning_rate": 1.6469117733586087e-06, + "loss": 2.0944, + "step": 24695 + }, + { + "epoch": 4.42, + "grad_norm": 0.8170765042304993, + "learning_rate": 1.6418967093419058e-06, + "loss": 1.9794, + "step": 24700 + }, + { + "epoch": 4.42, + "grad_norm": 0.5875701308250427, + "learning_rate": 1.6368890334708037e-06, + "loss": 1.9456, + "step": 24705 + }, + { + "epoch": 4.42, + "grad_norm": 0.7417488694190979, + "learning_rate": 1.6318887473292243e-06, + "loss": 1.6503, + "step": 24710 + }, + { + "epoch": 4.42, + "grad_norm": 1.339301347732544, + "learning_rate": 1.626895852498761e-06, + "loss": 1.7944, + "step": 24715 + }, + { + "epoch": 4.42, + "grad_norm": 0.6533433198928833, + "learning_rate": 1.6219103505586531e-06, + "loss": 1.764, + "step": 24720 + }, + { + "epoch": 4.43, + "grad_norm": 0.9360514283180237, + "learning_rate": 1.6169322430858198e-06, + "loss": 1.7368, + "step": 24725 + }, + { + "epoch": 4.43, + "grad_norm": 0.906342625617981, + "learning_rate": 1.6119615316548237e-06, + "loss": 1.7317, + "step": 24730 + }, + { + "epoch": 4.43, + "grad_norm": 2.1452555656433105, + "learning_rate": 1.606998217837899e-06, + "loss": 1.6536, + "step": 24735 + }, + { + "epoch": 4.43, + "grad_norm": 0.49390387535095215, + "learning_rate": 1.6020423032049342e-06, + "loss": 1.8927, + "step": 24740 + }, + { + "epoch": 4.43, + "grad_norm": 0.8980386257171631, + "learning_rate": 1.5970937893234894e-06, + "loss": 1.5554, + "step": 24745 + }, + { + "epoch": 4.43, + "grad_norm": 0.8987436890602112, + "learning_rate": 1.5921526777587625e-06, + "loss": 1.7345, + "step": 24750 + }, + { + "epoch": 4.43, + "grad_norm": 0.7175293564796448, + "learning_rate": 1.5872189700736339e-06, + "loss": 2.1268, + "step": 24755 + }, + { + "epoch": 4.43, + "grad_norm": 1.050519585609436, + "learning_rate": 1.5822926678286194e-06, + "loss": 1.9497, + "step": 24760 + }, + { + "epoch": 4.43, + "grad_norm": 0.6734400987625122, + "learning_rate": 1.577373772581911e-06, + "loss": 1.5229, + "step": 24765 + }, + { + "epoch": 4.43, + "grad_norm": 1.3259496688842773, + "learning_rate": 1.572462285889345e-06, + "loss": 1.8825, + "step": 24770 + }, + { + "epoch": 4.43, + "grad_norm": 1.473742127418518, + "learning_rate": 1.5675582093044261e-06, + "loss": 1.9713, + "step": 24775 + }, + { + "epoch": 4.44, + "grad_norm": 1.0151005983352661, + "learning_rate": 1.5626615443783077e-06, + "loss": 1.524, + "step": 24780 + }, + { + "epoch": 4.44, + "grad_norm": 0.7851746678352356, + "learning_rate": 1.557772292659801e-06, + "loss": 1.8783, + "step": 24785 + }, + { + "epoch": 4.44, + "grad_norm": 3.6031758785247803, + "learning_rate": 1.552890455695369e-06, + "loss": 1.9658, + "step": 24790 + }, + { + "epoch": 4.44, + "grad_norm": 1.0066990852355957, + "learning_rate": 1.5480160350291317e-06, + "loss": 1.7486, + "step": 24795 + }, + { + "epoch": 4.44, + "grad_norm": 0.8470297455787659, + "learning_rate": 1.543149032202862e-06, + "loss": 1.7935, + "step": 24800 + }, + { + "epoch": 4.44, + "grad_norm": 3.9266157150268555, + "learning_rate": 1.538289448755989e-06, + "loss": 1.6526, + "step": 24805 + }, + { + "epoch": 4.44, + "grad_norm": 1.250511884689331, + "learning_rate": 1.5334372862256002e-06, + "loss": 1.6387, + "step": 24810 + }, + { + "epoch": 4.44, + "grad_norm": 0.848745584487915, + "learning_rate": 1.5285925461464263e-06, + "loss": 2.0486, + "step": 24815 + }, + { + "epoch": 4.44, + "grad_norm": 0.8530756831169128, + "learning_rate": 1.5237552300508524e-06, + "loss": 1.4405, + "step": 24820 + }, + { + "epoch": 4.44, + "grad_norm": 0.5230215787887573, + "learning_rate": 1.5189253394689156e-06, + "loss": 1.7863, + "step": 24825 + }, + { + "epoch": 4.44, + "grad_norm": 1.091163992881775, + "learning_rate": 1.514102875928311e-06, + "loss": 1.5996, + "step": 24830 + }, + { + "epoch": 4.45, + "grad_norm": 1.1136764287948608, + "learning_rate": 1.5092878409543764e-06, + "loss": 1.8101, + "step": 24835 + }, + { + "epoch": 4.45, + "grad_norm": 1.0806092023849487, + "learning_rate": 1.5044802360700994e-06, + "loss": 1.949, + "step": 24840 + }, + { + "epoch": 4.45, + "grad_norm": 0.9689674973487854, + "learning_rate": 1.49968006279613e-06, + "loss": 1.7404, + "step": 24845 + }, + { + "epoch": 4.45, + "grad_norm": 1.1383538246154785, + "learning_rate": 1.4948873226507481e-06, + "loss": 1.6446, + "step": 24850 + }, + { + "epoch": 4.45, + "grad_norm": 0.7477939128875732, + "learning_rate": 1.490102017149894e-06, + "loss": 1.7101, + "step": 24855 + }, + { + "epoch": 4.45, + "grad_norm": 0.6716863512992859, + "learning_rate": 1.48532414780716e-06, + "loss": 1.7832, + "step": 24860 + }, + { + "epoch": 4.45, + "grad_norm": 1.1172704696655273, + "learning_rate": 1.4805537161337813e-06, + "loss": 1.6232, + "step": 24865 + }, + { + "epoch": 4.45, + "grad_norm": 0.48017409443855286, + "learning_rate": 1.4757907236386426e-06, + "loss": 1.7984, + "step": 24870 + }, + { + "epoch": 4.45, + "grad_norm": 0.6822175979614258, + "learning_rate": 1.4710351718282667e-06, + "loss": 1.745, + "step": 24875 + }, + { + "epoch": 4.45, + "grad_norm": 0.6264699697494507, + "learning_rate": 1.466287062206831e-06, + "loss": 1.778, + "step": 24880 + }, + { + "epoch": 4.45, + "grad_norm": 0.8009613752365112, + "learning_rate": 1.4615463962761671e-06, + "loss": 1.8883, + "step": 24885 + }, + { + "epoch": 4.46, + "grad_norm": 26.099721908569336, + "learning_rate": 1.4568131755357346e-06, + "loss": 1.8559, + "step": 24890 + }, + { + "epoch": 4.46, + "grad_norm": 1.0402318239212036, + "learning_rate": 1.4520874014826463e-06, + "loss": 1.7209, + "step": 24895 + }, + { + "epoch": 4.46, + "grad_norm": 0.6136242151260376, + "learning_rate": 1.4473690756116654e-06, + "loss": 1.8066, + "step": 24900 + }, + { + "epoch": 4.46, + "grad_norm": 1.0361982583999634, + "learning_rate": 1.442658199415192e-06, + "loss": 1.5658, + "step": 24905 + }, + { + "epoch": 4.46, + "grad_norm": 0.9705806374549866, + "learning_rate": 1.4379547743832683e-06, + "loss": 1.9806, + "step": 24910 + }, + { + "epoch": 4.46, + "grad_norm": 0.6670936346054077, + "learning_rate": 1.433258802003587e-06, + "loss": 1.9256, + "step": 24915 + }, + { + "epoch": 4.46, + "grad_norm": 0.9053754806518555, + "learning_rate": 1.4285702837614768e-06, + "loss": 1.8113, + "step": 24920 + }, + { + "epoch": 4.46, + "grad_norm": 0.8029524087905884, + "learning_rate": 1.4238892211399152e-06, + "loss": 1.9991, + "step": 24925 + }, + { + "epoch": 4.46, + "grad_norm": 0.7751874327659607, + "learning_rate": 1.4192156156195151e-06, + "loss": 1.6467, + "step": 24930 + }, + { + "epoch": 4.46, + "grad_norm": 0.59059077501297, + "learning_rate": 1.4145494686785387e-06, + "loss": 2.0298, + "step": 24935 + }, + { + "epoch": 4.46, + "grad_norm": 0.6298719048500061, + "learning_rate": 1.409890781792872e-06, + "loss": 1.7394, + "step": 24940 + }, + { + "epoch": 4.47, + "grad_norm": 1.1347723007202148, + "learning_rate": 1.405239556436061e-06, + "loss": 1.8038, + "step": 24945 + }, + { + "epoch": 4.47, + "grad_norm": 1.0450183153152466, + "learning_rate": 1.4005957940792846e-06, + "loss": 1.9163, + "step": 24950 + }, + { + "epoch": 4.47, + "grad_norm": 0.9296015501022339, + "learning_rate": 1.395959496191357e-06, + "loss": 1.8794, + "step": 24955 + }, + { + "epoch": 4.47, + "grad_norm": 0.8542814254760742, + "learning_rate": 1.3913306642387409e-06, + "loss": 1.7646, + "step": 24960 + }, + { + "epoch": 4.47, + "grad_norm": 1.308875560760498, + "learning_rate": 1.3867092996855236e-06, + "loss": 1.9903, + "step": 24965 + }, + { + "epoch": 4.47, + "grad_norm": 0.5955923795700073, + "learning_rate": 1.3820954039934413e-06, + "loss": 1.9698, + "step": 24970 + }, + { + "epoch": 4.47, + "grad_norm": 0.6191591024398804, + "learning_rate": 1.3774889786218687e-06, + "loss": 1.7368, + "step": 24975 + }, + { + "epoch": 4.47, + "grad_norm": 0.9830487966537476, + "learning_rate": 1.3728900250278065e-06, + "loss": 1.802, + "step": 24980 + }, + { + "epoch": 4.47, + "grad_norm": 1.0741496086120605, + "learning_rate": 1.3682985446659025e-06, + "loss": 1.7966, + "step": 24985 + }, + { + "epoch": 4.47, + "grad_norm": 0.583555281162262, + "learning_rate": 1.3637145389884394e-06, + "loss": 1.7261, + "step": 24990 + }, + { + "epoch": 4.47, + "grad_norm": 0.9671819806098938, + "learning_rate": 1.359138009445335e-06, + "loss": 1.8888, + "step": 24995 + }, + { + "epoch": 4.47, + "grad_norm": 0.8640641570091248, + "learning_rate": 1.3545689574841342e-06, + "loss": 1.8415, + "step": 25000 + }, + { + "epoch": 4.48, + "grad_norm": 0.8222974538803101, + "learning_rate": 1.3500073845500284e-06, + "loss": 1.9049, + "step": 25005 + }, + { + "epoch": 4.48, + "grad_norm": 1.0911484956741333, + "learning_rate": 1.345453292085841e-06, + "loss": 1.9566, + "step": 25010 + }, + { + "epoch": 4.48, + "grad_norm": 0.6996002793312073, + "learning_rate": 1.3409066815320226e-06, + "loss": 1.9323, + "step": 25015 + }, + { + "epoch": 4.48, + "grad_norm": 0.5919438600540161, + "learning_rate": 1.3363675543266674e-06, + "loss": 1.8602, + "step": 25020 + }, + { + "epoch": 4.48, + "grad_norm": 0.7225549221038818, + "learning_rate": 1.3318359119054935e-06, + "loss": 2.2012, + "step": 25025 + }, + { + "epoch": 4.48, + "grad_norm": 1.0485141277313232, + "learning_rate": 1.327311755701849e-06, + "loss": 1.6628, + "step": 25030 + }, + { + "epoch": 4.48, + "grad_norm": 0.6466124057769775, + "learning_rate": 1.3227950871467304e-06, + "loss": 1.8366, + "step": 25035 + }, + { + "epoch": 4.48, + "grad_norm": 1.215564489364624, + "learning_rate": 1.3182859076687481e-06, + "loss": 1.8886, + "step": 25040 + }, + { + "epoch": 4.48, + "grad_norm": 1.6709582805633545, + "learning_rate": 1.313784218694153e-06, + "loss": 1.6098, + "step": 25045 + }, + { + "epoch": 4.48, + "grad_norm": 1.1631311178207397, + "learning_rate": 1.3092900216468336e-06, + "loss": 1.9456, + "step": 25050 + }, + { + "epoch": 4.48, + "grad_norm": 0.9077101349830627, + "learning_rate": 1.304803317948286e-06, + "loss": 1.9834, + "step": 25055 + }, + { + "epoch": 4.49, + "grad_norm": 1.2171112298965454, + "learning_rate": 1.3003241090176587e-06, + "loss": 1.7559, + "step": 25060 + }, + { + "epoch": 4.49, + "grad_norm": 1.7720481157302856, + "learning_rate": 1.2958523962717185e-06, + "loss": 1.7203, + "step": 25065 + }, + { + "epoch": 4.49, + "grad_norm": 0.6833968162536621, + "learning_rate": 1.2913881811248696e-06, + "loss": 1.8261, + "step": 25070 + }, + { + "epoch": 4.49, + "grad_norm": 0.8987765312194824, + "learning_rate": 1.28693146498913e-06, + "loss": 1.977, + "step": 25075 + }, + { + "epoch": 4.49, + "grad_norm": 0.8224658966064453, + "learning_rate": 1.2824822492741584e-06, + "loss": 1.7903, + "step": 25080 + }, + { + "epoch": 4.49, + "grad_norm": 0.8352959752082825, + "learning_rate": 1.2780405353872426e-06, + "loss": 1.6669, + "step": 25085 + }, + { + "epoch": 4.49, + "grad_norm": 1.408292293548584, + "learning_rate": 1.273606324733284e-06, + "loss": 1.5876, + "step": 25090 + }, + { + "epoch": 4.49, + "grad_norm": 1.146064281463623, + "learning_rate": 1.2691796187148247e-06, + "loss": 1.7544, + "step": 25095 + }, + { + "epoch": 4.49, + "grad_norm": 1.177251935005188, + "learning_rate": 1.2647604187320223e-06, + "loss": 1.918, + "step": 25100 + }, + { + "epoch": 4.49, + "grad_norm": 0.5201444029808044, + "learning_rate": 1.2603487261826724e-06, + "loss": 2.0014, + "step": 25105 + }, + { + "epoch": 4.49, + "grad_norm": 2.2395122051239014, + "learning_rate": 1.2559445424621868e-06, + "loss": 1.7281, + "step": 25110 + }, + { + "epoch": 4.5, + "grad_norm": 0.5539356470108032, + "learning_rate": 1.251547868963604e-06, + "loss": 1.9028, + "step": 25115 + }, + { + "epoch": 4.5, + "grad_norm": 0.8390756249427795, + "learning_rate": 1.2471587070775837e-06, + "loss": 1.9719, + "step": 25120 + }, + { + "epoch": 4.5, + "grad_norm": 0.5405310392379761, + "learning_rate": 1.2427770581924181e-06, + "loss": 1.8543, + "step": 25125 + }, + { + "epoch": 4.5, + "grad_norm": 2.3233208656311035, + "learning_rate": 1.2384029236940181e-06, + "loss": 1.5512, + "step": 25130 + }, + { + "epoch": 4.5, + "grad_norm": 0.636073648929596, + "learning_rate": 1.2340363049659159e-06, + "loss": 1.9004, + "step": 25135 + }, + { + "epoch": 4.5, + "grad_norm": 0.8926206827163696, + "learning_rate": 1.2296772033892733e-06, + "loss": 1.7828, + "step": 25140 + }, + { + "epoch": 4.5, + "grad_norm": 0.6021214127540588, + "learning_rate": 1.2253256203428653e-06, + "loss": 2.0191, + "step": 25145 + }, + { + "epoch": 4.5, + "grad_norm": 0.6334590911865234, + "learning_rate": 1.2209815572030937e-06, + "loss": 2.1132, + "step": 25150 + }, + { + "epoch": 4.5, + "grad_norm": 7.690194606781006, + "learning_rate": 1.216645015343984e-06, + "loss": 1.5651, + "step": 25155 + }, + { + "epoch": 4.5, + "grad_norm": 0.8695514798164368, + "learning_rate": 1.212315996137181e-06, + "loss": 1.8521, + "step": 25160 + }, + { + "epoch": 4.5, + "grad_norm": 1.237356185913086, + "learning_rate": 1.2079945009519478e-06, + "loss": 1.638, + "step": 25165 + }, + { + "epoch": 4.51, + "grad_norm": 0.9352409243583679, + "learning_rate": 1.2036805311551714e-06, + "loss": 1.7746, + "step": 25170 + }, + { + "epoch": 4.51, + "grad_norm": 0.6192563772201538, + "learning_rate": 1.1993740881113492e-06, + "loss": 1.8845, + "step": 25175 + }, + { + "epoch": 4.51, + "grad_norm": 0.8423218131065369, + "learning_rate": 1.1950751731826083e-06, + "loss": 1.7119, + "step": 25180 + }, + { + "epoch": 4.51, + "grad_norm": 0.6161177158355713, + "learning_rate": 1.1907837877286942e-06, + "loss": 2.1457, + "step": 25185 + }, + { + "epoch": 4.51, + "grad_norm": 0.7731286287307739, + "learning_rate": 1.1864999331069687e-06, + "loss": 1.849, + "step": 25190 + }, + { + "epoch": 4.51, + "grad_norm": 1.3494420051574707, + "learning_rate": 1.182223610672406e-06, + "loss": 1.5065, + "step": 25195 + }, + { + "epoch": 4.51, + "grad_norm": 0.9249375462532043, + "learning_rate": 1.1779548217776077e-06, + "loss": 1.7186, + "step": 25200 + }, + { + "epoch": 4.51, + "grad_norm": 1.1438442468643188, + "learning_rate": 1.1736935677727852e-06, + "loss": 1.7934, + "step": 25205 + }, + { + "epoch": 4.51, + "grad_norm": 1.2738542556762695, + "learning_rate": 1.1694398500057714e-06, + "loss": 1.6784, + "step": 25210 + }, + { + "epoch": 4.51, + "grad_norm": 0.7526887059211731, + "learning_rate": 1.1651936698220068e-06, + "loss": 2.0072, + "step": 25215 + }, + { + "epoch": 4.51, + "grad_norm": 3.0581088066101074, + "learning_rate": 1.1609550285645588e-06, + "loss": 1.598, + "step": 25220 + }, + { + "epoch": 4.52, + "grad_norm": 1.558251976966858, + "learning_rate": 1.156723927574105e-06, + "loss": 1.9926, + "step": 25225 + }, + { + "epoch": 4.52, + "grad_norm": 1.8919259309768677, + "learning_rate": 1.1525003681889413e-06, + "loss": 1.9012, + "step": 25230 + }, + { + "epoch": 4.52, + "grad_norm": 0.8434092998504639, + "learning_rate": 1.1482843517449688e-06, + "loss": 2.1936, + "step": 25235 + }, + { + "epoch": 4.52, + "grad_norm": 1.0325753688812256, + "learning_rate": 1.144075879575715e-06, + "loss": 1.9423, + "step": 25240 + }, + { + "epoch": 4.52, + "grad_norm": 0.6491246223449707, + "learning_rate": 1.1398749530123127e-06, + "loss": 1.7421, + "step": 25245 + }, + { + "epoch": 4.52, + "grad_norm": 1.0722302198410034, + "learning_rate": 1.1356815733835153e-06, + "loss": 1.8059, + "step": 25250 + }, + { + "epoch": 4.52, + "grad_norm": 0.7224903702735901, + "learning_rate": 1.1314957420156812e-06, + "loss": 1.8328, + "step": 25255 + }, + { + "epoch": 4.52, + "grad_norm": 0.8088222742080688, + "learning_rate": 1.127317460232788e-06, + "loss": 1.9416, + "step": 25260 + }, + { + "epoch": 4.52, + "grad_norm": 0.5477873682975769, + "learning_rate": 1.123146729356414e-06, + "loss": 1.818, + "step": 25265 + }, + { + "epoch": 4.52, + "grad_norm": 2.747405767440796, + "learning_rate": 1.1189835507057655e-06, + "loss": 1.9102, + "step": 25270 + }, + { + "epoch": 4.52, + "grad_norm": 0.7023490071296692, + "learning_rate": 1.1148279255976475e-06, + "loss": 1.7424, + "step": 25275 + }, + { + "epoch": 4.52, + "grad_norm": 0.6943578720092773, + "learning_rate": 1.1106798553464804e-06, + "loss": 1.7916, + "step": 25280 + }, + { + "epoch": 4.53, + "grad_norm": 1.317313313484192, + "learning_rate": 1.1065393412642982e-06, + "loss": 1.8638, + "step": 25285 + }, + { + "epoch": 4.53, + "grad_norm": 1.2347160577774048, + "learning_rate": 1.102406384660745e-06, + "loss": 1.8903, + "step": 25290 + }, + { + "epoch": 4.53, + "grad_norm": 0.872135579586029, + "learning_rate": 1.0982809868430589e-06, + "loss": 1.7901, + "step": 25295 + }, + { + "epoch": 4.53, + "grad_norm": 0.9331486225128174, + "learning_rate": 1.0941631491161093e-06, + "loss": 1.8265, + "step": 25300 + }, + { + "epoch": 4.53, + "grad_norm": 0.7161145806312561, + "learning_rate": 1.0900528727823627e-06, + "loss": 1.7577, + "step": 25305 + }, + { + "epoch": 4.53, + "grad_norm": 0.8512054085731506, + "learning_rate": 1.0859501591418907e-06, + "loss": 1.6612, + "step": 25310 + }, + { + "epoch": 4.53, + "grad_norm": 0.9483695030212402, + "learning_rate": 1.081855009492383e-06, + "loss": 1.9453, + "step": 25315 + }, + { + "epoch": 4.53, + "grad_norm": 1.1600202322006226, + "learning_rate": 1.0777674251291308e-06, + "loss": 2.1087, + "step": 25320 + }, + { + "epoch": 4.53, + "grad_norm": 0.7305920124053955, + "learning_rate": 1.0736874073450282e-06, + "loss": 1.8974, + "step": 25325 + }, + { + "epoch": 4.53, + "grad_norm": 0.8582291603088379, + "learning_rate": 1.0696149574305842e-06, + "loss": 1.9827, + "step": 25330 + }, + { + "epoch": 4.53, + "grad_norm": 0.6721082329750061, + "learning_rate": 1.065550076673913e-06, + "loss": 1.8574, + "step": 25335 + }, + { + "epoch": 4.54, + "grad_norm": 0.8562425971031189, + "learning_rate": 1.0614927663607278e-06, + "loss": 2.1481, + "step": 25340 + }, + { + "epoch": 4.54, + "grad_norm": 0.5766037702560425, + "learning_rate": 1.0574430277743602e-06, + "loss": 1.8303, + "step": 25345 + }, + { + "epoch": 4.54, + "grad_norm": 1.4017845392227173, + "learning_rate": 1.0534008621957331e-06, + "loss": 1.7077, + "step": 25350 + }, + { + "epoch": 4.54, + "grad_norm": 1.1896699666976929, + "learning_rate": 1.0493662709033764e-06, + "loss": 1.6934, + "step": 25355 + }, + { + "epoch": 4.54, + "grad_norm": 1.20075261592865, + "learning_rate": 1.0453392551734276e-06, + "loss": 1.705, + "step": 25360 + }, + { + "epoch": 4.54, + "grad_norm": 0.5349302887916565, + "learning_rate": 1.0413198162796346e-06, + "loss": 2.1699, + "step": 25365 + }, + { + "epoch": 4.54, + "grad_norm": 0.656079113483429, + "learning_rate": 1.0373079554933357e-06, + "loss": 1.6925, + "step": 25370 + }, + { + "epoch": 4.54, + "grad_norm": 0.7854536771774292, + "learning_rate": 1.0333036740834856e-06, + "loss": 2.0432, + "step": 25375 + }, + { + "epoch": 4.54, + "grad_norm": 1.6201163530349731, + "learning_rate": 1.0293069733166294e-06, + "loss": 1.7864, + "step": 25380 + }, + { + "epoch": 4.54, + "grad_norm": 0.7943071722984314, + "learning_rate": 1.025317854456917e-06, + "loss": 1.573, + "step": 25385 + }, + { + "epoch": 4.54, + "grad_norm": 1.6613608598709106, + "learning_rate": 1.0213363187661084e-06, + "loss": 1.4672, + "step": 25390 + }, + { + "epoch": 4.55, + "grad_norm": 1.1010265350341797, + "learning_rate": 1.0173623675035604e-06, + "loss": 1.9815, + "step": 25395 + }, + { + "epoch": 4.55, + "grad_norm": 0.885063648223877, + "learning_rate": 1.0133960019262256e-06, + "loss": 1.6383, + "step": 25400 + }, + { + "epoch": 4.55, + "grad_norm": 0.5052914619445801, + "learning_rate": 1.0094372232886617e-06, + "loss": 1.8147, + "step": 25405 + }, + { + "epoch": 4.55, + "grad_norm": 0.9033421277999878, + "learning_rate": 1.005486032843031e-06, + "loss": 1.7289, + "step": 25410 + }, + { + "epoch": 4.55, + "grad_norm": 0.7714688181877136, + "learning_rate": 1.0015424318390837e-06, + "loss": 1.5649, + "step": 25415 + }, + { + "epoch": 4.55, + "grad_norm": 0.6860387921333313, + "learning_rate": 9.97606421524186e-07, + "loss": 1.8035, + "step": 25420 + }, + { + "epoch": 4.55, + "grad_norm": 0.9091027975082397, + "learning_rate": 9.936780031432863e-07, + "loss": 1.7979, + "step": 25425 + }, + { + "epoch": 4.55, + "grad_norm": 1.081512212753296, + "learning_rate": 9.897571779389437e-07, + "loss": 1.6822, + "step": 25430 + }, + { + "epoch": 4.55, + "grad_norm": 0.972420871257782, + "learning_rate": 9.858439471513131e-07, + "loss": 1.8356, + "step": 25435 + }, + { + "epoch": 4.55, + "grad_norm": 0.838660478591919, + "learning_rate": 9.819383120181436e-07, + "loss": 1.553, + "step": 25440 + }, + { + "epoch": 4.55, + "grad_norm": 0.8015768527984619, + "learning_rate": 9.780402737747828e-07, + "loss": 1.5111, + "step": 25445 + }, + { + "epoch": 4.56, + "grad_norm": 0.8660455942153931, + "learning_rate": 9.741498336541754e-07, + "loss": 1.9797, + "step": 25450 + }, + { + "epoch": 4.56, + "grad_norm": 0.7163037061691284, + "learning_rate": 9.702669928868674e-07, + "loss": 1.6109, + "step": 25455 + }, + { + "epoch": 4.56, + "grad_norm": 1.0026417970657349, + "learning_rate": 9.66391752700993e-07, + "loss": 1.8042, + "step": 25460 + }, + { + "epoch": 4.56, + "grad_norm": 0.9547916650772095, + "learning_rate": 9.62524114322294e-07, + "loss": 1.7232, + "step": 25465 + }, + { + "epoch": 4.56, + "grad_norm": 0.43152350187301636, + "learning_rate": 9.586640789740946e-07, + "loss": 2.017, + "step": 25470 + }, + { + "epoch": 4.56, + "grad_norm": 0.8599415421485901, + "learning_rate": 9.54811647877321e-07, + "loss": 1.6867, + "step": 25475 + }, + { + "epoch": 4.56, + "grad_norm": 1.4471591711044312, + "learning_rate": 9.509668222504958e-07, + "loss": 1.8693, + "step": 25480 + }, + { + "epoch": 4.56, + "grad_norm": 0.8084843158721924, + "learning_rate": 9.471296033097321e-07, + "loss": 1.8386, + "step": 25485 + }, + { + "epoch": 4.56, + "grad_norm": 0.5984373688697815, + "learning_rate": 9.432999922687396e-07, + "loss": 1.7504, + "step": 25490 + }, + { + "epoch": 4.56, + "grad_norm": 0.80012047290802, + "learning_rate": 9.39477990338819e-07, + "loss": 2.0373, + "step": 25495 + }, + { + "epoch": 4.56, + "grad_norm": 1.0529673099517822, + "learning_rate": 9.356635987288698e-07, + "loss": 1.5368, + "step": 25500 + }, + { + "epoch": 4.57, + "grad_norm": 0.46101412177085876, + "learning_rate": 9.318568186453741e-07, + "loss": 1.7577, + "step": 25505 + }, + { + "epoch": 4.57, + "grad_norm": 0.7907940149307251, + "learning_rate": 9.280576512924188e-07, + "loss": 1.8975, + "step": 25510 + }, + { + "epoch": 4.57, + "grad_norm": 1.587792992591858, + "learning_rate": 9.242660978716705e-07, + "loss": 2.0775, + "step": 25515 + }, + { + "epoch": 4.57, + "grad_norm": 0.5791236758232117, + "learning_rate": 9.204821595823976e-07, + "loss": 2.0065, + "step": 25520 + }, + { + "epoch": 4.57, + "grad_norm": 0.8675762414932251, + "learning_rate": 9.167058376214621e-07, + "loss": 1.8338, + "step": 25525 + }, + { + "epoch": 4.57, + "grad_norm": 0.7436877489089966, + "learning_rate": 9.129371331832975e-07, + "loss": 1.7142, + "step": 25530 + }, + { + "epoch": 4.57, + "grad_norm": 0.7845025658607483, + "learning_rate": 9.091760474599559e-07, + "loss": 1.9165, + "step": 25535 + }, + { + "epoch": 4.57, + "grad_norm": 0.5198835730552673, + "learning_rate": 9.054225816410522e-07, + "loss": 1.7308, + "step": 25540 + }, + { + "epoch": 4.57, + "grad_norm": 3.1437716484069824, + "learning_rate": 9.016767369138118e-07, + "loss": 1.8479, + "step": 25545 + }, + { + "epoch": 4.57, + "grad_norm": 1.3514385223388672, + "learning_rate": 8.979385144630397e-07, + "loss": 2.1328, + "step": 25550 + }, + { + "epoch": 4.57, + "grad_norm": 0.615734875202179, + "learning_rate": 8.942079154711347e-07, + "loss": 1.9266, + "step": 25555 + }, + { + "epoch": 4.58, + "grad_norm": 0.8276461958885193, + "learning_rate": 8.904849411180749e-07, + "loss": 1.844, + "step": 25560 + }, + { + "epoch": 4.58, + "grad_norm": 0.8791263103485107, + "learning_rate": 8.867695925814406e-07, + "loss": 2.0249, + "step": 25565 + }, + { + "epoch": 4.58, + "grad_norm": 1.3986464738845825, + "learning_rate": 8.830618710363892e-07, + "loss": 1.6506, + "step": 25570 + }, + { + "epoch": 4.58, + "grad_norm": 1.0997428894042969, + "learning_rate": 8.793617776556685e-07, + "loss": 1.9504, + "step": 25575 + }, + { + "epoch": 4.58, + "grad_norm": 0.9867760539054871, + "learning_rate": 8.7566931360962e-07, + "loss": 1.8399, + "step": 25580 + }, + { + "epoch": 4.58, + "grad_norm": 0.6265989542007446, + "learning_rate": 8.719844800661625e-07, + "loss": 2.0406, + "step": 25585 + }, + { + "epoch": 4.58, + "grad_norm": 1.4594182968139648, + "learning_rate": 8.68307278190808e-07, + "loss": 2.0839, + "step": 25590 + }, + { + "epoch": 4.58, + "grad_norm": 1.0297359228134155, + "learning_rate": 8.646377091466457e-07, + "loss": 1.5879, + "step": 25595 + }, + { + "epoch": 4.58, + "grad_norm": 0.9616431593894958, + "learning_rate": 8.609757740943608e-07, + "loss": 1.7454, + "step": 25600 + }, + { + "epoch": 4.58, + "grad_norm": 0.7618218064308167, + "learning_rate": 8.5805172330512e-07, + "loss": 1.7113, + "step": 25605 + }, + { + "epoch": 4.58, + "grad_norm": 0.9779794216156006, + "learning_rate": 8.544035323554217e-07, + "loss": 1.6699, + "step": 25610 + }, + { + "epoch": 4.58, + "grad_norm": 0.6378960013389587, + "learning_rate": 8.507629786346671e-07, + "loss": 1.5722, + "step": 25615 + }, + { + "epoch": 4.59, + "grad_norm": 1.0023730993270874, + "learning_rate": 8.47130063294349e-07, + "loss": 1.7534, + "step": 25620 + }, + { + "epoch": 4.59, + "grad_norm": 0.8808668255805969, + "learning_rate": 8.435047874835594e-07, + "loss": 1.5374, + "step": 25625 + }, + { + "epoch": 4.59, + "grad_norm": 6.013002395629883, + "learning_rate": 8.398871523489727e-07, + "loss": 1.658, + "step": 25630 + }, + { + "epoch": 4.59, + "grad_norm": 1.1340384483337402, + "learning_rate": 8.362771590348373e-07, + "loss": 1.7486, + "step": 25635 + }, + { + "epoch": 4.59, + "grad_norm": 0.4705217480659485, + "learning_rate": 8.326748086829956e-07, + "loss": 1.6366, + "step": 25640 + }, + { + "epoch": 4.59, + "grad_norm": 0.8444132804870605, + "learning_rate": 8.290801024328693e-07, + "loss": 1.8246, + "step": 25645 + }, + { + "epoch": 4.59, + "grad_norm": 1.68918776512146, + "learning_rate": 8.254930414214518e-07, + "loss": 1.8512, + "step": 25650 + }, + { + "epoch": 4.59, + "grad_norm": 0.906139075756073, + "learning_rate": 8.219136267833355e-07, + "loss": 1.8157, + "step": 25655 + }, + { + "epoch": 4.59, + "grad_norm": 0.8343780040740967, + "learning_rate": 8.183418596506786e-07, + "loss": 2.0715, + "step": 25660 + }, + { + "epoch": 4.59, + "grad_norm": 0.7080813050270081, + "learning_rate": 8.147777411532359e-07, + "loss": 2.1484, + "step": 25665 + }, + { + "epoch": 4.59, + "grad_norm": 0.721260130405426, + "learning_rate": 8.112212724183277e-07, + "loss": 1.6142, + "step": 25670 + }, + { + "epoch": 4.6, + "grad_norm": 1.5041923522949219, + "learning_rate": 8.076724545708681e-07, + "loss": 2.0871, + "step": 25675 + }, + { + "epoch": 4.6, + "grad_norm": 3.3682503700256348, + "learning_rate": 8.041312887333397e-07, + "loss": 1.5679, + "step": 25680 + }, + { + "epoch": 4.6, + "grad_norm": 0.7922682166099548, + "learning_rate": 8.005977760258105e-07, + "loss": 1.696, + "step": 25685 + }, + { + "epoch": 4.6, + "grad_norm": 1.6431972980499268, + "learning_rate": 7.970719175659281e-07, + "loss": 1.7266, + "step": 25690 + }, + { + "epoch": 4.6, + "grad_norm": 0.8077892065048218, + "learning_rate": 7.935537144689142e-07, + "loss": 1.7022, + "step": 25695 + }, + { + "epoch": 4.6, + "grad_norm": 1.4477850198745728, + "learning_rate": 7.900431678475817e-07, + "loss": 1.9362, + "step": 25700 + }, + { + "epoch": 4.6, + "grad_norm": 1.0458213090896606, + "learning_rate": 7.865402788123088e-07, + "loss": 1.6532, + "step": 25705 + }, + { + "epoch": 4.6, + "grad_norm": 1.3129156827926636, + "learning_rate": 7.830450484710511e-07, + "loss": 1.86, + "step": 25710 + }, + { + "epoch": 4.6, + "grad_norm": 0.689514696598053, + "learning_rate": 7.795574779293519e-07, + "loss": 1.9175, + "step": 25715 + }, + { + "epoch": 4.6, + "grad_norm": 1.1813409328460693, + "learning_rate": 7.76077568290326e-07, + "loss": 1.8346, + "step": 25720 + }, + { + "epoch": 4.6, + "grad_norm": 0.8686524033546448, + "learning_rate": 7.72605320654668e-07, + "loss": 1.9223, + "step": 25725 + }, + { + "epoch": 4.61, + "grad_norm": 0.858902096748352, + "learning_rate": 7.691407361206382e-07, + "loss": 1.9596, + "step": 25730 + }, + { + "epoch": 4.61, + "grad_norm": 0.6625041961669922, + "learning_rate": 7.656838157840906e-07, + "loss": 1.9814, + "step": 25735 + }, + { + "epoch": 4.61, + "grad_norm": 0.6935396790504456, + "learning_rate": 7.622345607384396e-07, + "loss": 1.8825, + "step": 25740 + }, + { + "epoch": 4.61, + "grad_norm": 1.0851209163665771, + "learning_rate": 7.587929720746817e-07, + "loss": 1.6372, + "step": 25745 + }, + { + "epoch": 4.61, + "grad_norm": 0.629657506942749, + "learning_rate": 7.553590508813907e-07, + "loss": 2.0008, + "step": 25750 + }, + { + "epoch": 4.61, + "grad_norm": 2.5433287620544434, + "learning_rate": 7.519327982447117e-07, + "loss": 1.7939, + "step": 25755 + }, + { + "epoch": 4.61, + "grad_norm": 0.5418505668640137, + "learning_rate": 7.485142152483665e-07, + "loss": 1.8458, + "step": 25760 + }, + { + "epoch": 4.61, + "grad_norm": 1.0671522617340088, + "learning_rate": 7.451033029736459e-07, + "loss": 1.7501, + "step": 25765 + }, + { + "epoch": 4.61, + "grad_norm": 0.7587215304374695, + "learning_rate": 7.417000624994231e-07, + "loss": 1.713, + "step": 25770 + }, + { + "epoch": 4.61, + "grad_norm": 0.7475984692573547, + "learning_rate": 7.383044949021339e-07, + "loss": 1.6557, + "step": 25775 + }, + { + "epoch": 4.61, + "grad_norm": 3.0722339153289795, + "learning_rate": 7.349166012557973e-07, + "loss": 1.6571, + "step": 25780 + }, + { + "epoch": 4.62, + "grad_norm": 0.7572489976882935, + "learning_rate": 7.315363826320005e-07, + "loss": 1.9279, + "step": 25785 + }, + { + "epoch": 4.62, + "grad_norm": 1.7964684963226318, + "learning_rate": 7.281638400998992e-07, + "loss": 1.8172, + "step": 25790 + }, + { + "epoch": 4.62, + "grad_norm": 1.0750715732574463, + "learning_rate": 7.247989747262346e-07, + "loss": 1.7758, + "step": 25795 + }, + { + "epoch": 4.62, + "grad_norm": 1.502988338470459, + "learning_rate": 7.214417875752999e-07, + "loss": 1.9997, + "step": 25800 + }, + { + "epoch": 4.62, + "grad_norm": 1.200313687324524, + "learning_rate": 7.180922797089762e-07, + "loss": 1.8816, + "step": 25805 + }, + { + "epoch": 4.62, + "grad_norm": 1.1731305122375488, + "learning_rate": 7.147504521867076e-07, + "loss": 2.0038, + "step": 25810 + }, + { + "epoch": 4.62, + "grad_norm": 0.778863787651062, + "learning_rate": 7.114163060655126e-07, + "loss": 1.4799, + "step": 25815 + }, + { + "epoch": 4.62, + "grad_norm": 1.777086853981018, + "learning_rate": 7.080898423999782e-07, + "loss": 1.2548, + "step": 25820 + }, + { + "epoch": 4.62, + "grad_norm": 0.545483410358429, + "learning_rate": 7.047710622422599e-07, + "loss": 1.5645, + "step": 25825 + }, + { + "epoch": 4.62, + "grad_norm": 1.450973629951477, + "learning_rate": 7.014599666420846e-07, + "loss": 1.5726, + "step": 25830 + }, + { + "epoch": 4.62, + "grad_norm": 0.8481055498123169, + "learning_rate": 6.981565566467507e-07, + "loss": 1.7464, + "step": 25835 + }, + { + "epoch": 4.63, + "grad_norm": 0.9643117785453796, + "learning_rate": 6.948608333011197e-07, + "loss": 1.5418, + "step": 25840 + }, + { + "epoch": 4.63, + "grad_norm": 0.6587722301483154, + "learning_rate": 6.915727976476299e-07, + "loss": 1.7643, + "step": 25845 + }, + { + "epoch": 4.63, + "grad_norm": 0.37052008509635925, + "learning_rate": 6.882924507262855e-07, + "loss": 1.6896, + "step": 25850 + }, + { + "epoch": 4.63, + "grad_norm": 0.8936975002288818, + "learning_rate": 6.850197935746511e-07, + "loss": 1.7418, + "step": 25855 + }, + { + "epoch": 4.63, + "grad_norm": 0.858160674571991, + "learning_rate": 6.817548272278706e-07, + "loss": 1.986, + "step": 25860 + }, + { + "epoch": 4.63, + "grad_norm": 0.920091450214386, + "learning_rate": 6.78497552718646e-07, + "loss": 1.8721, + "step": 25865 + }, + { + "epoch": 4.63, + "grad_norm": 0.7904292941093445, + "learning_rate": 6.752479710772502e-07, + "loss": 1.7012, + "step": 25870 + }, + { + "epoch": 4.63, + "grad_norm": 0.7170063853263855, + "learning_rate": 6.720060833315223e-07, + "loss": 1.9776, + "step": 25875 + }, + { + "epoch": 4.63, + "grad_norm": 0.6264550685882568, + "learning_rate": 6.687718905068697e-07, + "loss": 1.953, + "step": 25880 + }, + { + "epoch": 4.63, + "grad_norm": 1.133542537689209, + "learning_rate": 6.655453936262685e-07, + "loss": 1.8424, + "step": 25885 + }, + { + "epoch": 4.63, + "grad_norm": 1.201373815536499, + "learning_rate": 6.623265937102496e-07, + "loss": 1.7449, + "step": 25890 + }, + { + "epoch": 4.64, + "grad_norm": 1.5119297504425049, + "learning_rate": 6.591154917769183e-07, + "loss": 2.0277, + "step": 25895 + }, + { + "epoch": 4.64, + "grad_norm": 1.423034906387329, + "learning_rate": 6.559120888419451e-07, + "loss": 1.6196, + "step": 25900 + }, + { + "epoch": 4.64, + "grad_norm": 0.8784555196762085, + "learning_rate": 6.527163859185614e-07, + "loss": 1.806, + "step": 25905 + }, + { + "epoch": 4.64, + "grad_norm": 0.8068978190422058, + "learning_rate": 6.495283840175697e-07, + "loss": 1.8265, + "step": 25910 + }, + { + "epoch": 4.64, + "grad_norm": 0.9359546899795532, + "learning_rate": 6.463480841473274e-07, + "loss": 1.5987, + "step": 25915 + }, + { + "epoch": 4.64, + "grad_norm": 1.0493196249008179, + "learning_rate": 6.431754873137575e-07, + "loss": 1.8015, + "step": 25920 + }, + { + "epoch": 4.64, + "grad_norm": 0.918982207775116, + "learning_rate": 6.400105945203544e-07, + "loss": 1.9332, + "step": 25925 + }, + { + "epoch": 4.64, + "grad_norm": 0.6983169913291931, + "learning_rate": 6.368534067681675e-07, + "loss": 1.8346, + "step": 25930 + }, + { + "epoch": 4.64, + "grad_norm": 0.8050747513771057, + "learning_rate": 6.337039250558119e-07, + "loss": 2.021, + "step": 25935 + }, + { + "epoch": 4.64, + "grad_norm": 2.0071403980255127, + "learning_rate": 6.305621503794712e-07, + "loss": 1.8157, + "step": 25940 + }, + { + "epoch": 4.64, + "grad_norm": 1.5265626907348633, + "learning_rate": 6.274280837328811e-07, + "loss": 1.7936, + "step": 25945 + }, + { + "epoch": 4.64, + "grad_norm": 0.8027945160865784, + "learning_rate": 6.24301726107343e-07, + "loss": 1.8368, + "step": 25950 + }, + { + "epoch": 4.65, + "grad_norm": 0.6967876553535461, + "learning_rate": 6.211830784917244e-07, + "loss": 1.8268, + "step": 25955 + }, + { + "epoch": 4.65, + "grad_norm": 0.5953835844993591, + "learning_rate": 6.1807214187245e-07, + "loss": 1.7233, + "step": 25960 + }, + { + "epoch": 4.65, + "grad_norm": 1.5460771322250366, + "learning_rate": 6.149689172334994e-07, + "loss": 1.4829, + "step": 25965 + }, + { + "epoch": 4.65, + "grad_norm": 0.8539479374885559, + "learning_rate": 6.118734055564291e-07, + "loss": 1.9009, + "step": 25970 + }, + { + "epoch": 4.65, + "grad_norm": 0.4527439475059509, + "learning_rate": 6.087856078203419e-07, + "loss": 1.8119, + "step": 25975 + }, + { + "epoch": 4.65, + "grad_norm": 0.5889864563941956, + "learning_rate": 6.057055250019039e-07, + "loss": 1.9578, + "step": 25980 + }, + { + "epoch": 4.65, + "grad_norm": 1.3849375247955322, + "learning_rate": 6.026331580753442e-07, + "loss": 1.8432, + "step": 25985 + }, + { + "epoch": 4.65, + "grad_norm": 0.8188489079475403, + "learning_rate": 5.995685080124491e-07, + "loss": 1.8869, + "step": 25990 + }, + { + "epoch": 4.65, + "grad_norm": 0.6627269983291626, + "learning_rate": 5.965115757825657e-07, + "loss": 1.8587, + "step": 25995 + }, + { + "epoch": 4.65, + "grad_norm": 1.0890119075775146, + "learning_rate": 5.934623623526009e-07, + "loss": 1.7339, + "step": 26000 + }, + { + "epoch": 4.65, + "grad_norm": 1.010817050933838, + "learning_rate": 5.90420868687011e-07, + "loss": 1.9268, + "step": 26005 + }, + { + "epoch": 4.66, + "grad_norm": 1.0825525522232056, + "learning_rate": 5.873870957478211e-07, + "loss": 1.7062, + "step": 26010 + }, + { + "epoch": 4.66, + "grad_norm": 0.8610982894897461, + "learning_rate": 5.843610444946135e-07, + "loss": 1.726, + "step": 26015 + }, + { + "epoch": 4.66, + "grad_norm": 1.058080792427063, + "learning_rate": 5.813427158845198e-07, + "loss": 1.7532, + "step": 26020 + }, + { + "epoch": 4.66, + "grad_norm": 2.5200893878936768, + "learning_rate": 5.783321108722373e-07, + "loss": 1.719, + "step": 26025 + }, + { + "epoch": 4.66, + "grad_norm": 0.7497760057449341, + "learning_rate": 5.753292304100183e-07, + "loss": 1.9987, + "step": 26030 + }, + { + "epoch": 4.66, + "grad_norm": 0.674341082572937, + "learning_rate": 5.723340754476697e-07, + "loss": 1.7749, + "step": 26035 + }, + { + "epoch": 4.66, + "grad_norm": 1.0011730194091797, + "learning_rate": 5.693466469325587e-07, + "loss": 1.8461, + "step": 26040 + }, + { + "epoch": 4.66, + "grad_norm": 2.4020795822143555, + "learning_rate": 5.663669458096015e-07, + "loss": 1.8198, + "step": 26045 + }, + { + "epoch": 4.66, + "grad_norm": 0.7626039981842041, + "learning_rate": 5.633949730212806e-07, + "loss": 1.7917, + "step": 26050 + }, + { + "epoch": 4.66, + "grad_norm": 1.1910889148712158, + "learning_rate": 5.604307295076188e-07, + "loss": 1.9962, + "step": 26055 + }, + { + "epoch": 4.66, + "grad_norm": 0.8416028618812561, + "learning_rate": 5.574742162062163e-07, + "loss": 1.8465, + "step": 26060 + }, + { + "epoch": 4.67, + "grad_norm": 1.4080479145050049, + "learning_rate": 5.545254340522027e-07, + "loss": 1.948, + "step": 26065 + }, + { + "epoch": 4.67, + "grad_norm": 0.9326726794242859, + "learning_rate": 5.515843839782848e-07, + "loss": 1.9587, + "step": 26070 + }, + { + "epoch": 4.67, + "grad_norm": 0.7399839162826538, + "learning_rate": 5.486510669147072e-07, + "loss": 1.7994, + "step": 26075 + }, + { + "epoch": 4.67, + "grad_norm": 0.7813273072242737, + "learning_rate": 5.457254837892778e-07, + "loss": 1.7135, + "step": 26080 + }, + { + "epoch": 4.67, + "grad_norm": 0.8069972991943359, + "learning_rate": 5.428076355273564e-07, + "loss": 1.7997, + "step": 26085 + }, + { + "epoch": 4.67, + "grad_norm": 0.7533904910087585, + "learning_rate": 5.398975230518577e-07, + "loss": 1.745, + "step": 26090 + }, + { + "epoch": 4.67, + "grad_norm": 0.5249228477478027, + "learning_rate": 5.369951472832424e-07, + "loss": 1.8554, + "step": 26095 + }, + { + "epoch": 4.67, + "grad_norm": 0.8555483818054199, + "learning_rate": 5.341005091395318e-07, + "loss": 1.7167, + "step": 26100 + }, + { + "epoch": 4.67, + "grad_norm": 2.859463691711426, + "learning_rate": 5.312136095362963e-07, + "loss": 1.8371, + "step": 26105 + }, + { + "epoch": 4.67, + "grad_norm": 0.6293110847473145, + "learning_rate": 5.28334449386661e-07, + "loss": 1.8762, + "step": 26110 + }, + { + "epoch": 4.67, + "grad_norm": 0.6581356525421143, + "learning_rate": 5.254630296013002e-07, + "loss": 1.9024, + "step": 26115 + }, + { + "epoch": 4.68, + "grad_norm": 0.9610373377799988, + "learning_rate": 5.22599351088443e-07, + "loss": 1.9915, + "step": 26120 + }, + { + "epoch": 4.68, + "grad_norm": 0.6756641864776611, + "learning_rate": 5.19743414753865e-07, + "loss": 1.9754, + "step": 26125 + }, + { + "epoch": 4.68, + "grad_norm": 0.6355516910552979, + "learning_rate": 5.168952215008988e-07, + "loss": 1.6428, + "step": 26130 + }, + { + "epoch": 4.68, + "grad_norm": 1.207905888557434, + "learning_rate": 5.140547722304212e-07, + "loss": 1.999, + "step": 26135 + }, + { + "epoch": 4.68, + "grad_norm": 0.9611450433731079, + "learning_rate": 5.112220678408692e-07, + "loss": 1.9363, + "step": 26140 + }, + { + "epoch": 4.68, + "grad_norm": 1.3729864358901978, + "learning_rate": 5.083971092282258e-07, + "loss": 1.5812, + "step": 26145 + }, + { + "epoch": 4.68, + "grad_norm": 1.105804204940796, + "learning_rate": 5.055798972860154e-07, + "loss": 2.1176, + "step": 26150 + }, + { + "epoch": 4.68, + "grad_norm": 0.9496538043022156, + "learning_rate": 5.027704329053223e-07, + "loss": 1.7854, + "step": 26155 + }, + { + "epoch": 4.68, + "grad_norm": 0.8394901156425476, + "learning_rate": 4.999687169747775e-07, + "loss": 1.6806, + "step": 26160 + }, + { + "epoch": 4.68, + "grad_norm": 0.6391941905021667, + "learning_rate": 4.971747503805608e-07, + "loss": 1.415, + "step": 26165 + }, + { + "epoch": 4.68, + "grad_norm": 1.3702791929244995, + "learning_rate": 4.943885340064042e-07, + "loss": 1.4161, + "step": 26170 + }, + { + "epoch": 4.69, + "grad_norm": 2.06182599067688, + "learning_rate": 4.916100687335806e-07, + "loss": 2.0121, + "step": 26175 + }, + { + "epoch": 4.69, + "grad_norm": 0.6842718124389648, + "learning_rate": 4.888393554409204e-07, + "loss": 1.5254, + "step": 26180 + }, + { + "epoch": 4.69, + "grad_norm": 1.0825421810150146, + "learning_rate": 4.860763950047948e-07, + "loss": 1.6063, + "step": 26185 + }, + { + "epoch": 4.69, + "grad_norm": 1.3437987565994263, + "learning_rate": 4.833211882991268e-07, + "loss": 1.8736, + "step": 26190 + }, + { + "epoch": 4.69, + "grad_norm": 0.6650128960609436, + "learning_rate": 4.805737361953833e-07, + "loss": 1.7054, + "step": 26195 + }, + { + "epoch": 4.69, + "grad_norm": 1.0552135705947876, + "learning_rate": 4.77834039562583e-07, + "loss": 1.6637, + "step": 26200 + }, + { + "epoch": 4.69, + "grad_norm": 1.4072233438491821, + "learning_rate": 4.7510209926728834e-07, + "loss": 1.7958, + "step": 26205 + }, + { + "epoch": 4.69, + "grad_norm": 0.7076230049133301, + "learning_rate": 4.7237791617361085e-07, + "loss": 1.9844, + "step": 26210 + }, + { + "epoch": 4.69, + "grad_norm": 1.0958219766616821, + "learning_rate": 4.696614911432057e-07, + "loss": 1.8564, + "step": 26215 + }, + { + "epoch": 4.69, + "grad_norm": 1.130049467086792, + "learning_rate": 4.669528250352745e-07, + "loss": 1.9523, + "step": 26220 + }, + { + "epoch": 4.69, + "grad_norm": 0.9261044263839722, + "learning_rate": 4.6425191870656525e-07, + "loss": 1.6797, + "step": 26225 + }, + { + "epoch": 4.7, + "grad_norm": 0.8678873181343079, + "learning_rate": 4.615587730113752e-07, + "loss": 1.8165, + "step": 26230 + }, + { + "epoch": 4.7, + "grad_norm": 0.6471136212348938, + "learning_rate": 4.5887338880154504e-07, + "loss": 1.8645, + "step": 26235 + }, + { + "epoch": 4.7, + "grad_norm": 0.8412356972694397, + "learning_rate": 4.561957669264566e-07, + "loss": 1.6603, + "step": 26240 + }, + { + "epoch": 4.7, + "grad_norm": 0.5464197993278503, + "learning_rate": 4.535259082330351e-07, + "loss": 1.9706, + "step": 26245 + }, + { + "epoch": 4.7, + "grad_norm": 0.6575042009353638, + "learning_rate": 4.508638135657606e-07, + "loss": 1.9023, + "step": 26250 + }, + { + "epoch": 4.7, + "grad_norm": 1.1826757192611694, + "learning_rate": 4.482094837666456e-07, + "loss": 1.766, + "step": 26255 + }, + { + "epoch": 4.7, + "grad_norm": 0.831210196018219, + "learning_rate": 4.4556291967525755e-07, + "loss": 1.8469, + "step": 26260 + }, + { + "epoch": 4.7, + "grad_norm": 1.009545922279358, + "learning_rate": 4.4292412212869895e-07, + "loss": 1.803, + "step": 26265 + }, + { + "epoch": 4.7, + "grad_norm": 0.8731971979141235, + "learning_rate": 4.402930919616216e-07, + "loss": 1.9441, + "step": 26270 + }, + { + "epoch": 4.7, + "grad_norm": 0.8651910424232483, + "learning_rate": 4.3766983000621266e-07, + "loss": 1.6736, + "step": 26275 + }, + { + "epoch": 4.7, + "grad_norm": 1.4278265237808228, + "learning_rate": 4.350543370922111e-07, + "loss": 1.6739, + "step": 26280 + }, + { + "epoch": 4.7, + "grad_norm": 0.8426223993301392, + "learning_rate": 4.324466140468969e-07, + "loss": 1.9826, + "step": 26285 + }, + { + "epoch": 4.71, + "grad_norm": 1.1639264822006226, + "learning_rate": 4.2984666169508525e-07, + "loss": 1.5842, + "step": 26290 + }, + { + "epoch": 4.71, + "grad_norm": 1.0826095342636108, + "learning_rate": 4.2725448085914056e-07, + "loss": 1.6018, + "step": 26295 + }, + { + "epoch": 4.71, + "grad_norm": 0.7595674991607666, + "learning_rate": 4.246700723589708e-07, + "loss": 2.0188, + "step": 26300 + }, + { + "epoch": 4.71, + "grad_norm": 1.1753851175308228, + "learning_rate": 4.2209343701201666e-07, + "loss": 1.6287, + "step": 26305 + }, + { + "epoch": 4.71, + "grad_norm": 1.429380178451538, + "learning_rate": 4.1952457563326773e-07, + "loss": 1.9633, + "step": 26310 + }, + { + "epoch": 4.71, + "grad_norm": 0.7897043228149414, + "learning_rate": 4.16963489035252e-07, + "loss": 1.6705, + "step": 26315 + }, + { + "epoch": 4.71, + "grad_norm": 0.7799373865127563, + "learning_rate": 4.1441017802803793e-07, + "loss": 2.1176, + "step": 26320 + }, + { + "epoch": 4.71, + "grad_norm": 0.7283823490142822, + "learning_rate": 4.1186464341923805e-07, + "loss": 1.5486, + "step": 26325 + }, + { + "epoch": 4.71, + "grad_norm": 1.3124147653579712, + "learning_rate": 4.093268860139998e-07, + "loss": 1.9634, + "step": 26330 + }, + { + "epoch": 4.71, + "grad_norm": 1.9731324911117554, + "learning_rate": 4.067969066150146e-07, + "loss": 1.4846, + "step": 26335 + }, + { + "epoch": 4.71, + "grad_norm": 0.9217849969863892, + "learning_rate": 4.042747060225116e-07, + "loss": 2.1779, + "step": 26340 + }, + { + "epoch": 4.72, + "grad_norm": 2.033876419067383, + "learning_rate": 4.0176028503425835e-07, + "loss": 1.6705, + "step": 26345 + }, + { + "epoch": 4.72, + "grad_norm": 0.941889226436615, + "learning_rate": 3.9925364444556857e-07, + "loss": 1.6641, + "step": 26350 + }, + { + "epoch": 4.72, + "grad_norm": 1.0339444875717163, + "learning_rate": 3.9675478504929144e-07, + "loss": 1.7571, + "step": 26355 + }, + { + "epoch": 4.72, + "grad_norm": 0.7928982973098755, + "learning_rate": 3.942637076358058e-07, + "loss": 1.8514, + "step": 26360 + }, + { + "epoch": 4.72, + "grad_norm": 1.492556095123291, + "learning_rate": 3.917804129930452e-07, + "loss": 1.8306, + "step": 26365 + }, + { + "epoch": 4.72, + "grad_norm": 0.8601492643356323, + "learning_rate": 3.893049019064676e-07, + "loss": 1.8331, + "step": 26370 + }, + { + "epoch": 4.72, + "grad_norm": 1.3913602828979492, + "learning_rate": 3.8683717515907714e-07, + "loss": 1.7354, + "step": 26375 + }, + { + "epoch": 4.72, + "grad_norm": 1.4564555883407593, + "learning_rate": 3.843772335314133e-07, + "loss": 1.5457, + "step": 26380 + }, + { + "epoch": 4.72, + "grad_norm": 0.7062227725982666, + "learning_rate": 3.8192507780155375e-07, + "loss": 1.6495, + "step": 26385 + }, + { + "epoch": 4.72, + "grad_norm": 0.7881594300270081, + "learning_rate": 3.794807087451141e-07, + "loss": 1.963, + "step": 26390 + }, + { + "epoch": 4.72, + "grad_norm": 1.6842689514160156, + "learning_rate": 3.770441271352426e-07, + "loss": 1.5499, + "step": 26395 + }, + { + "epoch": 4.73, + "grad_norm": 0.4173828065395355, + "learning_rate": 3.7461533374262837e-07, + "loss": 2.0362, + "step": 26400 + }, + { + "epoch": 4.73, + "grad_norm": 1.6171375513076782, + "learning_rate": 3.7219432933549845e-07, + "loss": 1.6515, + "step": 26405 + }, + { + "epoch": 4.73, + "grad_norm": 0.5490692853927612, + "learning_rate": 3.6978111467961263e-07, + "loss": 1.8352, + "step": 26410 + }, + { + "epoch": 4.73, + "grad_norm": 0.5645069479942322, + "learning_rate": 3.6737569053827137e-07, + "loss": 1.7449, + "step": 26415 + }, + { + "epoch": 4.73, + "grad_norm": 0.6575131416320801, + "learning_rate": 3.6497805767230495e-07, + "loss": 1.8291, + "step": 26420 + }, + { + "epoch": 4.73, + "grad_norm": 2.23494815826416, + "learning_rate": 3.625882168400846e-07, + "loss": 1.6557, + "step": 26425 + }, + { + "epoch": 4.73, + "grad_norm": 2.7113113403320312, + "learning_rate": 3.6020616879750835e-07, + "loss": 1.8655, + "step": 26430 + }, + { + "epoch": 4.73, + "grad_norm": 0.5787123441696167, + "learning_rate": 3.5783191429802354e-07, + "loss": 1.7335, + "step": 26435 + }, + { + "epoch": 4.73, + "grad_norm": 1.4365309476852417, + "learning_rate": 3.554654540925989e-07, + "loss": 1.8369, + "step": 26440 + }, + { + "epoch": 4.73, + "grad_norm": 0.9646137356758118, + "learning_rate": 3.531067889297496e-07, + "loss": 1.6218, + "step": 26445 + }, + { + "epoch": 4.73, + "grad_norm": 1.048437237739563, + "learning_rate": 3.507559195555149e-07, + "loss": 1.8628, + "step": 26450 + }, + { + "epoch": 4.74, + "grad_norm": 1.032910943031311, + "learning_rate": 3.484128467134723e-07, + "loss": 1.6514, + "step": 26455 + }, + { + "epoch": 4.74, + "grad_norm": 1.4005285501480103, + "learning_rate": 3.4607757114473174e-07, + "loss": 1.5883, + "step": 26460 + }, + { + "epoch": 4.74, + "grad_norm": 0.918569803237915, + "learning_rate": 3.4375009358794395e-07, + "loss": 1.9173, + "step": 26465 + }, + { + "epoch": 4.74, + "grad_norm": 1.5146937370300293, + "learning_rate": 3.4143041477928686e-07, + "loss": 1.736, + "step": 26470 + }, + { + "epoch": 4.74, + "grad_norm": 1.1808782815933228, + "learning_rate": 3.391185354524706e-07, + "loss": 1.8146, + "step": 26475 + }, + { + "epoch": 4.74, + "grad_norm": 1.120993971824646, + "learning_rate": 3.36814456338741e-07, + "loss": 1.7978, + "step": 26480 + }, + { + "epoch": 4.74, + "grad_norm": 0.922327995300293, + "learning_rate": 3.3451817816687323e-07, + "loss": 1.7257, + "step": 26485 + }, + { + "epoch": 4.74, + "grad_norm": 2.25972056388855, + "learning_rate": 3.322297016631809e-07, + "loss": 1.6793, + "step": 26490 + }, + { + "epoch": 4.74, + "grad_norm": 0.4978795349597931, + "learning_rate": 3.2994902755150715e-07, + "loss": 1.9777, + "step": 26495 + }, + { + "epoch": 4.74, + "grad_norm": 0.6013870239257812, + "learning_rate": 3.276761565532277e-07, + "loss": 1.9321, + "step": 26500 + }, + { + "epoch": 4.74, + "grad_norm": 0.6871728897094727, + "learning_rate": 3.254110893872481e-07, + "loss": 1.8226, + "step": 26505 + }, + { + "epoch": 4.75, + "grad_norm": 1.049520492553711, + "learning_rate": 3.231538267700063e-07, + "loss": 1.6252, + "step": 26510 + }, + { + "epoch": 4.75, + "grad_norm": 1.513724684715271, + "learning_rate": 3.209043694154729e-07, + "loss": 1.7386, + "step": 26515 + }, + { + "epoch": 4.75, + "grad_norm": 3.1947197914123535, + "learning_rate": 3.186627180351509e-07, + "loss": 1.9751, + "step": 26520 + }, + { + "epoch": 4.75, + "grad_norm": 0.9903146624565125, + "learning_rate": 3.164288733380677e-07, + "loss": 1.5443, + "step": 26525 + }, + { + "epoch": 4.75, + "grad_norm": 0.7547667622566223, + "learning_rate": 3.1420283603079135e-07, + "loss": 1.9813, + "step": 26530 + }, + { + "epoch": 4.75, + "grad_norm": 1.0118529796600342, + "learning_rate": 3.1198460681741427e-07, + "loss": 1.9654, + "step": 26535 + }, + { + "epoch": 4.75, + "grad_norm": 0.8024758100509644, + "learning_rate": 3.097741863995557e-07, + "loss": 1.6795, + "step": 26540 + }, + { + "epoch": 4.75, + "grad_norm": 0.8070296049118042, + "learning_rate": 3.0757157547637307e-07, + "loss": 1.9996, + "step": 26545 + }, + { + "epoch": 4.75, + "grad_norm": 0.769829273223877, + "learning_rate": 3.053767747445535e-07, + "loss": 1.8527, + "step": 26550 + }, + { + "epoch": 4.75, + "grad_norm": 0.8586844801902771, + "learning_rate": 3.0318978489830286e-07, + "loss": 1.6287, + "step": 26555 + }, + { + "epoch": 4.75, + "grad_norm": 0.8367322683334351, + "learning_rate": 3.010106066293733e-07, + "loss": 2.0245, + "step": 26560 + }, + { + "epoch": 4.76, + "grad_norm": 0.4964026212692261, + "learning_rate": 2.988392406270274e-07, + "loss": 1.4126, + "step": 26565 + }, + { + "epoch": 4.76, + "grad_norm": 0.8849166035652161, + "learning_rate": 2.966756875780713e-07, + "loss": 1.9657, + "step": 26570 + }, + { + "epoch": 4.76, + "grad_norm": 0.6733694076538086, + "learning_rate": 2.945199481668326e-07, + "loss": 1.8609, + "step": 26575 + }, + { + "epoch": 4.76, + "grad_norm": 2.2621817588806152, + "learning_rate": 2.923720230751714e-07, + "loss": 1.9039, + "step": 26580 + }, + { + "epoch": 4.76, + "grad_norm": 0.6737426519393921, + "learning_rate": 2.9023191298247487e-07, + "loss": 1.6034, + "step": 26585 + }, + { + "epoch": 4.76, + "grad_norm": 0.8793543577194214, + "learning_rate": 2.8809961856565425e-07, + "loss": 2.0761, + "step": 26590 + }, + { + "epoch": 4.76, + "grad_norm": 0.6130470633506775, + "learning_rate": 2.8597514049915617e-07, + "loss": 2.0985, + "step": 26595 + }, + { + "epoch": 4.76, + "grad_norm": 0.6239492297172546, + "learning_rate": 2.838584794549487e-07, + "loss": 1.7084, + "step": 26600 + }, + { + "epoch": 4.76, + "grad_norm": 1.3211489915847778, + "learning_rate": 2.817496361025296e-07, + "loss": 1.6366, + "step": 26605 + }, + { + "epoch": 4.76, + "grad_norm": 0.7192025184631348, + "learning_rate": 2.7964861110892636e-07, + "loss": 1.8488, + "step": 26610 + }, + { + "epoch": 4.76, + "grad_norm": 1.1748217344284058, + "learning_rate": 2.7755540513868805e-07, + "loss": 1.9784, + "step": 26615 + }, + { + "epoch": 4.76, + "grad_norm": 0.6899864673614502, + "learning_rate": 2.754700188538961e-07, + "loss": 1.8471, + "step": 26620 + }, + { + "epoch": 4.77, + "grad_norm": 1.0577079057693481, + "learning_rate": 2.733924529141535e-07, + "loss": 1.7249, + "step": 26625 + }, + { + "epoch": 4.77, + "grad_norm": 0.9071574211120605, + "learning_rate": 2.7132270797659563e-07, + "loss": 1.6626, + "step": 26630 + }, + { + "epoch": 4.77, + "grad_norm": 0.7166086435317993, + "learning_rate": 2.6926078469587944e-07, + "loss": 2.038, + "step": 26635 + }, + { + "epoch": 4.77, + "grad_norm": 0.6789818406105042, + "learning_rate": 2.67206683724186e-07, + "loss": 1.8971, + "step": 26640 + }, + { + "epoch": 4.77, + "grad_norm": 1.1377719640731812, + "learning_rate": 2.6516040571123166e-07, + "loss": 1.5134, + "step": 26645 + }, + { + "epoch": 4.77, + "grad_norm": 1.127935767173767, + "learning_rate": 2.6312195130424875e-07, + "loss": 2.0224, + "step": 26650 + }, + { + "epoch": 4.77, + "grad_norm": 1.656747817993164, + "learning_rate": 2.610913211479993e-07, + "loss": 1.9184, + "step": 26655 + }, + { + "epoch": 4.77, + "grad_norm": 1.3809555768966675, + "learning_rate": 2.5906851588476945e-07, + "loss": 1.5658, + "step": 26660 + }, + { + "epoch": 4.77, + "grad_norm": 0.8653160333633423, + "learning_rate": 2.570535361543669e-07, + "loss": 1.7374, + "step": 26665 + }, + { + "epoch": 4.77, + "grad_norm": 1.040223479270935, + "learning_rate": 2.550463825941346e-07, + "loss": 1.9036, + "step": 26670 + }, + { + "epoch": 4.77, + "grad_norm": 1.1421611309051514, + "learning_rate": 2.530470558389286e-07, + "loss": 1.9153, + "step": 26675 + }, + { + "epoch": 4.78, + "grad_norm": 1.0162934064865112, + "learning_rate": 2.510555565211348e-07, + "loss": 1.6926, + "step": 26680 + }, + { + "epoch": 4.78, + "grad_norm": 0.722922682762146, + "learning_rate": 2.4907188527066303e-07, + "loss": 1.6469, + "step": 26685 + }, + { + "epoch": 4.78, + "grad_norm": 0.6988521218299866, + "learning_rate": 2.4709604271494766e-07, + "loss": 1.4591, + "step": 26690 + }, + { + "epoch": 4.78, + "grad_norm": 0.9216463565826416, + "learning_rate": 2.451280294789443e-07, + "loss": 1.6103, + "step": 26695 + }, + { + "epoch": 4.78, + "grad_norm": 0.6147362589836121, + "learning_rate": 2.4316784618513276e-07, + "loss": 1.788, + "step": 26700 + }, + { + "epoch": 4.78, + "grad_norm": 0.583010196685791, + "learning_rate": 2.412154934535199e-07, + "loss": 1.7635, + "step": 26705 + }, + { + "epoch": 4.78, + "grad_norm": 1.2794963121414185, + "learning_rate": 2.392709719016312e-07, + "loss": 1.8667, + "step": 26710 + }, + { + "epoch": 4.78, + "grad_norm": 1.555069923400879, + "learning_rate": 2.3733428214451903e-07, + "loss": 1.7455, + "step": 26715 + }, + { + "epoch": 4.78, + "grad_norm": 0.6522120833396912, + "learning_rate": 2.354054247947518e-07, + "loss": 1.8311, + "step": 26720 + }, + { + "epoch": 4.78, + "grad_norm": 0.864811360836029, + "learning_rate": 2.3348440046243035e-07, + "loss": 1.8641, + "step": 26725 + }, + { + "epoch": 4.78, + "grad_norm": 0.828033983707428, + "learning_rate": 2.3157120975516867e-07, + "loss": 1.8355, + "step": 26730 + }, + { + "epoch": 4.79, + "grad_norm": 0.6203212141990662, + "learning_rate": 2.2966585327811041e-07, + "loss": 2.0341, + "step": 26735 + }, + { + "epoch": 4.79, + "grad_norm": 0.5540245771408081, + "learning_rate": 2.2776833163391796e-07, + "loss": 2.0733, + "step": 26740 + }, + { + "epoch": 4.79, + "grad_norm": 0.819915235042572, + "learning_rate": 2.2587864542277237e-07, + "loss": 1.8464, + "step": 26745 + }, + { + "epoch": 4.79, + "grad_norm": 3.0156285762786865, + "learning_rate": 2.239967952423816e-07, + "loss": 1.5715, + "step": 26750 + }, + { + "epoch": 4.79, + "grad_norm": 0.48900169134140015, + "learning_rate": 2.2212278168797507e-07, + "loss": 1.6596, + "step": 26755 + }, + { + "epoch": 4.79, + "grad_norm": 0.8913301825523376, + "learning_rate": 2.202566053522953e-07, + "loss": 1.7543, + "step": 26760 + }, + { + "epoch": 4.79, + "grad_norm": 0.641745924949646, + "learning_rate": 2.1839826682562015e-07, + "loss": 2.1231, + "step": 26765 + }, + { + "epoch": 4.79, + "grad_norm": 0.7494519352912903, + "learning_rate": 2.1654776669573496e-07, + "loss": 1.89, + "step": 26770 + }, + { + "epoch": 4.79, + "grad_norm": 1.3078620433807373, + "learning_rate": 2.147051055479521e-07, + "loss": 1.4582, + "step": 26775 + }, + { + "epoch": 4.79, + "grad_norm": 3.589418411254883, + "learning_rate": 2.1287028396510534e-07, + "loss": 1.5499, + "step": 26780 + }, + { + "epoch": 4.79, + "grad_norm": 1.803573489189148, + "learning_rate": 2.1104330252754435e-07, + "loss": 1.9718, + "step": 26785 + }, + { + "epoch": 4.8, + "grad_norm": 0.8699598908424377, + "learning_rate": 2.092241618131402e-07, + "loss": 1.8986, + "step": 26790 + }, + { + "epoch": 4.8, + "grad_norm": 0.7145684361457825, + "learning_rate": 2.0741286239729097e-07, + "loss": 1.6495, + "step": 26795 + }, + { + "epoch": 4.8, + "grad_norm": 1.2166204452514648, + "learning_rate": 2.0560940485290782e-07, + "loss": 1.5914, + "step": 26800 + }, + { + "epoch": 4.8, + "grad_norm": 0.681698739528656, + "learning_rate": 2.0381378975041775e-07, + "loss": 1.5633, + "step": 26805 + }, + { + "epoch": 4.8, + "grad_norm": 0.5658136606216431, + "learning_rate": 2.0202601765777762e-07, + "loss": 1.8271, + "step": 26810 + }, + { + "epoch": 4.8, + "grad_norm": 0.5894467234611511, + "learning_rate": 2.0024608914045173e-07, + "loss": 1.8671, + "step": 26815 + }, + { + "epoch": 4.8, + "grad_norm": 1.0941160917282104, + "learning_rate": 1.9847400476143695e-07, + "loss": 1.679, + "step": 26820 + }, + { + "epoch": 4.8, + "grad_norm": 1.179622769355774, + "learning_rate": 1.9670976508123495e-07, + "loss": 2.0309, + "step": 26825 + }, + { + "epoch": 4.8, + "grad_norm": 1.4097144603729248, + "learning_rate": 1.9495337065787988e-07, + "loss": 2.1893, + "step": 26830 + }, + { + "epoch": 4.8, + "grad_norm": 0.7634019255638123, + "learning_rate": 1.9320482204691348e-07, + "loss": 1.7503, + "step": 26835 + }, + { + "epoch": 4.8, + "grad_norm": 0.8397713303565979, + "learning_rate": 1.914641198013989e-07, + "loss": 2.0055, + "step": 26840 + }, + { + "epoch": 4.81, + "grad_norm": 0.6048543453216553, + "learning_rate": 1.8973126447192345e-07, + "loss": 2.0813, + "step": 26845 + }, + { + "epoch": 4.81, + "grad_norm": 0.7331050634384155, + "learning_rate": 1.880062566065821e-07, + "loss": 1.8715, + "step": 26850 + }, + { + "epoch": 4.81, + "grad_norm": 0.8746078014373779, + "learning_rate": 1.862890967509967e-07, + "loss": 1.719, + "step": 26855 + }, + { + "epoch": 4.81, + "grad_norm": 1.2344975471496582, + "learning_rate": 1.8457978544830224e-07, + "loss": 2.062, + "step": 26860 + }, + { + "epoch": 4.81, + "grad_norm": 0.6543428301811218, + "learning_rate": 1.8287832323915233e-07, + "loss": 1.6365, + "step": 26865 + }, + { + "epoch": 4.81, + "grad_norm": 0.6258736252784729, + "learning_rate": 1.8118471066171648e-07, + "loss": 1.6869, + "step": 26870 + }, + { + "epoch": 4.81, + "grad_norm": 0.3230958580970764, + "learning_rate": 1.7949894825168568e-07, + "loss": 1.9677, + "step": 26875 + }, + { + "epoch": 4.81, + "grad_norm": 0.5818564891815186, + "learning_rate": 1.778210365422611e-07, + "loss": 1.7614, + "step": 26880 + }, + { + "epoch": 4.81, + "grad_norm": 0.5392193794250488, + "learning_rate": 1.761509760641683e-07, + "loss": 1.8393, + "step": 26885 + }, + { + "epoch": 4.81, + "grad_norm": 0.8611071109771729, + "learning_rate": 1.744887673456458e-07, + "loss": 1.7183, + "step": 26890 + }, + { + "epoch": 4.81, + "grad_norm": 1.2561523914337158, + "learning_rate": 1.7283441091244523e-07, + "loss": 1.9883, + "step": 26895 + }, + { + "epoch": 4.81, + "grad_norm": 0.6996719837188721, + "learning_rate": 1.7118790728783973e-07, + "loss": 2.3327, + "step": 26900 + }, + { + "epoch": 4.82, + "grad_norm": 0.7118642330169678, + "learning_rate": 1.6954925699261825e-07, + "loss": 1.9309, + "step": 26905 + }, + { + "epoch": 4.82, + "grad_norm": 1.1299420595169067, + "learning_rate": 1.6791846054508008e-07, + "loss": 1.8533, + "step": 26910 + }, + { + "epoch": 4.82, + "grad_norm": 1.176224946975708, + "learning_rate": 1.6629551846104874e-07, + "loss": 1.7047, + "step": 26915 + }, + { + "epoch": 4.82, + "grad_norm": 0.4716276228427887, + "learning_rate": 1.646804312538608e-07, + "loss": 2.2831, + "step": 26920 + }, + { + "epoch": 4.82, + "grad_norm": 0.8160402774810791, + "learning_rate": 1.630731994343604e-07, + "loss": 1.8131, + "step": 26925 + }, + { + "epoch": 4.82, + "grad_norm": 1.1298454999923706, + "learning_rate": 1.6147382351091865e-07, + "loss": 1.829, + "step": 26930 + }, + { + "epoch": 4.82, + "grad_norm": 1.2337285280227661, + "learning_rate": 1.5988230398941428e-07, + "loss": 1.9875, + "step": 26935 + }, + { + "epoch": 4.82, + "grad_norm": 0.5422369837760925, + "learning_rate": 1.5829864137324457e-07, + "loss": 2.1579, + "step": 26940 + }, + { + "epoch": 4.82, + "grad_norm": 0.5247811079025269, + "learning_rate": 1.5672283616331717e-07, + "loss": 1.7073, + "step": 26945 + }, + { + "epoch": 4.82, + "grad_norm": 0.9248299598693848, + "learning_rate": 1.5515488885806394e-07, + "loss": 1.8775, + "step": 26950 + }, + { + "epoch": 4.82, + "grad_norm": 0.7368882298469543, + "learning_rate": 1.535947999534215e-07, + "loss": 1.7339, + "step": 26955 + }, + { + "epoch": 4.83, + "grad_norm": 1.1814441680908203, + "learning_rate": 1.5204256994284238e-07, + "loss": 1.7054, + "step": 26960 + }, + { + "epoch": 4.83, + "grad_norm": 0.6178357005119324, + "learning_rate": 1.504981993172977e-07, + "loss": 2.0362, + "step": 26965 + }, + { + "epoch": 4.83, + "grad_norm": 0.6023995280265808, + "learning_rate": 1.489616885652717e-07, + "loss": 2.0766, + "step": 26970 + }, + { + "epoch": 4.83, + "grad_norm": 0.8028101921081543, + "learning_rate": 1.4743303817275899e-07, + "loss": 2.2482, + "step": 26975 + }, + { + "epoch": 4.83, + "grad_norm": 0.9648069143295288, + "learning_rate": 1.459122486232728e-07, + "loss": 1.8054, + "step": 26980 + }, + { + "epoch": 4.83, + "grad_norm": 1.3614518642425537, + "learning_rate": 1.4439932039783665e-07, + "loss": 1.6154, + "step": 26985 + }, + { + "epoch": 4.83, + "grad_norm": 1.080051302909851, + "learning_rate": 1.4289425397498445e-07, + "loss": 1.6912, + "step": 26990 + }, + { + "epoch": 4.83, + "grad_norm": 0.6755841374397278, + "learning_rate": 1.413970498307715e-07, + "loss": 1.7686, + "step": 26995 + }, + { + "epoch": 4.83, + "grad_norm": 12.691487312316895, + "learning_rate": 1.3990770843876067e-07, + "loss": 1.681, + "step": 27000 + }, + { + "epoch": 4.83, + "grad_norm": 0.9654897451400757, + "learning_rate": 1.3842623027002787e-07, + "loss": 1.8103, + "step": 27005 + }, + { + "epoch": 4.83, + "grad_norm": 1.236765742301941, + "learning_rate": 1.3695261579316777e-07, + "loss": 1.5977, + "step": 27010 + }, + { + "epoch": 4.84, + "grad_norm": 2.983935594558716, + "learning_rate": 1.3548686547427692e-07, + "loss": 1.7417, + "step": 27015 + }, + { + "epoch": 4.84, + "grad_norm": 0.7265644669532776, + "learning_rate": 1.3402897977697614e-07, + "loss": 1.9217, + "step": 27020 + }, + { + "epoch": 4.84, + "grad_norm": 1.0526750087738037, + "learning_rate": 1.3257895916238828e-07, + "loss": 1.8388, + "step": 27025 + }, + { + "epoch": 4.84, + "grad_norm": 0.9290766716003418, + "learning_rate": 1.3113680408915752e-07, + "loss": 1.7867, + "step": 27030 + }, + { + "epoch": 4.84, + "grad_norm": 0.6184346079826355, + "learning_rate": 1.2970251501343844e-07, + "loss": 1.7741, + "step": 27035 + }, + { + "epoch": 4.84, + "grad_norm": 1.0642505884170532, + "learning_rate": 1.2827609238888762e-07, + "loss": 1.7068, + "step": 27040 + }, + { + "epoch": 4.84, + "grad_norm": 0.8582693338394165, + "learning_rate": 1.2685753666668576e-07, + "loss": 1.4969, + "step": 27045 + }, + { + "epoch": 4.84, + "grad_norm": 0.6961191892623901, + "learning_rate": 1.2544684829552122e-07, + "loss": 1.6328, + "step": 27050 + }, + { + "epoch": 4.84, + "grad_norm": 1.6947689056396484, + "learning_rate": 1.2404402772159262e-07, + "loss": 1.6862, + "step": 27055 + }, + { + "epoch": 4.84, + "grad_norm": 0.803805947303772, + "learning_rate": 1.2264907538860893e-07, + "loss": 1.5166, + "step": 27060 + }, + { + "epoch": 4.84, + "grad_norm": 1.7080804109573364, + "learning_rate": 1.2126199173779496e-07, + "loss": 1.632, + "step": 27065 + }, + { + "epoch": 4.85, + "grad_norm": 0.6481025815010071, + "learning_rate": 1.1988277720788034e-07, + "loss": 1.9797, + "step": 27070 + }, + { + "epoch": 4.85, + "grad_norm": 1.0819138288497925, + "learning_rate": 1.1851143223511329e-07, + "loss": 1.5857, + "step": 27075 + }, + { + "epoch": 4.85, + "grad_norm": 0.7787542343139648, + "learning_rate": 1.1714795725324967e-07, + "loss": 1.5872, + "step": 27080 + }, + { + "epoch": 4.85, + "grad_norm": 2.253755807876587, + "learning_rate": 1.1579235269355005e-07, + "loss": 1.7227, + "step": 27085 + }, + { + "epoch": 4.85, + "grad_norm": 0.5482271313667297, + "learning_rate": 1.1444461898479364e-07, + "loss": 1.7388, + "step": 27090 + }, + { + "epoch": 4.85, + "grad_norm": 1.8450013399124146, + "learning_rate": 1.1310475655326724e-07, + "loss": 1.908, + "step": 27095 + }, + { + "epoch": 4.85, + "grad_norm": 0.5819000601768494, + "learning_rate": 1.1177276582276797e-07, + "loss": 1.613, + "step": 27100 + }, + { + "epoch": 4.85, + "grad_norm": 0.8016330003738403, + "learning_rate": 1.1044864721460602e-07, + "loss": 1.6827, + "step": 27105 + }, + { + "epoch": 4.85, + "grad_norm": 1.288928508758545, + "learning_rate": 1.091324011475936e-07, + "loss": 1.846, + "step": 27110 + }, + { + "epoch": 4.85, + "grad_norm": 1.4904510974884033, + "learning_rate": 1.0782402803806158e-07, + "loss": 2.0203, + "step": 27115 + }, + { + "epoch": 4.85, + "grad_norm": 2.6243813037872314, + "learning_rate": 1.0652352829984558e-07, + "loss": 1.6311, + "step": 27120 + }, + { + "epoch": 4.86, + "grad_norm": 2.045247793197632, + "learning_rate": 1.0523090234429434e-07, + "loss": 2.0991, + "step": 27125 + }, + { + "epoch": 4.86, + "grad_norm": 0.736707329750061, + "learning_rate": 1.0394615058026414e-07, + "loss": 1.7908, + "step": 27130 + }, + { + "epoch": 4.86, + "grad_norm": 1.2909306287765503, + "learning_rate": 1.0266927341411602e-07, + "loss": 2.2229, + "step": 27135 + }, + { + "epoch": 4.86, + "grad_norm": 0.4913961887359619, + "learning_rate": 1.0140027124973251e-07, + "loss": 1.8207, + "step": 27140 + }, + { + "epoch": 4.86, + "grad_norm": 0.5628072023391724, + "learning_rate": 1.0013914448848971e-07, + "loss": 2.0149, + "step": 27145 + }, + { + "epoch": 4.86, + "grad_norm": 0.8567386865615845, + "learning_rate": 9.8885893529288e-08, + "loss": 2.0128, + "step": 27150 + }, + { + "epoch": 4.86, + "grad_norm": 0.6592267155647278, + "learning_rate": 9.764051876852421e-08, + "loss": 2.0507, + "step": 27155 + }, + { + "epoch": 4.86, + "grad_norm": 1.0533310174942017, + "learning_rate": 9.640302060011375e-08, + "loss": 1.8298, + "step": 27160 + }, + { + "epoch": 4.86, + "grad_norm": 1.0910136699676514, + "learning_rate": 9.517339941547132e-08, + "loss": 1.7826, + "step": 27165 + }, + { + "epoch": 4.86, + "grad_norm": 1.4886125326156616, + "learning_rate": 9.395165560352748e-08, + "loss": 1.424, + "step": 27170 + }, + { + "epoch": 4.86, + "grad_norm": 0.8163747191429138, + "learning_rate": 9.273778955072032e-08, + "loss": 1.8783, + "step": 27175 + }, + { + "epoch": 4.87, + "grad_norm": 0.813804566860199, + "learning_rate": 9.153180164098995e-08, + "loss": 1.9613, + "step": 27180 + }, + { + "epoch": 4.87, + "grad_norm": 1.107713222503662, + "learning_rate": 9.033369225579514e-08, + "loss": 1.783, + "step": 27185 + }, + { + "epoch": 4.87, + "grad_norm": 1.2432942390441895, + "learning_rate": 8.914346177409105e-08, + "loss": 1.7609, + "step": 27190 + }, + { + "epoch": 4.87, + "grad_norm": 0.697809636592865, + "learning_rate": 8.79611105723488e-08, + "loss": 1.8995, + "step": 27195 + }, + { + "epoch": 4.87, + "grad_norm": 0.8270529508590698, + "learning_rate": 8.67866390245442e-08, + "loss": 1.8255, + "step": 27200 + }, + { + "epoch": 4.87, + "grad_norm": 0.9013446569442749, + "learning_rate": 8.562004750216346e-08, + "loss": 1.7947, + "step": 27205 + }, + { + "epoch": 4.87, + "grad_norm": 1.4320096969604492, + "learning_rate": 8.446133637419751e-08, + "loss": 1.6338, + "step": 27210 + }, + { + "epoch": 4.87, + "grad_norm": 0.9843838810920715, + "learning_rate": 8.331050600714485e-08, + "loss": 1.8977, + "step": 27215 + }, + { + "epoch": 4.87, + "grad_norm": 1.381026029586792, + "learning_rate": 8.216755676501431e-08, + "loss": 1.7343, + "step": 27220 + }, + { + "epoch": 4.87, + "grad_norm": 1.0219390392303467, + "learning_rate": 8.103248900931393e-08, + "loss": 1.7915, + "step": 27225 + }, + { + "epoch": 4.87, + "grad_norm": 1.4810404777526855, + "learning_rate": 7.99053030990704e-08, + "loss": 1.9256, + "step": 27230 + }, + { + "epoch": 4.87, + "grad_norm": 1.307970643043518, + "learning_rate": 7.878599939080689e-08, + "loss": 1.9488, + "step": 27235 + }, + { + "epoch": 4.88, + "grad_norm": 0.675598680973053, + "learning_rate": 7.767457823856239e-08, + "loss": 1.6861, + "step": 27240 + }, + { + "epoch": 4.88, + "grad_norm": 1.2670272588729858, + "learning_rate": 7.657103999387794e-08, + "loss": 1.7473, + "step": 27245 + }, + { + "epoch": 4.88, + "grad_norm": 1.0107417106628418, + "learning_rate": 7.547538500579932e-08, + "loss": 1.5708, + "step": 27250 + }, + { + "epoch": 4.88, + "grad_norm": 0.8755956888198853, + "learning_rate": 7.438761362087987e-08, + "loss": 1.728, + "step": 27255 + }, + { + "epoch": 4.88, + "grad_norm": 1.5213618278503418, + "learning_rate": 7.330772618318604e-08, + "loss": 1.6688, + "step": 27260 + }, + { + "epoch": 4.88, + "grad_norm": 0.8350248336791992, + "learning_rate": 7.223572303428072e-08, + "loss": 1.5138, + "step": 27265 + }, + { + "epoch": 4.88, + "grad_norm": 0.5836665630340576, + "learning_rate": 7.11716045132399e-08, + "loss": 1.6876, + "step": 27270 + }, + { + "epoch": 4.88, + "grad_norm": 0.3720304071903229, + "learning_rate": 7.011537095663878e-08, + "loss": 2.2579, + "step": 27275 + }, + { + "epoch": 4.88, + "grad_norm": 0.8098456859588623, + "learning_rate": 6.906702269856846e-08, + "loss": 1.8464, + "step": 27280 + }, + { + "epoch": 4.88, + "grad_norm": 1.2636229991912842, + "learning_rate": 6.802656007061925e-08, + "loss": 1.9191, + "step": 27285 + }, + { + "epoch": 4.88, + "grad_norm": 0.7597413063049316, + "learning_rate": 6.699398340188623e-08, + "loss": 1.9295, + "step": 27290 + }, + { + "epoch": 4.89, + "grad_norm": 0.806469738483429, + "learning_rate": 6.596929301897481e-08, + "loss": 1.8776, + "step": 27295 + }, + { + "epoch": 4.89, + "grad_norm": 0.9175061583518982, + "learning_rate": 6.49524892459924e-08, + "loss": 1.7537, + "step": 27300 + }, + { + "epoch": 4.89, + "grad_norm": 1.0410821437835693, + "learning_rate": 6.394357240455118e-08, + "loss": 1.7922, + "step": 27305 + }, + { + "epoch": 4.89, + "grad_norm": 0.9964597821235657, + "learning_rate": 6.294254281377366e-08, + "loss": 1.9924, + "step": 27310 + }, + { + "epoch": 4.89, + "grad_norm": 0.8122162818908691, + "learning_rate": 6.194940079028432e-08, + "loss": 1.8831, + "step": 27315 + }, + { + "epoch": 4.89, + "grad_norm": 2.2000720500946045, + "learning_rate": 6.096414664821248e-08, + "loss": 1.7875, + "step": 27320 + }, + { + "epoch": 4.89, + "grad_norm": 0.8936071395874023, + "learning_rate": 5.998678069919216e-08, + "loss": 1.8224, + "step": 27325 + }, + { + "epoch": 4.89, + "grad_norm": 0.8874375224113464, + "learning_rate": 5.901730325236221e-08, + "loss": 2.0273, + "step": 27330 + }, + { + "epoch": 4.89, + "grad_norm": 1.6214640140533447, + "learning_rate": 5.805571461437176e-08, + "loss": 1.7428, + "step": 27335 + }, + { + "epoch": 4.89, + "grad_norm": 1.3129786252975464, + "learning_rate": 5.7102015089366415e-08, + "loss": 1.6939, + "step": 27340 + }, + { + "epoch": 4.89, + "grad_norm": 1.0049517154693604, + "learning_rate": 5.615620497900209e-08, + "loss": 1.6544, + "step": 27345 + }, + { + "epoch": 4.9, + "grad_norm": 0.76762455701828, + "learning_rate": 5.521828458243672e-08, + "loss": 1.9076, + "step": 27350 + }, + { + "epoch": 4.9, + "grad_norm": 0.8442772030830383, + "learning_rate": 5.428825419633299e-08, + "loss": 1.7669, + "step": 27355 + }, + { + "epoch": 4.9, + "grad_norm": 1.0416758060455322, + "learning_rate": 5.336611411486114e-08, + "loss": 2.1945, + "step": 27360 + }, + { + "epoch": 4.9, + "grad_norm": 0.9710814356803894, + "learning_rate": 5.245186462969065e-08, + "loss": 1.6822, + "step": 27365 + }, + { + "epoch": 4.9, + "grad_norm": 0.7552046775817871, + "learning_rate": 5.15455060300013e-08, + "loss": 1.9038, + "step": 27370 + }, + { + "epoch": 4.9, + "grad_norm": 0.8535296320915222, + "learning_rate": 5.064703860246933e-08, + "loss": 1.8449, + "step": 27375 + }, + { + "epoch": 4.9, + "grad_norm": 5.469326019287109, + "learning_rate": 4.975646263128131e-08, + "loss": 1.8349, + "step": 27380 + }, + { + "epoch": 4.9, + "grad_norm": 0.8397666215896606, + "learning_rate": 4.8873778398123015e-08, + "loss": 1.7978, + "step": 27385 + }, + { + "epoch": 4.9, + "grad_norm": 0.7340749502182007, + "learning_rate": 4.799898618219057e-08, + "loss": 1.7184, + "step": 27390 + }, + { + "epoch": 4.9, + "grad_norm": 0.6980647444725037, + "learning_rate": 4.713208626017929e-08, + "loss": 1.8878, + "step": 27395 + }, + { + "epoch": 4.9, + "grad_norm": 0.6763983964920044, + "learning_rate": 4.627307890628374e-08, + "loss": 2.0292, + "step": 27400 + }, + { + "epoch": 4.91, + "grad_norm": 2.8043484687805176, + "learning_rate": 4.5421964392214354e-08, + "loss": 1.76, + "step": 27405 + }, + { + "epoch": 4.91, + "grad_norm": 1.528351068496704, + "learning_rate": 4.457874298717246e-08, + "loss": 1.8167, + "step": 27410 + }, + { + "epoch": 4.91, + "grad_norm": 1.0853112936019897, + "learning_rate": 4.3743414957872485e-08, + "loss": 2.0553, + "step": 27415 + }, + { + "epoch": 4.91, + "grad_norm": 0.819485068321228, + "learning_rate": 4.291598056852253e-08, + "loss": 1.9577, + "step": 27420 + }, + { + "epoch": 4.91, + "grad_norm": 0.8028863072395325, + "learning_rate": 4.209644008084379e-08, + "loss": 1.927, + "step": 27425 + }, + { + "epoch": 4.91, + "grad_norm": 0.6567749977111816, + "learning_rate": 4.128479375405392e-08, + "loss": 1.9202, + "step": 27430 + }, + { + "epoch": 4.91, + "grad_norm": 1.1371631622314453, + "learning_rate": 4.048104184487811e-08, + "loss": 2.0074, + "step": 27435 + }, + { + "epoch": 4.91, + "grad_norm": 0.6734791994094849, + "learning_rate": 3.968518460753801e-08, + "loss": 1.7147, + "step": 27440 + }, + { + "epoch": 4.91, + "grad_norm": 1.4970152378082275, + "learning_rate": 3.8897222293768356e-08, + "loss": 1.8979, + "step": 27445 + }, + { + "epoch": 4.91, + "grad_norm": 0.4336738586425781, + "learning_rate": 3.8117155152797567e-08, + "loss": 1.8246, + "step": 27450 + }, + { + "epoch": 4.91, + "grad_norm": 1.3205974102020264, + "learning_rate": 3.734498343135884e-08, + "loss": 1.837, + "step": 27455 + }, + { + "epoch": 4.92, + "grad_norm": 1.5103232860565186, + "learning_rate": 3.658070737369013e-08, + "loss": 1.7212, + "step": 27460 + }, + { + "epoch": 4.92, + "grad_norm": 0.7287768721580505, + "learning_rate": 3.5824327221534174e-08, + "loss": 1.9573, + "step": 27465 + }, + { + "epoch": 4.92, + "grad_norm": 1.077132225036621, + "learning_rate": 3.507584321412738e-08, + "loss": 1.9364, + "step": 27470 + }, + { + "epoch": 4.92, + "grad_norm": 0.7396588921546936, + "learning_rate": 3.433525558822204e-08, + "loss": 1.8377, + "step": 27475 + }, + { + "epoch": 4.92, + "grad_norm": 0.9306398034095764, + "learning_rate": 3.360256457805855e-08, + "loss": 1.8822, + "step": 27480 + }, + { + "epoch": 4.92, + "grad_norm": 1.1197668313980103, + "learning_rate": 3.287777041539042e-08, + "loss": 1.9135, + "step": 27485 + }, + { + "epoch": 4.92, + "grad_norm": 1.111188292503357, + "learning_rate": 3.2160873329470374e-08, + "loss": 1.8155, + "step": 27490 + }, + { + "epoch": 4.92, + "grad_norm": 0.7195913791656494, + "learning_rate": 3.145187354704759e-08, + "loss": 1.7282, + "step": 27495 + }, + { + "epoch": 4.92, + "grad_norm": 1.0142110586166382, + "learning_rate": 3.075077129238158e-08, + "loss": 1.9826, + "step": 27500 + }, + { + "epoch": 4.92, + "grad_norm": 0.8807904720306396, + "learning_rate": 3.005756678722826e-08, + "loss": 1.9266, + "step": 27505 + }, + { + "epoch": 4.92, + "grad_norm": 0.3830811381340027, + "learning_rate": 2.9372260250848384e-08, + "loss": 1.7985, + "step": 27510 + }, + { + "epoch": 4.93, + "grad_norm": 1.0831133127212524, + "learning_rate": 2.869485190000465e-08, + "loss": 1.6724, + "step": 27515 + }, + { + "epoch": 4.93, + "grad_norm": 1.3438478708267212, + "learning_rate": 2.8025341948959007e-08, + "loss": 1.7674, + "step": 27520 + }, + { + "epoch": 4.93, + "grad_norm": 0.8136791586875916, + "learning_rate": 2.7363730609478166e-08, + "loss": 1.8693, + "step": 27525 + }, + { + "epoch": 4.93, + "grad_norm": 0.6372392177581787, + "learning_rate": 2.6710018090828072e-08, + "loss": 1.7738, + "step": 27530 + }, + { + "epoch": 4.93, + "grad_norm": 0.6655257344245911, + "learning_rate": 2.606420459977943e-08, + "loss": 1.7049, + "step": 27535 + }, + { + "epoch": 4.93, + "grad_norm": 0.8369601964950562, + "learning_rate": 2.555324124416669e-08, + "loss": 1.6822, + "step": 27540 + }, + { + "epoch": 4.93, + "grad_norm": 0.9216079711914062, + "learning_rate": 2.4921646515879692e-08, + "loss": 1.6634, + "step": 27545 + }, + { + "epoch": 4.93, + "grad_norm": 0.874138593673706, + "learning_rate": 2.429795138085278e-08, + "loss": 1.948, + "step": 27550 + }, + { + "epoch": 4.93, + "grad_norm": 0.4770451784133911, + "learning_rate": 2.3682156036358704e-08, + "loss": 1.7761, + "step": 27555 + }, + { + "epoch": 4.93, + "grad_norm": 0.8585416078567505, + "learning_rate": 2.3074260677177773e-08, + "loss": 1.7923, + "step": 27560 + }, + { + "epoch": 4.93, + "grad_norm": 0.9223566055297852, + "learning_rate": 2.247426549558118e-08, + "loss": 1.7544, + "step": 27565 + }, + { + "epoch": 4.93, + "grad_norm": 3.3756611347198486, + "learning_rate": 2.188217068134768e-08, + "loss": 1.961, + "step": 27570 + }, + { + "epoch": 4.94, + "grad_norm": 0.6999983191490173, + "learning_rate": 2.129797642176079e-08, + "loss": 1.8334, + "step": 27575 + }, + { + "epoch": 4.94, + "grad_norm": 0.6343751549720764, + "learning_rate": 2.0721682901594928e-08, + "loss": 2.1177, + "step": 27580 + }, + { + "epoch": 4.94, + "grad_norm": 1.4471246004104614, + "learning_rate": 2.0153290303134843e-08, + "loss": 1.4078, + "step": 27585 + }, + { + "epoch": 4.94, + "grad_norm": 0.7915092706680298, + "learning_rate": 1.9592798806161716e-08, + "loss": 1.6325, + "step": 27590 + }, + { + "epoch": 4.94, + "grad_norm": 0.42571014165878296, + "learning_rate": 1.9040208587958742e-08, + "loss": 2.0373, + "step": 27595 + }, + { + "epoch": 4.94, + "grad_norm": 1.225578784942627, + "learning_rate": 1.8495519823308327e-08, + "loss": 1.7639, + "step": 27600 + }, + { + "epoch": 4.94, + "grad_norm": 3.2549374103546143, + "learning_rate": 1.7958732684497657e-08, + "loss": 1.7882, + "step": 27605 + }, + { + "epoch": 4.94, + "grad_norm": 0.732199490070343, + "learning_rate": 1.7429847341307594e-08, + "loss": 1.9269, + "step": 27610 + }, + { + "epoch": 4.94, + "grad_norm": 0.9305698871612549, + "learning_rate": 1.6908863961026534e-08, + "loss": 1.9614, + "step": 27615 + }, + { + "epoch": 4.94, + "grad_norm": 1.5303813219070435, + "learning_rate": 1.6395782708442108e-08, + "loss": 1.7238, + "step": 27620 + }, + { + "epoch": 4.94, + "grad_norm": 1.0078951120376587, + "learning_rate": 1.589060374584117e-08, + "loss": 1.7562, + "step": 27625 + }, + { + "epoch": 4.95, + "grad_norm": 0.5599997639656067, + "learning_rate": 1.539332723300979e-08, + "loss": 1.6895, + "step": 27630 + }, + { + "epoch": 4.95, + "grad_norm": 1.057673692703247, + "learning_rate": 1.4903953327238818e-08, + "loss": 1.6365, + "step": 27635 + }, + { + "epoch": 4.95, + "grad_norm": 1.3415483236312866, + "learning_rate": 1.442248218331277e-08, + "loss": 1.8398, + "step": 27640 + }, + { + "epoch": 4.95, + "grad_norm": 2.2374091148376465, + "learning_rate": 1.3948913953523713e-08, + "loss": 1.6291, + "step": 27645 + }, + { + "epoch": 4.95, + "grad_norm": 0.6912126541137695, + "learning_rate": 1.3483248787657387e-08, + "loss": 1.5426, + "step": 27650 + }, + { + "epoch": 4.95, + "grad_norm": 0.8157421350479126, + "learning_rate": 1.3025486833007083e-08, + "loss": 1.8275, + "step": 27655 + }, + { + "epoch": 4.95, + "grad_norm": 0.5781735181808472, + "learning_rate": 1.257562823435976e-08, + "loss": 1.7979, + "step": 27660 + }, + { + "epoch": 4.95, + "grad_norm": 0.9726621508598328, + "learning_rate": 1.2133673134007152e-08, + "loss": 1.719, + "step": 27665 + }, + { + "epoch": 4.95, + "grad_norm": 0.7362151145935059, + "learning_rate": 1.1699621671740212e-08, + "loss": 1.5477, + "step": 27670 + }, + { + "epoch": 4.95, + "grad_norm": 0.6761454343795776, + "learning_rate": 1.1273473984843575e-08, + "loss": 2.0092, + "step": 27675 + }, + { + "epoch": 4.95, + "grad_norm": 0.835422158241272, + "learning_rate": 1.0855230208114964e-08, + "loss": 1.7663, + "step": 27680 + }, + { + "epoch": 4.96, + "grad_norm": 0.9953777194023132, + "learning_rate": 1.0444890473837454e-08, + "loss": 1.5584, + "step": 27685 + }, + { + "epoch": 4.96, + "grad_norm": 0.6336172223091125, + "learning_rate": 1.0042454911804444e-08, + "loss": 1.968, + "step": 27690 + }, + { + "epoch": 4.96, + "grad_norm": 1.8674088716506958, + "learning_rate": 9.647923649308554e-09, + "loss": 1.7018, + "step": 27695 + }, + { + "epoch": 4.96, + "grad_norm": 0.79061359167099, + "learning_rate": 9.261296811133301e-09, + "loss": 2.1185, + "step": 27700 + }, + { + "epoch": 4.96, + "grad_norm": 0.5622634887695312, + "learning_rate": 8.882574519572529e-09, + "loss": 1.7226, + "step": 27705 + }, + { + "epoch": 4.96, + "grad_norm": 0.5330947637557983, + "learning_rate": 8.511756894416523e-09, + "loss": 1.8869, + "step": 27710 + }, + { + "epoch": 4.96, + "grad_norm": 0.9711994528770447, + "learning_rate": 8.14884405295202e-09, + "loss": 1.9226, + "step": 27715 + }, + { + "epoch": 4.96, + "grad_norm": 0.642650842666626, + "learning_rate": 7.793836109970532e-09, + "loss": 1.8734, + "step": 27720 + }, + { + "epoch": 4.96, + "grad_norm": 0.5877659916877747, + "learning_rate": 7.446733177757237e-09, + "loss": 1.7147, + "step": 27725 + }, + { + "epoch": 4.96, + "grad_norm": 0.6836099028587341, + "learning_rate": 7.107535366102092e-09, + "loss": 1.6646, + "step": 27730 + }, + { + "epoch": 4.96, + "grad_norm": 0.694749653339386, + "learning_rate": 6.776242782297049e-09, + "loss": 1.6795, + "step": 27735 + }, + { + "epoch": 4.97, + "grad_norm": 1.7434329986572266, + "learning_rate": 6.452855531122182e-09, + "loss": 1.8301, + "step": 27740 + }, + { + "epoch": 4.97, + "grad_norm": 1.0804805755615234, + "learning_rate": 6.137373714870664e-09, + "loss": 1.6298, + "step": 27745 + }, + { + "epoch": 4.97, + "grad_norm": 0.8923995494842529, + "learning_rate": 5.829797433323791e-09, + "loss": 1.7, + "step": 27750 + }, + { + "epoch": 4.97, + "grad_norm": 1.1635453701019287, + "learning_rate": 5.530126783773182e-09, + "loss": 1.6506, + "step": 27755 + }, + { + "epoch": 4.97, + "grad_norm": 2.5559163093566895, + "learning_rate": 5.238361861001351e-09, + "loss": 1.4222, + "step": 27760 + }, + { + "epoch": 4.97, + "grad_norm": 0.6483443975448608, + "learning_rate": 4.954502757295587e-09, + "loss": 2.0705, + "step": 27765 + }, + { + "epoch": 4.97, + "grad_norm": 0.3663721978664398, + "learning_rate": 4.678549562434076e-09, + "loss": 1.8228, + "step": 27770 + }, + { + "epoch": 4.97, + "grad_norm": 0.8673094511032104, + "learning_rate": 4.410502363708102e-09, + "loss": 1.8538, + "step": 27775 + }, + { + "epoch": 4.97, + "grad_norm": 1.0372596979141235, + "learning_rate": 4.150361245897072e-09, + "loss": 1.7484, + "step": 27780 + }, + { + "epoch": 4.97, + "grad_norm": 0.5771445035934448, + "learning_rate": 3.898126291285164e-09, + "loss": 1.9486, + "step": 27785 + }, + { + "epoch": 4.97, + "grad_norm": 0.7575253844261169, + "learning_rate": 3.653797579650231e-09, + "loss": 1.8841, + "step": 27790 + }, + { + "epoch": 4.98, + "grad_norm": 0.6591800451278687, + "learning_rate": 3.417375188274896e-09, + "loss": 2.1256, + "step": 27795 + }, + { + "epoch": 4.98, + "grad_norm": 0.8006610870361328, + "learning_rate": 3.1888591919437826e-09, + "loss": 2.0, + "step": 27800 + }, + { + "epoch": 4.98, + "grad_norm": 1.2859355211257935, + "learning_rate": 2.968249662929634e-09, + "loss": 1.8523, + "step": 27805 + }, + { + "epoch": 4.98, + "grad_norm": 0.5861554145812988, + "learning_rate": 2.755546671015519e-09, + "loss": 1.7343, + "step": 27810 + }, + { + "epoch": 4.98, + "grad_norm": 0.8748874068260193, + "learning_rate": 2.550750283478176e-09, + "loss": 1.8467, + "step": 27815 + }, + { + "epoch": 4.98, + "grad_norm": 0.9922271966934204, + "learning_rate": 2.353860565090793e-09, + "loss": 1.5163, + "step": 27820 + }, + { + "epoch": 4.98, + "grad_norm": 1.116941213607788, + "learning_rate": 2.164877578136881e-09, + "loss": 1.9498, + "step": 27825 + }, + { + "epoch": 4.98, + "grad_norm": 1.195860743522644, + "learning_rate": 1.9838013823852974e-09, + "loss": 1.593, + "step": 27830 + }, + { + "epoch": 4.98, + "grad_norm": 0.7228025794029236, + "learning_rate": 1.8106320351124472e-09, + "loss": 2.0734, + "step": 27835 + }, + { + "epoch": 4.98, + "grad_norm": 0.9624055624008179, + "learning_rate": 1.6453695910911837e-09, + "loss": 1.6158, + "step": 27840 + }, + { + "epoch": 4.98, + "grad_norm": 1.733687162399292, + "learning_rate": 1.4880141025935824e-09, + "loss": 2.0339, + "step": 27845 + }, + { + "epoch": 4.99, + "grad_norm": 0.7384397983551025, + "learning_rate": 1.3385656193909413e-09, + "loss": 1.9362, + "step": 27850 + }, + { + "epoch": 4.99, + "grad_norm": 1.2739673852920532, + "learning_rate": 1.197024188756557e-09, + "loss": 1.6635, + "step": 27855 + }, + { + "epoch": 4.99, + "grad_norm": 0.9207239747047424, + "learning_rate": 1.0633898554573972e-09, + "loss": 1.7917, + "step": 27860 + }, + { + "epoch": 4.99, + "grad_norm": 2.776045560836792, + "learning_rate": 9.376626617624286e-10, + "loss": 1.9558, + "step": 27865 + }, + { + "epoch": 4.99, + "grad_norm": 0.9185320734977722, + "learning_rate": 8.198426474370635e-10, + "loss": 1.8871, + "step": 27870 + }, + { + "epoch": 4.99, + "grad_norm": 0.9698821306228638, + "learning_rate": 7.099298497487139e-10, + "loss": 1.5789, + "step": 27875 + }, + { + "epoch": 4.99, + "grad_norm": 1.133810043334961, + "learning_rate": 6.079243034667891e-10, + "loss": 1.7469, + "step": 27880 + }, + { + "epoch": 4.99, + "grad_norm": 0.8469708561897278, + "learning_rate": 5.138260408488194e-10, + "loss": 2.05, + "step": 27885 + }, + { + "epoch": 4.99, + "grad_norm": 1.0808436870574951, + "learning_rate": 4.2763509166265927e-10, + "loss": 1.9934, + "step": 27890 + }, + { + "epoch": 4.99, + "grad_norm": 1.0401432514190674, + "learning_rate": 3.4935148316705967e-10, + "loss": 1.8931, + "step": 27895 + }, + { + "epoch": 4.99, + "grad_norm": 0.6832406520843506, + "learning_rate": 2.789752401283208e-10, + "loss": 2.1373, + "step": 27900 + }, + { + "epoch": 4.99, + "grad_norm": 0.7210304141044617, + "learning_rate": 2.1650638480086305e-10, + "loss": 1.9971, + "step": 27905 + }, + { + "epoch": 5.0, + "grad_norm": 1.0963140726089478, + "learning_rate": 1.6194493694665635e-10, + "loss": 1.7998, + "step": 27910 + }, + { + "epoch": 5.0, + "grad_norm": 0.7948571443557739, + "learning_rate": 1.1529091382134205e-10, + "loss": 1.9675, + "step": 27915 + }, + { + "epoch": 5.0, + "grad_norm": 0.8925894498825073, + "learning_rate": 7.654433018255969e-11, + "loss": 1.7117, + "step": 27920 + }, + { + "epoch": 5.0, + "grad_norm": 0.588504433631897, + "learning_rate": 4.5705198287171456e-11, + "loss": 1.6579, + "step": 27925 + }, + { + "epoch": 5.0, + "grad_norm": 2.567326068878174, + "learning_rate": 2.2773527888486635e-11, + "loss": 1.817, + "step": 27930 + }, + { + "epoch": 5.0, + "step": 27930, + "total_flos": 5.219882421458043e+18, + "train_loss": 1.860821307947505, + "train_runtime": 129586.2862, + "train_samples_per_second": 1.724, + "train_steps_per_second": 0.216 + } + ], + "logging_steps": 5, + "max_steps": 27930, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 100, + "total_flos": 5.219882421458043e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}